1 ;*****************************************************************************
2 ;* predict-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
; NOTE(review): macro-body fragment — the %macro header is not visible in this
; chunk (embedded original numbering jumps 22 -> 27), so the macro name and
; argument count cannot be confirmed here. The visible stores fill an 8x8
; block: %1 is written to rows 0-3 and %2 to rows 4-7, one FDEC_STRIDE apart.
27 movq [r0 + 0*FDEC_STRIDE], %1
28 movq [r0 + 1*FDEC_STRIDE], %1
29 movq [r0 + 2*FDEC_STRIDE], %1
30 movq [r0 + 3*FDEC_STRIDE], %1
31 movq [r0 + 4*FDEC_STRIDE], %2
32 movq [r0 + 5*FDEC_STRIDE], %2
33 movq [r0 + 6*FDEC_STRIDE], %2
34 movq [r0 + 7*FDEC_STRIDE], %2
; NOTE(review): another store-macro fragment (header not visible). %1 fills the
; left 8 bytes and %2 the right 8 bytes (+8) of rows 0-3 — one 16-byte-wide
; stripe of four rows; presumably repeated by missing surrounding lines to
; cover a taller block. TODO confirm the enclosing macro once the full file is
; in view.
40 movq [r0 + 0*FDEC_STRIDE], %1
41 movq [r0 + 1*FDEC_STRIDE], %1
42 movq [r0 + 2*FDEC_STRIDE], %1
43 movq [r0 + 3*FDEC_STRIDE], %1
44 movq [r0 + 0*FDEC_STRIDE + 8], %2
45 movq [r0 + 1*FDEC_STRIDE + 8], %2
46 movq [r0 + 2*FDEC_STRIDE + 8], %2
47 movq [r0 + 3*FDEC_STRIDE + 8], %2
; STORE16x16_SSE2: broadcasts the 16-byte xmm argument %1 across rows of a
; 16x16 block. Only rows 0-3 are visible here; the remaining rows / loop and
; the %endmacro are missing from this chunk (numbering jumps 59 -> 73).
53 %macro STORE16x16_SSE2 1
56 movdqa [r0 + 0*FDEC_STRIDE], %1
57 movdqa [r0 + 1*FDEC_STRIDE], %1
58 movdqa [r0 + 2*FDEC_STRIDE], %1
59 movdqa [r0 + 3*FDEC_STRIDE], %1
; Constant tables (fragment).
; pw_3210: words 0..7, used as the per-column multiplier for the plane
; prediction (pmullw against the 'b' slope below).
; NOTE(review): pb_00s_ff / pb_0s_ff appear truncated — only the leading zero
; bytes are visible; the 0xff tail implied by the names is not shown in this
; chunk. Do not assume the tables end here.
73 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
74 pb_00s_ff: times 8 db 0
75 pb_0s_ff: times 7 db 0
; Low-pass (1,2,1)/4 filter used by the directional predictors.
; Args: dest, left, right, src, tmp (6th arg of LOWPASS0 picks the insn width).
80 ; dest, left, right, src, tmp
81 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
82 %macro PRED8x8_LOWPASS0 6
; NOTE(review): the body of PRED8x8_LOWPASS0 is mostly missing from this chunk;
; only the pb_1 mask (rounding-correction low bit) is visible. The %endmacro
; lines for all three macros are also not shown.
87 pand %3, [pb_1 GLOBAL]
; MMX wrapper: expands LOWPASS0 with 'q' (movq/64-bit mmx forms).
91 %macro PRED8x8_LOWPASS 5
92 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
; SSE2 wrapper: expands LOWPASS0 with 'dqa' (movdqa/128-bit xmm forms).
94 %macro PRED8x8_LOWPASS_XMM 5
95 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
; 4x4 diagonal-down-left intra prediction (fragment — most of the body is
; missing from this chunk; numbering jumps 102 -> 107 -> 112 -> 117).
99 ;-----------------------------------------------------------------------------
100 ; void predict_4x4_ddl_mmxext( uint8_t *src )
101 ;-----------------------------------------------------------------------------
102 cglobal predict_4x4_ddl_mmxext, 1,1,1
; pb_0s_ff mask — presumably used to pin the rightmost neighbour; verify
; against the missing setup lines.
107 movq mm4, [pb_0s_ff GLOBAL]
; (t-1 + 2t + t+1 + 2) >> 2 low-pass of the top neighbours.
112 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
; Per-row store; Y is defined by %assign lines not visible here.
117 movd [r0+Y*FDEC_STRIDE], mm0
; 4x4 vertical-left intra prediction (fragment — setup between the load and
; the low-pass, and between the store pairs, is missing from this chunk).
123 ;-----------------------------------------------------------------------------
124 ; void predict_4x4_vl_mmxext( uint8_t *src )
125 ;-----------------------------------------------------------------------------
126 cglobal predict_4x4_vl_mmxext, 1,1,1
; Load the row of top neighbours (the row directly above the block).
127 movq mm1, [r0-FDEC_STRIDE]
135 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
; Even rows get the averaged values (mm4), odd rows the low-passed ones (mm0);
; the shifts that advance the phase between rows 1 and 2 are not visible here.
137 movd [r0+0*FDEC_STRIDE], mm4
138 movd [r0+1*FDEC_STRIDE], mm0
141 movd [r0+2*FDEC_STRIDE], mm4
142 movd [r0+3*FDEC_STRIDE], mm0
; 8x8 vertical prediction from the filtered edge buffer (body missing from
; this chunk — only the cglobal header is visible).
146 ;-----------------------------------------------------------------------------
147 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
148 ;-----------------------------------------------------------------------------
149 cglobal predict_8x8_v_mmxext, 2,2
; 8x8 DC prediction family (fragments). The visible paddw lines add the
; rounding constant before the averaging shift: pw_8 for the 16-sample
; full-DC case, pw_4 for the 8-sample top/left-only case.
154 ;-----------------------------------------------------------------------------
155 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
156 ;-----------------------------------------------------------------------------
157 cglobal predict_8x8_dc_mmxext, 2,2,1
162 paddw mm0, [pw_8 GLOBAL]
170 ;-----------------------------------------------------------------------------
171 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
172 ;-----------------------------------------------------------------------------
177 paddw mmxext0, [pw_4 GLOBAL] ; NOTE(review): see original line below
; (The line above is a review annotation only; the actual instruction is:)
; — kept byte-identical below —
; PRED8x8_DC macro (defined in lines missing from this chunk) is instantiated
; for the top-only (edge offset 16) and left-only (offset 7) variants.
185 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
186 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
189 ; sse2 is faster even on amd, so there's no sense in spending exe size on these
190 ; functions if we know sse2 is available.
; 8x8 diagonal-down-left, MMX version (fragment — the loads from the edge
; buffer and the %assign Y row-loop scaffolding are missing from this chunk).
192 ;-----------------------------------------------------------------------------
193 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
194 ;-----------------------------------------------------------------------------
195 cglobal predict_8x8_ddl_mmxext, 2,2,1
; Two low-passes produce the left and right halves of the diagonal; r1 is the
; edge buffer (the [r1+24] operand reads the top-right neighbours).
202 PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
203 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
; Row stores at three separate points of the (not visible) unrolled loop.
207 movq [r0+Y*FDEC_STRIDE], mm1
215 movq [r0+Y*FDEC_STRIDE], mm1
220 movq [r0+Y*FDEC_STRIDE], mm1
; 8x8 diagonal-down-right, MMX version (fragment). [r1+8]/[r1+16] read the
; left/top portions of the edge buffer; the shifting between stores is missing
; from this chunk.
223 ;-----------------------------------------------------------------------------
224 ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
225 ;-----------------------------------------------------------------------------
226 cglobal predict_8x8_ddr_mmxext, 2,2,1
231 PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
232 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
236 movq [r0+Y*FDEC_STRIDE], mm0
244 movq [r0+Y*FDEC_STRIDE], mm0
249 movq [r0+Y*FDEC_STRIDE], mm0
; Closes the !ARCH_X86_64 guard around the MMX-only functions above (the
; matching %ifndef is outside this chunk).
252 %endif ; !ARCH_X86_64
; 8x8 diagonal-down-left, SSE2 version (fragment — edge loads, shifts and the
; row loop are missing; one low-pass over a 16-byte register covers what the
; MMX version needed two passes for).
254 ;-----------------------------------------------------------------------------
255 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
256 ;-----------------------------------------------------------------------------
257 cglobal predict_8x8_ddl_sse2, 2,2,1
262 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
; Stores the low 8 bytes per row; Y comes from %assign lines not shown here.
267 movq [r0+Y*FDEC_STRIDE], xmm0
; 8x8 diagonal-down-right, SSE2 version (fragment — loads/shifts between the
; low-pass and the stores are missing from this chunk).
272 ;-----------------------------------------------------------------------------
273 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
274 ;-----------------------------------------------------------------------------
275 cglobal predict_8x8_ddr_sse2, 2,2,1
280 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
; Paired stores: xmm1 appears to hold xmm0 shifted by one diagonal step, so
; two adjacent rows are written per iteration, plus the final rows 0/1 below.
286 movq [r0+Y*FDEC_STRIDE], xmm0
287 movq [r0+(Y-1)*FDEC_STRIDE], xmm1
292 movq [r0+1*FDEC_STRIDE], xmm0
293 movq [r0+0*FDEC_STRIDE], xmm1
; 8x8 vertical-left, SSE2 version (fragment). Per the original comments, xmm3
; holds the 2-tap average and xmm0 the 3-tap low-pass; rows alternate between
; them, shifting one byte per row pair (shifts not visible in this chunk).
297 ;-----------------------------------------------------------------------------
298 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
299 ;-----------------------------------------------------------------------------
300 cglobal predict_8x8_vl_sse2, 2,2,1
308 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
309 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
310 ; xmm3: (t0 + t1 + 1) >> 1
315 movq [r0+ Y *FDEC_STRIDE], xmm3
316 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
321 movq [r0+ Y *FDEC_STRIDE], xmm3
322 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
; 8x8 vertical-right core (fragment). Per the surviving comment this routine
; deliberately fills only part of the block; the pixel map that documented
; which pixels (original lines 331-340) is missing from this chunk.
326 ;-----------------------------------------------------------------------------
327 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
328 ;-----------------------------------------------------------------------------
330 ; fills only some pixels:
341 cglobal predict_8x8_vr_core_mmxext, 2,2,1
347 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
; Alternating rows: mm3 (presumably the 2-tap average) and mm0 (3-tap low-pass).
351 movq [r0+ Y *FDEC_STRIDE], mm3
352 movq [r0+(Y+1)*FDEC_STRIDE], mm0
357 movq [r0+ Y *FDEC_STRIDE], mm3
358 movq [r0+(Y+1)*FDEC_STRIDE], mm0
; 8x8 chroma vertical prediction: copy the row above the block into all 8 rows
; (the store macro invocation following this load is missing from this chunk).
362 ;-----------------------------------------------------------------------------
363 ; void predict_8x8c_v_mmx( uint8_t *src )
364 ;-----------------------------------------------------------------------------
365 cglobal predict_8x8c_v_mmx, 1,1
366 movq mm0, [r0 - FDEC_STRIDE]
; 8x8 chroma DC: computes four DC values, one per 4x4 quadrant. s2/s3 (partial
; sums from the C caller) are combined with the top-row psadbw sums in lines
; missing from this chunk; the visible tail rounds, shifts, packs to bytes.
370 ;-----------------------------------------------------------------------------
371 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
372 ;-----------------------------------------------------------------------------
373 cglobal predict_8x8c_dc_core_mmxext, 1,1,1
374 movq mm0, [r0 - FDEC_STRIDE]
; +2 rounding before the >>2 average for dc1.
392 paddw mm1, [pw_2 GLOBAL]
395 pshufw mm0, mm0, 0 ; dc0 (w)
; dc3 averages 8 samples (>>3); dc1/dc2 average 4 samples (>>2).
397 psrlw mm3, 3 ; dc3 (w)
398 psrlw mm2, 2 ; dc2 (w)
399 psrlw mm1, 2 ; dc1 (w)
401 packuswb mm0, mm1 ; dc0,dc1 (b)
402 packuswb mm2, mm3 ; dc2,dc3 (b)
; LOAD_PLANE_ARGS: shared argument-loading helper for the *_p_core functions
; below; its body and %endmacro are missing from this chunk.
407 %macro LOAD_PLANE_ARGS 0
; 8x8 chroma plane prediction core (fragment). i00 is the plane value at
; (0,0); b/c are the horizontal/vertical slopes computed by the C caller.
; pw_3210 * b gives the per-column ramp; the row loop (adding c per row,
; shifting and packing) is missing from this chunk.
422 ;-----------------------------------------------------------------------------
423 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
424 ;-----------------------------------------------------------------------------
425 cglobal predict_8x8c_p_core_mmxext, 1,2,1
428 pmullw mm2, [pw_3210 GLOBAL]
; Saturating adds keep intermediate plane values within word range.
430 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
431 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
; 16x16 plane prediction core, MMX version (fragment). Builds the 16-column
; ramp i + n*b in four mmx registers of 4 words each; the per-row c-add,
; shift and pack loop is missing from this chunk.
450 ;-----------------------------------------------------------------------------
451 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
452 ;-----------------------------------------------------------------------------
453 cglobal predict_16x16_p_core_mmxext, 1,2,1
457 pmullw mm5, [pw_3210 GLOBAL]
461 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
462 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
463 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
464 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
; 16x16 plane prediction core, SSE2 version (fragment). The pshuflw +
; punpcklqdq pairs broadcast i00/b/c words across each xmm register; the
; 16-column ramp then fits in two xmm registers. Row loop missing from chunk.
492 ;-----------------------------------------------------------------------------
493 ; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
494 ;-----------------------------------------------------------------------------
495 cglobal predict_16x16_p_core_sse2, 1,2,1
; Broadcast word 0 of xmm0/1/2 to all 8 word lanes (low-quad splat, then
; quad duplicate).
499 pshuflw xmm0, xmm0, 0
500 pshuflw xmm1, xmm1, 0
501 pshuflw xmm2, xmm2, 0
502 punpcklqdq xmm0, xmm0
503 punpcklqdq xmm1, xmm1
504 punpcklqdq xmm2, xmm2
506 pmullw xmm3, [pw_76543210 GLOBAL]
508 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
509 paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
; 16x16 vertical prediction: copy the 16 bytes above the block into all 16
; rows. MMX needs two 8-byte loads; SSE2 one 16-byte load. The store-macro
; invocations that follow the loads are missing from this chunk.
528 ;-----------------------------------------------------------------------------
529 ; void predict_16x16_v_mmx( uint8_t *src )
530 ;-----------------------------------------------------------------------------
531 cglobal predict_16x16_v_mmx, 1,2
532 movq mm0, [r0 - FDEC_STRIDE]
533 movq mm1, [r0 - FDEC_STRIDE + 8]
537 ;-----------------------------------------------------------------------------
538 ; void predict_16x16_v_sse2( uint8_t *src )
539 ;-----------------------------------------------------------------------------
540 cglobal predict_16x16_v_sse2, 1,2
; movdqa assumes the reconstructed-frame row is 16-byte aligned — presumably
; guaranteed by x264's frame allocation; confirm before reuse elsewhere.
541 movdqa xmm0, [r0 - FDEC_STRIDE]
; 16x16 DC prediction, MMX template (fragment). psadbw against a zeroed (or
; bias, %1) register sums the 16 top neighbours in two 8-byte halves; the
; shift by %2 (5 = /32 for top+left, 4 = /16 for top-only) and the broadcast
; occur in lines missing from this chunk.
545 ;-----------------------------------------------------------------------------
546 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
547 ;-----------------------------------------------------------------------------
549 %macro PRED16x16_DC 2
552 psadbw mm0, [r0 - FDEC_STRIDE]
553 psadbw mm1, [r0 - FDEC_STRIDE + 8]
558 packuswb mm0, mm0 ; dc in bytes
; Instantiations: core (left sum passed in from C) and top-only variants.
562 cglobal predict_16x16_dc_core_mmxext, 1,2
571 cglobal predict_16x16_dc_top_mmxext, 1,2,1
572 PRED16x16_DC [pw_8 GLOBAL], 4
; 16x16 DC prediction, SSE2 template (fragment). One psadbw sums the 16 top
; neighbours into two partial sums; the horizontal combine and shift by %2
; are in lines missing from this chunk. pshuflw+punpcklqdq then broadcast the
; DC word before packing to bytes.
575 ;-----------------------------------------------------------------------------
576 ; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
577 ;-----------------------------------------------------------------------------
579 %macro PRED16x16_DC_SSE2 2
581 psadbw xmm0, [r0 - FDEC_STRIDE]
586 pshuflw xmm0, xmm0, 0
587 punpcklqdq xmm0, xmm0
588 packuswb xmm0, xmm0 ; dc in bytes
; Instantiations: core (xmm2 presumably carries the left-edge sum, set up in
; missing lines) and top-only (pw_8 rounding, >>4) variants.
592 cglobal predict_16x16_dc_core_sse2, 1,2
594 PRED16x16_DC_SSE2 xmm2, 5
597 cglobal predict_16x16_dc_top_sse2, 1,2,1
598 PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4