1 ;*****************************************************************************
2 ;* predict-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
; NOTE(review): two store-macro bodies follow; their %macro/%endmacro lines
; are not visible in this chunk, so the macro names/arity are unconfirmed.
; First fragment: writes mm reg %1 to the 4 rows above r0 and %2 to the 4
; rows at/below r0 — presumably the caller pre-advances r0 by 4 rows (TODO
; confirm against the full file).
30 movq [r0 + -4*FDEC_STRIDE], %1
31 movq [r0 + -3*FDEC_STRIDE], %1
32 movq [r0 + -2*FDEC_STRIDE], %1
33 movq [r0 + -1*FDEC_STRIDE], %1
34 movq [r0 + 0*FDEC_STRIDE], %2
35 movq [r0 + 1*FDEC_STRIDE], %2
36 movq [r0 + 2*FDEC_STRIDE], %2
37 movq [r0 + 3*FDEC_STRIDE], %2
; Second fragment: writes %1 to the left 8 bytes and %2 to the right 8 bytes
; of 4 consecutive rows — a 16-pixel-wide row store done in two MMX halves.
43 movq [r0 + 0*FDEC_STRIDE], %1
44 movq [r0 + 1*FDEC_STRIDE], %1
45 movq [r0 + 2*FDEC_STRIDE], %1
46 movq [r0 + 3*FDEC_STRIDE], %1
47 movq [r0 + 0*FDEC_STRIDE + 8], %2
48 movq [r0 + 1*FDEC_STRIDE + 8], %2
49 movq [r0 + 2*FDEC_STRIDE + 8], %2
50 movq [r0 + 3*FDEC_STRIDE + 8], %2
; Store one 16-byte xmm value (%1) to every row of a 16x16 block at r0.
; NOTE(review): two 8-row groups are visible, both addressed at offsets
; -4..+3 rows — the pointer bump (e.g. "add r0, 8*FDEC_STRIDE") between the
; groups is missing from this chunk; confirm against the full file.
56 %macro STORE16x16_SSE2 1
58 movdqa [r0 + -4*FDEC_STRIDE], %1
59 movdqa [r0 + -3*FDEC_STRIDE], %1
60 movdqa [r0 + -2*FDEC_STRIDE], %1
61 movdqa [r0 + -1*FDEC_STRIDE], %1
62 movdqa [r0 + 0*FDEC_STRIDE], %1
63 movdqa [r0 + 1*FDEC_STRIDE], %1
64 movdqa [r0 + 2*FDEC_STRIDE], %1
65 movdqa [r0 + 3*FDEC_STRIDE], %1
; second group of 8 rows (same row offsets; r0 presumably advanced between)
67 movdqa [r0 + -4*FDEC_STRIDE], %1
68 movdqa [r0 + -3*FDEC_STRIDE], %1
69 movdqa [r0 + -2*FDEC_STRIDE], %1
70 movdqa [r0 + -1*FDEC_STRIDE], %1
71 movdqa [r0 + 0*FDEC_STRIDE], %1
72 movdqa [r0 + 1*FDEC_STRIDE], %1
73 movdqa [r0 + 2*FDEC_STRIDE], %1
74 movdqa [r0 + 3*FDEC_STRIDE], %1
; Read-only data constants (the section directive is not visible in this
; chunk). pw_3210 holds the word ramp 0..7 used by the plane-prediction
; pmullw steps; NOTE(review): in upstream x264 a pw_76543210 label shares
; this data — that label line is not visible here.
86 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
; NOTE(review): pb_00s_ff / pb_0s_ff look truncated in this chunk — the
; trailing 0xff byte(s) that give these masks their names are not visible.
87 pb_00s_ff: times 8 db 0
88 pb_0s_ff: times 7 db 0
; per-word mask selecting the high byte of each word
90 pw_ff00: times 8 dw 0xff00
; byte-reverse shuffle table (used by pshufb-based reversal)
91 pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
95 ; dest, left, right, src, tmp
96 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Core 3-tap low-pass filter used by most 8x8 intra predictors.
; NOTE(review): most of the macro body is missing from this chunk; only the
; pb_1 masking step is visible. %6 selects the instruction-size suffix
; (q = 64-bit MMX, dqa = 128-bit SSE2).
97 %macro PRED8x8_LOWPASS0 6
102 pand %3, [pb_1 GLOBAL]
; MMX (mm-register) wrapper around the low-pass core
106 %macro PRED8x8_LOWPASS 5
107 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
; SSE2 (xmm-register) wrapper around the low-pass core
109 %macro PRED8x8_LOWPASS_XMM 5
110 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
; Loads the i00/b/c plane-prediction scalar arguments into registers
; (body not visible in this chunk).
113 %macro LOAD_PLANE_ARGS 0
128 ;-----------------------------------------------------------------------------
129 ; void predict_4x4_ddl_mmxext( uint8_t *src )
130 ;-----------------------------------------------------------------------------
131 cglobal predict_4x4_ddl_mmxext, 1,1
132 movq mm1, [r0-FDEC_STRIDE]
140 PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5
145 movd [r0+Y*FDEC_STRIDE], mm0
151 ;-----------------------------------------------------------------------------
152 ; void predict_4x4_ddr_mmxext( uint8_t *src )
153 ;-----------------------------------------------------------------------------
155 cglobal predict_4x4_ddr_%1, 1,1
156 movq mm1, [r0+1*FDEC_STRIDE-8]
157 movq mm2, [r0+0*FDEC_STRIDE-8]
158 punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
159 movd mm3, [r0-1*FDEC_STRIDE]
161 PALIGNR mm3, mm1, 5, mm1
163 PALIGNR mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
165 PALIGNR mm3, [r0+3*FDEC_STRIDE-8], 7, mm4
166 PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4
168 movd [r0+Y*FDEC_STRIDE], mm0
172 movd [r0+Y*FDEC_STRIDE], mm0
176 cglobal predict_4x4_vr_%1, 1,1
177 movd mm0, [r0-1*FDEC_STRIDE] ; ........t3t2t1t0
179 PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1 ; ......t3t2t1t0lt
181 PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1 ; ....t3t2t1t0ltl0
183 PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2 ; ..t3t2t1t0ltl0l1
185 PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3 ; t3t2t1t0ltl0l1l2
186 PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
190 movd [r0+0*FDEC_STRIDE], mm7
191 movd [r0+1*FDEC_STRIDE], mm3
192 PALIGNR mm7, mm1, 7, mm2
194 movd [r0+2*FDEC_STRIDE], mm7
195 PALIGNR mm3, mm1, 7, mm1
196 movd [r0+3*FDEC_STRIDE], mm3
; Horizontal-down 4x4 prediction (cpu-name macro parameter %1, enclosing
; %macro line not visible in this chunk).
199 cglobal predict_4x4_hd_%1, 1,1
200 movd mm0, [r0-1*FDEC_STRIDE-4] ; lt ..
201 punpckldq mm0, [r0-1*FDEC_STRIDE] ; t3 t2 t1 t0 lt .. .. ..
202 psllq mm0, 8 ; t2 t1 t0 lt .. .. .. ..
203 movq mm1, [r0+3*FDEC_STRIDE-8] ; l3
204 punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3
205 movq mm2, [r0+1*FDEC_STRIDE-8] ; l1
206 punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1
207 punpckhwd mm1, mm2 ; l0 l1 l2 l3
208 punpckhdq mm1, mm0 ; t2 t1 t0 lt l0 l1 l2 l3
; shifted copies feed the 3-tap filter (mm2 setup lines missing here)
212 psrlq mm0, 16 ; .. .. t2 t1 t0 lt l0 l1
213 psrlq mm2, 8 ; .. t2 t1 t0 lt l0 l1 l2
215 PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
218 PALIGNR mm3, mm7, 6, mm6
; Y-parameterized row stores; the surrounding %rep loop is not visible
220 movd [r0+Y*FDEC_STRIDE], mm7
224 movd [r0+Y*FDEC_STRIDE], mm7
226 movd [r0+0*FDEC_STRIDE], mm3
; Instantiate the 4x4 macro family for both PALIGNR flavors: plain-MMX
; shift/or emulation vs the SSSE3 palignr instruction.
230 %define PALIGNR PALIGNR_MMX
232 %define PALIGNR PALIGNR_SSSE3
235 ;-----------------------------------------------------------------------------
236 ; void predict_4x4_hu_mmxext( uint8_t *src )
237 ;-----------------------------------------------------------------------------
238 cglobal predict_4x4_hu_mmxext, 1,1
239 movq mm0, [r0+0*FDEC_STRIDE-8]
240 punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
241 movq mm1, [r0+2*FDEC_STRIDE-8]
242 punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
246 pshufw mm1, mm1, 0xFF
254 PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5
257 movd [r0+Y*FDEC_STRIDE], mm7
261 movd [r0+Y*FDEC_STRIDE], mm7
263 movd [r0+3*FDEC_STRIDE], mm1
266 ;-----------------------------------------------------------------------------
267 ; void predict_4x4_vl_mmxext( uint8_t *src )
268 ;-----------------------------------------------------------------------------
269 cglobal predict_4x4_vl_mmxext, 1,1
270 movq mm1, [r0-FDEC_STRIDE]
278 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
280 movd [r0+0*FDEC_STRIDE], mm4
281 movd [r0+1*FDEC_STRIDE], mm0
284 movd [r0+2*FDEC_STRIDE], mm4
285 movd [r0+3*FDEC_STRIDE], mm0
289 ;-----------------------------------------------------------------------------
290 ; void predict_4x4_dc( uint8_t *src )
291 ;-----------------------------------------------------------------------------
293 cglobal predict_4x4_dc_mmxext, 1,4
295 movd mm0, [r0-FDEC_STRIDE]
298 movzx r1d, byte [r0-1]
301 movzx r2d, byte [r0+FDEC_STRIDE*n-1]
308 mov [r0+FDEC_STRIDE*0], r1d
309 mov [r0+FDEC_STRIDE*1], r1d
310 mov [r0+FDEC_STRIDE*2], r1d
311 mov [r0+FDEC_STRIDE*3], r1d
; Builds the 33-byte filtered edge[] array used by the 8x8 predictors,
; honoring i_neighbor/i_filters flags. Parameter %1 is the cpu-name suffix.
; NOTE(review): large parts of the body (branches on i_neighbor, edge[]
; stores) are missing from this chunk.
314 %macro PREDICT_FILTER 1
315 ;-----------------------------------------------------------------------------
316 ;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
317 ;-----------------------------------------------------------------------------
319 cglobal predict_8x8_filter_%1, 4,5
; gather the 8 left-column pixels two at a time via punpckhbw merging
332 movq mm0, [src+0*FDEC_STRIDE-8]
333 punpckhbw mm0, [src-1*FDEC_STRIDE-8]
334 movq mm1, [src+2*FDEC_STRIDE-8]
335 punpckhbw mm1, [src+1*FDEC_STRIDE-8]
337 movq mm2, [src+4*FDEC_STRIDE-8]
338 punpckhbw mm2, [src+3*FDEC_STRIDE-8]
339 movq mm3, [src+6*FDEC_STRIDE-8]
340 punpckhbw mm3, [src+5*FDEC_STRIDE-8]
; bottom-left / top rows feed the corner filtering
343 movq mm0, [src+7*FDEC_STRIDE-8]
344 movq mm1, [src-1*FDEC_STRIDE]
347 PALIGNR mm4, mm0, 7, mm0
348 PALIGNR mm1, mm2, 1, mm2
353 PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5
356 PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5
; top-edge path: top-left, top, and top-right 8-byte groups
362 movq mm0, [src-1*FDEC_STRIDE-8]
363 movq mm3, [src-1*FDEC_STRIDE]
364 movq mm1, [src-1*FDEC_STRIDE+8]
367 PALIGNR mm2, mm0, 7, mm0
368 PALIGNR mm1, mm4, 1, mm4
374 PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5
; top-right path
380 movq mm0, [src-1*FDEC_STRIDE+8]
385 PALIGNR mm2, mm3, 7, mm3
386 PALIGNR mm5, mm4, 1, mm4
387 PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4
; replicate last pixel when the top-right neighbor is unavailable
391 pshufw mm1, mm3, 0xFF
; instantiate for mmxext (MMX palignr emulation) and ssse3
423 %define PALIGNR PALIGNR_MMX
424 PREDICT_FILTER mmxext
425 %define PALIGNR PALIGNR_SSSE3
428 ;-----------------------------------------------------------------------------
429 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
430 ;-----------------------------------------------------------------------------
431 cglobal predict_8x8_v_mmxext, 2,2
436 ;-----------------------------------------------------------------------------
437 ; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
438 ;-----------------------------------------------------------------------------
441 cglobal predict_8x8_h_mmxext, 2,2
456 mova [r0+n*FDEC_STRIDE], m %+ n
461 ;-----------------------------------------------------------------------------
462 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
463 ;-----------------------------------------------------------------------------
464 cglobal predict_8x8_dc_mmxext, 2,2
469 paddw mm0, [pw_8 GLOBAL]
477 ;-----------------------------------------------------------------------------
478 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
479 ;-----------------------------------------------------------------------------
484 paddw mm0, [pw_4 GLOBAL]
492 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
493 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
496 ; sse2 is faster even on amd, so there's no sense in spending exe size on these
497 ; functions if we know sse2 is available.
499 ;-----------------------------------------------------------------------------
500 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
501 ;-----------------------------------------------------------------------------
; Diagonal-down-left 8x8 (MMX version, compiled out when SSE2 is assumed).
; Filters the 16 top/top-right edge pixels in two 8-wide halves; the
; Y-indexed stores are inside %rep loops not visible in this chunk.
502 cglobal predict_8x8_ddl_mmxext, 2,2
509 PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
510 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
514 movq [r0+Y*FDEC_STRIDE], mm1
522 movq [r0+Y*FDEC_STRIDE], mm1
527 movq [r0+Y*FDEC_STRIDE], mm1
530 ;-----------------------------------------------------------------------------
531 ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
532 ;-----------------------------------------------------------------------------
; Diagonal-down-right 8x8: low-passes left (edge+8) and top (edge+16)
; halves of edge[]; store loops likewise not visible here.
533 cglobal predict_8x8_ddr_mmxext, 2,2
538 PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
539 PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
543 movq [r0+Y*FDEC_STRIDE], mm0
551 movq [r0+Y*FDEC_STRIDE], mm0
556 movq [r0+Y*FDEC_STRIDE], mm0
559 ;-----------------------------------------------------------------------------
560 ; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
561 ;-----------------------------------------------------------------------------
562 %define PALIGNR PALIGNR_MMX
563 cglobal predict_8x8_hu_mmxext, 2,2
564 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
565 add r0, 4*FDEC_STRIDE
566 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
567 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
571 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
577 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
579 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
581 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
583 punpcklbw mm4, mm1 ; p4 p3 p2 p1
584 punpckhbw mm5, mm1 ; p8 p7 p6 p5
588 PALIGNR mm5, mm4, 2, mm1
589 pshufw mm1, mm6, 11111001b
590 PALIGNR mm6, mm4, 4, mm2
591 pshufw mm2, mm7, 11111110b
592 PALIGNR mm7, mm4, 6, mm3
593 pshufw mm3, mm0, 11111111b
594 movq [r0-4*FDEC_STRIDE], mm4
595 movq [r0-3*FDEC_STRIDE], mm5
596 movq [r0-2*FDEC_STRIDE], mm6
597 movq [r0-1*FDEC_STRIDE], mm7
598 movq [r0+0*FDEC_STRIDE], mm0
599 movq [r0+1*FDEC_STRIDE], mm1
600 movq [r0+2*FDEC_STRIDE], mm2
601 movq [r0+3*FDEC_STRIDE], mm3
604 ;-----------------------------------------------------------------------------
605 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
606 ;-----------------------------------------------------------------------------
608 ; fills only some pixels:
619 cglobal predict_8x8_vr_core_mmxext, 2,2
625 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
629 movq [r0+ Y *FDEC_STRIDE], mm3
630 movq [r0+(Y+1)*FDEC_STRIDE], mm0
635 movq [r0+ Y *FDEC_STRIDE], mm3
636 movq [r0+(Y+1)*FDEC_STRIDE], mm0
640 ;-----------------------------------------------------------------------------
641 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
642 ;-----------------------------------------------------------------------------
643 cglobal predict_8x8c_p_core_mmxext, 1,2
646 pmullw mm2, [pw_3210 GLOBAL]
648 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
649 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
668 ;-----------------------------------------------------------------------------
669 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
670 ;-----------------------------------------------------------------------------
671 cglobal predict_16x16_p_core_mmxext, 1,2
675 pmullw mm5, [pw_3210 GLOBAL]
679 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
680 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
681 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
682 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
710 %endif ; !ARCH_X86_64
712 ;-----------------------------------------------------------------------------
713 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
714 ;-----------------------------------------------------------------------------
715 cglobal predict_8x8_ddl_sse2, 2,2
720 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
725 movq [r0+Y*FDEC_STRIDE], xmm0
730 ;-----------------------------------------------------------------------------
731 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
732 ;-----------------------------------------------------------------------------
733 cglobal predict_8x8_ddr_sse2, 2,2
738 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
744 movq [r0+Y*FDEC_STRIDE], xmm0
745 movq [r0+(Y-1)*FDEC_STRIDE], xmm1
750 movq [r0+1*FDEC_STRIDE], xmm0
751 movq [r0+0*FDEC_STRIDE], xmm1
755 ;-----------------------------------------------------------------------------
756 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
757 ;-----------------------------------------------------------------------------
758 cglobal predict_8x8_vl_sse2, 2,2
766 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
767 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
768 ; xmm3: (t0 + t1 + 1) >> 1
773 movq [r0+ Y *FDEC_STRIDE], xmm3
774 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
779 movq [r0+ Y *FDEC_STRIDE], xmm3
780 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
784 ;-----------------------------------------------------------------------------
785 ; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
786 ;-----------------------------------------------------------------------------
787 cglobal predict_8x8_vr_sse2, 2,2,7
789 movdqa xmm6, [pw_ff00 GLOBAL]
790 add r0, 4*FDEC_STRIDE
797 PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5
803 movhps [r0-3*FDEC_STRIDE], xmm5
804 movhps [r0-4*FDEC_STRIDE], xmm2
813 movq [r0+Y*FDEC_STRIDE], xmm5
814 movq [r0+(Y-1)*FDEC_STRIDE], xmm2
819 ;-----------------------------------------------------------------------------
820 ; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
821 ;-----------------------------------------------------------------------------
822 %define PALIGNR PALIGNR_MMX
823 cglobal predict_8x8_hd_mmxext, 2,2
824 add r0, 4*FDEC_STRIDE
825 movq mm0, [r1] ; l7 .. .. .. .. .. .. ..
826 movq mm1, [r1+8] ; lt l0 l1 l2 l3 l4 l5 l6
827 movq mm2, [r1+16] ; t7 t6 t5 t4 t3 t2 t1 t0
828 movq mm3, mm1 ; lt l0 l1 l2 l3 l4 l5 l6
829 movq mm4, mm2 ; t7 t6 t5 t4 t3 t2 t1 t0
830 PALIGNR mm2, mm1, 7, mm5 ; t6 t5 t4 t3 t2 t1 t0 lt
831 PALIGNR mm1, mm0, 7, mm6 ; l0 l1 l2 l3 l4 l5 l6 l7
832 PALIGNR mm4, mm3, 1, mm7 ; t0 lt l0 l1 l2 l3 l4 l5
835 PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7
837 movq mm1, mm2 ; t6 t5 t4 t3 t2 t1 t0 lt
838 psrlq mm4, 16 ; .. .. t6 t5 t4 t3 t2 t1
839 psrlq mm1, 8 ; .. t6 t5 t4 t3 t2 t1 t0
840 PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5
843 punpcklbw mm3, mm0 ; p4 p3 p2 p1
844 punpckhbw mm7, mm0 ; p8 p7 p6 p5
848 movq [r0+3*FDEC_STRIDE], mm3
849 PALIGNR mm7, mm3, 2, mm5
850 movq [r0+2*FDEC_STRIDE], mm7
851 PALIGNR mm1, mm3, 4, mm5
852 movq [r0+1*FDEC_STRIDE], mm1
853 PALIGNR mm0, mm3, 6, mm3
854 movq [r0+0*FDEC_STRIDE], mm0
857 movq [r0-1*FDEC_STRIDE], mm4
858 PALIGNR mm6, mm4, 2, mm5
859 movq [r0-2*FDEC_STRIDE], mm6
860 PALIGNR mm2, mm4, 4, mm5
861 movq [r0-3*FDEC_STRIDE], mm2
862 PALIGNR mm3, mm4, 6, mm4
863 movq [r0-4*FDEC_STRIDE], mm3
866 ;-----------------------------------------------------------------------------
867 ; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
868 ;-----------------------------------------------------------------------------
869 %macro PREDICT_8x8_HD 1
870 cglobal predict_8x8_hd_%1, 2,2
871 add r0, 4*FDEC_STRIDE
876 PALIGNR xmm1, xmm0, 7, xmm4
877 PALIGNR xmm2, xmm0, 9, xmm5
878 PALIGNR xmm3, xmm0, 8, xmm0
881 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5
887 movq [r0+(Y)*FDEC_STRIDE], xmm4
888 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
893 movq [r0+(Y)*FDEC_STRIDE], xmm4
894 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
900 %define PALIGNR PALIGNR_SSSE3
903 %define PALIGNR PALIGNR_MMX
905 ;-----------------------------------------------------------------------------
906 ; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
907 ;-----------------------------------------------------------------------------
908 %macro PREDICT_8x8_HU 1
909 cglobal predict_8x8_hu_%1, 2,2
910 add r0, 4*FDEC_STRIDE
913 movq mm6, [pb_reverse GLOBAL]
924 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
925 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
929 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
930 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
936 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
938 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
941 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
949 movq [r0+Y*FDEC_STRIDE], xmm0
953 pshufw mm5, mm4, 11111001b
954 pshufw mm6, mm4, 11111110b
955 pshufw mm7, mm4, 11111111b
956 movq [r0+Y*FDEC_STRIDE], xmm0
957 movq [r0+0*FDEC_STRIDE], mm4
958 movq [r0+1*FDEC_STRIDE], mm5
959 movq [r0+2*FDEC_STRIDE], mm6
960 movq [r0+3*FDEC_STRIDE], mm7
967 ;-----------------------------------------------------------------------------
968 ; void predict_8x8c_v_mmx( uint8_t *src )
969 ;-----------------------------------------------------------------------------
970 cglobal predict_8x8c_v_mmx, 1,1
971 movq mm0, [r0 - FDEC_STRIDE]
975 ;-----------------------------------------------------------------------------
976 ; void predict_8x8c_h_mmxext( uint8_t *src )
977 ;-----------------------------------------------------------------------------
980 cglobal predict_8x8c_h_%1, 1,1
982 mova m1, [pb_3 GLOBAL]
986 SPLATB m0, r0+FDEC_STRIDE*n-1, m1
987 mova [r0+FDEC_STRIDE*n], m0
994 %define SPLATB SPLATB_MMX
996 %define SPLATB SPLATB_SSSE3
999 ;-----------------------------------------------------------------------------
1000 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
1001 ;-----------------------------------------------------------------------------
1002 cglobal predict_8x8c_dc_core_mmxext, 1,1
1003 movq mm0, [r0 - FDEC_STRIDE]
1008 psadbw mm1, mm2 ; s1
1009 psadbw mm0, mm2 ; s0
1021 paddw mm1, [pw_2 GLOBAL]
1024 pshufw mm0, mm0, 0 ; dc0 (w)
1026 psrlw mm3, 3 ; dc3 (w)
1027 psrlw mm2, 2 ; dc2 (w)
1028 psrlw mm1, 2 ; dc1 (w)
1030 packuswb mm0, mm1 ; dc0,dc1 (b)
1031 packuswb mm2, mm3 ; dc2,dc3 (b)
1036 cglobal predict_8x8c_dc_top_mmxext, 1,1
1037 movq mm0, [r0 - FDEC_STRIDE]
1042 psadbw mm1, mm2 ; s1
1043 psadbw mm0, mm2 ; s0
1049 pshufw mm0, mm0, 0 ; dc0 (w)
1050 packuswb mm0, mm1 ; dc0,dc1 (b)
1054 ;-----------------------------------------------------------------------------
1055 ; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
1056 ;-----------------------------------------------------------------------------
1058 cglobal predict_8x8c_p_core_sse2, 1,1
1062 pshuflw xmm0, xmm0, 0
1063 pshuflw xmm2, xmm2, 0
1064 pshuflw xmm4, xmm4, 0
1065 punpcklqdq xmm0, xmm0
1066 punpcklqdq xmm2, xmm2
1067 punpcklqdq xmm4, xmm4
1068 pmullw xmm2, [pw_76543210 GLOBAL]
1069 paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1074 add r0, FDEC_STRIDE*4
1081 movq [r0+FDEC_STRIDE*0], xmm0
1082 movhps [r0+FDEC_STRIDE*1], xmm0
1090 movq [r0+FDEC_STRIDE*2], xmm5
1091 movhps [r0+FDEC_STRIDE*3], xmm5
1096 ;-----------------------------------------------------------------------------
1097 ; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
1098 ;-----------------------------------------------------------------------------
1099 cglobal predict_16x16_p_core_sse2, 1,2,8
1103 pshuflw xmm0, xmm0, 0
1104 pshuflw xmm1, xmm1, 0
1105 pshuflw xmm2, xmm2, 0
1106 punpcklqdq xmm0, xmm0
1107 punpcklqdq xmm1, xmm1
1108 punpcklqdq xmm2, xmm2
1110 pmullw xmm3, [pw_76543210 GLOBAL]
1112 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1113 paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1131 movdqa [r0+FDEC_STRIDE*0], xmm3
1132 movdqa [r0+FDEC_STRIDE*1], xmm5
1135 add r0, FDEC_STRIDE*2
1140 ;-----------------------------------------------------------------------------
1141 ; void predict_16x16_v_mmx( uint8_t *src )
1142 ;-----------------------------------------------------------------------------
1143 cglobal predict_16x16_v_mmx, 1,2
1144 movq mm0, [r0 - FDEC_STRIDE]
1145 movq mm1, [r0 - FDEC_STRIDE + 8]
1149 ;-----------------------------------------------------------------------------
1150 ; void predict_16x16_v_sse2( uint8_t *src )
1151 ;-----------------------------------------------------------------------------
1152 cglobal predict_16x16_v_sse2, 1,1
1153 movdqa xmm0, [r0 - FDEC_STRIDE]
1154 STORE16x16_SSE2 xmm0
1157 ;-----------------------------------------------------------------------------
1158 ; void predict_16x16_h_mmxext( uint8_t *src )
1159 ;-----------------------------------------------------------------------------
1161 %macro PRED_16x16_H 1
1162 cglobal predict_16x16_h_%1, 1,2
1163 mov r1, FDEC_STRIDE*12
1165 mova m1, [pb_3 GLOBAL]
1170 SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
1171 mova [r0+r1+FDEC_STRIDE*n], m0
1173 mova [r0+r1+FDEC_STRIDE*n+8], m0
1177 add r1, -FDEC_STRIDE*4
1182 ;no SSE2, its slower than MMX on all systems that don't support SSSE3
1184 %define SPLATB SPLATB_MMX
1187 %define SPLATB SPLATB_SSSE3
1190 ;-----------------------------------------------------------------------------
1191 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
1192 ;-----------------------------------------------------------------------------
1194 %macro PRED16x16_DC 2
1197 psadbw mm0, [r0 - FDEC_STRIDE]
1198 psadbw mm1, [r0 - FDEC_STRIDE + 8]
1203 packuswb mm0, mm0 ; dc in bytes
1207 cglobal predict_16x16_dc_core_mmxext, 1,2
1216 cglobal predict_16x16_dc_top_mmxext, 1,2
1217 PRED16x16_DC [pw_8 GLOBAL], 4
1220 cglobal predict_16x16_dc_left_core_mmxext, 1,1
1227 ;-----------------------------------------------------------------------------
1228 ; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
1229 ;-----------------------------------------------------------------------------
1231 %macro PRED16x16_DC_SSE2 2
1233 psadbw xmm0, [r0 - FDEC_STRIDE]
1238 pshuflw xmm0, xmm0, 0
1239 punpcklqdq xmm0, xmm0
1240 packuswb xmm0, xmm0 ; dc in bytes
1241 STORE16x16_SSE2 xmm0
1244 cglobal predict_16x16_dc_core_sse2, 1,1
1246 PRED16x16_DC_SSE2 xmm2, 5
1249 cglobal predict_16x16_dc_top_sse2, 1,1
1250 PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
1253 cglobal predict_16x16_dc_left_core_sse2, 1,1
1255 pshuflw xmm0, xmm0, 0
1256 punpcklqdq xmm0, xmm0
1258 STORE16x16_SSE2 xmm0