1 ;*****************************************************************************
2 ;* predict-a.asm: x86 intra prediction
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
; Read-only data tables (rodata in the full file).
; pw_3210: words 0..7, used to generate per-column plane-prediction gradients.
; NOTE(review): listing is fragmentary (embedded numbering skips lines); the
; 0xff tail bytes of the pb_*_ff masks present in the full file are not visible.
34 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
35 pb_00s_ff: times 8 db 0
36 pb_0s_ff: times 7 db 0
; Store-helper macro fragments (macro headers/%endmacro lines not visible in
; this sampled listing). These broadcast register(s) across FDEC rows.
; FDEC_STRIDEB is the decoded-frame stride in bytes (pixel-size aware).
; Fragment 1: writes %1 to 4 rows then %2 to 4 rows (8 rows total);
; r0 is pre-advanced so negative offsets address the first 4 rows.
52 add r0, 4*FDEC_STRIDEB
53 mova [r0 + -4*FDEC_STRIDEB], %1
54 mova [r0 + -3*FDEC_STRIDEB], %1
55 mova [r0 + -2*FDEC_STRIDEB], %1
56 mova [r0 + -1*FDEC_STRIDEB], %1
57 mova [r0 + 0*FDEC_STRIDEB], %2
58 mova [r0 + 1*FDEC_STRIDEB], %2
59 mova [r0 + 2*FDEC_STRIDEB], %2
60 mova [r0 + 3*FDEC_STRIDEB], %2
; Fragment 2: writes a 32-byte-wide row pair (%1..%4 at byte offsets 0/8/16/24)
; then advances r0 by two rows -- presumably the body of a row-pair loop for
; high-bit-depth 16x16 stores (loop header not visible; TODO confirm).
67 mova [r0 + 0*FDEC_STRIDEB + 0], %1
68 mova [r0 + 1*FDEC_STRIDEB + 0], %1
69 mova [r0 + 0*FDEC_STRIDEB + 8], %2
70 mova [r0 + 1*FDEC_STRIDEB + 8], %2
71 mova [r0 + 0*FDEC_STRIDEB +16], %3
72 mova [r0 + 1*FDEC_STRIDEB +16], %3
73 mova [r0 + 0*FDEC_STRIDEB +24], %4
74 mova [r0 + 1*FDEC_STRIDEB +24], %4
75 add r0, 2*FDEC_STRIDEB
; Fragment 3: 8-bit variant -- writes a 16-byte row (%1 at +0, %2 at +8)
; across 4 rows using byte stride FDEC_STRIDE.
81 mova [r0 + 0*FDEC_STRIDE], %1
82 mova [r0 + 1*FDEC_STRIDE], %1
83 mova [r0 + 2*FDEC_STRIDE], %1
84 mova [r0 + 3*FDEC_STRIDE], %1
85 mova [r0 + 0*FDEC_STRIDE + 8], %2
86 mova [r0 + 1*FDEC_STRIDE + 8], %2
87 mova [r0 + 2*FDEC_STRIDE + 8], %2
88 mova [r0 + 3*FDEC_STRIDE + 8], %2
; STORE16x16_SSE2 %1[, %2]: fill a 16x16 block with the given xmm register(s).
; Two-argument path (high bit depth): %1 covers bytes 0..15 of a row, %2 bytes
; 16..31; writes 4 rows per iteration (loop/%%rep framing not visible here).
95 %macro STORE16x16_SSE2 1-2
99 mova [r0+0*FDEC_STRIDEB+ 0], %1
100 mova [r0+0*FDEC_STRIDEB+16], %2
101 mova [r0+1*FDEC_STRIDEB+ 0], %1
102 mova [r0+1*FDEC_STRIDEB+16], %2
103 mova [r0+2*FDEC_STRIDEB+ 0], %1
104 mova [r0+2*FDEC_STRIDEB+16], %2
105 mova [r0+3*FDEC_STRIDEB+ 0], %1
106 mova [r0+3*FDEC_STRIDEB+16], %2
107 add r0, 4*FDEC_STRIDEB
; Single-argument path (8-bit: one 16-byte xmm per row): unrolled stores of
; %1 to all 16 rows, addressed relative to a twice-advanced r0.
111 add r0, 4*FDEC_STRIDEB
112 mova [r0 + -4*FDEC_STRIDEB], %1
113 mova [r0 + -3*FDEC_STRIDEB], %1
114 mova [r0 + -2*FDEC_STRIDEB], %1
115 mova [r0 + -1*FDEC_STRIDEB], %1
116 mova [r0 + 0*FDEC_STRIDEB], %1
117 mova [r0 + 1*FDEC_STRIDEB], %1
118 mova [r0 + 2*FDEC_STRIDEB], %1
119 mova [r0 + 3*FDEC_STRIDEB], %1
120 add r0, 8*FDEC_STRIDEB
121 mova [r0 + -4*FDEC_STRIDEB], %1
122 mova [r0 + -3*FDEC_STRIDEB], %1
123 mova [r0 + -2*FDEC_STRIDEB], %1
124 mova [r0 + -1*FDEC_STRIDEB], %1
125 mova [r0 + 0*FDEC_STRIDEB], %1
126 mova [r0 + 1*FDEC_STRIDEB], %1
127 mova [r0 + 2*FDEC_STRIDEB], %1
128 mova [r0 + 3*FDEC_STRIDEB], %1
; PRED8x8_LOWPASS dest, left, right, src, tmp[, tmp2]:
; the standard H.264 intra 3-tap lowpass filter,
; dest = (left + 2*src + right + 2) >> 2. Body not visible in this listing.
132 ; dest, left, right, src, tmp
133 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
134 %macro PRED8x8_LOWPASS 5-6
; LOAD_PLANE_ARGS: loads the (i00, b, c) plane-prediction scalar arguments
; into mm/xmm registers; body not visible here.
151 %macro LOAD_PLANE_ARGS 0
166 ;-----------------------------------------------------------------------------
167 ; void predict_4x4_ddl( pixel *src )
168 ;-----------------------------------------------------------------------------
169 %macro PREDICT_4x4_DDL 4
170 cglobal predict_4x4_ddl_%1, 1,1
171 mova m1, [r0-FDEC_STRIDEB]
179 PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
184 movh [r0+Y*FDEC_STRIDEB], m0
191 %ifdef HIGH_BIT_DEPTH
193 PREDICT_4x4_DDL sse2, dq, 2, w
195 %define PALIGNR PALIGNR_MMX
196 cglobal predict_4x4_ddl_mmxext, 1,2
197 mova m1, [r0-2*FDEC_STRIDE+4]
198 mova m2, [r0-2*FDEC_STRIDE+0]
199 mova m3, [r0-2*FDEC_STRIDE+2]
200 PRED8x8_LOWPASS w, m0, m1, m2, m3
201 mova [r0+0*FDEC_STRIDE], m0
203 mova m5, [r0-2*FDEC_STRIDE+6]
204 mova m6, [r0-2*FDEC_STRIDE+8]
206 PRED8x8_LOWPASS w, m4, m7, m5, m6
207 mova [r0+6*FDEC_STRIDE], m4
210 PALIGNR m4, m0, 6, m1
211 mova [r0+4*FDEC_STRIDE], m4
214 PALIGNR m4, m0, 6, m0
215 mova [r0+2*FDEC_STRIDE], m4
219 PREDICT_4x4_DDL mmxext, q , 8, b
222 ;-----------------------------------------------------------------------------
223 ; void predict_4x4_ddr( pixel *src )
224 ;-----------------------------------------------------------------------------
226 cglobal predict_4x4_ddr_%1, 1,1
227 mova m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
228 mova m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
229 punpckh%2 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
230 movh m3, [r0-1*FDEC_STRIDEB]
232 PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
234 PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
236 PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
237 PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
239 movh [r0+Y*FDEC_STRIDEB], m0
243 movh [r0+Y*FDEC_STRIDEB], m0
247 cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
248 movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
250 PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
252 PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
254 PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
256 PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
257 PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
261 movh [r0+0*FDEC_STRIDEB], m5
262 movh [r0+1*FDEC_STRIDEB], m3
263 PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
265 movh [r0+2*FDEC_STRIDEB], m5
266 PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
267 movh [r0+3*FDEC_STRIDEB], m3
270 cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
271 movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
272 punpckl%6 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
273 psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
274 mova m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
275 punpckh%2 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
276 mova m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
277 punpckh%2 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
278 punpckh%3 m1, m2 ; l0 l1 l2 l3
279 punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
283 psrl%4 m0, %7*2 ; .. .. t2 t1 t0 lt l0 l1
284 psrl%4 m2, %7 ; .. t2 t1 t0 lt l0 l1 l2
286 PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
289 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
291 movh [r0+Y*FDEC_STRIDEB], m5
295 movh [r0+Y*FDEC_STRIDEB], m5
297 movh [r0+0*FDEC_STRIDEB], m3
301 %ifdef HIGH_BIT_DEPTH
303 %define PALIGNR PALIGNR_SSSE3
304 PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
307 %define PALIGNR PALIGNR_MMX
308 PREDICT_4x4 mmxext, bw, wd, q , b, dq , 8
309 %define PALIGNR PALIGNR_SSSE3
310 PREDICT_4x4 ssse3 , bw, wd, q , b, dq , 8
313 ;-----------------------------------------------------------------------------
314 ; void predict_4x4_hu( pixel *src )
315 ;-----------------------------------------------------------------------------
316 %ifdef HIGH_BIT_DEPTH
318 cglobal predict_4x4_hu_sse2, 1,1,6
319 movq mm0, [r0+0*FDEC_STRIDEB-4*2]
320 punpckhwd mm0, [r0+1*FDEC_STRIDEB-4*2]
321 movq mm1, [r0+2*FDEC_STRIDEB-4*2]
322 punpckhwd mm1, [r0+3*FDEC_STRIDEB-4*2]
324 pshufw mm1, mm1, 0xFF
334 PRED8x8_LOWPASS w, m4, m0, m2, m3, m5
337 movq [r0+0*FDEC_STRIDEB], m1
339 movq [r0+1*FDEC_STRIDEB], m1
341 movq [r0+2*FDEC_STRIDEB], m1
342 movq [r0+3*FDEC_STRIDEB], mm1
347 cglobal predict_4x4_hu_mmxext, 1,1
348 movq mm0, [r0+0*FDEC_STRIDE-8]
349 punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
350 movq mm1, [r0+2*FDEC_STRIDE-8]
351 punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
355 pshufw mm1, mm1, 0xFF
363 PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
366 movd [r0+Y*FDEC_STRIDE], mm7
370 movd [r0+Y*FDEC_STRIDE], mm7
372 movd [r0+3*FDEC_STRIDE], mm1
374 %endif ; HIGH_BIT_DEPTH
376 ;-----------------------------------------------------------------------------
377 ; void predict_4x4_vl( pixel *src )
378 ;-----------------------------------------------------------------------------
379 %macro PREDICT_4x4_V1 4
380 cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
381 mova m1, [r0-FDEC_STRIDEB]
388 PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
390 movh [r0+0*FDEC_STRIDEB], m4
391 movh [r0+1*FDEC_STRIDEB], m0
394 movh [r0+2*FDEC_STRIDEB], m4
395 movh [r0+3*FDEC_STRIDEB], m0
399 %ifdef HIGH_BIT_DEPTH
401 PREDICT_4x4_V1 sse2, dq, 2, w
404 %define PALIGNR PALIGNR_MMX
405 cglobal predict_4x4_vl_mmxext, 1,4
406 mova m1, [r0-FDEC_STRIDEB+0]
407 mova m2, [r0-FDEC_STRIDEB+8]
409 PALIGNR m2, m1, 4, m6
410 PALIGNR m3, m1, 2, m5
413 mova [r0+0*FDEC_STRIDEB], m4
415 mova [r0+2*FDEC_STRIDEB], m4
416 PRED8x8_LOWPASS w, m0, m1, m2, m3, m6
417 mova [r0+1*FDEC_STRIDEB], m0
419 mova [r0+3*FDEC_STRIDEB], m0
421 movzx r1d, word [r0-FDEC_STRIDEB+ 8]
422 movzx r2d, word [r0-FDEC_STRIDEB+10]
423 movzx r3d, word [r0-FDEC_STRIDEB+12]
429 mov [r0+2*FDEC_STRIDEB+6], r1w
430 mov [r0+3*FDEC_STRIDEB+6], r3w
434 PREDICT_4x4_V1 mmxext, q , 8, b
437 ;-----------------------------------------------------------------------------
438 ; void predict_4x4_dc( pixel *src )
439 ;-----------------------------------------------------------------------------
440 %ifdef HIGH_BIT_DEPTH
442 cglobal predict_4x4_dc_mmxext, 1,1
443 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
444 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
445 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
446 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
448 mova m0, [r0-FDEC_STRIDEB]
454 mova [r0+0*FDEC_STRIDEB], m0
455 mova [r0+1*FDEC_STRIDEB], m0
456 mova [r0+2*FDEC_STRIDEB], m0
457 mova [r0+3*FDEC_STRIDEB], m0
462 cglobal predict_4x4_dc_mmxext, 1,4
464 movd mm0, [r0-FDEC_STRIDE]
467 movzx r1d, byte [r0-1]
470 movzx r2d, byte [r0+FDEC_STRIDE*n-1]
477 mov [r0+FDEC_STRIDE*0], r1d
478 mov [r0+FDEC_STRIDE*1], r1d
479 mov [r0+FDEC_STRIDE*2], r1d
480 mov [r0+FDEC_STRIDE*3], r1d
482 %endif ; HIGH_BIT_DEPTH
; Builds the 33-entry filtered edge[] array used by all 8x8 intra predictors:
; left column (reversed), top-left, and top row (+ top-right extension), each
; passed through the 3-tap lowpass. Branching on i_neighbor/i_filters is in
; lines not visible in this sampled listing.
484 %macro PREDICT_FILTER 1
485 ;-----------------------------------------------------------------------------
486 ;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
487 ;-----------------------------------------------------------------------------
489 cglobal predict_8x8_filter_%1, 4,5
; Transpose-gather the 8 left-column bytes via punpckhbw pairs.
502 movq mm0, [src+0*FDEC_STRIDE-8]
503 punpckhbw mm0, [src-1*FDEC_STRIDE-8]
504 movq mm1, [src+2*FDEC_STRIDE-8]
505 punpckhbw mm1, [src+1*FDEC_STRIDE-8]
507 movq mm2, [src+4*FDEC_STRIDE-8]
508 punpckhbw mm2, [src+3*FDEC_STRIDE-8]
509 movq mm3, [src+6*FDEC_STRIDE-8]
510 punpckhbw mm3, [src+5*FDEC_STRIDE-8]
513 movq mm0, [src+7*FDEC_STRIDE-8]
514 movq mm1, [src-1*FDEC_STRIDE]
; Shift neighbors into place and lowpass-filter the left edge.
517 PALIGNR mm4, mm0, 7, mm0
518 PALIGNR mm1, mm2, 1, mm2
523 PRED8x8_LOWPASS b, mm2, mm1, mm4, mm3, mm5
526 PRED8x8_LOWPASS b, mm1, mm3, mm0, mm4, mm5
; Filter the top edge (t0..t7), using top-left and top-right when available.
532 movq mm0, [src-1*FDEC_STRIDE-8]
533 movq mm3, [src-1*FDEC_STRIDE]
534 movq mm1, [src-1*FDEC_STRIDE+8]
537 PALIGNR mm2, mm0, 7, mm0
538 PALIGNR mm1, mm4, 1, mm4
544 PRED8x8_LOWPASS b, mm4, mm2, mm1, mm3, mm5
; Filter the top-right extension t8..t15.
550 movq mm0, [src-1*FDEC_STRIDE+8]
555 PALIGNR mm2, mm3, 7, mm3
556 PALIGNR mm5, mm4, 1, mm4
557 PRED8x8_LOWPASS b, mm1, mm2, mm5, mm0, mm4
561 pshufw mm1, mm3, 0xFF ; replicate last pixel when top-right is unavailable
; Instantiate for mmxext and (with SSSE3 palignr) ssse3.
593 %define PALIGNR PALIGNR_MMX
595 PREDICT_FILTER mmxext
596 %define PALIGNR PALIGNR_SSSE3
599 ;-----------------------------------------------------------------------------
600 ; void predict_8x8_v( pixel *src, pixel *edge )
601 ;-----------------------------------------------------------------------------
602 %macro PREDICT_8x8_V 1
603 cglobal predict_8x8_v_%1, 2,2
604 mova m0, [r1+16*SIZEOF_PIXEL]
609 %ifdef HIGH_BIT_DEPTH
617 ;-----------------------------------------------------------------------------
618 ; void predict_8x8_h( pixel *src, pixel edge[33] )
619 ;-----------------------------------------------------------------------------
620 %macro PREDICT_8x8_H 3
621 cglobal predict_8x8_h_%1, 2,2
622 movu m1, [r1+7*SIZEOF_PIXEL]
623 add r0, 4*FDEC_STRIDEB
630 SPLAT%3 m0, m %+ i, (3-n)&3
631 mova [r0+(n-4)*FDEC_STRIDEB], m0
637 %ifdef HIGH_BIT_DEPTH
639 PREDICT_8x8_H sse2 , wd, D
642 PREDICT_8x8_H mmxext, bw, W
645 ;-----------------------------------------------------------------------------
646 ; void predict_8x8_dc( pixel *src, pixel *edge );
647 ;-----------------------------------------------------------------------------
648 %ifdef HIGH_BIT_DEPTH
650 cglobal predict_8x8_dc_sse2, 2,2
662 cglobal predict_8x8_dc_mmxext, 2,2
674 %endif ; HIGH_BIT_DEPTH
676 ;-----------------------------------------------------------------------------
677 ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
678 ; void predict_8x8_dc_left( pixel *src, pixel *edge );
679 ;-----------------------------------------------------------------------------
680 %ifdef HIGH_BIT_DEPTH
692 PRED8x8_DC predict_8x8_dc_top_sse2 , 32, mova
693 PRED8x8_DC predict_8x8_dc_left_sse2, 14, movu
708 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
709 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
710 %endif ; HIGH_BIT_DEPTH
713 ; sse2 is faster even on amd, so there's no sense in spending exe size on these
714 ; functions if we know sse2 is available.
716 ;-----------------------------------------------------------------------------
717 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
718 ;-----------------------------------------------------------------------------
719 cglobal predict_8x8_ddl_mmxext, 2,2
726 add r0, FDEC_STRIDE*4
727 PRED8x8_LOWPASS b, mm0, mm1, mm2, mm5, mm7
728 PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+24], mm6
731 movq [r0+Y*FDEC_STRIDE], mm1
739 movq [r0+Y*FDEC_STRIDE], mm1
744 movq [r0+Y*FDEC_STRIDE], mm1
747 ;-----------------------------------------------------------------------------
748 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
749 ;-----------------------------------------------------------------------------
750 cglobal predict_8x8_ddr_mmxext, 2,2
755 add r0, FDEC_STRIDE*4
756 PRED8x8_LOWPASS b, mm0, mm1, mm2, [r1+8], mm7
757 PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+16], mm6
760 movq [r0+Y*FDEC_STRIDE], mm0
768 movq [r0+Y*FDEC_STRIDE], mm0
773 movq [r0+Y*FDEC_STRIDE], mm0
776 ;-----------------------------------------------------------------------------
777 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
778 ;-----------------------------------------------------------------------------
779 %define PALIGNR PALIGNR_MMX
780 cglobal predict_8x8_hu_mmxext, 2,2
781 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
782 add r0, 4*FDEC_STRIDE
783 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
784 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
788 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
794 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
796 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
798 PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
800 punpcklbw mm4, mm1 ; p4 p3 p2 p1
801 punpckhbw mm5, mm1 ; p8 p7 p6 p5
805 PALIGNR mm5, mm4, 2, mm1
806 pshufw mm1, mm6, 11111001b
807 PALIGNR mm6, mm4, 4, mm2
808 pshufw mm2, mm7, 11111110b
809 PALIGNR mm7, mm4, 6, mm3
810 pshufw mm3, mm0, 11111111b
811 movq [r0-4*FDEC_STRIDE], mm4
812 movq [r0-3*FDEC_STRIDE], mm5
813 movq [r0-2*FDEC_STRIDE], mm6
814 movq [r0-1*FDEC_STRIDE], mm7
815 movq [r0+0*FDEC_STRIDE], mm0
816 movq [r0+1*FDEC_STRIDE], mm1
817 movq [r0+2*FDEC_STRIDE], mm2
818 movq [r0+3*FDEC_STRIDE], mm3
821 ;-----------------------------------------------------------------------------
822 ; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
823 ;-----------------------------------------------------------------------------
825 ; fills only some pixels:
836 cglobal predict_8x8_vr_core_mmxext, 2,2
842 add r0, FDEC_STRIDE*4
843 PRED8x8_LOWPASS b, mm0, mm1, mm2, mm4, mm7
847 movq [r0+ Y *FDEC_STRIDE], mm3
848 movq [r0+(Y+1)*FDEC_STRIDE], mm0
853 movq [r0+ Y *FDEC_STRIDE], mm3
854 movq [r0+(Y+1)*FDEC_STRIDE], mm0
858 ;-----------------------------------------------------------------------------
859 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
860 ;-----------------------------------------------------------------------------
861 cglobal predict_8x8c_p_core_mmxext, 1,2
864 pmullw mm2, [pw_3210]
866 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
867 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
886 ;-----------------------------------------------------------------------------
887 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
888 ;-----------------------------------------------------------------------------
889 cglobal predict_16x16_p_core_mmxext, 1,2
893 pmullw mm5, [pw_3210]
897 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
898 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
899 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
900 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
928 %endif ; !ARCH_X86_64
931 ;-----------------------------------------------------------------------------
932 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
933 ;-----------------------------------------------------------------------------
934 cglobal predict_8x8_ddl_sse2, 2,2
939 add r0, FDEC_STRIDE*4
940 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
945 movq [r0+Y*FDEC_STRIDE], xmm0
950 ;-----------------------------------------------------------------------------
951 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
952 ;-----------------------------------------------------------------------------
953 cglobal predict_8x8_ddr_sse2, 2,2
958 add r0, FDEC_STRIDE*4
959 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
965 movq [r0+Y*FDEC_STRIDE], xmm0
966 movq [r0+(Y-1)*FDEC_STRIDE], xmm1
971 movq [r0-3*FDEC_STRIDE], xmm0
972 movq [r0-4*FDEC_STRIDE], xmm1
976 ;-----------------------------------------------------------------------------
977 ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
978 ;-----------------------------------------------------------------------------
979 cglobal predict_8x8_vl_sse2, 2,2
987 add r0, FDEC_STRIDE*4
988 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
989 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
990 ; xmm3: (t0 + t1 + 1) >> 1
995 movq [r0+ Y *FDEC_STRIDE], xmm3
996 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1001 movq [r0+ Y *FDEC_STRIDE], xmm3
1002 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1006 ;-----------------------------------------------------------------------------
1007 ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
1008 ;-----------------------------------------------------------------------------
1009 cglobal predict_8x8_vr_sse2, 2,2,7
1011 movdqa xmm6, [pw_ff00]
1012 add r0, 4*FDEC_STRIDE
1019 PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
1025 movhps [r0-3*FDEC_STRIDE], xmm5
1026 movhps [r0-4*FDEC_STRIDE], xmm2
1035 movq [r0+Y*FDEC_STRIDE], xmm5
1036 movq [r0+(Y-1)*FDEC_STRIDE], xmm2
1042 ;-----------------------------------------------------------------------------
1043 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1044 ;-----------------------------------------------------------------------------
1045 %define PALIGNR PALIGNR_MMX
1046 cglobal predict_8x8_hd_mmxext, 2,2
1047 add r0, 4*FDEC_STRIDE
1048 movq mm0, [r1] ; l7 .. .. .. .. .. .. ..
1049 movq mm1, [r1+8] ; lt l0 l1 l2 l3 l4 l5 l6
1050 movq mm2, [r1+16] ; t7 t6 t5 t4 t3 t2 t1 t0
1051 movq mm3, mm1 ; lt l0 l1 l2 l3 l4 l5 l6
1052 movq mm4, mm2 ; t7 t6 t5 t4 t3 t2 t1 t0
1053 PALIGNR mm2, mm1, 7, mm5 ; t6 t5 t4 t3 t2 t1 t0 lt
1054 PALIGNR mm1, mm0, 7, mm6 ; l0 l1 l2 l3 l4 l5 l6 l7
1055 PALIGNR mm4, mm3, 1, mm7 ; t0 lt l0 l1 l2 l3 l4 l5
1058 PRED8x8_LOWPASS b, mm0, mm4, mm1, mm5, mm7
1060 movq mm1, mm2 ; t6 t5 t4 t3 t2 t1 t0 lt
1061 psrlq mm4, 16 ; .. .. t6 t5 t4 t3 t2 t1
1062 psrlq mm1, 8 ; .. t6 t5 t4 t3 t2 t1 t0
1063 PRED8x8_LOWPASS b, mm6, mm4, mm2, mm1, mm5
1066 punpcklbw mm3, mm0 ; p4 p3 p2 p1
1067 punpckhbw mm7, mm0 ; p8 p7 p6 p5
1071 movq [r0+3*FDEC_STRIDE], mm3
1072 PALIGNR mm7, mm3, 2, mm5
1073 movq [r0+2*FDEC_STRIDE], mm7
1074 PALIGNR mm1, mm3, 4, mm5
1075 movq [r0+1*FDEC_STRIDE], mm1
1076 PALIGNR mm0, mm3, 6, mm3
1077 movq [r0+0*FDEC_STRIDE], mm0
1080 movq [r0-1*FDEC_STRIDE], mm4
1081 PALIGNR mm6, mm4, 2, mm5
1082 movq [r0-2*FDEC_STRIDE], mm6
1083 PALIGNR mm2, mm4, 4, mm5
1084 movq [r0-3*FDEC_STRIDE], mm2
1085 PALIGNR mm3, mm4, 6, mm4
1086 movq [r0-4*FDEC_STRIDE], mm3
1089 ;-----------------------------------------------------------------------------
1090 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1091 ;-----------------------------------------------------------------------------
1092 %macro PREDICT_8x8_HD 1
1093 cglobal predict_8x8_hd_%1, 2,2
1094 add r0, 4*FDEC_STRIDE
1096 movdqa xmm1, [r1+16]
1099 PALIGNR xmm1, xmm0, 7, xmm4
1100 PALIGNR xmm2, xmm0, 9, xmm5
1101 PALIGNR xmm3, xmm0, 8, xmm0
1104 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
1105 punpcklbw xmm4, xmm0
1110 movq [r0+(Y)*FDEC_STRIDE], xmm4
1111 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1116 movq [r0+(Y)*FDEC_STRIDE], xmm4
1117 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1123 %define PALIGNR PALIGNR_SSSE3
1124 PREDICT_8x8_HD ssse3
1126 %define PALIGNR PALIGNR_MMX
1128 ;-----------------------------------------------------------------------------
1129 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
1130 ;-----------------------------------------------------------------------------
1131 %macro PREDICT_8x8_HU 1
1132 cglobal predict_8x8_hu_%1, 2,2
1133 add r0, 4*FDEC_STRIDE
1136 movq mm6, [pb_reverse]
1147 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
1148 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1152 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
1153 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
1159 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
1161 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
1164 PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
1168 punpcklbw xmm0, xmm1
1172 movq [r0+Y*FDEC_STRIDE], xmm0
1176 pshufw mm5, mm4, 11111001b
1177 pshufw mm6, mm4, 11111110b
1178 pshufw mm7, mm4, 11111111b
1179 movq [r0+Y*FDEC_STRIDE], xmm0
1180 movq [r0+0*FDEC_STRIDE], mm4
1181 movq [r0+1*FDEC_STRIDE], mm5
1182 movq [r0+2*FDEC_STRIDE], mm6
1183 movq [r0+3*FDEC_STRIDE], mm7
1188 PREDICT_8x8_HU ssse3
1190 ;-----------------------------------------------------------------------------
1191 ; void predict_8x8c_v( uint8_t *src )
1192 ;-----------------------------------------------------------------------------
1193 cglobal predict_8x8c_v_mmx, 1,1
1194 movq mm0, [r0 - FDEC_STRIDE]
1198 ;-----------------------------------------------------------------------------
1199 ; void predict_8x8c_h( uint8_t *src )
1200 ;-----------------------------------------------------------------------------
1202 %macro PRED_8x8C_H 1
1203 cglobal predict_8x8c_h_%1, 1,1
1207 add r0, FDEC_STRIDE*4
1210 SPLATB m0, r0+FDEC_STRIDE*n-1, m1
1211 mova [r0+FDEC_STRIDE*n], m0
1218 %define SPLATB SPLATB_MMX
1220 %define SPLATB SPLATB_SSSE3
1223 ;-----------------------------------------------------------------------------
1224 ; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
1225 ;-----------------------------------------------------------------------------
1226 cglobal predict_8x8c_dc_core_mmxext, 1,1
1227 movq mm0, [r0 - FDEC_STRIDE]
1232 psadbw mm1, mm2 ; s1
1233 psadbw mm0, mm2 ; s0
1248 pshufw mm0, mm0, 0 ; dc0 (w)
1250 psrlw mm3, 3 ; dc3 (w)
1251 psrlw mm2, 2 ; dc2 (w)
1252 psrlw mm1, 2 ; dc1 (w)
1254 packuswb mm0, mm1 ; dc0,dc1 (b)
1255 packuswb mm2, mm3 ; dc2,dc3 (b)
1260 cglobal predict_8x8c_dc_top_mmxext, 1,1
1261 movq mm0, [r0 - FDEC_STRIDE]
1266 psadbw mm1, mm2 ; s1
1267 psadbw mm0, mm2 ; s0
1273 pshufw mm0, mm0, 0 ; dc0 (w)
1274 packuswb mm0, mm1 ; dc0,dc1 (b)
1278 ;-----------------------------------------------------------------------------
1279 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
1280 ;-----------------------------------------------------------------------------
1282 cglobal predict_8x8c_p_core_sse2, 1,1
1286 pshuflw xmm0, xmm0, 0
1287 pshuflw xmm2, xmm2, 0
1288 pshuflw xmm4, xmm4, 0
1289 punpcklqdq xmm0, xmm0
1290 punpcklqdq xmm2, xmm2
1291 punpcklqdq xmm4, xmm4
1292 pmullw xmm2, [pw_76543210]
1293 paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1298 add r0, FDEC_STRIDE*4
1305 movq [r0+FDEC_STRIDE*0], xmm0
1306 movhps [r0+FDEC_STRIDE*1], xmm0
1314 movq [r0+FDEC_STRIDE*2], xmm5
1315 movhps [r0+FDEC_STRIDE*3], xmm5
1320 ;-----------------------------------------------------------------------------
1321 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1322 ;-----------------------------------------------------------------------------
1323 cglobal predict_16x16_p_core_sse2, 1,2,8
1327 pshuflw xmm0, xmm0, 0
1328 pshuflw xmm1, xmm1, 0
1329 pshuflw xmm2, xmm2, 0
1330 punpcklqdq xmm0, xmm0
1331 punpcklqdq xmm1, xmm1
1332 punpcklqdq xmm2, xmm2
1334 pmullw xmm3, [pw_76543210]
1336 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1337 paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1355 movdqa [r0+FDEC_STRIDE*0], xmm3
1356 movdqa [r0+FDEC_STRIDE*1], xmm5
1359 add r0, FDEC_STRIDE*2
1364 ;-----------------------------------------------------------------------------
1365 ; void predict_16x16_v( pixel *src )
1366 ;-----------------------------------------------------------------------------
1367 %ifdef HIGH_BIT_DEPTH
1369 cglobal predict_16x16_v_mmx, 1,2
1370 mova m0, [r0 - FDEC_STRIDEB+ 0]
1371 mova m1, [r0 - FDEC_STRIDEB+ 8]
1372 mova m2, [r0 - FDEC_STRIDEB+16]
1373 mova m3, [r0 - FDEC_STRIDEB+24]
1374 STORE16x16 m0, m1, m2, m3
1377 cglobal predict_16x16_v_sse2, 2,2
1378 mova m0, [r0 - FDEC_STRIDEB+ 0]
1379 mova m1, [r0 - FDEC_STRIDEB+16]
1380 STORE16x16_SSE2 m0, m1
1384 cglobal predict_16x16_v_mmx, 1,2
1385 movq m0, [r0 - FDEC_STRIDE + 0]
1386 movq m1, [r0 - FDEC_STRIDE + 8]
1390 cglobal predict_16x16_v_sse2, 1,1
1391 movdqa xmm0, [r0 - FDEC_STRIDE]
1392 STORE16x16_SSE2 xmm0
1396 ;-----------------------------------------------------------------------------
1397 ; void predict_16x16_h( pixel *src )
1398 ;-----------------------------------------------------------------------------
1399 %macro PRED_16x16_H 1
1400 cglobal predict_16x16_h_%1, 1,2
1401 mov r1, 12*FDEC_STRIDEB
1402 %ifdef HIGH_BIT_DEPTH
1406 movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL]
1408 mova [r0+r1+n*FDEC_STRIDEB+ 0], m0
1409 mova [r0+r1+n*FDEC_STRIDEB+16], m0
1411 mova [r0+r1+n*FDEC_STRIDEB+ 8], m0
1412 mova [r0+r1+n*FDEC_STRIDEB+24], m0
1424 SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
1425 mova [r0+r1+FDEC_STRIDE*n], m0
1427 mova [r0+r1+FDEC_STRIDE*n+8], m0
1431 %endif ; HIGH_BIT_DEPTH
1432 sub r1, 4*FDEC_STRIDEB
1438 %define SPLATB SPLATB_MMX
1441 %ifdef HIGH_BIT_DEPTH
1444 ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
1445 %define SPLATB SPLATB_SSSE3
1449 ;-----------------------------------------------------------------------------
1450 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1451 ;-----------------------------------------------------------------------------
1453 %macro PRED16x16_DC 2
1454 %ifdef HIGH_BIT_DEPTH
1455 mova m0, [r0 - FDEC_STRIDEB+ 0]
1456 paddw m0, [r0 - FDEC_STRIDEB+ 8]
1457 paddw m0, [r0 - FDEC_STRIDEB+16]
1458 paddw m0, [r0 - FDEC_STRIDEB+24]
1463 STORE16x16 m0, m0, m0, m0
1467 psadbw m0, [r0 - FDEC_STRIDE]
1468 psadbw m1, [r0 - FDEC_STRIDE + 8]
1473 packuswb m0, m0 ; dc in bytes
1479 cglobal predict_16x16_dc_core_mmxext, 1,2
1489 cglobal predict_16x16_dc_top_mmxext, 1,2
1490 PRED16x16_DC [pw_8], 4
1494 %ifdef HIGH_BIT_DEPTH
1495 cglobal predict_16x16_dc_left_core_mmxext, 1,2
1498 STORE16x16 m0, m0, m0, m0
1501 cglobal predict_16x16_dc_left_core_mmxext, 1,1
1509 ;-----------------------------------------------------------------------------
1510 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1511 ;-----------------------------------------------------------------------------
1513 %macro PRED16x16_DC_SSE2 2
1514 %ifdef HIGH_BIT_DEPTH
1515 mova m0, [r0 - FDEC_STRIDEB+ 0]
1516 paddw m0, [r0 - FDEC_STRIDEB+16]
1521 STORE16x16_SSE2 m0, m0
1524 psadbw m0, [r0 - FDEC_STRIDE]
1530 packuswb m0, m0 ; dc in bytes
1536 cglobal predict_16x16_dc_core_sse2, 2,2,4
1538 PRED16x16_DC_SSE2 m3, 5
1541 cglobal predict_16x16_dc_top_sse2, 1,2
1542 PRED16x16_DC_SSE2 [pw_8], 4
1546 %ifdef HIGH_BIT_DEPTH
1547 cglobal predict_16x16_dc_left_core_sse2, 1,2
1550 STORE16x16_SSE2 m0, m0
1553 cglobal predict_16x16_dc_left_core_sse2, 1,1