1 ;*****************************************************************************
2 ;* predict-a.asm: x86 intra prediction
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
34 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
35 pb_00s_ff: times 8 db 0
36 pb_0s_ff: times 7 db 0
; Visible fragment of an eight-row store sequence. The enclosing %macro header
; is not shown in this excerpt (presumably a two-argument 8x8 store -- confirm).
; r0 is advanced by four rows first so that all eight rows can be addressed
; with row offsets -4..+3: %1 is written to the upper four rows and %2 to the
; lower four. Within the lines shown, r0 is left four rows below its entry value.
52 add r0, 4*FDEC_STRIDEB
53 mova [r0 + -4*FDEC_STRIDEB], %1
54 mova [r0 + -3*FDEC_STRIDEB], %1
55 mova [r0 + -2*FDEC_STRIDEB], %1
56 mova [r0 + -1*FDEC_STRIDEB], %1
57 mova [r0 + 0*FDEC_STRIDEB], %2
58 mova [r0 + 1*FDEC_STRIDEB], %2
59 mova [r0 + 2*FDEC_STRIDEB], %2
60 mova [r0 + 3*FDEC_STRIDEB], %2
; Visible fragment, four-register form: %1..%4 are written side by side at
; byte offsets 0, 8, 16 and 24 of two consecutive rows, then r0 steps down by
; two rows. The macro header and any loop control around these lines are not
; shown in this excerpt (presumably the 4-argument path of STORE16x16 -- confirm).
67 mova [r0 + 0*FDEC_STRIDEB + 0], %1
68 mova [r0 + 1*FDEC_STRIDEB + 0], %1
69 mova [r0 + 0*FDEC_STRIDEB + 8], %2
70 mova [r0 + 1*FDEC_STRIDEB + 8], %2
71 mova [r0 + 0*FDEC_STRIDEB +16], %3
72 mova [r0 + 1*FDEC_STRIDEB +16], %3
73 mova [r0 + 0*FDEC_STRIDEB +24], %4
74 mova [r0 + 1*FDEC_STRIDEB +24], %4
75 add r0, 2*FDEC_STRIDEB
; Visible fragment, two-register form: %1 at byte offset 0 and %2 at byte
; offset 8 of four consecutive rows. These addresses use FDEC_STRIDE rather
; than FDEC_STRIDEB; r0 is not modified by the lines shown.
81 mova [r0 + 0*FDEC_STRIDE], %1
82 mova [r0 + 1*FDEC_STRIDE], %1
83 mova [r0 + 2*FDEC_STRIDE], %1
84 mova [r0 + 3*FDEC_STRIDE], %1
85 mova [r0 + 0*FDEC_STRIDE + 8], %2
86 mova [r0 + 1*FDEC_STRIDE + 8], %2
87 mova [r0 + 2*FDEC_STRIDE + 8], %2
88 mova [r0 + 3*FDEC_STRIDE + 8], %2
; STORE16x16_SSE2 reg [, reg2]
; Fills rows of the block at r0 with the given register(s); accepts one or two
; arguments. The conditional directives that select between the two groups
; below, any loop control, and the %endmacro are not shown in this excerpt.
; Both visible groups modify r0.
95 %macro STORE16x16_SSE2 1-2
; Two-register group: %1 at byte offset 0 and %2 at byte offset 16 of four
; consecutive rows, then r0 advances by four rows.
99 mova [r0+0*FDEC_STRIDEB+ 0], %1
100 mova [r0+0*FDEC_STRIDEB+16], %2
101 mova [r0+1*FDEC_STRIDEB+ 0], %1
102 mova [r0+1*FDEC_STRIDEB+16], %2
103 mova [r0+2*FDEC_STRIDEB+ 0], %1
104 mova [r0+2*FDEC_STRIDEB+16], %2
105 mova [r0+3*FDEC_STRIDEB+ 0], %1
106 mova [r0+3*FDEC_STRIDEB+16], %2
107 add r0, 4*FDEC_STRIDEB
; One-register group: r0 is biased by four rows so that eight rows can be
; addressed as -4..+3; %1 is written to those eight rows ...
111 add r0, 4*FDEC_STRIDEB
112 mova [r0 + -4*FDEC_STRIDEB], %1
113 mova [r0 + -3*FDEC_STRIDEB], %1
114 mova [r0 + -2*FDEC_STRIDEB], %1
115 mova [r0 + -1*FDEC_STRIDEB], %1
116 mova [r0 + 0*FDEC_STRIDEB], %1
117 mova [r0 + 1*FDEC_STRIDEB], %1
118 mova [r0 + 2*FDEC_STRIDEB], %1
119 mova [r0 + 3*FDEC_STRIDEB], %1
; ... then r0 moves down another eight rows and the same eight stores are
; repeated, giving sixteen rows of %1 in total.
120 add r0, 8*FDEC_STRIDEB
121 mova [r0 + -4*FDEC_STRIDEB], %1
122 mova [r0 + -3*FDEC_STRIDEB], %1
123 mova [r0 + -2*FDEC_STRIDEB], %1
124 mova [r0 + -1*FDEC_STRIDEB], %1
125 mova [r0 + 0*FDEC_STRIDEB], %1
126 mova [r0 + 1*FDEC_STRIDEB], %1
127 mova [r0 + 2*FDEC_STRIDEB], %1
128 mova [r0 + 3*FDEC_STRIDEB], %1
132 ; args: size (b or w), dest, left, right, src [, tmp]
133 ; output: dest = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
134 %macro PRED8x8_LOWPASS 5-6
151 %macro LOAD_PLANE_ARGS 0
166 ;-----------------------------------------------------------------------------
167 ; void predict_4x4_ddl( pixel *src )
168 ;-----------------------------------------------------------------------------
169 %macro PREDICT_4x4_DDL 4
170 cglobal predict_4x4_ddl_%1, 1,1
171 movu m1, [r0-FDEC_STRIDEB]
179 PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
184 movh [r0+Y*FDEC_STRIDEB], m0
191 %ifdef HIGH_BIT_DEPTH
193 PREDICT_4x4_DDL sse2, dq, 2, w
195 %define PALIGNR PALIGNR_MMX
; predict_4x4_ddl, MMX-register variant (this copy sits after an
; %ifdef HIGH_BIT_DEPTH line). Several instructions between the lines shown
; are missing from this excerpt, so only the visible data flow is described.
; All row offsets are written as even multiples of FDEC_STRIDE; presumably
; 2*FDEC_STRIDE equals one row in bytes for this build -- confirm.
196 cglobal predict_4x4_ddl_mmxext, 1,2
; Three overlapping 8-byte loads from the row addressed at r0-2*FDEC_STRIDE,
; at byte offsets +4, +0 and +2, feed the word-sized low-pass filter.
197 mova m1, [r0-2*FDEC_STRIDE+4]
198 mova m2, [r0-2*FDEC_STRIDE+0]
199 mova m3, [r0-2*FDEC_STRIDE+2]
200 PRED8x8_LOWPASS w, m0, m1, m2, m3
; Filter result m0 is stored at offset 0*FDEC_STRIDE.
201 mova [r0+0*FDEC_STRIDE], m0
; Two further loads from the same source row (byte offsets +6 and +8) feed a
; second filter pass whose result m4 is stored at offset 6*FDEC_STRIDE.
203 mova m5, [r0-2*FDEC_STRIDE+6]
204 mova m6, [r0-2*FDEC_STRIDE+8]
206 PRED8x8_LOWPASS w, m4, m7, m5, m6
207 mova [r0+6*FDEC_STRIDE], m4
; The two stores at offsets 4* and 2*FDEC_STRIDE each take m4 after a PALIGNR
; against m0 with a 6-byte shift. The instructions that prepare m4 and m0
; before each PALIGNR are not shown in this excerpt.
210 PALIGNR m4, m0, 6, m1
211 mova [r0+4*FDEC_STRIDE], m4
214 PALIGNR m4, m0, 6, m0
215 mova [r0+2*FDEC_STRIDE], m4
219 PREDICT_4x4_DDL mmxext, q , 8, b
222 ;-----------------------------------------------------------------------------
223 ; void predict_4x4_ddr( pixel *src )
224 ;-----------------------------------------------------------------------------
226 cglobal predict_4x4_ddr_%1, 1,1
227 movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
228 movq m2, [r0+0*FDEC_STRIDEB-8]
229 %ifdef HIGH_BIT_DEPTH
230 movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
232 movh m3, [r0-1*FDEC_STRIDEB]
234 PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
236 movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
237 PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
239 movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
240 PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
242 punpckh%2 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
243 movh m3, [r0-1*FDEC_STRIDEB]
245 PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
247 PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
249 PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
251 PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
253 movh [r0+Y*FDEC_STRIDEB], m0
257 movh [r0+Y*FDEC_STRIDEB], m0
261 cglobal predict_4x4_vr_%1, 1,1,6*(mmsize/16)
262 movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
264 %ifdef HIGH_BIT_DEPTH
265 movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
266 PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
268 movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
269 PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
271 movhps m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
272 PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
274 movhps m3, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
275 PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
277 PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
279 PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
281 PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
283 PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
285 PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
289 movh [r0+0*FDEC_STRIDEB], m5
290 movh [r0+1*FDEC_STRIDEB], m3
291 PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
293 movh [r0+2*FDEC_STRIDEB], m5
294 PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
295 movh [r0+3*FDEC_STRIDEB], m3
298 cglobal predict_4x4_hd_%1, 1,1,6*(mmsize/16)
299 movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
300 %ifdef HIGH_BIT_DEPTH
301 movh m1, [r0-1*FDEC_STRIDEB]
302 punpckl%6 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
303 psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
304 movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3
305 movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
306 punpckl%2 m1, m2 ; l2 l3
307 movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1
308 movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
309 punpckl%2 m2, m3 ; l0 l1
311 punpckl%6 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
312 psll%4 m0, %7 ; t2 t1 t0 lt .. .. .. ..
313 movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
314 punpckh%2 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
315 movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
316 punpckh%2 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
318 punpckh%3 m1, m2 ; l0 l1 l2 l3
319 punpckh%6 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
323 psrl%4 m0, %7*2 ; .. .. t2 t1 t0 lt l0 l1
324 psrl%4 m2, %7 ; .. t2 t1 t0 lt l0 l1 l2
326 PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
329 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
331 movh [r0+Y*FDEC_STRIDEB], m5
335 movh [r0+Y*FDEC_STRIDEB], m5
337 movh [r0+0*FDEC_STRIDEB], m3
341 %ifdef HIGH_BIT_DEPTH
343 %define PALIGNR PALIGNR_SSSE3
344 PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
347 %define PALIGNR PALIGNR_MMX
348 PREDICT_4x4 mmxext, bw, wd, q , b, dq , 8
349 %define PALIGNR PALIGNR_SSSE3
350 PREDICT_4x4 ssse3 , bw, wd, q , b, dq , 8
353 ;-----------------------------------------------------------------------------
354 ; void predict_4x4_hu( pixel *src )
355 ;-----------------------------------------------------------------------------
356 %ifdef HIGH_BIT_DEPTH
358 cglobal predict_4x4_hu_sse2, 1,1,6
359 movq mm0, [r0+0*FDEC_STRIDEB-4*2]
360 punpckhwd mm0, [r0+1*FDEC_STRIDEB-4*2]
361 movq mm1, [r0+2*FDEC_STRIDEB-4*2]
362 punpckhwd mm1, [r0+3*FDEC_STRIDEB-4*2]
364 pshufw mm1, mm1, 0xFF
374 PRED8x8_LOWPASS w, m4, m0, m2, m3, m5
377 movq [r0+0*FDEC_STRIDEB], m1
379 movq [r0+1*FDEC_STRIDEB], m1
381 movq [r0+2*FDEC_STRIDEB], m1
382 movq [r0+3*FDEC_STRIDEB], mm1
387 cglobal predict_4x4_hu_mmxext, 1,1
388 movq mm0, [r0+0*FDEC_STRIDE-8]
389 punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
390 movq mm1, [r0+2*FDEC_STRIDE-8]
391 punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
395 pshufw mm1, mm1, 0xFF
403 PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
406 movd [r0+Y*FDEC_STRIDE], mm7
410 movd [r0+Y*FDEC_STRIDE], mm7
412 movd [r0+3*FDEC_STRIDE], mm1
414 %endif ; HIGH_BIT_DEPTH
416 ;-----------------------------------------------------------------------------
417 ; void predict_4x4_vl( pixel *src )
418 ;-----------------------------------------------------------------------------
419 %macro PREDICT_4x4_V1 4
420 cglobal predict_4x4_vl_%1, 1,1,6*(mmsize/16)
421 movu m1, [r0-FDEC_STRIDEB]
428 PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
430 movh [r0+0*FDEC_STRIDEB], m4
431 movh [r0+1*FDEC_STRIDEB], m0
434 movh [r0+2*FDEC_STRIDEB], m4
435 movh [r0+3*FDEC_STRIDEB], m0
439 %ifdef HIGH_BIT_DEPTH
441 PREDICT_4x4_V1 sse2, dq, 2, w
444 %define PALIGNR PALIGNR_MMX
445 cglobal predict_4x4_vl_mmxext, 1,4
446 mova m1, [r0-FDEC_STRIDEB+0]
447 mova m2, [r0-FDEC_STRIDEB+8]
449 PALIGNR m2, m1, 4, m6
450 PALIGNR m3, m1, 2, m5
453 mova [r0+0*FDEC_STRIDEB], m4
455 mova [r0+2*FDEC_STRIDEB], m4
456 PRED8x8_LOWPASS w, m0, m1, m2, m3, m6
457 mova [r0+1*FDEC_STRIDEB], m0
459 mova [r0+3*FDEC_STRIDEB], m0
461 movzx r1d, word [r0-FDEC_STRIDEB+ 8]
462 movzx r2d, word [r0-FDEC_STRIDEB+10]
463 movzx r3d, word [r0-FDEC_STRIDEB+12]
469 mov [r0+2*FDEC_STRIDEB+6], r1w
470 mov [r0+3*FDEC_STRIDEB+6], r3w
474 PREDICT_4x4_V1 mmxext, q , 8, b
477 ;-----------------------------------------------------------------------------
478 ; void predict_4x4_dc( pixel *src )
479 ;-----------------------------------------------------------------------------
480 %ifdef HIGH_BIT_DEPTH
; predict_4x4_dc, high-bit-depth path (this copy follows an
; %ifdef HIGH_BIT_DEPTH line). One argument in r0 = src; no extra GPRs.
; The reduction between the loads and the stores is not shown in this excerpt.
482 cglobal predict_4x4_dc_mmxext, 1,1
; Word-wise accumulation of the data that starts four pixels to the left of
; rows 0..3 (the left-neighbour column is inside each of these loads).
483 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
484 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
485 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
486 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
; Load the row directly above the block.
488 mova m0, [r0-FDEC_STRIDEB]
; Write the same register to all four rows of the block.
494 mova [r0+0*FDEC_STRIDEB], m0
495 mova [r0+1*FDEC_STRIDEB], m0
496 mova [r0+2*FDEC_STRIDEB], m0
497 mova [r0+3*FDEC_STRIDEB], m0
; predict_4x4_dc, second definition of the same symbol (presumably the
; non-HIGH_BIT_DEPTH branch; the %else line is not shown in this excerpt).
; One argument in r0 = src; four GPRs are requested (r1 and r2 are used in
; the lines shown). The arithmetic between the loads and stores is not shown.
502 cglobal predict_4x4_dc_mmxext, 1,4
; Four bytes from the row above the block.
504 movd mm0, [r0-FDEC_STRIDE]
; Left neighbour of row 0, zero-extended.
507 movzx r1d, byte [r0-1]
; Left neighbour of row n; n is defined outside the lines shown
; (presumably a %rep counter -- confirm).
510 movzx r2d, byte [r0+FDEC_STRIDE*n-1]
; The 32-bit value in r1d is written to each of the four rows.
517 mov [r0+FDEC_STRIDE*0], r1d
518 mov [r0+FDEC_STRIDE*1], r1d
519 mov [r0+FDEC_STRIDE*2], r1d
520 mov [r0+FDEC_STRIDE*3], r1d
522 %endif ; HIGH_BIT_DEPTH
524 %macro PREDICT_FILTER 1
525 ;-----------------------------------------------------------------------------
526 ; void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
527 ;-----------------------------------------------------------------------------
529 cglobal predict_8x8_filter_%1, 4,5
542 movq mm0, [src+0*FDEC_STRIDE-8]
543 punpckhbw mm0, [src-1*FDEC_STRIDE-8]
544 movq mm1, [src+2*FDEC_STRIDE-8]
545 punpckhbw mm1, [src+1*FDEC_STRIDE-8]
547 movq mm2, [src+4*FDEC_STRIDE-8]
548 punpckhbw mm2, [src+3*FDEC_STRIDE-8]
549 movq mm3, [src+6*FDEC_STRIDE-8]
550 punpckhbw mm3, [src+5*FDEC_STRIDE-8]
553 movq mm0, [src+7*FDEC_STRIDE-8]
554 movq mm1, [src-1*FDEC_STRIDE]
557 PALIGNR mm4, mm0, 7, mm0
558 PALIGNR mm1, mm2, 1, mm2
563 PRED8x8_LOWPASS b, mm2, mm1, mm4, mm3, mm5
566 PRED8x8_LOWPASS b, mm1, mm3, mm0, mm4, mm5
572 movq mm0, [src-1*FDEC_STRIDE-8]
573 movq mm3, [src-1*FDEC_STRIDE]
574 movq mm1, [src-1*FDEC_STRIDE+8]
577 PALIGNR mm2, mm0, 7, mm0
578 PALIGNR mm1, mm4, 1, mm4
584 PRED8x8_LOWPASS b, mm4, mm2, mm1, mm3, mm5
590 movq mm0, [src-1*FDEC_STRIDE+8]
595 PALIGNR mm2, mm3, 7, mm3
596 PALIGNR mm5, mm4, 1, mm4
597 PRED8x8_LOWPASS b, mm1, mm2, mm5, mm0, mm4
601 pshufw mm1, mm3, 0xFF
633 %define PALIGNR PALIGNR_MMX
635 PREDICT_FILTER mmxext
636 %define PALIGNR PALIGNR_SSSE3
639 ;-----------------------------------------------------------------------------
640 ; void predict_8x8_v( pixel *src, pixel *edge )
641 ;-----------------------------------------------------------------------------
642 %macro PREDICT_8x8_V 1
643 cglobal predict_8x8_v_%1, 2,2
644 mova m0, [r1+16*SIZEOF_PIXEL]
649 %ifdef HIGH_BIT_DEPTH
657 ;-----------------------------------------------------------------------------
658 ; void predict_8x8_h( pixel *src, pixel edge[33] )
659 ;-----------------------------------------------------------------------------
660 %macro PREDICT_8x8_H 3
661 cglobal predict_8x8_h_%1, 2,2
662 movu m1, [r1+7*SIZEOF_PIXEL]
663 add r0, 4*FDEC_STRIDEB
670 SPLAT%3 m0, m %+ i, (3-n)&3
671 mova [r0+(n-4)*FDEC_STRIDEB], m0
677 %ifdef HIGH_BIT_DEPTH
679 PREDICT_8x8_H sse2 , wd, D
682 PREDICT_8x8_H mmxext, bw, W
685 ;-----------------------------------------------------------------------------
686 ; void predict_8x8_dc( pixel *src, pixel *edge );
687 ;-----------------------------------------------------------------------------
688 %ifdef HIGH_BIT_DEPTH
690 cglobal predict_8x8_dc_sse2, 2,2
702 cglobal predict_8x8_dc_mmxext, 2,2
714 %endif ; HIGH_BIT_DEPTH
716 ;-----------------------------------------------------------------------------
717 ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
718 ; void predict_8x8_dc_left( pixel *src, pixel *edge );
719 ;-----------------------------------------------------------------------------
720 %ifdef HIGH_BIT_DEPTH
732 PRED8x8_DC predict_8x8_dc_top_sse2 , 32, mova
733 PRED8x8_DC predict_8x8_dc_left_sse2, 14, movu
748 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
749 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
750 %endif ; HIGH_BIT_DEPTH
753 ; sse2 is faster even on amd, so there's no sense in spending exe size on these
754 ; functions if we know sse2 is available.
756 ;-----------------------------------------------------------------------------
757 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
758 ;-----------------------------------------------------------------------------
759 cglobal predict_8x8_ddl_mmxext, 2,2
766 add r0, FDEC_STRIDE*4
767 PRED8x8_LOWPASS b, mm0, mm1, mm2, mm5, mm7
768 PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+24], mm6
771 movq [r0+Y*FDEC_STRIDE], mm1
779 movq [r0+Y*FDEC_STRIDE], mm1
784 movq [r0+Y*FDEC_STRIDE], mm1
787 ;-----------------------------------------------------------------------------
788 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
789 ;-----------------------------------------------------------------------------
790 cglobal predict_8x8_ddr_mmxext, 2,2
795 add r0, FDEC_STRIDE*4
796 PRED8x8_LOWPASS b, mm0, mm1, mm2, [r1+8], mm7
797 PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+16], mm6
800 movq [r0+Y*FDEC_STRIDE], mm0
808 movq [r0+Y*FDEC_STRIDE], mm0
813 movq [r0+Y*FDEC_STRIDE], mm0
816 ;-----------------------------------------------------------------------------
817 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
818 ;-----------------------------------------------------------------------------
819 %define PALIGNR PALIGNR_MMX
; predict_8x8_hu, MMX variant. r0 = src, r1 = edge. Some instructions between
; the lines shown are missing from this excerpt. The register-content comments
; list bytes from most significant to least significant.
820 cglobal predict_8x8_hu_mmxext, 2,2
821 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
; Bias r0 by four rows so the eight output rows are addressed as -4..+3.
822 add r0, 4*FDEC_STRIDE
823 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
824 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
828 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
834 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
836 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
; Byte-sized low-pass filter; result lands in mm1, mm6 is the temporary.
838 PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
; Interleave with the filtered bytes in mm1.
840 punpcklbw mm4, mm1 ; p4 p3 p2 p1
841 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; Rows -3..-1 are formed by PALIGNR against mm4 with byte shifts 2, 4 and 6.
; Rows 1..3 are word shuffles of mm6, mm7 and mm0. Each pshufw reads its
; source register before the following PALIGNR overwrites it, so the
; interleaved order of these six instructions must be kept.
845 PALIGNR mm5, mm4, 2, mm1
846 pshufw mm1, mm6, 11111001b
847 PALIGNR mm6, mm4, 4, mm2
848 pshufw mm2, mm7, 11111110b
849 PALIGNR mm7, mm4, 6, mm3
850 pshufw mm3, mm0, 11111111b
; Rows -4..-1 come from mm4..mm7, rows 0..3 from mm0..mm3.
851 movq [r0-4*FDEC_STRIDE], mm4
852 movq [r0-3*FDEC_STRIDE], mm5
853 movq [r0-2*FDEC_STRIDE], mm6
854 movq [r0-1*FDEC_STRIDE], mm7
855 movq [r0+0*FDEC_STRIDE], mm0
856 movq [r0+1*FDEC_STRIDE], mm1
857 movq [r0+2*FDEC_STRIDE], mm2
858 movq [r0+3*FDEC_STRIDE], mm3
861 ;-----------------------------------------------------------------------------
862 ; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
863 ;-----------------------------------------------------------------------------
865 ; fills only some pixels:
876 cglobal predict_8x8_vr_core_mmxext, 2,2
882 add r0, FDEC_STRIDE*4
883 PRED8x8_LOWPASS b, mm0, mm1, mm2, mm4, mm7
887 movq [r0+ Y *FDEC_STRIDE], mm3
888 movq [r0+(Y+1)*FDEC_STRIDE], mm0
893 movq [r0+ Y *FDEC_STRIDE], mm3
894 movq [r0+(Y+1)*FDEC_STRIDE], mm0
898 ;-----------------------------------------------------------------------------
899 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
900 ;-----------------------------------------------------------------------------
901 cglobal predict_8x8c_p_core_mmxext, 1,2
904 pmullw mm2, [pw_3210]
906 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
907 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
926 ;-----------------------------------------------------------------------------
927 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
928 ;-----------------------------------------------------------------------------
929 cglobal predict_16x16_p_core_mmxext, 1,2
933 pmullw mm5, [pw_3210]
937 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
938 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
939 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
940 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
968 %endif ; !ARCH_X86_64
971 ;-----------------------------------------------------------------------------
972 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
973 ;-----------------------------------------------------------------------------
974 cglobal predict_8x8_ddl_sse2, 2,2
979 add r0, FDEC_STRIDE*4
980 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
985 movq [r0+Y*FDEC_STRIDE], xmm0
990 ;-----------------------------------------------------------------------------
991 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
992 ;-----------------------------------------------------------------------------
993 cglobal predict_8x8_ddr_sse2, 2,2
998 add r0, FDEC_STRIDE*4
999 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
1005 movq [r0+Y*FDEC_STRIDE], xmm0
1006 movq [r0+(Y-1)*FDEC_STRIDE], xmm1
1011 movq [r0-3*FDEC_STRIDE], xmm0
1012 movq [r0-4*FDEC_STRIDE], xmm1
1016 ;-----------------------------------------------------------------------------
1017 ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
1018 ;-----------------------------------------------------------------------------
1019 cglobal predict_8x8_vl_sse2, 2,2
1020 movdqa xmm4, [r1+16]
1027 add r0, FDEC_STRIDE*4
1028 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
1029 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
1030 ; xmm3: (t0 + t1 + 1) >> 1
1035 movq [r0+ Y *FDEC_STRIDE], xmm3
1036 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1041 movq [r0+ Y *FDEC_STRIDE], xmm3
1042 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1046 ;-----------------------------------------------------------------------------
1047 ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
1048 ;-----------------------------------------------------------------------------
1049 cglobal predict_8x8_vr_sse2, 2,2,7
1051 movdqa xmm6, [pw_ff00]
1052 add r0, 4*FDEC_STRIDE
1059 PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
1065 movhps [r0-3*FDEC_STRIDE], xmm5
1066 movhps [r0-4*FDEC_STRIDE], xmm2
1075 movq [r0+Y*FDEC_STRIDE], xmm5
1076 movq [r0+(Y-1)*FDEC_STRIDE], xmm2
1082 ;-----------------------------------------------------------------------------
1083 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1084 ;-----------------------------------------------------------------------------
1085 %define PALIGNR PALIGNR_MMX
; predict_8x8_hd, MMX variant. r0 = src, r1 = edge. A few instructions between
; the lines shown are missing from this excerpt. The register-content comments
; list bytes from most significant to least significant.
1086 cglobal predict_8x8_hd_mmxext, 2,2
; Bias r0 by four rows so the eight output rows are addressed as -4..+3.
1087 add r0, 4*FDEC_STRIDE
; Three consecutive 8-byte groups of the edge buffer.
1088 movq mm0, [r1] ; l7 .. .. .. .. .. .. ..
1089 movq mm1, [r1+8] ; lt l0 l1 l2 l3 l4 l5 l6
1090 movq mm2, [r1+16] ; t7 t6 t5 t4 t3 t2 t1 t0
1091 movq mm3, mm1 ; lt l0 l1 l2 l3 l4 l5 l6
1092 movq mm4, mm2 ; t7 t6 t5 t4 t3 t2 t1 t0
; Build the shifted neighbour vectors; mm5, mm6 and mm7 are the PALIGNR temporaries.
1093 PALIGNR mm2, mm1, 7, mm5 ; t6 t5 t4 t3 t2 t1 t0 lt
1094 PALIGNR mm1, mm0, 7, mm6 ; l0 l1 l2 l3 l4 l5 l6 l7
1095 PALIGNR mm4, mm3, 1, mm7 ; t0 lt l0 l1 l2 l3 l4 l5
; First byte-sized low-pass filter; result in mm0.
1098 PRED8x8_LOWPASS b, mm0, mm4, mm1, mm5, mm7
1100 movq mm1, mm2 ; t6 t5 t4 t3 t2 t1 t0 lt
1101 psrlq mm4, 16 ; .. .. t6 t5 t4 t3 t2 t1
1102 psrlq mm1, 8 ; .. t6 t5 t4 t3 t2 t1 t0
; Second byte-sized low-pass filter; result in mm6.
1103 PRED8x8_LOWPASS b, mm6, mm4, mm2, mm1, mm5
; Interleave with the first filter result (mm0).
1106 punpcklbw mm3, mm0 ; p4 p3 p2 p1
1107 punpckhbw mm7, mm0 ; p8 p7 p6 p5
; Rows +3..0: row +3 is mm3 itself; each row above it is produced by a PALIGNR
; against mm3 with a byte shift growing by 2 (2, 4, 6).
1111 movq [r0+3*FDEC_STRIDE], mm3
1112 PALIGNR mm7, mm3, 2, mm5
1113 movq [r0+2*FDEC_STRIDE], mm7
1114 PALIGNR mm1, mm3, 4, mm5
1115 movq [r0+1*FDEC_STRIDE], mm1
; mm3 doubles as the temporary here, so it is not relied on afterwards.
1116 PALIGNR mm0, mm3, 6, mm3
1117 movq [r0+0*FDEC_STRIDE], mm0
; Rows -1..-4: same pattern against mm4 (row -1 is mm4 itself, then byte
; shifts 2, 4, 6). The instructions that set up mm4, mm2 and mm3 for this half
; are not shown in this excerpt.
1120 movq [r0-1*FDEC_STRIDE], mm4
1121 PALIGNR mm6, mm4, 2, mm5
1122 movq [r0-2*FDEC_STRIDE], mm6
1123 PALIGNR mm2, mm4, 4, mm5
1124 movq [r0-3*FDEC_STRIDE], mm2
1125 PALIGNR mm3, mm4, 6, mm4
1126 movq [r0-4*FDEC_STRIDE], mm3
1129 ;-----------------------------------------------------------------------------
1130 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1131 ;-----------------------------------------------------------------------------
1132 %macro PREDICT_8x8_HD 1
1133 cglobal predict_8x8_hd_%1, 2,2
1134 add r0, 4*FDEC_STRIDE
1136 movdqa xmm1, [r1+16]
1139 PALIGNR xmm1, xmm0, 7, xmm4
1140 PALIGNR xmm2, xmm0, 9, xmm5
1141 PALIGNR xmm3, xmm0, 8, xmm0
1144 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
1145 punpcklbw xmm4, xmm0
1150 movq [r0+(Y)*FDEC_STRIDE], xmm4
1151 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1156 movq [r0+(Y)*FDEC_STRIDE], xmm4
1157 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1163 %define PALIGNR PALIGNR_SSSE3
1164 PREDICT_8x8_HD ssse3
1166 %define PALIGNR PALIGNR_MMX
1168 ;-----------------------------------------------------------------------------
1169 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
1170 ;-----------------------------------------------------------------------------
1171 %macro PREDICT_8x8_HU 1
1172 cglobal predict_8x8_hu_%1, 2,2
1173 add r0, 4*FDEC_STRIDE
1176 movq mm6, [pb_reverse]
1187 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
1188 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1192 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
1193 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
1199 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
1201 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
1204 PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
1208 punpcklbw xmm0, xmm1
1212 movq [r0+Y*FDEC_STRIDE], xmm0
1216 pshufw mm5, mm4, 11111001b
1217 pshufw mm6, mm4, 11111110b
1218 pshufw mm7, mm4, 11111111b
1219 movq [r0+Y*FDEC_STRIDE], xmm0
1220 movq [r0+0*FDEC_STRIDE], mm4
1221 movq [r0+1*FDEC_STRIDE], mm5
1222 movq [r0+2*FDEC_STRIDE], mm6
1223 movq [r0+3*FDEC_STRIDE], mm7
1228 PREDICT_8x8_HU ssse3
1230 ;-----------------------------------------------------------------------------
1231 ; void predict_8x8c_v( uint8_t *src )
1232 ;-----------------------------------------------------------------------------
1233 cglobal predict_8x8c_v_mmx, 1,1
1234 movq mm0, [r0 - FDEC_STRIDE]
1238 ;-----------------------------------------------------------------------------
1239 ; void predict_8x8c_h( uint8_t *src )
1240 ;-----------------------------------------------------------------------------
1242 %macro PRED_8x8C_H 1
1243 cglobal predict_8x8c_h_%1, 1,1
1247 add r0, FDEC_STRIDE*4
1250 SPLATB m0, r0+FDEC_STRIDE*n-1, m1
1251 mova [r0+FDEC_STRIDE*n], m0
1258 %define SPLATB SPLATB_MMX
1260 %define SPLATB SPLATB_SSSE3
1263 ;-----------------------------------------------------------------------------
1264 ; void predict_8x8c_dc( pixel *src )
1265 ;-----------------------------------------------------------------------------
1267 %macro PREDICT_8x8C_DC 1
1268 cglobal predict_8x8c_dc_%1, 1,3
1270 %ifdef HIGH_BIT_DEPTH
1271 movq m0, [r0-FDEC_STRIDEB+0]
1272 movq m1, [r0-FDEC_STRIDEB+8]
1276 movd m0, [r0-FDEC_STRIDEB+0]
1277 movd m1, [r0-FDEC_STRIDEB+4]
1281 add r0, FDEC_STRIDEB*4
1283 movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
1284 movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
1286 movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
1288 movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
1292 movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
1293 movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
1295 movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
1297 movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
1303 punpckldq m0, m2 ; s0, s1, s2, s3
1304 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
1305 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
1308 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
1309 %ifdef HIGH_BIT_DEPTH
1312 punpcklwd xmm0, xmm0
1313 pshufd xmm1, xmm0, 11111010b
1314 punpckldq xmm0, xmm0
1317 %assign i (0 + (n/4))
1318 movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i
1328 %assign i (1 + (n/4)*2)
1329 %assign j (2 + (n/4)*2)
1330 movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i
1331 movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j
1343 %assign i (0 + (n/4))
1344 movq [r0+FDEC_STRIDEB*(n-4)], m %+ i
1352 PREDICT_8x8C_DC mmxext
1353 %ifdef HIGH_BIT_DEPTH
1354 PREDICT_8x8C_DC sse2
1357 cglobal predict_8x8c_dc_top_mmxext, 1,1
1358 movq mm0, [r0 - FDEC_STRIDE]
1363 psadbw mm1, mm2 ; s1
1364 psadbw mm0, mm2 ; s0
1370 pshufw mm0, mm0, 0 ; dc0 (w)
1371 packuswb mm0, mm1 ; dc0,dc1 (b)
1375 ;-----------------------------------------------------------------------------
1376 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
1377 ;-----------------------------------------------------------------------------
1379 cglobal predict_8x8c_p_core_sse2, 1,1
1383 pshuflw xmm0, xmm0, 0
1384 pshuflw xmm2, xmm2, 0
1385 pshuflw xmm4, xmm4, 0
1386 punpcklqdq xmm0, xmm0
1387 punpcklqdq xmm2, xmm2
1388 punpcklqdq xmm4, xmm4
1389 pmullw xmm2, [pw_76543210]
1390 paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1395 add r0, FDEC_STRIDE*4
1402 movq [r0+FDEC_STRIDE*0], xmm0
1403 movhps [r0+FDEC_STRIDE*1], xmm0
1411 movq [r0+FDEC_STRIDE*2], xmm5
1412 movhps [r0+FDEC_STRIDE*3], xmm5
1417 ;-----------------------------------------------------------------------------
1418 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1419 ;-----------------------------------------------------------------------------
1420 cglobal predict_16x16_p_core_sse2, 1,2,8
1424 pshuflw xmm0, xmm0, 0
1425 pshuflw xmm1, xmm1, 0
1426 pshuflw xmm2, xmm2, 0
1427 punpcklqdq xmm0, xmm0
1428 punpcklqdq xmm1, xmm1
1429 punpcklqdq xmm2, xmm2
1431 pmullw xmm3, [pw_76543210]
1433 paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1434 paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1452 movdqa [r0+FDEC_STRIDE*0], xmm3
1453 movdqa [r0+FDEC_STRIDE*1], xmm5
1456 add r0, FDEC_STRIDE*2
1461 ;-----------------------------------------------------------------------------
1462 ; void predict_16x16_v( pixel *src )
1463 ;-----------------------------------------------------------------------------
1464 %ifdef HIGH_BIT_DEPTH
; predict_16x16_v, MMX variant of the high-bit-depth branch.
; r0 = src; one argument, two GPRs requested.
; Loads the row directly above the block as four 8-byte pieces (byte offsets
; 0, 8, 16, 24) and hands them to STORE16x16 to be written into the block.
; The function return is not shown in this excerpt.
1466 cglobal predict_16x16_v_mmx, 1,2
1467 mova m0, [r0 - FDEC_STRIDEB+ 0]
1468 mova m1, [r0 - FDEC_STRIDEB+ 8]
1469 mova m2, [r0 - FDEC_STRIDEB+16]
1470 mova m3, [r0 - FDEC_STRIDEB+24]
1471 STORE16x16 m0, m1, m2, m3
; predict_16x16_v, SSE2 variant of the high-bit-depth branch.
; r0 = src. Loads the row directly above the block as two 16-byte pieces
; (byte offsets 0 and 16) and hands them to STORE16x16_SSE2.
; The function takes a single argument, so only one is declared; the second
; GPR is still requested as a scratch register, matching the MMX variant.
; The function return is not shown in this excerpt.
1474 cglobal predict_16x16_v_sse2, 1,2
1475 mova m0, [r0 - FDEC_STRIDEB+ 0]
1476 mova m1, [r0 - FDEC_STRIDEB+16]
1477 STORE16x16_SSE2 m0, m1
; predict_16x16_v, MMX variant, second definition of the same symbol
; (presumably the non-HIGH_BIT_DEPTH branch; the %else line is not shown).
; r0 = src. Loads the row directly above the block as two 8-byte halves.
; The store and return that follow are not shown in this excerpt.
1481 cglobal predict_16x16_v_mmx, 1,2
1482 movq m0, [r0 - FDEC_STRIDE + 0]
1483 movq m1, [r0 - FDEC_STRIDE + 8]
; predict_16x16_v, SSE2 variant, second definition of the same symbol
; (presumably the non-HIGH_BIT_DEPTH branch; the %else line is not shown).
; r0 = src; no scratch GPR is requested. One aligned 16-byte load fetches the
; row directly above the block, and the single-argument form of
; STORE16x16_SSE2 writes it into the block.
; The function return is not shown in this excerpt.
1487 cglobal predict_16x16_v_sse2, 1,1
1488 movdqa xmm0, [r0 - FDEC_STRIDE]
1489 STORE16x16_SSE2 xmm0
1493 ;-----------------------------------------------------------------------------
1494 ; void predict_16x16_h( pixel *src )
1495 ;-----------------------------------------------------------------------------
; PRED_16x16_H suffix
; Emits predict_16x16_h_<suffix>. r0 = src; r1 is a scratch row offset.
; The %rep/%if directives, the loop label and branch, and the %endmacro are
; not shown in this excerpt; n is defined outside the lines shown
; (presumably a %rep counter -- confirm).
1496 %macro PRED_16x16_H 1
1497 cglobal predict_16x16_h_%1, 1,2
; r1 starts as the byte offset of row 12 and is reduced by four rows at the
; end of the visible body, so rows are presumably handled bottom-up in groups
; of four -- confirm against the hidden loop control.
1498 mov r1, 12*FDEC_STRIDEB
1499 %ifdef HIGH_BIT_DEPTH
; High-bit-depth path: load from two pixels left of row n, then write m0 to
; that row. Two pairs of store offsets appear (0/16 and 8/24); which of them
; are assembled depends on conditionals not shown here.
1503 movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL]
1505 mova [r0+r1+n*FDEC_STRIDEB+ 0], m0
1506 mova [r0+r1+n*FDEC_STRIDEB+16], m0
1508 mova [r0+r1+n*FDEC_STRIDEB+ 8], m0
1509 mova [r0+r1+n*FDEC_STRIDEB+24], m0
; Other path: SPLATB is given the address of the byte immediately left of
; row n and fills m0, which is then written at byte offsets 0 and 8 of the row.
1521 SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
1522 mova [r0+r1+FDEC_STRIDE*n], m0
1524 mova [r0+r1+FDEC_STRIDE*n+8], m0
1528 %endif ; HIGH_BIT_DEPTH
; Step the row offset up by four rows for the next group.
1529 sub r1, 4*FDEC_STRIDEB
1535 %define SPLATB SPLATB_MMX
1538 %ifdef HIGH_BIT_DEPTH
1541 ; no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
1542 %define SPLATB SPLATB_SSSE3
1546 ;-----------------------------------------------------------------------------
1547 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1548 ;-----------------------------------------------------------------------------
; PRED16x16_DC arg1, arg2
; Shared body of the MMX 16x16 DC predictors. How the two macro arguments are
; used is not visible in this excerpt; one visible call site passes [pw_8], 4
; (presumably a rounding term and a shift count -- confirm).
; The reduction/broadcast steps, %else/%endif and %endmacro are not shown.
1550 %macro PRED16x16_DC 2
1551 %ifdef HIGH_BIT_DEPTH
; High-bit-depth path: word-wise sum of the four 8-byte pieces of the row
; directly above the block.
1552 mova m0, [r0 - FDEC_STRIDEB+ 0]
1553 paddw m0, [r0 - FDEC_STRIDEB+ 8]
1554 paddw m0, [r0 - FDEC_STRIDEB+16]
1555 paddw m0, [r0 - FDEC_STRIDEB+24]
; The same register is passed for all four store operands.
1560 STORE16x16 m0, m0, m0, m0
; Other path: psadbw against the two 8-byte halves of the row above. The
; initial contents of m0/m1 are set by lines not shown here.
1564 psadbw m0, [r0 - FDEC_STRIDE]
1565 psadbw m1, [r0 - FDEC_STRIDE + 8]
1570 packuswb m0, m0 ; dc in bytes
1576 cglobal predict_16x16_dc_core_mmxext, 1,2
1586 cglobal predict_16x16_dc_top_mmxext, 1,2
1587 PRED16x16_DC [pw_8], 4
1591 %ifdef HIGH_BIT_DEPTH
1592 cglobal predict_16x16_dc_left_core_mmxext, 1,2
1595 STORE16x16 m0, m0, m0, m0
1598 cglobal predict_16x16_dc_left_core_mmxext, 1,1
1606 ;-----------------------------------------------------------------------------
1607 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1608 ;-----------------------------------------------------------------------------
; PRED16x16_DC_SSE2 arg1, arg2
; Shared body of the SSE2 16x16 DC predictors. How the two macro arguments are
; used is not visible in this excerpt; visible call sites pass "m3, 5" and
; "[pw_8], 4" (presumably a rounding/bias term and a shift count -- confirm).
; The reduction/broadcast steps, %else/%endif and %endmacro are not shown.
1610 %macro PRED16x16_DC_SSE2 2
1611 %ifdef HIGH_BIT_DEPTH
; High-bit-depth path: word-wise sum of the two 16-byte halves of the row
; directly above the block.
1612 mova m0, [r0 - FDEC_STRIDEB+ 0]
1613 paddw m0, [r0 - FDEC_STRIDEB+16]
; The same register is passed for both store operands.
1618 STORE16x16_SSE2 m0, m0
; Other path: psadbw against the row above. The initial contents of m0 are
; set by lines not shown here.
1621 psadbw m0, [r0 - FDEC_STRIDE]
1627 packuswb m0, m0 ; dc in bytes
1633 cglobal predict_16x16_dc_core_sse2, 2,2,4
1635 PRED16x16_DC_SSE2 m3, 5
1638 cglobal predict_16x16_dc_top_sse2, 1,2
1639 PRED16x16_DC_SSE2 [pw_8], 4
1643 %ifdef HIGH_BIT_DEPTH
1644 cglobal predict_16x16_dc_left_core_sse2, 1,2
1647 STORE16x16_SSE2 m0, m0
1650 cglobal predict_16x16_dc_left_core_sse2, 1,1