1 ;*****************************************************************************
2 ;* predict-a.asm: x86 intra prediction
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2012 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
; Read-only constant tables shared by the prediction routines in this file.
; The shuf_* tables and pw_reverse are used as pshufb control masks: result
; byte i is taken from source byte table[i].
34 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 ; words 0..7; multiplied by b in the plane predictors
35 pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 ; words -3..4; multiplied by b in predict_8x%1c_p_core
38 pb_00s_ff: times 8 db 0 ; 8 zero bytes. NOTE(review): the name suggests a 0xff terminator follows; not visible in this view
39 pb_0s_ff: times 7 db 0 ; 7 zero bytes. NOTE(review): same remark as pb_00s_ff
; shuf_fixtr must stay immediately before shuf_nop: predict_8x8_filter
; addresses [shuf_fixtr+r2*4] to choose between the two 16-byte masks.
41 shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 ; keep bytes 0-7, replicate byte 7 into bytes 8-15
42 shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; identity permutation
43 shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0 ; low 8 bytes reversed, then byte 0 replicated (used by predict_8x8_hu_ssse3)
44 shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7 ; byte permutation applied in predict_8x8_vr
45 pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 ; reverses the order of the eight 16-bit words
; Fill eight consecutive rows starting at r0 (one row = FDEC_STRIDEB bytes):
; rows 0-3 receive %1 and rows 4-7 receive %2, one full-register store each.
; Side effect: r0 is advanced by four rows and is NOT restored.
; NOTE(review): the %macro/%endmacro lines wrapping this body are not visible
; in this view (presumably STORE8x8 -- confirm).
61 add r0, 4*FDEC_STRIDEB ; re-centre r0 so the eight rows are addressed as -4..+3
62 mova [r0 + -4*FDEC_STRIDEB], %1
63 mova [r0 + -3*FDEC_STRIDEB], %1
64 mova [r0 + -2*FDEC_STRIDEB], %1
65 mova [r0 + -1*FDEC_STRIDEB], %1
66 mova [r0 + 0*FDEC_STRIDEB], %2
67 mova [r0 + 1*FDEC_STRIDEB], %2
68 mova [r0 + 2*FDEC_STRIDEB], %2
69 mova [r0 + 3*FDEC_STRIDEB], %2
; Fill sixteen consecutive rows starting at r0 (one row = FDEC_STRIDEB bytes),
; four rows per argument: rows 0-3 = %1, 4-7 = %2, 8-11 = %3, 12-15 = %4.
; Side effect: r0 is advanced by twelve rows in total and is NOT restored.
; NOTE(review): the %macro/%endmacro lines wrapping this body are not visible
; in this view (presumably STORE8x16, which predict_8x16c_v invokes with four
; arguments -- confirm).
73 add r0, 4*FDEC_STRIDEB ; r0 -> row 4; rows 0-3 are now at offsets -4..-1
74 mova [r0 + -4*FDEC_STRIDEB], %1
75 mova [r0 + -3*FDEC_STRIDEB], %1
76 mova [r0 + -2*FDEC_STRIDEB], %1
77 mova [r0 + -1*FDEC_STRIDEB], %1
78 add r0, 4*FDEC_STRIDEB ; r0 -> row 8; rows 4-7 are now at offsets -4..-1
79 mova [r0 + -4*FDEC_STRIDEB], %2
80 mova [r0 + -3*FDEC_STRIDEB], %2
81 mova [r0 + -2*FDEC_STRIDEB], %2
82 mova [r0 + -1*FDEC_STRIDEB], %2
83 add r0, 4*FDEC_STRIDEB ; r0 -> row 12; rows 8-11 at -4..-1, rows 12-15 at 0..3
84 mova [r0 + -4*FDEC_STRIDEB], %3
85 mova [r0 + -3*FDEC_STRIDEB], %3
86 mova [r0 + -2*FDEC_STRIDEB], %3
87 mova [r0 + -1*FDEC_STRIDEB], %3
88 mova [r0 + 0*FDEC_STRIDEB], %4
89 mova [r0 + 1*FDEC_STRIDEB], %4
90 mova [r0 + 2*FDEC_STRIDEB], %4
91 mova [r0 + 3*FDEC_STRIDEB], %4
; Store two rows (row 0 and row 1 relative to r0), each assembled from four
; registers written at byte offsets 0, 8, 16 and 24: %1, %2, %3, %4.
; r0 is then advanced by two rows, so repeating this body walks down a block.
; NOTE(review): the enclosing %macro and any %rep around this body are not
; visible in this view.
98 mova [r0 + 0*FDEC_STRIDEB + 0], %1
99 mova [r0 + 1*FDEC_STRIDEB + 0], %1
100 mova [r0 + 0*FDEC_STRIDEB + 8], %2
101 mova [r0 + 1*FDEC_STRIDEB + 8], %2
102 mova [r0 + 0*FDEC_STRIDEB +16], %3
103 mova [r0 + 1*FDEC_STRIDEB +16], %3
104 mova [r0 + 0*FDEC_STRIDEB +24], %4
105 mova [r0 + 1*FDEC_STRIDEB +24], %4
106 add r0, 2*FDEC_STRIDEB ; step to the next pair of rows
; Store four rows (rows 0-3 relative to r0), each assembled from two
; registers: %1 at byte offset 0 and %2 at byte offset 8.
; r0 is then advanced by four rows (FDEC_STRIDE units, unlike the
; FDEC_STRIDEB used by the neighbouring store bodies).
; NOTE(review): the enclosing %macro and any %rep around this body are not
; visible in this view.
112 mova [r0 + 0*FDEC_STRIDE], %1
113 mova [r0 + 1*FDEC_STRIDE], %1
114 mova [r0 + 2*FDEC_STRIDE], %1
115 mova [r0 + 3*FDEC_STRIDE], %1
116 mova [r0 + 0*FDEC_STRIDE + 8], %2
117 mova [r0 + 1*FDEC_STRIDE + 8], %2
118 mova [r0 + 2*FDEC_STRIDE + 8], %2
119 mova [r0 + 3*FDEC_STRIDE + 8], %2
120 add r0, 4*FDEC_STRIDE ; step to the next group of four rows
; STORE16x16_SSE2 %1 [, %2]
; Fills rows of the block at r0 with full-register stores.  Two store
; shapes are visible below:
;   * two-register rows: %1 at byte offset 0 and %2 at byte offset 16, four
;     rows at a time, followed by r0 += 4 rows;
;   * single-register rows: sixteen rows of %1 only.
; r0 is modified in both shapes and is NOT restored.
; NOTE(review): the directives that choose between the two shapes (and any
; %rep around the first one) are not visible in this view -- presumably the
; choice depends on whether %2 was supplied; confirm.
126 %macro STORE16x16_SSE2 1-2
; --- two-register shape: each row is %1 followed by %2 ---
130 mova [r0+0*FDEC_STRIDEB+ 0], %1
131 mova [r0+0*FDEC_STRIDEB+16], %2
132 mova [r0+1*FDEC_STRIDEB+ 0], %1
133 mova [r0+1*FDEC_STRIDEB+16], %2
134 mova [r0+2*FDEC_STRIDEB+ 0], %1
135 mova [r0+2*FDEC_STRIDEB+16], %2
136 mova [r0+3*FDEC_STRIDEB+ 0], %1
137 mova [r0+3*FDEC_STRIDEB+16], %2
138 add r0, 4*FDEC_STRIDEB ; step to the next group of four rows
; --- single-register shape: rows 0-7, then rows 8-15, all %1 ---
142 add r0, 4*FDEC_STRIDEB ; r0 -> row 4; rows 0-7 are at offsets -4..+3
143 mova [r0 + -4*FDEC_STRIDEB], %1
144 mova [r0 + -3*FDEC_STRIDEB], %1
145 mova [r0 + -2*FDEC_STRIDEB], %1
146 mova [r0 + -1*FDEC_STRIDEB], %1
147 mova [r0 + 0*FDEC_STRIDEB], %1
148 mova [r0 + 1*FDEC_STRIDEB], %1
149 mova [r0 + 2*FDEC_STRIDEB], %1
150 mova [r0 + 3*FDEC_STRIDEB], %1
151 add r0, 8*FDEC_STRIDEB ; r0 -> row 12; rows 8-15 are at offsets -4..+3
152 mova [r0 + -4*FDEC_STRIDEB], %1
153 mova [r0 + -3*FDEC_STRIDEB], %1
154 mova [r0 + -2*FDEC_STRIDEB], %1
155 mova [r0 + -1*FDEC_STRIDEB], %1
156 mova [r0 + 0*FDEC_STRIDEB], %1
157 mova [r0 + 1*FDEC_STRIDEB], %1
158 mova [r0 + 2*FDEC_STRIDEB], %1
159 mova [r0 + 3*FDEC_STRIDEB], %1
163 ; dest, left, right, src, tmp
164 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
165 %macro PRED8x8_LOWPASS 4-5
180 ;-----------------------------------------------------------------------------
181 ; void predict_4x4_ddl( pixel *src )
182 ;-----------------------------------------------------------------------------
183 %macro PREDICT_4x4_DDL 0
184 cglobal predict_4x4_ddl, 1,1
185 movu m1, [r0-FDEC_STRIDEB]
190 pshufhw m1, m1, q2210
196 PRED8x8_LOWPASS m0, m2, m1, m0, m3
201 movh [r0+Y*FDEC_STRIDEB], m0
214 cglobal predict_4x4_ddl, 1,2
215 movu m1, [r0-FDEC_STRIDEB+4]
216 PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
217 mova m3, [r0-FDEC_STRIDEB+8]
218 mova [r0+0*FDEC_STRIDEB], m0
220 PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
221 mova [r0+3*FDEC_STRIDEB], m2
224 mova [r0+1*FDEC_STRIDEB], m1
226 PALIGNR m2, m0, 6, m0
227 mova [r0+2*FDEC_STRIDEB], m2
229 %else ; !HIGH_BIT_DEPTH
234 ;-----------------------------------------------------------------------------
235 ; void predict_4x4_vr( pixel *src )
236 ;-----------------------------------------------------------------------------
237 %if HIGH_BIT_DEPTH == 0
239 cglobal predict_4x4_vr, 1,1
240 movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
242 palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
244 palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
246 palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
248 palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
249 PRED8x8_LOWPASS m2, m0, m1, m2, m3
252 movd [r0+0*FDEC_STRIDEB], m4
254 movd [r0+1*FDEC_STRIDEB], m2
256 movd [r0+2*FDEC_STRIDEB], m4
258 movd [r0+3*FDEC_STRIDEB], m2
260 %endif ; !HIGH_BIT_DEPTH
262 ;-----------------------------------------------------------------------------
263 ; void predict_4x4_ddr( pixel *src )
264 ;-----------------------------------------------------------------------------
266 cglobal predict_4x4_ddr, 1,1
268 movu m2, [r0-1*FDEC_STRIDEB-8]
269 pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
270 pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
271 pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
272 movhps m3, [r0+3*FDEC_STRIDEB-8]
273 %else ; !HIGH_BIT_DEPTH
274 movd m0, [r0+2*FDEC_STRIDEB-4]
275 movd m1, [r0+0*FDEC_STRIDEB-4]
276 punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
277 punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
279 movd m2, [r0-1*FDEC_STRIDEB]
287 movd m3, [r0+3*FDEC_STRIDEB-4]
289 %endif ; !HIGH_BIT_DEPTH
293 PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
294 PRED8x8_LOWPASS m0, m2, m1, m0, m3
296 movh [r0+Y*FDEC_STRIDEB], m0
300 movh [r0+Y*FDEC_STRIDEB], m0
304 ;-----------------------------------------------------------------------------
305 ; void predict_4x4_vr( pixel *src )
306 ;-----------------------------------------------------------------------------
307 cglobal predict_4x4_vr, 1,1
309 movu m1, [r0-1*FDEC_STRIDEB-8]
310 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
311 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
312 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
313 %else ; !HIGH_BIT_DEPTH
314 movd m0, [r0+2*FDEC_STRIDEB-4]
315 movd m1, [r0+0*FDEC_STRIDEB-4]
316 punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
317 punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
319 movd m1, [r0-1*FDEC_STRIDEB]
327 %endif ; !HIGH_BIT_DEPTH
332 PRED8x8_LOWPASS m2, m0, m1, m2, m3
335 movh [r0+0*FDEC_STRIDEB], m4
336 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
337 movh [r0+1*FDEC_STRIDEB], m2
339 movh [r0+2*FDEC_STRIDEB], m4
340 PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
341 movh [r0+3*FDEC_STRIDEB], m2
344 ;-----------------------------------------------------------------------------
345 ; void predict_4x4_hd( pixel *src )
346 ;-----------------------------------------------------------------------------
347 cglobal predict_4x4_hd, 1,1
349 movu m1, [r0-1*FDEC_STRIDEB-8]
351 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
352 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
353 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
354 pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
356 movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
357 punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
358 PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
359 movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
360 punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
361 movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
362 punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
363 punpckh%3 m1, m2 ; l0 l1 l2 l3
364 punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
366 PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
367 PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
369 PRED8x8_LOWPASS m3, m1, m0, m2, m4
372 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
374 movh [r0+Y*FDEC_STRIDEB], m5
378 movh [r0+Y*FDEC_STRIDEB], m5
380 movh [r0+0*FDEC_STRIDEB], m3
382 %endmacro ; PREDICT_4x4
384 ;-----------------------------------------------------------------------------
385 ; void predict_4x4_ddr( pixel *src )
386 ;-----------------------------------------------------------------------------
389 cglobal predict_4x4_ddr, 1,1
390 mova m0, [r0+1*FDEC_STRIDEB-8]
391 punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
392 mova m3, [r0+3*FDEC_STRIDEB-8]
393 punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
397 pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
399 PRED8x8_LOWPASS m0, m1, m3, m0
400 movq [r0+3*FDEC_STRIDEB], m0
402 movq m2, [r0-1*FDEC_STRIDEB-0]
404 pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
406 PALIGNR m4, m3, 6, m3
407 PRED8x8_LOWPASS m1, m4, m2, m1
408 movq [r0+0*FDEC_STRIDEB], m1
413 PALIGNR m1, m0, 6, m0
414 movq [r0+1*FDEC_STRIDEB], m1
415 movq [r0+2*FDEC_STRIDEB], m2
416 movd [r0+3*FDEC_STRIDEB+4], m1
419 ;-----------------------------------------------------------------------------
420 ; void predict_4x4_hd( pixel *src )
421 ;-----------------------------------------------------------------------------
422 cglobal predict_4x4_hd, 1,1
423 mova m0, [r0+1*FDEC_STRIDEB-8]
424 punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
425 mova m1, [r0+3*FDEC_STRIDEB-8]
426 punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
430 movu m3, [r0-1*FDEC_STRIDEB-2]
434 PALIGNR m3, m1, 2, m2
435 PRED8x8_LOWPASS m2, m4, m1, m3
440 mova [r0+3*FDEC_STRIDEB], m5
441 mova [r0+1*FDEC_STRIDEB], m4
444 mova [r0+2*FDEC_STRIDEB], m5
447 mova m6, [r0-1*FDEC_STRIDEB+0]
448 pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
449 PRED8x8_LOWPASS m3, m4, m6, m7
450 PALIGNR m3, m0, 6, m0
451 mova [r0+0*FDEC_STRIDEB], m3
455 PREDICT_4x4 w, wd, dq, qdq
457 PREDICT_4x4 w, wd, dq, qdq
459 PREDICT_4x4 w, wd, dq, qdq
460 %else ; !HIGH_BIT_DEPTH
462 PREDICT_4x4 b, bw, wd, dq
464 %define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
465 PREDICT_4x4 b, bw, wd, dq
468 ;-----------------------------------------------------------------------------
469 ; void predict_4x4_hu( pixel *src )
470 ;-----------------------------------------------------------------------------
473 cglobal predict_4x4_hu_mmx2, 1,1
474 movq m0, [r0+0*FDEC_STRIDEB-8]
475 punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
476 movq m1, [r0+2*FDEC_STRIDEB-8]
477 punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
480 movq [r0+3*FDEC_STRIDEB], m1
484 PRED8x8_LOWPASS m3, m0, m4, m3
486 mova [r0+0*FDEC_STRIDEB], m4
490 mova [r0+1*FDEC_STRIDEB], m2
492 mova [r0+2*FDEC_STRIDEB], m2
495 %else ; !HIGH_BIT_DEPTH
497 cglobal predict_4x4_hu_mmx2, 1,1
498 movd m1, [r0+0*FDEC_STRIDEB-4]
499 punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
500 movd m0, [r0+2*FDEC_STRIDEB-4]
501 punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
513 PRED8x8_LOWPASS m3, m0, m2, m3, m4
514 movd [r0+3*FDEC_STRIDEB], m1
516 movd [r0+0*FDEC_STRIDEB], m5
518 movd [r0+1*FDEC_STRIDEB], m5
520 movd [r0+2*FDEC_STRIDEB], m5
522 %endif ; HIGH_BIT_DEPTH
524 ;-----------------------------------------------------------------------------
525 ; void predict_4x4_vl( pixel *src )
526 ;-----------------------------------------------------------------------------
527 %macro PREDICT_4x4_V1 1
528 cglobal predict_4x4_vl, 1,1
529 movu m1, [r0-FDEC_STRIDEB]
533 PRED8x8_LOWPASS m0, m1, m2, m3, m5
535 movh [r0+0*FDEC_STRIDEB], m4
536 movh [r0+1*FDEC_STRIDEB], m0
539 movh [r0+2*FDEC_STRIDEB], m4
540 movh [r0+3*FDEC_STRIDEB], m0
551 cglobal predict_4x4_vl, 1,4
552 mova m1, [r0-FDEC_STRIDEB+0]
553 mova m2, [r0-FDEC_STRIDEB+8]
555 PALIGNR m2, m1, 4, m4
556 PALIGNR m0, m1, 2, m4
559 mova [r0+0*FDEC_STRIDEB], m3
561 mova [r0+2*FDEC_STRIDEB], m3
562 PRED8x8_LOWPASS m0, m1, m2, m0
563 mova [r0+1*FDEC_STRIDEB], m0
565 mova [r0+3*FDEC_STRIDEB], m0
567 movzx r1d, word [r0-FDEC_STRIDEB+ 8]
568 movzx r2d, word [r0-FDEC_STRIDEB+10]
569 movzx r3d, word [r0-FDEC_STRIDEB+12]
575 mov [r0+2*FDEC_STRIDEB+6], r1w
576 mov [r0+3*FDEC_STRIDEB+6], r3w
578 %else ; !HIGH_BIT_DEPTH
583 ;-----------------------------------------------------------------------------
584 ; void predict_4x4_dc( pixel *src )
585 ;-----------------------------------------------------------------------------
588 cglobal predict_4x4_dc, 1,1
589 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
590 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
591 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
592 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
594 mova m0, [r0-FDEC_STRIDEB]
600 mova [r0+0*FDEC_STRIDEB], m0
601 mova [r0+1*FDEC_STRIDEB], m0
602 mova [r0+2*FDEC_STRIDEB], m0
603 mova [r0+3*FDEC_STRIDEB], m0
606 %else ; !HIGH_BIT_DEPTH
607 cglobal predict_4x4_dc, 1,4
609 movd mm0, [r0-FDEC_STRIDEB]
612 movzx r1d, byte [r0-1]
615 movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
622 mov [r0+FDEC_STRIDEB*0], r1d
623 mov [r0+FDEC_STRIDEB*1], r1d
624 mov [r0+FDEC_STRIDEB*2], r1d
625 mov [r0+FDEC_STRIDEB*3], r1d
627 %endif ; HIGH_BIT_DEPTH
; PREDICT_FILTER %1, %2, %3, %4
; Emits predict_8x8_filter, which writes lowpass-filtered neighbour pixels
; into edge[] (addressed through t1 below).
; Macro arguments as used in the visible lines:
;   %1%2 - suffix for punpckh (e.g. "bw" / "wd")
;   %1   - also the sub-register suffix on t4 for the scalar stores
;   %2   - also the suffix for pshuf
;   %3, %4 - uses are not visible in this view
; NOTE(review): many lines of this macro (the t1/t4 definitions, the
; neighbour/filter flag tests, labels and %endmacro) are not visible here;
; the notes below describe only the visible instructions.
629 %macro PREDICT_FILTER 4
630 ;-----------------------------------------------------------------------------
631 ; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
632 ;-----------------------------------------------------------------------------
633 cglobal predict_8x8_filter, 4,6,6
634 add r0, 0x58*SIZEOF_PIXEL ; bias the pointer...
635 %define src r0-0x58*SIZEOF_PIXEL ; ...and let "src" name the original address (presumably to shorten displacements -- confirm)
; Left column: interleave the high parts of the 8-pixel groups ending just
; left of each row, pairing neighbouring rows.
649 mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
650 punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)] ; row offset is selected at run time through t4
651 mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
652 punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
654 mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
655 punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
656 mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
657 punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
660 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
661 mova m1, [src-1*FDEC_STRIDEB]
662 PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0 ; one-pixel-shifted neighbour on one side of m3
663 PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2 ; one-pixel-shifted neighbour on the other side
664 PRED8x8_LOWPASS m3, m1, m4, m3, m5
665 mova [t1+8*SIZEOF_PIXEL], m3 ; filtered result -> edge[8..15]
; The last left pixels are handled with scalar code.
666 movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
667 movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
671 mov [t1+7*SIZEOF_PIXEL], t4%1 ; edge[7]
672 mov [t1+6*SIZEOF_PIXEL], t4%1 ; edge[6] receives the same value
; Top row, 8-bit SSSE3 variant: one 16-byte load covers top and top-right.
676 %if SIZEOF_PIXEL==1 && cpuflag(ssse3)
678 movu m3, [src-1*FDEC_STRIDEB]
679 movhps m0, [src-1*FDEC_STRIDEB-8]
688 pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
691 PALIGNR m2, m3, m0, 15, m0
692 PALIGNR m1, m3, 1, m5
693 PRED8x8_LOWPASS m0, m2, m1, m3, m5
694 mova [t1+16*SIZEOF_PIXEL], m0 ; filtered result -> edge[16..]
696 movd [t1+32*SIZEOF_PIXEL], m0 ; edge[32]
; Top row, generic variant: top and top-right are filtered separately.
704 mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
705 mova m3, [src-1*FDEC_STRIDEB]
706 mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
712 PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
713 PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
714 PRED8x8_LOWPASS m4, m2, m0, m3, m5
715 mova [t1+16*SIZEOF_PIXEL], m4 ; filtered top -> edge[16..23]
719 PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
720 PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
721 PRED8x8_LOWPASS m0, m2, m5, m1, m4
722 mova [t1+24*SIZEOF_PIXEL], m0 ; filtered top-right -> edge[24..31]
724 movd [t1+32*SIZEOF_PIXEL], m0 ; edge[32]
; Replicate the highest element of m3 across m1.
; NOTE(review): presumably the fallback used when the top-right neighbour
; is unavailable -- the branch that reaches here is not visible.
732 punpckh%1%2 m1, m3, m3
733 pshuf%2 m1, m1, q3333
740 PREDICT_FILTER w, d, q, dq
742 PREDICT_FILTER w, d, q, dq
744 PREDICT_FILTER w, d, q, dq
747 PREDICT_FILTER b, w, d, q
749 PREDICT_FILTER b, w, d, q
752 ;-----------------------------------------------------------------------------
753 ; void predict_8x8_v( pixel *src, pixel *edge )
754 ;-----------------------------------------------------------------------------
755 %macro PREDICT_8x8_V 0
756 cglobal predict_8x8_v, 2,2
757 mova m0, [r1+16*SIZEOF_PIXEL]
770 ;-----------------------------------------------------------------------------
771 ; void predict_8x8_h( pixel *src, pixel edge[36] )
772 ;-----------------------------------------------------------------------------
773 %macro PREDICT_8x8_H 2
774 cglobal predict_8x8_h, 2,2
775 movu m1, [r1+7*SIZEOF_PIXEL]
776 add r0, 4*FDEC_STRIDEB
782 SPLAT%2 m0, m %+ i, (3-Y)&3
783 mova [r0+(Y-4)*FDEC_STRIDEB], m0
797 ;-----------------------------------------------------------------------------
798 ; void predict_8x8_dc( pixel *src, pixel *edge );
799 ;-----------------------------------------------------------------------------
802 cglobal predict_8x8_dc, 2,2
812 %else ; !HIGH_BIT_DEPTH
814 cglobal predict_8x8_dc, 2,2
826 %endif ; HIGH_BIT_DEPTH
828 ;-----------------------------------------------------------------------------
829 ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
830 ; void predict_8x8_dc_left( pixel *src, pixel *edge );
831 ;-----------------------------------------------------------------------------
833 %macro PREDICT_8x8_DC 3
844 PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
845 PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
847 %else ; !HIGH_BIT_DEPTH
848 %macro PREDICT_8x8_DC 2
860 PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
861 PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
862 %endif ; HIGH_BIT_DEPTH
864 ; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
865 ; size on the 8-bit mmx functions below if we know sse2 is available.
; PREDICT_8x8_DDLR
; Emits predict_8x8_ddl and predict_8x8_ddr for the current instruction set.
; In both functions r0 = src and r1 = edge (see the C prototypes below);
; r0 is advanced by four rows so the block is addressed as rows -4..+3.
; NOTE(review): the %if/%else/%endif, %assign and %rep lines are not visible
; in this view; "Y" in the store addresses is a loop counter defined there.
866 %macro PREDICT_8x8_DDLR 0
867 ;-----------------------------------------------------------------------------
868 ; void predict_8x8_ddl( pixel *src, pixel *edge )
869 ;-----------------------------------------------------------------------------
870 cglobal predict_8x8_ddl, 2,2,7
871 mova m0, [r1+16*SIZEOF_PIXEL]
872 mova m1, [r1+24*SIZEOF_PIXEL]
; Variant A: build the shifted neighbours from the two aligned loads.
874 movd m5, [r1+32*SIZEOF_PIXEL]
875 palignr m3, m1, m0, 1*SIZEOF_PIXEL ; same data as an unaligned load from edge[17]
876 palignr m5, m5, m1, 1*SIZEOF_PIXEL ; same data as an unaligned load from edge[25]
877 palignr m4, m1, m0, 7*SIZEOF_PIXEL ; same data as an unaligned load from edge[23]
; Variant B: load the shifted neighbours directly with unaligned loads.
879 movu m3, [r1+17*SIZEOF_PIXEL]
880 movu m4, [r1+23*SIZEOF_PIXEL]
881 movu m5, [r1+25*SIZEOF_PIXEL]
884 add r0, FDEC_STRIDEB*4
; m2 is loaded on a line that is not visible in this view.
885 PRED8x8_LOWPASS m0, m2, m3, m0, m6
886 PRED8x8_LOWPASS m1, m4, m5, m1, m6
887 mova [r0+3*FDEC_STRIDEB], m1 ; last row comes straight from the second lowpass result
; Each remaining row is produced by shifting the m1:m0 pair and storing m1.
890 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
892 mova [r0+Y*FDEC_STRIDEB], m1
895 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 ; final step may clobber m0 as scratch
896 mova [r0+Y*FDEC_STRIDEB], m1
899 ;-----------------------------------------------------------------------------
900 ; void predict_8x8_ddr( pixel *src, pixel *edge )
901 ;-----------------------------------------------------------------------------
902 cglobal predict_8x8_ddr, 2,2,7
903 add r0, FDEC_STRIDEB*4
904 mova m0, [r1+ 8*SIZEOF_PIXEL]
905 mova m1, [r1+16*SIZEOF_PIXEL]
906 ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cacheline-split
907 movu m2, [r1+ 7*SIZEOF_PIXEL]
908 movu m5, [r1+17*SIZEOF_PIXEL]
; Variant A: derive the remaining neighbours from the aligned loads.
910 palignr m3, m1, m0, 1*SIZEOF_PIXEL ; same data as an unaligned load from edge[9]
911 palignr m4, m1, m0, 7*SIZEOF_PIXEL ; same data as an unaligned load from edge[15]
; Variant B: load them directly.
913 movu m3, [r1+ 9*SIZEOF_PIXEL]
914 movu m4, [r1+15*SIZEOF_PIXEL]
916 PRED8x8_LOWPASS m0, m2, m3, m0, m6
917 PRED8x8_LOWPASS m1, m4, m5, m1, m6
918 mova [r0+3*FDEC_STRIDEB], m0 ; last row comes straight from the first lowpass result
; Each remaining row is produced by shifting the m1:m0 pair and storing m1.
921 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
923 mova [r0+Y*FDEC_STRIDEB], m1
926 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 ; final step may clobber m0 as scratch
927 mova [r0+Y*FDEC_STRIDEB], m1
929 %endmacro ; PREDICT_8x8_DDLR
936 INIT_XMM ssse3, cache64
938 %elif ARCH_X86_64 == 0
943 ;-----------------------------------------------------------------------------
944 ; void predict_8x8_hu( pixel *src, pixel *edge )
945 ;-----------------------------------------------------------------------------
; PREDICT_8x8_HU %1, %2
;   %1 - element-size suffix for pshuf   (pshuf%1)
;   %2 - suffix for punpckh/punpckl      (punpckh%2 / punpckl%2)
; r0 = src, r1 = edge.  r0 is advanced by four rows so the eight output rows
; are addressed as -4..+3.
; NOTE(review): several lines (the %if selecting between the load variants,
; and the instructions that set m2/m3/m4 before their first use) are not
; visible in this view; %endmacro is also outside the visible lines.
946 %macro PREDICT_8x8_HU 2
947 cglobal predict_8x8_hu, 2,2,8
948 add r0, 4*FDEC_STRIDEB
; High-bit-depth variant A: load the left column and reverse its word order.
951 movu m5, [r1+7*SIZEOF_PIXEL]
952 pshufb m5, [pw_reverse]
; High-bit-depth variant B: load two halves and reverse the four words of
; each with pshuflw.
954 movq m6, [r1+7*SIZEOF_PIXEL]
955 movq m5, [r1+11*SIZEOF_PIXEL]
956 pshuflw m6, m6, q0123
957 pshuflw m5, m5, q0123
962 pshufhw m2, m2, q2210
963 pshufhw m3, m3, q1110
965 %else ; !HIGH_BIT_DEPTH
966 movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
967 pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
968 psllq m1, 56 ; l7 .. .. .. .. .. .. ..
975 mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
978 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
980 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
982 %endif ; !HIGH_BIT_DEPTH
983 PRED8x8_LOWPASS m2, m3, m5, m2, m6
; Interleave m4 with the lowpass result m2 into one 16-element sequence:
; m4 holds its low half, m0 its high half.
984 punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
985 punpckl%2 m4, m2 ; p4 p3 p2 p1
; Rows -4..-1: the sequence advanced by 0, 2, 4 and 6 pixels.
; Rows  0..+3: the high half advanced one pshuf element at a time, with
; the last element replicated (q3321, q3332, q3333).
986 PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
987 pshuf%1 m1, m0, q3321
988 PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
989 pshuf%1 m2, m0, q3332
990 PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
991 pshuf%1 m3, m0, q3333 ; m3 is free again: its last use as PALIGNR scratch is above
992 mova [r0-4*FDEC_STRIDEB], m4
993 mova [r0-3*FDEC_STRIDEB], m5
994 mova [r0-2*FDEC_STRIDEB], m6
995 mova [r0-1*FDEC_STRIDEB], m7
996 mova [r0+0*FDEC_STRIDEB], m0
997 mova [r0+1*FDEC_STRIDEB], m1
998 mova [r0+2*FDEC_STRIDEB], m2
999 mova [r0+3*FDEC_STRIDEB], m3
1005 PREDICT_8x8_HU d, wd
1007 PREDICT_8x8_HU d, wd
1009 PREDICT_8x8_HU d, wd
1010 %elif ARCH_X86_64 == 0
1012 PREDICT_8x8_HU w, bw
1015 ;-----------------------------------------------------------------------------
1016 ; void predict_8x8_vr( pixel *src, pixel *edge )
1017 ;-----------------------------------------------------------------------------
1018 %macro PREDICT_8x8_VR 1
1019 cglobal predict_8x8_vr, 2,3
1020 mova m2, [r1+16*SIZEOF_PIXEL]
1021 %ifidn cpuname, ssse3
1022 mova m0, [r1+8*SIZEOF_PIXEL]
1023 palignr m3, m2, m0, 7*SIZEOF_PIXEL
1024 palignr m1, m2, m0, 6*SIZEOF_PIXEL
1026 movu m3, [r1+15*SIZEOF_PIXEL]
1027 movu m1, [r1+14*SIZEOF_PIXEL]
1030 add r0, FDEC_STRIDEB*4
1031 PRED8x8_LOWPASS m3, m1, m2, m3, m5
1032 mova [r0-4*FDEC_STRIDEB], m4
1033 mova [r0-3*FDEC_STRIDEB], m3
1034 mova m1, [r1+8*SIZEOF_PIXEL]
1037 PRED8x8_LOWPASS m0, m1, m2, m0, m6
1041 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
1042 mova [r0+Y*FDEC_STRIDEB], m4
1047 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
1048 mova [r0+Y*FDEC_STRIDEB], m4
1059 %elif ARCH_X86_64 == 0
1064 %macro LOAD_PLANE_ARGS 0
1079 ;-----------------------------------------------------------------------------
1080 ; void predict_8x{8,16}c_p_core( uint8_t *src, int i00, int b, int c )
1081 ;-----------------------------------------------------------------------------
1082 %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
1083 %macro PREDICT_CHROMA_P_MMX 1
1084 cglobal predict_8x%1c_p_core, 1,2
1087 pmullw m2, [pw_3210]
1089 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
1090 paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
1107 %endmacro ; PREDICT_CHROMA_P_MMX
1110 PREDICT_CHROMA_P_MMX 8
1111 PREDICT_CHROMA_P_MMX 16
1112 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
1114 %macro PREDICT_CHROMA_P_XMM 1
1116 cglobal predict_8x%1c_p_core, 1,2,7
1120 mova m3, [pw_pixel_max]
1125 pmullw m2, [pw_43210123] ; b
1127 pmullw m5, m4, [pw_m7] ; c
1129 pmullw m5, m4, [pw_m3]
1140 add r0, FDEC_STRIDEB
1144 %else ; !HIGH_BIT_DEPTH
1145 cglobal predict_8x%1c_p_core, 1,2
1152 pmullw m2, [pw_76543210]
1153 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1163 movq [r0+FDEC_STRIDE*0], m0
1164 movhps [r0+FDEC_STRIDE*1], m0
1170 movq [r0+FDEC_STRIDE*2], m5
1171 movhps [r0+FDEC_STRIDE*3], m5
1172 add r0, FDEC_STRIDE*4
1176 %endif ; HIGH_BIT_DEPTH
1177 %endmacro ; PREDICT_CHROMA_P_XMM
1180 PREDICT_CHROMA_P_XMM 8
1181 PREDICT_CHROMA_P_XMM 16
1183 PREDICT_CHROMA_P_XMM 8
1184 PREDICT_CHROMA_P_XMM 16
1186 ;-----------------------------------------------------------------------------
1187 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1188 ;-----------------------------------------------------------------------------
1189 %if ARCH_X86_64 == 0
1191 cglobal predict_16x16_p_core, 1,2
1195 pmullw mm5, [pw_3210]
1199 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
1200 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1201 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
1202 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
1229 %endif ; !ARCH_X86_64
1231 %macro PREDICT_16x16_P 0
1232 cglobal predict_16x16_p_core, 1,2,8
1239 pmullw m3, m1, [pw_76543210]
1254 CLIPW m4, [pb_0], [pw_pixel_max]
1255 CLIPW m5, [pb_0], [pw_pixel_max]
1258 add r0, FDEC_STRIDEB
1262 %else ; !HIGH_BIT_DEPTH
1263 paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1264 paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1277 mova [r0+FDEC_STRIDE*0], m3
1278 mova [r0+FDEC_STRIDE*1], m5
1281 add r0, FDEC_STRIDE*2
1284 %endif ; !HIGH_BIT_DEPTH
1286 %endmacro ; PREDICT_16x16_P
1290 %if HIGH_BIT_DEPTH == 0
1295 %if HIGH_BIT_DEPTH == 0
1296 %macro PREDICT_8x8 0
1297 ;-----------------------------------------------------------------------------
1298 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
1299 ;-----------------------------------------------------------------------------
1300 cglobal predict_8x8_ddl, 2,2
1302 %ifidn cpuname, ssse3
1309 add r0, FDEC_STRIDE*4
1310 PRED8x8_LOWPASS m0, m1, m2, m0, m3
1315 movq [r0+Y*FDEC_STRIDE], m0
1320 %ifnidn cpuname, ssse3
1321 ;-----------------------------------------------------------------------------
1322 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
1323 ;-----------------------------------------------------------------------------
1324 cglobal predict_8x8_ddr, 2,2
1328 add r0, FDEC_STRIDE*4
1329 PRED8x8_LOWPASS m0, m1, m2, m0, m3
1334 movq [r0+Y*FDEC_STRIDE], m0
1335 movq [r0+(Y-1)*FDEC_STRIDE], m1
1340 movq [r0-3*FDEC_STRIDE], m0
1341 movq [r0-4*FDEC_STRIDE], m1
1344 ;-----------------------------------------------------------------------------
1345 ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
1346 ;-----------------------------------------------------------------------------
1347 cglobal predict_8x8_vl, 2,2
1352 add r0, FDEC_STRIDE*4
1353 PRED8x8_LOWPASS m0, m1, m2, m0, m5
1354 ; m0: (t0 + 2*t1 + t2 + 2) >> 2
1355 ; m3: (t0 + t1 + 1) >> 1
1360 movq [r0+ Y *FDEC_STRIDE], m3
1361 movq [r0+(Y+1)*FDEC_STRIDE], m0
1366 movq [r0+ Y *FDEC_STRIDE], m3
1367 movq [r0+(Y+1)*FDEC_STRIDE], m0
1371 ;-----------------------------------------------------------------------------
1372 ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
1373 ;-----------------------------------------------------------------------------
1374 cglobal predict_8x8_vr, 2,2
1376 add r0, 4*FDEC_STRIDE
1380 PRED8x8_LOWPASS m0, m2, m1, m0, m4
1381 movhps [r0-4*FDEC_STRIDE], m3
1382 movhps [r0-3*FDEC_STRIDE], m0
1385 pshufb m0, [shuf_vr]
1395 shufps m1, m2, q3210
1400 movq [r0+3*FDEC_STRIDE], m0
1401 movq [r0+2*FDEC_STRIDE], m3
1404 movq [r0+1*FDEC_STRIDE], m0
1405 movq [r0+0*FDEC_STRIDE], m3
1408 movq [r0-1*FDEC_STRIDE], m0
1409 movq [r0-2*FDEC_STRIDE], m3
1411 %endmacro ; PREDICT_8x8
1420 %endif ; !HIGH_BIT_DEPTH
1422 ;-----------------------------------------------------------------------------
1423 ; void predict_8x8_vl( pixel *src, pixel *edge )
1424 ;-----------------------------------------------------------------------------
; PREDICT_8x8_VL_10 %1
; r0 = src, r1 = edge.  r0 is advanced by four rows so the eight output rows
; are addressed as -4..+3.
; Even rows (-4, -2, 0, +2) are the m7:m6 pair advanced by 0..3 pixels;
; odd rows (-3, -1, +1, +3) are the lowpass result m1:m0 advanced by 1..4
; pixels.
; NOTE(review): the lines that compute m6/m7 (and m4/m5 before the lowpass
; calls), the use of %1, and %endmacro are not visible in this view.
1425 %macro PREDICT_8x8_VL_10 1
1426 cglobal predict_8x8_vl, 2,2,8
1427 mova m0, [r1+16*SIZEOF_PIXEL]
1428 mova m1, [r1+24*SIZEOF_PIXEL]
1429 PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 ; m1:m0 advanced by one pixel
1433 add r0, FDEC_STRIDEB*4
1434 mova [r0-4*FDEC_STRIDEB], m6
1435 PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
1436 mova [r0-2*FDEC_STRIDEB], m3
1437 PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
1438 mova [r0+0*FDEC_STRIDEB], m3
1439 PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5 ; last use of m7:m6, so m7 is overwritten in place
1440 mova [r0+2*FDEC_STRIDEB], m7
1441 PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
1443 PRED8x8_LOWPASS m0, m5, m2, m0, m7
1444 PRED8x8_LOWPASS m1, m3, m4, m1, m7
1445 PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
1446 mova [r0-3*FDEC_STRIDEB], m4
1447 PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
1448 mova [r0-1*FDEC_STRIDEB], m4
1449 PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
1450 mova [r0+1*FDEC_STRIDEB], m4
1451 PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2 ; last use of m1:m0, so m1 is overwritten in place
1452 mova [r0+3*FDEC_STRIDEB], m1
1467 ;-----------------------------------------------------------------------------
1468 ; void predict_8x8_hd( pixel *src, pixel *edge )
1469 ;-----------------------------------------------------------------------------
; PREDICT_8x8_HD %1, %2
;   %2 - suffix for punpckh/punpckl (punpckh%2 / punpckl%2)
;   %1 - use is not visible in this view
; r0 = src, r1 = edge.  r0 is advanced by four rows so the eight output rows
; are addressed as -4..+3.  Rows are written bottom-up: row +3 first, then
; each row above it is the same sequence advanced by two more pixels.
; NOTE(review): the %else/%endif of the cpuname test, the line that sets m3
; before the interleave, and %endmacro are not visible in this view.
1470 %macro PREDICT_8x8_HD 2
1471 cglobal predict_8x8_hd, 2,2
1472 add r0, 4*FDEC_STRIDEB
1473 mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
1474 movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
; SSSE3: derive the two shifted neighbour vectors with palignr from aligned
; loads; otherwise load them directly with unaligned loads.
1475 %ifidn cpuname, ssse3
1476 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
1477 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0
1478 palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt
1479 palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5
1481 movu m2, [r1+15*SIZEOF_PIXEL] ; same data as the palignr by 7 pixels above
1482 movu m4, [r1+ 9*SIZEOF_PIXEL] ; same data as the palignr by 1 pixel above
1485 PRED8x8_LOWPASS m0, m4, m1, m0, m5
1486 PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1
1487 PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0
1488 PRED8x8_LOWPASS m1, m4, m2, m1, m5
; Interleave m3 with the first lowpass result m0: m3 = low half, m2 = high half.
1490 punpckh%2 m2, m3, m0 ; p8 p7 p6 p5
1491 punpckl%2 m3, m0 ; p4 p3 p2 p1
1492 mova [r0+3*FDEC_STRIDEB], m3
1493 PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5
1494 mova [r0+2*FDEC_STRIDEB], m0
1495 PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5
1496 mova [r0+1*FDEC_STRIDEB], m0
1497 PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3 ; last use of m3, so it doubles as scratch
1498 mova [r0+0*FDEC_STRIDEB], m0
1499 mova [r0-1*FDEC_STRIDEB], m2
; Upper rows continue the sequence into the second lowpass result m1.
1500 PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5
1501 mova [r0-2*FDEC_STRIDEB], m0
1502 PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5
1503 mova [r0-3*FDEC_STRIDEB], m0
1504 PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2 ; last use of m2, so it doubles as scratch
1505 mova [r0-4*FDEC_STRIDEB], m1
1511 PREDICT_8x8_HD w, wd
1513 PREDICT_8x8_HD w, wd
1515 PREDICT_8x8_HD w, wd
1518 PREDICT_8x8_HD b, bw
1520 ;-----------------------------------------------------------------------------
1521 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1522 ;-----------------------------------------------------------------------------
1523 %macro PREDICT_8x8_HD 0
1524 cglobal predict_8x8_hd, 2,2
1525 add r0, 4*FDEC_STRIDE
1530 PRED8x8_LOWPASS m0, m1, m2, m3, m5
1536 movq [r0+(Y)*FDEC_STRIDE], m4
1537 movq [r0+(Y-4)*FDEC_STRIDE], m0
1542 movq [r0+(Y)*FDEC_STRIDE], m4
1543 movq [r0+(Y-4)*FDEC_STRIDE], m0
1551 %endif ; HIGH_BIT_DEPTH
1553 %if HIGH_BIT_DEPTH == 0
1554 ;-----------------------------------------------------------------------------
1555 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
1556 ;-----------------------------------------------------------------------------
1558 cglobal predict_8x8_hu_sse2, 2,2
1559 add r0, 4*FDEC_STRIDE
1560 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
1561 pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
1565 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
1566 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
1572 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
1574 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
1576 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
1580 punpcklbw xmm0, xmm1
1584 movq [r0+Y*FDEC_STRIDE], xmm0
1588 pshufw mm5, mm4, q3321
1589 pshufw mm6, mm4, q3332
1590 pshufw mm7, mm4, q3333
1591 movq [r0+Y*FDEC_STRIDE], xmm0
1592 movq [r0+0*FDEC_STRIDE], mm4
1593 movq [r0+1*FDEC_STRIDE], mm5
1594 movq [r0+2*FDEC_STRIDE], mm6
1595 movq [r0+3*FDEC_STRIDE], mm7
1599 cglobal predict_8x8_hu_ssse3, 2,2
1600 add r0, 4*FDEC_STRIDE
1602 pshufb m3, [shuf_hu]
1606 PRED8x8_LOWPASS m1, m3, m2, m1, m4
1610 movq [r0+ Y *FDEC_STRIDE], m0
1611 movhps [r0+(Y+4)*FDEC_STRIDE], m0
1613 pshufhw m0, m0, q2210
1616 movq [r0+ Y *FDEC_STRIDE], m0
1617 movhps [r0+(Y+4)*FDEC_STRIDE], m0
1619 %endif ; !HIGH_BIT_DEPTH
1621 ;-----------------------------------------------------------------------------
1622 ; void predict_8x8c_v( uint8_t *src )
1623 ;-----------------------------------------------------------------------------
1625 %macro PREDICT_8x8C_V 0
1626 cglobal predict_8x8c_v, 1,1
1627 mova m0, [r0 - FDEC_STRIDEB]
1643 cglobal predict_8x8c_v_mmx, 1,1
1644 mova m0, [r0 - FDEC_STRIDEB]
1645 mova m1, [r0 - FDEC_STRIDEB + 8]
1648 mova [r0 + (Y&1)*FDEC_STRIDEB], m0
1649 mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
1651 add r0, FDEC_STRIDEB*2
; PREDICT_8x16C_V: parameterless macro that emits predict_8x16c_v
; (one argument, r0 = src).
; NOTE(review): the return and %endmacro are not visible in this excerpt.
1659 %macro PREDICT_8x16C_V 0
1660 cglobal predict_8x16c_v, 1,1
; Load the row one FDEC_STRIDEB before src.
1661 mova m0, [r0 - FDEC_STRIDEB]
; STORE8x16 is defined outside this excerpt; the same register is passed for
; all four of its operands, presumably replicating m0 into every row.
1662 STORE8x16 m0, m0, m0, m0
1674 ;-----------------------------------------------------------------------------
1676 ; void predict_8x8c_h( pixel *src )
1676 ;-----------------------------------------------------------------------------
; PREDICT_C_H (this definition precedes the "%else ; !HIGH_BIT_DEPTH" line,
; so it is the high-bit-depth one): emits predict_8x<%1>c_h, where %1 is
; substituted into the function name. r0 = src.
; NOTE(review): the loop that assigns Y, the splat of the loaded pixel across
; m0, any conditionals around the second store, and %endmacro are not visible
; in this excerpt.
1679 %macro PREDICT_C_H 1
1680 cglobal predict_8x%1c_h, 1,1
; Bias r0 by four rows so row offsets are relative to row 4.
1681 add r0, FDEC_STRIDEB*4
; Load four bytes starting two pixels to the left of row Y.
1684 movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
; Write m0 to the start of row Y ...
1686 mova [r0+FDEC_STRIDEB*Y], m0
; ... and again eight bytes further along the same row.
1688 mova [r0+FDEC_STRIDEB*Y+8], m0
1702 %else ; !HIGH_BIT_DEPTH
; PREDICT_C_H_CORE (8-bit path): per-row body for the chroma "h" predictors.
; NOTE(review): the loop that assigns Y, the use of %1, and %endmacro are not
; visible in this excerpt.
1704 %macro PREDICT_C_H_CORE 1
; SPLATB_LOAD is defined outside this excerpt; presumably it loads the byte
; at the given address (one byte left of row Y) and broadcasts it across m0,
; using m1 as a helper — confirm.
1707 SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
; Write the result to the start of row Y.
1708 mova [r0+FDEC_STRIDE*Y], m0
; PREDICT_C_H (8-bit path): emits predict_8x<%1>c_h, where %1 is substituted
; into the function name. r0 = src.
; NOTE(review): the conditionals and per-row work between the pointer bumps
; below, and %endmacro, are not visible in this excerpt.
1713 %macro PREDICT_C_H 1
1714 cglobal predict_8x%1c_h, 1,1
; Each of the following advances r0 by four rows.
1719 add r0, FDEC_STRIDE*4
1721 add r0, FDEC_STRIDE*4
1724 add r0, FDEC_STRIDE*4
1738 ;-----------------------------------------------------------------------------
1739 ; void predict_8x8c_dc( pixel *src )
1740 ;-----------------------------------------------------------------------------
; Zero-extending loads of the pixel one SIZEOF_PIXEL to the left of rows
; %1-4 .. %1-1 (relative to r0). %1 is a parameter of the enclosing macro,
; whose %macro line is not visible in this excerpt.
; NOTE(review): r2d is reloaded three times; the instructions that consume it
; between loads are not shown — presumably each value is added into r1d.
1743 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
1744 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
1746 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
1748 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
; PREDICT_8x8C_DC: emits predict_8x8c_dc (one argument r0 = src, three
; general-purpose registers declared). The existing lane comments name the
; partial sums s0..s3.
; NOTE(review): many lines (the %if that pairs with each %else, the summing
; of the loaded pixels, the origin of m2/m7, the %rep that assigns Y, the
; return and %endmacro) are not visible in this excerpt.
1752 %macro PREDICT_8x8C_DC 0
1753 cglobal predict_8x8c_dc, 1,3
; Branch before the %else: load two 8-byte halves of the row one
; FDEC_STRIDEB before src.
1756 movq m0, [r0-FDEC_STRIDEB+0]
1757 movq m1, [r0-FDEC_STRIDEB+8]
1760 %else ; !HIGH_BIT_DEPTH
; 8-bit branch: the two halves are four bytes each.
1761 movd m0, [r0-FDEC_STRIDEB+0]
1762 movd m1, [r0-FDEC_STRIDEB+4]
; Bias r0 by four rows; the stores below compensate with (Y-4).
1766 add r0, FDEC_STRIDEB*4
1775 punpckldq m0, m2 ; s0, s1, s2, s3
; qABCD selects word3=A .. word0=D.
1776 pshufw m3, m0, q3312 ; s2, s1, s3, s3
1777 pshufw m0, m0, q1310 ; s0, s1, s3, s1
1780 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
; XMM store path: duplicate each word, then each dword, so xmm0 and xmm1 each
; hold two values replicated four times.
1784 punpcklwd xmm0, xmm0
1785 pshufd xmm1, xmm0, q3322
1786 punpckldq xmm0, xmm0
; i = Y/4 picks xmm0 or xmm1 depending on which group of four rows Y is in.
1789 %assign i (0 + (Y/4))
1790 movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
; MMX store path: broadcast each of the four words of m0 into its own
; register (m1..m4).
1794 pshufw m1, m0, q0000
1795 pshufw m2, m0, q1111
1796 pshufw m3, m0, q2222
1797 pshufw m4, m0, q3333
; i/j pick the register pair (m1,m2) or (m3,m4) from Y/4.
1800 %assign i (1 + (Y/4)*2)
1801 %assign j (2 + (Y/4)*2)
1802 movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
1803 movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
1807 %else ; !HIGH_BIT_DEPTH
; 8-bit branch: one 8-byte store per row from register m<Y/4>.
1815 %assign i (0 + (Y/4))
1816 movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
; STORE_4LINES (3-parameter form): write four consecutive rows, %3-4 .. %3-1,
; relative to r0.
;   %1, %2 = source registers, %3 = row index (callers bias r0 by four rows,
;            so %3 is effectively the first of the four rows written).
; NOTE(review): the conditional that chooses between the two store sequences
; below, and %endmacro, are not visible in this excerpt.
1831 %macro STORE_4LINES 3
; Variant 1: one 16-byte aligned store of %1 per row (%2 is unused here).
1833 movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
1834 movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
1835 movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
1836 movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
; Variant 2: two 8-byte stores per row, %1 at +0 and %2 at +8.
1838 movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
1839 movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
1840 movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
1841 movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
1842 movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
1843 movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
1844 movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
1845 movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
; STORE_4LINES (2-parameter form): write the 8-byte register %1 to four
; consecutive rows, %2-4 .. %2-1, relative to r0.
; NOTE(review): %endmacro and the conditional that selects this form over the
; 3-parameter one are not visible in this excerpt.
1849 %macro STORE_4LINES 2
1850 movq [r0+FDEC_STRIDEB*(%2-4)], %1
1851 movq [r0+FDEC_STRIDEB*(%2-3)], %1
1852 movq [r0+FDEC_STRIDEB*(%2-2)], %1
1853 movq [r0+FDEC_STRIDEB*(%2-1)], %1
; PREDICT_8x16C_DC: emits predict_8x16c_dc (one argument r0 = src, three
; general-purpose registers declared). The existing lane comments name the
; partial sums s0..s5 and the resulting values dc0..dc7.
; NOTE(review): many lines are not visible in this excerpt: the %if/%else
; that separate the bit-depth and register-width paths, the summing of the
; loaded pixels, the left-neighbour loads that fill r1d, the averaging steps,
; the return and %endmacro. The path labels below are inferred from operand
; widths only.
1857 %macro PREDICT_8x16C_DC 0
1858 cglobal predict_8x16c_dc, 1,3
; Row one FDEC_STRIDEB before src, as two 8-byte halves ...
1861 movq m0, [r0-FDEC_STRIDEB+0]
1862 movq m1, [r0-FDEC_STRIDEB+8]
; ... or as two 4-byte halves (alternative path).
1866 movd m0, [r0-FDEC_STRIDEB+0]
1867 movd m1, [r0-FDEC_STRIDEB+4]
1871 punpcklwd m0, m1 ; s0, s1
; Bias r0 by four rows; STORE_4LINES row indices assume this bias.
1873 add r0, FDEC_STRIDEB*4
1877 pinsrw m0, r1d, 3 ; s0, s1, s2, s3
; Temporarily move eight rows down (the work done there is not visible) ...
1878 add r0, FDEC_STRIDEB*8
1882 pinsrw m1, r1d, 3 ; s1, __, s4, s5
; ... and come back to the four-row bias.
1883 sub r0, FDEC_STRIDEB*8
; qABCD selects word3=A .. word0=D.
1885 pshufw m2, m0, q1310 ; s0, s1, s3, s1
1886 pshufw m0, m0, q3312 ; s2, s1, s3, s3
1887 pshufw m3, m1, q0302 ; s4, s1, s5, s1
1888 pshufw m1, m1, q3322 ; s4, s4, s5, s5
; XMM store path: duplicate words then dwords so each of xmm0..xmm3 holds two
; values replicated four times, one register per group of four rows.
1899 punpcklwd xmm0, xmm0
1900 punpcklwd xmm1, xmm1
1901 pshufd xmm2, xmm0, q3322
1902 pshufd xmm3, xmm1, q3322
1903 punpckldq xmm0, xmm0
1904 punpckldq xmm1, xmm1
1905 STORE_4LINES xmm0, xmm0, 0
1906 STORE_4LINES xmm2, xmm2, 4
1907 STORE_4LINES xmm1, xmm1, 8
1908 STORE_4LINES xmm3, xmm3, 12
; MMX store path: broadcast each word of m0 into its own register and store
; them pairwise (left half, right half) for rows 0..3 and 4..7 ...
1910 pshufw m2, m0, q0000
1911 pshufw m3, m0, q1111
1912 pshufw m4, m0, q2222
1913 pshufw m5, m0, q3333
1914 STORE_4LINES m2, m3, 0
1915 STORE_4LINES m4, m5, 4
; ... then the same with the words of m1 for rows 8..11 and 12..15.
1916 pshufw m2, m1, q0000
1917 pshufw m3, m1, q1111
1918 pshufw m4, m1, q2222
1919 pshufw m5, m1, q3333
1920 STORE_4LINES m2, m3, 8
1921 STORE_4LINES m4, m5, 12
; Byte path: saturate the word values down to bytes.
1924 packuswb m0, m0 ; dc0, dc1, dc2, dc3
1925 packuswb m1, m1 ; dc4, dc5, dc6, dc7
; Duplicate word pairs so each of m2..m5 holds the 8-byte pattern for one
; group of four rows.
1928 pshufw m2, m0, q1100
1929 pshufw m3, m0, q3322
1930 pshufw m4, m1, q1100
1931 pshufw m5, m1, q3322
; Advance r0 by eight rows (the stores around this line are not visible).
1934 add r0, FDEC_STRIDEB*8
; PREDICT_C_DC_TOP: emits predict_8x<%1>c_dc_top_{sse2,mmx2}; %1 is
; substituted into both the function name and the STORE8x<%1> macro name.
; r0 = src.
; NOTE(review): the opening %if, the add/average/shift arithmetic between the
; shuffles, the origin of mm1/mm2, the returns and %endmacro are not visible
; in this excerpt.
1948 %macro PREDICT_C_DC_TOP 1
; Branch before "%else ; !HIGH_BIT_DEPTH".
1951 cglobal predict_8x%1c_dc_top_sse2, 1,1
; Load the row one FDEC_STRIDEB before src.
1953 mova m0, [r0 - FDEC_STRIDEB]
; q2301 swaps neighbouring elements: first dwords, then words within the low
; and high halves. Presumably each swap feeds an add that builds a
; horizontal sum — the adds are not shown.
1954 pshufd m1, m0, q2301
1956 pshuflw m1, m0, q2301
1957 pshufhw m1, m1, q2301
; STORE8x<%1> is defined outside this excerpt; the same register is passed
; for all four operands.
1961 STORE8x%1 m0, m0, m0, m0
1963 %else ; !HIGH_BIT_DEPTH
1965 cglobal predict_8x%1c_dc_top_mmx2, 1,1
; Load the eight bytes one FDEC_STRIDE before src.
1966 movq mm0, [r0 - FDEC_STRIDE]
; psadbw sums absolute byte differences against mm2; with mm2 == 0 (assumed —
; its setup is not shown) this is a plain sum of the bytes.
1971 psadbw mm1, mm2 ; s1
1972 psadbw mm0, mm2 ; s0
; Broadcast word 0 of mm0 to all four words.
1978 pshufw mm0, mm0, 0 ; dc0 (w)
; Pack words to bytes: mm0 supplies the low four bytes, mm1 the high four.
1979 packuswb mm0, mm1 ; dc0,dc1 (b)
1980 STORE8x%1 mm0, mm0, mm0, mm0
1988 ;-----------------------------------------------------------------------------
1989 ; void predict_16x16_v( pixel *src )
1990 ;-----------------------------------------------------------------------------
; predict_16x16_v_mmx2 (branch before "%else ; !HIGH_BIT_DEPTH"): r0 = src;
; a second general-purpose register is declared as scratch.
; NOTE(review): the return is not visible in this excerpt.
1993 cglobal predict_16x16_v_mmx2, 1,2
; Load 32 bytes from one FDEC_STRIDEB before src as four 8-byte pieces.
1994 mova m0, [r0 - FDEC_STRIDEB+ 0]
1995 mova m1, [r0 - FDEC_STRIDEB+ 8]
1996 mova m2, [r0 - FDEC_STRIDEB+16]
1997 mova m3, [r0 - FDEC_STRIDEB+24]
; STORE16x16 is defined outside this excerpt; presumably writes m0..m3 to
; every row of the block.
1998 STORE16x16 m0, m1, m2, m3
; predict_16x16_v_sse2 (branch before "%else ; !HIGH_BIT_DEPTH"):
; void predict_16x16_v( pixel *src ), r0 = src.
; The function takes a single argument, so cglobal declares 1 argument and
; 2 general-purpose registers, matching the mmx2 variant. Declaring 2
; arguments made x86-32 builds load a non-existent second argument into r1.
; r1 stays available as scratch for the store macro.
; NOTE(review): the return is not visible in this excerpt.
2001 cglobal predict_16x16_v_sse2, 1,2
; Load 32 bytes from one FDEC_STRIDEB before src as two 16-byte pieces.
2002 mova m0, [r0 - FDEC_STRIDEB+ 0]
2003 mova m1, [r0 - FDEC_STRIDEB+16]
; STORE16x16_SSE2 is defined outside this excerpt; presumably writes m0/m1
; to every row of the block.
2004 STORE16x16_SSE2 m0, m1
2006 %else ; !HIGH_BIT_DEPTH
; predict_16x16_v_mmx2 (8-bit branch): r0 = src; a second general-purpose
; register is declared as scratch.
; NOTE(review): the store macro invocation and the return are not visible in
; this excerpt.
2008 cglobal predict_16x16_v_mmx2, 1,2
; Load 16 bytes from one FDEC_STRIDE before src as two 8-byte halves.
2009 movq m0, [r0 - FDEC_STRIDE + 0]
2010 movq m1, [r0 - FDEC_STRIDE + 8]
; predict_16x16_v_sse2 (8-bit branch): r0 = src.
; NOTE(review): the return is not visible in this excerpt.
2014 cglobal predict_16x16_v_sse2, 1,1
; Aligned 16-byte load from one FDEC_STRIDE before src.
2015 movdqa xmm0, [r0 - FDEC_STRIDE]
; STORE16x16_SSE2 is defined outside this excerpt; presumably writes xmm0 to
; every row of the block.
2016 STORE16x16_SSE2 xmm0
2020 ;-----------------------------------------------------------------------------
2021 ; void predict_16x16_h( pixel *src )
2022 ;-----------------------------------------------------------------------------
; PREDICT_16x16_H: emits predict_16x16_h (r0 = src, r1 = running row offset
; in bytes).
; NOTE(review): the opening %if, the loop label and branch, the %rep that
; assigns Y, the splat of the loaded pixel, the conditionals around the
; alternative store offsets, the return and %endmacro are not visible in
; this excerpt.
2023 %macro PREDICT_16x16_H 0
2024 cglobal predict_16x16_h, 1,2
; Start at row 12; r1 drops by four rows at the bottom of the body, so the
; block is presumably filled bottom-up in groups of four rows.
2025 mov r1, 12*FDEC_STRIDEB
; Branch before the %else: load four bytes starting two pixels left of the
; current row.
2030 movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
; Stores at byte offsets 0 and 16 ...
2032 mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
2033 mova [r0+r1+Y*FDEC_STRIDEB+16], m0
; ... and at 8 and 24 (presumably only for 8-byte registers).
2035 mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
2036 mova [r0+r1+Y*FDEC_STRIDEB+24], m0
2041 %else ; !HIGH_BIT_DEPTH
; SPLATB_LOAD is defined outside this excerpt; presumably broadcasts the byte
; one position left of the current row across m0, with m1 as a helper.
2048 SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
2049 mova [r0+r1+FDEC_STRIDE*Y], m0
2051 mova [r0+r1+FDEC_STRIDE*Y+8], m0
2055 %endif ; HIGH_BIT_DEPTH
; Step the row offset up by four rows (towards the top of the block).
2056 sub r1, 4*FDEC_STRIDEB
2067 ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
2072 ;-----------------------------------------------------------------------------
2073 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
2074 ;-----------------------------------------------------------------------------
; PRED16x16_DC: shared two-parameter body of the 16x16 DC predictors for
; 8-byte registers. Callers in this excerpt pass ([pw_8], 4).
; NOTE(review): the opening %if, the uses of %1 and %2 (presumably a rounding
; term and a shift count), the horizontal reduction, the 8-bit store, and
; %endmacro are not visible in this excerpt.
2076 %macro PRED16x16_DC 2
; Branch before the %else: add up, word-wise, the four 8-byte pieces of the
; row one FDEC_STRIDEB before src.
2078 mova m0, [r0 - FDEC_STRIDEB+ 0]
2079 paddw m0, [r0 - FDEC_STRIDEB+ 8]
2080 paddw m0, [r0 - FDEC_STRIDEB+16]
2081 paddw m0, [r0 - FDEC_STRIDEB+24]
; STORE16x16 is defined outside this excerpt; the same register is passed for
; all four operands.
2086 STORE16x16 m0, m0, m0, m0
2087 %else ; !HIGH_BIT_DEPTH
; psadbw against the two 8-byte halves of the row above; with m0/m1 zeroed
; beforehand (assumed — not shown) this yields the sum of those bytes.
2090 psadbw m0, [r0 - FDEC_STRIDE]
2091 psadbw m1, [r0 - FDEC_STRIDE + 8]
2096 packuswb m0, m0 ; dc in bytes
; Entry points for the 16x16 DC predictors built on PRED16x16_DC.
; NOTE(review): most of each function body (argument loads, macro
; invocations, returns) and the %if that pairs with the %else below are not
; visible in this excerpt.
; predict_16x16_dc_core: declares one register-loaded argument and two
; general-purpose registers.
2102 cglobal predict_16x16_dc_core, 1,2
; predict_16x16_dc_top: passes the pw_8 constant and 4 to PRED16x16_DC.
2112 cglobal predict_16x16_dc_top, 1,2
2113 PRED16x16_DC [pw_8], 4
; predict_16x16_dc_left_core, branch before the %else: replicates m0 via
; STORE16x16.
2118 cglobal predict_16x16_dc_left_core, 1,2
2121 STORE16x16 m0, m0, m0, m0
2123 %else ; !HIGH_BIT_DEPTH
; 8-bit branch of predict_16x16_dc_left_core: only one general-purpose
; register is declared.
2124 cglobal predict_16x16_dc_left_core, 1,1
2132 ;-----------------------------------------------------------------------------
2133 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
2134 ;-----------------------------------------------------------------------------
; PRED16x16_DC_SSE2: shared two-parameter body of the 16x16 DC predictors for
; 16-byte registers. Callers in this excerpt pass (m3, 5) and ([pw_8], 4).
; NOTE(review): the opening %if, the uses of %1 and %2 (presumably a value
; added to the sum and a shift count), the horizontal reduction, the 8-bit
; store, and %endmacro are not visible in this excerpt.
2136 %macro PRED16x16_DC_SSE2 2
; Branch before the %else: add up, word-wise, the two 16-byte pieces of the
; row one FDEC_STRIDEB before src.
2138 mova m0, [r0 - FDEC_STRIDEB+ 0]
2139 paddw m0, [r0 - FDEC_STRIDEB+16]
; STORE16x16_SSE2 is defined outside this excerpt; the same register is
; passed for both operands.
2144 STORE16x16_SSE2 m0, m0
2145 %else ; !HIGH_BIT_DEPTH
; psadbw against the 16 bytes of the row above; with m0 zeroed beforehand
; (assumed — not shown) this yields per-half byte sums.
2147 psadbw m0, [r0 - FDEC_STRIDE]
2153 packuswb m0, m0 ; dc in bytes
; Entry points for the 16x16 DC predictors built on PRED16x16_DC_SSE2.
; NOTE(review): most of each function body (loading m3, returns) is not
; visible in this excerpt.
; predict_16x16_dc_core( pixel *src, int i_dc_left ): two arguments, two
; general-purpose registers, four XMM registers declared. m3 is passed as the
; macro's first operand — presumably holding i_dc_left; its load is not shown.
2159 cglobal predict_16x16_dc_core, 2,2,4
2161 PRED16x16_DC_SSE2 m3, 5
; predict_16x16_dc_top: passes the pw_8 constant and 4 to the shared body.
2164 cglobal predict_16x16_dc_top, 1,2
2165 PRED16x16_DC_SSE2 [pw_8], 4
; predict_16x16_dc_left_core (branch before the following %else): replicates
; m0 via STORE16x16_SSE2.
2170 cglobal predict_16x16_dc_left_core, 1,2
2173 STORE16x16_SSE2 m0, m0
2175 %else ; !HIGH_BIT_DEPTH
2176 cglobal predict_16x16_dc_left_core, 1,1