1 ;*****************************************************************************
2 ;* predict-a.asm: x86 intra prediction
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
34 pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
35 pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
37 pb_00s_ff: times 8 db 0
38 pb_0s_ff: times 7 db 0
56 add r0, 4*FDEC_STRIDEB
57 mova [r0 + -4*FDEC_STRIDEB], %1
58 mova [r0 + -3*FDEC_STRIDEB], %1
59 mova [r0 + -2*FDEC_STRIDEB], %1
60 mova [r0 + -1*FDEC_STRIDEB], %1
61 mova [r0 + 0*FDEC_STRIDEB], %2
62 mova [r0 + 1*FDEC_STRIDEB], %2
63 mova [r0 + 2*FDEC_STRIDEB], %2
64 mova [r0 + 3*FDEC_STRIDEB], %2
71 mova [r0 + 0*FDEC_STRIDEB + 0], %1
72 mova [r0 + 1*FDEC_STRIDEB + 0], %1
73 mova [r0 + 0*FDEC_STRIDEB + 8], %2
74 mova [r0 + 1*FDEC_STRIDEB + 8], %2
75 mova [r0 + 0*FDEC_STRIDEB +16], %3
76 mova [r0 + 1*FDEC_STRIDEB +16], %3
77 mova [r0 + 0*FDEC_STRIDEB +24], %4
78 mova [r0 + 1*FDEC_STRIDEB +24], %4
79 add r0, 2*FDEC_STRIDEB
85 mova [r0 + 0*FDEC_STRIDE], %1
86 mova [r0 + 1*FDEC_STRIDE], %1
87 mova [r0 + 2*FDEC_STRIDE], %1
88 mova [r0 + 3*FDEC_STRIDE], %1
89 mova [r0 + 0*FDEC_STRIDE + 8], %2
90 mova [r0 + 1*FDEC_STRIDE + 8], %2
91 mova [r0 + 2*FDEC_STRIDE + 8], %2
92 mova [r0 + 3*FDEC_STRIDE + 8], %2
; STORE16x16_SSE2 reg0 [, reg1]
; Writes whole rows of the destination addressed by r0 using aligned stores,
; stepping FDEC_STRIDEB bytes per row. r0 is advanced as a side effect.
99 %macro STORE16x16_SSE2 1-2
; Two-register form: every row gets %1 at byte offset 0 and %2 at byte offset 16.
; Four rows are written, then r0 moves down by four rows.
; NOTE(review): presumably this group is repeated to cover all 16 rows and is
; selected for HIGH_BIT_DEPTH builds -- confirm against the surrounding conditionals.
103 mova [r0+0*FDEC_STRIDEB+ 0], %1
104 mova [r0+0*FDEC_STRIDEB+16], %2
105 mova [r0+1*FDEC_STRIDEB+ 0], %1
106 mova [r0+1*FDEC_STRIDEB+16], %2
107 mova [r0+2*FDEC_STRIDEB+ 0], %1
108 mova [r0+2*FDEC_STRIDEB+16], %2
109 mova [r0+3*FDEC_STRIDEB+ 0], %1
110 mova [r0+3*FDEC_STRIDEB+16], %2
111 add r0, 4*FDEC_STRIDEB
; One-register form: %1 alone fills each row.
; r0 is first biased by +4 rows so that eight rows (entry rows 0..7) are
; addressed with row offsets -4..+3 (presumably to keep the displacements
; small -- confirm).
115 add r0, 4*FDEC_STRIDEB
116 mova [r0 + -4*FDEC_STRIDEB], %1
117 mova [r0 + -3*FDEC_STRIDEB], %1
118 mova [r0 + -2*FDEC_STRIDEB], %1
119 mova [r0 + -1*FDEC_STRIDEB], %1
120 mova [r0 + 0*FDEC_STRIDEB], %1
121 mova [r0 + 1*FDEC_STRIDEB], %1
122 mova [r0 + 2*FDEC_STRIDEB], %1
123 mova [r0 + 3*FDEC_STRIDEB], %1
; Move down another 8 rows (entry + 12 rows) and store entry rows 8..15 the
; same way.
124 add r0, 8*FDEC_STRIDEB
125 mova [r0 + -4*FDEC_STRIDEB], %1
126 mova [r0 + -3*FDEC_STRIDEB], %1
127 mova [r0 + -2*FDEC_STRIDEB], %1
128 mova [r0 + -1*FDEC_STRIDEB], %1
129 mova [r0 + 0*FDEC_STRIDEB], %1
130 mova [r0 + 1*FDEC_STRIDEB], %1
131 mova [r0 + 2*FDEC_STRIDEB], %1
132 mova [r0 + 3*FDEC_STRIDEB], %1
136 ; args: size (b or w), dest, left, right, src[, tmp]
137 ; output: %2 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
138 %macro PRED8x8_LOWPASS 5-6
153 %macro LOAD_PLANE_ARGS 0
168 ;-----------------------------------------------------------------------------
169 ; void predict_4x4_ddl( pixel *src )
170 ;-----------------------------------------------------------------------------
171 %macro PREDICT_4x4_DDL 3
172 cglobal predict_4x4_ddl, 1,1
173 movu m1, [r0-FDEC_STRIDEB]
180 PRED8x8_LOWPASS %3, m0, m2, m3, m4, m5
185 movh [r0+Y*FDEC_STRIDEB], m0
192 %ifdef HIGH_BIT_DEPTH
194 PREDICT_4x4_DDL dq, 2, w
196 PREDICT_4x4_DDL dq, 2, w
198 cglobal predict_4x4_ddl, 1,2
199 mova m1, [r0-2*FDEC_STRIDE+4]
200 mova m2, [r0-2*FDEC_STRIDE+0]
201 mova m3, [r0-2*FDEC_STRIDE+2]
202 PRED8x8_LOWPASS w, m0, m1, m2, m3
203 mova [r0+0*FDEC_STRIDE], m0
205 mova m5, [r0-2*FDEC_STRIDE+6]
206 mova m6, [r0-2*FDEC_STRIDE+8]
208 PRED8x8_LOWPASS w, m4, m7, m5, m6
209 mova [r0+6*FDEC_STRIDE], m4
212 PALIGNR m4, m0, 6, m1
213 mova [r0+4*FDEC_STRIDE], m4
216 PALIGNR m4, m0, 6, m0
217 mova [r0+2*FDEC_STRIDE], m4
221 PREDICT_4x4_DDL q, 8, b
224 ;-----------------------------------------------------------------------------
225 ; void predict_4x4_ddr( pixel *src )
226 ;-----------------------------------------------------------------------------
228 cglobal predict_4x4_ddr, 1,1
229 movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
230 movq m2, [r0+0*FDEC_STRIDEB-8]
231 %ifdef HIGH_BIT_DEPTH
232 movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
234 movh m3, [r0-1*FDEC_STRIDEB]
236 PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
238 movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
239 PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
241 movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
242 PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4
244 punpckh%1 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
245 movh m3, [r0-1*FDEC_STRIDEB]
247 PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1
249 PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
251 PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
253 PRED8x8_LOWPASS %4, m0, m3, m1, m2, m4
255 movh [r0+Y*FDEC_STRIDEB], m0
259 movh [r0+Y*FDEC_STRIDEB], m0
263 cglobal predict_4x4_vr, 1,1,6
264 movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
266 %ifdef HIGH_BIT_DEPTH
267 movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
268 PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
270 movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
271 PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
273 movhps m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
274 PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
276 movhps m3, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
277 PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
279 PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt
281 PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0
283 PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1
285 PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2
287 PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
290 movh [r0+0*FDEC_STRIDEB], m5
291 movh [r0+1*FDEC_STRIDEB], m3
292 PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
294 movh [r0+2*FDEC_STRIDEB], m5
295 PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
296 movh [r0+3*FDEC_STRIDEB], m3
299 cglobal predict_4x4_hd, 1,1,6
300 movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
301 %ifdef HIGH_BIT_DEPTH
302 movh m1, [r0-1*FDEC_STRIDEB]
303 punpckl%5 m0, m1 ; t3 t2 t1 t0 lt .. .. ..
304 psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
305 movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3
306 movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
307 punpckl%1 m1, m2 ; l2 l3
308 movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1
309 movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
310 punpckl%1 m2, m3 ; l0 l1
312 punpckl%5 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
313 psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. ..
314 movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3
315 punpckh%1 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3
316 movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1
317 punpckh%1 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1
319 punpckh%2 m1, m2 ; l0 l1 l2 l3
320 punpckh%5 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
321 psrl%3 m2, m1, %6 ; .. t2 t1 t0 lt l0 l1 l2
322 psrl%3 m0, m1, %6*2 ; .. .. t2 t1 t0 lt l0 l1
324 PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4
327 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
329 movh [r0+Y*FDEC_STRIDEB], m5
333 movh [r0+Y*FDEC_STRIDEB], m5
335 movh [r0+0*FDEC_STRIDEB], m3
339 %ifdef HIGH_BIT_DEPTH
341 cglobal predict_4x4_ddr, 1,1
342 movq m3, [r0+3*FDEC_STRIDEB-8]
344 PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6
345 PALIGNR m3, [r0+1*FDEC_STRIDEB-8], 6, m7
346 movq m6, [r0+0*FDEC_STRIDEB-8]
347 PALIGNR m3, m6, 6, m5
349 movq m4, [r0-1*FDEC_STRIDEB-8]
352 PALIGNR m2, m4, 6, m5
355 PRED8x8_LOWPASS w, m0, m3, m1, m2
357 movq [r0+3*FDEC_STRIDEB], m0
359 movq m2, [r0-1*FDEC_STRIDEB-0]
361 PALIGNR m5, m4, 6, m4
363 PALIGNR m5, m6, 6, m6
364 PRED8x8_LOWPASS w, m1, m5, m2, m3
365 movq [r0+0*FDEC_STRIDEB], m1
368 PALIGNR m1, m0, 6, m2
369 movq [r0+1*FDEC_STRIDEB], m1
371 PALIGNR m1, m0, 6, m0
372 movq [r0+2*FDEC_STRIDEB], m1
374 movd [r0+3*FDEC_STRIDEB+4], m1
377 cglobal predict_4x4_hd, 1,1
378 mova m0, [r0+1*FDEC_STRIDEB-8]
379 punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
380 mova m1, [r0+3*FDEC_STRIDEB-8]
381 punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
386 movu m3, [r0-1*FDEC_STRIDEB-2]
388 punpckhdq m4, [r0-1*FDEC_STRIDEB-6]
389 PALIGNR m3, m1, 2, m2
390 PRED8x8_LOWPASS w, m2, m4, m1, m3, m6
397 mova [r0+3*FDEC_STRIDEB], m5
398 mova [r0+1*FDEC_STRIDEB], m4
401 mova m6, [r0-1*FDEC_STRIDEB+0]
402 PALIGNR m7, [r0+0*FDEC_STRIDEB-8], 6, m5
403 PRED8x8_LOWPASS w, m3, m7, m6, m4, m1
405 PALIGNR m3, m0, 6, m5
406 mova [r0+0*FDEC_STRIDEB], m3
410 mova [r0+2*FDEC_STRIDEB], m0
414 PREDICT_4x4 wd, dq, dq, w, qdq, 2
416 PREDICT_4x4 wd, dq, dq, w, qdq, 2
418 PREDICT_4x4 wd, dq, dq, w, qdq, 2
421 PREDICT_4x4 bw, wd, q , b, dq , 8
423 PREDICT_4x4 bw, wd, q , b, dq , 8
426 ;-----------------------------------------------------------------------------
427 ; void predict_4x4_hu( pixel *src )
428 ;-----------------------------------------------------------------------------
429 %ifdef HIGH_BIT_DEPTH
431 cglobal predict_4x4_hu_mmx2, 1,1
432 movq m0, [r0+0*FDEC_STRIDEB-4*2]
433 punpckhwd m0, [r0+1*FDEC_STRIDEB-4*2]
434 movq m1, [r0+2*FDEC_STRIDEB-4*2]
435 punpckhwd m1, [r0+3*FDEC_STRIDEB-4*2]
438 movq [r0+3*FDEC_STRIDEB], m1
439 movd [r0+2*FDEC_STRIDEB+4], m1
444 pshufw m1, m0, 11111001b
445 pshufw m5, m0, 11111110b
446 PRED8x8_LOWPASS w, m3, m0, m5, m1, m7
449 mova [r0+0*FDEC_STRIDEB], m6
453 mova [r0+1*FDEC_STRIDEB], m2
455 movd [r0+2*FDEC_STRIDEB+0], m2
458 %else ; !HIGH_BIT_DEPTH
460 cglobal predict_4x4_hu_mmx2, 1,1
461 movq mm0, [r0+0*FDEC_STRIDE-8]
462 punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
463 movq mm1, [r0+2*FDEC_STRIDE-8]
464 punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
468 pshufw mm1, mm1, 0xFF
476 PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
479 movd [r0+Y*FDEC_STRIDE], mm7
483 movd [r0+Y*FDEC_STRIDE], mm7
485 movd [r0+3*FDEC_STRIDE], mm1
487 %endif ; HIGH_BIT_DEPTH
489 ;-----------------------------------------------------------------------------
490 ; void predict_4x4_vl( pixel *src )
491 ;-----------------------------------------------------------------------------
492 %macro PREDICT_4x4_V1 3
493 cglobal predict_4x4_vl, 1,1,6
494 movu m1, [r0-FDEC_STRIDEB]
498 PRED8x8_LOWPASS %3, m0, m1, m2, m3, m5
500 movh [r0+0*FDEC_STRIDEB], m4
501 movh [r0+1*FDEC_STRIDEB], m0
504 movh [r0+2*FDEC_STRIDEB], m4
505 movh [r0+3*FDEC_STRIDEB], m0
509 %ifdef HIGH_BIT_DEPTH
511 PREDICT_4x4_V1 dq, 2, w
514 PREDICT_4x4_V1 dq, 2, w
518 cglobal predict_4x4_vl, 1,4
519 mova m1, [r0-FDEC_STRIDEB+0]
520 mova m2, [r0-FDEC_STRIDEB+8]
522 PALIGNR m2, m1, 4, m6
523 PALIGNR m3, m1, 2, m5
526 mova [r0+0*FDEC_STRIDEB], m4
528 mova [r0+2*FDEC_STRIDEB], m4
529 PRED8x8_LOWPASS w, m0, m1, m2, m3, m6
530 mova [r0+1*FDEC_STRIDEB], m0
532 mova [r0+3*FDEC_STRIDEB], m0
534 movzx r1d, word [r0-FDEC_STRIDEB+ 8]
535 movzx r2d, word [r0-FDEC_STRIDEB+10]
536 movzx r3d, word [r0-FDEC_STRIDEB+12]
542 mov [r0+2*FDEC_STRIDEB+6], r1w
543 mov [r0+3*FDEC_STRIDEB+6], r3w
547 PREDICT_4x4_V1 q, 8, b
550 ;-----------------------------------------------------------------------------
551 ; void predict_4x4_dc( pixel *src )
552 ;-----------------------------------------------------------------------------
553 %ifdef HIGH_BIT_DEPTH
555 cglobal predict_4x4_dc_mmx2, 1,1
556 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
557 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
558 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
559 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
561 mova m0, [r0-FDEC_STRIDEB]
567 mova [r0+0*FDEC_STRIDEB], m0
568 mova [r0+1*FDEC_STRIDEB], m0
569 mova [r0+2*FDEC_STRIDEB], m0
570 mova [r0+3*FDEC_STRIDEB], m0
575 cglobal predict_4x4_dc_mmx2, 1,4
577 movd mm0, [r0-FDEC_STRIDE]
580 movzx r1d, byte [r0-1]
583 movzx r2d, byte [r0+FDEC_STRIDE*n-1]
590 mov [r0+FDEC_STRIDE*0], r1d
591 mov [r0+FDEC_STRIDE*1], r1d
592 mov [r0+FDEC_STRIDE*2], r1d
593 mov [r0+FDEC_STRIDE*3], r1d
595 %endif ; HIGH_BIT_DEPTH
597 %macro PREDICT_FILTER 5
598 ;-----------------------------------------------------------------------------
599 ;void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
600 ;-----------------------------------------------------------------------------
601 cglobal predict_8x8_filter, 4,5,7
602 add r0, 0x58*SIZEOF_PIXEL
603 %define src r0-0x58*SIZEOF_PIXEL
614 mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
615 punpckh%1%2 m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
616 mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
617 punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
619 mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
620 punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
621 mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
622 punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
625 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
626 mova m1, [src-1*FDEC_STRIDEB]
629 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
630 PALIGNR m1, m2, 1*SIZEOF_PIXEL, m2
635 PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5
636 mova [t1+8*SIZEOF_PIXEL], m2
638 PRED8x8_LOWPASS %1, m1, m3, m0, m4, m5
640 mov [t1+7*SIZEOF_PIXEL], t4%1
644 mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
645 mova m3, [src-1*FDEC_STRIDEB]
646 mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
649 PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
650 PALIGNR m1, m4, 1*SIZEOF_PIXEL, m4
656 PRED8x8_LOWPASS %1, m4, m2, m1, m3, m5
657 mova [t1+16*SIZEOF_PIXEL], m4
662 mova m0, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
666 PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
667 PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
668 PRED8x8_LOWPASS %1, m1, m2, m5, m0, m4
674 mova [t1+24*SIZEOF_PIXEL], m1
677 mov [t1+32*SIZEOF_PIXEL], t4%1
701 %ifdef HIGH_BIT_DEPTH
703 PREDICT_FILTER w, d, q, dq, 2
705 PREDICT_FILTER w, d, q, dq, 2
707 PREDICT_FILTER w, d, q, dq, 2
710 PREDICT_FILTER b, w, d, q , 8
712 PREDICT_FILTER b, w, d, q , 8
715 ;-----------------------------------------------------------------------------
716 ; void predict_8x8_v( pixel *src, pixel *edge )
717 ;-----------------------------------------------------------------------------
718 %macro PREDICT_8x8_V 0
719 cglobal predict_8x8_v, 2,2
720 mova m0, [r1+16*SIZEOF_PIXEL]
725 %ifdef HIGH_BIT_DEPTH
733 ;-----------------------------------------------------------------------------
734 ; void predict_8x8_h( pixel *src, pixel edge[33] )
735 ;-----------------------------------------------------------------------------
736 %macro PREDICT_8x8_H 2
737 cglobal predict_8x8_h, 2,2
738 movu m1, [r1+7*SIZEOF_PIXEL]
739 add r0, 4*FDEC_STRIDEB
745 SPLAT%2 m0, m %+ i, (3-n)&3
746 mova [r0+(n-4)*FDEC_STRIDEB], m0
752 %ifdef HIGH_BIT_DEPTH
760 ;-----------------------------------------------------------------------------
761 ; void predict_8x8_dc( pixel *src, pixel *edge );
762 ;-----------------------------------------------------------------------------
763 %ifdef HIGH_BIT_DEPTH
765 cglobal predict_8x8_dc_sse2, 2,2
777 cglobal predict_8x8_dc_mmx2, 2,2
789 %endif ; HIGH_BIT_DEPTH
791 ;-----------------------------------------------------------------------------
792 ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
793 ; void predict_8x8_dc_left( pixel *src, pixel *edge );
794 ;-----------------------------------------------------------------------------
795 %ifdef HIGH_BIT_DEPTH
796 %macro PREDICT_8x8_DC 3
807 PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova
808 PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu
811 %macro PREDICT_8x8_DC 2
823 PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
824 PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
825 %endif ; HIGH_BIT_DEPTH
827 ; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
828 ; size on the 8-bit mmx functions below if we know sse2 is available.
830 ;-----------------------------------------------------------------------------
831 ; void predict_8x8_ddl( pixel *src, pixel *edge )
832 ;-----------------------------------------------------------------------------
833 cglobal predict_8x8_ddl, 2,2,8
834 mova m5, [r1+16*SIZEOF_PIXEL]
835 movu m2, [r1+17*SIZEOF_PIXEL]
836 movu m3, [r1+23*SIZEOF_PIXEL]
837 movu m4, [r1+25*SIZEOF_PIXEL]
839 add r0, FDEC_STRIDEB*4
840 PRED8x8_LOWPASS %1, m0, m1, m2, m5, m7
841 %assign %%bak avx_enabled
842 %assign avx_enabled 0
843 PRED8x8_LOWPASS %1, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6
844 %assign avx_enabled %%bak
847 mova [r0+Y*FDEC_STRIDEB], m1
854 mova [r0+Y*FDEC_STRIDEB], m1
859 mova [r0+Y*FDEC_STRIDEB], m1
862 ;-----------------------------------------------------------------------------
863 ; void predict_8x8_ddr( pixel *src, pixel *edge )
864 ;-----------------------------------------------------------------------------
866 cglobal predict_8x8_ddr, 2,2,7
867 movu m1, [r1+ 7*SIZEOF_PIXEL]
868 movu m2, [r1+ 9*SIZEOF_PIXEL]
869 movu m3, [r1+15*SIZEOF_PIXEL]
870 movu m4, [r1+17*SIZEOF_PIXEL]
871 add r0, FDEC_STRIDEB*4
872 PRED8x8_LOWPASS %1, m0, m1, m2, [r1+ 8*SIZEOF_PIXEL], m5
873 PRED8x8_LOWPASS %1, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6
876 mova [r0+Y*FDEC_STRIDEB], m0
883 mova [r0+Y*FDEC_STRIDEB], m0
888 mova [r0+Y*FDEC_STRIDEB], m0
891 %endmacro ; PREDICT_8x8
893 %ifdef HIGH_BIT_DEPTH
898 %elifndef ARCH_X86_64
903 ;-----------------------------------------------------------------------------
904 ; void predict_8x8_hu( pixel *src, pixel *edge )
905 ;-----------------------------------------------------------------------------
906 %macro PREDICT_8x8_HU 5
907 cglobal predict_8x8_hu, 2,2,8
908 movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
909 add r0, 4*FDEC_STRIDEB
910 pshuf%3 m0, m1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
911 psll%2 m1, 7*%5 ; l7 .. .. .. .. .. .. ..
913 psll%3 m0, 8*SIZEOF_PIXEL
914 psrl%3 m2, 8*SIZEOF_PIXEL
915 por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
920 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
922 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
924 PRED8x8_LOWPASS %1, m1, m3, m5, m2, m6
925 punpckh%4 m5, m4, m1 ; p8 p7 p6 p5
926 punpckl%4 m4, m1 ; p4 p3 p2 p1
930 PALIGNR m5, m4, 2*SIZEOF_PIXEL, m1
931 pshuf%3 m1, m6, 11111001b
932 PALIGNR m6, m4, 4*SIZEOF_PIXEL, m2
933 pshuf%3 m2, m7, 11111110b
934 PALIGNR m7, m4, 6*SIZEOF_PIXEL, m3
935 pshuf%3 m3, m0, 11111111b
936 mova [r0-4*FDEC_STRIDEB], m4
937 mova [r0-3*FDEC_STRIDEB], m5
938 mova [r0-2*FDEC_STRIDEB], m6
939 mova [r0-1*FDEC_STRIDEB], m7
940 mova [r0+0*FDEC_STRIDEB], m0
941 mova [r0+1*FDEC_STRIDEB], m1
942 mova [r0+2*FDEC_STRIDEB], m2
943 mova [r0+3*FDEC_STRIDEB], m3
947 %ifdef HIGH_BIT_DEPTH
949 PREDICT_8x8_HU w, dq, d, wd, 2
951 PREDICT_8x8_HU w, dq, d, wd, 2
953 PREDICT_8x8_HU w, dq, d, wd, 2
954 %elifndef ARCH_X86_64
956 PREDICT_8x8_HU b, q , w, bw, 8
959 ;-----------------------------------------------------------------------------
960 ; void predict_8x8_vr( pixel *src, pixel *edge )
961 ;-----------------------------------------------------------------------------
962 %macro PREDICT_8x8_VR 3
963 cglobal predict_8x8_vr, 2,3,7
964 mova m2, [r1+16*SIZEOF_PIXEL]
965 movu m3, [r1+15*SIZEOF_PIXEL]
966 movu m1, [r1+14*SIZEOF_PIXEL]
968 add r0, FDEC_STRIDEB*4
969 PRED8x8_LOWPASS %1, m0, m1, m2, m3, m5
970 mova [r0-4*FDEC_STRIDEB], m4
971 mova [r0-3*FDEC_STRIDEB], m0
974 mova m1, [r1+8*SIZEOF_PIXEL]
979 PRED8x8_LOWPASS %1, m0, m1, m3, m2, m4
983 %assign i (5 + ((Y+3)&1))
984 PALIGNR m %+ i, m0, 7*SIZEOF_PIXEL, m2
985 mova [r0+Y*FDEC_STRIDEB], m %+ i
989 PALIGNR m5, m0, 7*SIZEOF_PIXEL, m0
990 mova [r0+Y*FDEC_STRIDEB], m5
994 %ifdef HIGH_BIT_DEPTH
996 PREDICT_8x8_VR w, dq, 2
998 PREDICT_8x8_VR w, dq, 2
1000 PREDICT_8x8_VR w, dq, 2
1003 PREDICT_8x8_VR b, q , 8
1006 ;-----------------------------------------------------------------------------
1007 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
1008 ;-----------------------------------------------------------------------------
1011 cglobal predict_8x8c_p_core_mmx2, 1,2
1014 pmullw mm2, [pw_3210]
1016 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
1017 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
1035 %endif ; !ARCH_X86_64
1038 %ifdef HIGH_BIT_DEPTH
1039 cglobal predict_8x8c_p_core_sse2, 1,1,7
1043 mova m3, [pw_pixel_max]
1048 pmullw m2, [pw_43210123] ; b
1049 pmullw m5, m4, [pw_m3] ; c
1059 add r0, FDEC_STRIDEB
1063 %else ; !HIGH_BIT_DEPTH
1064 cglobal predict_8x8c_p_core_sse2, 1,1
1071 pmullw m2, [pw_76543210]
1072 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1076 add r0, FDEC_STRIDE*4
1083 movq [r0+FDEC_STRIDE*0], m0
1084 movhps [r0+FDEC_STRIDE*1], m0
1090 movq [r0+FDEC_STRIDE*2], m5
1091 movhps [r0+FDEC_STRIDE*3], m5
1093 %endif ; HIGH_BIT_DEPTH
1095 ;-----------------------------------------------------------------------------
1096 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1097 ;-----------------------------------------------------------------------------
1099 cglobal predict_16x16_p_core_mmx2, 1,2
1103 pmullw mm5, [pw_3210]
1107 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
1108 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1109 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
1110 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
1137 %endif ; !ARCH_X86_64
1139 %macro PREDICT_16x16_P 0
1140 cglobal predict_16x16_p_core, 1,2,8
1147 pmullw m3, m1, [pw_76543210]
1149 %ifdef HIGH_BIT_DEPTH
1162 CLIPW m4, [pb_0], [pw_pixel_max]
1163 CLIPW m5, [pb_0], [pw_pixel_max]
1166 add r0, FDEC_STRIDEB
1170 %else ; !HIGH_BIT_DEPTH
1171 paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1172 paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1185 mova [r0+FDEC_STRIDE*0], m3
1186 mova [r0+FDEC_STRIDE*1], m5
1189 add r0, FDEC_STRIDE*2
1192 %endif ; !HIGH_BIT_DEPTH
1194 %endmacro ; PREDICT_16x16_P
1198 %ifndef HIGH_BIT_DEPTH
1203 %ifndef HIGH_BIT_DEPTH
1204 %macro PREDICT_8x8 0
1205 ;-----------------------------------------------------------------------------
1206 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
1207 ;-----------------------------------------------------------------------------
; r0 = src, r1 = edge (two arguments, two general-purpose registers).
1208 cglobal predict_8x8_ddl, 2,2
; xmm3 = 16 edge bytes starting at edge+16 (aligned load).
1209 movdqa xmm3, [r1+16]
; xmm2 = the same run starting one byte later (edge+17, unaligned load).
1210 movdqu xmm2, [r1+17]
; xmm1 = xmm3 shifted up by one byte, i.e. each lane holds the previous byte.
1211 pslldq xmm1, xmm3, 1
; Bias r0 by four rows; the stores below address rows relative to this point.
1212 add r0, FDEC_STRIDE*4
; Byte-wise low-pass into xmm0 with xmm1/xmm2 as the two neighbours, xmm3 as
; the centre sample and xmm4 as scratch.
1213 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
; Write the low 8 bytes of the filtered vector to row Y.
1218 movq [r0+Y*FDEC_STRIDE], xmm0
1223 ;-----------------------------------------------------------------------------
1224 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
1225 ;-----------------------------------------------------------------------------
1226 cglobal predict_8x8_ddr, 2,2
1229 psrldq xmm2, xmm3, 1
1230 add r0, FDEC_STRIDE*4
1231 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
1233 psrldq xmm1, xmm0, 1
1236 movq [r0+Y*FDEC_STRIDE], xmm0
1237 movq [r0+(Y-1)*FDEC_STRIDE], xmm1
1242 movq [r0-3*FDEC_STRIDE], xmm0
1243 movq [r0-4*FDEC_STRIDE], xmm1
1246 ;-----------------------------------------------------------------------------
1247 ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
1248 ;-----------------------------------------------------------------------------
; r0 = src, r1 = edge (two arguments, two general-purpose registers).
1249 cglobal predict_8x8_vl, 2,2
; xmm4 = 16 edge bytes starting at edge+16 (aligned load).
1250 movdqa xmm4, [r1+16]
; xmm1 / xmm2 = xmm4 shifted by one byte in each direction, giving the two
; neighbours of every lane.
1251 pslldq xmm1, xmm4, 1
1252 psrldq xmm2, xmm4, 1
; xmm3 = rounded byte average of each sample and its following neighbour.
1253 pavgb xmm3, xmm4, xmm2
; Bias r0 by four rows; the stores below address rows relative to this point.
1254 add r0, FDEC_STRIDE*4
; xmm0 = byte-wise low-pass of xmm4 using xmm1/xmm2 as neighbours (xmm5 scratch).
1255 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
1256 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
1257 ; xmm3: (t0 + t1 + 1) >> 1
; Each pair of stores writes the averaged vector to row Y and the low-passed
; vector to row Y+1 (low 8 bytes of each register).
1262 movq [r0+ Y *FDEC_STRIDE], xmm3
1263 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1268 movq [r0+ Y *FDEC_STRIDE], xmm3
1269 movq [r0+(Y+1)*FDEC_STRIDE], xmm0
1273 ;-----------------------------------------------------------------------------
1274 ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
1275 ;-----------------------------------------------------------------------------
1276 cglobal predict_8x8_vr, 2,2,7
1278 movdqa xmm6, [pw_ff00]
1279 add r0, 4*FDEC_STRIDE
1282 pslldq xmm1, xmm0, 2
1285 PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
1291 movhps [r0-3*FDEC_STRIDE], xmm5
1292 movhps [r0-4*FDEC_STRIDE], xmm2
1301 movq [r0+Y*FDEC_STRIDE], xmm5
1302 movq [r0+(Y-1)*FDEC_STRIDE], xmm2
1306 %endmacro ; PREDICT_8x8
1313 %endif ; !HIGH_BIT_DEPTH
1315 ;-----------------------------------------------------------------------------
1316 ; void predict_8x8_hd( pixel *src, pixel *edge )
1317 ;-----------------------------------------------------------------------------
1318 %macro PREDICT_8x8_HD 4
1319 cglobal predict_8x8_hd, 2,2,8
1320 add r0, 4*FDEC_STRIDEB
1321 mova m0, [r1] ; l7 .. .. .. .. .. .. ..
1322 mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
1323 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
1324 mova m3, m1 ; lt l0 l1 l2 l3 l4 l5 l6
1325 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0
1326 PALIGNR m2, m1, 7*SIZEOF_PIXEL, m5 ; t6 t5 t4 t3 t2 t1 t0 lt
1327 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m6 ; l0 l1 l2 l3 l4 l5 l6 l7
1328 PALIGNR m4, m3, 1*SIZEOF_PIXEL, m7 ; t0 lt l0 l1 l2 l3 l4 l5
1331 PRED8x8_LOWPASS %1, m0, m4, m1, m5, m7
1332 psrl%2 m4, m2, 2*%4 ; .. .. t6 t5 t4 t3 t2 t1
1333 psrl%2 m1, m2, %4 ; .. t6 t5 t4 t3 t2 t1 t0
1334 PRED8x8_LOWPASS %1, m6, m4, m2, m1, m5
1336 punpckh%3 m7, m3, m0 ; p8 p7 p6 p5
1337 punpckl%3 m3, m0 ; p4 p3 p2 p1
1341 mova [r0+3*FDEC_STRIDEB], m3
1342 PALIGNR m7, m3, 2*SIZEOF_PIXEL, m5
1343 mova [r0+2*FDEC_STRIDEB], m7
1344 PALIGNR m1, m3, 4*SIZEOF_PIXEL, m5
1345 mova [r0+1*FDEC_STRIDEB], m1
1346 PALIGNR m0, m3, 6*SIZEOF_PIXEL, m3
1347 mova [r0+0*FDEC_STRIDEB], m0
1350 mova [r0-1*FDEC_STRIDEB], m4
1351 PALIGNR m6, m4, 2*SIZEOF_PIXEL, m5
1352 mova [r0-2*FDEC_STRIDEB], m6
1353 PALIGNR m2, m4, 4*SIZEOF_PIXEL, m5
1354 mova [r0-3*FDEC_STRIDEB], m2
1355 PALIGNR m3, m4, 6*SIZEOF_PIXEL, m4
1356 mova [r0-4*FDEC_STRIDEB], m3
1360 %ifdef HIGH_BIT_DEPTH
1362 PREDICT_8x8_HD w, dq, wd, 2
1364 PREDICT_8x8_HD w, dq, wd, 2
1366 PREDICT_8x8_HD w, dq, wd, 2
1369 PREDICT_8x8_HD b, q , bw, 8
1371 ;-----------------------------------------------------------------------------
1372 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1373 ;-----------------------------------------------------------------------------
1374 %macro PREDICT_8x8_HD 0
1375 cglobal predict_8x8_hd, 2,2
1376 add r0, 4*FDEC_STRIDE
1378 movdqa xmm1, [r1+16]
1381 PALIGNR xmm1, xmm0, 7, xmm4
1382 PALIGNR xmm2, xmm0, 9, xmm5
1383 PALIGNR xmm3, xmm0, 8, xmm0
1384 pavgb xmm4, xmm1, xmm3
1385 PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
1386 punpcklbw xmm4, xmm0
1391 movq [r0+(Y)*FDEC_STRIDE], xmm4
1392 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1397 movq [r0+(Y)*FDEC_STRIDE], xmm4
1398 movq [r0+(Y-4)*FDEC_STRIDE], xmm0
1408 %endif ; HIGH_BIT_DEPTH
1410 ;-----------------------------------------------------------------------------
1411 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
1412 ;-----------------------------------------------------------------------------
1413 %macro PREDICT_8x8_HU 0
1414 cglobal predict_8x8_hu, 2,2
1415 add r0, 4*FDEC_STRIDE
1418 movq mm6, [pb_reverse]
1429 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
1430 pshufw mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1434 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
1435 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
1441 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
1443 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
1446 PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
1450 punpcklbw xmm0, xmm1
1454 movq [r0+Y*FDEC_STRIDE], xmm0
1458 pshufw mm5, mm4, 11111001b
1459 pshufw mm6, mm4, 11111110b
1460 pshufw mm7, mm4, 11111111b
1461 movq [r0+Y*FDEC_STRIDE], xmm0
1462 movq [r0+0*FDEC_STRIDE], mm4
1463 movq [r0+1*FDEC_STRIDE], mm5
1464 movq [r0+2*FDEC_STRIDE], mm6
1465 movq [r0+3*FDEC_STRIDE], mm7
1469 %ifndef HIGH_BIT_DEPTH
1476 ;-----------------------------------------------------------------------------
1478 ; void predict_8x8c_v( pixel *src )
1478 ;-----------------------------------------------------------------------------
1480 %macro PREDICT_8x8C_V 0
1481 cglobal predict_8x8c_v, 1,1
1482 mova m0, [r0 - FDEC_STRIDEB]
1487 %ifdef HIGH_BIT_DEPTH
1495 %ifdef HIGH_BIT_DEPTH
1498 cglobal predict_8x8c_v_mmx, 1,1
1499 mova m0, [r0 - FDEC_STRIDEB]
1500 mova m1, [r0 - FDEC_STRIDEB + 8]
1503 mova [r0 + (n&1)*FDEC_STRIDEB], m0
1504 mova [r0 + (n&1)*FDEC_STRIDEB + 8], m1
1506 add r0, FDEC_STRIDEB*2
1514 ;-----------------------------------------------------------------------------
1515 ; void predict_8x8c_h( pixel *src )
1516 ;-----------------------------------------------------------------------------
1517 %ifdef HIGH_BIT_DEPTH
1520 cglobal predict_8x8c_h, 1,1
1521 add r0, FDEC_STRIDEB*4
1524 movd m0, [r0+FDEC_STRIDEB*n-SIZEOF_PIXEL*2]
1526 mova [r0+FDEC_STRIDEB*n], m0
1533 %macro PREDICT_8x8C_H 0
1534 cglobal predict_8x8c_h, 1,1
1538 add r0, FDEC_STRIDE*4
1541 SPLATB m0, r0+FDEC_STRIDE*n-1, m1
1542 mova [r0+FDEC_STRIDE*n], m0
1554 ;-----------------------------------------------------------------------------
1555 ; void predict_8x8c_dc( pixel *src )
1556 ;-----------------------------------------------------------------------------
1558 %macro PREDICT_8x8C_DC 0
1559 cglobal predict_8x8c_dc, 1,3
1561 %ifdef HIGH_BIT_DEPTH
1562 movq m0, [r0-FDEC_STRIDEB+0]
1563 movq m1, [r0-FDEC_STRIDEB+8]
1567 movd m0, [r0-FDEC_STRIDEB+0]
1568 movd m1, [r0-FDEC_STRIDEB+4]
1572 add r0, FDEC_STRIDEB*4
1574 movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
1575 movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
1577 movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
1579 movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
1583 movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
1584 movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
1586 movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
1588 movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
1594 punpckldq m0, m2 ; s0, s1, s2, s3
1595 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
1596 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
1599 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
1600 %ifdef HIGH_BIT_DEPTH
1603 punpcklwd xmm0, xmm0
1604 pshufd xmm1, xmm0, 11111010b
1605 punpckldq xmm0, xmm0
1608 %assign i (0 + (n/4))
1609 movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i
1619 %assign i (1 + (n/4)*2)
1620 %assign j (2 + (n/4)*2)
1621 movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i
1622 movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j
1634 %assign i (0 + (n/4))
1635 movq [r0+FDEC_STRIDEB*(n-4)], m %+ i
1644 %ifdef HIGH_BIT_DEPTH
1649 %ifdef HIGH_BIT_DEPTH
1652 cglobal predict_8x8c_dc_top_sse2, 1,1
1654 mova m0, [r0 - FDEC_STRIDEB]
1668 cglobal predict_8x8c_dc_top_mmx2, 1,1
1669 movq mm0, [r0 - FDEC_STRIDE]
1674 psadbw mm1, mm2 ; s1
1675 psadbw mm0, mm2 ; s0
1681 pshufw mm0, mm0, 0 ; dc0 (w)
1682 packuswb mm0, mm1 ; dc0,dc1 (b)
1688 ;-----------------------------------------------------------------------------
1689 ; void predict_16x16_v( pixel *src )
1690 ;-----------------------------------------------------------------------------
1691 %ifdef HIGH_BIT_DEPTH
; Vertical prediction, 32-byte-wide row variant (16 pixels when a pixel is two
; bytes). r0 = src; one argument, two general-purpose registers requested.
1693 cglobal predict_16x16_v_mmx, 1,2
; Load the row directly above the block (r0 - FDEC_STRIDEB) in four pieces at
; byte offsets 0, 8, 16 and 24.
1694 mova m0, [r0 - FDEC_STRIDEB+ 0]
1695 mova m1, [r0 - FDEC_STRIDEB+ 8]
1696 mova m2, [r0 - FDEC_STRIDEB+16]
1697 mova m3, [r0 - FDEC_STRIDEB+24]
; Hand the four pieces to STORE16x16, which writes them into the block rows.
1698 STORE16x16 m0, m1, m2, m3
; Vertical prediction, 32-byte-wide row variant using two 16-byte loads.
; NOTE(review): declared with 2 arguments / 2 registers, but the prototype
; comment for predict_16x16_v lists only src and the other SSE2 variant uses
; 1,1; only r0 is referenced here -- confirm whether "2,2" is intentional.
1701 cglobal predict_16x16_v_sse2, 2,2
; Load the row directly above the block (r0 - FDEC_STRIDEB) as two halves at
; byte offsets 0 and 16.
1702 mova m0, [r0 - FDEC_STRIDEB+ 0]
1703 mova m1, [r0 - FDEC_STRIDEB+16]
; Two-register form of STORE16x16_SSE2: each row gets m0 at +0 and m1 at +16.
1704 STORE16x16_SSE2 m0, m1
; Vertical prediction, 16-byte-wide row variant using 8-byte loads.
; r0 = src; one argument, two general-purpose registers requested.
1708 cglobal predict_16x16_v_mmx, 1,2
; Load the row directly above the block (r0 - FDEC_STRIDE) as two 8-byte
; halves at byte offsets 0 and 8.
1709 movq m0, [r0 - FDEC_STRIDE + 0]
1710 movq m1, [r0 - FDEC_STRIDE + 8]
; Vertical prediction, 16-byte-wide row variant using a single 16-byte load.
; r0 = src; one argument, one general-purpose register.
1714 cglobal predict_16x16_v_sse2, 1,1
; xmm0 = the 16 bytes of the row directly above the block (aligned load).
1715 movdqa xmm0, [r0 - FDEC_STRIDE]
; One-register form of STORE16x16_SSE2: xmm0 is written to every block row.
1716 STORE16x16_SSE2 xmm0
1720 ;-----------------------------------------------------------------------------
1721 ; void predict_16x16_h( pixel *src )
1722 ;-----------------------------------------------------------------------------
1723 %macro PREDICT_16x16_H 0
1724 cglobal predict_16x16_h, 1,2
1725 mov r1, 12*FDEC_STRIDEB
1726 %ifdef HIGH_BIT_DEPTH
1730 movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL]
1732 mova [r0+r1+n*FDEC_STRIDEB+ 0], m0
1733 mova [r0+r1+n*FDEC_STRIDEB+16], m0
1735 mova [r0+r1+n*FDEC_STRIDEB+ 8], m0
1736 mova [r0+r1+n*FDEC_STRIDEB+24], m0
1748 SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
1749 mova [r0+r1+FDEC_STRIDE*n], m0
1751 mova [r0+r1+FDEC_STRIDE*n+8], m0
1755 %endif ; HIGH_BIT_DEPTH
1756 sub r1, 4*FDEC_STRIDEB
1764 %ifdef HIGH_BIT_DEPTH
1767 ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
1772 ;-----------------------------------------------------------------------------
1773 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1774 ;-----------------------------------------------------------------------------
1776 %macro PRED16x16_DC 2
1777 %ifdef HIGH_BIT_DEPTH
1778 mova m0, [r0 - FDEC_STRIDEB+ 0]
1779 paddw m0, [r0 - FDEC_STRIDEB+ 8]
1780 paddw m0, [r0 - FDEC_STRIDEB+16]
1781 paddw m0, [r0 - FDEC_STRIDEB+24]
1786 STORE16x16 m0, m0, m0, m0
1790 psadbw m0, [r0 - FDEC_STRIDE]
1791 psadbw m1, [r0 - FDEC_STRIDE + 8]
1796 packuswb m0, m0 ; dc in bytes
1802 cglobal predict_16x16_dc_core_mmx2, 1,2
1812 cglobal predict_16x16_dc_top_mmx2, 1,2
1813 PRED16x16_DC [pw_8], 4
1817 %ifdef HIGH_BIT_DEPTH
1818 cglobal predict_16x16_dc_left_core_mmx2, 1,2
1821 STORE16x16 m0, m0, m0, m0
1824 cglobal predict_16x16_dc_left_core_mmx2, 1,1
1832 ;-----------------------------------------------------------------------------
1833 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
1834 ;-----------------------------------------------------------------------------
1836 %macro PRED16x16_DC_SSE2 2
1837 %ifdef HIGH_BIT_DEPTH
1838 mova m0, [r0 - FDEC_STRIDEB+ 0]
1839 paddw m0, [r0 - FDEC_STRIDEB+16]
1844 STORE16x16_SSE2 m0, m0
1847 psadbw m0, [r0 - FDEC_STRIDE]
1853 packuswb m0, m0 ; dc in bytes
1859 cglobal predict_16x16_dc_core_sse2, 2,2,4
1861 PRED16x16_DC_SSE2 m3, 5
1864 cglobal predict_16x16_dc_top_sse2, 1,2
1865 PRED16x16_DC_SSE2 [pw_8], 4
1869 %ifdef HIGH_BIT_DEPTH
1870 cglobal predict_16x16_dc_left_core_sse2, 1,2
1873 STORE16x16_SSE2 m0, m0
1876 cglobal predict_16x16_dc_left_core_sse2, 1,1