1 ;*****************************************************************************
2 ;* predict-a.asm: x86 intra prediction
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2015 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Fiona Glaser <fiona@x264.com>
9 ;* Henrik Gramner <henrik@gramner.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 ;* This program is also available under a commercial proprietary license.
26 ;* For more information, contact us at licensing@x264.com.
27 ;*****************************************************************************
30 %include "x86util.asm"
; Constant tables. Numbering gaps show that some table rows are missing from
; this view (pb_00s_ff / pb_0s_ff show only their leading zero padding), so
; only the fully visible tables are described.
; pw_43210123: the words -3..4, emitted twice; multiplied with the "b" term
; in the chroma plane predictor.
34 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
37 pb_00s_ff: times 8 db 0
38 pb_0s_ff: times 7 db 0
; Byte-index tables. shuf_fixtr, shuf_hu, shuf_vr and pw_reverse are used as
; pshufb masks further down.
; shuf_fixtr: keep bytes 0-7 and replicate byte 7 into bytes 8-15.
40 shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
; shuf_nop: identity permutation (placed right after shuf_fixtr so it can be
; selected by indexing from shuf_fixtr).
41 shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; shuf_hu: low eight bytes reversed, upper eight bytes all take byte 0.
42 shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
; shuf_vr: permutation applied by the 8-bit predict_8x8_vr.
43 shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
; pw_reverse: byte pairs in descending order, i.e. reverses eight 16-bit words.
44 pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
; Fragment of a row-store helper; its %macro header is not visible in this
; view. Writes %1 to eight consecutive rows (0-7 relative to r0 on entry) and
; leaves r0 advanced by four rows.
61 mova [r0+0*FDEC_STRIDEB], %1
62 mova [r0+1*FDEC_STRIDEB], %1
; Re-centre the pointer so the remaining six rows use offsets -2..+3.
63 add r0, 4*FDEC_STRIDEB
64 mova [r0-2*FDEC_STRIDEB], %1
65 mova [r0-1*FDEC_STRIDEB], %1
66 mova [r0+0*FDEC_STRIDEB], %1
67 mova [r0+1*FDEC_STRIDEB], %1
68 mova [r0+2*FDEC_STRIDEB], %1
69 mova [r0+3*FDEC_STRIDEB], %1
; Fragment of a multi-register row-store helper. The %macro header and the
; directives between the groups below (numbering gaps) are not visible here,
; so how the groups are selected or looped is not shown.
; Each row is written as consecutive mmsize-wide pieces.
; Rows 0 and 1, pieces 0 and 1 (%1, %2):
76 mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
77 mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
78 mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
79 mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
; Rows 0 and 1, pieces 2 and 3 (%3, %4):
81 mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
82 mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
83 mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
84 mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
85 add r0, 2*FDEC_STRIDEB
; Another path: advance four rows, then write rows -2 and -1 relative to the
; advanced pointer with %1, %2.
87 add r0, 4*FDEC_STRIDEB
88 mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
89 mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
90 mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
91 mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
; Fragment of a single-register row-store helper with two code paths. The
; directive separating them (numbering gap between the two groups) is not
; visible in this view.
; Both paths write %1 to rows 4..11 relative to r0 on entry to this fragment;
; they differ in how the pointer is stepped and in the stride symbol used.
97 %if HIGH_BIT_DEPTH ; Different code paths to reduce code size
; Step 6 rows, store offsets -2..+1, step 4 more rows, store offsets -2..+1.
98 add r0, 6*FDEC_STRIDEB
99 mova [r0-2*FDEC_STRIDEB], %1
100 mova [r0-1*FDEC_STRIDEB], %1
101 mova [r0+0*FDEC_STRIDEB], %1
102 mova [r0+1*FDEC_STRIDEB], %1
103 add r0, 4*FDEC_STRIDEB
104 mova [r0-2*FDEC_STRIDEB], %1
105 mova [r0-1*FDEC_STRIDEB], %1
106 mova [r0+0*FDEC_STRIDEB], %1
107 mova [r0+1*FDEC_STRIDEB], %1
; Other path: a single 8-row step, then offsets -4..+3.
109 add r0, 8*FDEC_STRIDE
110 mova [r0-4*FDEC_STRIDE], %1
111 mova [r0-3*FDEC_STRIDE], %1
112 mova [r0-2*FDEC_STRIDE], %1
113 mova [r0-1*FDEC_STRIDE], %1
114 mova [r0+0*FDEC_STRIDE], %1
115 mova [r0+1*FDEC_STRIDE], %1
116 mova [r0+2*FDEC_STRIDE], %1
117 mova [r0+3*FDEC_STRIDE], %1
118 %endif ; HIGH_BIT_DEPTH
; PRED_H_LOAD reg, offset
; Fetches the pixel immediately to the left of row `offset` (relative to r0)
; into `reg`. Three alternative load forms are visible; the conditionals that
; choose between them, and the closing %endmacro, are not visible here.
122 %macro PRED_H_LOAD 2 ; reg, offset
; Broadcast form: source is one pixel before the start of the row.
124 vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
; movd form: loads the 4 bytes that end at the start of the row.
126 movd %1, [r0+(%2)*FDEC_STRIDEB-4]
; SPLATB_LOAD form: byte at column -1; m2 is passed as the third argument.
129 SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2
; PRED_H_STORE reg, offset, width
; Writes `reg` across row `offset` (relative to r0). The directives that pick
; between the movq form and the mova form, and that repeat the mova, are not
; visible in this view.
133 %macro PRED_H_STORE 3 ; reg, offset, width
; %%w = row width in bytes.
134 %assign %%w %3*SIZEOF_PIXEL
; 8-byte store form.
136 movq [r0+(%2)*FDEC_STRIDEB], %1
; Full-register store at byte offset %%i, with %%i stepped by one register.
140 mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
141 %assign %%i %%i+mmsize
; PRED_H_4ROWS width, inc_ptr
; Horizontal prediction for four rows using PRED_H_LOAD / PRED_H_STORE with
; m0 and m1 alternating. Only one of the loads is visible in this view.
; The row offsets of the second pair are written as N-4*inc_ptr, so with
; inc_ptr=1 they are relative to a pointer already advanced by four rows.
146 %macro PRED_H_4ROWS 2 ; width, inc_ptr
149 PRED_H_STORE m0, 0, %1
150 PRED_H_STORE m1, 1, %1
; NOTE(review): presumably emitted only when inc_ptr is set; the guarding
; directive is not visible here.
153 add r0, 4*FDEC_STRIDEB
155 PRED_H_LOAD m1, 3-4*%2
156 PRED_H_STORE m0, 2-4*%2, %1
157 PRED_H_STORE m1, 3-4*%2, %1
160 ; dest, left, right, src, tmp
161 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
162 %macro PRED8x8_LOWPASS 4-5
177 ;-----------------------------------------------------------------------------
178 ; void predict_4x4_h( pixel *src )
179 ;-----------------------------------------------------------------------------
182 cglobal predict_4x4_h, 1,1
187 ;-----------------------------------------------------------------------------
188 ; void predict_4x4_ddl( pixel *src )
189 ;-----------------------------------------------------------------------------
; Two implementations are visible: a macro-generated one (PREDICT_4x4_DDL)
; and a second cglobal that addresses the row above at byte offsets 0/2/4/6/8.
; Several lines of each body are missing from this view, so only the visible
; steps are annotated. PRED8x8_LOWPASS is the 3-tap filter
; (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2 documented at its definition.
190 %macro PREDICT_4x4_DDL 0
191 cglobal predict_4x4_ddl, 1,1
; Unaligned load of the row directly above the block.
192 movu m1, [r0-FDEC_STRIDEB]
197 pshufhw m1, m1, q2210
203 PRED8x8_LOWPASS m0, m2, m1, m0, m3
; Store to row Y; Y is set by directives not visible here.
208 movh [r0+Y*FDEC_STRIDEB], m0
; Second implementation (declares two GPRs).
221 cglobal predict_4x4_ddl, 1,2
222 movu m1, [r0-FDEC_STRIDEB+4]
; Filter with both neighbour operands taken straight from memory.
223 PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
224 mova m3, [r0-FDEC_STRIDEB+8]
; Rows are written in the order 0, 3, 1, 2.
225 mova [r0+0*FDEC_STRIDEB], m0
227 PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
228 mova [r0+3*FDEC_STRIDEB], m2
231 mova [r0+1*FDEC_STRIDEB], m1
233 PALIGNR m2, m0, 6, m0
234 mova [r0+2*FDEC_STRIDEB], m2
236 %else ; !HIGH_BIT_DEPTH
241 ;-----------------------------------------------------------------------------
242 ; void predict_4x4_vr( pixel *src )
243 ;-----------------------------------------------------------------------------
; 8-bit-only variant that assembles the neighbour vector with memory-operand
; palignr. Each "palignr m1, [row-8], 7" shifts m1 up by one byte and brings
; in the last byte of the 8 bytes ending at column 0 of that row, i.e. the
; pixel at column -1 (see the lane diagrams; they assume 8-byte registers).
; Lines between the visible instructions are missing from this view.
244 %if HIGH_BIT_DEPTH == 0
246 cglobal predict_4x4_vr, 1,1
247 movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
249 palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
251 palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
253 palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
255 palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
; 3-tap lowpass (see PRED8x8_LOWPASS); m0/m2 inputs are set by lines not shown.
256 PRED8x8_LOWPASS m2, m0, m1, m2, m3
; Rows 0/2 are stored from m4 and rows 1/3 from m2.
259 movd [r0+0*FDEC_STRIDEB], m4
261 movd [r0+1*FDEC_STRIDEB], m2
263 movd [r0+2*FDEC_STRIDEB], m4
265 movd [r0+3*FDEC_STRIDEB], m2
267 %endif ; !HIGH_BIT_DEPTH
269 ;-----------------------------------------------------------------------------
270 ; void predict_4x4_ddr( pixel *src )
271 ;-----------------------------------------------------------------------------
; Gathers the top row and the left column into registers, filters them with
; PRED8x8_LOWPASS and stores one row per shift step. Two gather sequences are
; visible, separated by "%else ; !HIGH_BIT_DEPTH"; the opening %if and several
; other lines are missing from this view.
273 cglobal predict_4x4_ddr, 1,1
; 16-bit gather: 16 bytes ending 8 bytes past the start of the row above,
; then the word at column -1 of rows 0, 1, 2 inserted into word slots 2, 1, 0.
275 movu m2, [r0-1*FDEC_STRIDEB-8]
276 pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
277 pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
278 pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
279 movhps m3, [r0+3*FDEC_STRIDEB-8]
280 %else ; !HIGH_BIT_DEPTH
; 8-bit gather: load the 4 bytes ending at column 0 of each row and
; interleave them pairwise.
281 movd m0, [r0+2*FDEC_STRIDEB-4]
282 movd m1, [r0+0*FDEC_STRIDEB-4]
283 punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
284 punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
286 movd m2, [r0-1*FDEC_STRIDEB]
294 movd m3, [r0+3*FDEC_STRIDEB-4]
296 %endif ; !HIGH_BIT_DEPTH
300 PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
301 PRED8x8_LOWPASS m0, m2, m1, m0, m3
; Row stores; Y is set by directives not visible here.
303 movh [r0+Y*FDEC_STRIDEB], m0
307 movh [r0+Y*FDEC_STRIDEB], m0
311 ;-----------------------------------------------------------------------------
312 ; void predict_4x4_vr( pixel *src )
313 ;-----------------------------------------------------------------------------
; Same neighbour gather as the visible part of predict_4x4_ddr (16-bit pinsrw
; form / 8-bit punpcklbw form), followed by a 3-tap lowpass. Rows 0 and 1 are
; stored directly; rows 2 and 3 are stored after PALIGNR has shifted m4 / m2
; by 7 pixels against m0. Lines are missing between the visible instructions.
314 cglobal predict_4x4_vr, 1,1
316 movu m1, [r0-1*FDEC_STRIDEB-8]
; Word at column -1 of rows 0, 1, 2 goes into word slots 2, 1, 0.
317 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
318 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
319 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
320 %else ; !HIGH_BIT_DEPTH
321 movd m0, [r0+2*FDEC_STRIDEB-4]
322 movd m1, [r0+0*FDEC_STRIDEB-4]
323 punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
324 punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
326 movd m1, [r0-1*FDEC_STRIDEB]
334 %endif ; !HIGH_BIT_DEPTH
339 PRED8x8_LOWPASS m2, m0, m1, m2, m3
342 movh [r0+0*FDEC_STRIDEB], m4
343 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
344 movh [r0+1*FDEC_STRIDEB], m2
346 movh [r0+2*FDEC_STRIDEB], m4
; Last use of m0, so it doubles as the PALIGNR scratch register here.
347 PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
348 movh [r0+3*FDEC_STRIDEB], m2
351 ;-----------------------------------------------------------------------------
352 ; void predict_4x4_hd( pixel *src )
353 ;-----------------------------------------------------------------------------
; Builds the vector "t2 t1 t0 lt l0 l1 l2 l3" (see lane comments), derives the
; two shifted copies and feeds them to PRED8x8_LOWPASS. %3 / %4 are punpck
; suffixes supplied by the enclosing PREDICT_4x4 macro, which is closed at the
; end of this block. The conditionals around the two gather sequences and some
; other lines are missing from this view.
354 cglobal predict_4x4_hd, 1,1
; 16-bit gather: row above, then the word at column -1 of rows 0..3 inserted
; into word slots 3..0.
356 movu m1, [r0-1*FDEC_STRIDEB-8]
358 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
359 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
360 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
361 pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
; 8-bit gather.
363 movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
364 punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
365 PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
366 movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
367 punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
368 movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
369 punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
370 punpckh%3 m1, m2 ; l0 l1 l2 l3
371 punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; Copies shifted down by one and two pixels for the 3-tap filter.
373 PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
374 PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
376 PRED8x8_LOWPASS m3, m1, m0, m2, m4
379 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
; Row stores; Y is set by directives not visible here.
381 movh [r0+Y*FDEC_STRIDEB], m5
385 movh [r0+Y*FDEC_STRIDEB], m5
387 movh [r0+0*FDEC_STRIDEB], m3
389 %endmacro ; PREDICT_4x4
391 ;-----------------------------------------------------------------------------
392 ; void predict_4x4_ddr( pixel *src )
393 ;-----------------------------------------------------------------------------
; Variant that works on 16-bit words with 8-byte (movq) row stores. The left
; column is gathered by interleaving the high words of the 8-byte blocks that
; end at column 0 of each row. PRED8x8_LOWPASS is called without a scratch
; register argument. Lines are missing between the visible instructions.
396 cglobal predict_4x4_ddr, 1,1
397 mova m0, [r0+1*FDEC_STRIDEB-8]
398 punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
399 mova m3, [r0+3*FDEC_STRIDEB-8]
400 punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
; Word at column -1 of the row above goes into the top word slot.
404 pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
406 PRED8x8_LOWPASS m0, m1, m3, m0
407 movq [r0+3*FDEC_STRIDEB], m0
; Top row.
409 movq m2, [r0-1*FDEC_STRIDEB-0]
411 pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
413 PALIGNR m4, m3, 6, m3
414 PRED8x8_LOWPASS m1, m4, m2, m1
415 movq [r0+0*FDEC_STRIDEB], m1
420 PALIGNR m1, m0, 6, m0
421 movq [r0+1*FDEC_STRIDEB], m1
422 movq [r0+2*FDEC_STRIDEB], m2
; Rewrites the upper 4 bytes of row 3.
423 movd [r0+3*FDEC_STRIDEB+4], m1
426 ;-----------------------------------------------------------------------------
427 ; void predict_4x4_hd( pixel *src )
428 ;-----------------------------------------------------------------------------
; Variant that works on 16-bit words. The left column is gathered by
; interleaving the high words of the 8-byte blocks ending at column 0 of each
; row; the row above is loaded starting one word to the left (column -1).
; Rows are written in the order 3, 1, 2, 0. Many lines are missing from this
; view, including those that produce m4, m5 and m7.
429 cglobal predict_4x4_hd, 1,1
430 mova m0, [r0+1*FDEC_STRIDEB-8]
431 punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
432 mova m1, [r0+3*FDEC_STRIDEB-8]
433 punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
437 movu m3, [r0-1*FDEC_STRIDEB-2]
441 PALIGNR m3, m1, 2, m2
442 PRED8x8_LOWPASS m2, m4, m1, m3
447 mova [r0+3*FDEC_STRIDEB], m5
448 mova [r0+1*FDEC_STRIDEB], m4
451 mova [r0+2*FDEC_STRIDEB], m5
454 mova m6, [r0-1*FDEC_STRIDEB+0]
455 pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
456 PRED8x8_LOWPASS m3, m4, m6, m7
457 PALIGNR m3, m0, 6, m0
458 mova [r0+0*FDEC_STRIDEB], m3
; Instantiations of PREDICT_4x4. The four arguments are instruction suffixes
; (word-based for the first group, byte-based after the %else). The INIT_*
; lines that select the instruction set for each instantiation are not
; visible in this view.
462 PREDICT_4x4 w, wd, dq, qdq
464 PREDICT_4x4 w, wd, dq, qdq
466 PREDICT_4x4 w, wd, dq, qdq
467 %else ; !HIGH_BIT_DEPTH
469 PREDICT_4x4 b, bw, wd, dq
; Alias so that the next instantiation's predict_4x4_vr is emitted under the
; _cache64 name.
471 %define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
472 PREDICT_4x4 b, bw, wd, dq
475 ;-----------------------------------------------------------------------------
476 ; void predict_4x4_hu( pixel *src )
477 ;-----------------------------------------------------------------------------
; Two implementations separated by "%else ; !HIGH_BIT_DEPTH": a 16-bit one
; (punpckhwd gather, movq/mova stores) and an 8-bit one (punpcklbw gather,
; movd stores). Both gather the left column of rows 0..3 and apply
; PRED8x8_LOWPASS; row 3 is stored straight from the gathered register.
; Lines are missing between the visible instructions.
480 cglobal predict_4x4_hu_mmx2, 1,1
; Interleave the high words of the 8-byte blocks ending at column 0.
481 movq m0, [r0+0*FDEC_STRIDEB-8]
482 punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
483 movq m1, [r0+2*FDEC_STRIDEB-8]
484 punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
487 movq [r0+3*FDEC_STRIDEB], m1
491 PRED8x8_LOWPASS m3, m0, m4, m3
493 mova [r0+0*FDEC_STRIDEB], m4
497 mova [r0+1*FDEC_STRIDEB], m2
499 mova [r0+2*FDEC_STRIDEB], m2
502 %else ; !HIGH_BIT_DEPTH
504 cglobal predict_4x4_hu_mmx2, 1,1
; Interleave the 4-byte blocks ending at column 0.
505 movd m1, [r0+0*FDEC_STRIDEB-4]
506 punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
507 movd m0, [r0+2*FDEC_STRIDEB-4]
508 punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
520 PRED8x8_LOWPASS m3, m0, m2, m3, m4
521 movd [r0+3*FDEC_STRIDEB], m1
523 movd [r0+0*FDEC_STRIDEB], m5
525 movd [r0+1*FDEC_STRIDEB], m5
527 movd [r0+2*FDEC_STRIDEB], m5
529 %endif ; HIGH_BIT_DEPTH
531 ;-----------------------------------------------------------------------------
532 ; void predict_4x4_vl( pixel *src )
533 ;-----------------------------------------------------------------------------
; Two implementations are visible. In both, rows 0/2 are stored from one
; register and rows 1/3 from the PRED8x8_LOWPASS result. The second one
; (declares four GPRs) additionally reads three words of the row above into
; r1-r3 and patches the last word of rows 2 and 3 with scalar stores. Lines
; are missing between the visible instructions.
534 %macro PREDICT_4x4_V1 1
535 cglobal predict_4x4_vl, 1,1
; Unaligned load of the row directly above the block.
536 movu m1, [r0-FDEC_STRIDEB]
540 PRED8x8_LOWPASS m0, m1, m2, m3, m5
542 movh [r0+0*FDEC_STRIDEB], m4
543 movh [r0+1*FDEC_STRIDEB], m0
546 movh [r0+2*FDEC_STRIDEB], m4
547 movh [r0+3*FDEC_STRIDEB], m0
558 cglobal predict_4x4_vl, 1,4
; Row above: bytes 0-7 and bytes 8-15.
559 mova m1, [r0-FDEC_STRIDEB+0]
560 mova m2, [r0-FDEC_STRIDEB+8]
562 PALIGNR m2, m1, 4, m4
563 PALIGNR m0, m1, 2, m4
566 mova [r0+0*FDEC_STRIDEB], m3
568 mova [r0+2*FDEC_STRIDEB], m3
569 PRED8x8_LOWPASS m0, m1, m2, m0
570 mova [r0+1*FDEC_STRIDEB], m0
572 mova [r0+3*FDEC_STRIDEB], m0
; Scalar path: words at byte offsets 8, 10, 12 of the row above.
574 movzx r1d, word [r0-FDEC_STRIDEB+ 8]
575 movzx r2d, word [r0-FDEC_STRIDEB+10]
576 movzx r3d, word [r0-FDEC_STRIDEB+12]
; Overwrite the word at byte offset 6 of rows 2 and 3.
582 mov [r0+2*FDEC_STRIDEB+6], r1w
583 mov [r0+3*FDEC_STRIDEB+6], r3w
585 %else ; !HIGH_BIT_DEPTH
590 ;-----------------------------------------------------------------------------
591 ; void predict_4x4_dc( pixel *src )
592 ;-----------------------------------------------------------------------------
; Two implementations separated by "%else ; !HIGH_BIT_DEPTH". Both read the
; row above and the left column and write one value to all four rows. The
; arithmetic that reduces the neighbours to that value is on lines missing
; from this view.
595 cglobal predict_4x4_dc, 1,1
; Word-wise sum of the four-pixel blocks ending at column 0 of rows 0..3.
596 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
597 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
598 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
599 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
; Row above.
601 mova m0, [r0-FDEC_STRIDEB]
607 mova [r0+0*FDEC_STRIDEB], m0
608 mova [r0+1*FDEC_STRIDEB], m0
609 mova [r0+2*FDEC_STRIDEB], m0
610 mova [r0+3*FDEC_STRIDEB], m0
613 %else ; !HIGH_BIT_DEPTH
614 cglobal predict_4x4_dc, 1,4
; Row above (4 bytes).
616 movd mm0, [r0-FDEC_STRIDEB]
; Left neighbour of row 0, then of row Y (Y is set by directives not visible).
619 movzx r1d, byte [r0-1]
622 movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
; One 4-byte scalar store per row.
629 mov [r0+FDEC_STRIDEB*0], r1d
630 mov [r0+FDEC_STRIDEB*1], r1d
631 mov [r0+FDEC_STRIDEB*2], r1d
632 mov [r0+FDEC_STRIDEB*3], r1d
634 %endif ; HIGH_BIT_DEPTH
; PREDICT_FILTER takes four instruction-suffix arguments (see the
; instantiations at the end of this block); %1 and %2 are pasted onto
; punpckh / pshuf / register names below. Many lines, including the
; branches on the i_neighbor / i_filters arguments and the definitions of
; t1 and t4, are missing from this view.
636 %macro PREDICT_FILTER 4
637 ;-----------------------------------------------------------------------------
638 ; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
639 ;-----------------------------------------------------------------------------
640 cglobal predict_8x8_filter, 4,6,6
; r0 is biased by 0x58 pixels and `src` is defined to undo that bias, so
; [src+...] still addresses relative to the incoming pointer.
; NOTE(review): presumably done to keep displacements small; confirm.
641 add r0, 0x58*SIZEOF_PIXEL
642 %define src r0-0x58*SIZEOF_PIXEL
; Left column: interleave the high parts of the 8-pixel blocks ending at
; column 0. For the first pair the second row is chosen at runtime via t4.
656 mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
657 punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
658 mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
659 punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
661 mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
662 punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
663 mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
664 punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
; Neighbours on either side of the gathered column: row 7's left block and
; the row above.
667 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
668 mova m1, [src-1*FDEC_STRIDEB]
669 PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0
670 PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2
671 PRED8x8_LOWPASS m3, m1, m4, m3, m5
; Filtered result is written at pixel index 8 of the buffer addressed by t1.
; NOTE(review): t1 presumably aliases the edge[] argument; confirm.
672 mova [t1+8*SIZEOF_PIXEL], m3
; Scalar handling of the last two left pixels (rows 7 and 6); the arithmetic
; between the loads and the stores is not visible. Both stores write t4.
673 movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
674 movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
678 mov [t1+7*SIZEOF_PIXEL], t4%1
679 mov [t1+6*SIZEOF_PIXEL], t4%1
; Top row, 8-bit SSSE3 path: one register holds top and top-right.
683 %if SIZEOF_PIXEL==1 && cpuflag(ssse3)
685 movu m3, [src-1*FDEC_STRIDEB]
686 movhps m0, [src-1*FDEC_STRIDEB-8]
695 pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
698 PALIGNR m2, m3, m0, 15, m0
699 PALIGNR m1, m3, 1, m5
700 PRED8x8_LOWPASS m0, m2, m1, m3, m5
701 mova [t1+16*SIZEOF_PIXEL], m0
703 movd [t1+32*SIZEOF_PIXEL], m0
; Top row, generic path: top-left, top and top-right blocks loaded
; separately; results go to pixel indices 16, 24 and 32.
711 mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
712 mova m3, [src-1*FDEC_STRIDEB]
713 mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
719 PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
720 PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
721 PRED8x8_LOWPASS m4, m2, m0, m3, m5
722 mova [t1+16*SIZEOF_PIXEL], m4
726 PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
727 PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
728 PRED8x8_LOWPASS m0, m2, m5, m1, m4
729 mova [t1+24*SIZEOF_PIXEL], m0
731 movd [t1+32*SIZEOF_PIXEL], m0
; Replicate the highest element of m3 across m1.
739 punpckh%1%2 m1, m3, m3
740 pshuf%2 m1, m1, q3333
; Instantiations: word-based suffixes, then byte-based suffixes. The INIT_*
; lines and conditionals around them are not visible in this view.
747 PREDICT_FILTER w, d, q, dq
749 PREDICT_FILTER w, d, q, dq
751 PREDICT_FILTER w, d, q, dq
754 PREDICT_FILTER b, w, d, q
756 PREDICT_FILTER b, w, d, q
759 ;-----------------------------------------------------------------------------
760 ; void predict_8x8_v( pixel *src, pixel *edge )
761 ;-----------------------------------------------------------------------------
; Loads from pixel index 16 of the second argument (edge). The stores that
; follow, and the closing %endmacro, are not visible in this view.
762 %macro PREDICT_8x8_V 0
763 cglobal predict_8x8_v, 2,2
764 mova m0, [r1+16*SIZEOF_PIXEL]
777 ;-----------------------------------------------------------------------------
778 ; void predict_8x8_h( pixel *src, pixel edge[36] )
779 ;-----------------------------------------------------------------------------
; Loads starting at pixel index 7 of edge, then stores one row per SPLAT
; result. The second macro argument is a SPLAT suffix. The directives that
; define i and Y, and the closing %endmacro, are not visible in this view.
780 %macro PREDICT_8x8_H 2
781 cglobal predict_8x8_h, 2,2
782 movu m1, [r1+7*SIZEOF_PIXEL]
; Centre r0 so that rows are addressed as offsets (Y-4).
783 add r0, 4*FDEC_STRIDEB
; Broadcast element (3-Y)&3 of register m<i> and store it to row Y.
789 SPLAT%2 m0, m %+ i, (3-Y)&3
790 mova [r0+(Y-4)*FDEC_STRIDEB], m0
804 ;-----------------------------------------------------------------------------
805 ; void predict_8x8_dc( pixel *src, pixel *edge );
806 ;-----------------------------------------------------------------------------
; Only the entry points and macro invocations of the DC predictors are
; visible here; their bodies are on lines missing from this view.
809 cglobal predict_8x8_dc, 2,2
819 %else ; !HIGH_BIT_DEPTH
821 cglobal predict_8x8_dc, 2,2
833 %endif ; HIGH_BIT_DEPTH
835 ;-----------------------------------------------------------------------------
836 ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
837 ; void predict_8x8_dc_left( pixel *src, pixel *edge );
838 ;-----------------------------------------------------------------------------
; PREDICT_8x8_DC is defined twice: a 3-argument form (name, number, load
; instruction) and, after the %else, a 2-argument form (name, number).
; NOTE(review): the number is presumably an offset into edge[] (32/14 in the
; first form vs 16/7 in the second); confirm against the macro body.
840 %macro PREDICT_8x8_DC 3
851 PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
852 PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
854 %else ; !HIGH_BIT_DEPTH
855 %macro PREDICT_8x8_DC 2
867 PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
868 PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
869 %endif ; HIGH_BIT_DEPTH
871 ; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
872 ; size on the 8-bit mmx functions below if we know sse2 is available.
; PREDICT_8x8_DDLR emits predict_8x8_ddl and predict_8x8_ddr. Each loads two
; aligned blocks of edge[] plus copies shifted by one pixel either way (via
; palignr, or via unaligned loads in the alternative path), runs
; PRED8x8_LOWPASS on both halves, and stores rows while shifting the result
; pair with PALIGNR. The conditionals selecting the palignr path and the
; directives defining Y are not visible in this view.
873 %macro PREDICT_8x8_DDLR 0
874 ;-----------------------------------------------------------------------------
875 ; void predict_8x8_ddl( pixel *src, pixel *edge )
876 ;-----------------------------------------------------------------------------
877 cglobal predict_8x8_ddl, 2,2,7
878 mova m0, [r1+16*SIZEOF_PIXEL]
879 mova m1, [r1+24*SIZEOF_PIXEL]
; palignr path: derive the shifted neighbours from the aligned loads.
881 movd m5, [r1+32*SIZEOF_PIXEL]
882 palignr m3, m1, m0, 1*SIZEOF_PIXEL
883 palignr m5, m5, m1, 1*SIZEOF_PIXEL
884 palignr m4, m1, m0, 7*SIZEOF_PIXEL
; Alternative path: load the shifted neighbours directly.
886 movu m3, [r1+17*SIZEOF_PIXEL]
887 movu m4, [r1+23*SIZEOF_PIXEL]
888 movu m5, [r1+25*SIZEOF_PIXEL]
891 add r0, FDEC_STRIDEB*4
892 PRED8x8_LOWPASS m0, m2, m3, m0, m6
893 PRED8x8_LOWPASS m1, m4, m5, m1, m6
894 mova [r0+3*FDEC_STRIDEB], m1
897 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
899 mova [r0+Y*FDEC_STRIDEB], m1
; Last use of m0, so it doubles as the PALIGNR scratch register here.
902 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
903 mova [r0+Y*FDEC_STRIDEB], m1
906 ;-----------------------------------------------------------------------------
907 ; void predict_8x8_ddr( pixel *src, pixel *edge )
908 ;-----------------------------------------------------------------------------
909 cglobal predict_8x8_ddr, 2,2,7
910 add r0, FDEC_STRIDEB*4
911 mova m0, [r1+ 8*SIZEOF_PIXEL]
912 mova m1, [r1+16*SIZEOF_PIXEL]
913 ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cache-split
914 movu m2, [r1+ 7*SIZEOF_PIXEL]
915 movu m5, [r1+17*SIZEOF_PIXEL]
; palignr path.
917 palignr m3, m1, m0, 1*SIZEOF_PIXEL
918 palignr m4, m1, m0, 7*SIZEOF_PIXEL
; Alternative path.
920 movu m3, [r1+ 9*SIZEOF_PIXEL]
921 movu m4, [r1+15*SIZEOF_PIXEL]
923 PRED8x8_LOWPASS m0, m2, m3, m0, m6
924 PRED8x8_LOWPASS m1, m4, m5, m1, m6
925 mova [r0+3*FDEC_STRIDEB], m0
928 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
930 mova [r0+Y*FDEC_STRIDEB], m1
933 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
934 mova [r0+Y*FDEC_STRIDEB], m1
936 %endmacro ; PREDICT_8x8_DDLR
; Instantiation fragments; the macro invocations themselves are not visible.
943 INIT_XMM ssse3, cache64
945 %elif ARCH_X86_64 == 0
950 ;-----------------------------------------------------------------------------
951 ; void predict_8x8_hu( pixel *src, pixel *edge )
952 ;-----------------------------------------------------------------------------
; PREDICT_8x8_HU %1, %2: %1 is a pshuf suffix and %2 a punpck suffix.
; The left column is loaded from edge[7..] and put into reversed order
; (pshufb with pw_reverse, or pshuflw q0123 on two halves, or the shift/or
; sequence in the 8-bit path). After PRED8x8_LOWPASS the two results are
; interleaved into p1..p8 and each successive row is the previous one shifted
; by two pixels; the last three rows come from pshuf patterns that repeat the
; top element. Conditionals and several instructions are missing from this
; view.
953 %macro PREDICT_8x8_HU 2
954 cglobal predict_8x8_hu, 2,2,8
955 add r0, 4*FDEC_STRIDEB
; Word reversal in one shuffle.
958 movu m5, [r1+7*SIZEOF_PIXEL]
959 pshufb m5, [pw_reverse]
; Word reversal done per 4-word half.
961 movq m6, [r1+7*SIZEOF_PIXEL]
962 movq m5, [r1+11*SIZEOF_PIXEL]
963 pshuflw m6, m6, q0123
964 pshuflw m5, m5, q0123
969 pshufhw m2, m2, q2210
970 pshufhw m3, m3, q1110
972 %else ; !HIGH_BIT_DEPTH
973 movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
974 pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
975 psllq m1, 56 ; l7 .. .. .. .. .. .. ..
982 mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
985 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
987 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
989 %endif ; !HIGH_BIT_DEPTH
990 PRED8x8_LOWPASS m2, m3, m5, m2, m6
991 punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
992 punpckl%2 m4, m2 ; p4 p3 p2 p1
; Rows -3..-1: the m0:m4 pair shifted by 2, 4, 6 pixels.
; Rows 1..3: m0 with its top element progressively replicated.
993 PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
994 pshuf%1 m1, m0, q3321
995 PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
996 pshuf%1 m2, m0, q3332
997 PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
998 pshuf%1 m3, m0, q3333
999 mova [r0-4*FDEC_STRIDEB], m4
1000 mova [r0-3*FDEC_STRIDEB], m5
1001 mova [r0-2*FDEC_STRIDEB], m6
1002 mova [r0-1*FDEC_STRIDEB], m7
1003 mova [r0+0*FDEC_STRIDEB], m0
1004 mova [r0+1*FDEC_STRIDEB], m1
1005 mova [r0+2*FDEC_STRIDEB], m2
1006 mova [r0+3*FDEC_STRIDEB], m3
; Instantiations; the INIT_* lines and the closing %endmacro / %if lines are
; not visible in this view.
1012 PREDICT_8x8_HU d, wd
1014 PREDICT_8x8_HU d, wd
1016 PREDICT_8x8_HU d, wd
1017 %elif ARCH_X86_64 == 0
1019 PREDICT_8x8_HU w, bw
1022 ;-----------------------------------------------------------------------------
1023 ; void predict_8x8_vr( pixel *src, pixel *edge )
1024 ;-----------------------------------------------------------------------------
; Loads the block at edge[16] and the copies shifted by one and two pixels
; (palignr against edge[8] when cpuname is ssse3, unaligned loads otherwise),
; filters with PRED8x8_LOWPASS, stores the first two rows, then filters
; edge[8..] and stores the remaining rows via PALIGNR shifts. The lines that
; compute m4 and define Y are not visible in this view.
1025 %macro PREDICT_8x8_VR 1
1026 cglobal predict_8x8_vr, 2,3
1027 mova m2, [r1+16*SIZEOF_PIXEL]
1028 %ifidn cpuname, ssse3
1029 mova m0, [r1+8*SIZEOF_PIXEL]
1030 palignr m3, m2, m0, 7*SIZEOF_PIXEL
1031 palignr m1, m2, m0, 6*SIZEOF_PIXEL
; Non-ssse3 alternative: same data via unaligned loads.
1033 movu m3, [r1+15*SIZEOF_PIXEL]
1034 movu m1, [r1+14*SIZEOF_PIXEL]
1037 add r0, FDEC_STRIDEB*4
1038 PRED8x8_LOWPASS m3, m1, m2, m3, m5
1039 mova [r0-4*FDEC_STRIDEB], m4
1040 mova [r0-3*FDEC_STRIDEB], m3
1041 mova m1, [r1+8*SIZEOF_PIXEL]
1044 PRED8x8_LOWPASS m0, m1, m2, m0, m6
1048 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
1049 mova [r0+Y*FDEC_STRIDEB], m4
; Last use of m0, so it doubles as the PALIGNR scratch register here.
1054 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
1055 mova [r0+Y*FDEC_STRIDEB], m4
1066 %elif ARCH_X86_64 == 0
; LOAD_PLANE_ARGS: brings three arguments into m0, m2 and m4. In the visible
; branch (AVX2 on 32-bit x86) each is broadcast as a word from r1m / r2m /
; r3m. The remaining branches and the %endmacro are not visible in this view.
; NOTE(review): presumably these are the i00, b, c arguments of the
; *_p_core functions; confirm against the callers.
1071 %macro LOAD_PLANE_ARGS 0
1072 %if cpuflag(avx2) && ARCH_X86_64 == 0
1073 vpbroadcastw m0, r1m
1074 vpbroadcastw m2, r2m
1075 vpbroadcastw m4, r3m
1076 %elif mmsize == 8 ; MMX is only used on x86_32
1090 ;-----------------------------------------------------------------------------
1091 ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
1092 ;-----------------------------------------------------------------------------
; MMX plane predictor, built only for 32-bit x86 with 8-bit pixels.
; The macro argument is the block height (instantiated with 8 and 16) and is
; pasted into the function name. Only the setup of the per-column terms is
; visible; the row loop is on lines missing from this view.
1093 %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
1094 %macro PREDICT_CHROMA_P_MMX 1
1095 cglobal predict_8x%1c_p_core, 1,2
; Scale by the column index table.
1098 pmullw m2, [pw_0to15]
; Saturating adds build the first-row values for columns 0-3 and 4-7.
1100 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
1101 paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
1118 %endmacro ; PREDICT_CHROMA_P_MMX
1121 PREDICT_CHROMA_P_MMX 8
1122 PREDICT_CHROMA_P_MMX 16
1123 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
; PREDICT_CHROMA_P height: plane predictor for 8-wide chroma blocks; the
; argument is pasted into the function name and used to derive the loop
; count. Two bodies are visible, separated by "%else ; !HIGH_BIT_DEPTH".
; The loop labels, the arithmetic inside the loop and several conditionals
; are missing from this view.
1125 %macro PREDICT_CHROMA_P 1
1127 cglobal predict_8x%1c_p_core, 1,2,7
1129 mova m3, [pw_pixel_max]
1131 pmullw m2, [pw_43210123] ; b
; Two alternative scalings of c (by pw_m7 or pw_m3); the selecting
; conditional is not visible.
1133 pmullw m5, m4, [pw_m7] ; c
1135 pmullw m5, m4, [pw_m3]
; Loop count: height divided by the number of rows handled per iteration.
1143 mov r1d, %1/(mmsize/16)
; Two-rows-per-iteration form: upper 128-bit lane goes to row 0, lower lane
; to row 1.
1151 vextracti128 [r0], m6, 1
1152 mova [r0+FDEC_STRIDEB], xm6
1153 add r0, 2*FDEC_STRIDEB
; One-row-per-iteration form.
1156 add r0, FDEC_STRIDEB
1161 %else ; !HIGH_BIT_DEPTH
1162 cglobal predict_8x%1c_p_core, 1,2
1165 vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1167 mova xm1, xm4 ; zero upper half
1171 pmullw m2, [pw_0to15]
1173 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1176 mov r1d, %1/(mmsize/8)
; Four-rows-per-iteration form: the low lane supplies rows 1 and 3, the
; extracted upper lane rows 0 and 2.
1184 movq [r0+FDEC_STRIDE*1], xm2
1185 movhps [r0+FDEC_STRIDE*3], xm2
1186 vextracti128 xm2, m2, 1
1187 movq [r0+FDEC_STRIDE*0], xm2
1188 movhps [r0+FDEC_STRIDE*2], xm2
; Two-rows-per-iteration form.
1190 movq [r0+FDEC_STRIDE*0], xm2
1191 movhps [r0+FDEC_STRIDE*1], xm2
; Advance by the number of rows written (mmsize/8).
1193 add r0, FDEC_STRIDE*mmsize/8
1197 %endif ; HIGH_BIT_DEPTH
1198 %endmacro ; PREDICT_CHROMA_P
1210 ;-----------------------------------------------------------------------------
1211 ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1212 ;-----------------------------------------------------------------------------
; MMX plane predictor, built only for 8-bit pixels on 32-bit x86. Only the
; setup of the four per-column registers (columns 0-3, 4-7, 8-11, 12-15) is
; visible; the row loop is on lines missing from this view.
1213 %if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
1215 cglobal predict_16x16_p_core, 1,2
1219 pmullw mm5, [pw_0to15]
1223 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
1224 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1225 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
1226 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
1253 %endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
; PREDICT_16x16_P: plane predictor for 16x16 blocks with two bodies separated
; by "%else ; !HIGH_BIT_DEPTH". The first clamps m4/m5 with CLIPW between
; [pb_0] and [pw_pixel_max] and advances one row at a time; the second
; stores two rows per step. The loop structure and most of the arithmetic
; are on lines missing from this view.
1255 %macro PREDICT_16x16_P 0
1256 cglobal predict_16x16_p_core, 1,2,8
; Scale by the column index table.
1263 pmullw m3, m1, [pw_0to15]
1278 CLIPW m4, [pb_0], [pw_pixel_max]
1279 CLIPW m5, [pb_0], [pw_pixel_max]
1282 add r0, FDEC_STRIDEB
1284 %else ; !HIGH_BIT_DEPTH
1285 paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1286 paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1299 mova [r0+FDEC_STRIDE*0], m3
1300 mova [r0+FDEC_STRIDE*1], m5
1303 add r0, FDEC_STRIDE*2
1304 %endif ; !HIGH_BIT_DEPTH
1308 %endmacro ; PREDICT_16x16_P
1312 %if HIGH_BIT_DEPTH == 0
; Stand-alone predict_16x16_p_core that uses 256-bit instructions
; (vbroadcasti128 / vextracti128). The vector-register count passed to
; cglobal is 8 when HIGH_BIT_DEPTH is 1 and 0 otherwise. Two bodies are
; visible, separated by "%else ; !HIGH_BIT_DEPTH"; the opening %if, the loop
; structure and most of the arithmetic are missing from this view.
1318 cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
1321 pmullw m2, [pw_0to15]
1324 mova m7, [pw_pixel_max]
; Two rows per step.
1335 mova [r0+0*FDEC_STRIDEB], m1
1337 mova [r0+1*FDEC_STRIDEB], m3
1339 add r0, 2*FDEC_STRIDEB
1340 %else ; !HIGH_BIT_DEPTH
1341 vbroadcasti128 m1, [pw_0to15]
1342 mova xm3, xm4 ; zero high bits
1346 paddsw m0, m1 ; X+1*C X+0*C
1347 paddsw m1, m0, m2 ; Y+1*C Y+0*C
; Each packed register holds two rows, one per 128-bit lane: the upper lane
; is stored to the earlier row, the lower lane to the following row.
1355 packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C
1356 vextracti128 [r0+0*FDEC_STRIDE], m2, 1
1357 mova [r0+1*FDEC_STRIDE], xm2
1362 packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C
1363 vextracti128 [r0+2*FDEC_STRIDE], m2, 1
1364 mova [r0+3*FDEC_STRIDE], xm2
; Four rows per step.
1365 add r0, FDEC_STRIDE*4
1366 %endif ; !HIGH_BIT_DEPTH
; 8-bit-only PREDICT_8x8 macro: emits predict_8x8_ddl, _ddr (skipped when
; cpuname is ssse3), _vl and _vr. Each function centres r0 four rows into the
; block, filters edge data with PRED8x8_LOWPASS and stores 8-byte rows. The
; loads, the shifts between stores and the directives defining Y are on
; lines missing from this view.
1371 %if HIGH_BIT_DEPTH == 0
1372 %macro PREDICT_8x8 0
1373 ;-----------------------------------------------------------------------------
1374 ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
1375 ;-----------------------------------------------------------------------------
1376 cglobal predict_8x8_ddl, 2,2
1378 %ifidn cpuname, ssse3
1385 add r0, FDEC_STRIDE*4
1386 PRED8x8_LOWPASS m0, m1, m2, m0, m3
1391 movq [r0+Y*FDEC_STRIDE], m0
1396 %ifnidn cpuname, ssse3
1397 ;-----------------------------------------------------------------------------
1398 ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
1399 ;-----------------------------------------------------------------------------
1400 cglobal predict_8x8_ddr, 2,2
1404 add r0, FDEC_STRIDE*4
1405 PRED8x8_LOWPASS m0, m1, m2, m0, m3
; Rows are written in pairs: row Y from m0 and row Y-1 from m1.
1410 movq [r0+Y*FDEC_STRIDE], m0
1411 movq [r0+(Y-1)*FDEC_STRIDE], m1
1416 movq [r0-3*FDEC_STRIDE], m0
1417 movq [r0-4*FDEC_STRIDE], m1
1420 ;-----------------------------------------------------------------------------
1421 ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
1422 ;-----------------------------------------------------------------------------
1423 cglobal predict_8x8_vl, 2,2
1428 add r0, FDEC_STRIDE*4
1429 PRED8x8_LOWPASS m0, m1, m2, m0, m5
1430 ; m0: (t0 + 2*t1 + t2 + 2) >> 2
1431 ; m3: (t0 + t1 + 1) >> 1
; Row Y takes the 2-tap average (m3), row Y+1 the 3-tap result (m0).
1436 movq [r0+ Y *FDEC_STRIDE], m3
1437 movq [r0+(Y+1)*FDEC_STRIDE], m0
1442 movq [r0+ Y *FDEC_STRIDE], m3
1443 movq [r0+(Y+1)*FDEC_STRIDE], m0
1447 ;-----------------------------------------------------------------------------
1448 ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
1449 ;-----------------------------------------------------------------------------
1450 cglobal predict_8x8_vr, 2,2
1452 add r0, 4*FDEC_STRIDE
1456 PRED8x8_LOWPASS m0, m2, m1, m0, m4
; The first two rows come from the high halves of m3 and m0.
1457 movhps [r0-4*FDEC_STRIDE], m3
1458 movhps [r0-3*FDEC_STRIDE], m0
1461 pshufb m0, [shuf_vr]
1471 shufps m1, m2, q3210
; Remaining rows are written bottom-up, two per step.
1476 movq [r0+3*FDEC_STRIDE], m0
1477 movq [r0+2*FDEC_STRIDE], m3
1480 movq [r0+1*FDEC_STRIDE], m0
1481 movq [r0+0*FDEC_STRIDE], m3
1484 movq [r0-1*FDEC_STRIDE], m0
1485 movq [r0-2*FDEC_STRIDE], m3
1487 %endmacro ; PREDICT_8x8
1496 %endif ; !HIGH_BIT_DEPTH
1498 ;-----------------------------------------------------------------------------
1499 ; void predict_8x8_vl( pixel *src, pixel *edge )
1500 ;-----------------------------------------------------------------------------
; Loads edge[16..] and edge[24..], then writes the eight rows interleaved:
; even offsets (-4, -2, 0, 2) take the m7:m6 pair shifted by 0..3 pixels and
; odd offsets (-3, -1, 1, 3) take the PRED8x8_LOWPASS pair m1:m0 shifted by
; 1..4 pixels. The lines that compute m6, m7, m4 and m5, and the closing
; %endmacro, are missing from this view.
1501 %macro PREDICT_8x8_VL_10 1
1502 cglobal predict_8x8_vl, 2,2,8
1503 mova m0, [r1+16*SIZEOF_PIXEL]
1504 mova m1, [r1+24*SIZEOF_PIXEL]
1505 PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4
1509 add r0, FDEC_STRIDEB*4
1510 mova [r0-4*FDEC_STRIDEB], m6
1511 PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
1512 mova [r0-2*FDEC_STRIDEB], m3
1513 PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
1514 mova [r0+0*FDEC_STRIDEB], m3
1515 PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
1516 mova [r0+2*FDEC_STRIDEB], m7
1517 PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
1519 PRED8x8_LOWPASS m0, m5, m2, m0, m7
1520 PRED8x8_LOWPASS m1, m3, m4, m1, m7
1521 PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
1522 mova [r0-3*FDEC_STRIDEB], m4
1523 PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
1524 mova [r0-1*FDEC_STRIDEB], m4
1525 PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
1526 mova [r0+1*FDEC_STRIDEB], m4
1527 PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
1528 mova [r0+3*FDEC_STRIDEB], m1
1543 ;-----------------------------------------------------------------------------
1544 ; void predict_8x8_hd( pixel *src, pixel *edge )
1545 ;-----------------------------------------------------------------------------
; PREDICT_8x8_HD %1, %2: %2 is the punpck suffix used to interleave the two
; filtered left-column vectors into p1..p8. Rows are written bottom-up:
; offset +3 holds p4..p1, each row above is shifted by a further two pixels
; through the m2:m3 pair, then through the m1:m2 pair (m1 is the filtered top
; row). The line that computes m3 and the %else / %endif around the
; non-ssse3 loads are missing from this view.
1546 %macro PREDICT_8x8_HD 2
1547 cglobal predict_8x8_hd, 2,2
1548 add r0, 4*FDEC_STRIDEB
1549 mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
1550 movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
1551 %ifidn cpuname, ssse3
1552 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
1553 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0
1554 palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt
1555 palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5
; Non-ssse3 alternative: same vectors via unaligned loads.
1557 movu m2, [r1+15*SIZEOF_PIXEL]
1558 movu m4, [r1+ 9*SIZEOF_PIXEL]
1561 PRED8x8_LOWPASS m0, m4, m1, m0, m5
1562 PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1
1563 PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0
1564 PRED8x8_LOWPASS m1, m4, m2, m1, m5
1566 punpckh%2 m2, m3, m0 ; p8 p7 p6 p5
1567 punpckl%2 m3, m0 ; p4 p3 p2 p1
1568 mova [r0+3*FDEC_STRIDEB], m3
1569 PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5
1570 mova [r0+2*FDEC_STRIDEB], m0
1571 PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5
1572 mova [r0+1*FDEC_STRIDEB], m0
; Last use of m3, so it doubles as the PALIGNR scratch register here.
1573 PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3
1574 mova [r0+0*FDEC_STRIDEB], m0
1575 mova [r0-1*FDEC_STRIDEB], m2
1576 PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5
1577 mova [r0-2*FDEC_STRIDEB], m0
1578 PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5
1579 mova [r0-3*FDEC_STRIDEB], m0
1580 PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2
1581 mova [r0-4*FDEC_STRIDEB], m1
; Instantiations (word-based three times, then byte-based); the INIT_* lines
; and surrounding conditionals are not visible in this view.
1587 PREDICT_8x8_HD w, wd
1589 PREDICT_8x8_HD w, wd
1591 PREDICT_8x8_HD w, wd
1594 PREDICT_8x8_HD b, bw
1596 ;-----------------------------------------------------------------------------
1597 ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
1598 ;-----------------------------------------------------------------------------
; Zero-argument PREDICT_8x8_HD. Each visible step stores two rows four apart:
; row Y from m4 and row Y-4 from m0. The loads, the shifts between the
; stores and the directives defining Y are missing from this view.
1599 %macro PREDICT_8x8_HD 0
1600 cglobal predict_8x8_hd, 2,2
1601 add r0, 4*FDEC_STRIDE
1606 PRED8x8_LOWPASS m0, m1, m2, m3, m5
1612 movq [r0+(Y)*FDEC_STRIDE], m4
1613 movq [r0+(Y-4)*FDEC_STRIDE], m0
1618 movq [r0+(Y)*FDEC_STRIDE], m4
1619 movq [r0+(Y-4)*FDEC_STRIDE], m0
1627 %endif ; HIGH_BIT_DEPTH
1629 %if HIGH_BIT_DEPTH == 0
1630 ;-----------------------------------------------------------------------------
1631 ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
1632 ;-----------------------------------------------------------------------------
; predict_8x8_hu_sse2: 2 arguments (r0 = src, r1 = edge per the prototype),
; 2 GPRs. The visible code works mostly on mm registers and uses xmm0/xmm1
; for part of the output. Several instructions between the visible lines are
; missing from this excerpt.
1634 cglobal predict_8x8_hu_sse2, 2,2
; Bias r0 by four strides before the row stores below.
1635 add r0, 4*FDEC_STRIDE
1636 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
1637 pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
1641 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
1642 psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
1648 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
1650 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
; Lowpass helper defined elsewhere; operand roles are not visible here.
1652 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
; Interleave the low bytes of xmm0 and xmm1.
1656 punpcklbw xmm0, xmm1
; Store the low 8 bytes of xmm0 to row Y (the value of Y is set by lines not shown).
1660 movq [r0+Y*FDEC_STRIDE], xmm0
; Build three further rows from mm4 by moving its words down one, two and
; three positions, each time refilling the top with mm4's highest word:
;   q3321 -> words 1,2,3,3   q3332 -> words 2,3,3,3   q3333 -> words 3,3,3,3
1664 pshufw mm5, mm4, q3321
1665 pshufw mm6, mm4, q3332
1666 pshufw mm7, mm4, q3333
1667 movq [r0+Y*FDEC_STRIDE], xmm0
; Rows 0..3 relative to the biased r0 receive mm4..mm7.
1668 movq [r0+0*FDEC_STRIDE], mm4
1669 movq [r0+1*FDEC_STRIDE], mm5
1670 movq [r0+2*FDEC_STRIDE], mm6
1671 movq [r0+3*FDEC_STRIDE], mm7
; predict_8x8_hu_ssse3: 2 arguments, 2 GPRs. Only part of the body is visible
; in this excerpt.
1675 cglobal predict_8x8_hu_ssse3, 2,2
; Bias r0 by four strides before the row stores below.
1676 add r0, 4*FDEC_STRIDE
; shuf_hu = 7,6,5,4,3,2,1,0,0,0,...: reverse the low 8 bytes of m3 and fill
; the upper 8 bytes with its original byte 0.
1678 pshufb m3, [shuf_hu]
; Lowpass helper defined elsewhere; operand roles are not visible here.
1682 PRED8x8_LOWPASS m1, m3, m2, m1, m4
; Low 8 bytes of m0 go to row Y, high 8 bytes to row Y+4.
1686 movq [r0+ Y *FDEC_STRIDE], m0
1687 movhps [r0+(Y+4)*FDEC_STRIDE], m0
; Within the high quadword, keep words 0..2 and copy word 2 into word 3.
1689 pshufhw m0, m0, q2210
1692 movq [r0+ Y *FDEC_STRIDE], m0
1693 movhps [r0+(Y+4)*FDEC_STRIDE], m0
1695 %endif ; !HIGH_BIT_DEPTH
1697 ;-----------------------------------------------------------------------------
1698 ; void predict_8x8c_v( uint8_t *src )
1699 ;-----------------------------------------------------------------------------
; Emits predict_8x8c_v (1 argument: r0 = src; 1 GPR). The stores that follow
; the load are not visible in this excerpt.
1701 %macro PREDICT_8x8C_V 0
1702 cglobal predict_8x8c_v, 1,1
; Load one vector from FDEC_STRIDEB bytes before r0 (presumably the row
; directly above the block - FDEC_STRIDEB is defined elsewhere).
1703 mova m0, [r0 - FDEC_STRIDEB]
; predict_8x8c_v_mmx: 1 argument (r0 = src), 1 GPR.
1719 cglobal predict_8x8c_v_mmx, 1,1
; Load two vectors, at byte offsets 0 and 8, from FDEC_STRIDEB bytes before r0.
1720 mova m0, [r0 - FDEC_STRIDEB]
1721 mova m1, [r0 - FDEC_STRIDEB + 8]
; Y&1 selects row 0 or row 1 relative to the current r0.
1724 mova [r0 + (Y&1)*FDEC_STRIDEB], m0
1725 mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
; Advance r0 by two strides. NOTE(review): the surrounding repeat/condition
; that decides when this executes is not visible here.
1727 add r0, FDEC_STRIDEB*2
; Emits predict_8x16c_v (1 argument: r0 = src; 1 GPR). Only the load is
; visible in this excerpt.
1735 %macro PREDICT_8x16C_V 0
1736 cglobal predict_8x16c_v, 1,1
; Load one vector from FDEC_STRIDEB bytes before r0.
1737 mova m0, [r0 - FDEC_STRIDEB]
1750 ;-----------------------------------------------------------------------------
1751 ; void predict_8x8c_h( uint8_t *src )
1752 ;-----------------------------------------------------------------------------
; Emits two functions: predict_8x8c_h (1 argument, 1 GPR) and
; predict_8x16c_h (1 argument, 2 GPRs). Each opens with a branch taken when
; SSSE3 is enabled and AVX2 is not; the branch bodies are not visible in
; this excerpt.
1753 %macro PREDICT_C_H 0
1754 cglobal predict_8x8c_h, 1,1
1755 %if cpuflag(ssse3) && notcpuflag(avx2)
1762 cglobal predict_8x16c_h, 1,2
1763 %if cpuflag(ssse3) && notcpuflag(avx2)
1786 ;-----------------------------------------------------------------------------
1787 ; void predict_8x8c_dc( pixel *src )
1788 ;-----------------------------------------------------------------------------
; Fragment of a one-parameter macro whose header is not visible in this
; excerpt. Each movzx zero-extends one pixel located SIZEOF_PIXEL bytes
; before r0 on rows %1-4 .. %1-1. The first load goes to r1d; the next three
; reuse r2d. NOTE(review): presumably r2d is added into r1d by the lines
; missing between the loads - confirm against the full file.
1790 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
1791 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
1793 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
1795 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
; Emits predict_8x8c_dc (1 argument: r0 = src; 3 GPRs). Many lines of the
; body are missing from this excerpt; notes describe the visible lines only.
1799 %macro PREDICT_8x8C_DC 0
1800 cglobal predict_8x8c_dc, 1,3
; High-bit-depth branch: two 8-byte loads from FDEC_STRIDEB bytes before r0.
1803 movq m0, [r0-FDEC_STRIDEB+0]
1804 movq m1, [r0-FDEC_STRIDEB+8]
1807 %else ; !HIGH_BIT_DEPTH
; 8-bit branch: two 4-byte loads from the same row.
1808 movd m0, [r0-FDEC_STRIDEB+0]
1809 movd m1, [r0-FDEC_STRIDEB+4]
; Bias r0 by four strides; the stores below use row offsets of the form Y-4.
1813 add r0, FDEC_STRIDEB*4
1822 punpckldq m0, m2 ; s0, s1, s2, s3
1823 pshufw m3, m0, q3312 ; s2, s1, s3, s3
1824 pshufw m0, m0, q1310 ; s0, s1, s3, s1
1827 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
; Widen for 16-byte stores: duplicate each low word of xmm0, then
; xmm1 = dwords 2,2,3,3 and xmm0 = dwords 0,0,1,1 of that result.
1831 punpcklwd xmm0, xmm0
1832 pshufd xmm1, xmm0, q3322
1833 punpckldq xmm0, xmm0
; i = Y/4 picks xmm0 or xmm1 for row Y-4 (Y is assigned by lines not shown).
1836 %assign i (0 + (Y/4))
1837 movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
; Broadcast words 0..3 of m0 into m1..m4 respectively.
1841 pshufw m1, m0, q0000
1842 pshufw m2, m0, q1111
1843 pshufw m3, m0, q2222
1844 pshufw m4, m0, q3333
; Register pair (m i, m j) is (m1, m2) when Y/4 == 0 and (m3, m4) when
; Y/4 == 1; i supplies bytes 0..7 of the row and j bytes 8..15.
1847 %assign i (1 + (Y/4)*2)
1848 %assign j (2 + (Y/4)*2)
1849 movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
1850 movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
1854 %else ; !HIGH_BIT_DEPTH
; 8-bit store: one 8-byte write per row from register m(Y/4).
1862 %assign i (0 + (Y/4))
1863 movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
; STORE_4LINES %1, %2, %3 - write four consecutive rows, %3-4 .. %3-1
; relative to r0. Two alternative bodies are visible; the condition that
; selects between them is not visible in this excerpt.
;   movdqa form: each row receives the 16 bytes of %1 (%2 is not used).
;   movq form:   each row receives %1 at byte offset 0 and %2 at offset 8.
1878 %macro STORE_4LINES 3
1880 movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
1881 movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
1882 movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
1883 movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
1885 movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
1886 movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
1887 movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
1888 movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
1889 movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
1890 movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
1891 movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
1892 movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
; STORE_4LINES %1, %2 - two-argument form: write the 8 bytes of %1 to each of
; the four rows %2-4 .. %2-1 relative to r0.
1896 %macro STORE_4LINES 2
1897 movq [r0+FDEC_STRIDEB*(%2-4)], %1
1898 movq [r0+FDEC_STRIDEB*(%2-3)], %1
1899 movq [r0+FDEC_STRIDEB*(%2-2)], %1
1900 movq [r0+FDEC_STRIDEB*(%2-1)], %1
; Emits predict_8x16c_dc (1 argument: r0 = src; 3 GPRs). Many lines of the
; body, including the conditionals that separate the variants below, are
; missing from this excerpt; notes describe the visible lines only.
1904 %macro PREDICT_8x16C_DC 0
1905 cglobal predict_8x16c_dc, 1,3
; Two 8-byte loads from FDEC_STRIDEB bytes before r0 ...
1908 movq m0, [r0-FDEC_STRIDEB+0]
1909 movq m1, [r0-FDEC_STRIDEB+8]
; ... or, in the other visible variant, two 4-byte loads from the same row.
1913 movd m0, [r0-FDEC_STRIDEB+0]
1914 movd m1, [r0-FDEC_STRIDEB+4]
1918 punpcklwd m0, m1 ; s0, s1
; Bias r0 by four strides.
1920 add r0, FDEC_STRIDEB*4
1924 pinsrw m0, r1d, 3 ; s0, s1, s2, s3
; Temporarily move r0 a further eight strides down ...
1925 add r0, FDEC_STRIDEB*8
1929 pinsrw m1, r1d, 3 ; s1, __, s4, s5
; ... then undo that, leaving r0 biased by four strides from its entry value.
1930 sub r0, FDEC_STRIDEB*8
1932 pshufw m2, m0, q1310 ; s0, s1, s3, s1
1933 pshufw m0, m0, q3312 ; s2, s1, s3, s3
1934 pshufw m3, m1, q0302 ; s4, s1, s5, s1
1935 pshufw m1, m1, q3322 ; s4, s4, s5, s5
; xmm variant: duplicate each low word of xmm0/xmm1, then split each into a
; register holding dwords 2,2,3,3 (xmm2/xmm3) and one holding dwords 0,0,1,1
; (xmm0/xmm1).
1946 punpcklwd xmm0, xmm0
1947 punpcklwd xmm1, xmm1
1948 pshufd xmm2, xmm0, q3322
1949 pshufd xmm3, xmm1, q3322
1950 punpckldq xmm0, xmm0
1951 punpckldq xmm1, xmm1
; Four groups of four rows, with row bases 0, 4, 8 and 12.
1952 STORE_4LINES xmm0, xmm0, 0
1953 STORE_4LINES xmm2, xmm2, 4
1954 STORE_4LINES xmm1, xmm1, 8
1955 STORE_4LINES xmm3, xmm3, 12
; mm variant: broadcast words 0..3 of m0 into m2..m5 and store row groups 0
; and 4 with a (left half, right half) register pair ...
1957 pshufw m2, m0, q0000
1958 pshufw m3, m0, q1111
1959 pshufw m4, m0, q2222
1960 pshufw m5, m0, q3333
1961 STORE_4LINES m2, m3, 0
1962 STORE_4LINES m4, m5, 4
; ... then do the same from m1 for row groups 8 and 12.
1963 pshufw m2, m1, q0000
1964 pshufw m3, m1, q1111
1965 pshufw m4, m1, q2222
1966 pshufw m5, m1, q3333
1967 STORE_4LINES m2, m3, 8
1968 STORE_4LINES m4, m5, 12
1971 packuswb m0, m0 ; dc0, dc1, dc2, dc3
1972 packuswb m1, m1 ; dc4, dc5, dc6, dc7
; q1100 yields words 0,0,1,1 and q3322 yields words 2,2,3,3 of the source.
1975 pshufw m2, m0, q1100
1976 pshufw m3, m0, q3322
1977 pshufw m4, m1, q1100
1978 pshufw m5, m1, q3322
; Advance r0 by eight strides.
1981 add r0, FDEC_STRIDEB*8
; PREDICT_C_DC_TOP %1 - %1 is pasted into the emitted symbol name
; (predict_8x%1c_dc_top_sse2 / predict_8x%1c_dc_top_mmx2). Both functions
; take 1 argument (r0 = src) and use 1 GPR. Only part of each body is visible
; in this excerpt.
1995 %macro PREDICT_C_DC_TOP 1
1998 cglobal predict_8x%1c_dc_top_sse2, 1,1
; Load one vector from FDEC_STRIDEB bytes before r0.
2000 mova m0, [r0 - FDEC_STRIDEB]
; q2301 swaps neighbouring elements: adjacent dwords for pshufd, adjacent
; words within the low/high halves for pshuflw/pshufhw.
2001 pshufd m1, m0, q2301
2003 pshuflw m1, m0, q2301
2004 pshufhw m1, m1, q2301
2010 %else ; !HIGH_BIT_DEPTH
2012 cglobal predict_8x%1c_dc_top_mmx2, 1,1
; Load 8 bytes from FDEC_STRIDE bytes before r0.
2013 movq mm0, [r0 - FDEC_STRIDE]
; NOTE(review): mm2 is presumably zero here, making psadbw a byte sum; the
; line that sets mm2 is not visible in this excerpt.
2018 psadbw mm1, mm2 ; s1
2019 psadbw mm0, mm2 ; s0
2025 pshufw mm0, mm0, 0 ; dc0 (w)
2026 packuswb mm0, mm1 ; dc0,dc1 (b)
2035 ;-----------------------------------------------------------------------------
2036 ; void predict_16x16_v( pixel *src )
2037 ;-----------------------------------------------------------------------------
; Emits predict_16x16_v (1 argument: r0 = src; 2 GPRs). Only part of the body
; is visible in this excerpt.
2039 %macro PREDICT_16x16_V 0
2040 cglobal predict_16x16_v, 1,2
; 16*SIZEOF_PIXEL/mmsize is the number of mmsize-byte vectors covering
; 16 pixels; vector %%i is loaded from FDEC_STRIDEB bytes before r0.
; NOTE(review): the initialisation/increment of %%i is not visible here.
2042 %rep 16*SIZEOF_PIXEL/mmsize
2043 mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
; The store path depends on the vector count: four vectors go through
; STORE16 with m0..m3; the two-vector branch body is not visible here.
2046 %if 16*SIZEOF_PIXEL/mmsize == 4
2047 STORE16 m0, m1, m2, m3
2048 %elif 16*SIZEOF_PIXEL/mmsize == 2
2065 ;-----------------------------------------------------------------------------
2066 ; void predict_16x16_h( pixel *src )
2067 ;-----------------------------------------------------------------------------
; Emits predict_16x16_h (1 argument: r0 = src; 2 GPRs). The body opens with a
; branch taken when SSSE3 is enabled and AVX2 is not; the branch contents are
; not visible in this excerpt.
2068 %macro PREDICT_16x16_H 0
2069 cglobal predict_16x16_h, 1,2
2070 %if cpuflag(ssse3) && notcpuflag(avx2)
2089 ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
2094 ;-----------------------------------------------------------------------------
2095 ; void predict_16x16_dc_core( pixel *src, int i_dc_left )
2096 ;-----------------------------------------------------------------------------
; PRED16x16_DC_MMX %1, %2 - shared body for the MMX 16x16 DC functions.
; %1 and %2 are not used in the lines visible in this excerpt; the visible
; call sites pass (m6, 5), (r1m, 5) and ([pw_8], 4).
2097 %macro PRED16x16_DC_MMX 2
; High-bit-depth branch: add up, word-wise, the four 8-byte chunks at byte
; offsets 0, 8, 16 and 24 of the row FDEC_STRIDEB bytes before r0.
2099 mova m0, [r0 - FDEC_STRIDEB+ 0]
2100 paddw m0, [r0 - FDEC_STRIDEB+ 8]
2101 paddw m0, [r0 - FDEC_STRIDEB+16]
2102 paddw m0, [r0 - FDEC_STRIDEB+24]
; All four STORE16 operands are m0.
2107 STORE16 m0, m0, m0, m0
2108 %else ; !HIGH_BIT_DEPTH
; 8-bit branch: psadbw of m0/m1 against bytes 0..7 and 8..15 of the row
; FDEC_STRIDE bytes before r0. NOTE(review): m0/m1 are presumably zeroed
; beforehand so this yields byte sums - the zeroing is not visible here.
2111 psadbw m0, [r0 - FDEC_STRIDE]
2112 psadbw m1, [r0 - FDEC_STRIDE + 8]
2117 packuswb m0, m0 ; dc in bytes
; predict_16x16_dc_core: declared with 1 argument and 2 GPRs. Two alternative
; expansions are visible, passing m6 or r1m as the first macro operand.
; NOTE(review): the condition that selects between them is not visible in
; this excerpt.
2123 cglobal predict_16x16_dc_core, 1,2
2126 PRED16x16_DC_MMX m6, 5
2128 PRED16x16_DC_MMX r1m, 5
; predict_16x16_dc_top: 1 argument (r0 = src), 2 GPRs. Expands the shared
; body with the constant [pw_8] and the integer 4.
2133 cglobal predict_16x16_dc_top, 1,2
2134 PRED16x16_DC_MMX [pw_8], 4
; predict_16x16_dc_left_core: declared with 1 argument in both variants;
; the first variant requests 2 GPRs and stores via STORE16 with m0 in all
; four operands, the 8-bit (!HIGH_BIT_DEPTH) variant requests 1 GPR. The
; rest of both bodies is not visible in this excerpt.
2139 cglobal predict_16x16_dc_left_core, 1,2
2142 STORE16 m0, m0, m0, m0
2144 %else ; !HIGH_BIT_DEPTH
2145 cglobal predict_16x16_dc_left_core, 1,1
; PRED16x16_DC %1, %2 - shared body for the non-MMX 16x16 DC functions.
; %1 and %2 are not used in the lines visible in this excerpt; the visible
; call site passes ([pw_8], 4).
2153 %macro PRED16x16_DC 2
; High-bit-depth branch: word-wise sum of the two 16-byte halves (byte
; offsets 0 and 16) of the row FDEC_STRIDEB bytes before r0, using the
; xmm-width register xm0.
2155 mova xm0, [r0 - FDEC_STRIDEB+ 0]
2156 paddw xm0, [r0 - FDEC_STRIDEB+16]
2166 %else ; !HIGH_BIT_DEPTH
; 8-bit branch: psadbw of m0 against the row FDEC_STRIDE bytes before r0.
2168 psadbw m0, [r0 - FDEC_STRIDE]
2174 packuswb m0, m0 ; dc in bytes
; Emits three functions (bodies mostly not visible in this excerpt):
;   predict_16x16_dc_core      - 2 arguments, 2 GPRs, 4 vector registers
;   predict_16x16_dc_top       - 1 argument, 2 GPRs; PRED16x16_DC [pw_8], 4
;   predict_16x16_dc_left_core - 1 argument, 2 GPRs
2179 %macro PREDICT_16x16_DC_CORE 0
2180 cglobal predict_16x16_dc_core, 2,2,4
2185 cglobal predict_16x16_dc_top, 1,2
2186 PRED16x16_DC [pw_8], 4
2189 cglobal predict_16x16_dc_left_core, 1,2
; Separate code paths for 16-byte vectors at high bit depth and for 8-bit
; pixels; their contents are not visible in this excerpt.
2192 %if HIGH_BIT_DEPTH && mmsize == 16
2195 %if HIGH_BIT_DEPTH == 0
; Three instantiations of PREDICT_16x16_DC_CORE.
; NOTE(review): the instruction-set selection that presumably precedes each
; instantiation is not visible in this excerpt - confirm against the full file.
2204 PREDICT_16x16_DC_CORE
2207 PREDICT_16x16_DC_CORE
2210 PREDICT_16x16_DC_CORE