;*****************************************************************************
;* predict-a.asm: x86 intra prediction
;*****************************************************************************
;* Copyright (C) 2005-2015 x264 project
;*
;* Authors: Loren Merritt
;*          Holger Lubitz
;*          Fiona Glaser
;*          Henrik Gramner
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:       times 16 dw -3
pw_m7:       times 16 dw -7
pb_00s_ff:   times 8 db 0
pb_0s_ff:    times 7 db 0
             db 0xff
shuf_fixtr:  db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
shuf_nop:    db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shuf_hu:     db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
shuf_vr:     db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
pw_reverse:  db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pw_1
cextern pw_2
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_00ff
cextern pw_pixel_max
cextern pw_0to15

%macro STORE8 1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    mova [r0+2*FDEC_STRIDEB], %1
    mova [r0+3*FDEC_STRIDEB], %1
%endmacro

%macro STORE16 1-4
%if %0 > 1
    mov  r1d, 2*%0
.loop:
    mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
    mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
%ifidn %0, 4
    mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
    mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
    add  r0, 2*FDEC_STRIDEB
%else ; %0 == 2
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
    mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
%endif
    dec  r1d
    jg .loop
%else ; %0 == 1
    STORE8 %1
%if HIGH_BIT_DEPTH ; Different code paths to reduce code size
    add  r0, 6*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
%else
    add  r0, 8*FDEC_STRIDE
    mova [r0-4*FDEC_STRIDE], %1
    mova [r0-3*FDEC_STRIDE], %1
    mova [r0-2*FDEC_STRIDE], %1
    mova [r0-1*FDEC_STRIDE], %1
    mova [r0+0*FDEC_STRIDE], %1
    mova [r0+1*FDEC_STRIDE], %1
    mova [r0+2*FDEC_STRIDE], %1
    mova [r0+3*FDEC_STRIDE], %1
%endif ; HIGH_BIT_DEPTH
%endif
%endmacro
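; A note on the store helpers above (a summary, not authoritative): r0
; points into the decoded frame, FDEC_STRIDE is the frame stride in
; pixels and FDEC_STRIDEB the same stride in bytes (assumed to be
; SIZEOF_PIXEL*FDEC_STRIDE, per the usual x264 convention). STORE8
; broadcasts one register to 8 rows; STORE16 writes 16 rows, taking 1,
; 2 or 4 registers depending on how many of the current mmsize it takes
; to cover one row.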
%macro PRED_H_LOAD 2 ; reg, offset
%if cpuflag(avx2)
    vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
%elif HIGH_BIT_DEPTH
    movd           %1, [r0+(%2)*FDEC_STRIDEB-4]
    SPLATW         %1, %1, 1
%else
    SPLATB_LOAD    %1, r0+(%2)*FDEC_STRIDE-1, m2
%endif
%endmacro

%macro PRED_H_STORE 3 ; reg, offset, width
%assign %%w %3*SIZEOF_PIXEL
%if %%w == 8
    movq [r0+(%2)*FDEC_STRIDEB], %1
%else
%assign %%i 0
%rep %%w/mmsize
    mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
%assign %%i %%i+mmsize
%endrep
%endif
%endmacro

%macro PRED_H_4ROWS 2 ; width, inc_ptr
    PRED_H_LOAD  m0, 0
    PRED_H_LOAD  m1, 1
    PRED_H_STORE m0, 0, %1
    PRED_H_STORE m1, 1, %1
    PRED_H_LOAD  m0, 2
%if %2
    add          r0, 4*FDEC_STRIDEB
%endif
    PRED_H_LOAD  m1, 3-4*%2
    PRED_H_STORE m0, 2-4*%2, %1
    PRED_H_STORE m1, 3-4*%2, %1
%endmacro

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 4-5
%if HIGH_BIT_DEPTH
    paddw   %2, %3
    psrlw   %2, 1
    pavgw   %1, %4, %2
%else
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %4, %2
%endif
%endmacro
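; Worked example of the byte path above, with l=1, c=2, r=4 and target
; value (l + 2*c + r + 2) >> 2 = 11 >> 2 = 2:
;   pavgb   l, r   -> (1+4+1) >> 1 = 3    (rounds up)
;   (l^r) & 1 = 1  -> the average carried a rounding bias
;   psubusb        -> 3 - 1 = 2           (back to floor((l+r)/2))
;   pavgb   c, 2   -> (2+2+1) >> 1 = 2    (correct)
; Without the correction the second pavgb would compute (2+3+1)>>1 = 3.
; The word path takes the floor directly with paddw/psrlw, so no
; correction is needed there.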
;-----------------------------------------------------------------------------
; void predict_4x4_h( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM avx2
cglobal predict_4x4_h, 1,1
    PRED_H_4ROWS 4, 0
    RET
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4_DDL 0
cglobal predict_4x4_ddl, 1,1
    movu    m1, [r0-FDEC_STRIDEB]
    PSLLPIX m2, m1, 1
    mova    m0, m1
%if HIGH_BIT_DEPTH
    PSRLPIX m1, m1, 1
    pshufhw m1, m1, q2210
%else
    pxor    m1, m2
    PSRLPIX m1, m1, 1
    pxor    m1, m0
%endif
    PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 0
%rep 4
    PSRLPIX m0, m0, 1
    movh    [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_DDL
INIT_XMM avx
PREDICT_4x4_DDL
INIT_MMX mmx2
cglobal predict_4x4_ddl, 1,2
    movu      m1, [r0-FDEC_STRIDEB+4]
    PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
    mova      m3, [r0-FDEC_STRIDEB+8]
    mova      [r0+0*FDEC_STRIDEB], m0
    pshufw    m4, m3, q3321
    PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
    mova      [r0+3*FDEC_STRIDEB], m2
    pshufw    m1, m0, q0021
    punpckldq m1, m2
    mova      [r0+1*FDEC_STRIDEB], m1
    psllq     m0, 16
    PALIGNR   m2, m0, 6, m0
    mova      [r0+2*FDEC_STRIDEB], m2
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_DDL
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH == 0
INIT_MMX ssse3
cglobal predict_4x4_vr, 1,1
    movd    m1, [r0-1*FDEC_STRIDEB]      ; ........t3t2t1t0
    mova    m4, m1
    palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
    pavgb   m4, m1
    palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
    mova    m0, m1
    palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
    mova    m2, m1
    palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    pshufw  m0, m2, 0
    psrlq   m2, 16
    movd    [r0+0*FDEC_STRIDEB], m4
    palignr m4, m0, 7
    movd    [r0+1*FDEC_STRIDEB], m2
    psllq   m0, 8
    movd    [r0+2*FDEC_STRIDEB], m4
    palignr m2, m0, 7
    movd    [r0+3*FDEC_STRIDEB], m2
    RET
%endif ; !HIGH_BIT_DEPTH
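; PREDICT_4x4 below generates the ddr/vr/hd variants for both pixel
; sizes from a single body: %1 picks the pavg suffix (b or w) and %2-%4
; the punpck suffixes, so the same shuffle sequence assembles for 8-bit
; bytes and 10-bit words (see the instantiations after the macro).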
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 4
cglobal predict_4x4_ddr, 1,1
%if HIGH_BIT_DEPTH
    movu      m2, [r0-1*FDEC_STRIDEB-8]
    pinsrw    m2, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw    m2, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw    m2, [r0+2*FDEC_STRIDEB-2], 0
    movhps    m3, [r0+3*FDEC_STRIDEB-8]
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd      m2, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr   m2, m0, 4
%else
    psllq     m2, 32
    punpckhdq m0, m2
    SWAP       2, 0
%endif
    movd      m3, [r0+3*FDEC_STRIDEB-4]
    psllq     m3, 32
%endif ; !HIGH_BIT_DEPTH
    PSRLPIX   m1, m2, 1
    mova      m0, m2
    PALIGNR   m2, m3, 7*SIZEOF_PIXEL, m3
    PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 3
    movh      [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
    PSRLPIX   m0, m0, 1
    movh      [r0+Y*FDEC_STRIDEB], m0
%endrep
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vr, 1,1
%if HIGH_BIT_DEPTH
    movu      m1, [r0-1*FDEC_STRIDEB-8]
    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 0
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd      m1, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr   m1, m0, 4
%else
    psllq     m1, 32
    punpckhdq m0, m1
    SWAP       1, 0
%endif
%endif ; !HIGH_BIT_DEPTH
    PSRLPIX   m2, m1, 1
    PSRLPIX   m0, m1, 2
    pavg%1    m4, m1, m2
    PSRLPIX   m4, m4, 3
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    PSLLPIX   m0, m2, 6
    PSRLPIX   m2, m2, 2
    movh      [r0+0*FDEC_STRIDEB], m4
    PALIGNR   m4, m0, 7*SIZEOF_PIXEL, m3
    movh      [r0+1*FDEC_STRIDEB], m2
    PSLLPIX   m0, m0, 1
    movh      [r0+2*FDEC_STRIDEB], m4
    PALIGNR   m2, m0, 7*SIZEOF_PIXEL, m0
    movh      [r0+3*FDEC_STRIDEB], m2
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
%if HIGH_BIT_DEPTH
    movu      m1, [r0-1*FDEC_STRIDEB-8]
    PSLLPIX   m1, m1, 1
    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 3
    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 2
    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 1
    pinsrw    m1, [r0+3*FDEC_STRIDEB-2], 0
%else
    movd      m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
    punpckldq m0, [r0-1*FDEC_STRIDEB]   ; t3 t2 t1 t0 lt .. .. ..
    PSLLPIX   m0, m0, 1                 ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r0+3*FDEC_STRIDEB-4] ; l3
    punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
    movd      m2, [r0+1*FDEC_STRIDEB-4] ; l1
    punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
    punpckh%3 m1, m2                    ; l0 l1 l2 l3
    punpckh%4 m1, m0                    ; t2 t1 t0 lt l0 l1 l2 l3
%endif
    PSRLPIX   m2, m1, 1                 ; .. t2 t1 t0 lt l0 l1 l2
    PSRLPIX   m0, m1, 2                 ; .. .. t2 t1 t0 lt l0 l1
    pavg%1    m5, m1, m2
    PRED8x8_LOWPASS m3, m1, m0, m2, m4
    punpckl%2 m5, m3
    PSRLPIX   m3, m3, 4
    PALIGNR   m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
    movh      [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
    PSRLPIX   m5, m5, 2
    movh      [r0+Y*FDEC_STRIDEB], m5
%endrep
    movh      [r0+0*FDEC_STRIDEB], m3
    RET
%endmacro ; PREDICT_4x4

;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_4x4_ddr, 1,1
    mova      m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova      m3, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m3, m0
    pshufw    m0, m3, q3321
    pinsrw    m0, [r0-1*FDEC_STRIDEB-2], 3
    pshufw    m1, m0, q3321
    PRED8x8_LOWPASS m0, m1, m3, m0
    movq      [r0+3*FDEC_STRIDEB], m0
    movq      m2, [r0-1*FDEC_STRIDEB-0]
    pshufw    m4, m2, q2100
    pinsrw    m4, [r0-1*FDEC_STRIDEB-2], 0
    movq      m1, m4
    PALIGNR   m4, m3, 6, m3
    PRED8x8_LOWPASS m1, m4, m2, m1
    movq      [r0+0*FDEC_STRIDEB], m1
    pshufw    m2, m0, q3321
    punpckldq m2, m1
    psllq     m0, 16
    PALIGNR   m1, m0, 6, m0
    movq      [r0+1*FDEC_STRIDEB], m1
    movq      [r0+2*FDEC_STRIDEB], m2
    movd      [r0+3*FDEC_STRIDEB+4], m1
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
    mova      m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova      m1, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m1, m0
    mova      m0, m1
    movu      m3, [r0-1*FDEC_STRIDEB-2]
    pshufw    m4, m1, q0032
    mova      m7, m3
    punpckldq m4, m3
    PALIGNR   m3, m1, 2, m2
    PRED8x8_LOWPASS m2, m4, m1, m3
    pavgw     m0, m3
    punpcklwd m5, m0, m2
    punpckhwd m4, m0, m2
    mova      [r0+3*FDEC_STRIDEB], m5
    mova      [r0+1*FDEC_STRIDEB], m4
    psrlq     m5, 32
    punpckldq m5, m4
    mova      [r0+2*FDEC_STRIDEB], m5
    pshufw    m4, m7, q2100
    mova      m6, [r0-1*FDEC_STRIDEB+0]
    pinsrw    m4, [r0+0*FDEC_STRIDEB-2], 0
    PRED8x8_LOWPASS m3, m4, m6, m7
    PALIGNR   m3, m0, 6, m0
    mova      [r0+0*FDEC_STRIDEB], m3
    RET

INIT_XMM sse2
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM ssse3
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM avx
PREDICT_4x4 w, wd, dq, qdq
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
PREDICT_4x4 b, bw, wd, dq
%endif
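; The horizontal-up functions below predict from the left column only;
; once the interpolation runs past the last left pixel the prediction
; pins to l3, which is what the q3333 splats and the final row stores
; implement.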
;-----------------------------------------------------------------------------
; void predict_4x4_hu( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movq      m0, [r0+0*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
    movq      m1, [r0+2*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
    punpckhdq m0, m1
    pshufw    m1, m1, q3333
    movq      [r0+3*FDEC_STRIDEB], m1
    pshufw    m3, m0, q3321
    pshufw    m4, m0, q3332
    pavgw     m2, m0, m3
    PRED8x8_LOWPASS m3, m0, m4, m3
    punpcklwd m4, m2, m3
    mova      [r0+0*FDEC_STRIDEB], m4
    psrlq     m2, 16
    psrlq     m3, 16
    punpcklwd m2, m3
    mova      [r0+1*FDEC_STRIDEB], m2
    punpckhdq m2, m1
    mova      [r0+2*FDEC_STRIDEB], m2
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movd      m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
    movd      m0, [r0+2*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
    punpckhwd m1, m0
    movq      m0, m1
    punpckhbw m1, m1
    pshufw    m1, m1, q3333
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m5, m0
    psrlq     m3, 8
    psrlq     m2, 16
    pavgb     m5, m3
    PRED8x8_LOWPASS m3, m0, m2, m3, m4
    movd      [r0+3*FDEC_STRIDEB], m1
    punpcklbw m5, m3
    movd      [r0+0*FDEC_STRIDEB], m5
    psrlq     m5, 16
    movd      [r0+1*FDEC_STRIDEB], m5
    psrlq     m5, 16
    movd      [r0+2*FDEC_STRIDEB], m5
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4_V1 1
cglobal predict_4x4_vl, 1,1
    movu    m1, [r0-FDEC_STRIDEB]
    PSRLPIX m3, m1, 1
    PSRLPIX m2, m1, 2
    pavg%1  m4, m3, m1
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    movh    [r0+0*FDEC_STRIDEB], m4
    movh    [r0+1*FDEC_STRIDEB], m0
    PSRLPIX m4, m4, 1
    PSRLPIX m0, m0, 1
    movh    [r0+2*FDEC_STRIDEB], m4
    movh    [r0+3*FDEC_STRIDEB], m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_V1 w
INIT_XMM avx
PREDICT_4x4_V1 w

INIT_MMX mmx2
cglobal predict_4x4_vl, 1,4
    mova    m1, [r0-FDEC_STRIDEB+0]
    mova    m2, [r0-FDEC_STRIDEB+8]
    mova    m0, m2
    PALIGNR m2, m1, 4, m4
    PALIGNR m0, m1, 2, m4
    mova    m3, m0
    pavgw   m3, m1
    mova    [r0+0*FDEC_STRIDEB], m3
    psrlq   m3, 16
    mova    [r0+2*FDEC_STRIDEB], m3
    PRED8x8_LOWPASS m0, m1, m2, m0
    mova    [r0+1*FDEC_STRIDEB], m0
    psrlq   m0, 16
    mova    [r0+3*FDEC_STRIDEB], m0
    movzx   r1d, word [r0-FDEC_STRIDEB+ 8]
    movzx   r2d, word [r0-FDEC_STRIDEB+10]
    movzx   r3d, word [r0-FDEC_STRIDEB+12]
    lea     r1d, [r1+r2+1]
    add     r3d, r2d
    lea     r3d, [r3+r1+1]
    shr     r1d, 1
    shr     r3d, 2
    mov     [r0+2*FDEC_STRIDEB+6], r1w
    mov     [r0+3*FDEC_STRIDEB+6], r3w
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_V1 b
%endif

;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,1
    mova   m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw  m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    psrlq  m2, 48
    mova   m0, [r0-FDEC_STRIDEB]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0
    mova   [r0+0*FDEC_STRIDEB], m0
    mova   [r0+1*FDEC_STRIDEB], m0
    mova   [r0+2*FDEC_STRIDEB], m0
    mova   [r0+3*FDEC_STRIDEB], m0
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,4
    pxor   mm7, mm7
    movd   mm0, [r0-FDEC_STRIDEB]
    psadbw mm0, mm7
    movd   r3d, mm0
    movzx  r1d, byte [r0-1]
%assign Y 1
%rep 3
    movzx  r2d, byte [r0+FDEC_STRIDEB*Y-1]
    add    r1d, r2d
%assign Y Y+1
%endrep
    lea    r1d, [r1+r3+4]
    shr    r1d, 3
    imul   r1d, 0x01010101
    mov    [r0+FDEC_STRIDEB*0], r1d
    mov    [r0+FDEC_STRIDEB*1], r1d
    mov    [r0+FDEC_STRIDEB*2], r1d
    mov    [r0+FDEC_STRIDEB*3], r1d
    RET
%endif ; HIGH_BIT_DEPTH
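; In predict_4x4_dc above: dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3.
; The 8-bit path sums the four top bytes with a single psadbw against
; zero, then multiplies the 8-bit result by 0x01010101 to replicate it
; into all four bytes of a GPR so each row is one 32-bit store.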
%macro PREDICT_FILTER 4
;-----------------------------------------------------------------------------
; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
cglobal predict_8x8_filter, 4,6,6
    add         r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%if ARCH_X86_64 == 0
    mov         r4, r1
%define t1 r4
%define t4 r1
%else
%define t1 r1
%define t4 r4
%endif
    test        r3b, 1
    je .check_top
    mov         t4d, r2d
    and         t4d, 8
    neg         t4
    mova        m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
    mova        m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m1, m0
    mova        m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m3, m2
    punpckh%3%4 m3, m1
    mova        m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m1, [src-1*FDEC_STRIDEB]
    PALIGNR     m4, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m1, m1, m3, 1*SIZEOF_PIXEL, m2
    PRED8x8_LOWPASS m3, m1, m4, m3, m5
    mova        [t1+8*SIZEOF_PIXEL], m3
    movzx       t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    movzx       r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    lea         t4d, [t4*3+2]
    add         t4d, r5d
    shr         t4d, 2
    mov         [t1+7*SIZEOF_PIXEL], t4%1
    mov         [t1+6*SIZEOF_PIXEL], t4%1
    test        r3b, 2
    je .done
.check_top:
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
INIT_XMM cpuname
    movu        m3, [src-1*FDEC_STRIDEB]
    movhps      m0, [src-1*FDEC_STRIDEB-8]
    test        r2b, 8
    je .fix_lt_2
.do_top:
    and         r2d, 4
%ifdef PIC
    lea         r3, [shuf_fixtr]
    pshufb      m3, [r3+r2*4]
%else
    pshufb      m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
%endif
    psrldq      m1, m3, 15
    PALIGNR     m2, m3, m0, 15, m0
    PALIGNR     m1, m3, 1, m5
    PRED8x8_LOWPASS m0, m2, m1, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m0
    psrldq      m0, 15
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    pslldq      m0, m3, 15
    jmp .do_top
%else
    mova        m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src-1*FDEC_STRIDEB]
    mova        m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
    test        r2b, 8
    je .fix_lt_2
    test        r2b, 4
    je .fix_tr_1
.do_top:
    PALIGNR     m2, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m0, m1, m3, 1*SIZEOF_PIXEL, m5
    PRED8x8_LOWPASS m4, m2, m0, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m4
    test        r3b, 4
    je .done
    PSRLPIX     m5, m1, 7
    PALIGNR     m2, m1, m3, 7*SIZEOF_PIXEL, m3
    PALIGNR     m5, m1, 1*SIZEOF_PIXEL, m4
    PRED8x8_LOWPASS m0, m2, m5, m1, m4
    mova        [t1+24*SIZEOF_PIXEL], m0
    PSRLPIX     m0, m0, 7
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    PSLLPIX     m0, m3, 7
    test        r2b, 4
    jne .do_top
.fix_tr_1:
    punpckh%1%2 m1, m3, m3
    pshuf%2     m1, m1, q3333
    jmp .do_top
%endif
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
PREDICT_FILTER w, d, q, dq
INIT_XMM avx
PREDICT_FILTER w, d, q, dq
%else
INIT_MMX mmx2
PREDICT_FILTER b, w, d, q
INIT_MMX ssse3
PREDICT_FILTER b, w, d, q
%endif
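; Layout of edge[] as produced by predict_8x8_filter above and consumed
; by the 8x8 functions below (derived from the offsets used here):
; edge[6..15] holds the filtered left column plus top-left, edge[16..31]
; the filtered top row including the top-right extension, and edge[32]
; one extra top-right pixel. Hence predict_8x8_v reads edge[16] and
; predict_8x8_hu reads edge[7].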
;-----------------------------------------------------------------------------
; void predict_8x8_v( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_V 0
cglobal predict_8x8_v, 2,2
    mova   m0, [r1+16*SIZEOF_PIXEL]
    STORE8 m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x8_V
%else
INIT_MMX mmx2
PREDICT_8x8_V
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_h( pixel *src, pixel edge[36] )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_H 2
cglobal predict_8x8_h, 2,2
    movu      m1, [r1+7*SIZEOF_PIXEL]
    add       r0, 4*FDEC_STRIDEB
    punpckl%1 m2, m1, m1
    punpckh%1 m1, m1
%assign Y 0
%rep 8
%assign i 1+Y/4
    SPLAT%2   m0, m %+ i, (3-Y)&3
    mova      [r0+(Y-4)*FDEC_STRIDEB], m0
%assign Y Y+1
%endrep
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_H wd, D
%else
INIT_MMX mmx2
PREDICT_8x8_H bw, W
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal predict_8x8_dc, 2,2
    movu   m0, [r1+14]
    paddw  m0, [r1+32]
    HADDW  m0, m1
    paddw  m0, [pw_8]
    psrlw  m0, 4
    SPLATW m0, m0
    STORE8 m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_8x8_dc, 2,2
    pxor     mm0, mm0
    pxor     mm1, mm1
    psadbw   mm0, [r1+7]
    psadbw   mm1, [r1+16]
    paddw    mm0, [pw_8]
    paddw    mm0, mm1
    psrlw    mm0, 4
    pshufw   mm0, mm0, 0
    packuswb mm0, mm0
    STORE8   mm0
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8_dc_top ( pixel *src, pixel *edge );
; void predict_8x8_dc_left( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 3
cglobal %1, 2,2
    %3     m0, [r1+%2]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    psrlw  m0, 3
    SPLATW m0, m0
    STORE8 m0
    RET
%endmacro
INIT_XMM sse2
PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
cglobal %1, 2,2
    pxor     mm0, mm0
    psadbw   mm0, [r1+%2]
    paddw    mm0, [pw_4]
    psrlw    mm0, 3
    pshufw   mm0, mm0, 0
    packuswb mm0, mm0
    STORE8   mm0
    RET
%endmacro
INIT_MMX
PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
%endif ; HIGH_BIT_DEPTH
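; In the 8-bit DC paths above, psadbw against an all-zero register sums
; eight unsigned bytes into a single word, so one 16-pixel DC costs two
; psadbw ops plus the rounding add and shift; pshufw and packuswb then
; broadcast the result to a full register of bytes.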
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
; size on the 8-bit mmx functions below if we know sse2 is available.
%macro PREDICT_8x8_DDLR 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2,7
    mova    m0, [r1+16*SIZEOF_PIXEL]
    mova    m1, [r1+24*SIZEOF_PIXEL]
%if cpuflag(cache64)
    movd    m5, [r1+32*SIZEOF_PIXEL]
    palignr m3, m1, m0, 1*SIZEOF_PIXEL
    palignr m5, m5, m1, 1*SIZEOF_PIXEL
    palignr m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu    m3, [r1+17*SIZEOF_PIXEL]
    movu    m4, [r1+23*SIZEOF_PIXEL]
    movu    m5, [r1+25*SIZEOF_PIXEL]
%endif
    PSLLPIX m2, m0, 1
    add     r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova    [r0+3*FDEC_STRIDEB], m1
%assign Y 2
%rep 6
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX m0, m0, 1
    mova    [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y-1)
%endrep
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
    mova    [r0+Y*FDEC_STRIDEB], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2,7
    add     r0, FDEC_STRIDEB*4
    mova    m0, [r1+ 8*SIZEOF_PIXEL]
    mova    m1, [r1+16*SIZEOF_PIXEL]
    ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cache-split
    movu    m2, [r1+ 7*SIZEOF_PIXEL]
    movu    m5, [r1+17*SIZEOF_PIXEL]
%if cpuflag(cache64)
    palignr m3, m1, m0, 1*SIZEOF_PIXEL
    palignr m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu    m3, [r1+ 9*SIZEOF_PIXEL]
    movu    m4, [r1+15*SIZEOF_PIXEL]
%endif
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova    [r0+3*FDEC_STRIDEB], m0
%assign Y -4
%rep 6
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX m0, m0, 1
    mova    [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y+1)
%endrep
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
    mova    [r0+Y*FDEC_STRIDEB], m1
    RET
%endmacro ; PREDICT_8x8_DDLR

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM ssse3, cache64
PREDICT_8x8_DDLR
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_DDLR
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 2
cglobal predict_8x8_hu, 2,2,8
    add       r0, 4*FDEC_STRIDEB
%if HIGH_BIT_DEPTH
%if cpuflag(ssse3)
    movu      m5, [r1+7*SIZEOF_PIXEL]
    pshufb    m5, [pw_reverse]
%else
    movq      m6, [r1+7*SIZEOF_PIXEL]
    movq      m5, [r1+11*SIZEOF_PIXEL]
    pshuflw   m6, m6, q0123
    pshuflw   m5, m5, q0123
    movlhps   m5, m6
%endif ; cpuflag
    psrldq    m2, m5, 2
    pshufd    m3, m5, q0321
    pshufhw   m2, m2, q2210
    pshufhw   m3, m3, q1110
    pavgw     m4, m5, m2
%else ; !HIGH_BIT_DEPTH
    movu      m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw    m0, m1, q0123           ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq     m1, 56                  ; l7 .. .. .. .. .. .. ..
    mova      m2, m0
    psllw     m0, 8
    psrlw     m2, 8
    por       m2, m0
    mova      m3, m2
    mova      m4, m2
    mova      m5, m2                  ; l7 l6 l5 l4 l3 l2 l1 l0
    psrlq     m3, 16
    psrlq     m2, 8
    por       m2, m1                  ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw m1, m1
    por       m3, m1                  ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     m4, m2
%endif ; !HIGH_BIT_DEPTH
    PRED8x8_LOWPASS m2, m3, m5, m2, m6
    punpckh%2 m0, m4, m2              ; p8 p7 p6 p5
    punpckl%2 m4, m2                  ; p4 p3 p2 p1
    PALIGNR   m5, m0, m4, 2*SIZEOF_PIXEL, m3
    pshuf%1   m1, m0, q3321
    PALIGNR   m6, m0, m4, 4*SIZEOF_PIXEL, m3
    pshuf%1   m2, m0, q3332
    PALIGNR   m7, m0, m4, 6*SIZEOF_PIXEL, m3
    pshuf%1   m3, m0, q3333
    mova      [r0-4*FDEC_STRIDEB], m4
    mova      [r0-3*FDEC_STRIDEB], m5
    mova      [r0-2*FDEC_STRIDEB], m6
    mova      [r0-1*FDEC_STRIDEB], m7
    mova      [r0+0*FDEC_STRIDEB], m0
    mova      [r0+1*FDEC_STRIDEB], m1
    mova      [r0+2*FDEC_STRIDEB], m2
    mova      [r0+3*FDEC_STRIDEB], m3
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HU d, wd
INIT_XMM ssse3
PREDICT_8x8_HU d, wd
INIT_XMM avx
PREDICT_8x8_HU d, wd
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_HU w, bw
%endif

;-----------------------------------------------------------------------------
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VR 1
cglobal predict_8x8_vr, 2,3
    mova    m2, [r1+16*SIZEOF_PIXEL]
%ifidn cpuname, ssse3
    mova    m0, [r1+8*SIZEOF_PIXEL]
    palignr m3, m2, m0, 7*SIZEOF_PIXEL
    palignr m1, m2, m0, 6*SIZEOF_PIXEL
%else
    movu    m3, [r1+15*SIZEOF_PIXEL]
    movu    m1, [r1+14*SIZEOF_PIXEL]
%endif
    pavg%1  m4, m3, m2
    add     r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m3, m1, m2, m3, m5
    mova    [r0-4*FDEC_STRIDEB], m4
    mova    [r0-3*FDEC_STRIDEB], m3
    mova    m1, [r1+8*SIZEOF_PIXEL]
    PSLLPIX m0, m1, 1
    PSLLPIX m2, m1, 2
    PRED8x8_LOWPASS m0, m1, m2, m0, m6
%assign Y -2
%rep 5
    PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
    mova    [r0+Y*FDEC_STRIDEB], m4
    PSLLPIX m0, m0, 1
    SWAP 3, 4
%assign Y (Y+1)
%endrep
    PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
    mova    [r0+Y*FDEC_STRIDEB], m4
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VR w
INIT_XMM ssse3
PREDICT_8x8_VR w
INIT_XMM avx
PREDICT_8x8_VR w
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_VR b
%endif
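; Plane prediction (the *_p_core functions below): H.264 plane mode is
; pred[y][x] = clip((a + b*(x-xc) + c*(y-yc) + 16) >> 5), with centering
; offsets xc/yc of 3 or 7 depending on block size. The 8-bit cores
; expect the caller to fold a, the centering terms and the +16 into the
; i00 argument, then just walk b across columns (pw_0to15) and add c
; once per row; the high-depth chroma core instead folds the centering
; in-core via pw_43210123 (a b-ramp from -3 to +4) and pw_m3/pw_m7 (the
; c term), clipping against pw_pixel_max at the end.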
%macro LOAD_PLANE_ARGS 0
%if cpuflag(avx2) && ARCH_X86_64 == 0
    vpbroadcastw m0, r1m
    vpbroadcastw m2, r2m
    vpbroadcastw m4, r3m
%elif mmsize == 8 ; MMX is only used on x86_32
    SPLATW       m0, r1m
    SPLATW       m2, r2m
    SPLATW       m4, r3m
%else
    movd        xm0, r1m
    movd        xm2, r2m
    movd        xm4, r3m
    SPLATW       m0, xm0
    SPLATW       m2, xm2
    SPLATW       m4, xm4
%endif
%endmacro

;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
%macro PREDICT_CHROMA_P_MMX 1
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
    movq     m1, m2
    pmullw   m2, [pw_0to15]
    psllw    m1, 2
    paddsw   m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw   m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
    mov     r1d, %1
ALIGN 4
.loop:
    movq     m5, m0
    movq     m6, m1
    psraw    m5, 5
    psraw    m6, 5
    packuswb m5, m6
    movq   [r0], m5
    paddsw   m0, m4
    paddsw   m1, m4
    add      r0, FDEC_STRIDE
    dec     r1d
    jg .loop
    RET
%endmacro ; PREDICT_CHROMA_P_MMX

INIT_MMX mmx2
PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH

%macro PREDICT_CHROMA_P 1
%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
    LOAD_PLANE_ARGS
    mova         m3, [pw_pixel_max]
    pxor         m1, m1
    pmullw       m2, [pw_43210123] ; b
%if %1 == 16
    pmullw       m5, m4, [pw_m7]   ; c
%else
    pmullw       m5, m4, [pw_m3]
%endif
    paddw        m5, [pw_16]
%if mmsize == 32
    mova        xm6, xm4
    paddw        m4, m4
    paddw        m5, m6
%endif
    mov         r1d, %1/(mmsize/16)
.loop:
    paddsw       m6, m2, m5
    paddsw       m6, m0
    psraw        m6, 5
    CLIPW        m6, m1, m3
    paddw        m5, m4
%if mmsize == 32
    vextracti128 [r0], m6, 1
    mova         [r0+FDEC_STRIDEB], xm6
    add          r0, 2*FDEC_STRIDEB
%else
    mova         [r0], m6
    add          r0, FDEC_STRIDEB
%endif
    dec         r1d
    jg .loop
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
%if mmsize == 32
    vbroadcasti128 m1, [pw_0to15]  ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
    pmullw       m2, m1
    mova        xm1, xm4           ; zero upper half
    paddsw       m4, m4
    paddsw       m0, m1
%else
    pmullw       m2, [pw_0to15]
%endif
    paddsw       m0, m2            ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    paddsw       m1, m0, m4
    paddsw       m4, m4
    mov         r1d, %1/(mmsize/8)
.loop:
    psraw        m2, m0, 5
    psraw        m3, m1, 5
    paddsw       m0, m4
    paddsw       m1, m4
    packuswb     m2, m3
%if mmsize == 32
    movq         [r0+FDEC_STRIDE*1], xm2
    movhps       [r0+FDEC_STRIDE*3], xm2
    vextracti128 xm2, m2, 1
    movq         [r0+FDEC_STRIDE*0], xm2
    movhps       [r0+FDEC_STRIDE*2], xm2
%else
    movq         [r0+FDEC_STRIDE*0], xm2
    movhps       [r0+FDEC_STRIDE*1], xm2
%endif
    add          r0, FDEC_STRIDE*mmsize/8
    dec         r1d
    jg .loop
    RET
%endif ; HIGH_BIT_DEPTH
%endmacro ; PREDICT_CHROMA_P

INIT_XMM sse2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_XMM avx
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_YMM avx2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
    LOAD_PLANE_ARGS
    movq     mm5, mm2
    movq     mm1, mm2
    pmullw   mm5, [pw_0to15]
    psllw    mm2, 3
    psllw    mm1, 2
    movq     mm3, mm2
    paddsw   mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw   mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw   mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw   mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
    mov      r1d, 16
ALIGN 4
.loop:
    movq     mm5, mm0
    movq     mm6, mm1
    psraw    mm5, 5
    psraw    mm6, 5
    packuswb mm5, mm6
    movq     [r0], mm5
    movq     mm5, mm2
    movq     mm6, mm3
    psraw    mm5, 5
    psraw    mm6, 5
    packuswb mm5, mm6
    movq     [r0+8], mm5
    paddsw   mm0, mm4
    paddsw   mm1, mm4
    paddsw   mm2, mm4
    paddsw   mm3, mm4
    add      r0, FDEC_STRIDE
    dec      r1d
    jg .loop
    RET
%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64

%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
    movd     m0, r1m
    movd     m1, r2m
    movd     m2, r3m
    SPLATW   m0, m0, 0
    SPLATW   m1, m1, 0
    SPLATW   m2, m2, 0
    pmullw   m3, m1, [pw_0to15]
    psllw    m1, 3
%if HIGH_BIT_DEPTH
    pxor     m6, m6
    mov     r1d, 16
.loop:
    mova     m4, m0
    mova     m5, m0
    mova     m7, m3
    paddsw   m7, m6
    paddsw   m4, m7
    paddsw   m7, m1
    paddsw   m5, m7
    psraw    m4, 5
    psraw    m5, 5
    CLIPW    m4, [pb_0], [pw_pixel_max]
    CLIPW    m5, [pb_0], [pw_pixel_max]
    mova     [r0], m4
    mova     [r0+16], m5
    add      r0, FDEC_STRIDEB
    paddw    m6, m2
%else ; !HIGH_BIT_DEPTH
    paddsw   m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw   m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    paddsw   m7, m2, m2
    mov     r1d, 8
ALIGN 4
.loop:
    psraw    m3, m0, 5
    psraw    m4, m1, 5
    paddsw   m5, m0, m2
    paddsw   m6, m1, m2
    psraw    m5, 5
    psraw    m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova     [r0+FDEC_STRIDE*0], m3
    mova     [r0+FDEC_STRIDE*1], m5
    paddsw   m0, m7
    paddsw   m1, m7
    add      r0, FDEC_STRIDE*2
%endif ; !HIGH_BIT_DEPTH
    dec     r1d
    jg .loop
    RET
%endmacro ; PREDICT_16x16_P

INIT_XMM sse2
PREDICT_16x16_P
%if HIGH_BIT_DEPTH == 0
INIT_XMM avx
PREDICT_16x16_P
%endif

INIT_YMM avx2
cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
    LOAD_PLANE_ARGS
%if HIGH_BIT_DEPTH
    pmullw       m2, [pw_0to15]
    pxor         m5, m5
    pxor         m6, m6
    mova         m7, [pw_pixel_max]
    mov         r1d, 8
.loop:
    paddsw       m1, m2, m5
    paddw        m5, m4
    paddsw       m1, m0
    paddsw       m3, m2, m5
    psraw        m1, 5
    paddsw       m3, m0
    psraw        m3, 5
    CLIPW        m1, m6, m7
    mova         [r0+0*FDEC_STRIDEB], m1
    CLIPW        m3, m6, m7
    mova         [r0+1*FDEC_STRIDEB], m3
    paddw        m5, m4
    add          r0, 2*FDEC_STRIDEB
%else ; !HIGH_BIT_DEPTH
    vbroadcasti128 m1, [pw_0to15]
    mova        xm3, xm4 ; zero high bits
    pmullw       m1, m2
    psllw        m2, 3
    paddsw       m0, m3
    paddsw       m0, m1     ; X+1*C X+0*C
    paddsw       m1, m0, m2 ; Y+1*C Y+0*C
    paddsw       m4, m4
    mov         r1d, 4
.loop:
    psraw        m2, m0, 5
    psraw        m3, m1, 5
    paddsw       m0, m4
    paddsw       m1, m4
    packuswb     m2, m3     ; X+1*C Y+1*C X+0*C Y+0*C
    vextracti128 [r0+0*FDEC_STRIDE], m2, 1
    mova         [r0+1*FDEC_STRIDE], xm2
    psraw        m2, m0, 5
    psraw        m3, m1, 5
    paddsw       m0, m4
    paddsw       m1, m4
    packuswb     m2, m3     ; X+3*C Y+3*C X+2*C Y+2*C
    vextracti128 [r0+2*FDEC_STRIDE], m2, 1
    mova         [r0+3*FDEC_STRIDE], xm2
    add          r0, FDEC_STRIDE*4
%endif ; !HIGH_BIT_DEPTH
    dec         r1d
    jg .loop
    RET
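; In the 8-bit avx2 16x16 core above, each ymm register tracks two rows
; at once (labelled X and Y in the comments); after packuswb,
; vextracti128 stores the upper row and a 128-bit mova the lower, so one
; loop iteration emits four rows with only two pack operations.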
%if HIGH_BIT_DEPTH == 0
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2
    mova    m0, [r1+16]
%ifidn cpuname, ssse3
    movd    m2, [r1+32]
    palignr m2, m0, 1
%else
    movu    m2, [r1+17]
%endif
    pslldq  m1, m0, 1
    add     r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3
%assign Y -4
%rep 8
    psrldq  m0, 1
    movq    [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
    RET

%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2
    movu    m0, [r1+8]
    movu    m1, [r1+7]
    psrldq  m2, m0, 1
    add     r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3
    psrldq  m1, m0, 1
%assign Y 3
%rep 3
    movq    [r0+ Y   *FDEC_STRIDE], m0
    movq    [r0+(Y-1)*FDEC_STRIDE], m1
    psrldq  m0, 2
    psrldq  m1, 2
%assign Y (Y-2)
%endrep
    movq    [r0-3*FDEC_STRIDE], m0
    movq    [r0-4*FDEC_STRIDE], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl, 2,2
    mova    m0, [r1+16]
    pslldq  m1, m0, 1
    psrldq  m2, m0, 1
    pavgb   m3, m0, m2
    add     r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m5
; m0: (t0 + 2*t1 + t2 + 2) >> 2
; m3: (t0 + t1 + 1) >> 1
%assign Y -4
%rep 3
    psrldq  m0, 1
    movq    [r0+ Y   *FDEC_STRIDE], m3
    movq    [r0+(Y+1)*FDEC_STRIDE], m0
    psrldq  m3, 1
%assign Y (Y+2)
%endrep
    psrldq  m0, 1
    movq    [r0+ Y   *FDEC_STRIDE], m3
    movq    [r0+(Y+1)*FDEC_STRIDE], m0
    RET
%endif ; !ssse3

;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr, 2,2
    movu     m2, [r1+8]
    add      r0, 4*FDEC_STRIDE
    pslldq   m1, m2, 2
    pslldq   m0, m2, 1
    pavgb    m3, m2, m0
    PRED8x8_LOWPASS m0, m2, m1, m0, m4
    movhps   [r0-4*FDEC_STRIDE], m3
    movhps   [r0-3*FDEC_STRIDE], m0
%if cpuflag(ssse3)
    movhlps  m3, m3
    pshufb   m0, [shuf_vr]
    palignr  m3, m0, 13
%else
    mova     m2, m0
    mova     m1, [pw_00ff]
    pand     m1, m0
    psrlw    m0, 8
    packuswb m1, m0
    pslldq   m1, 4
    movhlps  m3, m1
    shufps   m1, m2, q3210
    psrldq   m3, 5
    psrldq   m1, 5
    SWAP 0, 1
%endif
    movq     [r0+3*FDEC_STRIDE], m0
    movq     [r0+2*FDEC_STRIDE], m3
    psrldq   m0, 1
    psrldq   m3, 1
    movq     [r0+1*FDEC_STRIDE], m0
    movq     [r0+0*FDEC_STRIDE], m3
    psrldq   m0, 1
    psrldq   m3, 1
    movq     [r0-1*FDEC_STRIDE], m0
    movq     [r0-2*FDEC_STRIDE], m3
    RET
%endmacro ; PREDICT_8x8

INIT_XMM sse2
PREDICT_8x8
INIT_XMM ssse3
PREDICT_8x8
INIT_XMM avx
PREDICT_8x8
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8_vl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VL_10 1
cglobal predict_8x8_vl, 2,2,8
    mova    m0, [r1+16*SIZEOF_PIXEL]
    mova    m1, [r1+24*SIZEOF_PIXEL]
    PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4
    PSRLPIX m4, m1, 1
    pavg%1  m6, m0, m2
    pavg%1  m7, m1, m4
    add     r0, FDEC_STRIDEB*4
    mova    [r0-4*FDEC_STRIDEB], m6
    PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
    mova    [r0-2*FDEC_STRIDEB], m3
    PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
    mova    [r0+0*FDEC_STRIDEB], m3
    PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
    mova    [r0+2*FDEC_STRIDEB], m7
    PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
    PSLLPIX m5, m0, 1
    PRED8x8_LOWPASS m0, m5, m2, m0, m7
    PRED8x8_LOWPASS m1, m3, m4, m1, m7
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
    mova    [r0-3*FDEC_STRIDEB], m4
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
    mova    [r0-1*FDEC_STRIDEB], m4
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
    mova    [r0+1*FDEC_STRIDEB], m4
    PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
    mova    [r0+3*FDEC_STRIDEB], m1
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VL_10 w
INIT_XMM ssse3
PREDICT_8x8_VL_10 w
INIT_XMM avx
PREDICT_8x8_VL_10 w
%else
INIT_MMX mmx2
PREDICT_8x8_VL_10 b
%endif
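; predict_8x8_hd below interleaves the averaged left pixels with the
; lowpass-filtered ones (punpckl/punpckh into p1..p8), then derives each
; higher row from the previous one as a 2-pixel PALIGNR shift.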
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 2
cglobal predict_8x8_hd, 2,2
    add       r0, 4*FDEC_STRIDEB
    mova      m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
    movu      m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
%ifidn cpuname, ssse3
    mova      m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
    mova      m4, m2                   ; t7 t6 t5 t4 t3 t2 t1 t0
    palignr   m2, m0, 7*SIZEOF_PIXEL   ; t6 t5 t4 t3 t2 t1 t0 lt
    palignr   m4, m0, 1*SIZEOF_PIXEL   ; t0 lt l0 l1 l2 l3 l4 l5
%else
    movu      m2, [r1+15*SIZEOF_PIXEL]
    movu      m4, [r1+ 9*SIZEOF_PIXEL]
%endif ; cpuflag
    pavg%1    m3, m0, m1
    PRED8x8_LOWPASS m0, m4, m1, m0, m5
    PSRLPIX   m4, m2, 2                ; .. .. t6 t5 t4 t3 t2 t1
    PSRLPIX   m1, m2, 1                ; .. t6 t5 t4 t3 t2 t1 t0
    PRED8x8_LOWPASS m1, m4, m2, m1, m5 ; .. p11 p10 p9
    punpckh%2 m2, m3, m0               ; p8 p7 p6 p5
    punpckl%2 m3, m0                   ; p4 p3 p2 p1
    mova      [r0+3*FDEC_STRIDEB], m3
    PALIGNR   m0, m2, m3, 2*SIZEOF_PIXEL, m5
    mova      [r0+2*FDEC_STRIDEB], m0
    PALIGNR   m0, m2, m3, 4*SIZEOF_PIXEL, m5
    mova      [r0+1*FDEC_STRIDEB], m0
    PALIGNR   m0, m2, m3, 6*SIZEOF_PIXEL, m3
    mova      [r0+0*FDEC_STRIDEB], m0
    mova      [r0-1*FDEC_STRIDEB], m2
    PALIGNR   m0, m1, m2, 2*SIZEOF_PIXEL, m5
    mova      [r0-2*FDEC_STRIDEB], m0
    PALIGNR   m0, m1, m2, 4*SIZEOF_PIXEL, m5
    mova      [r0-3*FDEC_STRIDEB], m0
    PALIGNR   m1, m1, m2, 6*SIZEOF_PIXEL, m2
    mova      [r0-4*FDEC_STRIDEB], m1
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD w, wd
INIT_XMM avx
PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
PREDICT_8x8_HD b, bw

;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
    add       r0, 4*FDEC_STRIDE
    movu      m1, [r1+7]
    movu      m3, [r1+8]
    movu      m2, [r1+9]
    pavgb     m4, m1, m3
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    punpcklbw m4, m0
    movhlps   m0, m4
%assign Y 3
%rep 3
    movq      [r0+(Y)*FDEC_STRIDE], m4
    movq      [r0+(Y-4)*FDEC_STRIDE], m0
    psrldq    m4, 2
    psrldq    m0, 2
%assign Y (Y-1)
%endrep
    movq      [r0+(Y)*FDEC_STRIDE], m4
    movq      [r0+(Y-4)*FDEC_STRIDE], m0
    RET
%endmacro

INIT_XMM sse2
PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal predict_8x8_hu_sse2, 2,2
    add       r0, 4*FDEC_STRIDE
    movq      mm1, [r1+7]     ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw    mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
    movq      mm2, mm0
    psllw     mm0, 8
    psrlw     mm2, 8
    por       mm2, mm0        ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq     mm1, 56         ; l7 .. .. .. .. .. .. ..
    movq      mm3, mm2
    movq      mm4, mm2
    movq      mm5, mm2
    psrlq     mm2, 8
    psrlq     mm3, 16
    por       mm2, mm1        ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm1, mm1
    por       mm3, mm1        ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     mm4, mm2
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq2dq   xmm0, mm4
    movq2dq   xmm1, mm1
    punpcklbw xmm0, xmm1
    punpckhbw mm4, mm1
%assign Y -4
%rep 3
    movq      [r0+Y*FDEC_STRIDE], xmm0
    psrldq    xmm0, 2
%assign Y (Y+1)
%endrep
    pshufw    mm5, mm4, q3321
    pshufw    mm6, mm4, q3332
    pshufw    mm7, mm4, q3333
    movq      [r0+Y*FDEC_STRIDE], xmm0
    movq      [r0+0*FDEC_STRIDE], mm4
    movq      [r0+1*FDEC_STRIDE], mm5
    movq      [r0+2*FDEC_STRIDE], mm6
    movq      [r0+3*FDEC_STRIDE], mm7
    RET

INIT_XMM
cglobal predict_8x8_hu_ssse3, 2,2
    add       r0, 4*FDEC_STRIDE
    movq      m3, [r1+7]
    pshufb    m3, [shuf_hu]
    psrldq    m1, m3, 1
    psrldq    m2, m3, 2
    pavgb     m0, m1, m3
    PRED8x8_LOWPASS m1, m3, m2, m1, m4
    punpcklbw m0, m1
%assign Y -4
%rep 3
    movq      [r0+ Y   *FDEC_STRIDE], m0
    movhps    [r0+(Y+4)*FDEC_STRIDE], m0
    psrldq    m0, 2
    pshufhw   m0, m0, q2210
%assign Y (Y+1)
%endrep
    movq      [r0+ Y   *FDEC_STRIDE], m0
    movhps    [r0+(Y+4)*FDEC_STRIDE], m0
    RET
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8C_V 0
cglobal predict_8x8c_v, 1,1
    mova   m0, [r0 - FDEC_STRIDEB]
    STORE8 m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x8C_V
%else
INIT_MMX mmx
PREDICT_8x8C_V
%endif

%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
    mova m0, [r0 - FDEC_STRIDEB]
    mova m1, [r0 - FDEC_STRIDEB + 8]
%assign Y 0
%rep 8
    mova [r0 + (Y&1)*FDEC_STRIDEB], m0
    mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
%if (Y&1) && (Y!=7)
    add r0, FDEC_STRIDEB*2
%endif
%assign Y Y+1
%endrep
    RET
%endif

%macro PREDICT_8x16C_V 0
cglobal predict_8x16c_v, 1,1
    mova    m0, [r0 - FDEC_STRIDEB]
    STORE16 m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x16C_V
%else
INIT_MMX mmx
PREDICT_8x16C_V
%endif

;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%macro PREDICT_C_H 0
cglobal predict_8x8c_h, 1,1
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    PRED_H_4ROWS 8, 1
    PRED_H_4ROWS 8, 0
    RET

cglobal predict_8x16c_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    mov r1d, 4
.loop:
    PRED_H_4ROWS 8, 1
    dec r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_C_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_C_H
INIT_XMM avx2
PREDICT_C_H
%else
INIT_MMX ssse3
PREDICT_C_H
%endif
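; 8x8 chroma DC derives four sub-block DCs from the edge sums s0,s1
; (top halves) and s2,s3 (left halves): dc0 = (s0+s2+4)>>3,
; dc1 = (s1+2)>>2, dc2 = (s3+2)>>2, dc3 = (s1+s3+4)>>3. The
; psrlw 2 / pavgw-with-zero pair below computes the rounded shift in two
; ops: floor((floor(s/4)+1)/2) == floor((s+4)/8), and for the lanes that
; hold a doubled single sum this same pair comes out to (s+2)>>2.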
;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
%macro LOAD_LEFT 1
    movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
    add   r1d, r2d
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
    add   r1d, r2d
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
    add   r1d, r2d
%endmacro

%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
    pxor      m7, m7
%if HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2
    HADDW     m1, m2
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7        ; s0
    psadbw    m1, m7        ; s1
%endif
    add       r0, FDEC_STRIDEB*4
    LOAD_LEFT 0             ; s2
    movd      m2, r1d
    LOAD_LEFT 4             ; s3
    movd      m3, r1d
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpckldq m0, m2        ; s0, s1, s2, s3
    pshufw    m3, m0, q3312 ; s2, s1, s3, s3
    pshufw    m0, m0, q1310 ; s0, s1, s3, s1
    paddw     m0, m3
    psrlw     m0, 2
    pavgw     m0, m7        ; s0+s2, s1, s3, s1+s3
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq   xmm0, m0
    punpcklwd xmm0, xmm0
    pshufd    xmm1, xmm0, q3322
    punpckldq xmm0, xmm0
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movdqa    [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
%assign Y Y+1
%endrep
%else ; !sse2
    pshufw    m1, m0, q0000
    pshufw    m2, m0, q1111
    pshufw    m3, m0, q2222
    pshufw    m4, m0, q3333
%assign Y 0
%rep 8
%assign i (1 + (Y/4)*2)
%assign j (2 + (Y/4)*2)
    movq      [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
    movq      [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
%assign Y Y+1
%endrep
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movq      [r0+FDEC_STRIDEB*(Y-4)], m %+ i
%assign Y Y+1
%endrep
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x8C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif

%if HIGH_BIT_DEPTH
%macro STORE_4LINES 3
%if cpuflag(sse2)
    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
%else
    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
%endif
%endmacro
%else
%macro STORE_4LINES 2
    movq [r0+FDEC_STRIDEB*(%2-4)], %1
    movq [r0+FDEC_STRIDEB*(%2-3)], %1
    movq [r0+FDEC_STRIDEB*(%2-2)], %1
    movq [r0+FDEC_STRIDEB*(%2-1)], %1
%endmacro
%endif

%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
    pxor      m7, m7
%if HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2
    HADDW     m1, m2
%else
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7        ; s0
    psadbw    m1, m7        ; s1
%endif
    punpcklwd m0, m1        ; s0, s1
    add       r0, FDEC_STRIDEB*4
    LOAD_LEFT 0             ; s2
    pinsrw    m0, r1d, 2
    LOAD_LEFT 4             ; s3
    pinsrw    m0, r1d, 3    ; s0, s1, s2, s3
    add       r0, FDEC_STRIDEB*8
    LOAD_LEFT 0             ; s4
    pinsrw    m1, r1d, 2
    LOAD_LEFT 4             ; s5
    pinsrw    m1, r1d, 3    ; s1, __, s4, s5
    sub       r0, FDEC_STRIDEB*8
    pshufw    m2, m0, q1310 ; s0, s1, s3, s1
    pshufw    m0, m0, q3312 ; s2, s1, s3, s3
    pshufw    m3, m1, q0302 ; s4, s1, s5, s1
    pshufw    m1, m1, q3322 ; s4, s4, s5, s5
    paddw     m0, m2
    paddw     m1, m3
    psrlw     m0, 2
    psrlw     m1, 2
    pavgw     m0, m7
    pavgw     m1, m7
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    pshufd    xmm2, xmm0, q3322
    pshufd    xmm3, xmm1, q3322
    punpckldq xmm0, xmm0
    punpckldq xmm1, xmm1
    STORE_4LINES xmm0, xmm0, 0
    STORE_4LINES xmm2, xmm2, 4
    STORE_4LINES xmm1, xmm1, 8
    STORE_4LINES xmm3, xmm3, 12
%else
    pshufw    m2, m0, q0000
    pshufw    m3, m0, q1111
    pshufw    m4, m0, q2222
    pshufw    m5, m0, q3333
    STORE_4LINES m2, m3, 0
    STORE_4LINES m4, m5, 4
    pshufw    m2, m1, q0000
    pshufw    m3, m1, q1111
    pshufw    m4, m1, q2222
    pshufw    m5, m1, q3333
    STORE_4LINES m2, m3, 8
    STORE_4LINES m4, m5, 12
%endif
%else
    packuswb  m0, m0        ; dc0, dc1, dc2, dc3
    packuswb  m1, m1        ; dc4, dc5, dc6, dc7
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q1100
    pshufw    m3, m0, q3322
    pshufw    m4, m1, q1100
    pshufw    m5, m1, q3322
    STORE_4LINES m2, 0
    STORE_4LINES m3, 4
    add       r0, FDEC_STRIDEB*8
    STORE_4LINES m4, 0
    STORE_4LINES m5, 4
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x16C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif
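; For the dc_top variants below, psrlw by 1 followed by pavgw against a
; zero register computes (s+2)>>2 exactly, since
; floor((floor(s/2)+1)/2) == floor((s+2)/4); that matches the rounding
; of a DC taken from four top pixels only.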
%macro PREDICT_C_DC_TOP 1
%if HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
    pxor    m2, m2
    mova    m0, [r0 - FDEC_STRIDEB]
    pshufd  m1, m0, q2301
    paddw   m0, m1
    pshuflw m1, m0, q2301
    pshufhw m1, m1, q2301
    paddw   m0, m1
    psrlw   m0, 1
    pavgw   m0, m2
    STORE%1 m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x%1c_dc_top_mmx2, 1,1
    movq      mm0, [r0 - FDEC_STRIDE]
    pxor      mm1, mm1
    pxor      mm2, mm2
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw    mm1, mm2  ; s1
    psadbw    mm0, mm2  ; s0
    psrlw     mm1, 1
    psrlw     mm0, 1
    pavgw     mm1, mm2
    pavgw     mm0, mm2
    pshufw    mm1, mm1, 0
    pshufw    mm0, mm0, 0 ; dc0 (w)
    packuswb  mm0, mm1    ; dc0,dc1 (b)
    STORE%1   mm0
    RET
%endif
%endmacro

PREDICT_C_DC_TOP 8
PREDICT_C_DC_TOP 16

;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_16x16_V 0
cglobal predict_16x16_v, 1,2
%assign %%i 0
%rep 16*SIZEOF_PIXEL/mmsize
    mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
%assign %%i %%i+1
%endrep
%if 16*SIZEOF_PIXEL/mmsize == 4
    STORE16 m0, m1, m2, m3
%elif 16*SIZEOF_PIXEL/mmsize == 2
    STORE16 m0, m1
%else
    STORE16 m0
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_V
INIT_XMM sse
PREDICT_16x16_V
%if HIGH_BIT_DEPTH
INIT_YMM avx
PREDICT_16x16_V
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_h( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    mov r1d, 4
.loop:
    PRED_H_4ROWS 16, 1
    dec r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_16x16_H
INIT_YMM avx2
PREDICT_16x16_H
%else
; no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
INIT_XMM ssse3
PREDICT_16x16_H
%endif
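; The 16x16 DC cores below take the left-edge contribution as an
; argument rather than recomputing it: dc_core adds i_dc_left to the
; top-row sum before the shift by 5 (so the caller is expected to have
; folded the left sum and rounding into it), dc_top uses
; (sum_top + 8) >> 4, and dc_left_core receives a fully precomputed DC
; to broadcast.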
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
    mova     m0, [r0 - FDEC_STRIDEB+ 0]
    paddw    m0, [r0 - FDEC_STRIDEB+ 8]
    paddw    m0, [r0 - FDEC_STRIDEB+16]
    paddw    m0, [r0 - FDEC_STRIDEB+24]
    HADDW    m0, m1
    paddw    m0, %1
    psrlw    m0, %2
    SPLATW   m0, m0
    STORE16  m0, m0, m0, m0
%else ; !HIGH_BIT_DEPTH
    pxor     m0, m0
    pxor     m1, m1
    psadbw   m0, [r0 - FDEC_STRIDE]
    psadbw   m1, [r0 - FDEC_STRIDE + 8]
    paddusw  m0, m1
    paddusw  m0, %1
    psrlw    m0, %2 ; dc
    pshufw   m0, m0, 0
    packuswb m0, m0 ; dc in bytes
    STORE16  m0, m0
%endif
%endmacro

INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
    movd m6, r1d
    PRED16x16_DC_MMX m6, 5
%else
    PRED16x16_DC_MMX r1m, 5
%endif
    RET

INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC_MMX [pw_8], 4
    RET

INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
    movd    m0, r1m
    SPLATW  m0, m0
    STORE16 m0, m0, m0, m0
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
    movd     m0, r1m
    pshufw   m0, m0, 0
    packuswb m0, m0
    STORE16  m0, m0
    RET
%endif

%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
    mova    xm0, [r0 - FDEC_STRIDEB+ 0]
    paddw   xm0, [r0 - FDEC_STRIDEB+16]
    HADDW   xm0, xm2
    paddw   xm0, %1
    psrlw   xm0, %2
    SPLATW   m0, xm0
%if mmsize == 32
    STORE16  m0
%else
    STORE16  m0, m0
%endif
%else ; !HIGH_BIT_DEPTH
    pxor     m0, m0
    psadbw   m0, [r0 - FDEC_STRIDE]
    movhlps  m1, m0
    paddw    m0, m1
    paddusw  m0, %1
    psrlw    m0, %2 ; dc
    SPLATW   m0, m0
    packuswb m0, m0 ; dc in bytes
    STORE16  m0
%endif
%endmacro

%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
    movd xm3, r1m
    PRED16x16_DC xm3, 5
    RET

cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC [pw_8], 4
    RET

cglobal predict_16x16_dc_left_core, 1,2
    movd   xm0, r1m
    SPLATW  m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
    STORE16 m0, m0
%else
%if HIGH_BIT_DEPTH == 0
    packuswb m0, m0
%endif
    STORE16 m0
%endif
    RET
%endmacro

INIT_XMM sse2
PREDICT_16x16_DC_CORE
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC_CORE
%else
INIT_XMM avx2
PREDICT_16x16_DC_CORE
%endif