X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpredict-a.asm;h=16c29eee88bffb34b168552e245ebb49accef2f7;hb=64f4e24909924fceeea6e154d71b7dfbf586c7ea;hp=f676c05a41fc32985ccf24cf5867f207d28505db;hpb=389b401a99f2f33b41db7d74904b3ff7509d79e5;p=x264 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index f676c05a..16c29eee 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1,11 +1,12 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project +;* Copyright (C) 2005-2016 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz ;* Fiona Glaser +;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,12 +29,11 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -55,114 +55,112 @@ cextern pw_8 cextern pw_16 cextern pw_00ff cextern pw_pixel_max - -%macro STORE8x8 2-4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %2 - mova [r0 + 1*FDEC_STRIDEB], %2 - mova [r0 + 2*FDEC_STRIDEB], %2 - mova [r0 + 3*FDEC_STRIDEB], %2 +cextern pw_0to15 + +%macro STORE8 1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + mova [r0+2*FDEC_STRIDEB], %1 + mova [r0+3*FDEC_STRIDEB], %1 %endmacro -%macro STORE8x16 4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %2 - mova [r0 + -3*FDEC_STRIDEB], %2 - mova [r0 + -2*FDEC_STRIDEB], %2 - mova [r0 + -1*FDEC_STRIDEB], %2 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %3 - mova [r0 + -3*FDEC_STRIDEB], %3 - mova [r0 + -2*FDEC_STRIDEB], %3 - mova [r0 + -1*FDEC_STRIDEB], %3 - mova [r0 + 0*FDEC_STRIDEB], %4 - mova [r0 + 1*FDEC_STRIDEB], %4 - mova [r0 + 2*FDEC_STRIDEB], %4 - mova [r0 + 3*FDEC_STRIDEB], %4 +%macro STORE16 1-4 +%if %0 > 1 + mov r1d, 2*%0 +.loop: + mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 + mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 +%ifidn %0, 4 + mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 + mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 + add r0, 2*FDEC_STRIDEB +%else ; %0 == 2 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 + mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 +%endif + dec r1d + jg .loop +%else ; %0 == 1 + STORE8 %1 +%if HIGH_BIT_DEPTH ; Different code paths to reduce code size + add r0, 6*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova 
[r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 +%else + add r0, 8*FDEC_STRIDE + mova [r0-4*FDEC_STRIDE], %1 + mova [r0-3*FDEC_STRIDE], %1 + mova [r0-2*FDEC_STRIDE], %1 + mova [r0-1*FDEC_STRIDE], %1 + mova [r0+0*FDEC_STRIDE], %1 + mova [r0+1*FDEC_STRIDE], %1 + mova [r0+2*FDEC_STRIDE], %1 + mova [r0+3*FDEC_STRIDE], %1 +%endif ; HIGH_BIT_DEPTH +%endif %endmacro -%macro STORE16x16 2-4 -%ifidn %0, 4 - mov r1d, 8 -.loop: - mova [r0 + 0*FDEC_STRIDEB + 0], %1 - mova [r0 + 1*FDEC_STRIDEB + 0], %1 - mova [r0 + 0*FDEC_STRIDEB + 8], %2 - mova [r0 + 1*FDEC_STRIDEB + 8], %2 - mova [r0 + 0*FDEC_STRIDEB +16], %3 - mova [r0 + 1*FDEC_STRIDEB +16], %3 - mova [r0 + 0*FDEC_STRIDEB +24], %4 - mova [r0 + 1*FDEC_STRIDEB +24], %4 - add r0, 2*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_LOAD 2 ; reg, offset +%if cpuflag(avx2) + vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] +%elif HIGH_BIT_DEPTH + movd %1, [r0+(%2)*FDEC_STRIDEB-4] + SPLATW %1, %1, 1 %else - mov r1d, 4 -.loop: - mova [r0 + 0*FDEC_STRIDE], %1 - mova [r0 + 1*FDEC_STRIDE], %1 - mova [r0 + 2*FDEC_STRIDE], %1 - mova [r0 + 3*FDEC_STRIDE], %1 - mova [r0 + 0*FDEC_STRIDE + 8], %2 - mova [r0 + 1*FDEC_STRIDE + 8], %2 - mova [r0 + 2*FDEC_STRIDE + 8], %2 - mova [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro -%macro STORE16x16_SSE2 1-2 -%ifidn %0,2 - mov r1d, 4 -.loop - mova [r0+0*FDEC_STRIDEB+ 0], %1 - mova [r0+0*FDEC_STRIDEB+16], %2 - mova [r0+1*FDEC_STRIDEB+ 0], %1 - mova [r0+1*FDEC_STRIDEB+16], %2 - mova [r0+2*FDEC_STRIDEB+ 0], %1 - mova [r0+2*FDEC_STRIDEB+16], %2 - mova [r0+3*FDEC_STRIDEB+ 0], %1 - mova [r0+3*FDEC_STRIDEB+16], %2 - add r0, 4*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_STORE 3 ; reg, offset, width +%assign %%w %3*SIZEOF_PIXEL +%if %%w == 8 + movq [r0+(%2)*FDEC_STRIDEB], %1 %else - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 - add r0, 8*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 + %assign %%i 0 + %rep %%w/mmsize + mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 + %assign %%i %%i+mmsize + %endrep %endif %endmacro +%macro PRED_H_4ROWS 2 ; width, inc_ptr + PRED_H_LOAD m0, 0 + PRED_H_LOAD m1, 1 + PRED_H_STORE m0, 0, %1 + PRED_H_STORE m1, 1, %1 + PRED_H_LOAD m0, 2 +%if %2 + add r0, 4*FDEC_STRIDEB +%endif + PRED_H_LOAD m1, 3-4*%2 + PRED_H_STORE m0, 2-4*%2, %1 + PRED_H_STORE m1, 3-4*%2, %1 +%endmacro + ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED8x8_LOWPASS 4-5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH paddw %2, %3 psrlw %2, 1 pavgw %1, %4, %2 @@ -176,6 +174,16 @@ cextern pw_pixel_max %endif %endmacro +;----------------------------------------------------------------------------- +; void predict_4x4_h( pixel *src ) +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +INIT_XMM avx2 +cglobal predict_4x4_h, 1,1 + PRED_H_4ROWS 4, 0 + RET +%endif + 
;----------------------------------------------------------------------------- ; void predict_4x4_ddl( pixel *src ) ;----------------------------------------------------------------------------- @@ -184,7 +192,7 @@ cglobal predict_4x4_ddl, 1,1 movu m1, [r0-FDEC_STRIDEB] PSLLPIX m2, m1, 1 mova m0, m1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH PSRLPIX m1, m1, 1 pshufhw m1, m1, q2210 %else @@ -204,7 +212,7 @@ cglobal predict_4x4_ddl, 1,1 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_4x4_DDL INIT_XMM avx @@ -233,7 +241,7 @@ PREDICT_4x4_DDL ;----------------------------------------------------------------------------- ; void predict_4x4_vr( pixel *src ) ;----------------------------------------------------------------------------- -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 INIT_MMX ssse3 cglobal predict_4x4_vr, 1,1 movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 @@ -263,7 +271,7 @@ cglobal predict_4x4_vr, 1,1 ;----------------------------------------------------------------------------- %macro PREDICT_4x4 4 cglobal predict_4x4_ddr, 1,1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movu m2, [r0-1*FDEC_STRIDEB-8] pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2 pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1 @@ -304,7 +312,7 @@ cglobal predict_4x4_ddr, 1,1 ; void predict_4x4_vr( pixel *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_vr, 1,1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movu m1, [r0-1*FDEC_STRIDEB-8] pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1 @@ -344,7 +352,7 @@ cglobal predict_4x4_vr, 1,1 ; void predict_4x4_hd( pixel *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_hd, 1,1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movu m1, [r0-1*FDEC_STRIDEB-8] PSLLPIX m1, m1, 1 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3 @@ -383,7 +391,7 @@ cglobal predict_4x4_hd, 1,1 ;----------------------------------------------------------------------------- ; void predict_4x4_ddr( pixel *src ) ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX mmx2 cglobal predict_4x4_ddr, 1,1 mova m0, [r0+1*FDEC_STRIDEB-8] @@ -467,7 +475,7 @@ PREDICT_4x4 b, bw, wd, dq ;----------------------------------------------------------------------------- ; void predict_4x4_hu( pixel *src ) ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_hu_mmx2, 1,1 movq m0, [r0+0*FDEC_STRIDEB-8] @@ -540,7 +548,7 @@ cglobal predict_4x4_vl, 1,1 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_4x4_V1 w INIT_XMM avx @@ -582,9 +590,9 @@ PREDICT_4x4_V1 b ;----------------------------------------------------------------------------- ; void predict_4x4_dc( pixel *src ) ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_4x4_dc_mmx2, 1,1 +INIT_MMX mmx2 +%if HIGH_BIT_DEPTH +cglobal predict_4x4_dc, 1,1 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] @@ -603,8 +611,7 @@ cglobal predict_4x4_dc_mmx2, 1,1 RET %else ; !HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_4x4_dc_mmx2, 1,4 +cglobal predict_4x4_dc, 1,4 pxor mm7, mm7 movd mm0, [r0-FDEC_STRIDEB] psadbw mm0, mm7 @@ -633,7 +640,7 @@ cglobal predict_4x4_dc_mmx2, 1,4 cglobal predict_8x8_filter, 4,6,6 add 
r0, 0x58*SIZEOF_PIXEL %define src r0-0x58*SIZEOF_PIXEL -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 mov r4, r1 %define t1 r4 %define t4 r1 @@ -669,6 +676,7 @@ cglobal predict_8x8_filter, 4,6,6 add t4d, r5d shr t4d, 2 mov [t1+7*SIZEOF_PIXEL], t4%1 + mov [t1+6*SIZEOF_PIXEL], t4%1 test r3b, 2 je .done .check_top: @@ -734,7 +742,7 @@ INIT_XMM cpuname %endif %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_FILTER w, d, q, dq INIT_XMM ssse3 @@ -754,12 +762,12 @@ PREDICT_FILTER b, w, d, q %macro PREDICT_8x8_V 0 cglobal predict_8x8_v, 2,2 mova m0, [r1+16*SIZEOF_PIXEL] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro -%ifdef HIGH_BIT_DEPTH -INIT_XMM sse2 +%if HIGH_BIT_DEPTH +INIT_XMM sse PREDICT_8x8_V %else INIT_MMX mmx2 @@ -785,7 +793,7 @@ cglobal predict_8x8_h, 2,2 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_H wd, D %else @@ -796,21 +804,21 @@ PREDICT_8x8_H bw, W ;----------------------------------------------------------------------------- ; void predict_8x8_dc( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH -INIT_XMM -cglobal predict_8x8_dc_sse2, 2,2 +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +cglobal predict_8x8_dc, 2,2 movu m0, [r1+14] paddw m0, [r1+32] HADDW m0, m1 paddw m0, [pw_8] psrlw m0, 4 SPLATW m0, m0 - STORE8x8 m0, m0 - REP_RET + STORE8 m0 + RET %else ; !HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_8x8_dc_mmx2, 2,2 +INIT_MMX mmx2 +cglobal predict_8x8_dc, 2,2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1+7] @@ -820,7 +828,7 @@ cglobal predict_8x8_dc_mmx2, 2,2 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endif ; HIGH_BIT_DEPTH @@ -828,7 +836,7 @@ cglobal predict_8x8_dc_mmx2, 2,2 ; void predict_8x8_dc_top ( pixel *src, pixel *edge ); ; void predict_8x8_dc_left( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro PREDICT_8x8_DC 3 cglobal %1, 2,2 %3 m0, [r1+%2] @@ -836,12 +844,12 @@ cglobal %1, 2,2 paddw m0, [pw_4] psrlw m0, 3 SPLATW m0, m0 - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro -INIT_XMM -PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova -PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu +INIT_XMM sse2 +PREDICT_8x8_DC predict_8x8_dc_top , 32, mova +PREDICT_8x8_DC predict_8x8_dc_left, 14, movu %else ; !HIGH_BIT_DEPTH %macro PREDICT_8x8_DC 2 @@ -852,7 +860,7 @@ cglobal %1, 2,2 psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endmacro INIT_MMX @@ -927,14 +935,14 @@ cglobal predict_8x8_ddr, 2,2,7 RET %endmacro ; PREDICT_8x8_DDLR -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_DDLR INIT_XMM ssse3 PREDICT_8x8_DDLR INIT_XMM ssse3, cache64 PREDICT_8x8_DDLR -%elifndef ARCH_X86_64 +%elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_DDLR %endif @@ -945,7 +953,7 @@ PREDICT_8x8_DDLR %macro PREDICT_8x8_HU 2 cglobal predict_8x8_hu, 2,2,8 add r0, 4*FDEC_STRIDEB -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if cpuflag(ssse3) movu m5, [r1+7*SIZEOF_PIXEL] pshufb m5, [pw_reverse] @@ -999,14 +1007,14 @@ cglobal predict_8x8_hu, 2,2,8 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_HU d, wd INIT_XMM ssse3 PREDICT_8x8_HU d, wd INIT_XMM avx PREDICT_8x8_HU d, wd -%elifndef ARCH_X86_64 +%elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_HU w, bw %endif @@ -1048,131 +1056,167 @@ cglobal predict_8x8_vr, 2,3 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if 
HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_VR w INIT_XMM ssse3 PREDICT_8x8_VR w INIT_XMM avx PREDICT_8x8_VR w -%elifndef ARCH_X86_64 +%elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_VR b %endif %macro LOAD_PLANE_ARGS 0 -%ifdef ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +%if cpuflag(avx2) && ARCH_X86_64 == 0 + vpbroadcastw m0, r1m + vpbroadcastw m2, r2m + vpbroadcastw m4, r3m +%elif mmsize == 8 ; MMX is only used on x86_32 + SPLATW m0, r1m + SPLATW m2, r2m + SPLATW m4, r3m %else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 + movd xm0, r1m + movd xm2, r2m + movd xm4, r3m + SPLATW m0, xm0 + SPLATW m2, xm2 + SPLATW m4, xm4 %endif %endmacro ;----------------------------------------------------------------------------- ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -%ifndef ARCH_X86_64 -INIT_MMX -cglobal predict_8x8c_p_core_mmx2, 1,2 +%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0 +%macro PREDICT_CHROMA_P_MMX 1 +cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS - movq mm1, mm2 - pmullw mm2, [pw_3210] - psllw mm1, 2 - paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} - paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - - mov r1d, 8 + movq m1, m2 + pmullw m2, [pw_0to15] + psllw m1, 2 + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} + paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} + mov r1d, %1 ALIGN 4 .loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 + movq m5, m0 + movq m6, m1 + psraw m5, 5 + psraw m6, 5 + packuswb m5, m6 + movq [r0], m5 - paddsw mm0, mm4 - paddsw mm1, mm4 + paddsw m0, m4 + paddsw m1, m4 add r0, FDEC_STRIDE dec r1d - jg .loop - REP_RET -%endif ; !ARCH_X86_64 + jg .loop + RET +%endmacro ; PREDICT_CHROMA_P_MMX -INIT_XMM -%ifdef HIGH_BIT_DEPTH -cglobal predict_8x8c_p_core_sse2, 1,1,7 - movd m0, r1m - movd m2, r2m - movd m4, r3m +INIT_MMX mmx2 +PREDICT_CHROMA_P_MMX 8 +PREDICT_CHROMA_P_MMX 16 +%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH + +%macro PREDICT_CHROMA_P 1 +%if HIGH_BIT_DEPTH +cglobal predict_8x%1c_p_core, 1,2,7 + LOAD_PLANE_ARGS mova m3, [pw_pixel_max] pxor m1, m1 - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 pmullw m2, [pw_43210123] ; b - pmullw m5, m4, [pw_m3] ; c +%if %1 == 16 + pmullw m5, m4, [pw_m7] ; c +%else + pmullw m5, m4, [pw_m3] +%endif paddw m5, [pw_16] - mov r1d, 8 +%if mmsize == 32 + mova xm6, xm4 + paddw m4, m4 + paddw m5, m6 +%endif + mov r1d, %1/(mmsize/16) .loop: paddsw m6, m2, m5 paddsw m6, m0 psraw m6, 5 CLIPW m6, m1, m3 - mova [r0], m6 paddw m5, m4 +%if mmsize == 32 + vextracti128 [r0], m6, 1 + mova [r0+FDEC_STRIDEB], xm6 + add r0, 2*FDEC_STRIDEB +%else + mova [r0], m6 add r0, FDEC_STRIDEB - dec r1d +%endif + dec r1d jg .loop - REP_RET + RET %else ; !HIGH_BIT_DEPTH -cglobal predict_8x8c_p_core_sse2, 1,1 - movd m0, r1m - movd m2, r2m - movd m4, r3m - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 - pmullw m2, [pw_76543210] - paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} - paddsw m3, m0, m4 +cglobal predict_8x%1c_p_core, 1,2 + LOAD_PLANE_ARGS +%if mmsize == 32 + vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + pmullw m2, m1 + mova xm1, xm4 ; zero upper half paddsw m4, m4 -call .loop - add r0, FDEC_STRIDE*4 + paddsw m0, m1 +%else + pmullw m2, [pw_0to15] +%endif + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, 
i+7*b} + paddsw m1, m0, m4 + paddsw m4, m4 + mov r1d, %1/(mmsize/8) .loop: - paddsw m1, m3, m4 - paddsw m5, m0, m4 - psraw m3, 5 - psraw m0, 5 - packuswb m0, m3 - movq [r0+FDEC_STRIDE*0], m0 - movhps [r0+FDEC_STRIDE*1], m0 - paddsw m0, m5, m4 - paddsw m3, m1, m4 - psraw m5, 5 - psraw m1, 5 - packuswb m5, m1 - movq [r0+FDEC_STRIDE*2], m5 - movhps [r0+FDEC_STRIDE*3], m5 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 +%if mmsize == 32 + movq [r0+FDEC_STRIDE*1], xm2 + movhps [r0+FDEC_STRIDE*3], xm2 + vextracti128 xm2, m2, 1 + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*2], xm2 +%else + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*1], xm2 +%endif + add r0, FDEC_STRIDE*mmsize/8 + dec r1d + jg .loop RET %endif ; HIGH_BIT_DEPTH +%endmacro ; PREDICT_CHROMA_P + +INIT_XMM sse2 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 +INIT_XMM avx +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 +INIT_YMM avx2 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -%ifndef ARCH_X86_64 -cglobal predict_16x16_p_core_mmx2, 1,2 +%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 +INIT_MMX mmx2 +cglobal predict_16x16_p_core, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210] + pmullw mm5, [pw_0to15] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -1205,8 +1249,8 @@ ALIGN 4 add r0, FDEC_STRIDE dec r1d jg .loop - REP_RET -%endif ; !ARCH_X86_64 + RET +%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 %macro PREDICT_16x16_P 0 cglobal predict_16x16_p_core, 1,2,8 @@ -1216,9 +1260,9 @@ cglobal predict_16x16_p_core, 1,2,8 SPLATW m0, m0, 0 SPLATW m1, m1, 0 SPLATW m2, m2, 0 - pmullw m3, m1, [pw_76543210] + pmullw m3, m1, [pw_0to15] psllw m1, 3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pxor m6, m6 mov r1d, 16 .loop: @@ -1237,8 +1281,6 @@ cglobal predict_16x16_p_core, 1,2,8 mova [r0+16], m5 add r0, FDEC_STRIDEB paddw m6, m2 - dec r1d - jg .loop %else ; !HIGH_BIT_DEPTH paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} @@ -1259,20 +1301,74 @@ ALIGN 4 paddsw m0, m7 paddsw m1, m7 add r0, FDEC_STRIDE*2 - dec r1d - jg .loop %endif ; !HIGH_BIT_DEPTH - REP_RET + dec r1d + jg .loop + RET %endmacro ; PREDICT_16x16_P INIT_XMM sse2 PREDICT_16x16_P -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 INIT_XMM avx PREDICT_16x16_P %endif -%ifndef HIGH_BIT_DEPTH +INIT_YMM avx2 +cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH + LOAD_PLANE_ARGS +%if HIGH_BIT_DEPTH + pmullw m2, [pw_0to15] + pxor m5, m5 + pxor m6, m6 + mova m7, [pw_pixel_max] + mov r1d, 8 +.loop: + paddsw m1, m2, m5 + paddw m5, m4 + paddsw m1, m0 + paddsw m3, m2, m5 + psraw m1, 5 + paddsw m3, m0 + psraw m3, 5 + CLIPW m1, m6, m7 + mova [r0+0*FDEC_STRIDEB], m1 + CLIPW m3, m6, m7 + mova [r0+1*FDEC_STRIDEB], m3 + paddw m5, m4 + add r0, 2*FDEC_STRIDEB +%else ; !HIGH_BIT_DEPTH + vbroadcasti128 m1, [pw_0to15] + mova xm3, xm4 ; zero high bits + pmullw m1, m2 + psllw m2, 3 + paddsw m0, m3 + paddsw m0, m1 ; X+1*C X+0*C + paddsw m1, m0, m2 ; Y+1*C Y+0*C + paddsw m4, m4 + mov r1d, 4 +.loop: + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C + vextracti128 [r0+0*FDEC_STRIDE], m2, 1 + mova [r0+1*FDEC_STRIDE], xm2 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 
+ paddsw m1, m4 + packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C + vextracti128 [r0+2*FDEC_STRIDE], m2, 1 + mova [r0+3*FDEC_STRIDE], xm2 + add r0, FDEC_STRIDE*4 +%endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop + RET + +%if HIGH_BIT_DEPTH == 0 %macro PREDICT_8x8 0 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge ) @@ -1361,7 +1457,7 @@ cglobal predict_8x8_vr, 2,2 movhps [r0-4*FDEC_STRIDE], m3 movhps [r0-3*FDEC_STRIDE], m0 %if cpuflag(ssse3) - movhlps m3, m3 + punpckhqdq m3, m3 pshufb m0, [shuf_vr] palignr m3, m0, 13 %else @@ -1399,6 +1495,51 @@ PREDICT_8x8 %endif ; !HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void predict_8x8_vl( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_VL_10 1 +cglobal predict_8x8_vl, 2,2,8 + mova m0, [r1+16*SIZEOF_PIXEL] + mova m1, [r1+24*SIZEOF_PIXEL] + PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 + PSRLPIX m4, m1, 1 + pavg%1 m6, m0, m2 + pavg%1 m7, m1, m4 + add r0, FDEC_STRIDEB*4 + mova [r0-4*FDEC_STRIDEB], m6 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5 + mova [r0-2*FDEC_STRIDEB], m3 + PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5 + mova [r0+0*FDEC_STRIDEB], m3 + PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5 + mova [r0+2*FDEC_STRIDEB], m7 + PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6 + PSLLPIX m5, m0, 1 + PRED8x8_LOWPASS m0, m5, m2, m0, m7 + PRED8x8_LOWPASS m1, m3, m4, m1, m7 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2 + mova [r0-3*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2 + mova [r0-1*FDEC_STRIDEB], m4 + PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2 + mova [r0+1*FDEC_STRIDEB], m4 + PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2 + mova [r0+3*FDEC_STRIDEB], m1 + RET +%endmacro +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_VL_10 w +INIT_XMM ssse3 +PREDICT_8x8_VL_10 w +INIT_XMM avx +PREDICT_8x8_VL_10 w +%else +INIT_MMX mmx2 +PREDICT_8x8_VL_10 b +%endif + ;----------------------------------------------------------------------------- ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- @@ -1441,7 +1582,7 @@ cglobal predict_8x8_hd, 2,2 RET %endmacro -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_HD w, wd INIT_XMM ssse3 @@ -1485,7 +1626,7 @@ INIT_XMM avx PREDICT_8x8_HD %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void predict_8x8_hu( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- @@ -1560,19 +1701,19 @@ cglobal predict_8x8_hu_ssse3, 2,2 %macro PREDICT_8x8C_V 0 cglobal predict_8x8c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro -%ifdef HIGH_BIT_DEPTH -INIT_XMM sse2 +%if HIGH_BIT_DEPTH +INIT_XMM sse PREDICT_8x8C_V %else INIT_MMX mmx PREDICT_8x8C_V %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX cglobal predict_8x8c_v_mmx, 1,1 @@ -1594,12 +1735,12 @@ cglobal predict_8x8c_v_mmx, 1,1 %macro PREDICT_8x16C_V 0 cglobal predict_8x16c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x16 m0, m0, m0, m0 + STORE16 m0 RET %endmacro -%ifdef HIGH_BIT_DEPTH -INIT_XMM sse2 +%if HIGH_BIT_DEPTH +INIT_XMM sse PREDICT_8x16C_V %else INIT_MMX mmx @@ -1609,69 +1750,56 @@ PREDICT_8x16C_V ;----------------------------------------------------------------------------- ; void predict_8x8c_h( uint8_t *src ) 
;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH - -INIT_XMM sse2 -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 - add r0, FDEC_STRIDEB*4 -%assign Y -4 -%rep %1 - movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2] - SPLATW m0, m0, 1 - mova [r0+FDEC_STRIDEB*Y], m0 -%assign Y Y+1 -%endrep +%macro PREDICT_C_H 0 +cglobal predict_8x8c_h, 1,1 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] +%endif + PRED_H_4ROWS 8, 1 + PRED_H_4ROWS 8, 0 RET -%endmacro - -PREDICT_C_H 8 -PREDICT_C_H 16 - -%else ; !HIGH_BIT_DEPTH - -%macro PREDICT_C_H_CORE 1 -%assign Y %1 -%rep 4 - SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1 - mova [r0+FDEC_STRIDE*Y], m0 -%assign Y Y+1 -%endrep -%endmacro -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 -%if cpuflag(ssse3) - mova m1, [pb_3] +cglobal predict_8x16c_h, 1,2 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%if %1==16 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 -%endif - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - PREDICT_C_H_CORE 0 + mov r1d, 4 +.loop: + PRED_H_4ROWS 8, 1 + dec r1d + jg .loop RET %endmacro INIT_MMX mmx2 -PREDICT_C_H 8 -PREDICT_C_H 16 +PREDICT_C_H +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_C_H +INIT_XMM avx2 +PREDICT_C_H +%else INIT_MMX ssse3 -PREDICT_C_H 8 -PREDICT_C_H 16 - +PREDICT_C_H %endif + ;----------------------------------------------------------------------------- ; void predict_8x8c_dc( pixel *src ) ;----------------------------------------------------------------------------- +%macro LOAD_LEFT 1 + movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL] + movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL] + add r1d, r2d +%endmacro %macro PREDICT_8x8C_DC 0 cglobal predict_8x8c_dc, 1,3 pxor m7, m7 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movq m0, [r0-FDEC_STRIDEB+0] movq m1, [r0-FDEC_STRIDEB+8] HADDW m0, m2 @@ -1684,23 +1812,10 @@ cglobal predict_8x8c_dc, 1,3 %endif add r0, FDEC_STRIDEB*4 - movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL] - movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL] - add r1d, r2d - movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL] - add r1d, r2d - movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL] - add r1d, r2d - movd m2, r1d ; s2 - - movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL] - movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL] - add r1d, r2d - movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL] - add r1d, r2d - movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL] - add r1d, r2d - movd m3, r1d ; s3 + LOAD_LEFT 0 ; s2 + movd m2, r1d + LOAD_LEFT 4 ; s3 + movd m3, r1d punpcklwd m0, m1 punpcklwd m2, m3 @@ -1710,7 +1825,7 @@ cglobal predict_8x8c_dc, 1,3 paddw m0, m3 psrlw m0, 2 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if cpuflag(sse2) movq2dq xmm0, m0 punpcklwd xmm0, xmm0 @@ -1754,13 +1869,131 @@ cglobal predict_8x8c_dc, 1,3 INIT_MMX mmx2 PREDICT_8x8C_DC -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_MMX sse2 PREDICT_8x8C_DC %endif +%if HIGH_BIT_DEPTH +%macro STORE_4LINES 3 +%if cpuflag(sse2) + movdqa [r0+FDEC_STRIDEB*(%3-4)], %1 + movdqa [r0+FDEC_STRIDEB*(%3-3)], %1 + movdqa [r0+FDEC_STRIDEB*(%3-2)], %1 + movdqa [r0+FDEC_STRIDEB*(%3-1)], %1 +%else + movq [r0+FDEC_STRIDEB*(%3-4)+0], %1 + movq [r0+FDEC_STRIDEB*(%3-4)+8], %2 + movq [r0+FDEC_STRIDEB*(%3-3)+0], %1 + movq [r0+FDEC_STRIDEB*(%3-3)+8], %2 + movq 
[r0+FDEC_STRIDEB*(%3-2)+0], %1 + movq [r0+FDEC_STRIDEB*(%3-2)+8], %2 + movq [r0+FDEC_STRIDEB*(%3-1)+0], %1 + movq [r0+FDEC_STRIDEB*(%3-1)+8], %2 +%endif +%endmacro +%else +%macro STORE_4LINES 2 + movq [r0+FDEC_STRIDEB*(%2-4)], %1 + movq [r0+FDEC_STRIDEB*(%2-3)], %1 + movq [r0+FDEC_STRIDEB*(%2-2)], %1 + movq [r0+FDEC_STRIDEB*(%2-1)], %1 +%endmacro +%endif + +%macro PREDICT_8x16C_DC 0 +cglobal predict_8x16c_dc, 1,3 + pxor m7, m7 +%if HIGH_BIT_DEPTH + movq m0, [r0-FDEC_STRIDEB+0] + movq m1, [r0-FDEC_STRIDEB+8] + HADDW m0, m2 + HADDW m1, m2 +%else + movd m0, [r0-FDEC_STRIDEB+0] + movd m1, [r0-FDEC_STRIDEB+4] + psadbw m0, m7 ; s0 + psadbw m1, m7 ; s1 +%endif + punpcklwd m0, m1 ; s0, s1 + + add r0, FDEC_STRIDEB*4 + LOAD_LEFT 0 ; s2 + pinsrw m0, r1d, 2 + LOAD_LEFT 4 ; s3 + pinsrw m0, r1d, 3 ; s0, s1, s2, s3 + add r0, FDEC_STRIDEB*8 + LOAD_LEFT 0 ; s4 + pinsrw m1, r1d, 2 + LOAD_LEFT 4 ; s5 + pinsrw m1, r1d, 3 ; s1, __, s4, s5 + sub r0, FDEC_STRIDEB*8 + + pshufw m2, m0, q1310 ; s0, s1, s3, s1 + pshufw m0, m0, q3312 ; s2, s1, s3, s3 + pshufw m3, m1, q0302 ; s4, s1, s5, s1 + pshufw m1, m1, q3322 ; s4, s4, s5, s5 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 2 + psrlw m1, 2 + pavgw m0, m7 + pavgw m1, m7 +%if HIGH_BIT_DEPTH +%if cpuflag(sse2) + movq2dq xmm0, m0 + movq2dq xmm1, m1 + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + pshufd xmm2, xmm0, q3322 + pshufd xmm3, xmm1, q3322 + punpckldq xmm0, xmm0 + punpckldq xmm1, xmm1 + STORE_4LINES xmm0, xmm0, 0 + STORE_4LINES xmm2, xmm2, 4 + STORE_4LINES xmm1, xmm1, 8 + STORE_4LINES xmm3, xmm3, 12 +%else + pshufw m2, m0, q0000 + pshufw m3, m0, q1111 + pshufw m4, m0, q2222 + pshufw m5, m0, q3333 + STORE_4LINES m2, m3, 0 + STORE_4LINES m4, m5, 4 + pshufw m2, m1, q0000 + pshufw m3, m1, q1111 + pshufw m4, m1, q2222 + pshufw m5, m1, q3333 + STORE_4LINES m2, m3, 8 + STORE_4LINES m4, m5, 12 +%endif +%else + packuswb m0, m0 ; dc0, dc1, dc2, dc3 + packuswb m1, m1 ; dc4, dc5, dc6, dc7 + punpcklbw m0, m0 + punpcklbw m1, m1 + pshufw m2, m0, q1100 + pshufw m3, m0, q3322 + pshufw m4, m1, q1100 + pshufw m5, m1, q3322 + STORE_4LINES m2, 0 + STORE_4LINES m3, 4 + add r0, FDEC_STRIDEB*8 + STORE_4LINES m4, 0 + STORE_4LINES m5, 4 +%endif + RET +%endmacro + +INIT_MMX mmx2 +PREDICT_8x16C_DC +%if HIGH_BIT_DEPTH +INIT_MMX sse2 +PREDICT_8x16C_DC +%endif + %macro PREDICT_C_DC_TOP 1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH INIT_XMM cglobal predict_8x%1c_dc_top_sse2, 1,1 pxor m2, m2 @@ -1772,7 +2005,7 @@ cglobal predict_8x%1c_dc_top_sse2, 1,1 paddw m0, m1 psrlw m0, 1 pavgw m0, m2 - STORE8x%1 m0, m0, m0, m0 + STORE%1 m0 RET %else ; !HIGH_BIT_DEPTH INIT_MMX @@ -1791,7 +2024,7 @@ cglobal predict_8x%1c_dc_top_mmx2, 1,1 pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) packuswb mm0, mm1 ; dc0,dc1 (b) - STORE8x%1 mm0, mm0, mm0, mm0 + STORE%1 mm0 RET %endif %endmacro @@ -1802,33 +2035,31 @@ PREDICT_C_DC_TOP 16 ;----------------------------------------------------------------------------- ; void predict_16x16_v( pixel *src ) ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+ 8] - mova m2, [r0 - FDEC_STRIDEB+16] - mova m3, [r0 - FDEC_STRIDEB+24] - STORE16x16 m0, m1, m2, m3 - REP_RET -INIT_XMM -cglobal predict_16x16_v_sse2, 2,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+16] - STORE16x16_SSE2 m0, m1 - REP_RET -%else ; !HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - movq m0, [r0 - FDEC_STRIDE + 0] - movq m1, [r0 
- FDEC_STRIDE + 8] - STORE16x16 m0, m1 - REP_RET -INIT_XMM -cglobal predict_16x16_v_sse2, 1,1 - movdqa xmm0, [r0 - FDEC_STRIDE] - STORE16x16_SSE2 xmm0 + +%macro PREDICT_16x16_V 0 +cglobal predict_16x16_v, 1,2 +%assign %%i 0 +%rep 16*SIZEOF_PIXEL/mmsize + mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize] +%assign %%i %%i+1 +%endrep +%if 16*SIZEOF_PIXEL/mmsize == 4 + STORE16 m0, m1, m2, m3 +%elif 16*SIZEOF_PIXEL/mmsize == 2 + STORE16 m0, m1 +%else + STORE16 m0 +%endif RET +%endmacro + +INIT_MMX mmx2 +PREDICT_16x16_V +INIT_XMM sse +PREDICT_16x16_V +%if HIGH_BIT_DEPTH +INIT_YMM avx +PREDICT_16x16_V %endif ;----------------------------------------------------------------------------- @@ -1836,46 +2067,23 @@ cglobal predict_16x16_v_sse2, 1,1 ;----------------------------------------------------------------------------- %macro PREDICT_16x16_H 0 cglobal predict_16x16_h, 1,2 - mov r1, 12*FDEC_STRIDEB -%ifdef HIGH_BIT_DEPTH -.vloop: -%assign Y 0 -%rep 4 - movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL] - SPLATW m0, m0, 1 - mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0 - mova [r0+r1+Y*FDEC_STRIDEB+16], m0 -%if mmsize==8 - mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0 - mova [r0+r1+Y*FDEC_STRIDEB+24], m0 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%assign Y Y+1 -%endrep - -%else ; !HIGH_BIT_DEPTH -%if cpuflag(ssse3) - mova m1, [pb_3] -%endif -.vloop: -%assign Y 0 -%rep 4 - SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1 - mova [r0+r1+FDEC_STRIDE*Y], m0 -%if mmsize==8 - mova [r0+r1+FDEC_STRIDE*Y+8], m0 -%endif -%assign Y Y+1 -%endrep -%endif ; HIGH_BIT_DEPTH - sub r1, 4*FDEC_STRIDEB - jge .vloop - REP_RET + mov r1d, 4 +.loop: + PRED_H_4ROWS 16, 1 + dec r1d + jg .loop + RET %endmacro INIT_MMX mmx2 PREDICT_16x16_H +%if HIGH_BIT_DEPTH INIT_XMM sse2 -%ifdef HIGH_BIT_DEPTH +PREDICT_16x16_H +INIT_YMM avx2 PREDICT_16x16_H %else ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 @@ -1886,9 +2094,8 @@ PREDICT_16x16_H ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 2 -%ifdef HIGH_BIT_DEPTH +%macro PRED16x16_DC_MMX 2 +%if HIGH_BIT_DEPTH mova m0, [r0 - FDEC_STRIDEB+ 0] paddw m0, [r0 - FDEC_STRIDEB+ 8] paddw m0, [r0 - FDEC_STRIDEB+16] @@ -1897,7 +2104,7 @@ PREDICT_16x16_H paddw m0, %1 psrlw m0, %2 SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 + STORE16 m0, m0, m0, m0 %else ; !HIGH_BIT_DEPTH pxor m0, m0 pxor m1, m1 @@ -1908,89 +2115,97 @@ PREDICT_16x16_H psrlw m0, %2 ; dc pshufw m0, m0, 0 packuswb m0, m0 ; dc in bytes - STORE16x16 m0, m0 + STORE16 m0, m0 %endif %endmacro -INIT_MMX -cglobal predict_16x16_dc_core_mmx2, 1,2 -%ifdef ARCH_X86_64 +INIT_MMX mmx2 +cglobal predict_16x16_dc_core, 1,2 +%if ARCH_X86_64 movd m6, r1d - PRED16x16_DC m6, 5 + PRED16x16_DC_MMX m6, 5 %else - PRED16x16_DC r1m, 5 + PRED16x16_DC_MMX r1m, 5 %endif - REP_RET + RET -INIT_MMX -cglobal predict_16x16_dc_top_mmx2, 1,2 - PRED16x16_DC [pw_8], 4 - REP_RET +INIT_MMX mmx2 +cglobal predict_16x16_dc_top, 1,2 + PRED16x16_DC_MMX [pw_8], 4 + RET -INIT_MMX -%ifdef HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core_mmx2, 1,2 +INIT_MMX mmx2 +%if HIGH_BIT_DEPTH +cglobal predict_16x16_dc_left_core, 1,2 movd m0, r1m SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 - REP_RET + STORE16 m0, m0, m0, m0 + RET %else ; !HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core_mmx2, 1,1 +cglobal predict_16x16_dc_left_core, 1,1 movd m0, r1m pshufw m0, m0, 0 packuswb m0, m0 - 
STORE16x16 m0, m0 - REP_RET + STORE16 m0, m0 + RET %endif -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core( pixel *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC_SSE2 2 -%ifdef HIGH_BIT_DEPTH - mova m0, [r0 - FDEC_STRIDEB+ 0] - paddw m0, [r0 - FDEC_STRIDEB+16] - HADDW m0, m2 - paddw m0, %1 - psrlw m0, %2 - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 +%macro PRED16x16_DC 2 +%if HIGH_BIT_DEPTH + mova xm0, [r0 - FDEC_STRIDEB+ 0] + paddw xm0, [r0 - FDEC_STRIDEB+16] + HADDW xm0, xm2 + paddw xm0, %1 + psrlw xm0, %2 + SPLATW m0, xm0 +%if mmsize == 32 + STORE16 m0 +%else + STORE16 m0, m0 +%endif %else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] - movhlps m1, m0 + MOVHL m1, m0 paddw m0, m1 paddusw m0, %1 psrlw m0, %2 ; dc SPLATW m0, m0 packuswb m0, m0 ; dc in bytes - STORE16x16_SSE2 m0 + STORE16 m0 %endif %endmacro -INIT_XMM -cglobal predict_16x16_dc_core_sse2, 2,2,4 - movd m3, r1m - PRED16x16_DC_SSE2 m3, 5 - REP_RET +%macro PREDICT_16x16_DC_CORE 0 +cglobal predict_16x16_dc_core, 2,2,4 + movd xm3, r1m + PRED16x16_DC xm3, 5 + RET -cglobal predict_16x16_dc_top_sse2, 1,2 - PRED16x16_DC_SSE2 [pw_8], 4 - REP_RET +cglobal predict_16x16_dc_top, 1,2 + PRED16x16_DC [pw_8], 4 + RET -INIT_XMM -%ifdef HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core_sse2, 1,2 - movd m0, r1m - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 - REP_RET -%else ; !HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core_sse2, 1,1 - movd m0, r1m - SPLATW m0, m0 +cglobal predict_16x16_dc_left_core, 1,2 + movd xm0, r1m + SPLATW m0, xm0 +%if HIGH_BIT_DEPTH && mmsize == 16 + STORE16 m0, m0 +%else +%if HIGH_BIT_DEPTH == 0 packuswb m0, m0 - STORE16x16_SSE2 m0 +%endif + STORE16 m0 +%endif RET +%endmacro + +INIT_XMM sse2 +PREDICT_16x16_DC_CORE +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +PREDICT_16x16_DC_CORE +%else +INIT_XMM avx2 +PREDICT_16x16_DC_CORE %endif
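
Note on the PRED8x8_LOWPASS macro touched in this patch: its header comment documents the output as (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2, the usual H.264 3-tap smoothing filter applied to the edge samples before the diagonal/VR-style predictions (the SIMD code realises it with pavg-based arithmetic to stay within 16 bits). A minimal scalar sketch of that formula, for orientation only -- the pixel typedef, function name, and end-clamping below are placeholders, not x264's actual reference code:

    #include <stdint.h>

    typedef uint16_t pixel;  /* wide enough for 8-bit and 10-bit content in this sketch */

    /* dst[i] = (src[i-1] + 2*src[i] + src[i+1] + 2) >> 2, with the ends clamped
     * here for simplicity; x264's edge buffer handles the boundaries explicitly. */
    static void lowpass_edge( pixel *dst, const pixel *src, int n )
    {
        for( int i = 0; i < n; i++ )
        {
            int left  = src[i > 0     ? i - 1 : 0];
            int right = src[i < n - 1 ? i + 1 : n - 1];
            dst[i] = (pixel)( (left + 2 * src[i] + right + 2) >> 2 );
        }
    }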
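The predict_8x%1c_p_core and predict_16x16_p_core rewrites above (including the new AVX2 paths) all evaluate the same H.264 plane predictor: each output pixel is (i00 + b*x + c*y) >> 5, saturated to the valid pixel range, with i00, b and c precomputed by the caller -- the asm comments ("m0 = {i+0*b, i+1*b, ...}", psraw 5, packuswb) describe exactly this. A rough scalar model for the 8-bit case, assuming FDEC_STRIDE = 32; the clip helper and function name are illustrative, not x264 API:

    #include <stdint.h>

    #define FDEC_STRIDE 32  /* assumed decode-buffer stride (in pixels) for this sketch */

    static inline uint8_t clip_u8( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;
    }

    /* Scalar model of predict_16x16_p_core( uint8_t *src, int i00, int b, int c ):
     * pred(x,y) = (i00 + b*x + c*y) >> 5, clipped to 8 bits.  The SIMD versions keep
     * the per-column terms (i00 + b*x) in registers and add c once per row. */
    static void predict_16x16_p_scalar( uint8_t *src, int i00, int b, int c )
    {
        for( int y = 0; y < 16; y++ )
        {
            int v = i00 + c * y;
            for( int x = 0; x < 16; x++ )
                src[y * FDEC_STRIDE + x] = clip_u8( (v + b * x) >> 5 );
        }
    }

The chroma 8x8 and 8x16 variants differ only in the row count and in the constants (pw_m3 vs. the new pw_m7) used to centre the c term, which is why the patch can fold them into one PREDICT_CHROMA_P macro.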