X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpredict-a.asm;h=0bfc2db207a02facf60db21a48b82c831aec4776;hb=b597966bfa8a481489e5af93eb25988456c51a5d;hp=0f64ca214d359eb1036b3950fdcf3741d5ea7406;hpb=cc0c3d4d1e639512e2b9003a68597fdb6ce00d4f;p=x264 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 0f64ca21..0bfc2db2 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1,9 +1,10 @@ ;***************************************************************************** -;* predict-a.asm: h264 encoder library +;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project +;* Copyright (C) 2005-2011 x264 project ;* ;* Authors: Loren Merritt +;* Holger Lubitz ;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify @@ -19,137 +20,561 @@ ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA + +pw_76543210: +pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 +pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 8 dw -3 +pb_00s_ff: times 8 db 0 +pb_0s_ff: times 7 db 0 + db 0xff +shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +SECTION .text + +cextern pb_0 +cextern pb_1 +cextern pb_3 +cextern pw_1 +cextern pw_2 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_ff00 +cextern pb_reverse +cextern pw_pixel_max + %macro STORE8x8 2 - movq [r0 + 0*FDEC_STRIDE], %1 - movq [r0 + 1*FDEC_STRIDE], %1 - movq [r0 + 2*FDEC_STRIDE], %1 - movq [r0 + 3*FDEC_STRIDE], %1 - movq [r0 + 4*FDEC_STRIDE], %2 - movq [r0 + 5*FDEC_STRIDE], %2 - movq [r0 + 6*FDEC_STRIDE], %2 - movq [r0 + 7*FDEC_STRIDE], %2 + add r0, 4*FDEC_STRIDEB + mova [r0 + -4*FDEC_STRIDEB], %1 + mova [r0 + -3*FDEC_STRIDEB], %1 + mova [r0 + -2*FDEC_STRIDEB], %1 + mova [r0 + -1*FDEC_STRIDEB], %1 + mova [r0 + 0*FDEC_STRIDEB], %2 + mova [r0 + 1*FDEC_STRIDEB], %2 + mova [r0 + 2*FDEC_STRIDEB], %2 + mova [r0 + 3*FDEC_STRIDEB], %2 %endmacro -%macro STORE16x16 2 - mov r1d, 4 +%macro STORE16x16 2-4 +%ifidn %0, 4 + mov r1d, 8 .loop: - movq [r0 + 0*FDEC_STRIDE], %1 - movq [r0 + 1*FDEC_STRIDE], %1 - movq [r0 + 2*FDEC_STRIDE], %1 - movq [r0 + 3*FDEC_STRIDE], %1 - movq [r0 + 0*FDEC_STRIDE + 8], %2 - movq [r0 + 1*FDEC_STRIDE + 8], %2 - movq [r0 + 2*FDEC_STRIDE + 8], %2 - movq [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE + mova [r0 + 0*FDEC_STRIDEB + 0], %1 + mova [r0 + 1*FDEC_STRIDEB + 0], %1 + mova [r0 + 0*FDEC_STRIDEB + 8], %2 + mova [r0 + 1*FDEC_STRIDEB + 8], %2 + mova [r0 + 0*FDEC_STRIDEB +16], %3 + mova [r0 + 1*FDEC_STRIDEB +16], %3 + mova [r0 + 0*FDEC_STRIDEB +24], %4 + mova [r0 + 1*FDEC_STRIDEB +24], %4 + add r0, 2*FDEC_STRIDEB dec r1d jg .loop -%endmacro - -%macro STORE16x16_SSE2 1 +%else mov r1d, 4 .loop: - movdqa [r0 + 0*FDEC_STRIDE], %1 - movdqa [r0 + 1*FDEC_STRIDE], %1 - movdqa [r0 + 2*FDEC_STRIDE], %1 - movdqa [r0 + 3*FDEC_STRIDE], %1 + mova [r0 + 0*FDEC_STRIDE], %1 + mova [r0 + 1*FDEC_STRIDE], %1 + mova [r0 + 2*FDEC_STRIDE], %1 + mova [r0 + 3*FDEC_STRIDE], %1 + mova [r0 + 
0*FDEC_STRIDE + 8], %2 + mova [r0 + 1*FDEC_STRIDE + 8], %2 + mova [r0 + 2*FDEC_STRIDE + 8], %2 + mova [r0 + 3*FDEC_STRIDE + 8], %2 add r0, 4*FDEC_STRIDE dec r1d jg .loop +%endif %endmacro -SECTION_RODATA - -ALIGN 16 -pb_1: times 16 db 1 -pb_3: times 16 db 3 -pw_2: times 4 dw 2 -pw_4: times 4 dw 4 -pw_8: times 8 dw 8 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pb_00s_ff: times 8 db 0 -pb_0s_ff: times 7 db 0 - db 0xff - -SECTION .text +%macro STORE16x16_SSE2 1-2 +%ifidn %0,2 + mov r1d, 4 +.loop + mova [r0+0*FDEC_STRIDEB+ 0], %1 + mova [r0+0*FDEC_STRIDEB+16], %2 + mova [r0+1*FDEC_STRIDEB+ 0], %1 + mova [r0+1*FDEC_STRIDEB+16], %2 + mova [r0+2*FDEC_STRIDEB+ 0], %1 + mova [r0+2*FDEC_STRIDEB+16], %2 + mova [r0+3*FDEC_STRIDEB+ 0], %1 + mova [r0+3*FDEC_STRIDEB+16], %2 + add r0, 4*FDEC_STRIDEB + dec r1d + jg .loop +%else + add r0, 4*FDEC_STRIDEB + mova [r0 + -4*FDEC_STRIDEB], %1 + mova [r0 + -3*FDEC_STRIDEB], %1 + mova [r0 + -2*FDEC_STRIDEB], %1 + mova [r0 + -1*FDEC_STRIDEB], %1 + mova [r0 + 0*FDEC_STRIDEB], %1 + mova [r0 + 1*FDEC_STRIDEB], %1 + mova [r0 + 2*FDEC_STRIDEB], %1 + mova [r0 + 3*FDEC_STRIDEB], %1 + add r0, 8*FDEC_STRIDEB + mova [r0 + -4*FDEC_STRIDEB], %1 + mova [r0 + -3*FDEC_STRIDEB], %1 + mova [r0 + -2*FDEC_STRIDEB], %1 + mova [r0 + -1*FDEC_STRIDEB], %1 + mova [r0 + 0*FDEC_STRIDEB], %1 + mova [r0 + 1*FDEC_STRIDEB], %1 + mova [r0 + 2*FDEC_STRIDEB], %1 + mova [r0 + 3*FDEC_STRIDEB], %1 +%endif +%endmacro ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -%macro PRED8x8_LOWPASS0 6 - mov%6 %5, %2 - pavgb %2, %3 - pxor %3, %5 - mov%6 %1, %4 - pand %3, [pb_1 GLOBAL] - psubusb %2, %3 - pavgb %1, %2 +%macro PRED8x8_LOWPASS 5-6 +%ifidn %1, w + paddw %3, %4 + psrlw %3, 1 + pavgw %2, %5, %3 +%else + mova %6, %3 + pavgb %3, %4 + pxor %4, %6 + pand %4, [pb_1] + psubusb %3, %4 + pavgb %2, %5, %3 +%endif %endmacro -%macro PRED8x8_LOWPASS 5 - PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q + +%macro LOAD_PLANE_ARGS 0 +%ifdef ARCH_X86_64 + movd mm0, r1d + movd mm2, r2d + movd mm4, r3d + pshufw mm0, mm0, 0 + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else + pshufw mm0, r1m, 0 + pshufw mm2, r2m, 0 + pshufw mm4, r3m, 0 +%endif %endmacro -%macro PRED8x8_LOWPASS_XMM 5 - PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa + +;----------------------------------------------------------------------------- +; void predict_4x4_ddl( pixel *src ) +;----------------------------------------------------------------------------- +%macro PREDICT_4x4_DDL 3 +cglobal predict_4x4_ddl, 1,1 + movu m1, [r0-FDEC_STRIDEB] + psll%1 m2, m1, %2 + mova m3, m1 + mova m4, m1 + pxor m1, m2 + psrl%1 m1, %2 + pxor m3, m1 + PRED8x8_LOWPASS %3, m0, m2, m3, m4, m5 + +%assign Y 0 +%rep 4 + psrl%1 m0, %2 + movh [r0+Y*FDEC_STRIDEB], m0 +%assign Y (Y+1) +%endrep + + RET %endmacro +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_4x4_DDL dq, 2, w +INIT_XMM avx +PREDICT_4x4_DDL dq, 2, w +INIT_MMX mmx2 +cglobal predict_4x4_ddl, 1,2 + mova m1, [r0-2*FDEC_STRIDE+4] + mova m2, [r0-2*FDEC_STRIDE+0] + mova m3, [r0-2*FDEC_STRIDE+2] + PRED8x8_LOWPASS w, m0, m1, m2, m3 + mova [r0+0*FDEC_STRIDE], m0 + + mova m5, [r0-2*FDEC_STRIDE+6] + mova m6, [r0-2*FDEC_STRIDE+8] + pshufw m7, m6, q3321 + PRED8x8_LOWPASS w, m4, m7, m5, m6 + mova [r0+6*FDEC_STRIDE], m4 + + psllq m0, 16 + PALIGNR m4, m0, 6, m1 + mova [r0+4*FDEC_STRIDE], m4 + + psllq m0, 16 + PALIGNR m4, m0, 6, m0 + mova [r0+2*FDEC_STRIDE], m4 + RET +%else +INIT_MMX mmx2 +PREDICT_4x4_DDL q, 8, b +%endif ;----------------------------------------------------------------------------- -; void 
predict_4x4_ddl_mmxext( uint8_t *src ) +; void predict_4x4_ddr( pixel *src ) ;----------------------------------------------------------------------------- -cglobal predict_4x4_ddl_mmxext, 1,1 - sub r0, FDEC_STRIDE - movq mm3, [r0] - movq mm1, [r0-1] - movq mm2, mm3 - movq mm4, [pb_0s_ff GLOBAL] - psrlq mm2, 8 - pand mm4, mm3 - por mm2, mm4 +%macro PREDICT_4x4 6 +cglobal predict_4x4_ddr, 1,1 + movu m1, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] + movq m2, [r0+0*FDEC_STRIDEB-8] +%ifdef HIGH_BIT_DEPTH + movh m4, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] + punpckl%1 m2, m4 + movh m3, [r0-1*FDEC_STRIDEB] + punpckh%2 m1, m2 + PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1 + mova m1, m3 + movhps m4, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4 + mova m2, m3 + movhps m4, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m3, m4, 7*SIZEOF_PIXEL, m4 +%else + punpckh%1 m2, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL] + movh m3, [r0-1*FDEC_STRIDEB] + punpckh%2 m1, m2 + PALIGNR m3, m1, 5*SIZEOF_PIXEL, m1 + mova m1, m3 + PALIGNR m3, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4 + mova m2, m3 + PALIGNR m3, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4 +%endif + PRED8x8_LOWPASS %4, m0, m3, m1, m2, m4 +%assign Y 3 + movh [r0+Y*FDEC_STRIDEB], m0 +%rep 3 +%assign Y (Y-1) + psrl%3 m0, %6 + movh [r0+Y*FDEC_STRIDEB], m0 +%endrep + RET - PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 +cglobal predict_4x4_vr, 1,1,6 + movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 + mova m5, m0 +%ifdef HIGH_BIT_DEPTH + movhps m1, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt + pavg%4 m5, m0 + movhps m1, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m0, m1, 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0 + mova m1, m0 + movhps m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m0, m2, 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1 + mova m2, m0 + movhps m3, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] + PALIGNR m0, m3, 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2 +%else + PALIGNR m0, [r0-1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ......t3t2t1t0lt + pavg%4 m5, m0 + PALIGNR m0, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1 ; ....t3t2t1t0ltl0 + mova m1, m0 + PALIGNR m0, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2 ; ..t3t2t1t0ltl0l1 + mova m2, m0 + PALIGNR m0, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3 ; t3t2t1t0ltl0l1l2 +%endif + PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4 + psll%3 m1, m3, %6*6 + psrl%3 m3, %6*2 + movh [r0+0*FDEC_STRIDEB], m5 + movh [r0+1*FDEC_STRIDEB], m3 + PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2 + psll%3 m1, %6 + movh [r0+2*FDEC_STRIDEB], m5 + PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1 + movh [r0+3*FDEC_STRIDEB], m3 + RET -%assign Y 1 -%rep 4 - psrlq mm0, 8 - movd [r0+Y*FDEC_STRIDE], mm0 -%assign Y (Y+1) +cglobal predict_4x4_hd, 1,1,6 + movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt .. +%ifdef HIGH_BIT_DEPTH + movh m1, [r0-1*FDEC_STRIDEB] + punpckl%5 m0, m1 ; t3 t2 t1 t0 lt .. .. .. + psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. .. + movh m1, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l3 + movh m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] + punpckl%1 m1, m2 ; l2 l3 + movh m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; l1 + movh m3, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] + punpckl%1 m2, m3 ; l0 l1 +%else + punpckl%5 m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. .. + psll%3 m0, %6 ; t2 t1 t0 lt .. .. .. .. 
+ movu m1, [r0+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l3 + punpckh%1 m1, [r0+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l2 l3 + movu m2, [r0+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l1 + punpckh%1 m2, [r0+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] ; l0 l1 +%endif + punpckh%2 m1, m2 ; l0 l1 l2 l3 + punpckh%5 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 + psrl%3 m2, m1, %6 ; .. t2 t1 t0 lt l0 l1 l2 + psrl%3 m0, m1, %6*2 ; .. .. t2 t1 t0 lt l0 l1 + pavg%4 m5, m1, m2 + PRED8x8_LOWPASS %4, m3, m1, m0, m2, m4 + punpckl%1 m5, m3 + psrl%3 m3, %6*4 + PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4 +%assign Y 3 + movh [r0+Y*FDEC_STRIDEB], m5 +%rep 2 +%assign Y (Y-1) + psrl%3 m5, %6*2 + movh [r0+Y*FDEC_STRIDEB], m5 %endrep + movh [r0+0*FDEC_STRIDEB], m3 + RET +%endmacro +%ifdef HIGH_BIT_DEPTH +INIT_MMX mmx2 +cglobal predict_4x4_ddr, 1,1 + movq m3, [r0+3*FDEC_STRIDEB-8] + psrlq m3, 48 + PALIGNR m3, [r0+2*FDEC_STRIDEB-8], 6, m6 + PALIGNR m3, [r0+1*FDEC_STRIDEB-8], 6, m7 + movq m6, [r0+0*FDEC_STRIDEB-8] + PALIGNR m3, m6, 6, m5 + + movq m4, [r0-1*FDEC_STRIDEB-8] + movq m2, m3 + movq m1, m3 + PALIGNR m2, m4, 6, m5 + movq m1, m2 + psllq m1, 16 + PRED8x8_LOWPASS w, m0, m3, m1, m2 + pshufw m0, m0, q0123 + movq [r0+3*FDEC_STRIDEB], m0 + + movq m2, [r0-1*FDEC_STRIDEB-0] + movq m5, m2 + PALIGNR m5, m4, 6, m4 + movq m3, m5 + PALIGNR m5, m6, 6, m6 + PRED8x8_LOWPASS w, m1, m5, m2, m3 + movq [r0+0*FDEC_STRIDEB], m1 + + psllq m0, 16 + PALIGNR m1, m0, 6, m2 + movq [r0+1*FDEC_STRIDEB], m1 + psllq m0, 16 + PALIGNR m1, m0, 6, m0 + movq [r0+2*FDEC_STRIDEB], m1 + psrlq m1, 16 + movd [r0+3*FDEC_STRIDEB+4], m1 RET +cglobal predict_4x4_hd, 1,1 + mova m0, [r0+1*FDEC_STRIDEB-8] + punpckhwd m0, [r0+0*FDEC_STRIDEB-8] + mova m1, [r0+3*FDEC_STRIDEB-8] + punpckhwd m1, [r0+2*FDEC_STRIDEB-8] + punpckhdq m1, m0 + mova m0, m1 + mova m4, m1 + + movu m3, [r0-1*FDEC_STRIDEB-2] + mova m7, m3 + punpckhdq m4, [r0-1*FDEC_STRIDEB-6] + PALIGNR m3, m1, 2, m2 + PRED8x8_LOWPASS w, m2, m4, m1, m3, m6 + + pavgw m0, m3 + mova m5, m0 + punpcklwd m5, m2 + mova m4, m0 + punpckhwd m4, m2 + mova [r0+3*FDEC_STRIDEB], m5 + mova [r0+1*FDEC_STRIDEB], m4 + + mova m4, m7 + mova m6, [r0-1*FDEC_STRIDEB+0] + PALIGNR m7, [r0+0*FDEC_STRIDEB-8], 6, m5 + PRED8x8_LOWPASS w, m3, m7, m6, m4, m1 + + PALIGNR m3, m0, 6, m5 + mova [r0+0*FDEC_STRIDEB], m3 + psrlq m0, 16 + psrlq m2, 16 + punpcklwd m0, m2 + mova [r0+2*FDEC_STRIDEB], m0 + RET + +INIT_XMM sse2 +PREDICT_4x4 wd, dq, dq, w, qdq, 2 +INIT_XMM ssse3 +PREDICT_4x4 wd, dq, dq, w, qdq, 2 +INIT_XMM avx +PREDICT_4x4 wd, dq, dq, w, qdq, 2 +%else +INIT_MMX mmx2 +PREDICT_4x4 bw, wd, q , b, dq , 8 +INIT_MMX ssse3 +PREDICT_4x4 bw, wd, q , b, dq , 8 +%endif + ;----------------------------------------------------------------------------- -; void predict_4x4_vl_mmxext( uint8_t *src ) +; void predict_4x4_hu( pixel *src ) ;----------------------------------------------------------------------------- -cglobal predict_4x4_vl_mmxext, 1,1 - movq mm1, [r0-FDEC_STRIDE] - movq mm3, mm1 - movq mm2, mm1 - psrlq mm3, 8 - psrlq mm2, 16 - movq mm4, mm3 - pavgb mm4, mm1 +%ifdef HIGH_BIT_DEPTH +INIT_MMX +cglobal predict_4x4_hu_mmx2, 1,1 + movq m0, [r0+0*FDEC_STRIDEB-4*2] + punpckhwd m0, [r0+1*FDEC_STRIDEB-4*2] + movq m1, [r0+2*FDEC_STRIDEB-4*2] + punpckhwd m1, [r0+3*FDEC_STRIDEB-4*2] + punpckhdq m0, m1 + pshufw m1, m1, q3333 + movq [r0+3*FDEC_STRIDEB], m1 + movd [r0+2*FDEC_STRIDEB+4], m1 + mova m2, m0 + psrlq m2, 16 + pavgw m2, m0 + + pshufw m1, m0, q3321 + pshufw m5, m0, q3332 + PRED8x8_LOWPASS w, m3, m0, m5, m1, m7 + movq m6, m2 + punpcklwd m6, m3 + mova [r0+0*FDEC_STRIDEB], m6 + psrlq m2, 16 + psrlq m3, 
16 + punpcklwd m2, m3 + mova [r0+1*FDEC_STRIDEB], m2 + psrlq m2, 32 + movd [r0+2*FDEC_STRIDEB+0], m2 + RET - PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5 +%else ; !HIGH_BIT_DEPTH +INIT_MMX +cglobal predict_4x4_hu_mmx2, 1,1 + movq mm0, [r0+0*FDEC_STRIDE-8] + punpckhbw mm0, [r0+1*FDEC_STRIDE-8] + movq mm1, [r0+2*FDEC_STRIDE-8] + punpckhbw mm1, [r0+3*FDEC_STRIDE-8] + punpckhwd mm0, mm1 + movq mm1, mm0 + punpckhbw mm1, mm1 + pshufw mm1, mm1, q3333 + punpckhdq mm0, mm1 + movq mm2, mm0 + movq mm3, mm0 + movq mm7, mm0 + psrlq mm2, 16 + psrlq mm3, 8 + pavgb mm7, mm3 + PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5 + punpcklbw mm7, mm4 +%assign Y 0 + movd [r0+Y*FDEC_STRIDE], mm7 +%rep 2 +%assign Y (Y+1) + psrlq mm7, 16 + movd [r0+Y*FDEC_STRIDE], mm7 +%endrep + movd [r0+3*FDEC_STRIDE], mm1 + RET +%endif ; HIGH_BIT_DEPTH - movd [r0+0*FDEC_STRIDE], mm4 - movd [r0+1*FDEC_STRIDE], mm0 - psrlq mm4, 8 - psrlq mm0, 8 - movd [r0+2*FDEC_STRIDE], mm4 - movd [r0+3*FDEC_STRIDE], mm0 +;----------------------------------------------------------------------------- +; void predict_4x4_vl( pixel *src ) +;----------------------------------------------------------------------------- +%macro PREDICT_4x4_V1 3 +cglobal predict_4x4_vl, 1,1,6 + movu m1, [r0-FDEC_STRIDEB] + psrl%1 m3, m1, %2 + psrl%1 m2, m1, %2*2 + pavg%3 m4, m3, m1 + PRED8x8_LOWPASS %3, m0, m1, m2, m3, m5 + + movh [r0+0*FDEC_STRIDEB], m4 + movh [r0+1*FDEC_STRIDEB], m0 + psrl%1 m4, %2 + psrl%1 m0, %2 + movh [r0+2*FDEC_STRIDEB], m4 + movh [r0+3*FDEC_STRIDEB], m0 + RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_4x4_V1 dq, 2, w +%ifdef ARCH_X86_64 +INIT_XMM avx +PREDICT_4x4_V1 dq, 2, w +%endif +INIT_MMX mmx2 +cglobal predict_4x4_vl, 1,4 + mova m1, [r0-FDEC_STRIDEB+0] + mova m2, [r0-FDEC_STRIDEB+8] + mova m3, m2 + PALIGNR m2, m1, 4, m6 + PALIGNR m3, m1, 2, m5 + mova m4, m3 + pavgw m4, m1 + mova [r0+0*FDEC_STRIDEB], m4 + psrlq m4, 16 + mova [r0+2*FDEC_STRIDEB], m4 + PRED8x8_LOWPASS w, m0, m1, m2, m3, m6 + mova [r0+1*FDEC_STRIDEB], m0 + psrlq m0, 16 + mova [r0+3*FDEC_STRIDEB], m0 + + movzx r1d, word [r0-FDEC_STRIDEB+ 8] + movzx r2d, word [r0-FDEC_STRIDEB+10] + movzx r3d, word [r0-FDEC_STRIDEB+12] + lea r1d, [r1+r2+1] + add r3d, r2d + lea r3d, [r3+r1+1] + shr r1d, 1 + shr r3d, 2 + mov [r0+2*FDEC_STRIDEB+6], r1w + mov [r0+3*FDEC_STRIDEB+6], r3w RET +%else +INIT_MMX mmx2 +PREDICT_4x4_V1 q, 8, b +%endif ;----------------------------------------------------------------------------- -; void predict_4x4_dc( uint8_t *src ) +; void predict_4x4_dc( pixel *src ) ;----------------------------------------------------------------------------- +%ifdef HIGH_BIT_DEPTH +INIT_MMX +cglobal predict_4x4_dc_mmx2, 1,1 + mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] + paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] + paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] + paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] + psrlq m2, 48 + mova m0, [r0-FDEC_STRIDEB] + HADDW m0, m1 + paddw m0, [pw_4] + paddw m0, m2 + psrlw m0, 3 + SPLATW m0, m0 + mova [r0+0*FDEC_STRIDEB], m0 + mova [r0+1*FDEC_STRIDEB], m0 + mova [r0+2*FDEC_STRIDEB], m0 + mova [r0+3*FDEC_STRIDEB], m0 + RET -cglobal predict_4x4_dc_mmxext, 1,4 +%else +INIT_MMX +cglobal predict_4x4_dc_mmx2, 1,4 pxor mm7, mm7 movd mm0, [r0-FDEC_STRIDE] psadbw mm0, mm7 @@ -169,151 +594,635 @@ cglobal predict_4x4_dc_mmxext, 1,4 mov [r0+FDEC_STRIDE*2], r1d mov [r0+FDEC_STRIDE*3], r1d RET +%endif ; HIGH_BIT_DEPTH +%macro PREDICT_FILTER 5 ;----------------------------------------------------------------------------- -; void predict_8x8_v_mmxext( uint8_t *src, 
uint8_t *edge ) +;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_v_mmxext, 2,2 - movq mm0, [r1+16] - STORE8x8 mm0, mm0 - RET +cglobal predict_8x8_filter, 4,6,6 + add r0, 0x58*SIZEOF_PIXEL +%define src r0-0x58*SIZEOF_PIXEL +%ifndef ARCH_X86_64 + mov r4, r1 +%define t1 r4 +%define t4 r1 +%else +%define t1 r1 +%define t4 r4 +%endif + test r3b, 1 + je .check_top + mov t4d, r2d + and t4d, 8 + neg t4 + mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)] + mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%2%3 m1, m0 + mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] + mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL] + punpckh%2%3 m3, m2 + punpckh%3%4 m3, m1 + mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL] + mova m1, [src-1*FDEC_STRIDEB] + mova m4, m3 + mova m2, m3 + PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0 + PALIGNR m1, m2, 1*SIZEOF_PIXEL, m2 + PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5 + mova [t1+8*SIZEOF_PIXEL], m2 + movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL] + movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL] + lea t4d, [t4*3+2] + add t4d, r5d + shr t4d, 2 + mov [t1+7*SIZEOF_PIXEL], t4%1 + test r3b, 2 + je .done +.check_top: +%if SIZEOF_PIXEL==1 && cpuflag(ssse3) +INIT_XMM cpuname + movu m3, [src-1*FDEC_STRIDEB] + movhps m0, [src-1*FDEC_STRIDEB-8] + test r2b, 8 + je .fix_lt_2 +.do_top: + and r2d, 4 +%ifdef PIC + lea r3, [shuf_fixtr] + pshufb m3, [r3+r2*4] +%else + pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? 
shuf_nop : shuf_fixtr +%endif + psrldq m1, m3, 15 + PALIGNR m2, m3, m0, 15, m0 + PALIGNR m1, m3, 1, m5 + PRED8x8_LOWPASS %1, m0, m2, m1, m3, m5 + mova [t1+16*SIZEOF_PIXEL], m0 + psrldq m0, 15 + movd [t1+32*SIZEOF_PIXEL], m0 +.done: + REP_RET +.fix_lt_2: + pslldq m0, m3, 15 + jmp .do_top + +%else + mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL] + mova m3, [src-1*FDEC_STRIDEB] + mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL] + test r2b, 8 + je .fix_lt_2 + test r2b, 4 + je .fix_tr_1 +.do_top: + PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0 + PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5 + PRED8x8_LOWPASS %1, m4, m2, m0, m3, m5 + mova [t1+16*SIZEOF_PIXEL], m4 + test r3b, 4 + je .done + mova m2, m1 + mova m4, m1 + psrl%4 m5, m1, 7*%5 + PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3 + PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4 + PRED8x8_LOWPASS %1, m0, m2, m5, m1, m4 + mova [t1+24*SIZEOF_PIXEL], m0 + psrl%4 m0, 7*%5 + movd [t1+32*SIZEOF_PIXEL], m0 +.done: + REP_RET +.fix_lt_2: + psll%4 m0, m3, 7*%5 + test r2b, 4 + jne .do_top +.fix_tr_1: + punpckh%1%2 m1, m3, m3 + pshuf%2 m1, m1, q3333 + jmp .do_top +%endif +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_FILTER w, d, q, dq, 2 +INIT_XMM ssse3 +PREDICT_FILTER w, d, q, dq, 2 +INIT_XMM avx +PREDICT_FILTER w, d, q, dq, 2 +%else +INIT_MMX mmx2 +PREDICT_FILTER b, w, d, q , 8 +INIT_MMX ssse3 +PREDICT_FILTER b, w, d, q , 8 +%endif ;----------------------------------------------------------------------------- -; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] ) +; void predict_8x8_v( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- +%macro PREDICT_8x8_V 0 +cglobal predict_8x8_v, 2,2 + mova m0, [r1+16*SIZEOF_PIXEL] + STORE8x8 m0, m0 + RET +%endmacro -INIT_MMX -cglobal predict_8x8_h_mmxext, 2,2 - movu m3, [r1+7] - mova m7, m3 - punpckhbw m3, m3 - punpcklbw m7, m7 - pshufw m0, m3, 0xff - pshufw m1, m3, 0xaa - pshufw m2, m3, 0x55 - pshufw m3, m3, 0x00 - pshufw m4, m7, 0xff - pshufw m5, m7, 0xaa - pshufw m6, m7, 0x55 - pshufw m7, m7, 0x00 +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_V +%else +INIT_MMX mmx2 +PREDICT_8x8_V +%endif + +;----------------------------------------------------------------------------- +; void predict_8x8_h( pixel *src, pixel edge[36] ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_H 2 +cglobal predict_8x8_h, 2,2 + movu m1, [r1+7*SIZEOF_PIXEL] + add r0, 4*FDEC_STRIDEB + punpckl%1 m2, m1, m1 + punpckh%1 m1, m1 %assign n 0 %rep 8 - mova [r0+n*FDEC_STRIDE], m %+ n +%assign i 1+n/4 + SPLAT%2 m0, m %+ i, (3-n)&3 + mova [r0+(n-4)*FDEC_STRIDEB], m0 %assign n n+1 %endrep RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_H wd, D +%else +INIT_MMX mmx2 +PREDICT_8x8_H bw, W +%endif ;----------------------------------------------------------------------------- -; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge ); +; void predict_8x8_dc( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- -cglobal predict_8x8_dc_mmxext, 2,2 +%ifdef HIGH_BIT_DEPTH +INIT_XMM +cglobal predict_8x8_dc_sse2, 2,2 + movu m0, [r1+14] + paddw m0, [r1+32] + HADDW m0, m1 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + STORE8x8 m0, m0 + REP_RET + +%else +INIT_MMX +cglobal predict_8x8_dc_mmx2, 2,2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1+7] psadbw mm1, [r1+16] - paddw mm0, [pw_8 GLOBAL] + paddw mm0, [pw_8] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 RET 
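+; (Both DC variants above compute dc = (sum(top 8) + sum(left 8) + 8) >> 4;
+; psadbw against a zero register sums 8 bytes horizontally in a single op,
+; while the high-depth path sums words with HADDW instead.)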
+%endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge ); +; void predict_8x8_dc_top ( pixel *src, pixel *edge ); +; void predict_8x8_dc_left( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- -%macro PRED8x8_DC 2 +%ifdef HIGH_BIT_DEPTH +%macro PREDICT_8x8_DC 3 +cglobal %1, 2,2 + %3 m0, [r1+%2] + HADDW m0, m1 + paddw m0, [pw_4] + psrlw m0, 3 + SPLATW m0, m0 + STORE8x8 m0, m0 + RET +%endmacro +INIT_XMM +PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova +PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu + +%else +%macro PREDICT_8x8_DC 2 cglobal %1, 2,2 pxor mm0, mm0 psadbw mm0, [r1+%2] - paddw mm0, [pw_4 GLOBAL] + paddw mm0, [pw_4] psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8x8 mm0, mm0 RET %endmacro +INIT_MMX +PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16 +PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7 +%endif ; HIGH_BIT_DEPTH -PRED8x8_DC predict_8x8_dc_top_mmxext, 16 -PRED8x8_DC predict_8x8_dc_left_mmxext, 7 - -%ifndef ARCH_X86_64 -; sse2 is faster even on amd, so there's no sense in spending exe size on these -; functions if we know sse2 is available. - +; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe +; size on the 8-bit mmx functions below if we know sse2 is available. +%macro PREDICT_8x8 3 ;----------------------------------------------------------------------------- -; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge ) +; void predict_8x8_ddl( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl_mmxext, 2,2 - movq mm5, [r1+16] - movq mm2, [r1+17] - movq mm3, [r1+23] - movq mm4, [r1+25] - movq mm1, mm5 - psllq mm1, 8 - PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7 - PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6 - -%assign Y 7 +cglobal predict_8x8_ddl, 2,2,8 + mova m5, [r1+16*SIZEOF_PIXEL] + movu m2, [r1+17*SIZEOF_PIXEL] + movu m3, [r1+23*SIZEOF_PIXEL] + movu m4, [r1+25*SIZEOF_PIXEL] + psll%2 m1, m5, %3 + add r0, FDEC_STRIDEB*4 + PRED8x8_LOWPASS %1, m0, m1, m2, m5, m7 +%assign %%bak avx_enabled +%assign avx_enabled 0 + PRED8x8_LOWPASS %1, m1, m3, m4, [r1+24*SIZEOF_PIXEL], m6 +%assign avx_enabled %%bak +%assign Y 3 %rep 6 - movq [r0+Y*FDEC_STRIDE], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 + mova [r0+Y*FDEC_STRIDEB], m1 + psll%2 m1, %3 + psrl%2 m2, m0, 7*%3 + psll%2 m0, %3 + por m1, m2 %assign Y (Y-1) %endrep - movq [r0+Y*FDEC_STRIDE], mm1 - psllq mm1, 8 - psrlq mm0, 56 - por mm1, mm0 + mova [r0+Y*FDEC_STRIDEB], m1 + psll%2 m1, %3 + psrl%2 m0, 7*%3 + por m1, m0 %assign Y (Y-1) - movq [r0+Y*FDEC_STRIDE], mm1 + mova [r0+Y*FDEC_STRIDEB], m1 RET ;----------------------------------------------------------------------------- -; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge ) +; void predict_8x8_ddr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddr_mmxext, 2,2 - movq mm1, [r1+7] - movq mm2, [r1+9] - movq mm3, [r1+15] - movq mm4, [r1+17] - PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7 - PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6 - -%assign Y 7 +%if avx_enabled == 0 +cglobal predict_8x8_ddr, 2,2,7 + movu m1, [r1+ 7*SIZEOF_PIXEL] + movu m2, [r1+ 9*SIZEOF_PIXEL] + movu m3, [r1+15*SIZEOF_PIXEL] + movu m4, [r1+17*SIZEOF_PIXEL] + add r0, FDEC_STRIDEB*4 + PRED8x8_LOWPASS %1, m0, m1, m2, [r1+ 
8*SIZEOF_PIXEL], m5 + PRED8x8_LOWPASS %1, m1, m3, m4, [r1+16*SIZEOF_PIXEL], m6 +%assign Y 3 %rep 6 - movq [r0+Y*FDEC_STRIDE], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 + mova [r0+Y*FDEC_STRIDEB], m0 + psrl%2 m0, %3 + psll%2 m2, m1, 7*%3 + psrl%2 m1, %3 + por m0, m2 %assign Y (Y-1) %endrep - movq [r0+Y*FDEC_STRIDE], mm0 - psrlq mm0, 8 - psllq mm1, 56 - por mm0, mm1 + mova [r0+Y*FDEC_STRIDEB], m0 + psrl%2 m0, %3 + psll%2 m1, 7*%3 + por m0, m1 %assign Y (Y-1) - movq [r0+Y*FDEC_STRIDE], mm0 + mova [r0+Y*FDEC_STRIDEB], m0 + RET +%endif +%endmacro ; PREDICT_8x8 + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8 w, dq, 2 +INIT_XMM avx +PREDICT_8x8 w, dq, 2 +%elifndef ARCH_X86_64 +INIT_MMX mmx2 +PREDICT_8x8 b, q , 8 +%endif + +;----------------------------------------------------------------------------- +; void predict_8x8_hu( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_HU 5 +cglobal predict_8x8_hu, 2,2,8 + movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 + add r0, 4*FDEC_STRIDEB + pshuf%3 m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 + psll%2 m1, 7*%5 ; l7 .. .. .. .. .. .. .. + mova m2, m0 + psll%3 m0, 8*SIZEOF_PIXEL + psrl%3 m2, 8*SIZEOF_PIXEL + por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 + mova m4, m2 + mova m5, m2 + psrl%2 m3, m2, 2*%5 + psrl%2 m2, %5 + por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckh%4 m1, m1 + por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2 + pavg%1 m4, m2 + PRED8x8_LOWPASS %1, m1, m3, m5, m2, m6 + punpckh%4 m5, m4, m1 ; p8 p7 p6 p5 + punpckl%4 m4, m1 ; p4 p3 p2 p1 + mova m6, m5 + mova m7, m5 + mova m0, m5 + PALIGNR m5, m4, 2*SIZEOF_PIXEL, m1 + pshuf%3 m1, m6, q3321 + PALIGNR m6, m4, 4*SIZEOF_PIXEL, m2 + pshuf%3 m2, m7, q3332 + PALIGNR m7, m4, 6*SIZEOF_PIXEL, m3 + pshuf%3 m3, m0, q3333 + mova [r0-4*FDEC_STRIDEB], m4 + mova [r0-3*FDEC_STRIDEB], m5 + mova [r0-2*FDEC_STRIDEB], m6 + mova [r0-1*FDEC_STRIDEB], m7 + mova [r0+0*FDEC_STRIDEB], m0 + mova [r0+1*FDEC_STRIDEB], m1 + mova [r0+2*FDEC_STRIDEB], m2 + mova [r0+3*FDEC_STRIDEB], m3 + RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_HU w, dq, d, wd, 2 +INIT_XMM ssse3 +PREDICT_8x8_HU w, dq, d, wd, 2 +INIT_XMM avx +PREDICT_8x8_HU w, dq, d, wd, 2 +%elifndef ARCH_X86_64 +INIT_MMX mmx2 +PREDICT_8x8_HU b, q , w, bw, 8 +%endif + +;----------------------------------------------------------------------------- +; void predict_8x8_vr( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_VR 3 +cglobal predict_8x8_vr, 2,3,7 + mova m2, [r1+16*SIZEOF_PIXEL] + movu m3, [r1+15*SIZEOF_PIXEL] + movu m1, [r1+14*SIZEOF_PIXEL] + pavg%1 m4, m3, m2 + add r0, FDEC_STRIDEB*4 + PRED8x8_LOWPASS %1, m0, m1, m2, m3, m5 + mova [r0-4*FDEC_STRIDEB], m4 + mova [r0-3*FDEC_STRIDEB], m0 + mova m5, m0 + mova m6, m4 + mova m1, [r1+8*SIZEOF_PIXEL] + mova m2, m1 + psll%2 m2, %3 + mova m3, m1 + psll%2 m3, 2*%3 + PRED8x8_LOWPASS %1, m0, m1, m3, m2, m4 + +%assign Y -2 +%rep 5 + %assign i (5 + ((Y+3)&1)) + PALIGNR m %+ i, m0, 7*SIZEOF_PIXEL, m2 + mova [r0+Y*FDEC_STRIDEB], m %+ i + psll%2 m0, %3 +%assign Y (Y+1) +%endrep + PALIGNR m5, m0, 7*SIZEOF_PIXEL, m0 + mova [r0+Y*FDEC_STRIDEB], m5 RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_VR w, dq, 2 +INIT_XMM ssse3 +PREDICT_8x8_VR w, dq, 2 +INIT_XMM avx +PREDICT_8x8_VR w, dq, 2 +%elifndef ARCH_X86_64 +INIT_MMX mmx2 +PREDICT_8x8_VR b, q , 8 +%endif + 
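+; The plane-prediction cores below (8x8c here, 16x16 further down) evaluate
+; the H.264 plane predictor one row of pixels at a time. Roughly, assuming
+; the C-side caller has already folded the rounding bias into i00 (a sketch
+; with illustrative names, not x264's exact helpers):
+;
+;     for( int y = 0; y < height; y++ )
+;         for( int x = 0; x < width; x++ )
+;             dst[y*stride+x] = clip_pixel( (i00 + b*x + c*y) >> 5 );
+;
+; The pw_3210/pw_76543210 multiplies build the b*x terms for a whole row,
+; c is added once per row, and psraw 5 plus packuswb (or CLIPW against
+; pw_pixel_max in the high-depth path) performs the shift and clip.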
+;----------------------------------------------------------------------------- +; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) +;----------------------------------------------------------------------------- +%ifndef ARCH_X86_64 +INIT_MMX +cglobal predict_8x8c_p_core_mmx2, 1,2 + LOAD_PLANE_ARGS + movq mm1, mm2 + pmullw mm2, [pw_3210] + psllw mm1, 2 + paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} + paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} + mov r1d, 8 +ALIGN 4 +.loop: + movq mm5, mm0 + movq mm6, mm1 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0], mm5 + + paddsw mm0, mm4 + paddsw mm1, mm4 + add r0, FDEC_STRIDE + dec r1d + jg .loop + REP_RET %endif ; !ARCH_X86_64 +INIT_XMM +%ifdef HIGH_BIT_DEPTH +cglobal predict_8x8c_p_core_sse2, 1,1,7 + movd m0, r1m + movd m2, r2m + movd m4, r3m + mova m3, [pw_pixel_max] + pxor m1, m1 + SPLATW m0, m0, 0 + SPLATW m2, m2, 0 + SPLATW m4, m4, 0 + pmullw m2, [pw_43210123] ; b + pmullw m5, m4, [pw_m3] ; c + paddw m5, [pw_16] + mov r1d, 8 +.loop: + paddsw m6, m2, m5 + paddsw m6, m0 + psraw m6, 5 + CLIPW m6, m1, m3 + mova [r0], m6 + paddw m5, m4 + add r0, FDEC_STRIDEB + dec r1d + jg .loop + REP_RET +%else ; !HIGH_BIT_DEPTH +cglobal predict_8x8c_p_core_sse2, 1,1 + movd m0, r1m + movd m2, r2m + movd m4, r3m + SPLATW m0, m0, 0 + SPLATW m2, m2, 0 + SPLATW m4, m4, 0 + pmullw m2, [pw_76543210] + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} + paddsw m3, m0, m4 + paddsw m4, m4 +call .loop + add r0, FDEC_STRIDE*4 +.loop: + paddsw m1, m3, m4 + paddsw m5, m0, m4 + psraw m3, 5 + psraw m0, 5 + packuswb m0, m3 + movq [r0+FDEC_STRIDE*0], m0 + movhps [r0+FDEC_STRIDE*1], m0 + paddsw m0, m5, m4 + paddsw m3, m1, m4 + psraw m5, 5 + psraw m1, 5 + packuswb m5, m1 + movq [r0+FDEC_STRIDE*2], m5 + movhps [r0+FDEC_STRIDE*3], m5 + RET +%endif ; HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- -; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge ) +; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddl_sse2, 2,2 +%ifndef ARCH_X86_64 +cglobal predict_16x16_p_core_mmx2, 1,2 + LOAD_PLANE_ARGS + movq mm5, mm2 + movq mm1, mm2 + pmullw mm5, [pw_3210] + psllw mm2, 3 + psllw mm1, 2 + movq mm3, mm2 + paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} + paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} + paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} + paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} + + mov r1d, 16 +ALIGN 4 +.loop: + movq mm5, mm0 + movq mm6, mm1 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0], mm5 + + movq mm5, mm2 + movq mm6, mm3 + psraw mm5, 5 + psraw mm6, 5 + packuswb mm5, mm6 + movq [r0+8], mm5 + + paddsw mm0, mm4 + paddsw mm1, mm4 + paddsw mm2, mm4 + paddsw mm3, mm4 + add r0, FDEC_STRIDE + dec r1d + jg .loop + REP_RET +%endif ; !ARCH_X86_64 + +%macro PREDICT_16x16_P 0 +cglobal predict_16x16_p_core, 1,2,8 + movd m0, r1m + movd m1, r2m + movd m2, r3m + SPLATW m0, m0, 0 + SPLATW m1, m1, 0 + SPLATW m2, m2, 0 + pmullw m3, m1, [pw_76543210] + psllw m1, 3 +%ifdef HIGH_BIT_DEPTH + pxor m6, m6 + mov r1d, 16 +.loop: + mova m4, m0 + mova m5, m0 + mova m7, m3 + paddsw m7, m6 + paddsw m4, m7 + paddsw m7, m1 + paddsw m5, m7 + psraw m4, 5 + psraw m5, 5 + CLIPW m4, [pb_0], [pw_pixel_max] + CLIPW m5, [pb_0], [pw_pixel_max] + mova [r0], m4 + mova [r0+16], m5 + add r0, FDEC_STRIDEB + paddw m6, m2 + dec 
r1d + jg .loop +%else ; !HIGH_BIT_DEPTH + paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} + paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} + paddsw m7, m2, m2 + mov r1d, 8 +ALIGN 4 +.loop: + psraw m3, m0, 5 + psraw m4, m1, 5 + paddsw m5, m0, m2 + paddsw m6, m1, m2 + psraw m5, 5 + psraw m6, 5 + packuswb m3, m4 + packuswb m5, m6 + mova [r0+FDEC_STRIDE*0], m3 + mova [r0+FDEC_STRIDE*1], m5 + paddsw m0, m7 + paddsw m1, m7 + add r0, FDEC_STRIDE*2 + dec r1d + jg .loop +%endif ; !HIGH_BIT_DEPTH + REP_RET +%endmacro ; PREDICT_16x16_P + +INIT_XMM sse2 +PREDICT_16x16_P +%ifndef HIGH_BIT_DEPTH +INIT_XMM avx +PREDICT_16x16_P +%endif + +%ifndef HIGH_BIT_DEPTH +%macro PREDICT_8x8 0 +;----------------------------------------------------------------------------- +; void predict_8x8_ddl( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +cglobal predict_8x8_ddl, 2,2 movdqa xmm3, [r1+16] movdqu xmm2, [r1+17] - movdqa xmm1, xmm3 - pslldq xmm1, 1 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 + pslldq xmm1, xmm3, 1 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4 -%assign Y 0 +%assign Y -4 %rep 8 psrldq xmm0, 1 movq [r0+Y*FDEC_STRIDE], xmm0 @@ -322,18 +1231,17 @@ cglobal predict_8x8_ddl_sse2, 2,2 RET ;----------------------------------------------------------------------------- -; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge ) +; void predict_8x8_ddr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_ddr_sse2, 2,2 +cglobal predict_8x8_ddr, 2,2 movdqu xmm3, [r1+8] movdqu xmm1, [r1+7] - movdqa xmm2, xmm3 - psrldq xmm2, 1 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4 + psrldq xmm2, xmm3, 1 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4 - movdqa xmm1, xmm0 - psrldq xmm1, 1 -%assign Y 7 + psrldq xmm1, xmm0, 1 +%assign Y 3 %rep 3 movq [r0+Y*FDEC_STRIDE], xmm0 movq [r0+(Y-1)*FDEC_STRIDE], xmm1 @@ -341,27 +1249,24 @@ cglobal predict_8x8_ddr_sse2, 2,2 psrldq xmm1, 2 %assign Y (Y-2) %endrep - movq [r0+1*FDEC_STRIDE], xmm0 - movq [r0+0*FDEC_STRIDE], xmm1 - + movq [r0-3*FDEC_STRIDE], xmm0 + movq [r0-4*FDEC_STRIDE], xmm1 RET ;----------------------------------------------------------------------------- -; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge ) +; void predict_8x8_vl( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- -cglobal predict_8x8_vl_sse2, 2,2 +cglobal predict_8x8_vl, 2,2 movdqa xmm4, [r1+16] - movdqa xmm2, xmm4 - movdqa xmm1, xmm4 - movdqa xmm3, xmm4 - psrldq xmm2, 1 - pslldq xmm1, 1 - pavgb xmm3, xmm2 - PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5 + pslldq xmm1, xmm4, 1 + psrldq xmm2, xmm4, 1 + pavgb xmm3, xmm4, xmm2 + add r0, FDEC_STRIDE*4 + PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2 ; xmm3: (t0 + t1 + 1) >> 1 -%assign Y 0 +%assign Y -4 %rep 3 psrldq xmm0, 1 movq [r0+ Y *FDEC_STRIDE], xmm3 @@ -376,77 +1281,401 @@ cglobal predict_8x8_vl_sse2, 2,2 RET ;----------------------------------------------------------------------------- -; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge ) +; void predict_8x8_vr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- +cglobal predict_8x8_vr, 2,2,7 + movdqu xmm0, [r1+8] + movdqa xmm6, [pw_ff00] + add r0, 
4*FDEC_STRIDE + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslldq xmm1, xmm0, 2 + pslldq xmm0, 1 + pavgb xmm2, xmm0 + PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5 + pandn xmm6, xmm4 + movdqa xmm5, xmm4 + psrlw xmm4, 8 + packuswb xmm6, xmm4 + movhlps xmm4, xmm6 + movhps [r0-3*FDEC_STRIDE], xmm5 + movhps [r0-4*FDEC_STRIDE], xmm2 + psrldq xmm5, 4 + movss xmm5, xmm6 + psrldq xmm2, 4 + movss xmm2, xmm4 +%assign Y 3 +%rep 3 + psrldq xmm5, 1 + psrldq xmm2, 1 + movq [r0+Y*FDEC_STRIDE], xmm5 + movq [r0+(Y-1)*FDEC_STRIDE], xmm2 +%assign Y (Y-2) +%endrep + RET +%endmacro ; PREDICT_8x8 -; fills only some pixels: -; f01234567 -; 0........ -; 1,,,,,,,, -; 2 ....... -; 3 ,,,,,,, -; 4 ...... -; 5 ,,,,,, -; 6 ..... -; 7 ,,,,, +INIT_XMM sse2 +PREDICT_8x8 +INIT_XMM avx +PREDICT_8x8 -cglobal predict_8x8_vr_core_mmxext, 2,2 - movq mm2, [r1+16] - movq mm3, [r1+15] - movq mm1, [r1+14] - movq mm4, mm3 - pavgb mm3, mm2 - PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 +%endif ; !HIGH_BIT_DEPTH -%assign Y 0 +;----------------------------------------------------------------------------- +; void predict_8x8_hd( pixel *src, pixel *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_HD 4 +cglobal predict_8x8_hd, 2,2,8 + add r0, 4*FDEC_STRIDEB + mova m0, [r1] ; l7 .. .. .. .. .. .. .. + mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 + mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0 + mova m3, m1 ; lt l0 l1 l2 l3 l4 l5 l6 + mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0 + PALIGNR m2, m1, 7*SIZEOF_PIXEL, m5 ; t6 t5 t4 t3 t2 t1 t0 lt + PALIGNR m1, m0, 7*SIZEOF_PIXEL, m6 ; l0 l1 l2 l3 l4 l5 l6 l7 + PALIGNR m4, m3, 1*SIZEOF_PIXEL, m7 ; t0 lt l0 l1 l2 l3 l4 l5 + mova m5, m3 + pavg%1 m3, m1 + PRED8x8_LOWPASS %1, m0, m4, m1, m5, m7 + psrl%2 m4, m2, 2*%4 ; .. .. t6 t5 t4 t3 t2 t1 + psrl%2 m1, m2, %4 ; .. t6 t5 t4 t3 t2 t1 t0 + PRED8x8_LOWPASS %1, m6, m4, m2, m1, m5 + ; .. 
p11 p10 p9 + punpckh%3 m7, m3, m0 ; p8 p7 p6 p5 + punpckl%3 m3, m0 ; p4 p3 p2 p1 + mova m1, m7 + mova m0, m7 + mova m4, m7 + mova [r0+3*FDEC_STRIDEB], m3 + PALIGNR m7, m3, 2*SIZEOF_PIXEL, m5 + mova [r0+2*FDEC_STRIDEB], m7 + PALIGNR m1, m3, 4*SIZEOF_PIXEL, m5 + mova [r0+1*FDEC_STRIDEB], m1 + PALIGNR m0, m3, 6*SIZEOF_PIXEL, m3 + mova [r0+0*FDEC_STRIDEB], m0 + mova m2, m6 + mova m3, m6 + mova [r0-1*FDEC_STRIDEB], m4 + PALIGNR m6, m4, 2*SIZEOF_PIXEL, m5 + mova [r0-2*FDEC_STRIDEB], m6 + PALIGNR m2, m4, 4*SIZEOF_PIXEL, m5 + mova [r0-3*FDEC_STRIDEB], m2 + PALIGNR m3, m4, 6*SIZEOF_PIXEL, m4 + mova [r0-4*FDEC_STRIDEB], m3 + RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8_HD w, dq, wd, 2 +INIT_XMM ssse3 +PREDICT_8x8_HD w, dq, wd, 2 +INIT_XMM avx +PREDICT_8x8_HD w, dq, wd, 2 +%else +INIT_MMX mmx2 +PREDICT_8x8_HD b, q , bw, 8 + +;----------------------------------------------------------------------------- +; void predict_8x8_hd( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_HD 0 +cglobal predict_8x8_hd, 2,2 + add r0, 4*FDEC_STRIDE + movdqa xmm0, [r1] + movdqa xmm1, [r1+16] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 + PALIGNR xmm1, xmm0, 7, xmm4 + PALIGNR xmm2, xmm0, 9, xmm5 + PALIGNR xmm3, xmm0, 8, xmm0 + pavgb xmm4, xmm1, xmm3 + PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5 + punpcklbw xmm4, xmm0 + movhlps xmm0, xmm4 + +%assign Y 3 %rep 3 - movq [r0+ Y *FDEC_STRIDE], mm3 - movq [r0+(Y+1)*FDEC_STRIDE], mm0 - psllq mm3, 8 - psllq mm0, 8 -%assign Y (Y+2) + movq [r0+(Y)*FDEC_STRIDE], xmm4 + movq [r0+(Y-4)*FDEC_STRIDE], xmm0 + psrldq xmm4, 2 + psrldq xmm0, 2 +%assign Y (Y-1) %endrep - movq [r0+ Y *FDEC_STRIDE], mm3 - movq [r0+(Y+1)*FDEC_STRIDE], mm0 + movq [r0+(Y)*FDEC_STRIDE], xmm4 + movq [r0+(Y-4)*FDEC_STRIDE], xmm0 + RET +%endmacro +INIT_XMM sse2 +PREDICT_8x8_HD +INIT_XMM ssse3 +PREDICT_8x8_HD +INIT_XMM avx +PREDICT_8x8_HD +%endif ; HIGH_BIT_DEPTH + +;----------------------------------------------------------------------------- +; void predict_8x8_hu( uint8_t *src, uint8_t *edge ) +;----------------------------------------------------------------------------- +%macro PREDICT_8x8_HU 0 +cglobal predict_8x8_hu, 2,2 + add r0, 4*FDEC_STRIDE +%if cpuflag(ssse3) + movq mm5, [r1+7] + movq mm6, [pb_reverse] + movq mm1, mm5 + movq mm2, mm5 + movq mm3, mm5 + pshufb mm5, mm6 + psrlq mm6, 8 + pshufb mm2, mm6 + psrlq mm6, 8 + pshufb mm3, mm6 + movq mm4, mm5 +%else + movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 + pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 + movq mm2, mm0 + psllw mm0, 8 + psrlw mm2, 8 + por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 + psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
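+    ; (pre-SSSE3 byte reversal: pshufw q0123 reversed the word order and the
+    ; psllw/psrlw/por pair swapped the bytes within each word, matching the
+    ; pb_reverse pshufb used in the SSSE3 path above)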
+ movq mm3, mm2 + movq mm4, mm2 + movq mm5, mm2 + psrlq mm2, 8 + psrlq mm3, 16 + por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 + punpckhbw mm1, mm1 + por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 +%endif + pavgb mm4, mm2 + PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6 + + movq2dq xmm0, mm4 + movq2dq xmm1, mm1 + punpcklbw xmm0, xmm1 + punpckhbw mm4, mm1 +%assign Y -4 +%rep 3 + movq [r0+Y*FDEC_STRIDE], xmm0 + psrldq xmm0, 2 +%assign Y (Y+1) +%endrep + pshufw mm5, mm4, q3321 + pshufw mm6, mm4, q3332 + pshufw mm7, mm4, q3333 + movq [r0+Y*FDEC_STRIDE], xmm0 + movq [r0+0*FDEC_STRIDE], mm4 + movq [r0+1*FDEC_STRIDE], mm5 + movq [r0+2*FDEC_STRIDE], mm6 + movq [r0+3*FDEC_STRIDE], mm7 RET +%endmacro + +%ifndef HIGH_BIT_DEPTH +INIT_MMX sse2 +PREDICT_8x8_HU +INIT_MMX ssse3 +PREDICT_8x8_HU +%endif ;----------------------------------------------------------------------------- -; void predict_8x8c_v_mmx( uint8_t *src ) +; void predict_8x8c_v( uint8_t *src ) ;----------------------------------------------------------------------------- + +%macro PREDICT_8x8C_V 0 +cglobal predict_8x8c_v, 1,1 + mova m0, [r0 - FDEC_STRIDEB] + STORE8x8 m0, m0 + RET +%endmacro + +%ifdef HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_8x8C_V +%else +INIT_MMX mmx +PREDICT_8x8C_V +%endif + +%ifdef HIGH_BIT_DEPTH + +INIT_MMX cglobal predict_8x8c_v_mmx, 1,1 - movq mm0, [r0 - FDEC_STRIDE] - STORE8x8 mm0, mm0 + mova m0, [r0 - FDEC_STRIDEB] + mova m1, [r0 - FDEC_STRIDEB + 8] +%assign n 0 +%rep 8 + mova [r0 + (n&1)*FDEC_STRIDEB], m0 + mova [r0 + (n&1)*FDEC_STRIDEB + 8], m1 +%if (n&1) && (n!=7) + add r0, FDEC_STRIDEB*2 +%endif +%assign n n+1 +%endrep RET +%endif + ;----------------------------------------------------------------------------- -; void predict_8x8c_h_mmxext( uint8_t *src ) +; void predict_8x8c_h( uint8_t *src ) ;----------------------------------------------------------------------------- +%ifdef HIGH_BIT_DEPTH -%macro PRED_8x8C_H 1 -cglobal predict_8x8c_h_%1, 1,1 -%ifidn %1, ssse3 - mova m1, [pb_3 GLOBAL] +INIT_XMM sse2 +cglobal predict_8x8c_h, 1,1 + add r0, FDEC_STRIDEB*4 +%assign n -4 +%rep 8 + movd m0, [r0+FDEC_STRIDEB*n-SIZEOF_PIXEL*2] + SPLATW m0, m0, 1 + mova [r0+FDEC_STRIDEB*n], m0 +%assign n n+1 +%endrep + RET + +%else + +%macro PREDICT_8x8C_H 0 +cglobal predict_8x8c_h, 1,1 +%if cpuflag(ssse3) + mova m1, [pb_3] %endif -%assign n 0 + add r0, FDEC_STRIDE*4 +%assign n -4 %rep 8 - SPLATB m0, r0+FDEC_STRIDE*n-1, m1 + SPLATB_LOAD m0, r0+FDEC_STRIDE*n-1, m1 mova [r0+FDEC_STRIDE*n], m0 %assign n n+1 %endrep - REP_RET + RET %endmacro -INIT_MMX -%define SPLATB SPLATB_MMX -PRED_8x8C_H mmxext -%define SPLATB SPLATB_SSSE3 -PRED_8x8C_H ssse3 +INIT_MMX mmx2 +PREDICT_8x8C_H +INIT_MMX ssse3 +PREDICT_8x8C_H +%endif ;----------------------------------------------------------------------------- -; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 ) +; void predict_8x8c_dc( pixel *src ) ;----------------------------------------------------------------------------- -cglobal predict_8x8c_dc_core_mmxext, 1,1 + +%macro PREDICT_8x8C_DC 0 +cglobal predict_8x8c_dc, 1,3 + pxor m7, m7 +%ifdef HIGH_BIT_DEPTH + movq m0, [r0-FDEC_STRIDEB+0] + movq m1, [r0-FDEC_STRIDEB+8] + HADDW m0, m2 + HADDW m1, m2 +%else + movd m0, [r0-FDEC_STRIDEB+0] + movd m1, [r0-FDEC_STRIDEB+4] + psadbw m0, m7 ; s0 + psadbw m1, m7 ; s1 +%endif + add r0, FDEC_STRIDEB*4 + + movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL] + movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel 
[r0-FDEC_STRIDEB*1-SIZEOF_PIXEL] + add r1d, r2d + movd m2, r1d ; s2 + + movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL] + movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL] + add r1d, r2d + movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL] + add r1d, r2d + movd m3, r1d ; s3 + + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckldq m0, m2 ; s0, s1, s2, s3 + pshufw m3, m0, q3312 ; s2, s1, s3, s3 + pshufw m0, m0, q1310 ; s0, s1, s3, s1 + paddw m0, m3 + psrlw m0, 2 + pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 +%ifdef HIGH_BIT_DEPTH +%if cpuflag(sse2) + movq2dq xmm0, m0 + punpcklwd xmm0, xmm0 + pshufd xmm1, xmm0, q3322 + punpckldq xmm0, xmm0 +%assign n 0 +%rep 8 +%assign i (0 + (n/4)) + movdqa [r0+FDEC_STRIDEB*(n-4)+0], xmm %+ i +%assign n n+1 +%endrep +%else + pshufw m1, m0, q0000 + pshufw m2, m0, q1111 + pshufw m3, m0, q2222 + pshufw m4, m0, q3333 +%assign n 0 +%rep 8 +%assign i (1 + (n/4)*2) +%assign j (2 + (n/4)*2) + movq [r0+FDEC_STRIDEB*(n-4)+0], m %+ i + movq [r0+FDEC_STRIDEB*(n-4)+8], m %+ j +%assign n n+1 +%endrep +%endif +%else + packuswb m0, m0 + punpcklbw m0, m0 + movq m1, m0 + punpcklbw m0, m0 + punpckhbw m1, m1 +%assign n 0 +%rep 8 +%assign i (0 + (n/4)) + movq [r0+FDEC_STRIDEB*(n-4)], m %+ i +%assign n n+1 +%endrep +%endif + RET +%endmacro + +INIT_MMX mmx2 +PREDICT_8x8C_DC +%ifdef HIGH_BIT_DEPTH +INIT_MMX sse2 +PREDICT_8x8C_DC +%endif + +%ifdef HIGH_BIT_DEPTH + +INIT_XMM +cglobal predict_8x8c_dc_top_sse2, 1,1 + pxor m2, m2 + mova m0, [r0 - FDEC_STRIDEB] + pshufd m1, m0, q2301 + paddw m0, m1 + pshuflw m1, m0, q2301 + pshufhw m1, m1, q2301 + paddw m0, m1 + psrlw m0, 1 + pavgw m0, m2 + STORE8x8 m0, m0 + RET + +%else + +INIT_MMX +cglobal predict_8x8c_dc_top_mmx2, 1,1 movq mm0, [r0 - FDEC_STRIDE] pxor mm1, mm1 pxor mm2, mm2 @@ -454,256 +1683,210 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1 punpcklbw mm0, mm2 psadbw mm1, mm2 ; s1 psadbw mm0, mm2 ; s0 - -%ifdef ARCH_X86_64 - movd mm4, r1d - movd mm5, r2d - paddw mm0, mm4 - pshufw mm2, mm5, 0 -%else - paddw mm0, r1m - pshufw mm2, r2m, 0 -%endif - psrlw mm0, 3 - paddw mm1, [pw_2 GLOBAL] - movq mm3, mm2 + psrlw mm1, 1 + psrlw mm0, 1 + pavgw mm1, mm2 + pavgw mm0, mm2 pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) - paddw mm3, mm1 - psrlw mm3, 3 ; dc3 (w) - psrlw mm2, 2 ; dc2 (w) - psrlw mm1, 2 ; dc1 (w) - packuswb mm0, mm1 ; dc0,dc1 (b) - packuswb mm2, mm3 ; dc2,dc3 (b) - - STORE8x8 mm0, mm2 + STORE8x8 mm0, mm0 RET -%macro LOAD_PLANE_ARGS 0 -%ifdef ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 -%else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 %endif -%endmacro - -;----------------------------------------------------------------------------- -; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_8x8c_p_core_mmxext, 1,2 - LOAD_PLANE_ARGS - movq mm1, mm2 - pmullw mm2, [pw_3210 GLOBAL] - psllw mm1, 2 - paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b} - paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b} - - mov r1d, 8 -ALIGN 4 -.loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - add r0, FDEC_STRIDE - dec r1d - jg .loop - REP_RET ;----------------------------------------------------------------------------- -; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c ) +; 
void predict_16x16_v( pixel *src ) ;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_mmxext, 1,2 - LOAD_PLANE_ARGS - movq mm5, mm2 - movq mm1, mm2 - pmullw mm5, [pw_3210 GLOBAL] - psllw mm2, 3 - psllw mm1, 2 - movq mm3, mm2 - paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} - paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} - paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} - paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} - - mov r1d, 16 -ALIGN 4 -.loop: - movq mm5, mm0 - movq mm6, mm1 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0], mm5 - - movq mm5, mm2 - movq mm6, mm3 - psraw mm5, 5 - psraw mm6, 5 - packuswb mm5, mm6 - movq [r0+8], mm5 - - paddsw mm0, mm4 - paddsw mm1, mm4 - paddsw mm2, mm4 - paddsw mm3, mm4 - add r0, FDEC_STRIDE - dec r1d - jg .loop +%ifdef HIGH_BIT_DEPTH +INIT_MMX +cglobal predict_16x16_v_mmx, 1,2 + mova m0, [r0 - FDEC_STRIDEB+ 0] + mova m1, [r0 - FDEC_STRIDEB+ 8] + mova m2, [r0 - FDEC_STRIDEB+16] + mova m3, [r0 - FDEC_STRIDEB+24] + STORE16x16 m0, m1, m2, m3 REP_RET - -;----------------------------------------------------------------------------- -; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_p_core_sse2, 1,2 - movd xmm0, r1m - movd xmm1, r2m - movd xmm2, r3m - pshuflw xmm0, xmm0, 0 - pshuflw xmm1, xmm1, 0 - pshuflw xmm2, xmm2, 0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - movdqa xmm3, xmm1 - pmullw xmm3, [pw_76543210 GLOBAL] - psllw xmm1, 3 - paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} - paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} - - mov r1d, 16 -ALIGN 4 -.loop: - movdqa xmm3, xmm0 - movdqa xmm4, xmm1 - psraw xmm3, 5 - psraw xmm4, 5 - packuswb xmm3, xmm4 - movdqa [r0], xmm3 - - paddsw xmm0, xmm2 - paddsw xmm1, xmm2 - add r0, FDEC_STRIDE - dec r1d - jg .loop +INIT_XMM +cglobal predict_16x16_v_sse2, 2,2 + mova m0, [r0 - FDEC_STRIDEB+ 0] + mova m1, [r0 - FDEC_STRIDEB+16] + STORE16x16_SSE2 m0, m1 REP_RET - -;----------------------------------------------------------------------------- -; void predict_16x16_v_mmx( uint8_t *src ) -;----------------------------------------------------------------------------- +%else +INIT_MMX cglobal predict_16x16_v_mmx, 1,2 - movq mm0, [r0 - FDEC_STRIDE] - movq mm1, [r0 - FDEC_STRIDE + 8] - STORE16x16 mm0, mm1 + movq m0, [r0 - FDEC_STRIDE + 0] + movq m1, [r0 - FDEC_STRIDE + 8] + STORE16x16 m0, m1 REP_RET - -;----------------------------------------------------------------------------- -; void predict_16x16_v_sse2( uint8_t *src ) -;----------------------------------------------------------------------------- -cglobal predict_16x16_v_sse2, 1,2 +INIT_XMM +cglobal predict_16x16_v_sse2, 1,1 movdqa xmm0, [r0 - FDEC_STRIDE] STORE16x16_SSE2 xmm0 - REP_RET + RET +%endif ;----------------------------------------------------------------------------- -; void predict_16x16_h_mmxext( uint8_t *src ) +; void predict_16x16_h( pixel *src ) ;----------------------------------------------------------------------------- +%macro PREDICT_16x16_H 0 +cglobal predict_16x16_h, 1,2 + mov r1, 12*FDEC_STRIDEB +%ifdef HIGH_BIT_DEPTH +.vloop: +%assign n 0 +%rep 4 + movd m0, [r0+r1+n*FDEC_STRIDEB-2*SIZEOF_PIXEL] + SPLATW m0, m0, 1 + mova [r0+r1+n*FDEC_STRIDEB+ 0], m0 + mova [r0+r1+n*FDEC_STRIDEB+16], m0 +%if 
mmsize==8 + mova [r0+r1+n*FDEC_STRIDEB+ 8], m0 + mova [r0+r1+n*FDEC_STRIDEB+24], m0 +%endif +%assign n n+1 +%endrep -%macro PRED_16x16_H 1 -cglobal predict_16x16_h_%1, 1,2 - mov r1, FDEC_STRIDE*12 -%ifidn %1, ssse3 - mova m1, [pb_3 GLOBAL] +%else +%if cpuflag(ssse3) + mova m1, [pb_3] %endif .vloop: %assign n 0 %rep 4 - SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1 + SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*n-1, m1 mova [r0+r1+FDEC_STRIDE*n], m0 %if mmsize==8 mova [r0+r1+FDEC_STRIDE*n+8], m0 %endif %assign n n+1 %endrep - add r1, -FDEC_STRIDE*4 +%endif ; HIGH_BIT_DEPTH + sub r1, 4*FDEC_STRIDEB jge .vloop REP_RET %endmacro -;no SSE2, its slower than MMX on all systems that don't support SSSE3 -INIT_MMX -%define SPLATB SPLATB_MMX -PRED_16x16_H mmxext -INIT_XMM -%define SPLATB SPLATB_SSSE3 -PRED_16x16_H ssse3 +INIT_MMX mmx2 +PREDICT_16x16_H +INIT_XMM sse2 +%ifdef HIGH_BIT_DEPTH +PREDICT_16x16_H +%else +;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 +INIT_XMM ssse3 +PREDICT_16x16_H +%endif ;----------------------------------------------------------------------------- -; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left ) +; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- %macro PRED16x16_DC 2 - pxor mm0, mm0 - pxor mm1, mm1 - psadbw mm0, [r0 - FDEC_STRIDE] - psadbw mm1, [r0 - FDEC_STRIDE + 8] - paddusw mm0, mm1 - paddusw mm0, %1 - psrlw mm0, %2 ; dc - pshufw mm0, mm0, 0 - packuswb mm0, mm0 ; dc in bytes - STORE16x16 mm0, mm0 +%ifdef HIGH_BIT_DEPTH + mova m0, [r0 - FDEC_STRIDEB+ 0] + paddw m0, [r0 - FDEC_STRIDEB+ 8] + paddw m0, [r0 - FDEC_STRIDEB+16] + paddw m0, [r0 - FDEC_STRIDEB+24] + HADDW m0, m1 + paddw m0, %1 + psrlw m0, %2 + SPLATW m0, m0 + STORE16x16 m0, m0, m0, m0 +%else + pxor m0, m0 + pxor m1, m1 + psadbw m0, [r0 - FDEC_STRIDE] + psadbw m1, [r0 - FDEC_STRIDE + 8] + paddusw m0, m1 + paddusw m0, %1 + psrlw m0, %2 ; dc + pshufw m0, m0, 0 + packuswb m0, m0 ; dc in bytes + STORE16x16 m0, m0 +%endif %endmacro -cglobal predict_16x16_dc_core_mmxext, 1,2 +INIT_MMX +cglobal predict_16x16_dc_core_mmx2, 1,2 %ifdef ARCH_X86_64 - movd mm2, r1d - PRED16x16_DC mm2, 5 + movd m6, r1d + PRED16x16_DC m6, 5 %else PRED16x16_DC r1m, 5 %endif REP_RET -cglobal predict_16x16_dc_top_mmxext, 1,2 - PRED16x16_DC [pw_8 GLOBAL], 4 +INIT_MMX +cglobal predict_16x16_dc_top_mmx2, 1,2 + PRED16x16_DC [pw_8], 4 REP_RET +INIT_MMX +%ifdef HIGH_BIT_DEPTH +cglobal predict_16x16_dc_left_core_mmx2, 1,2 + movd m0, r1m + SPLATW m0, m0 + STORE16x16 m0, m0, m0, m0 + REP_RET +%else +cglobal predict_16x16_dc_left_core_mmx2, 1,1 + movd m0, r1m + pshufw m0, m0, 0 + packuswb m0, m0 + STORE16x16 m0, m0 + REP_RET +%endif + ;----------------------------------------------------------------------------- -; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left ) +; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- %macro PRED16x16_DC_SSE2 2 - pxor xmm0, xmm0 - psadbw xmm0, [r0 - FDEC_STRIDE] - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - paddusw xmm0, %1 - psrlw xmm0, %2 ; dc - pshuflw xmm0, xmm0, 0 - punpcklqdq xmm0, xmm0 - packuswb xmm0, xmm0 ; dc in bytes - STORE16x16_SSE2 xmm0 +%ifdef HIGH_BIT_DEPTH + mova m0, [r0 - FDEC_STRIDEB+ 0] + paddw m0, [r0 - FDEC_STRIDEB+16] + HADDW m0, m2 + paddw m0, %1 + psrlw m0, %2 + SPLATW m0, m0 + STORE16x16_SSE2 m0, m0 +%else + pxor m0, m0 + psadbw m0, [r0 - FDEC_STRIDE] + movhlps m1, m0 + paddw m0, m1 + 
paddusw m0, %1 + psrlw m0, %2 ; dc + SPLATW m0, m0 + packuswb m0, m0 ; dc in bytes + STORE16x16_SSE2 m0 +%endif %endmacro -cglobal predict_16x16_dc_core_sse2, 1,2 - movd xmm2, r1m - PRED16x16_DC_SSE2 xmm2, 5 +INIT_XMM +cglobal predict_16x16_dc_core_sse2, 2,2,4 + movd m3, r1m + PRED16x16_DC_SSE2 m3, 5 REP_RET cglobal predict_16x16_dc_top_sse2, 1,2 - PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4 + PRED16x16_DC_SSE2 [pw_8], 4 REP_RET +INIT_XMM +%ifdef HIGH_BIT_DEPTH +cglobal predict_16x16_dc_left_core_sse2, 1,2 + movd m0, r1m + SPLATW m0, m0 + STORE16x16_SSE2 m0, m0 + REP_RET +%else +cglobal predict_16x16_dc_left_core_sse2, 1,1 + movd m0, r1m + SPLATW m0, m0 + packuswb m0, m0 + STORE16x16_SSE2 m0 + RET +%endif
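+; Note on the PRED8x8_LOWPASS macro used throughout (a reference sketch,
+; not part of this commit): the output is
+;
+;     out[n] = ( t[n-1] + 2*t[n] + t[n+1] + 2 ) >> 2
+;
+; The byte path avoids widening to words via the exact identity
+;
+;     lowpass(l,s,r) = avg( s, avg(l,r) - ((l^r)&1) )
+;
+; where avg(a,b) = (a+b+1)>>1 is exactly what pavgb computes; the
+; pxor/pand/psubusb steps strip the rounding carry from avg(l,r) so that
+; the final pavgb rounds the same way as the +2 >> 2 form. The word path
+; has enough headroom to use a plain paddw/psrlw/pavgw sequence instead.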