X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fdct-64.asm;h=c1aff843101f0081686f07ebe04908c7aa803e4a;hb=c1e37099627b1dc2f15b295aa4c2eedd431a6dba;hp=a6d753e8f7f2156e1eb96dea16fd254b36060875;hpb=bdbd4fe7709e129f90cf3d7d59b500e915c6b187;p=x264 diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm index a6d753e8..c1aff843 100644 --- a/common/x86/dct-64.asm +++ b/common/x86/dct-64.asm @@ -1,11 +1,12 @@ ;***************************************************************************** -;* dct-64.asm: h264 encoder library +;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2008 x264 project +;* Copyright (C) 2003-2013 x264 project ;* -;* Authors: Laurent Aimar (initial version) -;* Loren Merritt (dct8, misc) -;* Min Chen (converted to nasm) +;* Authors: Loren Merritt +;* Holger Lubitz +;* Laurent Aimar +;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -20,135 +21,272 @@ ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" - -SECTION_RODATA -pw_32: times 8 dw 32 +%include "x86util.asm" SECTION .text -INIT_XMM - -%macro LOAD_DIFF_8P 5 - movq %1, %4 - punpcklbw %1, %3 - movq %2, %5 - punpcklbw %2, %3 - psubw %1, %2 -%endmacro - -%macro SUMSUB_BA 2 - paddw %1, %2 - paddw %2, %2 - psubw %2, %1 -%endmacro - -%macro SBUTTERFLY 4 - mova m%4, m%2 - punpckl%1 m%2, m%3 - punpckh%1 m%4, m%3 - SWAP %3, %4 +cextern pd_32 +cextern pw_pixel_max +cextern pw_2 +cextern pw_m2 +cextern pw_32 +cextern hsub_mul + +; in: size, m0..m7, temp, temp +; out: m0..m7 +%macro DCT8_1D 11 + SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34 + SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25 + SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16 + SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07 + + SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3 + SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2 + + psra%1 m%10, m%2, 1 + padd%1 m%10, m%2 + padd%1 m%10, m%3 + padd%1 m%10, m%4 ; %10=a4 + + psra%1 m%11, m%5, 1 + padd%1 m%11, m%5 + padd%1 m%11, m%3 + psub%1 m%11, m%4 ; %11=a7 + + SUMSUB_BA %1, %5, %2 + psub%1 m%2, m%4 + psub%1 m%5, m%3 + psra%1 m%4, 1 + psra%1 m%3, 1 + psub%1 m%2, m%4 ; %2=a5 + psub%1 m%5, m%3 ; %5=a6 + + psra%1 m%3, m%11, 2 + padd%1 m%3, m%10 ; %3=b1 + psra%1 m%10, 2 + psub%1 m%10, m%11 ; %10=b7 + + SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4 + + psra%1 m%4, m%8, 1 + padd%1 m%4, m%9 ; %4=b2 + psra%1 m%9, 1 + psub%1 m%9, m%8 ; %9=b6 + + psra%1 m%8, m%5, 2 + padd%1 m%8, m%2 ; %8=b3 + psra%1 m%2, 2 + psub%1 m%5, m%2 ; %5=b5 + + SWAP %2, %7, %5, %8, %9, %10 %endmacro -%macro TRANSPOSE8x8W 9 - SBUTTERFLY wd, %1, %2, %9 - SBUTTERFLY wd, %3, %4, %9 - SBUTTERFLY wd, %5, %6, %9 - SBUTTERFLY wd, %7, %8, %9 - SBUTTERFLY dq, %1, %3, %9 - SBUTTERFLY dq, %2, %4, %9 - SBUTTERFLY dq, %5, %7, %9 - SBUTTERFLY dq, %6, %8, %9 - SBUTTERFLY qdq, %1, %5, %9 - SBUTTERFLY qdq, %2, %6, %9 - SBUTTERFLY qdq, %3, %7, %9 - SBUTTERFLY qdq, %4, %8, %9 - SWAP %2, %5 - SWAP %4, %7 +%macro IDCT8_1D 11 + SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2 + + psra%1 m%10, m%3, 1 + padd%1 m%10, 
m%3 + padd%1 m%10, m%5 + padd%1 m%10, m%7 ; %9=a7 + + psra%1 m%11, m%4, 1 + psub%1 m%11, m%8 ; %10=a4 + psra%1 m%8, 1 + padd%1 m%8, m%4 ; %7=a6 + + psra%1 m%4, m%7, 1 + padd%1 m%4, m%7 + padd%1 m%4, m%9 + psub%1 m%4, m%3 ; %3=a5 + + psub%1 m%3, m%5 + psub%1 m%7, m%5 + padd%1 m%3, m%9 + psub%1 m%7, m%9 + psra%1 m%5, 1 + psra%1 m%9, 1 + psub%1 m%3, m%5 ; %2=a3 + psub%1 m%7, m%9 ; %6=a1 + + psra%1 m%5, m%10, 2 + padd%1 m%5, m%7 ; %4=b1 + psra%1 m%7, 2 + psub%1 m%10, m%7 ; %9=b7 + + SUMSUB_BA %1, %8, %6, %7 ; %7=b0, %5=b6 + SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4 + + psra%1 m%9, m%4, 2 + padd%1 m%9, m%3 ; %8=b3 + psra%1 m%3, 2 + psub%1 m%3, m%4 ; %2=b5 + + SUMSUB_BA %1, %10, %8, %7 ; %9=c0, %7=c7 + SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6 + SUMSUB_BA %1, %9, %2, %7 ; %8=c2, %1=c5 + SUMSUB_BA %1, %5, %6, %7 ; %4=c3, %5=c4 + + SWAP %11, %4 + SWAP %2, %10, %7 + SWAP %4, %9, %8 %endmacro -%macro STORE_DIFF_8P 4 - psraw %1, 6 - movq %2, %4 - punpcklbw %2, %3 - paddsw %1, %2 - packuswb %1, %1 - movq %4, %1 -%endmacro - -SECTION .text - -%macro DCT8_1D 10 - SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07 - SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16 - SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25 - SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34 - - SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2 - SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3 - - movdqa m%9, m%1 - psraw m%9, 1 - paddw m%9, m%1 - paddw m%9, m%2 - paddw m%9, m%3 ; %9=a4 - - movdqa m%10, m%4 - psraw m%10, 1 - paddw m%10, m%4 - paddw m%10, m%2 - psubw m%10, m%3 ; %10=a7 - - SUMSUB_BA m%4, m%1 - psubw m%1, m%3 - psubw m%4, m%2 - psraw m%3, 1 - psraw m%2, 1 - psubw m%1, m%3 ; %1=a5 - psubw m%4, m%2 ; %4=a6 - - SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4 - - movdqa m%2, m%10 - psraw m%2, 2 - paddw m%2, m%9 ; %2=b1 - psraw m%9, 2 - psubw m%9, m%10 ; %9=b7 - - movdqa m%3, m%7 - psraw m%3, 1 - paddw m%3, m%8 ; %3=b2 - psraw m%8, 1 - psubw m%8, m%7 ; %8=b6 - - movdqa m%7, m%4 - psraw m%7, 2 - paddw m%7, m%1 ; %7=b3 - psraw m%1, 2 - psubw m%4, m%1 ; %4=b5 - - SWAP %1, %6, %4, %7, %8, %9 -%endmacro +%if HIGH_BIT_DEPTH + +%macro SUB8x8_DCT8 0 +cglobal sub8x8_dct8, 3,3,14 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2 + LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2 + + DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9 + + TRANSPOSE4x4W 0,1,2,3,8 + WIDEN_SXWD 0,8 + WIDEN_SXWD 1,9 + WIDEN_SXWD 2,10 + WIDEN_SXWD 3,11 + DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13 + mova [r0+0x00], m0 + mova [r0+0x20], m8 + mova [r0+0x40], m1 + mova [r0+0x60], m9 + mova [r0+0x80], m2 + mova [r0+0xA0], m10 + mova [r0+0xC0], m3 + mova [r0+0xE0], m11 + + TRANSPOSE4x4W 4,5,6,7,0 + WIDEN_SXWD 4,0 + WIDEN_SXWD 5,1 + WIDEN_SXWD 6,2 + WIDEN_SXWD 7,3 + DCT8_1D d,4,0,5,1,6,2,7,3, 8,9 + mova [r0+0x10], m4 + mova [r0+0x30], m0 + mova [r0+0x50], m5 + mova [r0+0x70], m1 + mova [r0+0x90], m6 + mova [r0+0xB0], m2 + mova [r0+0xD0], m7 + mova [r0+0xF0], m3 + ret +%endmacro ; SUB8x8_DCT8 + +INIT_XMM sse2 +SUB8x8_DCT8 +INIT_XMM sse4 +SUB8x8_DCT8 +INIT_XMM avx +SUB8x8_DCT8 + +%macro ADD8x8_IDCT8 0 +cglobal add8x8_idct8, 2,2,16 + add r1, 128 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + mova m0, [r1-128] + mova m1, [r1-96] + mova m2, [r1-64] + mova m3, [r1-32] + mova m4, [r1+ 0] + mova m5, [r1+32] + mova m6, [r1+64] + mova m7, [r1+96] + IDCT8_1D d,0,1,2,3,4,5,6,7,8,9 + TRANSPOSE4x4D 0,1,2,3,8 + TRANSPOSE4x4D 4,5,6,7,8 + paddd m0, [pd_32] + paddd m4, [pd_32] + mova [r1+64], m6 + mova [r1+96], m7 + mova m8, [r1-112] + mova m9, [r1-80] + mova m10, [r1-48] + mova m11, 
[r1-16] + mova m12, [r1+16] + mova m13, [r1+48] + mova m14, [r1+80] + mova m15, [r1+112] + IDCT8_1D d,8,9,10,11,12,13,14,15,6,7 + TRANSPOSE4x4D 8,9,10,11,6 + TRANSPOSE4x4D 12,13,14,15,6 + IDCT8_1D d,0,1,2,3,8,9,10,11,6,7 + mova [r1-112], m8 + mova [r1-80], m9 + mova m6, [r1+64] + mova m7, [r1+96] + IDCT8_1D d,4,5,6,7,12,13,14,15,8,9 + pxor m8, m8 + mova m9, [pw_pixel_max] + STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB] + STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB] + STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB] + STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB] + mova m0, [r1-112] + mova m1, [r1-80] + STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB] + STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB] + STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB] + STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB] + ret +%endmacro ; ADD8x8_IDCT8 + +INIT_XMM sse2 +ADD8x8_IDCT8 +INIT_XMM avx +ADD8x8_IDCT8 + +%else ; !HIGH_BIT_DEPTH + +%macro DCT_SUB8 0 +cglobal sub8x8_dct, 3,3,10 + add r2, 4*FDEC_STRIDE +%if cpuflag(ssse3) + mova m7, [hsub_mul] +%endif + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + SWAP 7, 9 + LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE + LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE + DCT4_1D 0, 1, 2, 3, 8 + TRANSPOSE2x4x4W 0, 1, 2, 3, 8 + DCT4_1D 4, 5, 6, 7, 8 + TRANSPOSE2x4x4W 4, 5, 6, 7, 8 + DCT4_1D 0, 1, 2, 3, 8 + STORE_DCT 0, 1, 2, 3, r0, 0 + DCT4_1D 4, 5, 6, 7, 8 + STORE_DCT 4, 5, 6, 7, r0, 64 + ret ;----------------------------------------------------------------------------- -; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) +; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_sse2 - LOAD_DIFF_8P m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] - LOAD_DIFF_8P m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] - LOAD_DIFF_8P m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] - LOAD_DIFF_8P m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] - LOAD_DIFF_8P m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] - LOAD_DIFF_8P m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] - LOAD_DIFF_8P m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] - LOAD_DIFF_8P m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] - - DCT8_1D 0,1,2,3,4,5,6,7,8,9 +cglobal sub8x8_dct8, 3,3,11 + add r2, 4*FDEC_STRIDE +%if cpuflag(ssse3) + mova m7, [hsub_mul] +%endif + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + SWAP 7, 10 + LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE + LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 - DCT8_1D 0,1,2,3,4,5,6,7,8,9 - + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 movdqa [r0+0x00], m0 movdqa [r0+0x10], m1 movdqa [r0+0x20], m2 @@ -158,65 +296,68 @@ cglobal x264_sub8x8_dct8_sse2 movdqa [r0+0x60], m6 movdqa [r0+0x70], m7 ret - - -%macro IDCT8_1D 10 - SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2 - movdqa m%10, m%3 - psraw m%3, 1 - psubw m%3, m%7 ; %3=a4 - psraw m%7, 1 - paddw m%7, m%10 ; %7=a6 - - movdqa m%9, m%2 - psraw m%9, 1 - paddw m%9, m%2 - paddw m%9, m%4 - paddw m%9, m%6 ; %9=a7 - - movdqa m%10, m%6 - psraw m%10, 1 - paddw m%10, m%6 - paddw m%10, m%8 - psubw m%10, m%2 ; %10=a5 - - psubw m%2, m%4 - psubw m%6, m%4 - paddw m%2, m%8 - psubw m%6, m%8 - psraw m%4, 1 - psraw m%8, 1 - psubw m%2, m%4 ; %2=a3 - psubw m%6, m%8 ; %6=a1 - - SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6 - SUMSUB_BA 
m%3, m%1 ; %3=b2, %1=b4 - - movdqa m%4, m%9 - psraw m%4, 2 - paddw m%4, m%6 ; %4=b1 - psraw m%6, 2 - psubw m%9, m%6 ; %9=b7 - - movdqa m%8, m%10 - psraw m%8, 2 - paddw m%8, m%2 ; %8=b3 - psraw m%2, 2 - psubw m%2, m%10 ; %2=b5 - - SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7 - SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6 - SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5 - SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4 - - SWAP %1, %9, %6 - SWAP %3, %8, %7 %endmacro +INIT_XMM sse2 +%define movdqa movaps +%define punpcklqdq movlhps +DCT_SUB8 +%undef movdqa +%undef punpcklqdq +INIT_XMM ssse3 +DCT_SUB8 +INIT_XMM avx +DCT_SUB8 +INIT_XMM xop +DCT_SUB8 + +INIT_YMM avx2 +cglobal sub16x16_dct8, 3,3,10 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x8_dct8 + add r0, 256 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + call .sub16x8_dct8 + RET +.sub16x8_dct8: + LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 + LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + mova [r0-0x80+0x00], xm0 + vextracti128 [r0+0x00], m0, 1 + mova [r0-0x80+0x10], xm1 + vextracti128 [r0+0x10], m1, 1 + mova [r0-0x80+0x20], xm2 + vextracti128 [r0+0x20], m2, 1 + mova [r0-0x80+0x30], xm3 + vextracti128 [r0+0x30], m3, 1 + mova [r0-0x80+0x40], xm4 + vextracti128 [r0+0x40], m4, 1 + mova [r0-0x80+0x50], xm5 + vextracti128 [r0+0x50], m5, 1 + mova [r0-0x80+0x60], xm6 + vextracti128 [r0+0x60], m6, 1 + mova [r0-0x80+0x70], xm7 + vextracti128 [r0+0x70], m7, 1 + ret + ;----------------------------------------------------------------------------- -; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) +; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2 +%macro ADD8x8_IDCT8 0 +cglobal add8x8_idct8, 2,2,11 + add r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + SWAP 7, 9 movdqa m0, [r1+0x00] movdqa m1, [r1+0x10] movdqa m2, [r1+0x20] @@ -225,21 +366,65 @@ cglobal x264_add8x8_idct8_sse2 movdqa m5, [r1+0x50] movdqa m6, [r1+0x60] movdqa m7, [r1+0x70] - - IDCT8_1D 0,1,2,3,4,5,6,7,8,9 + IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 - paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end - IDCT8_1D 0,1,2,3,4,5,6,7,8,9 - - pxor m9, m9 - STORE_DIFF_8P m0, m8, m9, [r0+0*FDEC_STRIDE] - STORE_DIFF_8P m1, m8, m9, [r0+1*FDEC_STRIDE] - STORE_DIFF_8P m2, m8, m9, [r0+2*FDEC_STRIDE] - STORE_DIFF_8P m3, m8, m9, [r0+3*FDEC_STRIDE] - STORE_DIFF_8P m4, m8, m9, [r0+4*FDEC_STRIDE] - STORE_DIFF_8P m5, m8, m9, [r0+5*FDEC_STRIDE] - STORE_DIFF_8P m6, m8, m9, [r0+6*FDEC_STRIDE] - STORE_DIFF_8P m7, m8, m9, [r0+7*FDEC_STRIDE] + paddw m0, [pw_32] ; rounding for the >>6 at the end + IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 + DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] + DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] + DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] + DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] + STORE_IDCT m1, m3, m5, m7 + ret +%endmacro ; ADD8x8_IDCT8 + +INIT_XMM sse2 +ADD8x8_IDCT8 +INIT_XMM avx +ADD8x8_IDCT8 + +;----------------------------------------------------------------------------- +; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) +;----------------------------------------------------------------------------- +%macro ADD8x8 0 +cglobal add8x8_idct, 2,2,11 + add 
r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + SWAP 7, 9 + mova m0, [r1+ 0] + mova m2, [r1+16] + mova m1, [r1+32] + mova m3, [r1+48] + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + mova m4, [r1+64] + mova m6, [r1+80] + mova m5, [r1+96] + mova m7, [r1+112] + SBUTTERFLY qdq, 4, 5, 8 + SBUTTERFLY qdq, 6, 7, 8 + IDCT4_1D w,0,1,2,3,8,10 + TRANSPOSE2x4x4W 0,1,2,3,8 + IDCT4_1D w,4,5,6,7,8,10 + TRANSPOSE2x4x4W 4,5,6,7,8 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,8,10 + paddw m4, [pw_32] + IDCT4_1D w,4,5,6,7,8,10 + DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] + DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] + DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] + DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] + STORE_IDCT m1, m3, m5, m7 ret +%endmacro ; ADD8x8 +INIT_XMM sse2 +ADD8x8 +INIT_XMM avx +ADD8x8 +%endif ; !HIGH_BIT_DEPTH
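
For reference, the butterfly computed by the DCT8_1D macro (both the removed SSE2-only version and the new size-parameterized one) corresponds to the scalar C below, using the same s07/d07, a0..a7 and b0..b7 names that appear in the assembly comments. This is an illustrative sketch reconstructed from those comments, not part of the patch; the function name dct8_1d and the in-place int16_t array convention are assumptions made for the example.

#include <stdint.h>

/* One row (or column) of the 8x8 forward transform: residuals in, coefficients out.
 * Names follow the comments in DCT8_1D above. */
static void dct8_1d( int16_t d[8] )
{
    /* stage 1: sums and differences of mirrored pairs */
    int s07 = d[0] + d[7], d07 = d[0] - d[7];
    int s16 = d[1] + d[6], d16 = d[1] - d[6];
    int s25 = d[2] + d[5], d25 = d[2] - d[5];
    int s34 = d[3] + d[4], d34 = d[3] - d[4];
    /* stage 2: the a0..a7 intermediates */
    int a0 = s07 + s34;
    int a1 = s16 + s25;
    int a2 = s07 - s34;
    int a3 = s16 - s25;
    int a4 = d16 + d25 + (d07 + (d07>>1));
    int a5 = d07 - d34 - (d25 + (d25>>1));
    int a6 = d07 + d34 - (d16 + (d16>>1));
    int a7 = d16 - d25 + (d34 + (d34>>1));
    /* stage 3: the b0..b7 outputs, in coefficient order */
    d[0] = a0 + a1;        /* b0 */
    d[1] = a4 + (a7>>2);   /* b1 */
    d[2] = a2 + (a3>>1);   /* b2 */
    d[3] = a5 + (a6>>2);   /* b3 */
    d[4] = a0 - a1;        /* b4 */
    d[5] = a6 - (a5>>2);   /* b5 */
    d[6] = (a2>>1) - a3;   /* b6 */
    d[7] = (a4>>2) - a7;   /* b7 */
}

sub8x8_dct8 runs this butterfly once in each dimension with a TRANSPOSE8x8W in between; the inverse path (IDCT8_1D) mirrors the same structure, and the paddw m0, [pw_32] (paddd with pd_32 in the high-bit-depth build) ahead of its second pass supplies the rounding term for the final >>6, as the "; rounding for the >>6 at the end" comment notes.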