Patch from Google Code-In: adds high bit depth sub8x8_dct8/sub16x16_dct8 assembly (SSE2/SSE4/AVX) and 8x16 chroma planar prediction assembly (MMX2/SSE2/AVX).
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
+ if( cpu&X264_CPU_SSE4 )
+ {
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
+ }
if( cpu&X264_CPU_AVX )
{
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
cextern pw_32
cextern hsub_mul
-%ifndef HIGH_BIT_DEPTH
+%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
+ %xdefine %%base %1
+ %rep %0/2
+ %xdefine %%tmp m%2
+ %rotate %0/2
+ mova [%%base + %2*16], %%tmp
+ %rotate 1-%0/2
+ %endrep
+%endmacro
+
+%macro UNSPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
+ %xdefine %%base %1
+ %rep %0/2
+ %xdefine %%tmp m%2
+ %rotate %0/2
+ mova %%tmp, [%%base + %2*16]
+ %rotate 1-%0/2
+ %endrep
+%endmacro
+
+%macro SPILL 2+ ; assume offsets are the same as reg numbers
+ SPILL_SHUFFLE %1, %2, %2
+%endmacro
+
+%macro UNSPILL 2+
+ UNSPILL_SHUFFLE %1, %2, %2
+%endmacro
+
+; in: size (w or d), m0..m7, three memory addresses (%10,%11,%12)
+; out: rows 0,4,6 stored at %10,%11,%12, rest in regs
+%macro DCT8_1D 12
+ SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
+ SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16
+ SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25
+ SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34
+ SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
+ SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
+ SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
+ mova [%10], m%7
+ mova [%11], m%6
+ psra%1 m%7, m%8, 1 ; a3>>1
+ padd%1 m%7, m%9 ; a2 + (a3>>1)
+ psra%1 m%9, 1 ; a2>>1
+ psub%1 m%9, m%8 ; (a2>>1) - a3
+ mova [%12], m%9
+ psra%1 m%6, m%4, 1
+ padd%1 m%6, m%4 ; d25+(d25>>1)
+ psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
+ psub%1 m%8, m%6
+ psra%1 m%6, m%3, 1
+ padd%1 m%6, m%3 ; d16+(d16>>1)
+ padd%1 m%9, m%2, m%5
+ psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1))
+ psra%1 m%6, m%2, 1
+ padd%1 m%6, m%2 ; d07+(d07>>1)
+ padd%1 m%6, m%3
+ padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1))
+ psra%1 m%2, m%5, 1
+ padd%1 m%2, m%5 ; d34+(d34>>1)
+ padd%1 m%2, m%3
+ psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1))
+ psra%1 m%5, m%2, 2
+ padd%1 m%5, m%6 ; a4 + (a7>>2)
+ psra%1 m%4, m%9, 2
+ padd%1 m%4, m%8 ; a5 + (a6>>2)
+ psra%1 m%6, 2
+ psra%1 m%8, 2
+ psub%1 m%6, m%2 ; (a4>>2) - a7
+ psub%1 m%9, m%8 ; a6 - (a5>>2)
+ SWAP %3, %5, %4, %7, %9, %6
+%endmacro
+
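For reference, the 1-D 8-point transform that DCT8_1D vectorizes, written out as scalar C reconstructed from the a0..a7 terms in the comments above; a sketch only, with dctcoef standing in for int16_t at 8-bit depth or int32_t at high bit depth:

#include <stdint.h>
typedef int32_t dctcoef;   /* int16_t at 8-bit depth */

/* In-place 1-D 8-point forward transform (sketch matching the comments above). */
static void dct8_1d_ref( dctcoef d[8] )
{
    dctcoef s07 = d[0] + d[7], d07 = d[0] - d[7];
    dctcoef s16 = d[1] + d[6], d16 = d[1] - d[6];
    dctcoef s25 = d[2] + d[5], d25 = d[2] - d[5];
    dctcoef s34 = d[3] + d[4], d34 = d[3] - d[4];
    dctcoef a0 = s07 + s34, a2 = s07 - s34;
    dctcoef a1 = s16 + s25, a3 = s16 - s25;
    dctcoef a4 = d16 + d25 + (d07 + (d07>>1));
    dctcoef a5 = d07 - d34 - (d25 + (d25>>1));
    dctcoef a6 = d07 + d34 - (d16 + (d16>>1));
    dctcoef a7 = d16 - d25 + (d34 + (d34>>1));
    d[0] = a0 + a1;       d[4] = a0 - a1;
    d[2] = a2 + (a3>>1);  d[6] = (a2>>1) - a3;
    d[1] = a4 + (a7>>2);  d[7] = (a4>>2) - a7;
    d[3] = a5 + (a6>>2);  d[5] = a6 - (a5>>2);
}

The macro stores dst0, dst4 and dst6 to the three memory operands as soon as they are final, which is what leaves enough registers for the odd-half terms on register-starved targets.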
+%ifdef HIGH_BIT_DEPTH
+
+%macro SUB8x8_DCT8 0
+cglobal sub8x8_dct8, 3,3,8
+global current_function %+ .skip_prologue
+.skip_prologue:
+ LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
+ LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
+
+ DCT8_1D w, 0,1,2,3,4,5,6,7, r0,r0+0x10,r0+0x50
+ mova m0, [r0]
+
+ mova [r0+0x30], m5
+ mova [r0+0x70], m7
+ TRANSPOSE4x4W 0,1,2,3,4
+ WIDEN_SXWD 0,4
+ WIDEN_SXWD 1,5
+ WIDEN_SXWD 2,6
+ WIDEN_SXWD 3,7
+ DCT8_1D d, 0,4,1,5,2,6,3,7, r0,r0+0x80,r0+0xC0
+ mova [r0+0x20], m4
+ mova [r0+0x40], m1
+ mova [r0+0x60], m5
+ mova [r0+0xA0], m6
+ mova [r0+0xE0], m7
+ mova m4, [r0+0x10]
+ mova m5, [r0+0x30]
+ mova m6, [r0+0x50]
+ mova m7, [r0+0x70]
-; in: m0..m7
-; out: 0,4,6 in mem, rest in regs
-%macro DCT8_1D 9
- SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07
- SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16
- SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25
- SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34
- SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2
- SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3
- SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4
- mova [%9+0x00], m%6
- mova [%9+0x40], m%5
- psraw m%6, m%7, 1 ; a3>>1
- paddw m%6, m%8 ; a2 + (a3>>1)
- psraw m%8, 1 ; a2>>1
- psubw m%8, m%7 ; (a2>>1) - a3
- mova [%9+0x60], m%8
- psraw m%5, m%3, 1
- paddw m%5, m%3 ; d25+(d25>>1)
- psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
- psubw m%7, m%5
- psraw m%5, m%2, 1
- paddw m%5, m%2 ; d16+(d16>>1)
- paddw m%8, m%1, m%4
- psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
- psraw m%5, m%1, 1
- paddw m%5, m%1 ; d07+(d07>>1)
- paddw m%5, m%2
- paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
- psraw m%1, m%4, 1
- paddw m%1, m%4 ; d34+(d34>>1)
- paddw m%1, m%2
- psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
- psraw m%4, m%1, 2
- paddw m%4, m%5 ; a4 + (a7>>2)
- psraw m%3, m%8, 2
- paddw m%3, m%7 ; a5 + (a6>>2)
- psraw m%5, 2
- psraw m%7, 2
- psubw m%5, m%1 ; (a4>>2) - a7
- psubw m%8, m%7 ; a6 - (a5>>2)
- SWAP %2, %4, %3, %6, %8, %5
+ TRANSPOSE4x4W 4,5,6,7,0
+ WIDEN_SXWD 4,0
+ WIDEN_SXWD 5,1
+ WIDEN_SXWD 6,2
+ WIDEN_SXWD 7,3
+ DCT8_1D d,4,0,5,1,6,2,7,3, r0+0x10,r0+0x90,r0+0xD0
+ mova [r0+0x30], m0
+ mova [r0+0x50], m5
+ mova [r0+0x70], m1
+ mova [r0+0xB0], m2
+ mova [r0+0xF0], m3
+ ret
%endmacro
+INIT_XMM sse2
+SUB8x8_DCT8
+INIT_XMM sse4
+SUB8x8_DCT8
+INIT_XMM avx
+SUB8x8_DCT8
+
+%else ; !HIGH_BIT_DEPTH
+
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
ret
cglobal dct8_mmx
- DCT8_1D 0,1,2,3,4,5,6,7,r0
+ DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
SAVE_MM_PERMUTATION
ret
-%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
- %xdefine %%base %1
- %rep %0/2
- %xdefine %%tmp m%2
- %rotate %0/2
- mova [%%base + %2*16], %%tmp
- %rotate 1-%0/2
- %endrep
-%endmacro
-
-%macro UNSPILL_SHUFFLE 3-*
- %xdefine %%base %1
- %rep %0/2
- %xdefine %%tmp m%2
- %rotate %0/2
- mova %%tmp, [%%base + %2*16]
- %rotate 1-%0/2
- %endrep
-%endmacro
-
-%macro SPILL 2+ ; assume offsets are the same as reg numbers
- SPILL_SHUFFLE %1, %2, %2
-%endmacro
-
-%macro UNSPILL 2+
- UNSPILL_SHUFFLE %1, %2, %2
-%endmacro
-
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%endif
- DCT8_1D 0,1,2,3,4,5,6,7,r0
+ DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
- DCT8_1D 0,1,2,3,4,5,6,7,r0
+ DCT8_1D w,0,1,2,3,4,5,6,7,r0,r0+0x40,r0+0x60
SPILL r0, 1,2,3,5,7
ret
%endmacro
cextern pw_32
cextern hsub_mul
-%ifndef HIGH_BIT_DEPTH
+; in: size, m0..m7, temp, temp
+; out: m0..m7
+%macro DCT8_1D 11
+ SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
+ SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
+ SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
+ SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
-%macro DCT8_1D 10
- SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
- SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
- SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
- SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07
+ SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
+ SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
- SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
- SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2
+ psra%1 m%10, m%2, 1
+ padd%1 m%10, m%2
+ padd%1 m%10, m%3
+ padd%1 m%10, m%4 ; %10=a4
- psraw m%9, m%1, 1
- paddw m%9, m%1
- paddw m%9, m%2
- paddw m%9, m%3 ; %9=a4
-
- psraw m%10, m%4, 1
- paddw m%10, m%4
- paddw m%10, m%2
- psubw m%10, m%3 ; %10=a7
-
- SUMSUB_BA w, %4, %1
- psubw m%1, m%3
- psubw m%4, m%2
- psraw m%3, 1
- psraw m%2, 1
- psubw m%1, m%3 ; %1=a5
- psubw m%4, m%2 ; %4=a6
-
- psraw m%2, m%10, 2
- paddw m%2, m%9 ; %2=b1
- psraw m%9, 2
- psubw m%9, m%10 ; %9=b7
-
- SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
-
- psraw m%3, m%7, 1
- paddw m%3, m%8 ; %3=b2
- psraw m%8, 1
- psubw m%8, m%7 ; %8=b6
+ psra%1 m%11, m%5, 1
+ padd%1 m%11, m%5
+ padd%1 m%11, m%3
+ psub%1 m%11, m%4 ; %11=a7
+
+ SUMSUB_BA %1, %5, %2
+ psub%1 m%2, m%4
+ psub%1 m%5, m%3
+ psra%1 m%4, 1
+ psra%1 m%3, 1
+ psub%1 m%2, m%4 ; %2=a5
+ psub%1 m%5, m%3 ; %5=a6
+
+ psra%1 m%3, m%11, 2
+ padd%1 m%3, m%10 ; %3=b1
+ psra%1 m%10, 2
+ psub%1 m%10, m%11 ; %10=b7
+
+ SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
+
+ psra%1 m%4, m%8, 1
+ padd%1 m%4, m%9 ; %4=b2
+ psra%1 m%9, 1
+ psub%1 m%9, m%8 ; %9=b6
- psraw m%7, m%4, 2
- paddw m%7, m%1 ; %7=b3
- psraw m%1, 2
- psubw m%4, m%1 ; %4=b5
+ psra%1 m%8, m%5, 2
+ padd%1 m%8, m%2 ; %8=b3
+ psra%1 m%2, 2
+ psub%1 m%5, m%2 ; %5=b5
- SWAP %1, %6, %4, %7, %8, %9
+ SWAP %2, %7, %5, %8, %9, %10
%endmacro
+%ifdef HIGH_BIT_DEPTH
+
+%macro SUB8x8_DCT8 0
+cglobal sub8x8_dct8, 3,3,14
+%ifdef WIN64
+ call .skip_prologue
+ RET
+%endif
+global current_function %+ .skip_prologue
+.skip_prologue:
+ LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
+ LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
+
+ DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
+
+ TRANSPOSE4x4W 0,1,2,3,8
+ WIDEN_SXWD 0,8
+ WIDEN_SXWD 1,9
+ WIDEN_SXWD 2,10
+ WIDEN_SXWD 3,11
+ DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
+ mova [r0+0x00], m0
+ mova [r0+0x20], m8
+ mova [r0+0x40], m1
+ mova [r0+0x60], m9
+ mova [r0+0x80], m2
+ mova [r0+0xA0], m10
+ mova [r0+0xC0], m3
+ mova [r0+0xE0], m11
+
+ TRANSPOSE4x4W 4,5,6,7,0
+ WIDEN_SXWD 4,0
+ WIDEN_SXWD 5,1
+ WIDEN_SXWD 6,2
+ WIDEN_SXWD 7,3
+ DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
+ mova [r0+0x10], m4
+ mova [r0+0x30], m0
+ mova [r0+0x50], m5
+ mova [r0+0x70], m1
+ mova [r0+0x90], m6
+ mova [r0+0xB0], m2
+ mova [r0+0xD0], m7
+ mova [r0+0xF0], m3
+ ret
+%endmacro
+
+INIT_XMM sse2
+SUB8x8_DCT8
+INIT_XMM sse4
+SUB8x8_DCT8
+INIT_XMM avx
+SUB8x8_DCT8
+
+%else ; !HIGH_BIT_DEPTH
+
%macro IDCT8_1D 10
SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
%endmacro
%macro DCT_SUB8 0
-cglobal sub8x8_dct, 3,3,11
+cglobal sub8x8_dct, 3,3,10
add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
mova m7, [hsub_mul]
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- DCT8_1D 0,1,2,3,4,5,6,7,8,9
+ DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11
+%macro SUB_NxN_DCT 7
+cglobal %1, 3,3,%7
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
%ifdef HIGH_BIT_DEPTH
INIT_MMX
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern sub8x8_dct8_sse4.skip_prologue
+cextern sub8x8_dct8_avx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
+SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
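These wrappers assemble sub16x16_dct8 from four sub8x8_dct8 calls, one per quadrant of the macroblock, in roughly the same way as the C fallback. A sketch with illustrative names (offsets are in pixels; dctcoef/pixel as in the headers):

/* Sketch of the 16x16 composition: one 8x8 DCT per quadrant. */
static void sub16x16_dct8_ref( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8_ref( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8_ref( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8_ref( dct[2], &pix1[8*FENC_STRIDE],   &pix2[8*FDEC_STRIDE] );
    sub8x8_dct8_ref( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}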
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
INIT_MMX
-SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
-SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
-SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
+SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
-SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0
-SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_avx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_avx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 );
+void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
+void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7
pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
+pw_m7: times 8 dw -7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
-INIT_MMX
-cglobal predict_8x8c_p_core_mmx2, 1,2
+%ifndef HIGH_BIT_DEPTH
+%macro PREDICT_CHROMA_P_MMX 1
+cglobal predict_8x%1c_p_core, 1,2
LOAD_PLANE_ARGS
- movq mm1, mm2
- pmullw mm2, [pw_3210]
- psllw mm1, 2
- paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
-
- mov r1d, 8
+ movq m1, m2
+ pmullw m2, [pw_3210]
+ psllw m1, 2
+ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
+ paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
+ mov r1d, %1
ALIGN 4
.loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0], mm5
+ movq m5, m0
+ movq m6, m1
+ psraw m5, 5
+ psraw m6, 5
+ packuswb m5, m6
+ movq [r0], m5
- paddsw mm0, mm4
- paddsw mm1, mm4
+ paddsw m0, m4
+ paddsw m1, m4
add r0, FDEC_STRIDE
dec r1d
- jg .loop
+ jg .loop
REP_RET
+%endmacro ; PREDICT_CHROMA_P_MMX
+
+INIT_MMX mmx2
+PREDICT_CHROMA_P_MMX 8
+PREDICT_CHROMA_P_MMX 16
+%endif ; !HIGH_BIT_DEPTH
%endif ; !ARCH_X86_64
-%macro PREDICT_8x8C 0
+%macro PREDICT_CHROMA_P_XMM 1
%ifdef HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core, 1,1,7
+cglobal predict_8x%1c_p_core, 1,2,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
SPLATW m2, m2, 0
SPLATW m4, m4, 0
pmullw m2, [pw_43210123] ; b
- pmullw m5, m4, [pw_m3] ; c
+%if %1 == 16
+ pmullw m5, m4, [pw_m7] ; c
+%else
+ pmullw m5, m4, [pw_m3]
+%endif
paddw m5, [pw_16]
- mov r1d, 8
+ mov r1d, %1
.loop:
paddsw m6, m2, m5
paddsw m6, m0
mova [r0], m6
paddw m5, m4
add r0, FDEC_STRIDEB
- dec r1d
+ dec r1d
jg .loop
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core, 1,1
+cglobal predict_8x%1c_p_core, 1,2
movd m0, r1m
movd m2, r2m
movd m4, r3m
paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
paddsw m3, m0, m4
paddsw m4, m4
-call .loop
- add r0, FDEC_STRIDE*4
+ mov r1d, %1/4
.loop:
paddsw m1, m3, m4
paddsw m5, m0, m4
packuswb m5, m1
movq [r0+FDEC_STRIDE*2], m5
movhps [r0+FDEC_STRIDE*3], m5
+ add r0, FDEC_STRIDE*4
+ dec r1d
+ jg .loop
RET
%endif ; HIGH_BIT_DEPTH
-%endmacro
+%endmacro ; PREDICT_CHROMA_P_XMM
INIT_XMM sse2
-PREDICT_8x8C
+PREDICT_CHROMA_P_XMM 8
+PREDICT_CHROMA_P_XMM 16
INIT_XMM avx
-PREDICT_8x8C
+PREDICT_CHROMA_P_XMM 8
+PREDICT_CHROMA_P_XMM 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
PREDICT_16x16_P( avx )
#endif //!HIGH_BIT_DEPTH
+#define PREDICT_8x16C_P_CORE \
+ int H = 0, V = 0;\
+ for( int i = 0; i < 4; i++ )\
+ H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\
+ for( int i = 0; i < 8; i++ )\
+ V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\
+ int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
+ int b = ( 17 * H + 16 ) >> 5;\
+ int c = ( 5 * V + 32 ) >> 6;
+
+#if HIGH_BIT_DEPTH
+#define PREDICT_8x16_P(name)\
+static void x264_predict_8x16c_p_##name( uint16_t *src )\
+{\
+ PREDICT_8x16C_P_CORE \
+ x264_predict_8x16c_p_core_##name( src, a, b, c );\
+}
+
+PREDICT_8x16_P(sse2)
+PREDICT_8x16_P(avx)
+#else
+#define PREDICT_8x16_P(name)\
+static void x264_predict_8x16c_p_##name( uint8_t *src )\
+{\
+ PREDICT_8x16C_P_CORE \
+ int i00 = a -3*b -7*c + 16;\
+ x264_predict_8x16c_p_core_##name( src, i00, b, c );\
+}
+#ifndef ARCH_X86_64
+PREDICT_8x16_P(mmx2)
+#endif
+PREDICT_8x16_P(sse2)
+PREDICT_8x16_P(avx)
+#endif
+
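For reference, a scalar sketch of the full 8x16 chroma plane prediction that the core above parameterizes. The 8-bit asm core receives i00 = a - 3*b - 7*c + 16 (the plane value at (0,0) before the final shift) and steps it by b along x and by c along y; the high bit depth core takes a, b, c directly. PREDICT_8x16C_P_CORE and FDEC_STRIDE come from the surrounding file; the function names here are illustrative:

static inline uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* 8-bit 8x16 chroma planar prediction, written out directly from a, b, c. */
static void predict_8x16c_p_ref( uint8_t *src )
{
    PREDICT_8x16C_P_CORE
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 8; x++ )
            src[x + y*FDEC_STRIDE] = clip_uint8( ( a + b*(x-3) + c*(y-7) + 16 ) >> 5 );
}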
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
static void x264_predict_16x16_p_sse2( uint16_t *src )
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
#else
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
if( !(cpu&X264_CPU_MMX2) )
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
+#ifndef ARCH_X86_64
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_mmx2;
+#endif
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
#endif // HIGH_BIT_DEPTH
}
void x264_predict_8x16c_h_mmx2( pixel *src );
void x264_predict_8x16c_h_sse2( pixel *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
+void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
%endif
%endmacro
+%macro WIDEN_SXWD 2
+ punpckhwd m%2, m%1
+ psrad m%2, 16
+%if cpuflag(sse4)
+ pmovsxwd m%1, m%1
+%else
+ punpcklwd m%1, m%1
+ psrad m%1, 16
+%endif
+%endmacro
+
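WIDEN_SXWD splits the eight signed words of m%1 into two registers of four sign-extended dwords (high half into m%2, low half left in m%1); the SSE4 path does the low half in a single pmovsxwd. A rough intrinsics equivalent, with illustrative names:

#include <emmintrin.h>   /* SSE2 */
#ifdef __SSE4_1__
#include <smmintrin.h>   /* _mm_cvtepi16_epi32 (pmovsxwd) */
#endif

/* Sign-extend the 8 int16 lanes of x into two vectors of 4 int32 lanes. */
static void widen_sxwd( __m128i x, __m128i *lo, __m128i *hi )
{
    *hi = _mm_srai_epi32( _mm_unpackhi_epi16( x, x ), 16 ); /* punpckhwd + psrad 16 */
#ifdef __SSE4_1__
    *lo = _mm_cvtepi16_epi32( x );                          /* pmovsxwd */
#else
    *lo = _mm_srai_epi32( _mm_unpacklo_epi16( x, x ), 16 ); /* punpcklwd + psrad 16 */
#endif
}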
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
pabsw %1, %2
%endmacro
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
-%if cpuflag(ssse3)
+%if BIT_DEPTH == 8 && cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
%else
- LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
- LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
- LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
- LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
+ LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
+ LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
+ LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
+ LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
+ for( int k = 0; k < size; k++ )\
+ printf( "%d ", ((dctcoef*)t1)[k] );\
+ printf("\n");\
+ for( int k = 0; k < size; k++ )\
+ printf( "%d ", ((dctcoef*)t2)[k] );\
+ printf("\n");\
break; \
} \
call_c( dct_c.name, t1, enc, dec ); \