1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2016 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <henrik@gramner.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 dw %4, %2, %6, %2, %4, %2, %6, %2
; Six rows of 8x8 dequantization scale factors, one row per (i_qp % 6)
; value 0..5, expanded by the DQM8 macro (defined outside this view).
; NOTE(review): presumably emitted under the dequant8_scale label that
; dequant_8x8_flat16 indexes via "lea r1, [dequant%1_scale + t2]" later
; in this file -- label itself is not visible here, confirm.
54 DQM8 20, 18, 32, 19, 25, 24
55 DQM8 22, 19, 35, 21, 28, 26
56 DQM8 26, 23, 42, 24, 33, 31
57 DQM8 28, 25, 45, 26, 35, 33
58 DQM8 32, 28, 51, 30, 40, 38
59 DQM8 36, 32, 58, 34, 46, 43
; 256-entry byte table of precomputed decimate scores.
; NOTE(review): the label for this table (orig line 61) is elided from
; this view; presumably decimate_mask_table4, which decimate_score%1
; below reads via "movzx eax, byte [mask_table + rcx]" -- confirm.
; Each entry looks like the summed per-run decimate cost for one 8-bit
; nonzero-coefficient mask; verify against the C reference table.
62 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
63 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
64 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
65 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
66 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
67 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
68 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
69 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
70 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks for the 2x2 chroma-DC Hadamard in optimize_chroma_2x2_dc.
; The plain (SSSE3) masks use 1/-1 words for PSIGNW (keep / negate).
; The _mmx variants use 0/-1 because the pre-SSSE3 PSIGN emulation
; requires each mask word to be exactly 0 or -1 (see the note near the
; "psrad m1, 31" line later in this file).
72 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
73 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
74 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
75 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
79 %macro DCT_COEF_SHUFFLE 8
84 %assign y y<<((~(y>>7))&1)
93 DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
107 cextern deinterleave_shufd
110 %macro QUANT_DC_START 2
112 movd xm%2, r2m ; bias
114 vpbroadcastdct m%1, xm%1
115 vpbroadcastdct m%2, xm%2
119 %elif cpuflag(sse4) ; ssse3, but not faster on conroe
145 cmp ecx, (1<<mmsize)-1
157 %macro QUANT_ONE_DC 4
181 %macro QUANT_TWO_DC 4
200 QUANT_ONE_DC %1, %2, %3, %4
201 QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
205 %macro QUANT_ONE_AC_MMX 5
224 %macro QUANT_TWO_AC 5
231 paddd m3, [%3+mmsize]
233 pmulld m3, [%2+mmsize]
243 QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
244 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
248 ;-----------------------------------------------------------------------------
249 ; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
250 ;-----------------------------------------------------------------------------
252 cglobal quant_%1x%2_dc, 3,3,8
254 %if %1*%2 <= mmsize/4
255 QUANT_ONE_DC r0, m6, m7, 0
258 %rep %1*%2/(mmsize/2)
259 QUANT_TWO_DC r0+x, m6, m7, x
267 ;-----------------------------------------------------------------------------
268 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
269 ;-----------------------------------------------------------------------------
271 cglobal quant_%1x%2, 3,3,8
273 %rep %1*%2/(mmsize/2)
274 QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
282 QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
283 QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
287 cglobal quant_4x4x4, 3,3,8
295 packssdw m5, m6 ; AAAA BBBB CCCC DDDD
330 cglobal quant_4x4x4, 3,3,6
331 QUANT_TWO_AC r0, r1, r2, 0, 4
332 QUANT_TWO_AC r0+64, r1, r2, 0, 5
335 QUANT_TWO_AC r0, r1, r2, 0, 5
336 QUANT_TWO_AC r0+64, r1, r2, 0, 1
348 %endif ; HIGH_BIT_DEPTH
350 %if HIGH_BIT_DEPTH == 0
352 ;;; %1 (m64) dct[y][x]
353 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
354 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
355 mova m1, %1 ; load dct coeffs
357 paddusw m0, %3 ; round
358 pmulhuw m0, %2 ; divide
359 PSIGNW m0, m1 ; restore sign
378 ACCUM por, %8, 2, %7+mmsize
381 ;-----------------------------------------------------------------------------
382 ; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
383 ;-----------------------------------------------------------------------------
384 %macro QUANT_DC 2-3 0
388 QUANT_ONE [r0], m2, m3, 0, 5
393 QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
401 ;-----------------------------------------------------------------------------
402 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
403 ;-----------------------------------------------------------------------------
407 QUANT_ONE [r0], [r1], [r2], 0, 5
411 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
421 QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
423 QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
425 QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
431 cglobal quant_4x4x4, 3,3,7
433 mova m8, [r1+mmsize*0]
434 mova m9, [r1+mmsize*1]
435 mova m10, [r2+mmsize*0]
436 mova m11, [r2+mmsize*1]
444 packssdw m4, m5 ; AAAA BBBB CCCC DDDD
453 QUANT_DC quant_2x2_dc, 1
454 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
455 QUANT_DC quant_4x4_dc, 4
457 QUANT_AC quant_4x4, 4
458 QUANT_AC quant_8x8, 16
462 QUANT_DC quant_4x4_dc, 2, 7
463 QUANT_AC quant_4x4, 2
464 QUANT_AC quant_8x8, 8
468 QUANT_DC quant_4x4_dc, 2, 7
469 QUANT_AC quant_4x4, 2
470 QUANT_AC quant_8x8, 8
474 QUANT_DC quant_2x2_dc, 1
477 ; Not faster on Conroe, so only used in SSE4 versions
478 QUANT_DC quant_4x4_dc, 2, 7
479 QUANT_AC quant_4x4, 2
480 QUANT_AC quant_8x8, 8
483 QUANT_AC quant_4x4, 1
484 QUANT_AC quant_8x8, 4
485 QUANT_DC quant_4x4_dc, 1, 6
488 cglobal quant_4x4x4, 3,3,6
491 QUANT_ONE [r0+ 0], m2, m3, 0, 4
492 QUANT_ONE [r0+32], m2, m3, 0, 5
494 QUANT_ONE [r0+64], m2, m3, 0, 5
495 QUANT_ONE [r0+96], m2, m3, 0, 1
506 %endif ; !HIGH_BIT_DEPTH
510 ;=============================================================================
512 ;=============================================================================
516 ;;; %2,%3 dequant_mf[i_mf][y][x]
541 ;;; %2,%3 dequant_mf[i_mf][y][x]
548 pmadcswd m0, m0, %2, m3
549 pmadcswd m1, m1, %3, m3
563 pmadcswd m0, m0, %2, m3
564 pmadcswd m1, m1, %3, m3
575 %macro DEQUANT_LOOP 3
579 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
580 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
586 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
588 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
593 %macro DEQUANT16_FLAT 2-5
611 DECLARE_REG_TMP 6,3,2
613 DECLARE_REG_TMP 2,0,1
616 %macro DEQUANT_START 2
619 shr t0d, 8 ; i_qbits = i_qp / 6
622 sub t2d, t1d ; i_mf = i_qp % 6
625 add r1, t2 ; dequant_mf[i_mf]
627 add r1, r1mp ; dequant_mf[i_mf]
631 jl .rshift32 ; negative qbits => rightshift
634 ;-----------------------------------------------------------------------------
635 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
636 ;-----------------------------------------------------------------------------
638 cglobal dequant_%1x%1, 0,3,6
640 DEQUANT_START %2+2, %2
644 DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
653 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
655 %if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
656 cglobal dequant_%1x%1_flat16, 0,3
660 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
664 shr t0d, 8 ; i_qbits = i_qp / 6
667 sub t2d, t1d ; i_mf = i_qp % 6
670 lea r1, [dequant%1_scale]
673 lea r1, [dequant%1_scale + t2]
679 DEQUANT16_FLAT [r1], 0, 16
680 DEQUANT16_FLAT [r1+8], 8, 24
682 DEQUANT16_FLAT [r1], 0, 16
684 vbroadcasti128 m0, [r1]
690 DEQUANT16_FLAT [r1], 0, 8, 64, 72
691 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
692 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
693 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
695 DEQUANT16_FLAT [r1], 0, 64
696 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
697 DEQUANT16_FLAT [r1+32], 32, 96
703 pmullw m0, m1, [r0+ 0]
704 pmullw m3, m2, [r0+32]
705 pmullw m4, m1, [r0+64]
706 pmullw m5, m2, [r0+96]
713 %endif ; !HIGH_BIT_DEPTH && !AVX
747 cglobal dequant_4x4dc, 0,3,6
752 vpbroadcastdct m3, [r1]
760 %rep SIZEOF_PIXEL*32/mmsize
763 %assign %%x %%x+mmsize
770 vpbroadcastdct m2, [r1]
783 %rep SIZEOF_PIXEL*32/mmsize
784 pmadcswd m0, m2, [r0+%%x], m4
787 %assign %%x %%x+mmsize
790 %else ; !HIGH_BIT_DEPTH
796 %rep SIZEOF_PIXEL*32/mmsize
806 %assign %%x %%x+mmsize
808 %endif ; !HIGH_BIT_DEPTH
814 DEQUANT_DC d, pmaddwd
816 DEQUANT_DC d, pmaddwd
818 DEQUANT_DC d, pmaddwd
836 ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
846 ;-----------------------------------------------------------------------------
847 ; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
848 ; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
849 ;-----------------------------------------------------------------------------
851 %macro DEQUANT_2x4_DC 1
853 DECLARE_REG_TMP 6,3,2
854 %define %%args dct, dmf, qp
856 DECLARE_REG_TMP 6,4,3
857 %define %%args dct, dct4x4, dmf, qp
861 DECLARE_REG_TMP 2,0,1
864 cglobal idct_dequant_2x4_%1, 0,3,5, %%args
870 sub t2d, t1d ; qp % 6
871 shl t2d, 6 ; 16 * sizeof(int)
873 imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
877 imul t2d, [t2], -0xffff
882 SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps,
883 packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later.
888 punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7
890 pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
892 pshuflw m3, m3, q1000 ; + + + -
894 punpcklqdq m3, m3 ; + + + - + + + -
911 psubd m0, m4 ; + 1 << (qp/6-1)
925 movifnidn dct4x4q, dct4x4mp
927 movd [dct4x4q+0*64], m0
929 pextrd [dct4x4q+1*64], m0, 1
931 pextrd [dct4x4q-2*64], m0, 2
932 pextrd [dct4x4q-1*64], m0, 3
933 movd [dct4x4q+0*64], m1
934 pextrd [dct4x4q+1*64], m1, 1
935 pextrd [dct4x4q+2*64], m1, 2
936 pextrd [dct4x4q+3*64], m1, 3
940 movd [dct4x4q+1*64], m0
942 movd [dct4x4q-2*64], m2
944 movd [dct4x4q-1*64], m2
945 movd [dct4x4q+0*64], m1
948 movd [dct4x4q+1*64], m1
949 movd [dct4x4q+2*64], m2
951 movd [dct4x4q+3*64], m2
954 PEXTRW [dct4x4q+0*32], m0, 0, eax
955 PEXTRW [dct4x4q+1*32], m0, 2, eax
956 PEXTRW [dct4x4q+2*32], m0, 4, eax
957 PEXTRW [dct4x4q+3*32], m0, 6, eax
959 PEXTRW [dct4x4q+0*32], m1, 0, eax
960 PEXTRW [dct4x4q+1*32], m1, 2, eax
961 PEXTRW [dct4x4q+2*32], m1, 4, eax
962 PEXTRW [dct4x4q+3*32], m1, 6, eax
968 ; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
971 DEQUANT_2x4_DC dconly
974 DEQUANT_2x4_DC dconly
976 ; t4 is eax for return value.
978 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
980 DECLARE_REG_TMP 4,1,2,3,0,5
983 ;-----------------------------------------------------------------------------
984 ; int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
985 ;-----------------------------------------------------------------------------
987 %macro OPTIMIZE_CHROMA_2x2_DC 0
988 cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
999 mova m3, [chroma_dc_dct_mask]
1000 mova m5, [chroma_dc_dmf_mask]
1002 mova m3, [chroma_dc_dct_mask_mmx]
1003 mova m5, [chroma_dc_dmf_mask_mmx]
1006 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
1008 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
1009 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
1010 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
1011 PSIGNW m2, m5 ; + - - + - - + +
1012 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
1013 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
1015 psrad m2, 16 ; + - - +
1019 %if notcpuflag(ssse3)
1020 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
1032 jz .ret ; if the DC coefficients already round to zero, terminate early
1035 movsx t3d, word [t0+2*t1] ; dct[coeff]
1036 pshufd m6, m1, q3333
1037 pshufd m1, m1, q2100 ; move the next element to high dword
1046 psubd m3, m5 ; coeff -= sign
1057 paddd m3, m5 ; coeff += sign
1062 pshufd m2, m2, q1320 ; - + - + / - - + +
1072 movsx t3d, word [t0]
1073 punpcklqdq m2, m2 ; + + + +
1080 %if HIGH_BIT_DEPTH == 0
1082 OPTIMIZE_CHROMA_2x2_DC
1084 OPTIMIZE_CHROMA_2x2_DC
1086 OPTIMIZE_CHROMA_2x2_DC
1088 OPTIMIZE_CHROMA_2x2_DC
1089 %endif ; !HIGH_BIT_DEPTH
1092 ;-----------------------------------------------------------------------------
1093 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
1094 ;-----------------------------------------------------------------------------
1095 %macro DENOISE_DCT 0
1096 cglobal denoise_dct, 4,4,6
1098 movsxdifnidn r3, r3d
1100 mova m2, [r0+r3*4-2*mmsize]
1101 mova m3, [r0+r3*4-1*mmsize]
1104 paddd m4, m0, [r1+r3*4-2*mmsize]
1105 psubd m0, [r2+r3*4-2*mmsize]
1106 mova [r1+r3*4-2*mmsize], m4
1107 paddd m4, m1, [r1+r3*4-1*mmsize]
1108 psubd m1, [r2+r3*4-1*mmsize]
1109 mova [r1+r3*4-1*mmsize], m4
1116 mova [r0+r3*4-2*mmsize], m0
1117 mova [r0+r3*4-1*mmsize], m1
1123 %if ARCH_X86_64 == 0
1136 %else ; !HIGH_BIT_DEPTH
1138 ;-----------------------------------------------------------------------------
1139 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
1140 ;-----------------------------------------------------------------------------
1141 %macro DENOISE_DCT 0
1142 cglobal denoise_dct, 4,4,7
1144 movsxdifnidn r3, r3d
1146 mova m2, [r0+r3*2-2*mmsize]
1147 mova m3, [r0+r3*2-1*mmsize]
1150 psubusw m4, m0, [r2+r3*2-2*mmsize]
1151 psubusw m5, m1, [r2+r3*2-1*mmsize]
1154 mova [r0+r3*2-2*mmsize], m4
1155 mova [r0+r3*2-1*mmsize], m5
1156 punpcklwd m2, m0, m6
1157 punpcklwd m3, m1, m6
1160 paddd m2, [r1+r3*4-4*mmsize]
1161 paddd m0, [r1+r3*4-3*mmsize]
1162 paddd m3, [r1+r3*4-2*mmsize]
1163 paddd m1, [r1+r3*4-1*mmsize]
1164 mova [r1+r3*4-4*mmsize], m2
1165 mova [r1+r3*4-3*mmsize], m0
1166 mova [r1+r3*4-2*mmsize], m3
1167 mova [r1+r3*4-1*mmsize], m1
1173 %if ARCH_X86_64 == 0
1185 cglobal denoise_dct, 4,4,4
1187 movsxdifnidn r3, r3d
1189 mova m1, [r0+r3*2-mmsize]
1191 psubusw m2, m0, [r2+r3*2-mmsize]
1192 vpermq m0, m0, q3120
1194 mova [r0+r3*2-mmsize], m2
1195 punpcklwd m1, m0, m3
1197 paddd m1, [r1+r3*4-2*mmsize]
1198 paddd m0, [r1+r3*4-1*mmsize]
1199 mova [r1+r3*4-2*mmsize], m1
1200 mova [r1+r3*4-1*mmsize], m0
1205 %endif ; !HIGH_BIT_DEPTH
1207 ;-----------------------------------------------------------------------------
1208 ; int decimate_score( dctcoef *dct )
1209 ;-----------------------------------------------------------------------------
1211 %macro DECIMATE_MASK 5
1216 packssdw m0, [%3+16]
1217 packssdw m1, [%3+48]
1218 ABSW2 m0, m1, m0, m1, m3, m4
1220 ABSW m0, [%3+ 0], m3
1221 ABSW m1, [%3+16], m4
1235 packssdw m0, [%3+ 8]
1236 packssdw m1, [%3+24]
1237 packssdw m2, [%3+40]
1238 packssdw m3, [%3+56]
1245 ABSW2 m0, m1, m0, m1, m6, m7
1246 ABSW2 m2, m3, m2, m3, m6, m7
1266 cextern decimate_table4
1267 cextern decimate_table8
1269 %macro DECIMATE4x4 1
1271 cglobal decimate_score%1, 1,3
1273 lea r4, [decimate_table4]
1274 lea r5, [decimate_mask_table4]
1276 %define mask_table r5
1278 %define table decimate_table4
1279 %define mask_table decimate_mask_table4
1281 DECIMATE_MASK edx, eax, r0, [pb_1], ecx
1290 movzx eax, byte [mask_table + rcx]
1299 add al, byte [table + rcx]
1300 add al, byte [mask_table + rdx]
1309 %if ARCH_X86_64 == 0
1321 ; 2x gt1 output, 2x nz output, 1x mask
1322 %macro DECIMATE_MASK64_AVX2 5
1329 pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
1330 pcmpgtb m3, m1, %5 ; we can save latency by doing them here
1335 vpermq m0, m0, q3120
1336 vpermq m1, m1, q3120
1344 %macro DECIMATE8x8 0
1347 cglobal decimate_score64, 1,5
1349 lea r4, [decimate_table8]
1352 %define table decimate_table8
1356 DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
1362 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
1365 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
1368 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
1372 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
1384 add al, byte [table + rcx]
1397 cglobal decimate_score64, 1,6
1399 cglobal decimate_score64, 1,5
1403 DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
1409 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
1412 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
1415 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
1417 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
1434 add al, byte [decimate_table8 + ecx]
1465 %if ARCH_X86_64 == 0
1476 ;-----------------------------------------------------------------------------
1477 ; int coeff_last( dctcoef *dct )
1478 ;-----------------------------------------------------------------------------
1499 %macro LAST_MASK 3-4
1502 packssdw mm0, [%3+8]
1507 movdqa xmm0, [%3+ 0]
1509 packssdw xmm0, [%3+16]
1512 movdqa xmm1, [%3+32]
1513 packssdw xmm0, [%3+16]
1514 packssdw xmm1, [%3+48]
1522 packssdw mm0, [%3+ 8]
1523 packssdw mm1, [%3+24]
1530 packssdw mm0, [%3+ 8]
1531 packssdw mm1, [%3+24]
1534 packssdw mm3, [%3+40]
1535 packssdw mm4, [%3+56]
1547 %macro COEFF_LAST4 0
1548 cglobal coeff_last4, 1,3
1550 LAST_MASK 4, r1d, r0
1559 INIT_MMX mmx2, lzcnt
1562 %macro COEFF_LAST8 0
1563 cglobal coeff_last8, 1,3
1565 LAST_MASK 8, r1d, r0
1576 %if ARCH_X86_64 == 0
1582 INIT_XMM sse2, lzcnt
1585 %else ; !HIGH_BIT_DEPTH
1586 %macro LAST_MASK 3-4
1592 packsswb mm0, [%3+ 8]
1597 movdqa xmm0, [%3+ 0]
1598 packsswb xmm0, [%3+16]
1604 packsswb mm0, [%3+ 8]
1605 packsswb mm1, [%3+24]
1615 %macro COEFF_LAST48 0
1617 cglobal coeff_last4, 1,1
1622 cglobal coeff_last4, 0,3
1631 lea eax, [eax+ecx*2]
1635 cglobal coeff_last8, 1,3
1637 LAST_MASK 8, r1d, r0, r2d
1645 INIT_MMX mmx2, lzcnt
1647 %endif ; HIGH_BIT_DEPTH
1650 cglobal coeff_last15, 1,3
1652 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
1658 cglobal coeff_last16, 1,3
1660 LAST_MASK 16, r1d, r0, r2d
1665 %if ARCH_X86_64 == 0
1666 cglobal coeff_last64, 1, 4-mmsize/16
1668 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
1669 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
1674 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
1675 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
1686 cglobal coeff_last64, 1,3
1688 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
1689 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
1692 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
1693 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
1704 %if ARCH_X86_64 == 0
1710 INIT_XMM sse2, lzcnt
1713 %macro LAST_MASK_AVX2 2
1716 packssdw m0, [%2+32]
1718 packssdw m1, [%2+96]
1720 mova m1, [deinterleave_shufd]
1724 packsswb m0, [%2+32]
1725 vpermq m0, m0, q3120
1731 %if ARCH_X86_64 == 0
1733 cglobal coeff_last64, 1,2
1735 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
1738 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
1748 cglobal coeff_last64, 1,3
1750 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
1751 LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
1759 ;-----------------------------------------------------------------------------
1760 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1761 ;-----------------------------------------------------------------------------
1770 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1772 DECLARE_REG_TMP 3,1,2,0,4,5,6
1774 DECLARE_REG_TMP 0,1,2,3,4,5,6
1776 DECLARE_REG_TMP 6,3,2,1,4,5,0
1779 %macro COEFF_LEVELRUN 1
1780 cglobal coeff_level_run%1,0,7
1785 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
1794 mov [t1+levelrun.mask], t5d
1797 LZCOUNT t3d, t5d, 0x1f
1802 mov [t1+levelrun.last], t4d
1804 LZCOUNT t3d, t5d, 0x1f
1813 mov [t1+t6*4+levelrun.level], t2d
1815 mov [t1+t6*2+levelrun.level], t2w
1824 %if ARCH_X86_64 == 0
1836 INIT_XMM sse2, lzcnt
1842 INIT_MMX mmx2, lzcnt
1846 ; Similar to the one above, but saves the DCT
1847 ; coefficients in m0/m1 so we don't have to load
1849 %macro LAST_MASK_LUT 3
1857 packsswb xm2, xm0, xm1
1859 vinserti128 m0, m0, xm1, 1
1866 %macro COEFF_LEVELRUN_LUT 1
1867 cglobal coeff_level_run%1,2,4+(%1/9)
1870 %define GLOBAL +r5-$$
1874 LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
1883 mov [r1+levelrun.mask], eax
1892 movzx r4d, ah ; first 8 bits
1895 movzx r2d, al ; second 8 bits
1896 shl eax, 32-%1-(%1&1)
1897 LZCOUNT eax, eax, 0x1f
1900 mov [r1+levelrun.last], r3d
1901 ; Here we abuse pshufb, combined with a lookup table, to do a gather
1902 ; operation based on a bitmask. For example:
1904 ; dct 15-8 (input): 0 0 4 0 0 -2 1 0
1905 ; dct 7-0 (input): 0 0 -1 0 0 0 0 15
1906 ; bitmask 1: 0 0 1 0 0 1 1 0
1907 ; bitmask 2: 0 0 1 0 0 0 0 1
1908 ; gather 15-8: 4 -2 1 __ __ __ __ __
1909 ; gather 7-0: -1 15 __ __ __ __ __ __
1910 ; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
1912 ; The overlapping, dependent stores almost surely cause a mess of
1913 ; forwarding issues, but it's still enormously faster.
1915 movzx eax, byte [popcnt_table+r4 GLOBAL]
1916 movzx r3d, byte [popcnt_table+r2 GLOBAL]
1918 movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
1919 movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
1921 ; Storing 8 bytes of shuffle constant and converting it (unpack + or)
1922 ; is neutral to slightly faster in local speed measurements, but it
1923 ; cuts the table size in half, which is surely a big cache win.
1930 mova [r1+levelrun.level], m1
1931 ; This obnoxious unaligned store messes with store forwarding and
1932 ; stalls the CPU to no end, but merging the two registers before
1933 ; storing requires a variable 128-bit shift. Emulating this does
1934 ; work, but requires a lot of ops and the gain is tiny and
1935 ; inconsistent, so we'll err on the side of fewer instructions.
1936 movu [r1+rax*2+levelrun.level], m0
1938 movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
1939 vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
1943 vextracti128 [r1+levelrun.level], m0, 1
1944 movu [r1+rax*2+levelrun.level], xm0
1948 movzx eax, byte [popcnt_table+r2 GLOBAL]
1949 movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
1953 mova [r1+levelrun.level], m0
1958 %if HIGH_BIT_DEPTH==0
1960 COEFF_LEVELRUN_LUT 4
1962 COEFF_LEVELRUN_LUT 8
1963 COEFF_LEVELRUN_LUT 15
1964 COEFF_LEVELRUN_LUT 16
1965 INIT_MMX ssse3, lzcnt
1966 COEFF_LEVELRUN_LUT 4
1967 INIT_XMM ssse3, lzcnt
1968 COEFF_LEVELRUN_LUT 8
1969 COEFF_LEVELRUN_LUT 15
1970 COEFF_LEVELRUN_LUT 16
1971 INIT_XMM avx2, lzcnt
1972 COEFF_LEVELRUN_LUT 15
1973 COEFF_LEVELRUN_LUT 16