1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2016 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <henrik@gramner.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; NOTE(review): this excerpt is fragmentary -- interior lines of the original
; file are elided.  The dw rows below are bodies of the dequant scale-table
; macros: the first row's %1..%3 pattern belongs to the 4x4 table macro and
; the following four rows (%1..%6) to the 8x8 table macro (DQM8, invoked
; below).  The %macro headers are not visible here -- confirm in full file.
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 dw %4, %2, %6, %2, %4, %2, %6, %2
; Six DQM8 expansions, one per qp%6 residue (0..5).  The scalars match the
; standard H.264 8x8 dequantization scaling values, increasing with qp%6.
54 DQM8 20, 18, 32, 19, 25, 24
55 DQM8 22, 19, 35, 21, 28, 26
56 DQM8 26, 23, 42, 24, 33, 31
57 DQM8 28, 25, 45, 26, 35, 33
58 DQM8 32, 28, 51, 30, 40, 38
59 DQM8 36, 32, 58, 34, 46, 43
; Byte lookup table; the label naming it is elided from this excerpt.
; Entries look like precomputed per-bitmask scores -- presumably the
; decimate mask/score table used by decimate_score below.  TODO(review):
; confirm label and indexing against the full file before relying on this.
62 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
63 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
64 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
65 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
66 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
67 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
68 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
69 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
70 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks for optimize_chroma_2x2_dc (loaded into m3/m5 further down).
; The *_mmx variants use 0/-1 words, as needed by the pre-SSSE3 PSIGNW
; emulation; the plain variants use 1/-1 multiplier words for the real
; ssse3 psignw path (selection presumably gated on cpuflag(ssse3)).
72 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
73 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
74 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
75 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
79 %macro DCT_COEF_SHUFFLE 8
; NOTE(review): macro body is mostly elided in this excerpt.  The one visible
; step doubles y only while its top bit (bit 7) is clear, i.e. shifts the
; shuffle index left while it still fits in a byte -- confirm against the
; full file before editing.
84 %assign y y<<((~(y>>7))&1)
93 DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
; External shuffle constant defined elsewhere in the project.
107 cextern deinterleave_shufd
110 %macro QUANT_DC_START 2
; Load the scalar mf/bias arguments and broadcast each to all SIMD lanes
; (m%1 = mf, m%2 = bias).  Intermediate lines are elided in this excerpt.
112 movd xm%2, r2m ; bias
114 vpbroadcastdct m%1, xm%1
115 vpbroadcastdct m%2, xm%2
119 %elif cpuflag(sse4) ; ssse3, but not faster on conroe
; NOTE(review): the cmp below appears to belong to a different (elided)
; macro that tests whether all coefficient lanes were nonzero.
145 cmp ecx, (1<<mmsize)-1
; Quantization worker macros.  Bodies are largely elided in this excerpt;
; presumably they implement (|dct| * mf + bias) >> shift with the sign
; restored afterwards -- confirm against the full file.
157 %macro QUANT_ONE_DC 4
181 %macro QUANT_TWO_DC 4
; TWO_DC variant processes two registers' worth by running ONE_DC twice.
200 QUANT_ONE_DC %1, %2, %3, %4
201 QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
205 %macro QUANT_ONE_AC_MMX 5
224 %macro QUANT_TWO_AC 5
; AC path: per-coefficient mf (%2) and bias (%3) arrays rather than a
; broadcast scalar; pmulld implies the >=sse4 32-bit-coefficient path.
231 paddd m3, [%3+mmsize]
233 pmulld m3, [%2+mmsize]
243 QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
244 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
248 ;-----------------------------------------------------------------------------
249 ; int quant_2x2( int32_t dct[M*N], int mf, int bias )
250 ;-----------------------------------------------------------------------------
; High-bit-depth (int32 coefficient) quant entry points; bodies are
; fragmentary in this excerpt.  Returns nonzero if any coefficient survived
; quantization (per the int return in the prototypes above) -- confirm.
252 cglobal quant_%1x%2_dc, 3,3,8
254 %if %1*%2 <= mmsize/4
; Whole block fits in one register: single ONE_DC; otherwise loop TWO_DC.
255 QUANT_ONE_DC r0, m6, m7, 0
258 %rep %1*%2/(mmsize/2)
259 QUANT_TWO_DC r0+x, m6, m7, x
267 ;-----------------------------------------------------------------------------
268 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
269 ;-----------------------------------------------------------------------------
271 cglobal quant_%1x%2, 3,3,8
273 %rep %1*%2/(mmsize/2)
274 QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
282 QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
283 QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
; quant_4x4x4: quantizes four 4x4 blocks; the packssdw merges the four
; per-block nonzero flags into one register (comment below is original).
287 cglobal quant_4x4x4, 3,3,8
295 packssdw m5, m6 ; AAAA BBBB CCCC DDDD
330 cglobal quant_4x4x4, 3,3,6
331 QUANT_TWO_AC r0, r1, r2, 0, 4
332 QUANT_TWO_AC r0+64, r1, r2, 0, 5
335 QUANT_TWO_AC r0, r1, r2, 0, 5
336 QUANT_TWO_AC r0+64, r1, r2, 0, 1
348 %endif ; HIGH_BIT_DEPTH
350 %if HIGH_BIT_DEPTH == 0
; 8-bit-depth (int16 coefficient) quant core.  Parameter docs below are
; original; the ABSW step that fills m0 with |dct| is elided in this excerpt.
352 ;;; %1 (m64) dct[y][x]
353 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
354 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
355 mova m1, %1 ; load dct coeffs
357 paddusw m0, %3 ; round
358 pmulhuw m0, %2 ; divide
359 PSIGNW m0, m1 ; restore sign
; Accumulate nonzero flags across blocks (por into reg %8).
378 ACCUM por, %8, 2, %7+mmsize
381 ;-----------------------------------------------------------------------------
382 ; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
383 ;-----------------------------------------------------------------------------
384 %macro QUANT_DC 2-3 0
388 QUANT_ONE [r0], m2, m3, 0, 5
393 QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
401 ;-----------------------------------------------------------------------------
402 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
403 ;-----------------------------------------------------------------------------
; AC quant for 16-bit coefficients; loop structure elided in this excerpt.
407 QUANT_ONE [r0], [r1], [r2], 0, 5
411 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
; x86_64 path keeps mf/bias cached in m8-m11 (loaded once, see below);
; the 32-bit path reloads them from memory each iteration.
421 QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
423 QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
425 QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
431 cglobal quant_4x4x4, 3,3,7
433 mova m8, [r1+mmsize*0]
434 mova m9, [r1+mmsize*1]
435 mova m10, [r2+mmsize*0]
436 mova m11, [r2+mmsize*1]
444 packssdw m4, m5 ; AAAA BBBB CCCC DDDD
; Per-ISA instantiations of the quant macros.  The INIT_MMX/INIT_XMM
; cpu-dispatch lines between groups are elided in this excerpt.
453 QUANT_DC quant_2x2_dc, 1
454 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
455 QUANT_DC quant_4x4_dc, 4
457 QUANT_AC quant_4x4, 4
458 QUANT_AC quant_8x8, 16
462 QUANT_DC quant_4x4_dc, 2, 7
463 QUANT_AC quant_4x4, 2
464 QUANT_AC quant_8x8, 8
468 QUANT_DC quant_4x4_dc, 2, 7
469 QUANT_AC quant_4x4, 2
470 QUANT_AC quant_8x8, 8
474 QUANT_DC quant_2x2_dc, 1
477 ;Not faster on Conroe, so only used in SSE4 versions
478 QUANT_DC quant_4x4_dc, 2, 7
479 QUANT_AC quant_4x4, 2
480 QUANT_AC quant_8x8, 8
483 QUANT_AC quant_4x4, 1
484 QUANT_AC quant_8x8, 4
485 QUANT_DC quant_4x4_dc, 1, 6
; Four consecutive 4x4 blocks at 32-byte strides, DC-style broadcast mf/bias.
488 cglobal quant_4x4x4, 3,3,6
491 QUANT_ONE [r0+ 0], m2, m3, 0, 4
492 QUANT_ONE [r0+32], m2, m3, 0, 5
494 QUANT_ONE [r0+64], m2, m3, 0, 5
495 QUANT_ONE [r0+96], m2, m3, 0, 1
506 %endif ; !HIGH_BIT_DEPTH
510 ;=============================================================================
512 ;=============================================================================
; Dequantization section.  Worker-macro bodies are fragmentary here.
516 ;;; %2,%3 dequant_mf[i_mf][y][x]
541 ;;; %2,%3 dequant_mf[i_mf][y][x]
; pmadcswd = multiply-add words to dwords with accumulate (x86util macro).
548 pmadcswd m0, m0, %2, m3
549 pmadcswd m1, m1, %3, m3
563 pmadcswd m0, m0, %2, m3
564 pmadcswd m1, m1, %3, m3
575 %macro DEQUANT_LOOP 3
; Process the block in two halves per iteration; t0 counts down (elided).
579 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
580 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
586 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
588 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
593 %macro DEQUANT16_FLAT 2-5
; Temp-register assignment differs between win64/x86_64/x86_32 ABIs.
611 DECLARE_REG_TMP 6,3,2
613 DECLARE_REG_TMP 2,0,1
616 %macro DEQUANT_START 2
; Split i_qp into i_qbits = i_qp/6 and i_mf = i_qp%6 (div-by-6 via the
; elided multiply-shift), then point r1 at dequant_mf[i_mf].
619 shr t0d, 8 ; i_qbits = i_qp / 6
622 sub t2d, t1d ; i_mf = i_qp % 6
625 add r1, t2 ; dequant_mf[i_mf]
627 add r1, r1mp ; dequant_mf[i_mf]
631 jl .rshift32 ; negative qbits => rightshift
634 ;-----------------------------------------------------------------------------
635 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
636 ;-----------------------------------------------------------------------------
; Two shift paths: left-shift (qbits >= 0) vs. 32-bit right-shift rounding.
638 cglobal dequant_%1x%1, 0,3,6
640 DEQUANT_START %2+2, %2
644 DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
653 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
; Flat16 fast path: when mf values fit in 16 bits, use a flat pmullw table;
; falls back to the generic version for large qp (jl below).
655 %if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
656 cglobal dequant_%1x%1_flat16, 0,3
660 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
664 shr t0d, 8 ; i_qbits = i_qp / 6
667 sub t2d, t1d ; i_mf = i_qp % 6
670 lea r1, [dequant%1_scale]
673 lea r1, [dequant%1_scale + t2]
679 DEQUANT16_FLAT [r1], 0, 16
680 DEQUANT16_FLAT [r1+8], 8, 24
682 DEQUANT16_FLAT [r1], 0, 16
684 vbroadcasti128 m0, [r1]
690 DEQUANT16_FLAT [r1], 0, 8, 64, 72
691 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
692 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
693 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
695 DEQUANT16_FLAT [r1], 0, 64
696 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
697 DEQUANT16_FLAT [r1+32], 32, 96
; Four pmullw over the 4x32-byte block with the two scale rows in m1/m2.
703 pmullw m0, m1, [r0+ 0]
704 pmullw m3, m2, [r0+32]
705 pmullw m4, m1, [r0+64]
706 pmullw m5, m2, [r0+96]
713 %endif ; !HIGH_BIT_DEPTH && !AVX
; DC-only dequant: a single mf scalar broadcast to all lanes (vpbroadcastdct),
; then multiply-accumulate over the 32-byte (per pixel size) DC block.
; Bodies are fragmentary in this excerpt.
747 cglobal dequant_4x4dc, 0,3,6
752 vpbroadcastdct m3, [r1]
760 %rep SIZEOF_PIXEL*32/mmsize
763 %assign %%x %%x+mmsize
770 vpbroadcastdct m2, [r1]
783 %rep SIZEOF_PIXEL*32/mmsize
784 pmadcswd m0, m2, [r0+%%x], m4
787 %assign %%x %%x+mmsize
790 %else ; !HIGH_BIT_DEPTH
796 %rep SIZEOF_PIXEL*32/mmsize
806 %assign %%x %%x+mmsize
808 %endif ; !HIGH_BIT_DEPTH
; Per-ISA instantiations (dispatch INIT_* lines elided).
814 DEQUANT_DC d, pmaddwd
816 DEQUANT_DC d, pmaddwd
818 DEQUANT_DC d, pmaddwd
832 ; t4 is eax for return value.
834 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
836 DECLARE_REG_TMP 4,1,2,3,0,5
839 ;-----------------------------------------------------------------------------
840 ; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
841 ;-----------------------------------------------------------------------------
; Tries to zero individual chroma DC coefficients while keeping the
; dequant+IDCT result rounding to the same values.  Body is fragmentary in
; this excerpt; the 2x2 Hadamard of the DC block is computed via the
; shuffles/psign/pmaddwd sequence below.
843 %macro OPTIMIZE_CHROMA_2x2_DC 0
844 cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
; ssse3+ uses real psignw with 1/-1 masks; older ISAs use the 0/-1 *_mmx
; masks required by the PSIGNW emulation.
855 mova m3, [chroma_dc_dct_mask]
856 mova m5, [chroma_dc_dmf_mask]
858 mova m3, [chroma_dc_dct_mask_mmx]
859 mova m5, [chroma_dc_dmf_mask_mmx]
862 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
864 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
865 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
866 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
867 PSIGNW m2, m5 ; + - - + - - + +
868 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
869 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
871 psrad m2, 16 ; + - - +
875 %if notcpuflag(ssse3)
876 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
888 jz .ret ; if the DC coefficients already round to zero, terminate early
; Scalar loop over coefficients: try nudging each dct[coeff] toward zero.
891 movsx t3d, word [t0+2*t1] ; dct[coeff]
893 pshufd m1, m1, q2100 ; move the next element to high dword
902 psubd m3, m5 ; coeff -= sign
913 paddd m3, m5 ; coeff += sign
918 pshufd m2, m2, q1320 ; - + - + / - - + +
929 punpcklqdq m2, m2 ; + + + +
936 %if HIGH_BIT_DEPTH == 0
; Instantiated for several ISAs (INIT_* dispatch lines elided).
938 OPTIMIZE_CHROMA_2x2_DC
940 OPTIMIZE_CHROMA_2x2_DC
942 OPTIMIZE_CHROMA_2x2_DC
944 OPTIMIZE_CHROMA_2x2_DC
945 %endif ; !HIGH_BIT_DEPTH
948 ;-----------------------------------------------------------------------------
949 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
950 ;-----------------------------------------------------------------------------
; High-bit-depth denoise: per coefficient, accumulate |dct| into sum[] and
; shrink |dct| by offset[] (sign restore and clamping are elided in this
; excerpt).  r3 counts coefficients down; addressing is *4 for int32.
952 cglobal denoise_dct, 4,4,6
956 mova m2, [r0+r3*4-2*mmsize]
957 mova m3, [r0+r3*4-1*mmsize]
; m0/m1 presumably hold |coeffs| here (ABS step elided) -- confirm.
960 paddd m4, m0, [r1+r3*4-2*mmsize]
961 psubd m0, [r2+r3*4-2*mmsize]
962 mova [r1+r3*4-2*mmsize], m4
963 paddd m4, m1, [r1+r3*4-1*mmsize]
964 psubd m1, [r2+r3*4-1*mmsize]
965 mova [r1+r3*4-1*mmsize], m4
972 mova [r0+r3*4-2*mmsize], m0
973 mova [r0+r3*4-1*mmsize], m1
992 %else ; !HIGH_BIT_DEPTH
994 ;-----------------------------------------------------------------------------
995 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
996 ;-----------------------------------------------------------------------------
; 8-bit-depth denoise: coefficients are int16 (addressing *2) but sum[] is
; uint32 (addressing *4), hence the widening punpck to dwords below.
998 cglobal denoise_dct, 4,4,7
1000 movsxdifnidn r3, r3d
1002 mova m2, [r0+r3*2-2*mmsize]
1003 mova m3, [r0+r3*2-1*mmsize]
; psubusw saturates at 0, so |dct| - offset can never go negative.
1006 psubusw m4, m0, [r2+r3*2-2*mmsize]
1007 psubusw m5, m1, [r2+r3*2-1*mmsize]
1010 mova [r0+r3*2-2*mmsize], m4
1011 mova [r0+r3*2-1*mmsize], m5
; Widen |dct| words to dwords (m6 presumably zero) and accumulate into sum[].
1012 punpcklwd m2, m0, m6
1013 punpcklwd m3, m1, m6
1016 paddd m2, [r1+r3*4-4*mmsize]
1017 paddd m0, [r1+r3*4-3*mmsize]
1018 paddd m3, [r1+r3*4-2*mmsize]
1019 paddd m1, [r1+r3*4-1*mmsize]
1020 mova [r1+r3*4-4*mmsize], m2
1021 mova [r1+r3*4-3*mmsize], m0
1022 mova [r1+r3*4-2*mmsize], m3
1023 mova [r1+r3*4-1*mmsize], m1
1029 %if ARCH_X86_64 == 0
; AVX2 variant: one ymm register per iteration; vpermq fixes up the lane
; order after in-lane unpacking.
1041 cglobal denoise_dct, 4,4,4
1043 movsxdifnidn r3, r3d
1045 mova m1, [r0+r3*2-mmsize]
1047 psubusw m2, m0, [r2+r3*2-mmsize]
1048 vpermq m0, m0, q3120
1050 mova [r0+r3*2-mmsize], m2
1051 punpcklwd m1, m0, m3
1053 paddd m1, [r1+r3*4-2*mmsize]
1054 paddd m0, [r1+r3*4-1*mmsize]
1055 mova [r1+r3*4-2*mmsize], m1
1056 mova [r1+r3*4-1*mmsize], m0
1061 %endif ; !HIGH_BIT_DEPTH
1063 ;-----------------------------------------------------------------------------
1064 ; int decimate_score( dctcoef *dct )
1065 ;-----------------------------------------------------------------------------
; Build a bitmask of nonzero coefficients (and of |coeff|>1) for the score
; lookup.  packssdw lines are the high-bit-depth (int32) narrowing path.
1067 %macro DECIMATE_MASK 5
1072 packssdw m0, [%3+16]
1073 packssdw m1, [%3+48]
1074 ABSW2 m0, m1, m0, m1, m3, m4
1076 ABSW m0, [%3+ 0], m3
1077 ABSW m1, [%3+16], m4
1091 packssdw m0, [%3+ 8]
1092 packssdw m1, [%3+24]
1093 packssdw m2, [%3+40]
1094 packssdw m3, [%3+56]
1101 ABSW2 m0, m1, m0, m1, m6, m7
1102 ABSW2 m2, m3, m2, m3, m6, m7
; Score tables defined elsewhere in the project.
1122 cextern decimate_table4
1123 cextern decimate_table8
1125 %macro DECIMATE4x4 1
; Sum per-run scores via table lookups on the nonzero mask; x86_64 keeps the
; table pointers in r4/r5, x86_32 addresses them directly.
1127 cglobal decimate_score%1, 1,3
1129 lea r4, [decimate_table4]
1130 lea r5, [decimate_mask_table4]
1132 %define mask_table r5
1134 %define table decimate_table4
1135 %define mask_table decimate_mask_table4
1137 DECIMATE_MASK edx, eax, r0, [pb_1], ecx
1146 movzx eax, byte [mask_table + rcx]
1155 add al, byte [table + rcx]
1156 add al, byte [mask_table + rdx]
1165 %if ARCH_X86_64 == 0
1177 ; 2x gt1 output, 2x nz output, 1x mask
; AVX2 64-coefficient mask: the >1 comparisons are order-insensitive so they
; run before the lane fixup (original comment below); vpermq restores the
; logical coefficient order for the nonzero mask.
1178 %macro DECIMATE_MASK64_AVX2 5
1185 pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
1186 pcmpgtb m3, m1, %5 ; we can save latency by doing them here
1191 vpermq m0, m0, q3120
1192 vpermq m1, m1, q3120
1200 %macro DECIMATE8x8 0
; Combine four 16-coefficient masks into one 64-bit mask, then walk runs of
; zeros adding decimate_table8 scores.  Bodies are fragmentary here.
1203 cglobal decimate_score64, 1,5
1205 lea r4, [decimate_table8]
1208 %define table decimate_table8
1212 DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
1218 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
1221 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
1224 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
1228 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
1240 add al, byte [table + rcx]
; x86_32 variant: same algorithm with 32-bit register pairs.
1253 cglobal decimate_score64, 1,6
1255 cglobal decimate_score64, 1,5
1259 DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
1265 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
1268 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
1271 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
1273 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
1290 add al, byte [decimate_table8 + ecx]
1321 %if ARCH_X86_64 == 0
1332 ;-----------------------------------------------------------------------------
1333 ; int coeff_last( dctcoef *dct )
1334 ;-----------------------------------------------------------------------------
; Returns the index of the last nonzero coefficient.  LAST_MASK builds a
; nonzero bitmask (high-bit-depth path: packssdw narrows int32 to int16
; before the compare); the bit-scan that consumes it is elided here.
1355 %macro LAST_MASK 3-4
1358 packssdw mm0, [%3+8]
1363 movdqa xmm0, [%3+ 0]
1365 packssdw xmm0, [%3+16]
1368 movdqa xmm1, [%3+32]
1369 packssdw xmm0, [%3+16]
1370 packssdw xmm1, [%3+48]
1378 packssdw mm0, [%3+ 8]
1379 packssdw mm1, [%3+24]
1386 packssdw mm0, [%3+ 8]
1387 packssdw mm1, [%3+24]
1390 packssdw mm3, [%3+40]
1391 packssdw mm4, [%3+56]
1403 %macro COEFF_LAST4 0
1404 cglobal coeff_last4, 1,3
1406 LAST_MASK 4, r1d, r0
; lzcnt variant instantiated separately from bsr (INIT line below).
1415 INIT_MMX mmx2, lzcnt
1418 %macro COEFF_LAST8 0
1419 cglobal coeff_last8, 1,3
1421 LAST_MASK 8, r1d, r0
1432 %if ARCH_X86_64 == 0
1438 INIT_XMM sse2, lzcnt
1442 %macro LAST_MASK 3-4
1448 packsswb mm0, [%3+ 8]
1453 movdqa xmm0, [%3+ 0]
1454 packsswb xmm0, [%3+16]
1460 packsswb mm0, [%3+ 8]
1461 packsswb mm1, [%3+24]
1471 %macro COEFF_LAST48 0
1473 cglobal coeff_last4, 1,1
1478 cglobal coeff_last4, 0,3
1487 lea eax, [eax+ecx*2]
1491 cglobal coeff_last8, 1,3
1493 LAST_MASK 8, r1d, r0, r2d
1501 INIT_MMX mmx2, lzcnt
1503 %endif ; HIGH_BIT_DEPTH
1506 cglobal coeff_last15, 1,3
1508 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
1514 cglobal coeff_last16, 1,3
1516 LAST_MASK 16, r1d, r0, r2d
1521 %if ARCH_X86_64 == 0
1522 cglobal coeff_last64, 1, 4-mmsize/16
1524 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
1525 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
1530 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
1531 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
1542 cglobal coeff_last64, 1,3
1544 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
1545 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
1548 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
1549 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
1560 %if ARCH_X86_64 == 0
1566 INIT_XMM sse2, lzcnt
1569 %macro LAST_MASK_AVX2 2
1572 packssdw m0, [%2+32]
1574 packssdw m1, [%2+96]
1576 mova m1, [deinterleave_shufd]
1580 packsswb m0, [%2+32]
1581 vpermq m0, m0, q3120
1587 %if ARCH_X86_64 == 0
1589 cglobal coeff_last64, 1,2
1591 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
1594 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
1604 cglobal coeff_last64, 1,3
1606 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
1607 LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
1615 ;-----------------------------------------------------------------------------
1616 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1617 ;-----------------------------------------------------------------------------
1626 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
; Temp-register maps per ABI (win64 / unix64 / x86_32).
1628 DECLARE_REG_TMP 3,1,2,0,4,5,6
1630 DECLARE_REG_TMP 0,1,2,3,4,5,6
1632 DECLARE_REG_TMP 6,3,2,1,4,5,0
1635 %macro COEFF_LEVELRUN 1
; Extract (level, run) pairs by repeatedly lzcnt-ing the nonzero mask from
; LAST_MASK; stores levels into the runlevel struct.  Body is fragmentary.
1636 cglobal coeff_level_run%1,0,7
; odd-sized blocks (15) are scanned from dct-1, as in coeff_last15.
1641 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
1650 mov [t1+levelrun.mask], t5d
1653 LZCOUNT t3d, t5d, 0x1f
1658 mov [t1+levelrun.last], t4d
1660 LZCOUNT t3d, t5d, 0x1f
; 32-bit stores for high bit depth, 16-bit otherwise.
1669 mov [t1+t6*4+levelrun.level], t2d
1671 mov [t1+t6*2+levelrun.level], t2w
1680 %if ARCH_X86_64 == 0
1692 INIT_XMM sse2, lzcnt
1698 INIT_MMX mmx2, lzcnt
1702 ; Similar to the one above, but saves the DCT
1703 ; coefficients in m0/m1 so we don't have to load
; (original comment continues on elided lines) ...them again in the scalar loop.
1705 %macro LAST_MASK_LUT 3
1713 packsswb xm2, xm0, xm1
1715 vinserti128 m0, m0, xm1, 1
1722 %macro COEFF_LEVELRUN_LUT 1
1723 cglobal coeff_level_run%1,2,4+(%1/9)
; PIC addressing: r5 anchors the RIP-relative base for the lookup tables.
1726 %define GLOBAL +r5-$$
1730 LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
1739 mov [r1+levelrun.mask], eax
; Split the 16-bit nonzero mask into two bytes for per-byte table lookups.
1748 movzx r4d, ah ; first 8 bits
1751 movzx r2d, al ; second 8 bits
1752 shl eax, 32-%1-(%1&1)
1753 LZCOUNT eax, eax, 0x1f
1756 mov [r1+levelrun.last], r3d
1757 ; Here we abuse pshufb, combined with a lookup table, to do a gather
1758 ; operation based on a bitmask. For example:
1760 ; dct 15-8 (input): 0 0 4 0 0 -2 1 0
1761 ; dct 7-0 (input): 0 0 -1 0 0 0 0 15
1762 ; bitmask 1: 0 0 1 0 0 1 1 0
1763 ; bitmask 2: 0 0 1 0 0 0 0 1
1764 ; gather 15-8: 4 -2 1 __ __ __ __ __
1765 ; gather 7-0: -1 15 __ __ __ __ __ __
1766 ; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
1768 ; The overlapping, dependent stores almost surely cause a mess of
1769 ; forwarding issues, but it's still enormously faster.
; popcnt_table gives how many levels each byte's mask contributes.
1771 movzx eax, byte [popcnt_table+r4 GLOBAL]
1772 movzx r3d, byte [popcnt_table+r2 GLOBAL]
1774 movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
1775 movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
1777 ; Storing 8 bytes of shuffle constant and converting it (unpack + or)
1778 ; is neutral to slightly faster in local speed measurements, but it
1779 ; cuts the table size in half, which is surely a big cache win.
1786 mova [r1+levelrun.level], m1
1787 ; This obnoxious unaligned store messes with store forwarding and
1788 ; stalls the CPU to no end, but merging the two registers before
1789 ; storing requires a variable 128-bit shift. Emulating this does
1790 ; work, but requires a lot of ops and the gain is tiny and
1791 ; inconsistent, so we'll err on the side of fewer instructions.
1792 movu [r1+rax*2+levelrun.level], m0
; AVX2 path: both shuffle constants live in one ymm register.
1794 movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
1795 vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
1799 vextracti128 [r1+levelrun.level], m0, 1
1800 movu [r1+rax*2+levelrun.level], xm0
; Short-block (<=8 coeff) path: single byte mask, single gather.
1804 movzx eax, byte [popcnt_table+r2 GLOBAL]
1805 movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
1809 mova [r1+levelrun.level], m0
1814 %if HIGH_BIT_DEPTH==0
; Instantiations; earlier INIT_* lines for the first group are elided.
1816 COEFF_LEVELRUN_LUT 4
1818 COEFF_LEVELRUN_LUT 8
1819 COEFF_LEVELRUN_LUT 15
1820 COEFF_LEVELRUN_LUT 16
1821 INIT_MMX ssse3, lzcnt
1822 COEFF_LEVELRUN_LUT 4
1823 INIT_XMM ssse3, lzcnt
1824 COEFF_LEVELRUN_LUT 8
1825 COEFF_LEVELRUN_LUT 15
1826 COEFF_LEVELRUN_LUT 16
1827 INIT_XMM avx2, lzcnt
1828 COEFF_LEVELRUN_LUT 15
1829 COEFF_LEVELRUN_LUT 16