1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <hengar-6@student.ltu.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 ; last line not used, just padding for power-of-2 stride
; NOTE(review): six DQM8 rows, one per i_qp%6 value — matching DEQUANT_START
; below, which computes i_mf = i_qp % 6 and indexes a per-remainder table.
; Presumably the body of dequant8_scale, which the _flat16 path loads with
; "lea r1, [dequant%1_scale]"; the table's label line is not visible in this
; excerpt — confirm against the full file before relying on the name.
55 DQM8 20, 18, 32, 19, 25, 24
56 DQM8 22, 19, 35, 21, 28, 26
57 DQM8 26, 23, 42, 24, 33, 31
58 DQM8 28, 25, 45, 26, 35, 33
59 DQM8 32, 28, 51, 30, 40, 38
60 DQM8 36, 32, 58, 34, 46, 43
; NOTE(review): flat byte table of precomputed scores indexed by a coefficient
; bitmask; presumably decimate_mask_table4, which decimate_score%1 below reads
; via "movzx eax, byte [mask_table + rcx]". The defining label sits outside
; this excerpt — verify the label and the table's length against the full file.
63 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
64 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
65 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
66 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
67 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
68 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
69 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
70 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
71 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Per-lane sign masks for optimize_chroma_2x2_dc: loaded into m3/m5 there and
; applied with PSIGNW to flip selected 2x2 DC coefficients before the dmf
; multiply. The *_mmx variants use 0/-1 words, the form required by the
; pxor/psubw PSIGNW emulation (see "has to be 0 or -1 ... PSIGND_MMX" below);
; the plain variants use 1/-1 words because real SSSE3 psignw zeroes lanes
; whose mask word is 0, so the 0/-1 encoding would destroy data there.
73 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
74 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
75 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
76 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
86 %macro QUANT_DC_START 0
92 %elif cpuflag(sse4) ; ssse3, but not faster on conroe
120 cmp ecx, (1<<mmsize)-1
131 %ifdef HIGH_BIT_DEPTH
132 %macro QUANT_ONE_DC 4
167 %macro QUANT_TWO_DC 4
190 QUANT_ONE_DC %1, %2, %3, %4
191 QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
195 %macro QUANT_ONE_AC_MMX 4
218 %macro QUANT_TWO_AC 4
225 paddd m3, [%3+mmsize]
227 pmulld m3, [%2+mmsize]
241 QUANT_ONE_AC_MMX %1, %2, %3, %4
242 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
246 ;-----------------------------------------------------------------------------
247 ; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
248 ;-----------------------------------------------------------------------------
250 cglobal quant_%1x%2_dc, 3,3,8
252 %if %1*%2 <= mmsize/4
253 QUANT_ONE_DC r0, m6, m7, 0
256 %rep %1*%2/(mmsize/2)
257 QUANT_TWO_DC r0+x, m6, m7, x
265 ;-----------------------------------------------------------------------------
266 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
267 ;-----------------------------------------------------------------------------
269 cglobal quant_%1x%2, 3,3,8
271 %rep %1*%2/(mmsize/2)
272 QUANT_TWO_AC r0+x, r1+x, r2+x, x
297 %endif ; HIGH_BIT_DEPTH
299 %ifndef HIGH_BIT_DEPTH
301 ;;; %1 (m64) dct[y][x]
302 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
303 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
304 mova m1, %1 ; load dct coeffs
306 paddusw m0, %3 ; round
307 pmulhuw m0, %2 ; divide
308 PSIGNW m0, m1 ; restore sign
339 ;-----------------------------------------------------------------------------
340 ; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
341 ;-----------------------------------------------------------------------------
342 %macro QUANT_DC 2-3 0
346 QUANT_ONE [r0], m6, m7, 0
350 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
358 ;-----------------------------------------------------------------------------
359 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
360 ;-----------------------------------------------------------------------------
365 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
373 QUANT_DC quant_2x2_dc, 1
374 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
375 QUANT_DC quant_4x4_dc, 4
377 QUANT_AC quant_4x4, 4
378 QUANT_AC quant_8x8, 16
382 QUANT_DC quant_4x4_dc, 2, 8
383 QUANT_AC quant_4x4, 2
384 QUANT_AC quant_8x8, 8
387 QUANT_DC quant_4x4_dc, 2, 8
388 QUANT_AC quant_4x4, 2
389 QUANT_AC quant_8x8, 8
392 QUANT_DC quant_2x2_dc, 1
395 ;Not faster on Conroe, so only used in SSE4 versions
396 QUANT_DC quant_4x4_dc, 2, 8
397 QUANT_AC quant_4x4, 2
398 QUANT_AC quant_8x8, 8
399 %endif ; !HIGH_BIT_DEPTH
403 ;=============================================================================
405 ;=============================================================================
409 ;;; %2,%3 dequant_mf[i_mf][y][x]
412 %ifdef HIGH_BIT_DEPTH
425 ;;; %2,%3 dequant_mf[i_mf][y][x]
430 %ifdef HIGH_BIT_DEPTH
431 pmadcswd m0, m0, %2, m3
436 pmadcswd m0, m0, %2, m3
437 pmadcswd m1, m1, %3, m3
445 %macro DEQUANT_LOOP 3
449 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
450 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
455 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
456 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
461 %macro DEQUANT16_FLAT 2-5
479 DECLARE_REG_TMP 6,3,2
481 DECLARE_REG_TMP 4,3,2
483 DECLARE_REG_TMP 2,0,1
486 %macro DEQUANT_START 2
489 shr t0d, 8 ; i_qbits = i_qp / 6
492 sub t2d, t1d ; i_mf = i_qp % 6
495 add r1, t2 ; dequant_mf[i_mf]
497 add r1, r1mp ; dequant_mf[i_mf]
501 jl .rshift32 ; negative qbits => rightshift
504 ;-----------------------------------------------------------------------------
505 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
506 ;-----------------------------------------------------------------------------
508 cglobal dequant_%1x%1, 0,3,6
510 DEQUANT_START %2+2, %2
514 DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
523 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
525 %ifndef HIGH_BIT_DEPTH
527 cglobal dequant_%1x%1_flat16, 0,3
531 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
535 shr t0d, 8 ; i_qbits = i_qp / 6
538 sub t2d, t1d ; i_mf = i_qp % 6
541 lea r1, [dequant%1_scale]
544 lea r1, [dequant%1_scale + t2]
550 DEQUANT16_FLAT [r1], 0, 16
551 DEQUANT16_FLAT [r1+8], 8, 24
553 DEQUANT16_FLAT [r1], 0, 16
556 DEQUANT16_FLAT [r1], 0, 8, 64, 72
557 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
558 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
559 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
561 DEQUANT16_FLAT [r1], 0, 64
562 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
563 DEQUANT16_FLAT [r1+32], 32, 96
567 %endif ; !HIGH_BIT_DEPTH
570 %ifdef HIGH_BIT_DEPTH
595 cglobal dequant_4x4dc, 0,3,6
604 %rep SIZEOF_PIXEL*16/mmsize
605 mova m0, [r0+mmsize*0+x]
606 mova m1, [r0+mmsize*1+x]
609 mova [r0+mmsize*0+x], m0
610 mova [r0+mmsize*1+x], m1
624 %ifdef HIGH_BIT_DEPTH
626 %rep SIZEOF_PIXEL*32/mmsize
628 pmadcswd m0, m0, m2, m4
634 %else ; !HIGH_BIT_DEPTH
637 %rep SIZEOF_PIXEL*32/mmsize
649 %endif ; !HIGH_BIT_DEPTH
653 %ifdef HIGH_BIT_DEPTH
655 DEQUANT_DC d, pmaddwd
657 DEQUANT_DC d, pmaddwd
669 ; t4 is eax for return value.
671 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
673 DECLARE_REG_TMP 4,1,2,3,0,5
676 ;-----------------------------------------------------------------------------
677 ; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
678 ;-----------------------------------------------------------------------------
680 %macro OPTIMIZE_CHROMA_2x2_DC 0
683 %assign %%regs %%regs-1
686 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
688 cglobal optimize_chroma_2x2_dc, 0,%%regs,7
699 mova m3, [chroma_dc_dct_mask]
700 mova m5, [chroma_dc_dmf_mask]
702 mova m3, [chroma_dc_dct_mask_mmx]
703 mova m5, [chroma_dc_dmf_mask_mmx]
706 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
708 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
709 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
710 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
711 PSIGNW m2, m5 ; + - - + - - + +
712 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
713 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
715 psrad m2, 16 ; + - - +
719 %if notcpuflag(ssse3)
720 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
732 jz .ret ; if the DC coefficients already round to zero, terminate early
735 movsx t3d, word [t0+2*t1] ; dct[coeff]
737 pshufd m1, m1, q2100 ; move the next element to high dword
746 psubd m3, m5 ; coeff -= sign
757 paddd m3, m5 ; coeff += sign
762 pshufd m2, m2, q1320 ; - + - + / - - + +
773 punpcklqdq m2, m2 ; + + + +
780 %ifndef HIGH_BIT_DEPTH
782 OPTIMIZE_CHROMA_2x2_DC
784 OPTIMIZE_CHROMA_2x2_DC
786 OPTIMIZE_CHROMA_2x2_DC
788 OPTIMIZE_CHROMA_2x2_DC
789 %endif ; !HIGH_BIT_DEPTH
791 %ifdef HIGH_BIT_DEPTH
792 ;-----------------------------------------------------------------------------
793 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
794 ;-----------------------------------------------------------------------------
796 cglobal denoise_dct, 4,4,8
799 mova m2, [r0+r3*4-2*mmsize]
800 mova m3, [r0+r3*4-1*mmsize]
805 psubd m0, [r2+r3*4-2*mmsize]
806 psubd m1, [r2+r3*4-1*mmsize]
813 mova [r0+r3*4-2*mmsize], m0
814 mova [r0+r3*4-1*mmsize], m1
815 paddd m4, [r1+r3*4-2*mmsize]
816 paddd m5, [r1+r3*4-1*mmsize]
817 mova [r1+r3*4-2*mmsize], m4
818 mova [r1+r3*4-1*mmsize], m5
835 %else ; !HIGH_BIT_DEPTH
837 ;-----------------------------------------------------------------------------
838 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
839 ;-----------------------------------------------------------------------------
841 cglobal denoise_dct, 4,4,7
844 mova m2, [r0+r3*2-2*mmsize]
845 mova m3, [r0+r3*2-1*mmsize]
848 psubusw m4, m0, [r2+r3*2-2*mmsize]
849 psubusw m5, m1, [r2+r3*2-1*mmsize]
852 mova [r0+r3*2-2*mmsize], m4
853 mova [r0+r3*2-1*mmsize], m5
858 paddd m2, [r1+r3*4-4*mmsize]
859 paddd m0, [r1+r3*4-3*mmsize]
860 paddd m3, [r1+r3*4-2*mmsize]
861 paddd m1, [r1+r3*4-1*mmsize]
862 mova [r1+r3*4-4*mmsize], m2
863 mova [r1+r3*4-3*mmsize], m0
864 mova [r1+r3*4-2*mmsize], m3
865 mova [r1+r3*4-1*mmsize], m1
882 %endif ; !HIGH_BIT_DEPTH
884 ;-----------------------------------------------------------------------------
885 ; int decimate_score( dctcoef *dct )
886 ;-----------------------------------------------------------------------------
888 %macro DECIMATE_MASK 5
890 %ifdef HIGH_BIT_DEPTH
893 packssdw xmm0, [%3+16]
894 packssdw xmm1, [%3+48]
895 ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
897 ABSW xmm0, [%3+ 0], xmm3
898 ABSW xmm1, [%3+16], xmm4
908 %ifdef HIGH_BIT_DEPTH
913 packssdw mm0, [%3+ 8]
914 packssdw mm1, [%3+24]
915 packssdw mm2, [%3+40]
916 packssdw mm3, [%3+56]
923 ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
924 ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
944 cextern decimate_table4
945 cextern decimate_table8
949 ;A LUT is faster than bsf on AMD processors.
950 ;This is not true for score64.
951 cglobal decimate_score%1, 1,3
953 lea r10, [decimate_table4]
954 lea r11, [decimate_mask_table4]
956 %define mask_table r11
958 %define table decimate_table4
959 %define mask_table decimate_mask_table4
961 DECIMATE_MASK edx, eax, r0, [pb_1], ecx
971 movzx eax, byte [mask_table + rcx]
980 add al, byte [table + rcx]
981 add al, byte [mask_table + rdx]
986 add al, byte [table + rcx]
1002 INIT_MMX mmx2, slowctz
1009 INIT_XMM sse2, slowctz
1015 INIT_XMM ssse3, slowctz
1019 %macro DECIMATE8x8 0
1022 cglobal decimate_score64, 1,4
1024 lea r10, [decimate_table8]
1027 %define table decimate_table8
1030 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
1033 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
1036 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
1040 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
1050 add al, byte [table + rcx]
1061 cglobal decimate_score64, 1,6
1063 cglobal decimate_score64, 1,5
1066 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
1069 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
1072 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
1074 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
1082 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
1089 add r0b, byte [decimate_table8 + ecx]
1092 cmp r0, 6 ;score64's threshold is never higher than 6
1093 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
1128 ;-----------------------------------------------------------------------------
1129 ; int coeff_last( dctcoef *dct )
1130 ;-----------------------------------------------------------------------------
1150 %ifdef HIGH_BIT_DEPTH
1151 %macro LAST_MASK 3-4
1154 packssdw mm0, [%3+8]
1159 movdqa xmm0, [%3+ 0]
1160 movdqa xmm1, [%3+32]
1161 packssdw xmm0, [%3+16]
1162 packssdw xmm1, [%3+48]
1169 packssdw mm0, [%3+ 8]
1170 packssdw mm1, [%3+24]
1173 packssdw mm3, [%3+40]
1174 packssdw mm4, [%3+56]
1186 %macro COEFF_LAST4 0
1187 cglobal coeff_last4, 1,3
1189 LAST_MASK 4, r1d, r0
1198 INIT_MMX mmx2, lzcnt
1201 %else ; !HIGH_BIT_DEPTH
1202 %macro LAST_MASK 3-4
1209 movdqa xmm0, [%3+ 0]
1210 packsswb xmm0, [%3+16]
1216 packsswb mm0, [%3+ 8]
1217 packsswb mm1, [%3+24]
1227 %macro COEFF_LAST4 0
1229 cglobal coeff_last4, 1,1
1234 cglobal coeff_last4, 0,3
1243 lea eax, [eax+ecx*2]
1250 INIT_MMX mmx2, lzcnt
1252 %endif ; HIGH_BIT_DEPTH
1255 cglobal coeff_last15, 1,3
1257 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
1263 cglobal coeff_last16, 1,3
1265 LAST_MASK 16, r1d, r0, r2d
1271 cglobal coeff_last64, 1, 5-mmsize/16
1273 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
1274 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
1279 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
1280 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
1291 cglobal coeff_last64, 1,4
1293 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
1294 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
1295 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
1296 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
1315 INIT_XMM sse2, lzcnt
1318 ;-----------------------------------------------------------------------------
1319 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1320 ;-----------------------------------------------------------------------------
1322 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1324 DECLARE_REG_TMP 3,1,2,0,4,5,6
1325 %elifdef ARCH_X86_64
1326 DECLARE_REG_TMP 0,1,2,3,4,5,6
1328 DECLARE_REG_TMP 6,3,2,1,4,5,0
1331 %macro COEFF_LEVELRUN 1
1332 cglobal coeff_level_run%1,0,7
1336 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
1338 shl t5d, 32-((%1+1)&~1)
1340 LZCOUNT t3d, t5d, 0x1f
1347 LZCOUNT t3d, t5d, 0x1f
1348 %ifdef HIGH_BIT_DEPTH
1350 mov [t1+t6 +4+16*4], t3b
1351 mov [t1+t6*4+ 4], t2d
1354 mov [t1+t6 +4+16*2], t3b
1355 mov [t1+t6*2+ 4], t2w
1374 INIT_XMM sse2, lzcnt
1377 INIT_MMX mmx2, lzcnt