1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <hengar-6@student.ltu.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 ; last line not used, just padding for power-of-2 stride
; Invocations of the DQM8 macro (defined earlier, not shown here): one row of
; six scale values per QP%6 class (QP mod 6 = 0..5).
; NOTE(review): values appear to be the H.264 default 8x8 dequant scaling
; factors — confirm against the DQM8 macro definition and the spec tables.
55 DQM8 20, 18, 32, 19, 25, 24
56 DQM8 22, 19, 35, 21, 28, 26
57 DQM8 26, 23, 42, 24, 33, 31
58 DQM8 28, 25, 45, 26, 35, 33
59 DQM8 32, 28, 51, 30, 40, 38
60 DQM8 36, 32, 58, 34, 46, 43
63 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
64 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
65 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
66 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
67 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
68 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
69 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
70 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
71 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks applied with PSIGNW in optimize_chroma_2x2_dc to perform the
; 2x2 chroma DC butterfly (see the PSIGNW m0,m3 / PSIGNW m2,m5 uses below).
; The plain variants use +1 where an element must pass through unchanged,
; because real (SSSE3) psignw zeroes elements whose mask word is 0; the
; *_mmx variants use 0 instead, for the pre-SSSE3 PSIGNW emulation which
; treats 0 as "keep" — TODO confirm against the PSIGNW macro in x86util.asm.
73 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
74 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
75 chroma_dc_dct_mask:     dw 1, 1,-1,-1, 1, 1,-1,-1
76 chroma_dc_dmf_mask:     dw 1, 1,-1,-1, 1,-1,-1, 1
86 %macro QUANT_DC_START 0
92 %elif cpuflag(sse4) ; ssse3, but not faster on conroe
120 cmp ecx, (1<<mmsize)-1
131 %ifdef HIGH_BIT_DEPTH
132 %macro QUANT_ONE_DC 4
167 %macro QUANT_TWO_DC 4
190 QUANT_ONE_DC %1, %2, %3, %4
191 QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
195 %macro QUANT_ONE_AC_MMX 4
218 %macro QUANT_TWO_AC 4
225 paddd m3, [%3+mmsize]
227 pmulld m3, [%2+mmsize]
241 QUANT_ONE_AC_MMX %1, %2, %3, %4
242 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
246 ;-----------------------------------------------------------------------------
247 ; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
248 ;-----------------------------------------------------------------------------
250 cglobal quant_%1x%2_dc, 3,3,8
252 %if %1*%2 <= mmsize/4
253 QUANT_ONE_DC r0, m6, m7, 0
256 %rep %1*%2/(mmsize/2)
257 QUANT_TWO_DC r0+x, m6, m7, x
265 ;-----------------------------------------------------------------------------
266 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
267 ;-----------------------------------------------------------------------------
269 cglobal quant_%1x%2, 3,3,8
271 %rep %1*%2/(mmsize/2)
272 QUANT_TWO_AC r0+x, r1+x, r2+x, x
297 %endif ; HIGH_BIT_DEPTH
299 %ifndef HIGH_BIT_DEPTH
301 ;;; %1 (m64) dct[y][x]
302 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
303 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
304 mova m1, %1 ; load dct coeffs
306 paddusw m0, %3 ; round
307 pmulhuw m0, %2 ; divide
308 PSIGNW m0, m1 ; restore sign
339 ;-----------------------------------------------------------------------------
340 ; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
341 ;-----------------------------------------------------------------------------
342 %macro QUANT_DC 2-3 0
346 QUANT_ONE [r0], m6, m7, 0
350 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
358 ;-----------------------------------------------------------------------------
359 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
360 ;-----------------------------------------------------------------------------
365 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
373 QUANT_DC quant_2x2_dc, 1
374 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
375 QUANT_DC quant_4x4_dc, 4
377 QUANT_AC quant_4x4, 4
378 QUANT_AC quant_8x8, 16
382 QUANT_DC quant_4x4_dc, 2, 8
383 QUANT_AC quant_4x4, 2
384 QUANT_AC quant_8x8, 8
387 QUANT_DC quant_4x4_dc, 2, 8
388 QUANT_AC quant_4x4, 2
389 QUANT_AC quant_8x8, 8
392 QUANT_DC quant_2x2_dc, 1
395 ;Not faster on Conroe, so only used in SSE4 versions
396 QUANT_DC quant_4x4_dc, 2, 8
397 QUANT_AC quant_4x4, 2
398 QUANT_AC quant_8x8, 8
399 %endif ; !HIGH_BIT_DEPTH
403 ;=============================================================================
405 ;=============================================================================
409 ;;; %2,%3 dequant_mf[i_mf][y][x]
412 %ifdef HIGH_BIT_DEPTH
425 ;;; %2,%3 dequant_mf[i_mf][y][x]
430 %ifdef HIGH_BIT_DEPTH
448 %macro DEQUANT_LOOP 3
452 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
453 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
458 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
459 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
464 %macro DEQUANT16_FLAT 2-5
482 DECLARE_REG_TMP 6,3,2
484 DECLARE_REG_TMP 4,3,2
486 DECLARE_REG_TMP 2,0,1
489 %macro DEQUANT_START 2
492 shr t0d, 8 ; i_qbits = i_qp / 6
495 sub t2d, t1d ; i_mf = i_qp % 6
498 add r1, t2 ; dequant_mf[i_mf]
500 add r1, r1mp ; dequant_mf[i_mf]
504 jl .rshift32 ; negative qbits => rightshift
507 ;-----------------------------------------------------------------------------
508 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
509 ;-----------------------------------------------------------------------------
511 cglobal dequant_%1x%1, 0,3,6
513 DEQUANT_START %2+2, %2
517 DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
526 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
528 %ifndef HIGH_BIT_DEPTH
530 cglobal dequant_%1x%1_flat16, 0,3
534 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
538 shr t0d, 8 ; i_qbits = i_qp / 6
541 sub t2d, t1d ; i_mf = i_qp % 6
544 lea r1, [dequant%1_scale]
547 lea r1, [dequant%1_scale + t2]
553 DEQUANT16_FLAT [r1], 0, 16
554 DEQUANT16_FLAT [r1+8], 8, 24
556 DEQUANT16_FLAT [r1], 0, 16
559 DEQUANT16_FLAT [r1], 0, 8, 64, 72
560 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
561 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
562 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
564 DEQUANT16_FLAT [r1], 0, 64
565 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
566 DEQUANT16_FLAT [r1+32], 32, 96
570 %endif ; !HIGH_BIT_DEPTH
573 %ifdef HIGH_BIT_DEPTH
592 cglobal dequant_4x4dc, 0,3,6
601 %rep SIZEOF_PIXEL*16/mmsize
602 mova m0, [r0+mmsize*0+x]
603 mova m1, [r0+mmsize*1+x]
606 mova [r0+mmsize*0+x], m0
607 mova [r0+mmsize*1+x], m1
621 %ifdef HIGH_BIT_DEPTH
623 %rep SIZEOF_PIXEL*32/mmsize
632 %else ; !HIGH_BIT_DEPTH
635 %rep SIZEOF_PIXEL*32/mmsize
647 %endif ; !HIGH_BIT_DEPTH
651 %ifdef HIGH_BIT_DEPTH
653 DEQUANT_DC d, pmaddwd
665 ; t4 is eax for return value.
667 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
669 DECLARE_REG_TMP 4,1,2,3,0,5
672 ;-----------------------------------------------------------------------------
673 ; int x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
674 ;-----------------------------------------------------------------------------
676 %macro OPTIMIZE_CHROMA_2x2_DC 0
679 %assign %%regs %%regs-1
682 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
684 cglobal optimize_chroma_2x2_dc, 0,%%regs,7
695 mova m3, [chroma_dc_dct_mask]
696 mova m5, [chroma_dc_dmf_mask]
698 mova m3, [chroma_dc_dct_mask_mmx]
699 mova m5, [chroma_dc_dmf_mask_mmx]
702 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
704 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
705 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
706 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
707 PSIGNW m2, m5 ; + - - + - - + +
708 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
709 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
711 psrad m2, 16 ; + - - +
715 %if notcpuflag(ssse3)
716 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
728 jz .ret ; if the DC coefficients already round to zero, terminate early
731 movsx t3d, word [t0+2*t1] ; dct[coeff]
733 pshufd m1, m1, q2100 ; move the next element to high dword
742 psubd m3, m5 ; coeff -= sign
753 paddd m3, m5 ; coeff += sign
758 pshufd m2, m2, q1320 ; - + - + / - - + +
769 punpcklqdq m2, m2 ; + + + +
776 %ifndef HIGH_BIT_DEPTH
778 OPTIMIZE_CHROMA_2x2_DC
780 OPTIMIZE_CHROMA_2x2_DC
782 OPTIMIZE_CHROMA_2x2_DC
784 OPTIMIZE_CHROMA_2x2_DC
785 %endif ; !HIGH_BIT_DEPTH
787 %ifdef HIGH_BIT_DEPTH
788 ;-----------------------------------------------------------------------------
789 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
790 ;-----------------------------------------------------------------------------
792 cglobal denoise_dct, 4,4,8
795 mova m2, [r0+r3*4-2*mmsize]
796 mova m3, [r0+r3*4-1*mmsize]
801 psubd m0, [r2+r3*4-2*mmsize]
802 psubd m1, [r2+r3*4-1*mmsize]
809 mova [r0+r3*4-2*mmsize], m0
810 mova [r0+r3*4-1*mmsize], m1
811 paddd m4, [r1+r3*4-2*mmsize]
812 paddd m5, [r1+r3*4-1*mmsize]
813 mova [r1+r3*4-2*mmsize], m4
814 mova [r1+r3*4-1*mmsize], m5
831 %else ; !HIGH_BIT_DEPTH
833 ;-----------------------------------------------------------------------------
834 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
835 ;-----------------------------------------------------------------------------
837 cglobal denoise_dct, 4,4,7
840 mova m2, [r0+r3*2-2*mmsize]
841 mova m3, [r0+r3*2-1*mmsize]
844 psubusw m4, m0, [r2+r3*2-2*mmsize]
845 psubusw m5, m1, [r2+r3*2-1*mmsize]
848 mova [r0+r3*2-2*mmsize], m4
849 mova [r0+r3*2-1*mmsize], m5
854 paddd m2, [r1+r3*4-4*mmsize]
855 paddd m0, [r1+r3*4-3*mmsize]
856 paddd m3, [r1+r3*4-2*mmsize]
857 paddd m1, [r1+r3*4-1*mmsize]
858 mova [r1+r3*4-4*mmsize], m2
859 mova [r1+r3*4-3*mmsize], m0
860 mova [r1+r3*4-2*mmsize], m3
861 mova [r1+r3*4-1*mmsize], m1
878 %endif ; !HIGH_BIT_DEPTH
880 ;-----------------------------------------------------------------------------
881 ; int decimate_score( dctcoef *dct )
882 ;-----------------------------------------------------------------------------
884 %macro DECIMATE_MASK 5
886 %ifdef HIGH_BIT_DEPTH
889 packssdw xmm0, [%3+16]
890 packssdw xmm1, [%3+48]
891 ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
893 ABSW xmm0, [%3+ 0], xmm3
894 ABSW xmm1, [%3+16], xmm4
904 %ifdef HIGH_BIT_DEPTH
909 packssdw mm0, [%3+ 8]
910 packssdw mm1, [%3+24]
911 packssdw mm2, [%3+40]
912 packssdw mm3, [%3+56]
919 ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
920 ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
940 cextern decimate_table4
941 cextern decimate_table8
945 ;A LUT is faster than bsf on AMD processors.
946 ;This is not true for score64.
947 cglobal decimate_score%1, 1,3
949 lea r10, [decimate_table4]
950 lea r11, [decimate_mask_table4]
952 %define mask_table r11
954 %define table decimate_table4
955 %define mask_table decimate_mask_table4
957 DECIMATE_MASK edx, eax, r0, [pb_1], ecx
967 movzx eax, byte [mask_table + rcx]
976 add al, byte [table + rcx]
977 add al, byte [mask_table + rdx]
982 add al, byte [table + rcx]
998 INIT_MMX mmx2, slowctz
1005 INIT_XMM sse2, slowctz
1011 INIT_XMM ssse3, slowctz
1015 %macro DECIMATE8x8 0
1018 cglobal decimate_score64, 1,4
1020 lea r10, [decimate_table8]
1023 %define table decimate_table8
1026 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
1029 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
1032 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
1036 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
1046 add al, byte [table + rcx]
1057 cglobal decimate_score64, 1,6
1059 cglobal decimate_score64, 1,5
1062 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
1065 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
1068 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
1070 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
1078 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
1085 add r0b, byte [decimate_table8 + ecx]
1088 cmp r0, 6 ;score64's threshold is never higher than 6
1089 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
1124 ;-----------------------------------------------------------------------------
1125 ; int coeff_last( dctcoef *dct )
1126 ;-----------------------------------------------------------------------------
1146 %ifdef HIGH_BIT_DEPTH
1147 %macro LAST_MASK 3-4
1150 packssdw mm0, [%3+8]
1155 movdqa xmm0, [%3+ 0]
1156 movdqa xmm1, [%3+32]
1157 packssdw xmm0, [%3+16]
1158 packssdw xmm1, [%3+48]
1165 packssdw mm0, [%3+ 8]
1166 packssdw mm1, [%3+24]
1169 packssdw mm3, [%3+40]
1170 packssdw mm4, [%3+56]
1182 %macro COEFF_LAST4 0
1183 cglobal coeff_last4, 1,3
1185 LAST_MASK 4, r1d, r0
1194 INIT_MMX mmx2, lzcnt
1197 %else ; !HIGH_BIT_DEPTH
1198 %macro LAST_MASK 3-4
1205 movdqa xmm0, [%3+ 0]
1206 packsswb xmm0, [%3+16]
1212 packsswb mm0, [%3+ 8]
1213 packsswb mm1, [%3+24]
1223 %macro COEFF_LAST4 0
1225 cglobal coeff_last4, 1,1
1230 cglobal coeff_last4, 0,3
1239 lea eax, [eax+ecx*2]
1246 INIT_MMX mmx2, lzcnt
1248 %endif ; HIGH_BIT_DEPTH
1251 cglobal coeff_last15, 1,3
1253 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
1259 cglobal coeff_last16, 1,3
1261 LAST_MASK 16, r1d, r0, r2d
1267 cglobal coeff_last64, 1, 5-mmsize/16
1269 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
1270 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
1275 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
1276 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
1287 cglobal coeff_last64, 1,4
1289 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
1290 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
1291 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
1292 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
1311 INIT_XMM sse2, lzcnt
1314 ;-----------------------------------------------------------------------------
1315 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1316 ;-----------------------------------------------------------------------------
1318 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1320 DECLARE_REG_TMP 3,1,2,0,4,5,6
1321 %elifdef ARCH_X86_64
1322 DECLARE_REG_TMP 0,1,2,3,4,5,6
1324 DECLARE_REG_TMP 6,3,2,1,4,5,0
1327 %macro COEFF_LEVELRUN 1
1328 cglobal coeff_level_run%1,0,7
1332 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
1334 shl t5d, 32-((%1+1)&~1)
1336 LZCOUNT t3d, t5d, 0x1f
1343 LZCOUNT t3d, t5d, 0x1f
1344 %ifdef HIGH_BIT_DEPTH
1346 mov [t1+t6 +4+16*4], t3b
1347 mov [t1+t6*4+ 4], t2d
1350 mov [t1+t6 +4+16*2], t3b
1351 mov [t1+t6*2+ 4], t2w
1370 INIT_XMM sse2, lzcnt
1373 INIT_MMX mmx2, lzcnt