1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2012 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <hengar-6@student.ltu.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; NOTE(review): fragmentary view — interior lines are elided. These dw rows
; appear to be the body of a DQM8-style macro expanding six distinct scale
; values into an 8x8 dequant scale row pattern, followed by the six per-QP%6
; invocations that build the table — TODO confirm against the full file.
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 ; last line not used, just padding for power-of-2 stride
55 DQM8 20, 18, 32, 19, 25, 24
56 DQM8 22, 19, 35, 21, 28, 26
57 DQM8 26, 23, 42, 24, 33, 31
58 DQM8 28, 25, 45, 26, 35, 33
59 DQM8 32, 28, 51, 30, 40, 38
60 DQM8 36, 32, 58, 34, 46, 43
; NOTE(review): byte LUT whose label is elided from this view; presumably a
; decimate score/mask lookup table (see decimate_score below) — verify.
63 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
64 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
65 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
66 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
67 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
68 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
69 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
70 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
71 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks loaded into m3/m5 by optimize_chroma_2x2_dc and applied via
; PSIGNW. The +1/-1 variants suit the SSSE3 psignw instruction; the 0/-1
; *_mmx variants are selected on the non-SSSE3 path — presumably because
; the PSIGNW emulation there needs all-ones words, TODO confirm.
73 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
74 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
75 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
76 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
; NOTE(review): HIGH_BIT_DEPTH (32-bit coeff) quantization helper macros.
; The macro bodies are mostly elided from this view; only structural
; comments are added here.
86 %macro QUANT_DC_START 0
92 %elif cpuflag(sse4) ; ssse3, but not faster on conroe
118 cmp ecx, (1<<mmsize)-1
130 %macro QUANT_ONE_DC 4
; QUANT_TWO_DC processes two vectors per call by expanding QUANT_ONE_DC
; twice, the second at an mmsize offset (visible below).
154 %macro QUANT_TWO_DC 4
173 QUANT_ONE_DC %1, %2, %3, %4
174 QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
178 %macro QUANT_ONE_AC_MMX 4
; QUANT_TWO_AC: second lane reads its own mf/bias at +mmsize (per-coeff
; tables), unlike the DC variants which broadcast a single mf/bias.
197 %macro QUANT_TWO_AC 4
204 paddd m3, [%3+mmsize]
206 pmulld m3, [%2+mmsize]
216 QUANT_ONE_AC_MMX %1, %2, %3, %4
217 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
221 ;-----------------------------------------------------------------------------
222 ; int quant_2x2( int32_t dct[M*N], int mf, int bias )
223 ;-----------------------------------------------------------------------------
225 cglobal quant_%1x%2_dc, 3,3,8
227 %if %1*%2 <= mmsize/4
228 QUANT_ONE_DC r0, m6, m7, 0
231 %rep %1*%2/(mmsize/2)
232 QUANT_TWO_DC r0+x, m6, m7, x
240 ;-----------------------------------------------------------------------------
241 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
242 ;-----------------------------------------------------------------------------
244 cglobal quant_%1x%2, 3,3,8
246 %rep %1*%2/(mmsize/2)
247 QUANT_TWO_AC r0+x, r1+x, r2+x, x
272 %endif ; HIGH_BIT_DEPTH
274 %if HIGH_BIT_DEPTH == 0
; 16-bit coefficient quantization. The visible core computes
; ((|coef|+bias)*mf)>>16 with the original sign restored — the abs step
; is elided from this view but implied by the "restore sign" PSIGNW.
276 ;;; %1 (m64) dct[y][x]
277 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
278 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
279 mova m1, %1 ; load dct coeffs
281 paddusw m0, %3 ; round
282 pmulhuw m0, %2 ; divide
283 PSIGNW m0, m1 ; restore sign
305 ;-----------------------------------------------------------------------------
306 ; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
307 ;-----------------------------------------------------------------------------
308 %macro QUANT_DC 2-3 0
312 QUANT_ONE [r0], m6, m7, 0
316 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
324 ;-----------------------------------------------------------------------------
325 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
326 ;-----------------------------------------------------------------------------
331 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
; Per-ISA instantiations (INIT_* lines elided in this view).
339 QUANT_DC quant_2x2_dc, 1
340 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
341 QUANT_DC quant_4x4_dc, 4
343 QUANT_AC quant_4x4, 4
344 QUANT_AC quant_8x8, 16
348 QUANT_DC quant_4x4_dc, 2, 8
349 QUANT_AC quant_4x4, 2
350 QUANT_AC quant_8x8, 8
353 QUANT_DC quant_4x4_dc, 2, 8
354 QUANT_AC quant_4x4, 2
355 QUANT_AC quant_8x8, 8
358 QUANT_DC quant_2x2_dc, 1
361 ;Not faster on Conroe, so only used in SSE4 versions
362 QUANT_DC quant_4x4_dc, 2, 8
363 QUANT_AC quant_4x4, 2
364 QUANT_AC quant_8x8, 8
365 %endif ; !HIGH_BIT_DEPTH
369 ;=============================================================================
371 ;=============================================================================
; NOTE(review): dequantization section; most macro bodies are elided in
; this view. Two code paths exist: left-shift for qbits >= 0 and a
; 32-bit right-shift path for negative qbits (see DEQUANT_START below).
375 ;;; %2,%3 dequant_mf[i_mf][y][x]
391 ;;; %2,%3 dequant_mf[i_mf][y][x]
397 pmadcswd m0, m0, %2, m3
402 pmadcswd m0, m0, %2, m3
403 pmadcswd m1, m1, %3, m3
411 %macro DEQUANT_LOOP 3
415 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
416 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
421 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
422 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
427 %macro DEQUANT16_FLAT 2-5
; Temp-register assignments differ per ABI (Win64 / Unix64 / x86-32).
445 DECLARE_REG_TMP 6,3,2
447 DECLARE_REG_TMP 4,3,2
449 DECLARE_REG_TMP 2,0,1
; DEQUANT_START: split i_qp into i_qbits = qp/6 and i_mf = qp%6, then
; point r1 at dequant_mf[i_mf] (division steps elided in this view).
452 %macro DEQUANT_START 2
455 shr t0d, 8 ; i_qbits = i_qp / 6
458 sub t2d, t1d ; i_mf = i_qp % 6
461 add r1, t2 ; dequant_mf[i_mf]
463 add r1, r1mp ; dequant_mf[i_mf]
467 jl .rshift32 ; negative qbits => rightshift
470 ;-----------------------------------------------------------------------------
471 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
472 ;-----------------------------------------------------------------------------
474 cglobal dequant_%1x%1, 0,3,6
476 DEQUANT_START %2+2, %2
480 DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
489 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
; flat16 variant: for high QPs (qbits >= 0) a flat scale table suffices;
; lower QPs tail-call the generic version's .skip_prologue.
491 %if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
492 cglobal dequant_%1x%1_flat16, 0,3
496 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
500 shr t0d, 8 ; i_qbits = i_qp / 6
503 sub t2d, t1d ; i_mf = i_qp % 6
506 lea r1, [dequant%1_scale]
509 lea r1, [dequant%1_scale + t2]
515 DEQUANT16_FLAT [r1], 0, 16
516 DEQUANT16_FLAT [r1+8], 8, 24
518 DEQUANT16_FLAT [r1], 0, 16
521 DEQUANT16_FLAT [r1], 0, 8, 64, 72
522 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
523 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
524 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
526 DEQUANT16_FLAT [r1], 0, 64
527 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
528 DEQUANT16_FLAT [r1+32], 32, 96
531 %endif ; !HIGH_BIT_DEPTH && !AVX
; NOTE(review): DC dequantization; body heavily elided in this view.
; The visible loop processes two vectors of dct[] per iteration in place.
559 cglobal dequant_4x4dc, 0,3,6
568 %rep SIZEOF_PIXEL*16/mmsize
569 mova m0, [r0+mmsize*0+x]
570 mova m1, [r0+mmsize*1+x]
573 mova [r0+mmsize*0+x], m0
574 mova [r0+mmsize*1+x], m1
590 %rep SIZEOF_PIXEL*32/mmsize
592 pmadcswd m0, m0, m2, m4
598 %else ; !HIGH_BIT_DEPTH
601 %rep SIZEOF_PIXEL*32/mmsize
613 %endif ; !HIGH_BIT_DEPTH
619 DEQUANT_DC d, pmaddwd
621 DEQUANT_DC d, pmaddwd
633 ; t4 is eax for return value.
635 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
637 DECLARE_REG_TMP 4,1,2,3,0,5
640 ;-----------------------------------------------------------------------------
641 ; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
642 ;-----------------------------------------------------------------------------
; NOTE(review): body partially elided in this view. The visible core
; computes the four 2x2 Hadamard-transformed DC sums via sign masks +
; pmaddwd, then iterates coefficients trying to round them toward zero.
644 %macro OPTIMIZE_CHROMA_2x2_DC 0
647 %assign %%regs %%regs-1
650 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
652 cglobal optimize_chroma_2x2_dc, 0,%%regs,7
; SSSE3+ uses the +/-1 masks (psignw); older ISAs the 0/-1 *_mmx masks.
663 mova m3, [chroma_dc_dct_mask]
664 mova m5, [chroma_dc_dmf_mask]
666 mova m3, [chroma_dc_dct_mask_mmx]
667 mova m5, [chroma_dc_dmf_mask_mmx]
670 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
672 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
673 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
674 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
675 PSIGNW m2, m5 ; + - - + - - + +
676 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
677 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
679 psrad m2, 16 ; + - - +
683 %if notcpuflag(ssse3)
684 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
696 jz .ret ; if the DC coefficients already round to zero, terminate early
699 movsx t3d, word [t0+2*t1] ; dct[coeff]
701 pshufd m1, m1, q2100 ; move the next element to high dword
710 psubd m3, m5 ; coeff -= sign
721 paddd m3, m5 ; coeff += sign
726 pshufd m2, m2, q1320 ; - + - + / - - + +
737 punpcklqdq m2, m2 ; + + + +
744 %if HIGH_BIT_DEPTH == 0
746 OPTIMIZE_CHROMA_2x2_DC
748 OPTIMIZE_CHROMA_2x2_DC
750 OPTIMIZE_CHROMA_2x2_DC
752 OPTIMIZE_CHROMA_2x2_DC
753 %endif ; !HIGH_BIT_DEPTH
756 ;-----------------------------------------------------------------------------
757 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
758 ;-----------------------------------------------------------------------------
; HIGH_BIT_DEPTH (32-bit coeff) variant. Visible pattern: walk the arrays
; backwards two vectors per iteration (r3 = remaining count), subtract
; offset[] from |dct[]| (abs/sign steps elided in this view), write the
; result back, and accumulate |dct[]| into sum[].
760 cglobal denoise_dct, 4,4,8
764 mova m2, [r0+r3*4-2*mmsize]
765 mova m3, [r0+r3*4-1*mmsize]
770 psubd m0, [r2+r3*4-2*mmsize]
771 psubd m1, [r2+r3*4-1*mmsize]
778 mova [r0+r3*4-2*mmsize], m0
779 mova [r0+r3*4-1*mmsize], m1
780 paddd m4, [r1+r3*4-2*mmsize]
781 paddd m5, [r1+r3*4-1*mmsize]
782 mova [r1+r3*4-2*mmsize], m4
783 mova [r1+r3*4-1*mmsize], m5
800 %else ; !HIGH_BIT_DEPTH
802 ;-----------------------------------------------------------------------------
803 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
804 ;-----------------------------------------------------------------------------
; 16-bit variant: psubusw gives a saturating max(|dct|-offset, 0); the
; 16-bit magnitudes are widened to 32-bit before accumulating into sum[]
; (widening step elided in this view).
806 cglobal denoise_dct, 4,4,7
810 mova m2, [r0+r3*2-2*mmsize]
811 mova m3, [r0+r3*2-1*mmsize]
814 psubusw m4, m0, [r2+r3*2-2*mmsize]
815 psubusw m5, m1, [r2+r3*2-1*mmsize]
818 mova [r0+r3*2-2*mmsize], m4
819 mova [r0+r3*2-1*mmsize], m5
824 paddd m2, [r1+r3*4-4*mmsize]
825 paddd m0, [r1+r3*4-3*mmsize]
826 paddd m3, [r1+r3*4-2*mmsize]
827 paddd m1, [r1+r3*4-1*mmsize]
828 mova [r1+r3*4-4*mmsize], m2
829 mova [r1+r3*4-3*mmsize], m0
830 mova [r1+r3*4-2*mmsize], m3
831 mova [r1+r3*4-1*mmsize], m1
848 %endif ; !HIGH_BIT_DEPTH
850 ;-----------------------------------------------------------------------------
851 ; int decimate_score( dctcoef *dct )
852 ;-----------------------------------------------------------------------------
; DECIMATE_MASK builds a bitmask of nonzero coefficients (and a mask of
; |coef|>1) by packing/abs-ing the dct block; most of the body is elided
; in this view. The xmm path handles 32-bit coeffs via packssdw.
854 %macro DECIMATE_MASK 5
859 packssdw xmm0, [%3+16]
860 packssdw xmm1, [%3+48]
861 ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
863 ABSW xmm0, [%3+ 0], xmm3
864 ABSW xmm1, [%3+16], xmm4
879 packssdw mm0, [%3+ 8]
880 packssdw mm1, [%3+24]
881 packssdw mm2, [%3+40]
882 packssdw mm3, [%3+56]
889 ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
890 ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
910 cextern decimate_table4
911 cextern decimate_table8
915 ;A LUT is faster than bsf on older AMD processors.
916 ;This is not true for score64.
917 cglobal decimate_score%1, 1,3
919 lea r4, [decimate_table4]
920 lea r5, [decimate_mask_table4]
922 %define mask_table r5
924 %define table decimate_table4
925 %define mask_table decimate_mask_table4
927 DECIMATE_MASK edx, eax, r0, [pb_1], ecx
; Score accumulation: walk runs of zeros via the mask and sum per-run
; costs from decimate_table4 (loop control elided in this view).
937 movzx eax, byte [mask_table + rcx]
946 add al, byte [table + rcx]
947 add al, byte [mask_table + rdx]
952 add al, byte [table + rcx]
968 INIT_MMX mmx2, slowctz
975 INIT_XMM sse2, slowctz
981 INIT_XMM ssse3, slowctz
; score64: combine four 16-coeff masks into one 64-bit mask, then score.
988 cglobal decimate_score64, 1,5
990 lea r4, [decimate_table8]
993 %define table decimate_table8
996 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
999 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
1002 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
1006 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
1016 add al, byte [table + rcx]
1027 cglobal decimate_score64, 1,6
1029 cglobal decimate_score64, 1,5
1032 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
1035 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
1038 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
1040 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
1048 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
1055 add r0b, byte [decimate_table8 + ecx]
1058 cmp r0, 6 ;score64's threshold is never higher than 6
1059 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
1085 %if ARCH_X86_64 == 0
1094 ;-----------------------------------------------------------------------------
1095 ; int coeff_last( dctcoef *dct )
1096 ;-----------------------------------------------------------------------------
; LAST_MASK builds a per-coefficient nonzero bitmask (%2 = dest GPR,
; %3 = src pointer, optional %4 = scratch). HIGH_BIT_DEPTH variant first:
; 32-bit coeffs are narrowed with packssdw before the zero test; bodies
; are partially elided in this view.
1117 %macro LAST_MASK 3-4
1120 packssdw mm0, [%3+8]
1125 movdqa xmm0, [%3+ 0]
1127 packssdw xmm0, [%3+16]
1130 movdqa xmm1, [%3+32]
1131 packssdw xmm0, [%3+16]
1132 packssdw xmm1, [%3+48]
1140 packssdw mm0, [%3+ 8]
1141 packssdw mm1, [%3+24]
1148 packssdw mm0, [%3+ 8]
1149 packssdw mm1, [%3+24]
1152 packssdw mm3, [%3+40]
1153 packssdw mm4, [%3+56]
; coeff_last4/8: mask then find highest set bit (bsr/lzcnt variants,
; elided in this view).
1165 %macro COEFF_LAST4 0
1166 cglobal coeff_last4, 1,3
1168 LAST_MASK 4, r1d, r0
1177 INIT_MMX mmx2, lzcnt
1180 %macro COEFF_LAST8 0
1181 cglobal coeff_last8, 1,3
1183 LAST_MASK 8, r1d, r0
1194 %if ARCH_X86_64 == 0
1200 INIT_XMM sse2, lzcnt
1203 %else ; !HIGH_BIT_DEPTH
; 16-bit coefficient variant: narrows with packsswb instead.
1204 %macro LAST_MASK 3-4
1210 packsswb mm0, [%3+ 8]
1215 movdqa xmm0, [%3+ 0]
1216 packsswb xmm0, [%3+16]
1222 packsswb mm0, [%3+ 8]
1223 packsswb mm1, [%3+24]
1233 %macro COEFF_LAST48 0
1235 cglobal coeff_last4, 1,1
1240 cglobal coeff_last4, 0,3
1249 lea eax, [eax+ecx*2]
1253 cglobal coeff_last8, 1,3
1255 LAST_MASK 8, r1d, r0, r2d
1263 INIT_MMX mmx2, lzcnt
1265 %endif ; HIGH_BIT_DEPTH
; coeff_last15/16/64: index of the last nonzero coefficient, via LAST_MASK
; plus a highest-set-bit scan (scan instructions elided in this view).
; coeff_last15 biases the pointer by -SIZEOF_DCTCOEF to cover a 15-coeff
; block with a 16-wide mask.
1268 cglobal coeff_last15, 1,3
1270 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
1276 cglobal coeff_last16, 1,3
1278 LAST_MASK 16, r1d, r0, r2d
; 64-coeff version: x86-32 builds four 16-bit masks (high half first so
; the common sparse case can exit early); x86-64 combines all four.
1283 %if ARCH_X86_64 == 0
1284 cglobal coeff_last64, 1, 5-mmsize/16
1286 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
1287 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
1292 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
1293 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
1304 cglobal coeff_last64, 1,4
1306 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
1307 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
1308 LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
1309 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
1322 %if ARCH_X86_64 == 0
1328 INIT_XMM sse2, lzcnt
1331 ;-----------------------------------------------------------------------------
1332 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1333 ;-----------------------------------------------------------------------------
1335 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1337 DECLARE_REG_TMP 3,1,2,0,4,5,6
1339 DECLARE_REG_TMP 0,1,2,3,4,5,6
1341 DECLARE_REG_TMP 6,3,2,1,4,5,0
; Builds the (level, run) list from the nonzero-coefficient mask: LZCOUNT
; locates each next nonzero coeff, which is then stored into the runlevel
; struct (4-byte slots for 32-bit coeffs, 2-byte otherwise). Loop control
; is elided in this view.
1344 %macro COEFF_LEVELRUN 1
1345 cglobal coeff_level_run%1,0,7
1349 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
1361 LZCOUNT t3d, t5d, 0x1f
1368 LZCOUNT t3d, t5d, 0x1f
1377 mov [t1+t6*4+ 8], t2d
1379 mov [t1+t6*2+ 8], t2w
1388 %if ARCH_X86_64 == 0
1400 INIT_XMM sse2, lzcnt
1406 INIT_MMX mmx2, lzcnt