1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <hengar-6@student.ltu.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Rows of a dequant scale-table macro (the %macro header is outside this
; fragment); %1..%6 are the six distinct scale values of one QP%6 row.
    dw %1, %2, %1, %2, %2, %3, %2, %3
    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
; last line not used, just padding for power-of-2 stride
; 8x8 dequant scale values for qp%6 = 0..5.  NOTE(review): presumably the
; H.264 spec dequant8 table -- DQM8's definition is not visible in this
; fragment, confirm argument order against it.
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
DQM8 26, 23, 42, 24, 33, 31
DQM8 28, 25, 45, 26, 35, 33
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
; Byte LUT (its label is elided in this fragment -- presumably
; decimate_mask_table4, used by decimate_score below; verify the label in the
; full file).  Values are precomputed per-mask decimate costs.
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks for the 2x2 chroma DC (de)quant butterfly.
; The *_mmx variants use 0/-1 words; the plain variants use 1/-1.
; NOTE(review): presumably the 0/-1 form suits the pxor/psubw PSIGNW
; emulation while ssse3 psignw needs +/-1 (a 0 mask would zero the lane) --
; confirm against the PSIGNW macro in x86util.asm.
chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
chroma_dc_dct_mask:     dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask:     dw 1, 1,-1,-1, 1,-1,-1, 1
; QUANT_DC_START: broadcast the scalar mf/bias args into vector registers for
; the DC quant kernels.  (Fragment: the macro body and the surrounding
; cpuflag branches are mostly elided from this view.)
%macro QUANT_DC_START 0
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
    cmp ecx, (1<<mmsize)-1
%ifdef HIGH_BIT_DEPTH
; --- high-bit-depth quantization: 32-bit coefficients ---
; Fragment: the QUANT_ONE_DC/QUANT_ONE_AC macro bodies are partially elided.
%macro QUANT_ONE_DC 4
%macro QUANT_TWO_DC 4
    QUANT_ONE_DC %1, %2, %3, %4
    QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
%macro QUANT_ONE_AC_MMX 4
%macro QUANT_TWO_AC 4
    paddd  m3, [%3+mmsize]      ; + bias
    pmulld m3, [%2+mmsize]      ; * mf (sse4)
    QUANT_ONE_AC_MMX %1, %2, %3, %4
    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
;-----------------------------------------------------------------------------
; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
cglobal quant_%1x%2_dc, 3,3,8
%if %1*%2 <= mmsize/4
    QUANT_ONE_DC r0, m6, m7, 0
%rep %1*%2/(mmsize/2)
    QUANT_TWO_DC r0+x, m6, m7, x
;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
cglobal quant_%1x%2, 3,3,8
%rep %1*%2/(mmsize/2)
    QUANT_TWO_AC r0+x, r1+x, r2+x, x
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
; --- 8-bit-depth quantization: 16-bit coefficients ---
; Fragment: the QUANT_ONE/QUANT_TWO/QUANT_AC macro shells are partially elided.
;;; %1      (m64)       dct[y][x]
;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
    mova       m1, %1   ; load dct coeffs
    paddusw    m0, %3   ; round
    pmulhuw    m0, %2   ; divide
    PSIGNW     m0, m1   ; restore sign
;-----------------------------------------------------------------------------
; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
    QUANT_ONE [r0], m6, m7, 0
    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
;-----------------------------------------------------------------------------
; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
; Template instantiations per ISA (the INIT_* lines between groups are elided
; in this fragment).
QUANT_DC quant_2x2_dc, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
QUANT_AC quant_4x4, 4
QUANT_AC quant_8x8, 16
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_DC quant_2x2_dc, 1
; Not faster on Conroe, so only used in SSE4 versions
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; dequant
;=============================================================================
; Fragment: the DEQUANT16_L / DEQUANT32_R / DEQUANT16_FLAT macro bodies are
; partially elided from this view.
;;; %2,%3  dequant_mf[i_mf][y][x]
%ifdef HIGH_BIT_DEPTH
;;; %2,%3  dequant_mf[i_mf][y][x]
%ifdef HIGH_BIT_DEPTH
    pmadcswd m0, m0, %2, m3     ; XOP multiply-add with accumulate
    pmadcswd m0, m0, %2, m3
    pmadcswd m1, m1, %3, m3
%macro DEQUANT_LOOP 3
    %1 [r0+(t0     )*SIZEOF_PIXEL], [r1+t0*2      ], [r1+t0*2+ 8*%3]
    %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
    %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3]
%macro DEQUANT16_FLAT 2-5
; Temp-register assignment differs per ABI (the %if arch guards are elided).
DECLARE_REG_TMP 6,3,2
DECLARE_REG_TMP 4,3,2
DECLARE_REG_TMP 2,0,1
%macro DEQUANT_START 2
    shr  t0d, 8         ; i_qbits = i_qp / 6
    sub  t2d, t1d       ; i_mf = i_qp % 6
    add  r1, t2         ; dequant_mf[i_mf]
    add  r1, r1mp       ; dequant_mf[i_mf]
    jl .rshift32        ; negative qbits => rightshift
;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
cglobal dequant_%1x%1, 0,3,6
    DEQUANT_START %2+2, %2
    DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
%ifndef HIGH_BIT_DEPTH
cglobal dequant_%1x%1_flat16, 0,3
    jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue ; fall back to the generic path
    shr  t0d, 8         ; i_qbits = i_qp / 6
    sub  t2d, t1d       ; i_mf = i_qp % 6
    lea  r1, [dequant%1_scale]
    lea  r1, [dequant%1_scale + t2]
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
; --- dequant_4x4dc / DEQUANT_DC (fragment: loop shells partially elided) ---
cglobal dequant_4x4dc, 0,3,6
%rep SIZEOF_PIXEL*16/mmsize
    mova m0, [r0+mmsize*0+x]
    mova m1, [r0+mmsize*1+x]
    mova [r0+mmsize*0+x], m0
    mova [r0+mmsize*1+x], m1
%ifdef HIGH_BIT_DEPTH
%rep SIZEOF_PIXEL*32/mmsize
    pmadcswd m0, m0, m2, m4     ; XOP multiply-add with accumulate
%else ; !HIGH_BIT_DEPTH
%rep SIZEOF_PIXEL*32/mmsize
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
DEQUANT_DC d, pmaddwd
DEQUANT_DC d, pmaddwd
; t4 is eax for return value.
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
DECLARE_REG_TMP 4,1,2,3,0,5
;-----------------------------------------------------------------------------
; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
; Fragment: the ssse3/mmx mask loads below are alternative branches of an
; elided %if cpuflag(ssse3) -- only one pair executes per build.
%macro OPTIMIZE_CHROMA_2x2_DC 0
%assign %%regs %%regs-1
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
    mova m3, [chroma_dc_dct_mask]
    mova m5, [chroma_dc_dmf_mask]
    mova m3, [chroma_dc_dct_mask_mmx]
    mova m5, [chroma_dc_dmf_mask_mmx]
    pshufd     m0, m1, q0101    ;  1  0  3  2   1  0  3  2
    punpcklqdq m1, m1           ;  3  2  1  0   3  2  1  0
    mova       m6, [pd_1024]    ; 32<<5, elements are shifted 5 bits to the left
    PSIGNW     m0, m3           ; -1 -0  3  2  -1 -0  3  2
    PSIGNW     m2, m5           ;  +  -  -  +   -  -  +  +
    paddw      m0, m1           ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
    pmaddwd    m0, m2           ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
    psrad      m2, 16           ;  +  -  -  +
%if notcpuflag(ssse3)
    psrad m1, 31                ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
    jz .ret                     ; if the DC coefficients already round to zero, terminate early
    movsx t3d, word [t0+2*t1]   ; dct[coeff]
    pshufd m1, m1, q2100        ; move the next element to high dword
    psubd m3, m5                ; coeff -= sign
    paddd m3, m5                ; coeff += sign
    pshufd m2, m2, q1320        ; - + - + / - - + +
    punpcklqdq m2, m2           ; + + + +
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
; Fragment of the per-iteration body; the abs/sign-restore steps that produce
; m0/m1/m4/m5 are elided here -- presumably m0/m1 hold |dct| (verify in the
; full file).  r3 counts down over the coefficient array.
cglobal denoise_dct, 4,4,8
    mova m2, [r0+r3*4-2*mmsize]         ; load dct coeffs
    mova m3, [r0+r3*4-1*mmsize]
    psubd m0, [r2+r3*4-2*mmsize]        ; subtract offset[]
    psubd m1, [r2+r3*4-1*mmsize]
    mova [r0+r3*4-2*mmsize], m0         ; store denoised coeffs
    mova [r0+r3*4-1*mmsize], m1
    paddd m4, [r1+r3*4-2*mmsize]        ; accumulate into sum[]
    paddd m5, [r1+r3*4-1*mmsize]
    mova [r1+r3*4-2*mmsize], m4
    mova [r1+r3*4-1*mmsize], m5
%else ; !HIGH_BIT_DEPTH
; 16-bit-coefficient variant (fragment; loop control and abs/sign steps are
; elided -- presumably m0/m1 hold |dct|, verify in the full file).
;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
cglobal denoise_dct, 4,4,7
    mova m2, [r0+r3*2-2*mmsize]             ; load dct coeffs
    mova m3, [r0+r3*2-1*mmsize]
    psubusw m4, m0, [r2+r3*2-2*mmsize]      ; saturating subtract clamps at 0
    psubusw m5, m1, [r2+r3*2-1*mmsize]
    mova [r0+r3*2-2*mmsize], m4             ; store denoised coeffs
    mova [r0+r3*2-1*mmsize], m5
    paddd m2, [r1+r3*4-4*mmsize]            ; accumulate 32-bit sums
    paddd m0, [r1+r3*4-3*mmsize]
    paddd m3, [r1+r3*4-2*mmsize]
    paddd m1, [r1+r3*4-1*mmsize]
    mova [r1+r3*4-4*mmsize], m2
    mova [r1+r3*4-3*mmsize], m0
    mova [r1+r3*4-2*mmsize], m3
    mova [r1+r3*4-1*mmsize], m1
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
; Fragment: DECIMATE_MASK's %if branches and the score loops are partially
; elided from this view.
%macro DECIMATE_MASK 5
%ifdef HIGH_BIT_DEPTH
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
    ABSW xmm0, [%3+ 0], xmm3
    ABSW xmm1, [%3+16], xmm4
%ifdef HIGH_BIT_DEPTH
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm2, [%3+40]
    packssdw mm3, [%3+56]
    ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
    ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
cextern decimate_table4
cextern decimate_table8
; A LUT is faster than bsf on AMD processors.
; This is not true for score64.
cglobal decimate_score%1, 1,3
    lea r10, [decimate_table4]
    lea r11, [decimate_mask_table4]
%define mask_table r11
%define table decimate_table4
%define mask_table decimate_mask_table4
    DECIMATE_MASK edx, eax, r0, [pb_1], ecx
    movzx eax, byte [mask_table + rcx]
    add al, byte [table + rcx]
    add al, byte [mask_table + rdx]
    add al, byte [table + rcx]
INIT_MMX mmx2, slowctz
INIT_XMM sse2, slowctz
INIT_XMM ssse3, slowctz
cglobal decimate_score64, 1,4
    lea r10, [decimate_table8]
%define table decimate_table8
    DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
    DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
    add al, byte [table + rcx]
cglobal decimate_score64, 1,6
cglobal decimate_score64, 1,5
    DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
    DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
    DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
    DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
    jne .ret9   ; r0 is zero at this point, so we don't need to zero it
    add r0b, byte [decimate_table8 + ecx]
    cmp r0, 6   ; score64's threshold is never higher than 6
    jge .ret9   ; this early termination is only useful on 32-bit because it can be done in the latency after shrd
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
; Fragment: LAST_MASK branches and the bsr/lzcnt epilogues are partially
; elided.  LAST_MASK builds a bitmask of nonzero coefficients; high-bit-depth
; packs dwords to words (packssdw) before the compare, 8-bit packs words to
; bytes (packsswb).
%ifdef HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
    packssdw mm0, [%3+8]
    movdqa xmm0, [%3+ 0]
    packssdw xmm0, [%3+16]
    movdqa xmm1, [%3+32]
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm3, [%3+40]
    packssdw mm4, [%3+56]
%macro COEFF_LAST4 0
cglobal coeff_last4, 1,3
    LAST_MASK 4, r1d, r0
INIT_MMX mmx2, lzcnt
%macro COEFF_LAST8 0
cglobal coeff_last8, 1,3
    LAST_MASK 8, r1d, r0
INIT_XMM sse2, lzcnt
%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
    packsswb mm0, [%3+ 8]
    movdqa xmm0, [%3+ 0]
    packsswb xmm0, [%3+16]
    packsswb mm0, [%3+ 8]
    packsswb mm1, [%3+24]
%macro COEFF_LAST48 0
cglobal coeff_last4, 1,1
cglobal coeff_last4, 0,3
    lea eax, [eax+ecx*2]
cglobal coeff_last8, 1,3
    LAST_MASK 8, r1d, r0, r2d
INIT_MMX mmx2, lzcnt
%endif ; HIGH_BIT_DEPTH
cglobal coeff_last15, 1,3
    LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
cglobal coeff_last16, 1,3
    LAST_MASK 16, r1d, r0, r2d
cglobal coeff_last64, 1, 5-mmsize/16
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
cglobal coeff_last64, 1,4
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
    LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
INIT_XMM sse2, lzcnt
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
; Fragment: the arch %if guards around the DECLARE_REG_TMP alternatives and
; the scan loop are partially elided from this view.
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
DECLARE_REG_TMP 3,1,2,0,4,5,6
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
DECLARE_REG_TMP 6,3,2,1,4,5,0
%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
    LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
    shl t5d, 32-((%1+1)&~1)     ; left-justify the nonzero mask
    LZCOUNT t3d, t5d, 0x1f
    LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
    mov [t1+t6 +4+16*4], t3b    ; runlevel->run[] (32-bit coeffs)
    mov [t1+t6*4+ 4], t2d       ; runlevel->level[]
    mov [t1+t6 +4+16*2], t3b    ; runlevel->run[] (16-bit coeffs)
    mov [t1+t6*2+ 4], t2w       ; runlevel->level[]
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2, lzcnt
%ifdef HIGH_BIT_DEPTH
INIT_MMX mmx2, lzcnt