1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
34 dw %1, %2, %1, %2, %2, %3, %2, %3
37 dw %1, %4, %5, %4, %1, %4, %5, %4
38 dw %4, %2, %6, %2, %4, %2, %6, %2
39 dw %5, %6, %3, %6, %5, %6, %3, %6
40 ; last line not used, just padding for power-of-2 stride
; 8x8 dequant scale table: one DQM8 invocation per i_mf row (i_mf = i_qp % 6,
; rows 0..5).  Each row passes the 6 distinct coefficients of the symmetric
; 8x8 dequant matrix; the DQM8 macro (defined earlier in the file, not visible
; in this chunk -- TODO confirm) expands them into the full interleaved table.
; Values match the H.264 default 8x8 dequant norm-adjust constants.
53 DQM8 20, 18, 32, 19, 25, 24
54 DQM8 22, 19, 35, 21, 28, 26
55 DQM8 26, 23, 42, 24, 33, 31
56 DQM8 28, 25, 45, 26, 35, 33
57 DQM8 32, 28, 51, 30, 40, 38
58 DQM8 36, 32, 58, 34, 46, 43
; Precomputed decimate-score lookup table, indexed by a coefficient bitmask.
; Presumably this is decimate_mask_table4, consumed by x264_decimate_score*
; below ("A LUT is faster than bsf on AMD processors") -- the label line is
; not visible in this chunk; TODO confirm.  Each byte appears to be the
; accumulated decimate score for the run-lengths encoded by one mask value.
61 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
62 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
63 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
64 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
65 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
66 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
67 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
68 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
69 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
73 %macro QUANT_DC_START 0
104 %macro PSIGNW_SSSE3 2
109 ;;; %1 (m64) dct[y][x]
110 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
111 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
112 mova m1, %1 ; load dct coeffs
114 paddusw m0, %3 ; round
115 pmulhuw m0, %2 ; divide
116 PSIGNW m0, m1 ; restore sign
120 ;-----------------------------------------------------------------------------
121 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
122 ;-----------------------------------------------------------------------------
128 QUANT_ONE [r0+x], m6, m7
134 ;-----------------------------------------------------------------------------
135 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
136 ;-----------------------------------------------------------------------------
141 QUANT_ONE [r0+x], [r1+x], [r2+x]
148 %define PABSW PABSW_MMX
149 %define PSIGNW PSIGNW_MMX
150 QUANT_DC x264_quant_2x2_dc_mmxext, 1
151 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
152 QUANT_DC x264_quant_4x4_dc_mmxext, 4
153 QUANT_AC x264_quant_4x4_mmx, 4
154 QUANT_AC x264_quant_8x8_mmx, 16
158 QUANT_DC x264_quant_4x4_dc_sse2, 2
159 QUANT_AC x264_quant_4x4_sse2, 2
160 QUANT_AC x264_quant_8x8_sse2, 8
162 %define PABSW PABSW_SSSE3
163 %define PSIGNW PSIGNW_SSSE3
164 QUANT_DC x264_quant_4x4_dc_ssse3, 2
165 QUANT_AC x264_quant_4x4_ssse3, 2
166 QUANT_AC x264_quant_8x8_ssse3, 8
169 QUANT_DC x264_quant_2x2_dc_ssse3, 1
173 ;=============================================================================
175 ;=============================================================================
179 ;;; %2,%3 dequant_mf[i_mf][y][x]
191 ;;; %2,%3 dequant_mf[i_mf][y][x]
210 %macro DEQUANT_LOOP 3
214 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
215 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
220 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
221 %1 [r0 ], [r1 ], [r1+ 8*%3]
226 %macro DEQUANT16_FLAT 2-8
244 DECLARE_REG_TMP 4,3,2
246 DECLARE_REG_TMP 2,0,1
249 %macro DEQUANT_START 2
252 shr t0d, 8 ; i_qbits = i_qp / 6
255 sub t2d, t1d ; i_mf = i_qp % 6
258 add r1, t2 ; dequant_mf[i_mf]
260 add r1, r1m ; dequant_mf[i_mf]
264 jl .rshift32 ; negative qbits => rightshift
267 ;-----------------------------------------------------------------------------
268 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
269 ;-----------------------------------------------------------------------------
271 cglobal x264_dequant_%2x%2_%1, 0,3
272 DEQUANT_START %3+2, %3
276 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
281 mova m6, [pd_1 GLOBAL]
285 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
287 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
291 jl x264_dequant_%2x%2_%1
295 shr t0d, 8 ; i_qbits = i_qp / 6
298 sub t2d, t1d ; i_mf = i_qp % 6
301 lea r1, [dequant%2_scale GLOBAL]
304 lea r1, [dequant%2_scale + t2 GLOBAL]
310 DEQUANT16_FLAT [r1], 0, 16
311 DEQUANT16_FLAT [r1+8], 8, 24
313 DEQUANT16_FLAT [r1], 0, 16
316 DEQUANT16_FLAT [r1], 0, 8, 64, 72
317 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
318 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
319 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
321 DEQUANT16_FLAT [r1], 0, 64
322 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
323 DEQUANT16_FLAT [r1+32], 32, 96
334 DEQUANT sse2, 4, 4, 2
335 DEQUANT sse2, 8, 6, 2
338 cglobal x264_dequant_4x4dc_%1, 0,3
353 mova m0, [r0+mmsize*0+x]
354 mova m1, [r0+mmsize*1+x]
357 mova [r0+mmsize*0+x], m0
358 mova [r0+mmsize*1+x], m1
366 mova m6, [pw_1 GLOBAL]
399 ;-----------------------------------------------------------------------------
400 ; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
401 ;-----------------------------------------------------------------------------
403 cglobal x264_denoise_dct_%1, 4,5
404 movzx r4d, word [r0] ; backup DC coefficient
408 mova m2, [r0+r3*2+0*mmsize]
409 mova m3, [r0+r3*2+1*mmsize]
414 psubusw m0, [r2+r3*2+0*mmsize]
415 psubusw m1, [r2+r3*2+1*mmsize]
418 mova [r0+r3*2+0*mmsize], m0
419 mova [r0+r3*2+1*mmsize], m1
426 paddd m4, [r1+r3*4+0*mmsize]
427 paddd m2, [r1+r3*4+1*mmsize]
428 paddd m5, [r1+r3*4+2*mmsize]
429 paddd m3, [r1+r3*4+3*mmsize]
430 mova [r1+r3*4+0*mmsize], m4
431 mova [r1+r3*4+1*mmsize], m2
432 mova [r1+r3*4+2*mmsize], m5
433 mova [r1+r3*4+3*mmsize], m3
435 mov [r0], r4w ; restore DC coefficient
439 %define PABSW PABSW_MMX
440 %define PSIGNW PSIGNW_MMX
447 %define PABSW PABSW_SSSE3
448 %define PSIGNW PSIGNW_SSSE3
453 ;-----------------------------------------------------------------------------
454 ; int x264_decimate_score( int16_t *dct )
455 ;-----------------------------------------------------------------------------
457 %macro DECIMATE_MASK_SSE2 6
464 ABS2_MMX xmm0, xmm1, xmm3, xmm4
474 %macro DECIMATE_MASK_MMX 6
479 ABS2_MMX mm0, mm1, mm4, mm5
480 ABS2_MMX mm2, mm3, mm4, mm5
499 cextern x264_decimate_table4
500 cextern x264_decimate_table8
504 ; A LUT is faster than bsf on AMD processors, and no slower on Intel.
505 ; This is not true for score64.
506 cglobal x264_decimate_score%1_%2, 1,3
508 lea r10, [x264_decimate_table4 GLOBAL]
509 lea r11, [decimate_mask_table4 GLOBAL]
511 %define mask_table r11
513 %define table x264_decimate_table4
514 %define mask_table decimate_mask_table4
516 DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
525 movzx eax, byte [mask_table + rcx]
534 add al, byte [table + rcx]
535 add al, byte [mask_table + rdx]
545 %define DECIMATE_MASK DECIMATE_MASK_MMX
546 DECIMATE4x4 15, mmxext
547 DECIMATE4x4 16, mmxext
549 %define DECIMATE_MASK DECIMATE_MASK_SSE2
551 DECIMATE4x4 15, ssse3
553 DECIMATE4x4 16, ssse3
558 cglobal x264_decimate_score64_%1, 1,4
560 lea r10, [x264_decimate_table8 GLOBAL]
563 %define table x264_decimate_table8
565 mova m7, [pb_1 GLOBAL]
566 DECIMATE_MASK r1d, eax, r0, m7, %1, null
569 DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
572 DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
576 DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
586 add al, byte [table + rcx]
597 cglobal x264_decimate_score64_%1, 1,6
599 cglobal x264_decimate_score64_%1, 1,5
601 mova m7, [pb_1 GLOBAL]
602 DECIMATE_MASK r3, r2, r0, m7, %1, r5
605 DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
608 DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
610 DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
618 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
625 add r0b, byte [x264_decimate_table8 + ecx]
628 cmp r0, 6 ;score64's threshold is never higher than 6
629 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
657 %define DECIMATE_MASK DECIMATE_MASK_MMX
661 %define DECIMATE_MASK DECIMATE_MASK_SSE2
665 ;-----------------------------------------------------------------------------
666 ; int x264_coeff_last( int16_t *dct )
667 ;-----------------------------------------------------------------------------
669 %macro LAST_MASK_SSE2 2-3
672 packsswb xmm0, [%2+16]
677 %macro LAST_MASK_MMX 3
681 packsswb mm0, [%2+ 8]
682 packsswb mm1, [%2+24]
702 cglobal x264_coeff_last4_%1, 1,1
707 cglobal x264_coeff_last4_%1, 0,3
721 %define LAST LAST_X86
723 %define LAST LAST_SSE4A
724 COEFF_LAST4 mmxext_lzcnt
727 cglobal x264_coeff_last15_%1, 1,3
728 LAST_MASK r1d, r0-2, r2d
734 cglobal x264_coeff_last16_%1, 1,3
735 LAST_MASK r1d, r0, r2d
742 cglobal x264_coeff_last64_%1, 1,5
744 cglobal x264_coeff_last64_%1, 1,4
746 LAST_MASK r1d, r0, r4d
747 LAST_MASK r2d, r0+32, r4d
750 LAST_MASK r2d, r0+64, r4d
751 LAST_MASK r3d, r0+96, r4d
767 %macro COEFF_LAST64 1
768 cglobal x264_coeff_last64_%1, 1,4
769 LAST_MASK_SSE2 r1d, r0
770 LAST_MASK_SSE2 r2d, r0+32
771 LAST_MASK_SSE2 r3d, r0+64
772 LAST_MASK_SSE2 r0d, r0+96
784 %define LAST LAST_X86
786 %define LAST LAST_SSE4A
787 COEFF_LAST64 sse2_lzcnt
790 %define LAST LAST_X86
792 %define LAST_MASK LAST_MASK_MMX
795 %define LAST_MASK LAST_MASK_SSE2
797 %define LAST LAST_SSE4A
798 COEFF_LAST sse2_lzcnt
800 ;-----------------------------------------------------------------------------
801 ; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
802 ;-----------------------------------------------------------------------------
804 %macro LAST_MASK4_MMX 2-3
817 %macro LZCOUNT_SSE4A 3
821 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
823 DECLARE_REG_TMP 0,1,2,3,4,5,6
825 DECLARE_REG_TMP 6,3,2,1,4,5,0
828 %macro COEFF_LEVELRUN 2
829 cglobal x264_coeff_level_run%2_%1,0,7
832 LAST_MASK t5d, t0-(%2&1)*2, t4d
834 shl t5d, 32-((%2+1)&~1)
836 LZCOUNT t3d, t5d, 0x1f
843 LZCOUNT t3d, t5d, 0x1f
846 mov [t1+t6*2+ 4], t2w
855 %define LZCOUNT LZCOUNT_X86
857 %define LAST_MASK LAST_MASK_MMX
858 COEFF_LEVELRUN mmxext, 15
859 COEFF_LEVELRUN mmxext, 16
861 %define LAST_MASK LAST_MASK4_MMX
862 COEFF_LEVELRUN mmxext, 4
863 %define LAST_MASK LAST_MASK_SSE2
864 COEFF_LEVELRUN sse2, 15
865 COEFF_LEVELRUN sse2, 16
866 %define LZCOUNT LZCOUNT_SSE4A
867 COEFF_LEVELRUN sse2_lzcnt, 15
868 COEFF_LEVELRUN sse2_lzcnt, 16
869 %define LAST_MASK LAST_MASK4_MMX
870 COEFF_LEVELRUN mmxext_lzcnt, 4