;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86util.asm"

    dw %1, %2, %1, %2, %2, %3, %2, %3

    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
    ; last line not used, just padding for power-of-2 stride
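; The six DQM8 rows below are, presumably, the 8x8 dequant scale sets, one
; row per value of i_qp%6 (they match the standard H.264 8x8 norm-adjust
; values).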
    DQM8 20, 18, 32, 19, 25, 24
    DQM8 22, 19, 35, 21, 28, 26
    DQM8 26, 23, 42, 24, 33, 31
    DQM8 28, 25, 45, 26, 35, 33
    DQM8 32, 28, 51, 30, 40, 38
    DQM8 36, 32, 58, 34, 46, 43
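; Byte table below: presumably decimate_mask_table4 (referenced by the
; decimate_score routines further down), i.e. a precomputed decimate score
; for each possible bitmask of nonzero coefficient positions.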
    db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
    db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
    db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
    db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
%macro QUANT_DC_START 0

%macro PSIGNW_SSSE3 2

;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
    mova    m1, %1   ; load dct coeffs
    paddusw m0, %3   ; round
    pmulhuw m0, %2   ; divide
    PSIGNW  m0, m1   ; restore sign
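; For reference, a C sketch of what QUANT_ONE computes per coefficient
; (mirroring x264's scalar quant; SIGN(x,s) here means "x with the sign of
; s", and the asm's paddusw saturates where the plain C add would not):
;
;   int abs_coef = abs( dct[i] );
;   dct[i] = SIGN( (uint32_t)(abs_coef + bias[i]) * mf[i] >> 16, dct[i] );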
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
    QUANT_ONE [r0+x], m6, m7

;-----------------------------------------------------------------------------
; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
    QUANT_ONE [r0+x], [r1+x], [r2+x]

%define PABSW  PABSW_MMX
%define PSIGNW PSIGNW_MMX
QUANT_DC x264_quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC x264_quant_4x4_dc_mmxext, 4
QUANT_AC x264_quant_4x4_mmx, 4
QUANT_AC x264_quant_8x8_mmx, 16

QUANT_DC x264_quant_4x4_dc_sse2, 2
QUANT_AC x264_quant_4x4_sse2, 2
QUANT_AC x264_quant_8x8_sse2, 8

%define PABSW  PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
QUANT_DC x264_quant_4x4_dc_ssse3, 2
QUANT_AC x264_quant_4x4_ssse3, 2
QUANT_AC x264_quant_8x8_ssse3, 8

QUANT_DC x264_quant_2x2_dc_ssse3, 1

;=============================================================================
; dequant
;=============================================================================

;;; %2,%3 dequant_mf[i_mf][y][x]

;;; %2,%3 dequant_mf[i_mf][y][x]

%macro DEQUANT_LOOP 3
    %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]

    %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
    %1 [r0     ], [r1      ], [r1+ 8*%3]

%macro DEQUANT16_FLAT 2-8

DECLARE_REG_TMP 4,3,2
DECLARE_REG_TMP 2,0,1

%macro DEQUANT_START 2
    shr  t0d, 8     ; i_qbits = i_qp / 6
    sub  t2d, t1d   ; i_mf = i_qp % 6
    add  r1, t2     ; dequant_mf[i_mf]
    add  r1, r1m    ; dequant_mf[i_mf]
    jl   .rshift32  ; negative qbits => rightshift

;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
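; A C sketch of the dequant math implemented below (x264's scalar form;
; i_qbits is i_qp/6 minus a size-dependent constant, and DEQUANT_START gets
; the division via multiply-shift, since (i_qp*43)>>8 == i_qp/6 for any
; 8-bit qp):
;
;   if( i_qbits >= 0 )
;       dct[i] = ( dct[i] * dequant_mf[i_mf][i] ) << i_qbits;
;   else
;       dct[i] = ( dct[i] * dequant_mf[i_mf][i] + (1 << (-i_qbits-1)) )
;                >> (-i_qbits);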
cglobal x264_dequant_%2x%2_%1, 0,3
    DEQUANT_START %3+2, %3

    DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4

    mova m6, [pd_1 GLOBAL]
    DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
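; The flat16 variant below handles the common flat-matrix case with 16-bit
; multiplies against the pre-expanded dequant%2_scale tables instead of the
; 32-bit path above; qps the tables don't cover (negative i_qbits beyond
; their pre-shifted range, presumably) jump back to the generic routine.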
cglobal x264_dequant_%2x%2_flat16_%1, 0,3
    jl x264_dequant_%2x%2_%1

    shr  t0d, 8     ; i_qbits = i_qp / 6
    sub  t2d, t1d   ; i_mf = i_qp % 6

    lea  r1, [dequant%2_scale GLOBAL]
    lea  r1, [dequant%2_scale + t2 GLOBAL]

    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24

    DEQUANT16_FLAT [r1], 0, 16

    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104

    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96

DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2

cglobal x264_dequant_4x4dc_%1, 0,3
    mova m0, [r0+mmsize*0+x]
    mova m1, [r0+mmsize*1+x]
    mova [r0+mmsize*0+x], m0
    mova [r0+mmsize*1+x], m1
    mova m6, [pw_1 GLOBAL]

;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
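; A C sketch of the loop below (x264's scalar denoise; the asm runs it over
; the DC coefficient too, then restores DC from a backup):
;
;   for( int i = 0; i < size; i++ )
;   {
;       int level = abs( dct[i] );
;       sum[i] += level;
;       level = level > offset[i] ? level - offset[i] : 0;  // psubusw
;       dct[i] = dct[i] < 0 ? -level : level;               // PSIGNW
;   }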
cglobal x264_denoise_dct_%1, 4,5
    movzx r4d, word [r0] ; backup DC coefficient

    mova m2, [r0+r3*2+0*mmsize]
    mova m3, [r0+r3*2+1*mmsize]

    psubusw m0, [r2+r3*2+0*mmsize]
    psubusw m1, [r2+r3*2+1*mmsize]

    mova [r0+r3*2+0*mmsize], m0
    mova [r0+r3*2+1*mmsize], m1

    paddd m4, [r1+r3*4+0*mmsize]
    paddd m2, [r1+r3*4+1*mmsize]
    paddd m5, [r1+r3*4+2*mmsize]
    paddd m3, [r1+r3*4+3*mmsize]
    mova [r1+r3*4+0*mmsize], m4
    mova [r1+r3*4+1*mmsize], m2
    mova [r1+r3*4+2*mmsize], m5
    mova [r1+r3*4+3*mmsize], m3

    mov [r0], r4w ; restore DC coefficient

%define PABSW  PABSW_MMX
%define PSIGNW PSIGNW_MMX

%define PABSW  PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3

;-----------------------------------------------------------------------------
; int x264_decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
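; A C sketch of the score being computed (x264's scalar decimate): any
; coefficient with |coef| > 1 forces the maximum score 9; otherwise each
; nonzero coefficient adds a table-driven cost indexed by the run of zeros
; preceding it in scan order.
;
;   int score = 0, idx = i_max - 1;
;   while( idx >= 0 && dct[idx] == 0 )
;       idx--;
;   while( idx >= 0 )
;   {
;       if( (unsigned)(dct[idx--] + 1) > 2 )  // |coef| > 1
;           return 9;
;       int run = 0;
;       while( idx >= 0 && dct[idx] == 0 )
;           run++, idx--;
;       score += ds_table[run];  // x264_decimate_table4 / _table8
;   }
;   return score;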
%macro DECIMATE_MASK_SSE2 6
    ABS2_MMX xmm0, xmm1, xmm3, xmm4

%macro DECIMATE_MASK_MMX 6
    ABS2_MMX mm0, mm1, mm4, mm5
    ABS2_MMX mm2, mm3, mm4, mm5

cextern x264_decimate_table4
cextern x264_decimate_table8

; A LUT is faster than bsf on AMD processors, and no slower on Intel.
; This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
    lea r10, [x264_decimate_table4 GLOBAL]
    lea r11, [decimate_mask_table4 GLOBAL]
    %define mask_table r11

    %define table x264_decimate_table4
    %define mask_table decimate_mask_table4

    DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx

    movzx eax, byte [mask_table + rcx]

    add al, byte [table + rcx]
    add al, byte [mask_table + rdx]

%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, ssse3

cglobal x264_decimate_score64_%1, 1,4
    lea r10, [x264_decimate_table8 GLOBAL]
    %define table x264_decimate_table8
    mova m7, [pb_1 GLOBAL]
    DECIMATE_MASK r1d, eax, r0, m7, %1, null
    DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
    DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
    DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null

    add al, byte [table + rcx]

cglobal x264_decimate_score64_%1, 1,6
cglobal x264_decimate_score64_%1, 1,5
    mova m7, [pb_1 GLOBAL]
    DECIMATE_MASK r3, r2, r0, m7, %1, r5
    DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
    DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
    DECIMATE_MASK r1, r0, r0+96, m7, %1, r5

    jne .ret9 ; r0 is zero at this point, so we don't need to zero it

    add r0b, byte [x264_decimate_table8 + ecx]

    cmp r0, 6 ; score64's threshold is never higher than 6
    jge .ret9 ; this early termination is only useful on 32-bit because it can be done in the latency after shrd

%define DECIMATE_MASK DECIMATE_MASK_MMX

%define DECIMATE_MASK DECIMATE_MASK_SSE2

;-----------------------------------------------------------------------------
; int x264_coeff_last( int16_t *dct )
;-----------------------------------------------------------------------------
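; Scalar equivalent of coeff_last: the index of the last nonzero coefficient
; (-1 if all are zero), which the asm finds by packing the coefficients into
; a byte mask and scanning that instead:
;
;   int i_last = i_count - 1;
;   while( i_last >= 0 && dct[i_last] == 0 )
;       i_last--;
;   return i_last;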
%macro LAST_MASK_SSE2 2-3
    packsswb xmm0, [%2+16]

%macro LAST_MASK_MMX 3
    packsswb mm0, [%2+ 8]
    packsswb mm1, [%2+24]

cglobal x264_coeff_last4_%1, 1,1

cglobal x264_coeff_last4_%1, 0,3

%define LAST LAST_X86

%define LAST LAST_SSE4A
COEFF_LAST4 mmxext_lzcnt

cglobal x264_coeff_last15_%1, 1,3
    LAST_MASK r1d, r0-2, r2d

cglobal x264_coeff_last16_%1, 1,3
    LAST_MASK r1d, r0, r2d

cglobal x264_coeff_last64_%1, 1,5-mmsize/16
    LAST_MASK r1d, r0, r4d
    LAST_MASK r2d, r0+32, r4d
    LAST_MASK r2d, r0+64, r4d
    LAST_MASK r3d, r0+96, r4d

cglobal x264_coeff_last64_%1, 1,4
    LAST_MASK_SSE2 r1d, r0
    LAST_MASK_SSE2 r2d, r0+32
    LAST_MASK_SSE2 r3d, r0+64
    LAST_MASK_SSE2 r0d, r0+96

%define LAST LAST_X86
%define LAST_MASK LAST_MASK_MMX

%define LAST_MASK LAST_MASK_SSE2
%define LAST LAST_SSE4A
COEFF_LAST sse2_lzcnt

;-----------------------------------------------------------------------------
; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
;-----------------------------------------------------------------------------
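; A C sketch of the contract (x264's run-level extraction; assuming, per the
; stores below, that runlevel->last holds the index of the last nonzero
; coefficient, level[]/run[] are filled in reverse scan order, the return
; value is the count of nonzero coefficients, and at least one coefficient
; is nonzero):
;
;   int i_last = runlevel->last = coeff_last( dct );
;   int i_total = 0;
;   do {
;       runlevel->level[i_total] = dct[i_last];
;       int run = 0;
;       while( --i_last >= 0 && dct[i_last] == 0 )
;           run++;
;       runlevel->run[i_total++] = run;
;   } while( i_last >= 0 );
;   return i_total;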
%macro LAST_MASK4_MMX 2-3

%macro LZCOUNT_SSE4A 3

; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
DECLARE_REG_TMP 0,1,2,3,4,5,6
DECLARE_REG_TMP 6,3,2,1,4,5,0

%macro COEFF_LEVELRUN 2
cglobal x264_coeff_level_run%2_%1,0,7
    LAST_MASK t5d, t0-(%2&1)*2, t4d
    shl t5d, 32-((%2+1)&~1)
    LZCOUNT t3d, t5d, 0x1f
    LZCOUNT t3d, t5d, 0x1f
    mov [t1+t6*2+ 4], t2w

%define LZCOUNT LZCOUNT_X86
%define LAST_MASK LAST_MASK_MMX
COEFF_LEVELRUN mmxext, 15
COEFF_LEVELRUN mmxext, 16
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext, 4
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2, 16
%define LZCOUNT LZCOUNT_SSE4A
COEFF_LEVELRUN sse2_lzcnt, 15
COEFF_LEVELRUN sse2_lzcnt, 16
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext_lzcnt, 4