1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
; NOTE(review): this chunk is a sampled extract — lines (including the
; enclosing %macro directive for the rows below) are missing, and each line
; still carries a stray leading line number from the extraction.
; 16-byte constant of interleaved bytes 0,1 (8 repetitions).
32 pb_01: times 8 db 0, 1
; Rows of an 8x8 dequant scale matrix built from 6 distinct coefficients
; (%1..%6) — presumably the body of the DQM8 macro invoked below; the
; %macro line itself is not visible here. TODO confirm against full file.
35 dw %1, %2, %1, %2, %2, %3, %2, %3
38 dw %1, %4, %5, %4, %1, %4, %5, %4
39 dw %4, %2, %6, %2, %4, %2, %6, %2
40 dw %5, %6, %3, %6, %5, %6, %3, %6
41 ; last line not used, just padding for power-of-2 stride
; Six DQM8 expansions — looks like one dequant scale set per i_qp%6
; (0..5), matching H.264's 6-step quantizer period. NOTE(review): the
; first expansion(s) may be elided from this extract; verify count.
54 DQM8 20, 18, 32, 19, 25, 24
55 DQM8 22, 19, 35, 21, 28, 26
56 DQM8 26, 23, 42, 24, 33, 31
57 DQM8 28, 25, 45, 26, 35, 33
58 DQM8 32, 28, 51, 30, 40, 38
59 DQM8 36, 32, 58, 34, 46, 43
; Byte LUT data; the label line is not visible in this extract, but the
; decimate code below references `decimate_mask_table4` (see the
; mask_table %defines), so presumably this is that table — a per-mask
; decimate score lookup. TODO confirm label against the full file.
62 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
63 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
64 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
65 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
66 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
67 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
68 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
69 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
70 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Macro headers for the quant DC setup (MMX and SSSE3 flavours) — bodies
; are elided in this extract.
74 %macro QUANT_DC_START_MMX 0
88 %macro QUANT_DC_START_SSSE3 0
; SSSE3 psignw wrapper (2 args) — body elided.
113 %macro PSIGNW_SSSE3 2
; Quant inner step (inside an elided macro, presumably QUANT_ONE):
; q = sign(dct) * ((abs(dct) + bias) * mf >> 16).
; NOTE(review): m0 is presumably loaded with abs(dct) by an elided PABSW
; between the mova and the paddusw — confirm against the full file.
118 ;;; %1 (m64) dct[y][x]
119 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
120 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
121 mova m1, %1 ; load dct coeffs
123 paddusw m0, %3 ; round
124 pmulhuw m0, %2 ; divide
125 PSIGNW m0, m1 ; restore sign
; QUANT_END variants (set the return value from the quant result) —
; bodies mostly elided. The cmp below presumably tests whether all
; coefficients quantized to zero (all-ones compare mask).
156 %macro QUANT_END_MMX 0
167 cmp ecx, (1<<mmsize)-1
179 %macro QUANT_END_SSE4 0
185 ;-----------------------------------------------------------------------------
186 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
187 ;-----------------------------------------------------------------------------
188 %macro QUANT_DC 2-3 0
192 QUANT_ONE [r0], m6, m7, 0
196 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
204 ;-----------------------------------------------------------------------------
205 ; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
206 ;-----------------------------------------------------------------------------
211 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
; Instantiate the quant functions per SIMD level by re-%defining the
; helper macros, then expanding QUANT_DC/QUANT_AC. INIT_MMX/INIT_XMM
; lines appear to be elided from this extract.
219 %define QUANT_END QUANT_END_MMX
220 %define PABSW PABSW_MMX
221 %define PSIGNW PSIGNW_MMX
222 %define QUANT_DC_START QUANT_DC_START_MMX
223 QUANT_DC x264_quant_2x2_dc_mmxext, 1
224 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
225 QUANT_DC x264_quant_4x4_dc_mmxext, 4
226 QUANT_AC x264_quant_4x4_mmx, 4
227 QUANT_AC x264_quant_8x8_mmx, 16
231 QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
232 QUANT_AC x264_quant_4x4_sse2, 2
233 QUANT_AC x264_quant_8x8_sse2, 8
235 %define PABSW PABSW_SSSE3
236 %define PSIGNW PSIGNW_SSSE3
237 QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
238 QUANT_AC x264_quant_4x4_ssse3, 2
239 QUANT_AC x264_quant_8x8_ssse3, 8
242 QUANT_DC x264_quant_2x2_dc_ssse3, 1
243 %define QUANT_END QUANT_END_SSE4
244 ; Not faster on Conroe, so only used in SSE4 versions
245 %define QUANT_DC_START QUANT_DC_START_SSSE3
247 QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
248 QUANT_AC x264_quant_4x4_sse4, 2
249 QUANT_AC x264_quant_8x8_sse4, 8
253 ;=============================================================================
255 ;=============================================================================
259 ;;; %2,%3 dequant_mf[i_mf][y][x]
271 ;;; %2,%3 dequant_mf[i_mf][y][x]
290 %macro DEQUANT_LOOP 3
294 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
295 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
300 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
301 %1 [r0 ], [r1 ], [r1+ 8*%3]
306 %macro DEQUANT16_FLAT 2-5
324 DECLARE_REG_TMP 6,3,2
326 DECLARE_REG_TMP 4,3,2
328 DECLARE_REG_TMP 2,0,1
331 %macro DEQUANT_START 2
334 shr t0d, 8 ; i_qbits = i_qp / 6
337 sub t2d, t1d ; i_mf = i_qp % 6
340 add r1, t2 ; dequant_mf[i_mf]
342 add r1, r1mp ; dequant_mf[i_mf]
346 jl .rshift32 ; negative qbits => rightshift
349 ;-----------------------------------------------------------------------------
350 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
351 ;-----------------------------------------------------------------------------
353 cglobal x264_dequant_%2x%2_%1, 0,3
355 DEQUANT_START %3+2, %3
359 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
368 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
370 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
374 jl x264_dequant_%2x%2_%1.skip_prologue
378 shr t0d, 8 ; i_qbits = i_qp / 6
381 sub t2d, t1d ; i_mf = i_qp % 6
384 lea r1, [dequant%2_scale]
387 lea r1, [dequant%2_scale + t2]
393 DEQUANT16_FLAT [r1], 0, 16
394 DEQUANT16_FLAT [r1+8], 8, 24
396 DEQUANT16_FLAT [r1], 0, 16
399 DEQUANT16_FLAT [r1], 0, 8, 64, 72
400 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
401 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
402 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
404 DEQUANT16_FLAT [r1], 0, 64
405 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
406 DEQUANT16_FLAT [r1+32], 32, 96
417 DEQUANT sse2, 4, 4, 2
418 DEQUANT sse2, 8, 6, 2
; DC-only dequant (inside an elided macro): load two registers' worth,
; scale (elided pmullw/pslld lines), store back in place.
421 cglobal x264_dequant_4x4dc_%1, 0,3
436 mova m0, [r0+mmsize*0+x]
437 mova m1, [r0+mmsize*1+x]
440 mova [r0+mmsize*0+x], m0
441 mova [r0+mmsize*1+x], m1
482 ;-----------------------------------------------------------------------------
483 ; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
484 ;-----------------------------------------------------------------------------
485 %macro DENOISE_DCT 1-2 0
486 cglobal x264_denoise_dct_%1, 4,5,%2
487 movzx r4d, word [r0] ; backup DC coefficient
491 mova m2, [r0+r3*2+0*mmsize]
492 mova m3, [r0+r3*2+1*mmsize]
497 psubusw m0, [r2+r3*2+0*mmsize]
498 psubusw m1, [r2+r3*2+1*mmsize]
501 mova [r0+r3*2+0*mmsize], m0
502 mova [r0+r3*2+1*mmsize], m1
509 paddd m4, [r1+r3*4+0*mmsize]
510 paddd m2, [r1+r3*4+1*mmsize]
511 paddd m5, [r1+r3*4+2*mmsize]
512 paddd m3, [r1+r3*4+3*mmsize]
513 mova [r1+r3*4+0*mmsize], m4
514 mova [r1+r3*4+1*mmsize], m2
515 mova [r1+r3*4+2*mmsize], m5
516 mova [r1+r3*4+3*mmsize], m3
518 mov [r0], r4w ; restore DC coefficient
522 %define PABSW PABSW_MMX
523 %define PSIGNW PSIGNW_MMX
530 %define PABSW PABSW_SSSE3
531 %define PSIGNW PSIGNW_SSSE3
536 ;-----------------------------------------------------------------------------
537 ; int x264_decimate_score( int16_t *dct )
538 ;-----------------------------------------------------------------------------
540 %macro DECIMATE_MASK_SSE2 6
547 ABS2_MMX xmm0, xmm1, xmm3, xmm4
557 %macro DECIMATE_MASK_MMX 6
562 ABS2_MMX mm0, mm1, mm4, mm5
563 ABS2_MMX mm2, mm3, mm4, mm5
582 cextern x264_decimate_table4
583 cextern x264_decimate_table8
587 ;A LUT is faster than bsf on AMD processors, and no slower on Intel
588 ;This is not true for score64.
589 cglobal x264_decimate_score%1_%2, 1,3
591 lea r10, [x264_decimate_table4]
592 lea r11, [decimate_mask_table4]
594 %define mask_table r11
596 %define table x264_decimate_table4
597 %define mask_table decimate_mask_table4
599 DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
608 movzx eax, byte [mask_table + rcx]
617 add al, byte [table + rcx]
618 add al, byte [mask_table + rdx]
628 %define DECIMATE_MASK DECIMATE_MASK_MMX
629 DECIMATE4x4 15, mmxext
630 DECIMATE4x4 16, mmxext
632 %define DECIMATE_MASK DECIMATE_MASK_SSE2
634 DECIMATE4x4 15, ssse3
636 DECIMATE4x4 16, ssse3
; score64: same idea over an 8x8 block — four 32-byte DECIMATE_MASK
; passes combined into one 64-bit mask, then run-length cost summation
; from x264_decimate_table8. x86_64 variant first:
641 cglobal x264_decimate_score64_%1, 1,4
643 lea r10, [x264_decimate_table8]
646 %define table x264_decimate_table8
649 DECIMATE_MASK r1d, eax, r0, m5, %1, null
652 DECIMATE_MASK r2d, eax, r0+32, m5, %1, null
655 DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null
659 DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null
669 add al, byte [table + rcx]
; x86_32 variant (two register-count flavours; the %if separating them
; is elided from this extract):
680 cglobal x264_decimate_score64_%1, 1,6
682 cglobal x264_decimate_score64_%1, 1,5
685 DECIMATE_MASK r3, r2, r0, m7, %1, r5
688 DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
691 DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
693 DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
701 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
708 add r0b, byte [x264_decimate_table8 + ecx]
711 cmp r0, 6 ;score64's threshold is never higher than 6
712 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
; Instantiations (the expansions themselves are elided):
740 %define DECIMATE_MASK DECIMATE_MASK_MMX
744 %define DECIMATE_MASK DECIMATE_MASK_SSE2
748 ;-----------------------------------------------------------------------------
749 ; int x264_coeff_last( int16_t *dct )
750 ;-----------------------------------------------------------------------------
752 %macro LAST_MASK_SSE2 2-3
754 packsswb xmm0, [%2+16]
759 %macro LAST_MASK_MMX 3
762 packsswb mm0, [%2+ 8]
763 packsswb mm1, [%2+24]
783 cglobal x264_coeff_last4_%1, 1,1
788 cglobal x264_coeff_last4_%1, 0,3
802 %define LAST LAST_X86
804 %define LAST LAST_SSE4A
805 COEFF_LAST4 mmxext_lzcnt
808 cglobal x264_coeff_last15_%1, 1,3
810 LAST_MASK r1d, r0-2, r2d
816 cglobal x264_coeff_last16_%1, 1,3
818 LAST_MASK r1d, r0, r2d
824 cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
826 LAST_MASK r2d, r0+64, r4d
827 LAST_MASK r3d, r0+96, r4d
832 LAST_MASK r1d, r0, r4d
833 LAST_MASK r3d, r0+32, r4d
844 cglobal x264_coeff_last64_%1, 1,4
846 LAST_MASK_SSE2 r1d, r0
847 LAST_MASK_SSE2 r2d, r0+32
848 LAST_MASK_SSE2 r3d, r0+64
849 LAST_MASK_SSE2 r0d, r0+96
862 %define LAST LAST_X86
865 %define LAST_MASK LAST_MASK_MMX
869 %define LAST_MASK LAST_MASK_SSE2
871 %define LAST LAST_SSE4A
872 COEFF_LAST sse2_lzcnt
874 ;-----------------------------------------------------------------------------
875 ; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
876 ;-----------------------------------------------------------------------------
878 %macro LAST_MASK4_MMX 2-3
890 %macro LZCOUNT_SSE4A 3
894 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
896 DECLARE_REG_TMP 3,1,2,0,4,5,6
898 DECLARE_REG_TMP 0,1,2,3,4,5,6
900 DECLARE_REG_TMP 6,3,2,1,4,5,0
903 %macro COEFF_LEVELRUN 2
904 cglobal x264_coeff_level_run%2_%1,0,7
908 LAST_MASK t5d, t0-(%2&1)*2, t4d
910 shl t5d, 32-((%2+1)&~1)
912 LZCOUNT t3d, t5d, 0x1f
919 LZCOUNT t3d, t5d, 0x1f
922 mov [t1+t6*2+ 4], t2w
932 %define LZCOUNT LZCOUNT_X86
934 %define LAST_MASK LAST_MASK_MMX
935 COEFF_LEVELRUN mmxext, 15
936 COEFF_LEVELRUN mmxext, 16
938 %define LAST_MASK LAST_MASK4_MMX
939 COEFF_LEVELRUN mmxext, 4
941 %define LAST_MASK LAST_MASK_SSE2
942 COEFF_LEVELRUN sse2, 15
943 COEFF_LEVELRUN sse2, 16
944 %define LZCOUNT LZCOUNT_SSE4A
945 COEFF_LEVELRUN sse2_lzcnt, 15
946 COEFF_LEVELRUN sse2_lzcnt, 16
948 %define LAST_MASK LAST_MASK4_MMX
949 COEFF_LEVELRUN mmxext_lzcnt, 4