1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
25 %include "x86util.asm"
; NOTE(review): body fragment of a dequant-scale row macro — its %macro line
; (and the name/parameter count) lies outside this excerpt; confirm upstream.
; Each dw line expands six word parameters into one 8-entry uint16 row of a
; dequant scale matrix; the repeating %1/%2/... pattern mirrors the symmetry
; of the H.264 scale tables.
33 dw %1, %2, %1, %2, %2, %3, %2, %3
36 dw %1, %4, %5, %4, %1, %4, %5, %4
37 dw %4, %2, %6, %2, %4, %2, %6, %2
38 dw %5, %6, %3, %6, %5, %6, %3, %6
39 ; last line not used, just padding for power-of-2 stride
; Six DQM8 invocations — presumably one row set per i_qp%6 value (0..5) of the
; 8x8 dequant scale table; the DQM8 macro definition is outside this excerpt.
; The scalar values (20,18,32,...) match the ascending per-QP scaling pattern;
; TODO confirm against the H.264 spec's 8x8 levelScale constants.
52 DQM8 20, 18, 32, 19, 25, 24
53 DQM8 22, 19, 35, 21, 28, 26
54 DQM8 26, 23, 42, 24, 33, 31
55 DQM8 28, 25, 45, 26, 35, 33
56 DQM8 32, 28, 51, 30, 40, 38
57 DQM8 36, 32, 58, 34, 46, 43
; Byte LUT data. Its label line is outside this excerpt; the code below refers
; to a symbol `decimate_mask_table4` (see the %define near the decimate_score
; routines), so this appears to be that table — TODO confirm the label.
; Values look like precomputed per-bitmask decimate costs (each entry bounded
; by ~24), indexed by a coefficient mask byte; verify against
; x264_decimate_table4 semantics before relying on this description.
60 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
61 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
62 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
63 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
64 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
65 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
66 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
67 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
68 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Macro headers and a body fragment; the bodies of QUANT_DC_START and
; PSIGNW_SSSE3, plus the %macro line of the QUANT_ONE-style macro below,
; are outside this excerpt.
72 %macro QUANT_DC_START 0
103 %macro PSIGNW_SSSE3 2
108 ;;; %1 (m64) dct[y][x]
109 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
110 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
; Quantize one vector of coefficients: |dct| is (presumably) taken between
; the load and the add — an intervening line (original line 112, likely a
; PABSW producing m0 from m1) is missing from this excerpt; verify upstream.
; quant = ((|dct| + bias) * mf) >> 16, with the original sign restored last.
111 mova m1, %1 ; load dct coeffs
113 paddusw m0, %3 ; round
114 pmulhuw m0, %2 ; divide
115 PSIGNW m0, m1 ; restore sign
119 ;-----------------------------------------------------------------------------
120 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
121 ;-----------------------------------------------------------------------------
; DC quant loop body fragment: a single mf (m6) and bias (m7) — presumably
; broadcast by QUANT_DC_START, whose body is outside this excerpt — are
; applied to each vector of dct coefficients at [r0+x].
127 QUANT_ONE [r0+x], m6, m7
133 ;-----------------------------------------------------------------------------
134 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
135 ;-----------------------------------------------------------------------------
; AC quant loop body fragment: unlike the DC variant, mf and bias are full
; per-coefficient tables, so each QUANT_ONE reads matching offsets from
; dct (r0), mf (r1) and bias (r2).
140 QUANT_ONE [r0+x], [r1+x], [r2+x]
; Instantiate the quant templates once per ISA. PABSW/PSIGNW are re-%defined
; before each batch so the same macro body emits MMX-emulated or native SSSE3
; abs/sign instructions.
147 %define PABSW PABSW_MMX
148 %define PSIGNW PSIGNW_MMX
149 QUANT_DC x264_quant_2x2_dc_mmxext, 1
; The MMX 4x4/8x8 variants are only built on 32-bit: on x86-64, SSE2 is
; always available and faster, per the comment below.
150 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
151 QUANT_DC x264_quant_4x4_dc_mmxext, 4
152 QUANT_AC x264_quant_4x4_mmx, 4
153 QUANT_AC x264_quant_8x8_mmx, 16
; (the matching %endif and the SSE2 %define block are outside this excerpt)
157 QUANT_DC x264_quant_4x4_dc_sse2, 2
158 QUANT_AC x264_quant_4x4_sse2, 2
159 QUANT_AC x264_quant_8x8_sse2, 8
161 %define PABSW PABSW_SSSE3
162 %define PSIGNW PSIGNW_SSSE3
163 QUANT_DC x264_quant_4x4_dc_ssse3, 2
164 QUANT_AC x264_quant_4x4_ssse3, 2
165 QUANT_AC x264_quant_8x8_ssse3, 8
168 QUANT_DC x264_quant_2x2_dc_ssse3, 1
172 ;=============================================================================
; Dequant section. (The section title line between these separators is
; outside this excerpt.)
174 ;=============================================================================
178 ;;; %2,%3 dequant_mf[i_mf][y][x]
190 ;;; %2,%3 dequant_mf[i_mf][y][x]
; DEQUANT_LOOP: %1 = per-vector dequant macro (e.g. DEQUANT16_L/DEQUANT32_R,
; defined outside this excerpt), %2 appears to control loop iteration count,
; %3 scales the per-iteration offsets. dct entries are int16 (r0, stride t0)
; while mf entries are int32 (r1, stride t0*2) — hence the doubled offsets.
; Only fragments of the loop body are visible here; the looping/branch lines
; are missing from this excerpt.
209 %macro DEQUANT_LOOP 3
213 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
214 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
219 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
220 %1 [r0 ], [r1 ], [r1+ 8*%3]
; DEQUANT16_FLAT: variadic (2-8 args) flat-scale dequant helper; its body is
; outside this excerpt.
225 %macro DEQUANT16_FLAT 2-8
; DEQUANT_START: common qp decomposition. The missing lines around these
; fragments presumably load i_qp and a div/mod-by-6 lookup (the `shr t0d, 8`
; extracting i_qp/6 suggests a packed table entry) — verify upstream.
; Afterwards r1 points at dequant_mf[i_mf], and a sign test on i_qbits picks
; the left-shift vs right-shift code path.
258 %macro DEQUANT_START 2
261 shr t0d, 8 ; i_qbits = i_qp / 6
264 sub t2d, t1d ; i_mf = i_qp % 6
267 add r1, t2 ; dequant_mf[i_mf]
269 add r1, r1m ; dequant_mf[i_mf]
273 jl .rshift32 ; negative qbits => rightshift
276 ;-----------------------------------------------------------------------------
277 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
278 ;-----------------------------------------------------------------------------
; DEQUANT template (fragments): emits x264_dequant_%2x%2_<isa>. Two paths:
; the left-shift path (DEQUANT16_L) for i_qbits >= 0, and the rounding
; right-shift path (DEQUANT32_R, seeded with pd_1 for round-to-nearest) for
; negative i_qbits. Most of the macro body is outside this excerpt.
280 cglobal x264_dequant_%2x%2_%1, 0,3
281 DEQUANT_START %3+2, %3
285 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
290 mova m6, [pd_1 GLOBAL]
294 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
; flat16 variant: when the dequant table is "flat" it can use a small
; precomputed scale table; otherwise it tail-jumps to the generic version.
296 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
300 jl x264_dequant_%2x%2_%1
; Same i_qp decomposition as DEQUANT_START (intervening lines missing here).
304 shr t0d, 8 ; i_qbits = i_qp / 6
307 sub t2d, t1d ; i_mf = i_qp % 6
310 lea r1, [dequant%2_scale GLOBAL]
313 lea r1, [dequant%2_scale + t2 GLOBAL]
; Per-block-size unrolled DEQUANT16_FLAT schedules; the %if/%else structure
; selecting among them is outside this excerpt.
319 DEQUANT16_FLAT [r1], 0, 16
320 DEQUANT16_FLAT [r1+8], 8, 24
322 DEQUANT16_FLAT [r1], 0, 16
325 DEQUANT16_FLAT [r1], 0, 8, 64, 72
326 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
327 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
328 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
330 DEQUANT16_FLAT [r1], 0, 64
331 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
332 DEQUANT16_FLAT [r1+32], 32, 96
; SSE2 instantiations (mmx/ssse3 instantiation lines are outside this view).
343 DEQUANT sse2, 4, 4, 2
344 DEQUANT sse2, 8, 6, 2
; DC-only dequant (fragments). The loop processes two registers' worth of
; coefficients per iteration at [r0 + 2*mmsize*k + x]; the multiply/shift
; in between (and the pw_1 usage context) are missing from this excerpt.
347 cglobal x264_dequant_4x4dc_%1, 0,3
362 mova m0, [r0+mmsize*0+x]
363 mova m1, [r0+mmsize*1+x]
366 mova [r0+mmsize*0+x], m0
367 mova [r0+mmsize*1+x], m1
; pw_1 presumably seeds round-to-nearest for the right-shift path, matching
; the pd_1 usage in DEQUANT above — TODO confirm against full source.
375 mova m6, [pw_1 GLOBAL]
408 ;-----------------------------------------------------------------------------
409 ; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
410 ;-----------------------------------------------------------------------------
; Denoise: for each coefficient, |dct| is accumulated into sum[] (widened to
; 32-bit, hence the *4 strides on r1), offset[] is subtracted with unsigned
; saturation (psubusw clamps at 0), and the sign is restored. The abs/sign
; instructions between these fragments are missing from this excerpt.
412 cglobal x264_denoise_dct_%1, 4,5
; DC must not be denoised, so it is saved before the loop and rewritten after.
413 movzx r4d, word [r0] ; backup DC coefficient
417 mova m2, [r0+r3*2+0*mmsize]
418 mova m3, [r0+r3*2+1*mmsize]
423 psubusw m0, [r2+r3*2+0*mmsize]
424 psubusw m1, [r2+r3*2+1*mmsize]
427 mova [r0+r3*2+0*mmsize], m0
428 mova [r0+r3*2+1*mmsize], m1
; Accumulate the absolute values into the uint32 sum[] array (four vectors
; because each int16 vector unpacks into two int32 vectors).
435 paddd m4, [r1+r3*4+0*mmsize]
436 paddd m2, [r1+r3*4+1*mmsize]
437 paddd m5, [r1+r3*4+2*mmsize]
438 paddd m3, [r1+r3*4+3*mmsize]
439 mova [r1+r3*4+0*mmsize], m4
440 mova [r1+r3*4+1*mmsize], m2
441 mova [r1+r3*4+2*mmsize], m5
442 mova [r1+r3*4+3*mmsize], m3
444 mov [r0], r4w ; restore DC coefficient
462 ;-----------------------------------------------------------------------------
463 ; int x264_decimate_score( int16_t *dct )
464 ;-----------------------------------------------------------------------------
; DECIMATE_MASK variants build two bitmasks from a 4x4 dct block: which
; coefficients are nonzero, and which have |coef| > 1 (via the pb_1 compare).
; Only fragments of each variant are visible here.
466 %macro DECIMATE_MASK_SSE2 6
473 ABS2_MMX xmm0, xmm1, xmm3, xmm4
483 %macro DECIMATE_MASK_MMX 6
488 ABS2_MMX mm0, mm1, mm4, mm5
489 ABS2_MMX mm2, mm3, mm4, mm5
; Per-run-length score tables, defined in C.
508 cextern x264_decimate_table4
509 cextern x264_decimate_table8
513 ;A LUT is faster than bsf on AMD processors, and no slower on Intel
514 ;This is not true for score64.
; DECIMATE4x4 template: emits x264_decimate_score{15,16}_<isa>. On x86-64,
; table addresses are materialized in r10/r11; on 32-bit they are used as
; absolute symbols via the %defines below.
515 cglobal x264_decimate_score%1_%2, 1,3
517 lea r10, [x264_decimate_table4 GLOBAL]
518 lea r11, [decimate_mask_table4 GLOBAL]
520 %define mask_table r11
522 %define table x264_decimate_table4
523 %define mask_table decimate_mask_table4
525 DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
; Score accumulation: a whole-mask LUT fast path, then per-run byte adds.
; The branch/shift lines between these fragments are missing here.
534 movzx eax, byte [mask_table + rcx]
543 add al, byte [table + rcx]
544 add al, byte [mask_table + rdx]
; Instantiations per ISA (the sse2 instantiation lines are outside this view).
554 %define DECIMATE_MASK DECIMATE_MASK_MMX
555 DECIMATE4x4 15, mmxext
556 DECIMATE4x4 16, mmxext
558 %define DECIMATE_MASK DECIMATE_MASK_SSE2
560 DECIMATE4x4 15, ssse3
562 DECIMATE4x4 16, ssse3
; score64 (8x8, 64 coefficients): four 16-coefficient DECIMATE_MASK calls
; (r0, r0+32, r0+64, r0+96) are combined into one 64-bit mask. Two layouts
; are visible: a 64-bit version using a single r1/r2 register pair, and a
; 32-bit version (5/6 registers) that pairs two 32-bit masks. Most of the
; combining/branch logic is missing from this excerpt.
567 cglobal x264_decimate_score64_%1, 1,4
569 lea r10, [x264_decimate_table8 GLOBAL]
572 %define table x264_decimate_table8
574 mova m7, [pb_1 GLOBAL]
575 DECIMATE_MASK r1d, eax, r0, m7, %1, null
578 DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
581 DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
585 DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
595 add al, byte [table + rcx]
; 32-bit variants; per the comment at the 4x4 version, score64 uses bsf-style
; scanning rather than the mask LUT here.
606 cglobal x264_decimate_score64_%1, 1,6
608 cglobal x264_decimate_score64_%1, 1,5
610 mova m7, [pb_1 GLOBAL]
611 DECIMATE_MASK r3, r2, r0, m7, %1, r5
614 DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
617 DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
619 DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
627 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
634 add r0b, byte [x264_decimate_table8 + ecx]
637 cmp r0, 6 ;score64's threshold is never higher than 6
638 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd