1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
25 %include "x86util.asm"
; NOTE(review): rows of the dequant scale tables emitted by the DQM4/DQM8
; macros; the %macro header lines are outside this view. Each dw emits
; 8 words built from the macro's scale parameters.
33 dw %1, %2, %1, %2, %2, %3, %2, %3
36 dw %1, %4, %5, %4, %1, %4, %5, %4
37 dw %4, %2, %6, %2, %4, %2, %6, %2
38 dw %5, %6, %3, %6, %5, %6, %3, %6
39 ; last line not used, just padding for power-of-2 stride
; Default 8x8 dequant scale tables: one DQM8 row per value of i_qp%6
; (indexed via dequant%2_scale in the flat16 routines below) — presumably
; in increasing order 0..5; the table label line is outside this view.
52 DQM8 20, 18, 32, 19, 25, 24
53 DQM8 22, 19, 35, 21, 28, 26
54 DQM8 26, 23, 42, 24, 33, 31
55 DQM8 28, 25, 45, 26, 35, 33
56 DQM8 32, 28, 51, 30, 40, 38
57 DQM8 36, 32, 58, 34, 46, 43
; Byte LUT consumed by the decimate-score code (referenced below as
; decimate_mask_table4); the label line itself is outside this view.
60 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
61 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
62 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
63 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
64 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
65 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
66 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
67 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
68 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; NOTE(review): only the %macro headers are visible here; both bodies are
; outside this view (QUANT_DC_START presumably broadcasts the scalar
; mf/bias arguments; PSIGNW_SSSE3 presumably wraps psignw — confirm).
72 %macro QUANT_DC_START 0
103 %macro PSIGNW_SSSE3 2
; Core of the QUANT_ONE helper: quantize one vector of coefficients as
; ((|dct| + bias) * mf) >> 16, then re-apply the original sign.
108 ;;; %1 (m64) dct[y][x]
109 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
110 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
111 mova m1, %1 ; load dct coeffs
; NOTE(review): m0 must already hold |dct| here; the abs (PABSW) that
; produces it from m1 is on a line not visible in this view — confirm.
113 paddusw m0, %3 ; round
114 pmulhuw m0, %2 ; divide
115 PSIGNW m0, m1 ; restore sign
119 ;-----------------------------------------------------------------------------
120 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
121 ;-----------------------------------------------------------------------------
127 QUANT_ONE [r0+x], m6, m7
133 ;-----------------------------------------------------------------------------
134 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
135 ;-----------------------------------------------------------------------------
140 QUANT_ONE [r0+x], [r1+x], [r2+x]
; Instantiate the quant entry points per instruction set. The numeric
; argument is the unroll count, which halves when the vector width
; doubles (e.g. 4x4: 4 mmx iterations vs 2 sse2/ssse3 iterations).
; NOTE(review): the %endif / INIT_* lines between these groups are
; outside this view.
147 %define PABSW PABSW_MMX
148 %define PSIGNW PSIGNW_MMX
149 QUANT_DC x264_quant_2x2_dc_mmxext, 1
150 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
151 QUANT_DC x264_quant_4x4_dc_mmxext, 4
152 QUANT_AC x264_quant_4x4_mmx, 4
153 QUANT_AC x264_quant_8x8_mmx, 16
157 QUANT_DC x264_quant_4x4_dc_sse2, 2
158 QUANT_AC x264_quant_4x4_sse2, 2
159 QUANT_AC x264_quant_8x8_sse2, 8
161 %define PABSW PABSW_SSSE3
162 %define PSIGNW PSIGNW_SSSE3
163 QUANT_DC x264_quant_4x4_dc_ssse3, 2
164 QUANT_AC x264_quant_4x4_ssse3, 2
165 QUANT_AC x264_quant_8x8_ssse3, 8
168 QUANT_DC x264_quant_2x2_dc_ssse3, 1
172 ;=============================================================================
174 ;=============================================================================
178 ;;; %2,%3 dequant_mf[i_mf][y][x]
190 ;;; %2,%3 dequant_mf[i_mf][y][x]
209 %macro DEQUANT_LOOP 3
; %1 = per-vector dequant op (DEQUANT16_L or DEQUANT32_R at the call
; sites below), %2 = vector-pair count, %3 = size scale; t0 is the byte
; index into dct (r0) and dequant_mf (r1, 2x wider elements).
; NOTE(review): loop-control / %if lines are outside this view — the
; first pair is the looped form, the second the fully unrolled form.
213 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
214 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
219 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
220 %1 [r0 ], [r1 ], [r1+ 8*%3]
; Flat-table dequant helper used by the flat16 entry points below;
; body outside this view.
225 %macro DEQUANT16_FLAT 2-8
258 ;-----------------------------------------------------------------------------
259 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
260 ;-----------------------------------------------------------------------------
262 cglobal x264_dequant_%2x%2_%1, 0,3
265 shr t0d, 8 ; i_qbits = i_qp / 6
268 sub t2d, t1d ; i_mf = i_qp % 6
271 add r1, t2 ; dequant_mf[i_mf]
273 add r1, r1m ; dequant_mf[i_mf]
277 jl .rshift32 ; negative qbits => rightshift
281 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
286 mova m6, [pd_1 GLOBAL]
290 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
; flat16 variant: dequantizes via the 16-bit dequant%2_scale table
; instead of the per-mf 32-bit tables; presumably for QPs where 16-bit
; intermediates would not suffice it branches to the generic routine.
292 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
296 jl x264_dequant_%2x%2_%1 ; fall back to generic path (compare is outside this view)
300 shr t0d, 8 ; i_qbits = i_qp / 6
303 sub t2d, t1d ; i_mf = i_qp % 6
306 lea r1, [dequant%2_scale GLOBAL]
309 lea r1, [dequant%2_scale + t2 GLOBAL]
; DEQUANT16_FLAT args: scale-row pointer, then dct byte offsets that
; share that row (grouping depends on mmsize / block size).
315 DEQUANT16_FLAT [r1], 0, 16
316 DEQUANT16_FLAT [r1+8], 8, 24
318 DEQUANT16_FLAT [r1], 0, 16
321 DEQUANT16_FLAT [r1], 0, 8, 64, 72
322 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
323 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
324 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
326 DEQUANT16_FLAT [r1], 0, 64
327 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
328 DEQUANT16_FLAT [r1+32], 32, 96
; Instantiate the SSE2 dequant entry points (4x4 and 8x8); the trailing
; arguments parameterize the DEQUANT macro (block size etc. — the macro
; header is outside this view).
339 DEQUANT sse2, 4, 4, 2
340 DEQUANT sse2, 8, 6, 2
344 ;-----------------------------------------------------------------------------
345 ; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
346 ;-----------------------------------------------------------------------------
348 cglobal x264_denoise_dct_%1, 4,5
349 movzx r4d, word [r0] ; backup DC coefficient
353 mova m2, [r0+r3*2+0*mmsize]
354 mova m3, [r0+r3*2+1*mmsize]
359 psubusw m0, [r2+r3*2+0*mmsize]
360 psubusw m1, [r2+r3*2+1*mmsize]
363 mova [r0+r3*2+0*mmsize], m0
364 mova [r0+r3*2+1*mmsize], m1
371 paddd m4, [r1+r3*4+0*mmsize]
372 paddd m2, [r1+r3*4+1*mmsize]
373 paddd m5, [r1+r3*4+2*mmsize]
374 paddd m3, [r1+r3*4+3*mmsize]
375 mova [r1+r3*4+0*mmsize], m4
376 mova [r1+r3*4+1*mmsize], m2
377 mova [r1+r3*4+2*mmsize], m5
378 mova [r1+r3*4+3*mmsize], m3
380 mov [r0], r4w ; restore DC coefficient
; Select abs/sign helper implementations for the denoise_dct
; instantiations (the invocation lines are outside this view).
384 %define PABSW PABSW_MMX
385 %define PSIGNW PSIGNW_MMX
392 %define PABSW PABSW_SSSE3
393 %define PSIGNW PSIGNW_SSSE3
398 ;-----------------------------------------------------------------------------
399 ; int x264_decimate_score( int16_t *dct )
400 ;-----------------------------------------------------------------------------
402 %macro DECIMATE_MASK_SSE2 6
409 ABS2_MMX xmm0, xmm1, xmm3, xmm4
419 %macro DECIMATE_MASK_MMX 6
424 ABS2_MMX mm0, mm1, mm4, mm5
425 ABS2_MMX mm2, mm3, mm4, mm5
444 cextern x264_decimate_table4
445 cextern x264_decimate_table8
449 ; A LUT is faster than bsf on AMD processors, and no slower on Intel.
450 ; This is not true for score64.
; DECIMATE4x4 fragment: %1 presumably selects the 15/16-coefficient
; variant (see the instantiations below), %2 = cpu suffix.
451 cglobal x264_decimate_score%1_%2, 1,3
; table pointers kept in r10/r11 vs absolute names — presumably the two
; sides of an ARCH_X86_64 conditional not visible in this view.
453 lea r10, [x264_decimate_table4 GLOBAL]
454 lea r11, [decimate_mask_table4 GLOBAL]
456 %define mask_table r11
458 %define table x264_decimate_table4
459 %define mask_table decimate_mask_table4
461 DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
; accumulate per-run costs from the byte LUTs
470 movzx eax, byte [mask_table + rcx]
479 add al, byte [table + rcx]
480 add al, byte [mask_table + rdx]
; Instantiate the 4x4 decimate-score entry points (15- and
; 16-coefficient variants); the sse2 instantiations between the mmx and
; ssse3 groups are outside this view.
490 %define DECIMATE_MASK DECIMATE_MASK_MMX
491 DECIMATE4x4 15, mmxext
492 DECIMATE4x4 16, mmxext
494 %define DECIMATE_MASK DECIMATE_MASK_SSE2
496 DECIMATE4x4 15, ssse3
498 DECIMATE4x4 16, ssse3
; decimate_score64 (8x8 blocks): builds a 64-bit coefficient mask in
; four 16-coefficient (32-byte) chunks, then walks it accumulating costs
; from x264_decimate_table8.
; -- x86_64 variant --
503 cglobal x264_decimate_score64_%1, 1,4
505 lea r10, [x264_decimate_table8 GLOBAL]
508 %define table x264_decimate_table8
510 mova m7, [pb_1 GLOBAL]
511 DECIMATE_MASK r1d, eax, r0, m7, %1, null
514 DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
517 DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
521 DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
531 add al, byte [table + rcx]
; -- x86_32 variants: the two cglobal lines below presumably sit on
;    opposite sides of a preprocessor conditional outside this view --
542 cglobal x264_decimate_score64_%1, 1,6
544 cglobal x264_decimate_score64_%1, 1,5
546 mova m7, [pb_1 GLOBAL]
547 DECIMATE_MASK r3, r2, r0, m7, %1, r5
550 DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
553 DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
555 DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
563 jne .ret9 ; r0 is zero at this point, so we don't need to zero it
570 add r0b, byte [x264_decimate_table8 + ecx]
573 cmp r0, 6 ; score64's threshold is never higher than 6
574 jge .ret9 ; this early termination is only useful on 32-bit because it can be done in the latency after shrd
; Mask-implementation selection for the score64 instantiations (the
; invocation lines between/after these %defines are outside this view).
602 %define DECIMATE_MASK DECIMATE_MASK_MMX
606 %define DECIMATE_MASK DECIMATE_MASK_SSE2