1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
; One row of eight 16-bit words built from three macro parameters.
; NOTE(review): the enclosing %macro header is not adjacent to this line -- confirm which macro this row belongs to.
31 dw %1, %2, %1, %2, %2, %3, %2, %3
; Three rows of eight 16-bit words built from six macro parameters.
; Presumably the body of DQM8, which is invoked with six arguments below -- TODO confirm.
34 dw %1, %4, %5, %4, %1, %4, %5, %4
35 dw %4, %2, %6, %2, %4, %2, %6, %2
36 dw %5, %6, %3, %6, %5, %6, %3, %6
37 ; last line not used, just padding for power-of-2 stride
; Six DQM8 expansions, each taking six scale constants.
; Presumably one expansion per i_mf value (i_qp % 6) -- TODO confirm against the table label.
50 DQM8 20, 18, 32, 19, 25, 24
51 DQM8 22, 19, 35, 21, 28, 26
52 DQM8 26, 23, 42, 24, 33, 31
53 DQM8 28, 25, 45, 26, 35, 33
54 DQM8 32, 28, 51, 30, 40, 38
55 DQM8 36, 32, 58, 34, 46, 43
; Setup macro for the DC quantizers; takes no macro arguments.
; NOTE(review): presumably prepares the m6/m7 operands that the DC call site passes to QUANT_ONE -- confirm against the macro body.
59 %macro QUANT_DC_START 0
; Quantize one vector register of coefficients taken from %1.
95 ;;; %1 (m64) dct[y][x]
96 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
97 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
98 mova m1, %1 ; load dct coeffs
; NOTE(review): m0 is expected to hold the coefficient magnitudes at this point;
; the instruction that fills m0 is not among these lines -- TODO confirm.
100 paddusw m0, %3 ; round: add the bias with unsigned saturation
101 pmulhuw m0, %2 ; divide: keep the high 16 bits of the unsigned product with mf
102 PSIGNW m0, m1 ; restore sign, using the original coefficients kept in m1
106 ;-----------------------------------------------------------------------------
107 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
108 ;-----------------------------------------------------------------------------
; DC form: mf and bias are scalars in the prototype, so every coefficient at
; byte offset x in r0 is quantized with the same operands, supplied in m6 and m7.
114 QUANT_ONE [r0+x], m6, m7
120 ;-----------------------------------------------------------------------------
121 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
122 ;-----------------------------------------------------------------------------
; AC form: mf and bias are read from r1 and r2 at the same byte offset x as the
; coefficients in r0; all three arrays hold 16-bit entries per the prototype.
127 QUANT_ONE [r0+x], [r1+x], [r2+x]
; Bind the PABSW/PSIGNW helper names to their _MMX variants for the instantiations below.
134 %define PABSW PABSW_MMX
135 %define PSIGNW PSIGNW_MMX
; QUANT_DC / QUANT_AC take the function name and a count.
; NOTE(review): the count looks like the number of vector registers of coefficients
; to process (4x4 = 32 bytes = 4 MMX or 2 XMM registers) -- TODO confirm against the macro bodies.
136 QUANT_DC x264_quant_2x2_dc_mmxext, 1
137 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
138 QUANT_DC x264_quant_4x4_dc_mmxext, 4
139 QUANT_AC x264_quant_4x4_mmx, 4
140 QUANT_AC x264_quant_8x8_mmx, 16
; SSE2 instantiations: the counts are half of the MMX ones above.
144 QUANT_DC x264_quant_4x4_dc_sse2, 2
145 QUANT_AC x264_quant_4x4_sse2, 2
146 QUANT_AC x264_quant_8x8_sse2, 8
; Rebind the helper names to their _SSSE3 variants for the SSSE3 instantiations.
148 %define PABSW PABSW_SSSE3
149 %define PSIGNW PSIGNW_SSSE3
150 QUANT_DC x264_quant_4x4_dc_ssse3, 2
151 QUANT_AC x264_quant_4x4_ssse3, 2
152 QUANT_AC x264_quant_8x8_ssse3, 8
; NOTE(review): count of 1 matches the MMX 2x2 DC instantiation above; presumably
; assembled with MMX-sized registers -- TODO confirm.
155 QUANT_DC x264_quant_2x2_dc_ssse3, 1
159 ;=============================================================================
161 ;=============================================================================
165 ;;; %2,%3 dequant_mf[i_mf][y][x]
177 ;;; %2,%3 dequant_mf[i_mf][y][x]
; DEQUANT_LOOP: invoke the dequant macro named by %1 over a block.
;   %1  macro to invoke, called with (dct address, mf address, second mf address)
;   %2  not referenced on the lines shown here -- TODO confirm its role
;   %3  multiplier applied to the byte displacements below
196 %macro DEQUANT_LOOP 3
; Form indexed by t0: the dct operand is offset by t0 and the mf operands by t0*2,
; consistent with int16_t dct entries and int dequant_mf entries in the prototype comment.
200 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
201 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
; Form without the t0 index, using the same displacements.
206 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
207 %1 [r0 ], [r1 ], [r1+ 8*%3]
; Accepts between 2 and 8 macro arguments.
212 %macro DEQUANT16_FLAT 2-8
245 ;-----------------------------------------------------------------------------
246 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
247 ;-----------------------------------------------------------------------------
; %1 is the instruction-set suffix and %2 the block size, as used in the symbol name.
249 cglobal x264_dequant_%2x%2_%1, 0,3
252 shr t0d, 8 ; i_qbits = i_qp / 6
255 sub t2d, t1d ; i_mf = i_qp % 6
; Two alternative ways of forming the dequant_mf[i_mf] pointer in r1.
; NOTE(review): presumably selected by a build-time conditional -- TODO confirm.
258 add r1, t2 ; dequant_mf[i_mf]
260 add r1, r1m ; dequant_mf[i_mf]
264 jl .rshift32 ; negative qbits => rightshift
; NOTE(review): presumably the non-negative i_qbits path, since the branch above
; diverts the negative case -- TODO confirm.
268 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
; Load the pd_1 constant into m6 ahead of the DEQUANT32_R loop.
; NOTE(review): presumably dword ones used for rounding the right shift -- TODO confirm.
274 mova m6, [pd_1 GLOBAL]
278 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
; flat16 variant: takes its scales from the dequant%2_scale table rather than from r1's dequant_mf.
280 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
; Jump to the general routine above when the preceding comparison is "less".
; NOTE(review): the comparison itself is not among these lines -- TODO confirm the threshold.
284 jl x264_dequant_%2x%2_%1
288 shr t0d, 8 ; i_qbits = i_qp / 6
291 sub t2d, t1d ; i_mf = i_qp % 6
; Two alternative ways of pointing r1 at dequant%2_scale, the second folding in t2.
; NOTE(review): presumably selected by a build-time conditional -- TODO confirm.
294 lea r1, [dequant%2_scale GLOBAL]
298 lea r1, [t2 + dequant%2_scale GLOBAL]
; DEQUANT16_FLAT arguments: a scale operand, then numeric arguments.
; NOTE(review): the numeric arguments are presumably dct byte offsets sharing that scale -- TODO confirm.
304 DEQUANT16_FLAT [r1], 0, 16
305 DEQUANT16_FLAT [r1+8], 8, 24
307 DEQUANT16_FLAT [r1], 0, 16
; The numeric arguments of the next four lines enumerate 0..120 in steps of 8.
310 DEQUANT16_FLAT [r1], 0, 8, 64, 72
311 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
312 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
313 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
; The numeric arguments of the next three lines enumerate 0..112 in steps of 16.
315 DEQUANT16_FLAT [r1], 0, 64
316 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
317 DEQUANT16_FLAT [r1+32], 32, 96
; Instantiate the sse2 dequant routines for block sizes 4 and 8.
; NOTE(review): the third argument's role is not visible on these lines; the fourth
; is forwarded to DEQUANT_LOOP as its multiplier.
328 DEQUANT sse2, 4, 4, 2
329 DEQUANT sse2, 8, 6, 2
333 ;-----------------------------------------------------------------------------
334 ; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
335 ;-----------------------------------------------------------------------------
; NOTE(review): r0..r3 presumably map to dct, sum, offset, size in prototype order -- TODO confirm.
337 cglobal x264_denoise_dct_core_%1, 4,5
338 movzx r4d, word [r0] ; backup DC coefficient
; Load two registers of coefficients; r3 is scaled by 2 for the 16-bit dct entries.
342 mova m2, [r0+r3*2+0*regsize]
343 mova m3, [r0+r3*2+1*regsize]
; Subtract the matching offset entries with unsigned saturation (result clamps at zero).
; NOTE(review): m0/m1 presumably hold coefficient magnitudes here -- TODO confirm.
348 psubusw m0, [r2+r3*2+0*regsize]
349 psubusw m1, [r2+r3*2+1*regsize]
; Write m0/m1 back over the coefficients.
352 mova [r0+r3*2+0*regsize], m0
353 mova [r0+r3*2+1*regsize], m1
; Add four registers of dwords into the sum array (r3 scaled by 4 for 32-bit entries)
; and store the updated totals.
360 paddd m4, [r1+r3*4+0*regsize]
361 paddd m2, [r1+r3*4+1*regsize]
362 paddd m5, [r1+r3*4+2*regsize]
363 paddd m3, [r1+r3*4+3*regsize]
364 mova [r1+r3*4+0*regsize], m4
365 mova [r1+r3*4+1*regsize], m2
366 mova [r1+r3*4+2*regsize], m5
367 mova [r1+r3*4+3*regsize], m3
369 mov [r0], r4w ; restore DC coefficient
; Bind the PABSW/PSIGNW helper names to their _MMX variants.
373 %define PABSW PABSW_MMX
374 %define PSIGNW PSIGNW_MMX
; Rebind the helper names to their _SSSE3 variants.
381 %define PABSW PABSW_SSSE3
382 %define PSIGNW PSIGNW_SSSE3