1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
; Emits one row of eight 16-bit words from three macro parameters, in the
; order %1,%2,%1,%2, %2,%3,%2,%3.
; NOTE(review): the %macro line that names this body is outside this excerpt.
31 dw %1, %2, %1, %2, %2, %3, %2, %3
; Emits three rows of eight 16-bit words from six macro parameters; every
; row is a four-word pattern written twice.
; NOTE(review): presumably the body of DQM8 (which is invoked with six
; arguments elsewhere in this file) -- the %macro line is not visible here.
34 dw %1, %4, %5, %4, %1, %4, %5, %4
35 dw %4, %2, %6, %2, %4, %2, %6, %2
36 dw %5, %6, %3, %6, %5, %6, %3, %6
37 ; last line not used, just padding for power-of-2 stride
; NOTE(review): the padding row that the comment above describes is not part
; of this excerpt.
; Six DQM8 expansions, each fed six constants.
; NOTE(review): presumably one entry per i_mf (= i_qp % 6) of the
; dequant8_scale table that the flat16 dequant code addresses; the table's
; label is not visible in this excerpt -- confirm.
50 DQM8 20, 18, 32, 19, 25, 24
51 DQM8 22, 19, 35, 21, 28, 26
52 DQM8 26, 23, 42, 24, 33, 31
53 DQM8 28, 25, 45, 26, 35, 33
54 DQM8 32, 28, 51, 30, 40, 38
55 DQM8 36, 32, 58, 34, 46, 43
; QUANT_DC_START takes no parameters; its body is not visible in this excerpt.
; NOTE(review): the DC quant entry points pass mm6/mm7 as the mf/bias
; operands, so this macro presumably sets those registers up -- confirm.
59 %macro QUANT_DC_START 0
; Per-register quantization step.
; NOTE(review): presumably the body of QUANT_MMX -- its %macro/%endmacro
; lines and several instructions are missing from this excerpt.
; Visible flow: load the coefficients into m0, derive a per-word mask in m1,
; add the bias with unsigned saturation, keep the high 16 bits of the
; unsigned product with mf, then invert the bits of m0 wherever m1 is set.
; Writes m0 and m1.
74 ;;; %1 (m64) dct[y][x]
75 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
76 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
78 mova m0, %1 ; load dct coeffs
; pcmpgtw sets each m1 word to all-ones where m1 > m0 (signed).
; NOTE(review): this is only a sign mask if m1 is zero beforehand; the line
; that zeroes m1 is not shown.
80 pcmpgtw m1, m0 ; sign(coeff)
; Subtracting the all-ones mask adds 1 to the masked lanes.
; NOTE(review): looks like the second half of a two's-complement negate;
; the matching pxor is not visible here.
82 psubw m0, m1 ; abs(coeff)
83 paddusw m0, %3 ; round
; pmulhuw keeps the upper 16 bits of the unsigned 16x16 product.
84 pmulhuw m0, %2 ; divide
85 pxor m0, m1 ; restore sign
; NOTE(review): the remaining sign fix-up and the store back to %1 are not
; visible in this excerpt.
; SSSE3 flavour of the per-register quantization step.
; NOTE(review): presumably the body of QUANT_SSSE3; the macro header, the
; instruction that puts |coeff| into m0, and the final store are not in this
; excerpt.
; m1 keeps the original coefficients so their signs can be reapplied later.
91 mova m1, %1 ; load dct coeffs
93 paddusw m0, %3 ; round
; pmulhuw keeps the upper 16 bits of the unsigned 16x16 product.
94 pmulhuw m0, %2 ; divide
; psignw negates each m0 word whose m1 counterpart is negative and zeroes it
; where the m1 word is zero.
95 psignw m0, m1 ; restore sign
101 ;-----------------------------------------------------------------------------
102 ; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
103 ;-----------------------------------------------------------------------------
; dct[4] is four int16_t (8 bytes), the size of one MMX register, so one
; QUANT_MMX step on [r0] is issued for the whole block.
; mm6 and mm7 are handed to the step as its mf (%2) and bias (%3) operands.
; NOTE(review): the lines that load mm6/mm7 and the return are not visible
; in this excerpt; `1,1` are presumably cglobal's argument/register counts.
104 cglobal x264_quant_2x2_dc_mmxext, 1,1
106 QUANT_MMX [r0], mm6, mm7
; SSSE3 counterpart of the 2x2 DC quantizer: same operands as the mmxext
; version, but the step macro is QUANT_SSSE3. mf and bias are still taken
; from the MMX registers mm6/mm7.
; NOTE(review): the setup and return lines are not visible in this excerpt.
109 cglobal x264_quant_2x2_dc_ssse3, 1,1
111 QUANT_SSSE3 [r0], mm6, mm7
114 ;-----------------------------------------------------------------------------
115 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
116 ;-----------------------------------------------------------------------------
128 ;-----------------------------------------------------------------------------
129 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
130 ;-----------------------------------------------------------------------------
; %2 is the per-register step macro chosen by the instantiation (QUANT_MMX
; or QUANT_SSSE3). It is applied to three memory operands at the same byte
; offset x; per the prototype above all three arrays hold 16-bit elements,
; so one offset addresses matching entries.
; NOTE(review): presumably r0/r1/r2 are dct/mf/bias and this line sits in
; QUANT_AC's unrolled loop; the %macro, cglobal and the lines that advance x
; are not shown.
135 %2 [r0+x], [r1+x], [r2+x]
141 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
; MMX instantiations: function name, per-register step macro, two numbers.
; NOTE(review): 4*8 = 32 and 16*8 = 128 bytes match 16 and 64 int16_t
; coefficients, so the numbers look like (step count, bytes per step) --
; confirm against the QUANT_DC/QUANT_AC definitions, which are not shown.
; The closing %endif is also outside this excerpt.
142 QUANT_DC x264_quant_4x4_dc_mmxext, QUANT_MMX, 4, 8
143 QUANT_AC x264_quant_4x4_mmx, QUANT_MMX, 4, 8
144 QUANT_AC x264_quant_8x8_mmx, QUANT_MMX, 16, 8
; SSE2 instantiations reuse QUANT_MMX as the step macro; the SSSE3 ones use
; QUANT_SSSE3.
; NOTE(review): 2*16 = 32 and 8*16 = 128 bytes again match 16 and 64 int16_t
; coefficients, i.e. the numbers look like (step count, bytes per step) for
; 16-byte registers -- confirm. The directive that maps m0/m1 onto xmm
; registers is not visible in this excerpt.
149 QUANT_DC x264_quant_4x4_dc_sse2, QUANT_MMX, 2, 16
150 QUANT_AC x264_quant_4x4_sse2, QUANT_MMX, 2, 16
151 QUANT_AC x264_quant_8x8_sse2, QUANT_MMX, 8, 16
153 QUANT_DC x264_quant_4x4_dc_ssse3, QUANT_SSSE3, 2, 16
154 QUANT_AC x264_quant_4x4_ssse3, QUANT_SSSE3, 2, 16
155 QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
159 ;=============================================================================
161 ;=============================================================================
165 ;;; %2,%3 dequant_mf[i_mf][y][x]
177 ;;; %2,%3 dequant_mf[i_mf][y][x]
; DEQUANT_LOOP step_macro, count, scale
;   %1  per-step dequant macro (DEQUANT16_L or DEQUANT32_R at the call sites)
;   %2  count expression (callers pass %2*%2/4); its use is not visible here
;   %3  multiplier applied to the constant byte offsets below
; Each %1 invocation receives one address based on r0 and two based on r1.
; The r1 offsets are twice the r0 offsets.
; NOTE(review): presumably r0 = dct (int16_t) and r1 = dequant_mf (int),
; which would explain the 2x ratio -- confirm.
196 %macro DEQUANT_LOOP 3
; Form indexed by t0 (t0 scaled by 2 on the r1 side).
; NOTE(review): the loop control and the conditional that selects between
; this form and the un-indexed one below are not visible in this excerpt.
200 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
201 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
; Straight-line form with the same constant offsets and no t0 index.
206 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
207 %1 [r0 ], [r1 ], [r1+ 8*%3]
; DEQUANT16_FLAT accepts between 2 and 8 parameters. At the visible call
; sites the first is a memory operand based on r1 and the rest are numeric
; offsets.
; NOTE(review): the macro body is not part of this excerpt.
212 %macro DEQUANT16_FLAT 2-8
245 ;-----------------------------------------------------------------------------
246 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
247 ;-----------------------------------------------------------------------------
; Generic dequant entry point. The symbol name is assembled from the
; enclosing macro's parameters: %2 is the block dimension and %1 the
; instruction-set suffix (e.g. `DEQUANT sse2, 4, 4, 2`).
; NOTE(review): the enclosing %macro line, the multiply feeding the shift,
; the compare feeding the jl, and all %if/%else arms are missing from this
; excerpt; only the surviving lines are annotated.
249 cglobal x264_dequant_%2x%2_%1, 0,3
252 shr t0d, 8 ; i_qbits = i_qp / 6
255 sub t2d, t1d ; i_mf = i_qp % 6
; The next two lines both form the dequant_mf[i_mf] pointer in r1.
; NOTE(review): presumably the two arms of an architecture conditional
; (register argument vs. stack argument r1m) -- confirm.
258 add r1, t2 ; dequant_mf[i_mf]
260 add r1, r1m ; dequant_mf[i_mf]
264 jl .rshift32 ; negative qbits => rightshift
; Left-shift path: DEQUANT16_L step, %2*%2/4 as the count, %4 as the
; offset scale.
268 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
; Right-shift path: m6 is loaded from the pd_1 constant before the loop.
; NOTE(review): presumably the rounding term for DEQUANT32_R -- confirm.
274 mova m6, [pd_1 GLOBAL]
278 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
; flat16 dequant entry point: takes its scale factors from the built-in
; dequant%2_scale table rather than adding to the caller's dequant_mf
; pointer as the generic version does.
; NOTE(review): the compare feeding the jl, the multiply feeding the shift,
; and all %if/%else arms are missing from this excerpt.
280 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
; Jumps straight into the generic dequant function when the (unseen)
; preceding compare is signed-less-than.
284 jl x264_dequant_%2x%2_%1
288 shr t0d, 8 ; i_qbits = i_qp / 6
291 sub t2d, t1d ; i_mf = i_qp % 6
; Two ways of pointing r1 at the scale table: table base only, or table base
; plus t2.
; NOTE(review): presumably alternative arms of a PIC / non-PIC conditional.
294 lea r1, [dequant%2_scale GLOBAL]
298 lea r1, [t2 + dequant%2_scale GLOBAL]
; The DEQUANT16_FLAT groups below pass a scale-table operand followed by
; numeric offsets.
; NOTE(review): the offsets are presumably byte offsets into dct, and the
; groups are presumably mutually exclusive expansions picked by block size
; and register width in conditionals not shown -- confirm.
304 DEQUANT16_FLAT [r1], 0, 16
305 DEQUANT16_FLAT [r1+8], 8, 24
307 DEQUANT16_FLAT [r1], 0, 16
310 DEQUANT16_FLAT [r1], 0, 8, 64, 72
311 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
312 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
313 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
315 DEQUANT16_FLAT [r1], 0, 64
316 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
317 DEQUANT16_FLAT [r1+32], 32, 96
; SSE2 instantiations of the DEQUANT macro for 4x4 and 8x8 blocks.
; Arguments: instruction-set suffix (%1), block dimension (%2), a third
; value whose use is not visible in this excerpt, and the offset scale (%4)
; that is forwarded to DEQUANT_LOOP.
328 DEQUANT sse2, 4, 4, 2
329 DEQUANT sse2, 8, 6, 2