1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
; Row template taking three parameters (%1..%3): emits eight 16-bit words in
; the order %1,%2,%1,%2,%2,%3,%2,%3.
; NOTE(review): the enclosing %macro line is not visible in this excerpt.
31 dw %1, %2, %1, %2, %2, %3, %2, %3
; Row templates taking six parameters (%1..%6): three rows of eight 16-bit
; words each.
; NOTE(review): presumably the body of DQM8, which is invoked with six
; arguments elsewhere in this file -- the %macro line is not visible; confirm.
34 dw %1, %4, %5, %4, %1, %4, %5, %4
35 dw %4, %2, %6, %2, %4, %2, %6, %2
36 dw %5, %6, %3, %6, %5, %6, %3, %6
37 ; last line not used, just padding for power-of-2 stride
; Six DQM8 expansions, each supplying six constants.
; NOTE(review): presumably one table entry per i_mf value (i_qp % 6) of the
; table addressed as dequant%2_scale by the flat16 dequant code -- the label
; itself is not visible in this excerpt; confirm.
50 DQM8 20, 18, 32, 19, 25, 24
51 DQM8 22, 19, 35, 21, 28, 26
52 DQM8 26, 23, 42, 24, 33, 31
53 DQM8 28, 25, 45, 26, 35, 33
54 DQM8 32, 28, 51, 30, 40, 38
55 DQM8 36, 32, 58, 34, 46, 43
; Zero-argument setup macro. Its name is passed as the second argument of
; QUANT_DC when instantiating x264_quant_4x4_dc_mmxext.
; NOTE(review): the body is not visible in this excerpt. The DC row call hands
; %4m6 / %4m7 to the row macro as mf / bias, so this presumably fills mm6 and
; mm7 from the scalar mf and bias arguments -- confirm.
59 %macro MMX_QUANT_DC_START 0
; Zero-argument setup macro, passed as the second argument of QUANT_DC for the
; sse2 and ssse3 4x4 DC quantizers.
; NOTE(review): body not visible here; presumably the xmm6/xmm7 counterpart of
; MMX_QUANT_DC_START -- confirm.
66 %macro SSE2_QUANT_DC_START 0
; QUANT_ONE: quantize one register of packed 16-bit coefficients in place.
; Scratch registers: %2m0 and %2m1.
; NOTE(review): the %macro line is not visible in this excerpt; the parameter
; roles below are read off the body and the QUANT_ONE invocations
; (QUANT_ONE q, m, ... / QUANT_ONE dqa, xm, ...).
76 ;;; %1 mov suffix for the load/store (q or dqa)
77 ;;; %2 register-name prefix (m or xm)
78 ;;; %3 (mem) dct[y][x], read and written back
;;; %4 (mem/reg) mf[y][x] or mf[0][0] (as uint16_t)
;;; %5 (mem/reg) bias[y][x] or bias[0][0] (as uint16_t)
80 mov%1 %2m0, %3 ; load dct coeffs
82 pcmpgtw %2m1, %2m0 ; sign(coeff)
84 psubw %2m0, %2m1 ; abs(coeff)
; paddusw is an unsigned saturating add, so adding the bias cannot wrap.
85 paddusw %2m0, %5 ; round
; pmulhuw keeps the high 16 bits of the unsigned 16x16 product with mf.
86 pmulhuw %2m0, %4 ; divide
87 pxor %2m0, %2m1 ; restore sign
89 mov%1 %3, %2m0 ; store
; MMX_QUANT_1x4 dct, mf, bias
; Runs QUANT_ONE with movq and mm registers: one 64-bit load/store, i.e. four
; 16-bit coefficients per expansion.
91 %macro MMX_QUANT_1x4 3
92 QUANT_ONE q, m, %1, %2, %3
; SSE2_QUANT_1x8 dct, mf, bias
; Same operation with movdqa and xmm registers: eight 16-bit coefficients per
; expansion. movdqa requires its memory operand to be 16-byte aligned.
94 %macro SSE2_QUANT_1x8 3
95 QUANT_ONE dqa, xm, %1, %2, %3
; SSSE3_QUANT_1x8 dct, mf, bias
;   %1 (mem) eight 16-bit coefficients, loaded and stored with movdqa
;   %2 multiplier operand (mf), %3 rounding operand (bias)
; Clobbers xmm0 and xmm1. The original coefficients stay in xmm1 so that
; psignw can re-apply their sign to the quantized magnitudes in xmm0.
; NOTE(review): the instruction that derives xmm0 from xmm1 is not visible in
; this excerpt; presumably an absolute-value step -- confirm.
98 %macro SSSE3_QUANT_1x8 3
99 movdqa xmm1, %1 ; load dct coeffs
101 paddusw xmm0, %3 ; round
102 pmulhuw xmm0, %2 ; divide
103 psignw xmm0, xmm1 ; restore sign
104 movdqa %1, xmm0 ; store
107 ;-----------------------------------------------------------------------------
108 ; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
109 ;-----------------------------------------------------------------------------
; Quantizes the four coefficients at [r0] with a single MMX_QUANT_1x4
; expansion, taking mf from mm6 and bias from mm7.
; NOTE(review): the step that loads mm6/mm7 and the function's return are not
; visible in this excerpt.
110 cglobal x264_quant_2x2_dc_mmxext, 1,1
112 MMX_QUANT_1x4 [r0], mm6, mm7
115 ;-----------------------------------------------------------------------------
116 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
117 ;-----------------------------------------------------------------------------
; QUANT_DC fragment. As invoked in this file the arguments are:
;   %1 function name, %2 setup macro, %3 per-row quant macro,
;   %4 register-name prefix (m / xm), %5 and %6 two integers.
; The visible line expands the row macro on dct at [r0+x] with mf and bias
; taken from registers 6 and 7 of the selected register file.
; NOTE(review): the %macro line and the code producing `x` are not visible;
; %5 and %6 look like a row count and a byte stride (4*8 = 2*16 = 32 bytes =
; int16_t dct[16]) -- confirm.
123 %3 [r0+x], %4m6, %4m7
129 ;-----------------------------------------------------------------------------
130 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
131 ;-----------------------------------------------------------------------------
; QUANT_AC fragment. As invoked in this file the arguments are:
;   %1 function name, %2 per-row quant macro, %3 and %4 two integers.
; The visible line expands the row macro with dct, mf and bias all read at the
; same byte offset x from r0, r1 and r2 respectively.
; NOTE(review): the %macro line and the code producing `x` are not visible;
; %3 and %4 look like a row count and a byte stride -- confirm.
136 %2 [r0+x], [r1+x], [r2+x]
; Instantiations of the quantizers for each instruction set.
; In every line the last two integers multiply to 32 for the 4x4 functions and
; to 128 for the 8x8 functions, i.e. the byte size of 16 and 64 int16_t
; coefficients.
; NOTE(review): the %endif matching the %ifndef below is not visible in this
; excerpt; presumably it closes after the three MMX lines -- confirm.
142 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
143 QUANT_DC x264_quant_4x4_dc_mmxext, MMX_QUANT_DC_START, MMX_QUANT_1x4, m, 4, 8
144 QUANT_AC x264_quant_4x4_mmx, MMX_QUANT_1x4, 4, 8
145 QUANT_AC x264_quant_8x8_mmx, MMX_QUANT_1x4, 16, 8
; SSE2 variants: xmm registers, eight coefficients per row macro expansion.
148 QUANT_DC x264_quant_4x4_dc_sse2, SSE2_QUANT_DC_START, SSE2_QUANT_1x8, xm, 2, 16
149 QUANT_AC x264_quant_4x4_sse2, SSE2_QUANT_1x8, 2, 16
150 QUANT_AC x264_quant_8x8_sse2, SSE2_QUANT_1x8, 8, 16
; SSSE3 variants: same setup macro as SSE2, but rows use SSSE3_QUANT_1x8.
153 QUANT_DC x264_quant_4x4_dc_ssse3, SSE2_QUANT_DC_START, SSSE3_QUANT_1x8, xm, 2, 16
154 QUANT_AC x264_quant_4x4_ssse3, SSSE3_QUANT_1x8, 2, 16
155 QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
160 ;=============================================================================
162 ;=============================================================================
166 ;;; %2,%3 dequant_mf[i_mf][y][x]
178 ;;; %2,%3 dequant_mf[i_mf][y][x]
; DEQUANT_LOOP row_macro, count, step
;   %1 macro expanded with three memory operands:
;      (dct, dequant_mf, dequant_mf + 8*%3)
;   %3 scales every constant byte offset
; Two shapes are visible: one indexed by t0 and one at fixed offsets from
; r0/r1. The dequant_mf offsets are twice the dct offsets, consistent with int
; dequant_mf entries versus int16_t dct entries.
; NOTE(review): the conditionals choosing between the two shapes, the loop
; control on t0 and the use of %2 are not visible in this excerpt.
197 %macro DEQUANT_LOOP 3
201 %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
202 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
207 %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
208 %1 [r0 ], [r1 ], [r1+ 8*%3]
; DEQUANT16_FLAT accepts between 2 and 8 parameters. In this file it is
; invoked with a memory operand based on r1 followed by 2 or 4 numeric
; offsets.
; NOTE(review): the body is not visible in this excerpt; the offsets are
; presumably byte offsets into dct -- confirm.
213 %macro DEQUANT16_FLAT 2-8
246 ;-----------------------------------------------------------------------------
247 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
248 ;-----------------------------------------------------------------------------
; Fragment of the DEQUANT template. %1 is the function-name suffix, %2 the
; block dimension used in the name and in the %2*%2/4 row count, and %4 is
; forwarded as the third argument of DEQUANT_LOOP.
; It emits two entry points: x264_dequant_%2x%2_%1 and
; x264_dequant_%2x%2_flat16_%1; the prototype above is the 4x4 mmx instance
; of the first.
; NOTE(review): the %macro line and many instructions are missing from this
; excerpt, so only the visible lines are annotated.
250 cglobal x264_dequant_%2x%2_%1, 0,3
253 shr t0d, 8 ; i_qbits = i_qp / 6
256 sub t2d, t1d ; i_mf = i_qp % 6
; Two ways of forming the dequant_mf[i_mf] pointer in r1 follow.
; NOTE(review): presumably alternatives under a conditional that is not
; visible here (register operand vs. the r1m argument slot) -- confirm.
259 add r1, t2 ; dequant_mf[i_mf]
261 add r1, r1m ; dequant_mf[i_mf]
265 jl .rshift32 ; negative qbits => rightshift
; Fall-through path (qbits not negative): rows handled by DEQUANT16_L.
269 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
; Right-shift path: m6 is loaded from pd_1, then rows are handled by
; DEQUANT32_R.
; NOTE(review): pd_1 is presumably the rounding term for the shift; its
; definition is not visible in this excerpt.
275 movq m6, [pd_1 GLOBAL]
279 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
; flat16 entry point. It jumps to the generic function above when the
; comparison preceding the jl (not visible here) yields "less".
281 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
285 jl x264_dequant_%2x%2_%1
289 shr t0d, 8 ; i_qbits = i_qp / 6
292 sub t2d, t1d ; i_mf = i_qp % 6
; Two ways of pointing r1 into the dequant%2_scale table follow.
; NOTE(review): presumably alternatives under a conditional that is not
; visible here -- confirm.
295 lea r1, [dequant%2_scale GLOBAL]
299 lea r1, [t2 + dequant%2_scale GLOBAL]
; DEQUANT16_FLAT expansions: the first operand addresses the scale table via
; r1, the remaining operands are numeric offsets.
; NOTE(review): four alternative groupings are visible (two lines, one line,
; four lines, three lines); they are presumably selected by block size and
; register width through conditionals not visible in this excerpt -- confirm.
305 DEQUANT16_FLAT [r1], 0, 16
306 DEQUANT16_FLAT [r1+8], 8, 24
308 DEQUANT16_FLAT [r1], 0, 16
311 DEQUANT16_FLAT [r1], 0, 8, 64, 72
312 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
313 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
314 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
316 DEQUANT16_FLAT [r1], 0, 64
317 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
318 DEQUANT16_FLAT [r1+32], 32, 96
; SSE2 instantiations of the DEQUANT template. Going by the name patterns
; x264_dequant_%2x%2_%1 and x264_dequant_%2x%2_flat16_%1, these produce the
; 4x4 and 8x8 sse2 functions plus their flat16 counterparts.
; NOTE(review): the use of the third argument (4 / 6) is not visible in this
; excerpt -- confirm against the full DEQUANT body.
329 DEQUANT sse2, 4, 4, 2
330 DEQUANT sse2, 8, 6, 2