1 ;*****************************************************************************
2 ;* quant-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* Authors: Alex Izvorski <aizvorksi@gmail.com>
7 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
24 ;*****************************************************************************
26 ;* Revision history: *
28 ;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
29 ;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
30 ;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
32 ;*****************************************************************************
36 %include "amd64inc.asm"
; Entry points exported by this file, grouped by core variant and instruction set.
; NOTE(review): cglobal is not defined in this file; presumably it is provided
; by amd64inc.asm -- confirm.
; core15, MMX
43 cglobal x264_quant_2x2_dc_core15_mmx
44 cglobal x264_quant_4x4_dc_core15_mmx
45 cglobal x264_quant_4x4_core15_mmx
46 cglobal x264_quant_8x8_core15_mmx
; core15, SSSE3
48 cglobal x264_quant_4x4_dc_core15_ssse3
49 cglobal x264_quant_4x4_core15_ssse3
50 cglobal x264_quant_8x8_core15_ssse3
; core16, MMXEXT
52 cglobal x264_quant_2x2_dc_core16_mmxext
53 cglobal x264_quant_4x4_dc_core16_mmxext
54 cglobal x264_quant_4x4_core16_mmxext
55 cglobal x264_quant_8x8_core16_mmxext
; core32, MMXEXT
57 cglobal x264_quant_2x2_dc_core32_mmxext
58 cglobal x264_quant_4x4_dc_core32_mmxext
59 cglobal x264_quant_4x4_core32_mmxext
60 cglobal x264_quant_8x8_core32_mmxext
; dequant, MMX
62 cglobal x264_dequant_4x4_mmx
63 cglobal x264_dequant_8x8_mmx
; MMX_QUANT_AC_START: register setup shared by the MMX/MMXEXT AC quant routines.
; Takes no macro arguments. Leaves i_qbits (third function argument) in the low
; dword of mm6 and f duplicated into both dwords of mm7.
; NOTE(review): punpckldq only duplicates the low dword of mm7, so f must already
; have been loaded into mm7 before that instruction -- confirm where that happens.
65 %macro MMX_QUANT_AC_START 0
; The two commented-out movs appear to record that dct and quant_mf are used
; straight from their argument registers, so no copy is needed.
66 ; mov rdi, rdi ; &dct[0][0]
67 ; mov rsi, rsi ; &quant_mf[0][0]
68 movd mm6, parm3d ; i_qbits
70 punpckldq mm7, mm7 ; f in each dword
; MMX_QUANT15_DC_START: register setup for the core15 MMX DC quant routines.
; Takes no macro arguments. Loads the scalar i_qmf (second function argument)
; into mm5 and i_qbits (third argument) into mm6, then replicates i_qmf across
; the words of mm5 and f across the dwords of mm7.
; NOTE(review): a single punpcklwd yields w0,w0,w1,w1; filling all four words
; with i_qmf needs a further unpack step, and mm7 must already hold f -- confirm
; both against the complete macro body.
73 %macro MMX_QUANT15_DC_START 0
74 ; mov rdi, rdi ; &dct[0][0]
75 movd mm5, parm2d ; i_qmf
76 movd mm6, parm3d ; i_qbits
79 punpcklwd mm5, mm5 ; i_qmf in each word
80 punpckldq mm7, mm7 ; f in each dword
; SSE2_QUANT_AC_START: register setup for the SSE-register AC quant routines.
; Takes no macro arguments. Loads i_qbits (third function argument) into the low
; dword of xmm6 and broadcasts dword 0 of xmm7 to all four dwords (pshufd imm 0).
; NOTE(review): xmm7 must already contain f in its low dword before the pshufd
; -- confirm where it is loaded.
83 %macro SSE2_QUANT_AC_START 0
84 movd xmm6, parm3d ; i_qbits
86 pshufd xmm7, xmm7, 0 ; f in each dword
; SSE2_QUANT15_DC_START: register setup for the core15 SSE-register DC quant routine.
; Takes no macro arguments. Loads the scalar i_qmf into xmm5 and i_qbits into
; xmm6, copies the low quadword of xmm5 into its high quadword, and broadcasts
; dword 0 of xmm7 (f) to all four dwords.
; NOTE(review): punpcklqdq only duplicates the low 64 bits, so i_qmf must already
; fill the four low words of xmm5 at that point, and xmm7 must already hold f
; -- confirm both against the complete macro body.
89 %macro SSE2_QUANT15_DC_START 0
90 movd xmm5, parm2d ; i_qmf
91 movd xmm6, parm3d ; i_qbits
94 punpcklqdq xmm5, xmm5 ; i_qmf in each word
95 pshufd xmm7, xmm7, 0 ; f in each dword
; MMX_QUANT15_1x4: quantize one row of four int16 coefficients with MMX.
; Works on the absolute values: the sign is split off into mm4, the rounding
; term %4 is added in 32-bit lanes, the result is packed back to int16 with
; signed saturation, and the sign is re-applied.
98 %macro MMX_QUANT15_1x4 4
99 ;;; %1 (m64) dct[y][x]
100 ;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
101 ;;; %3 (mmx) i_qbits in the low doubleword
102 ;;; %4 (mmx) f as doublewords
103 ;;; trashes mm0-mm2,mm4
104 movq mm0, %1 ; load dct coeffs
; pcmpgtw yields all-ones in each word where mm4 > coeff, so mm4 is expected to
; be zero on entry to this instruction (NOTE(review): confirm it is cleared first).
106 pcmpgtw mm4, mm0 ; sign(coeff)
108 psubw mm0, mm4 ; abs(coeff)
118 paddd mm0, %4 ; round with f
; mm0 supplies the two low result words and mm1 the two high ones.
123 packssdw mm0, mm1 ; pack
124 pxor mm0, mm4 ; restore sign
; SSSE3_QUANT15_1x8: quantize eight int16 coefficients at once in an xmm register.
;;; %1 (m128) dct coefficients; read and written with movdqa, so the address
;;;           must be 16-byte aligned
;;; %4 (xmm)  f as doublewords (added with paddd)
;;; NOTE(review): %2 and %3 are not used on the lines shown here; the callers
;;; pass xmm5 and xmm6, presumably quant_mf and i_qbits as in MMX_QUANT15_1x4
;;; -- confirm.
;;; Writes xmm0, xmm1 and xmm4 on the lines shown here.
129 %macro SSSE3_QUANT15_1x8 4
130 movdqa xmm0, %1 ; load dct coeffs
131 movdqa xmm4, xmm0 ; save sign
142 paddd xmm0, %4 ; round with f
147 packssdw xmm0, xmm1 ; pack
; psignw negates each word whose saved coefficient was negative and zeroes each
; word whose saved coefficient was zero.
148 psignw xmm0, xmm4 ; restore sign
149 movdqa %1, xmm0 ; store
153 ;-----------------------------------------------------------------------------
154 ; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
155 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes the four DC coefficients in place with one MMX_QUANT15_1x4 pass over
; the 8 bytes at parm1q.
; NOTE(review): mm5/mm6/mm7 must hold i_qmf, i_qbits and f before the macro
; runs -- presumably set up by MMX_QUANT15_DC_START; confirm.
156 ;-----------------------------------------------------------------------------
157 x264_quant_2x2_dc_core15_mmx:
159 MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
163 ;-----------------------------------------------------------------------------
164 ; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
165 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes a 4x4 block of DC coefficients in place using the single scalar
; multiplier i_qmf.
; NOTE(review): one MMX_QUANT15_1x4 pass covers 4 of the 16 coefficients; the
; repetition and pointer advance for the other rows, and the setup of mm5-mm7,
; are expected around the macro call -- confirm.
166 ;-----------------------------------------------------------------------------
167 x264_quant_4x4_dc_core15_mmx:
171 MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
178 ;-----------------------------------------------------------------------------
179 ; void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
180 ; int const quant_mf[4][4], int const i_qbits, int const f );
; Quantizes a 4x4 block in place with a per-coefficient multiplier table.
; quant_mf is stored as 32-bit ints, so each row is narrowed to four int16
; values in mm5 before MMX_QUANT15_1x4 is applied.
; NOTE(review): the setup of mm6/mm7 and the per-row repetition / pointer
; advance are expected around these lines -- confirm.
181 ;-----------------------------------------------------------------------------
182 x264_quant_4x4_core15_mmx:
; packssdw keeps mm5's own two dwords as the low two words and appends the two
; dwords at parm2q+8 as the high two words, all with signed saturation. mm5 is
; presumably preloaded from [parm2q] -- confirm.
187 packssdw mm5, [parm2q+8]
188 MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
196 ;-----------------------------------------------------------------------------
197 ; void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
198 ; int const quant_mf[8][8], int const i_qbits, int const f );
; Quantizes an 8x8 block in place with a per-coefficient multiplier table.
; Each group of four 32-bit quant_mf entries is narrowed to int16 in mm5 and
; fed to MMX_QUANT15_1x4.
; NOTE(review): one pass covers 4 of the 64 coefficients; the loop/repeat that
; walks the whole block and the setup of mm6/mm7 are expected around these
; lines -- confirm.
199 ;-----------------------------------------------------------------------------
200 x264_quant_8x8_core15_mmx:
; High two words of mm5 come from the two dwords at parm2q+8 (signed
; saturation); the low two come from mm5's previous contents.
205 packssdw mm5, [parm2q+8]
206 MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
215 ;-----------------------------------------------------------------------------
216 ; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
217 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes a 4x4 block of DC coefficients in place with the scalar multiplier
; i_qmf. Two 8-coefficient passes (bytes 0-15 and 16-31 of dct) cover all 16
; int16 values. The macro accesses dct with movdqa, so dct must be 16-byte
; aligned.
218 ;-----------------------------------------------------------------------------
219 x264_quant_4x4_dc_core15_ssse3:
220 SSE2_QUANT15_DC_START
221 SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
222 SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
226 ;-----------------------------------------------------------------------------
227 ; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
228 ; int const quant_mf[4][4], int const i_qbits, int const f );
; Quantizes a 4x4 block in place with a per-coefficient multiplier table,
; eight coefficients per step. Step x reads 32 bytes (eight 32-bit entries) of
; quant_mf and 16 bytes (eight int16 values) of dct. Both are accessed with
; movdqa, so both pointers must be 16-byte aligned.
; NOTE(review): x is an assembly-time index; its %assign/%rep scaffolding and
; the setup of xmm6/xmm7 are expected around these lines -- confirm.
229 ;-----------------------------------------------------------------------------
230 x264_quant_4x4_core15_ssse3:
234 movdqa xmm5, [parm2q+32*x]
; Narrow the eight 32-bit multipliers to int16 with signed saturation.
235 packssdw xmm5, [parm2q+32*x+16]
236 SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
242 ;-----------------------------------------------------------------------------
243 ; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
244 ; int const quant_mf[8][8], int const i_qbits, int const f );
; Quantizes an 8x8 block in place with a per-coefficient multiplier table,
; eight coefficients (one row) per step. Step x reads 32 bytes of quant_mf and
; 16 bytes of dct, both with movdqa, so both pointers must be 16-byte aligned.
; NOTE(review): x is an assembly-time index; its %assign/%rep scaffolding and
; the setup of xmm6/xmm7 are expected around these lines -- confirm.
245 ;-----------------------------------------------------------------------------
246 x264_quant_8x8_core15_ssse3:
250 movdqa xmm5, [parm2q+32*x]
; Narrow the eight 32-bit multipliers to int16 with signed saturation.
251 packssdw xmm5, [parm2q+32*x+16]
252 SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
259 ; ============================================================================
; MMXEXT_QUANT16_DC_START: register setup for the core16 MMXEXT DC quant routines.
; Takes no macro arguments. Loads the scalar i_qmf into mm5 and broadcasts its
; low word to all four words (pshufw imm 0), loads i_qbits into mm6, and
; duplicates the low dword of mm7 (f) into both dwords.
; NOTE(review): mm7 must already hold f before the punpckldq -- confirm where
; it is loaded.
261 %macro MMXEXT_QUANT16_DC_START 0
262 ; mov rdi, rdi ; &dct[0][0]
263 movd mm5, parm2d ; i_qmf
264 movd mm6, parm3d ; i_qbits
266 pshufw mm5, mm5, 0 ; i_qmf in each word
267 punpckldq mm7, mm7 ; f in each dword
; MMXEXT_QUANT16_1x4: quantize one row of four int16 coefficients, taking the
; multipliers as unsigned 16-bit values (see %2 below).
; Works on the absolute values: the sign is split off into mm4, the rounding
; term %4 is added in 32-bit lanes, the result is packed back to int16 with
; signed saturation, and the sign is re-applied.
270 %macro MMXEXT_QUANT16_1x4 4
271 ;;; %1 (m64) dct[y][x]
272 ;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
273 ;;; %3 (mmx) i_qbits in the low doubleword
274 ;;; %4 (mmx) f as doublewords
275 ;;; trashes mm0-mm2,mm4
276 movq mm0, %1 ; load dct coeffs
; pcmpgtw yields all-ones in each word where mm4 > coeff, so mm4 is expected to
; be zero on entry to this instruction (NOTE(review): confirm it is cleared first).
278 pcmpgtw mm4, mm0 ; sign(coeff)
280 psubw mm0, mm4 ; abs(coeff)
290 paddd mm0, %4 ; round with f
; mm0 supplies the two low result words and mm1 the two high ones.
295 packssdw mm0, mm1 ; pack
296 pxor mm0, mm4 ; restore sign
302 ;-----------------------------------------------------------------------------
303 ; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
304 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes the four DC coefficients in place: one setup macro followed by one
; MMXEXT_QUANT16_1x4 pass over the 8 bytes at parm1q, with the scalar i_qmf
; broadcast in mm5.
305 ;-----------------------------------------------------------------------------
306 x264_quant_2x2_dc_core16_mmxext:
307 MMXEXT_QUANT16_DC_START
308 MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
312 ;-----------------------------------------------------------------------------
313 ; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
314 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes a 4x4 block of DC coefficients in place using the scalar multiplier
; i_qmf broadcast in mm5.
; NOTE(review): one MMXEXT_QUANT16_1x4 pass covers 4 of the 16 coefficients;
; the repetition and pointer advance for the other rows are expected around
; the macro call -- confirm.
315 ;-----------------------------------------------------------------------------
316 x264_quant_4x4_dc_core16_mmxext:
317 MMXEXT_QUANT16_DC_START
320 MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
327 ;-----------------------------------------------------------------------------
328 ; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
329 ; int const quant_mf[4][4], int const i_qbits, int const f );
; Quantizes a 4x4 block in place with a per-coefficient multiplier table whose
; 32-bit entries are narrowed to unsigned 16-bit values before use.
; NOTE(review): the setup of mm6/mm7 and the per-row repetition / pointer
; advance are expected around these lines -- confirm.
330 ;-----------------------------------------------------------------------------
331 x264_quant_4x4_core16_mmxext:
; Pack the low words of four 32-bit entries a,b (at parm2q) and c,d (at
; parm2q+8) into mm5 = {a,b,c,d} without saturation:
;   1. swap the words inside each dword of a,b -> {a_hi, a_lo, b_hi, b_lo}
;   2. add {c_lo, c_hi, d_lo, d_hi}; if every high word is zero this leaves
;      {c_lo, a_lo, d_lo, b_lo}
;   3. reorder to {a_lo, b_lo, c_lo, d_lo}
; This is only exact when every entry fits in 16 bits.
335 pshufw mm5, [parm2q], 10110001b
336 paddw mm5, [parm2q+8]
337 pshufw mm5, mm5, 10001101b
338 MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
346 ;-----------------------------------------------------------------------------
347 ; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
348 ; int const quant_mf[8][8], int const i_qbits, int const f );
; Quantizes an 8x8 block in place with a per-coefficient multiplier table whose
; 32-bit entries are narrowed to unsigned 16-bit values before use.
; NOTE(review): one pass covers 4 of the 64 coefficients; the loop/repeat that
; walks the whole block and the setup of mm6/mm7 are expected around these
; lines -- confirm.
349 ;-----------------------------------------------------------------------------
350 x264_quant_8x8_core16_mmxext:
; Pack the low words of the four 32-bit entries at parm2q..parm2q+15 into mm5
; in order, without saturation: swap the words within the first two entries,
; add the next two entries, then reorder. This is only exact when every entry
; fits in 16 bits (all high words zero).
354 pshufw mm5, [parm2q], 10110001b
355 paddw mm5, [parm2q+8]
356 pshufw mm5, mm5, 10001101b
357 MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
; MMX_QUANT32_DC_START: register setup for the core32 DC quant routines.
; Takes no macro arguments. Loads the scalar i_qmf into mm5 and duplicates it
; into both dwords (kept at full 32-bit width), loads i_qbits into mm6, and
; duplicates the low dword of mm7 (f) into both dwords.
; NOTE(review): mm7 must already hold f before its punpckldq -- confirm where
; it is loaded.
366 %macro MMX_QUANT32_DC_START 0
367 ; mov rdi, rdi ; &dct[0][0]
368 movd mm5, parm2d ; i_qmf
369 movd mm6, parm3d ; i_qbits
371 punpckldq mm5, mm5 ; i_qmf in each dword
372 punpckldq mm7, mm7 ; f in each dword
; MMXEXT_QUANT32_1x4: quantize one row of four int16 coefficients against
; multipliers supplied as two operands (%2 for the first two coefficients,
; %3 for the last two).
; Works on the absolute values; the wide product is assembled from partial
; multiplies ("like in school"), rounded with %5, packed back to int16 with
; signed saturation, and the sign is re-applied.
375 %macro MMXEXT_QUANT32_1x4 5
376 ;;; %1 (m64) dct[y][x]
377 ;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; NOTE(review): callers pass %2/%3 as dword data (mm5 with i_qmf in each
;;; dword, or the int quant_mf table directly), so "(as int16_t)" above looks
;;; stale -- confirm.
378 ;;; %4 (mmx) i_qbits in the low quadword
379 ;;; %5 (mmx) f as doublewords
381 movq mm0, %1 ; load dct coeffs
; mm4 is expected to be zero here so the compare yields all-ones for negative
; coefficients (NOTE(review): confirm it is cleared first).
383 pcmpgtw mm4, mm0 ; sign(mm0)
385 psubw mm0, mm4 ; abs(mm0)
; NOTE(review): punpckhwd below reads mm1, so mm1 is expected to hold a copy of
; the absolute values at this point -- confirm.
387 punpcklwd mm0, mm0 ; duplicate the words for the upcoming
388 punpckhwd mm1, mm1 ; 32 bit multiplication
390 movq mm2, mm0 ; like in school ...
392 pmulhuw mm0, %2 ; ... multiply the parts ...
396 pslld mm0, 16 ; ... shift ...
398 paddd mm0, mm2 ; ... and add them
401 paddd mm0, %5 ; round with f
406 packssdw mm0, mm1 ; pack to int16_t
407 pxor mm0, mm4 ; restore sign
413 ;-----------------------------------------------------------------------------
414 ; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
415 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes the four DC coefficients in place with one MMXEXT_QUANT32_1x4 pass
; over the 8 bytes at parm1q. mm5 is passed for both multiplier operands
; because every coefficient uses the same scalar i_qmf.
; NOTE(review): mm5/mm6/mm7 must be set up before the macro runs -- presumably
; by MMX_QUANT32_DC_START; confirm.
416 ;-----------------------------------------------------------------------------
417 x264_quant_2x2_dc_core32_mmxext:
419 MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
423 ;-----------------------------------------------------------------------------
424 ; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
425 ; int const i_qmf, int const i_qbits, int const f );
; Quantizes a 4x4 block of DC coefficients in place using the scalar multiplier
; i_qmf; mm5 is passed for both multiplier operands of the macro.
; NOTE(review): one MMXEXT_QUANT32_1x4 pass covers 4 of the 16 coefficients;
; the repetition and pointer advance for the other rows, and the setup of
; mm5-mm7, are expected around the macro call -- confirm.
426 ;-----------------------------------------------------------------------------
427 x264_quant_4x4_dc_core32_mmxext:
431 MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
438 ;-----------------------------------------------------------------------------
439 ; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
440 ; int const quant_mf[4][4], int const i_qbits, int const f );
; Quantizes a 4x4 block in place with a per-coefficient multiplier table. The
; 32-bit quant_mf entries are handed to the macro directly as memory operands
; (two entries at parm2q, two at parm2q+8), with no narrowing step.
; NOTE(review): the setup of mm6/mm7 and the per-row repetition / pointer
; advance are expected around the macro call -- confirm.
441 ;-----------------------------------------------------------------------------
442 x264_quant_4x4_core32_mmxext:
446 MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
454 ;-----------------------------------------------------------------------------
455 ; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
456 ; int const quant_mf[8][8], int const i_qbits, int const f );
; Quantizes an 8x8 block in place with a per-coefficient multiplier table. The
; 32-bit quant_mf entries are handed to the macro directly as memory operands
; (two entries at parm2q, two at parm2q+8), with no narrowing step.
; NOTE(review): one pass covers 4 of the 64 coefficients; the loop/repeat that
; walks the whole block and the setup of mm6/mm7 are expected around the macro
; call -- confirm.
457 ;-----------------------------------------------------------------------------
458 x264_quant_8x8_core32_mmxext:
462 MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
470 ;=============================================================================
472 ;=============================================================================
; DEQUANT16_L_1x4: three-operand helper used on the non-negative-qbits path of
; the dequant routine, where it is invoked as
;   DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
; NOTE(review): presumably processes one row of four coefficients with 16-bit
; arithmetic and a left shift, as the name suggests -- confirm against the body.
474 %macro DEQUANT16_L_1x4 3
476 ;;; %2,%3 dequant_mf[i_mf][y][x]
; DEQUANT32_R_1x4: three-operand helper used on the .rshift32 (negative qbits)
; path of the dequant routine, where it is invoked as
;   DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
; NOTE(review): presumably processes one row of four coefficients with 32-bit
; arithmetic and a rounding right shift, as the name suggests -- confirm
; against the body.
488 %macro DEQUANT32_R_1x4 3
490 ;;; %2,%3 dequant_mf[i_mf][y][x]
; Body of the dequant routine template. The prototype below is written for the
; 4x4 case, but the template is instantiated for both the 4x4 and the 8x8
; entry points.
; Outline: split i_qp into i_qbits = i_qp / 6 and i_mf = i_qp % 6, advance rsi
; to the dequant_mf[i_mf] table, then pick the left-shift (16-bit) path or the
; right-shift (32-bit) path depending on the sign of the shift amount.
; NOTE(review): this body names rdi/rsi/edx directly, while the quant routines
; use parm1q/parm2q/parm3d; confirm the two agree for every ABI the file is
; built for.
522 ;;; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
525 ; mov rsi, rsi ; dequant_mf
526 ; mov edx, edx ; i_qp
; The shift by 8 is the tail of a multiply-and-shift division; the multiply is
; expected just before it (NOTE(review): confirm the constant used).
529 shr eax, 8 ; i_qbits = i_qp / 6
; NOTE(review): ecx is expected to hold 6 * i_qbits here -- confirm.
532 sub edx, ecx ; i_mf = i_qp % 6
; NOTE(review): rdx is expected to have been scaled to the byte size of one
; dequant_mf[i_mf] table before this add -- confirm.
535 add rsi, rdx ; dequant_mf[i_mf]
538 jl .rshift32 ; negative qbits => rightshift
544 DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
; pd_1 is presumably a packed-dword constant of 1s used for rounding on the
; right-shift path; GLOBAL is presumably the PIC addressing helper from
; amd64inc.asm -- confirm both.
554 movq mm6, [pd_1 GLOBAL]
560 DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
; Instantiate the dequant template once per block size. The first argument is
; the exported symbol name.
; NOTE(review): the two numeric arguments are presumably the number of 1x4 row
; passes (4 for 4x4, 16 for 8x8) and the qbits offset (4 vs 6) -- confirm
; against the DEQUANT_WxH macro header.
568 DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
569 DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6