git.sesse.net Git - x264/blob - common/amd64/quant-a.asm

   1 ;*****************************************************************************
   2 ;* quant-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*
   8 ;* This program is free software; you can redistribute it and/or modify
   9 ;* it under the terms of the GNU General Public License as published by
  10 ;* the Free Software Foundation; either version 2 of the License, or
  11 ;* (at your option) any later version.
  12 ;*
  13 ;* This program is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 ;* GNU General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU General Public License
  19 ;* along with this program; if not, write to the Free Software
  20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  21 ;*****************************************************************************
  22
   23 BITS 64
   24
   25 %include "amd64inc.asm"     ; presumably supplies cglobal, parmNq/parmNd and GLOBAL used below - not defined in this file
   26
   27 SECTION .rodata
   28 pd_1:  times 2 dd 1         ; two dwords equal to 1; loaded into mm6 by DEQUANT_WxH (.rshift32) to build the rounding term
   29
   30 SECTION .text
  31
   32 %macro MMX_QUANT_DC_START 0
      ;;; Set up the scalar quantizer arguments for the MMX DC routine:
      ;;; the low word of parm2d is replicated into all 4 words of mm6 (mf) and
      ;;; the low word of parm3d into all 4 words of mm7 (bias).
      ;;; Clobbers mm6 and mm7.
   33     movd       mm6, parm2d     ; mf
   34     movd       mm7, parm3d     ; bias
   35     pshufw     mm6, mm6, 0     ; selector 0 => every word = word 0
   36     pshufw     mm7, mm7, 0
   37 %endmacro
  38
   39 %macro SSE2_QUANT_DC_START 0
      ;;; Set up the scalar quantizer arguments for the SSE DC routines:
      ;;; the low word of parm2d is replicated into all 8 words of xmm6 (mf) and
      ;;; the low word of parm3d into all 8 words of xmm7 (bias).
      ;;; Clobbers xmm6 and xmm7.
   40     movd       xmm6, parm2d     ; mf
   41     movd       xmm7, parm3d     ; bias
   42     pshuflw    xmm6, xmm6, 0    ; low 4 words = word 0
   43     pshuflw    xmm7, xmm7, 0
   44     punpcklqdq xmm6, xmm6       ; copy low qword into high qword => all 8 words equal
   45     punpcklqdq xmm7, xmm7
   46 %endmacro
  47
   48 %macro QUANT_ONE 5
      ;;; Quantize one register's worth of 16-bit coefficients in place:
      ;;;   coeff = sign(coeff) * (((abs(coeff) + bias) * mf) >> 16)
      ;;; The add saturates as unsigned (paddusw); the multiply is unsigned (pmulhuw).
   49 ;;; %1      mov suffix: q (movq, MMX) or dqa (movdqa, SSE2)
   50 ;;; %2      register-name prefix: m (mm0/mm1) or xm (xmm0/xmm1)
   51 ;;; %3      (mem)       dct[y][x], loaded and stored back
      ;;; %4      (mem/reg)   mf[y][x] or mf[0][0] (as uint16_t)
      ;;; %5      (mem/reg)   bias[y][x] or bias[0][0] (as uint16_t)
      ;;; Clobbers %2m0 and %2m1.
   52
   53     mov%1      %2m0, %3     ; load dct coeffs
   54     pxor       %2m1, %2m1
   55     pcmpgtw    %2m1, %2m0   ; sign mask: 0xFFFF where coeff < 0, else 0
   56     pxor       %2m0, %2m1
   57     psubw      %2m0, %2m1   ; abs(coeff) via (x ^ mask) - mask
   58     paddusw    %2m0, %5     ; round: add bias with unsigned saturation
   59     pmulhuw    %2m0, %4     ; divide: keep the high 16 bits of the unsigned product with mf
   60     pxor       %2m0, %2m1   ; restore sign (same xor/sub trick, same mask)
   61     psubw      %2m0, %2m1
   62     mov%1        %3, %2m0   ; store
   63 %endmacro
   64 %macro MMX_QUANT_1x4 3
      ;;; MMX instance of QUANT_ONE (movq, mm0/mm1): quantizes 4 words.
      ;;; %1 = dct memory operand, %2 = mf, %3 = bias.
   65     QUANT_ONE q, m, %1, %2, %3
   66 %endmacro
   67 %macro SSE2_QUANT_1x8 3
      ;;; SSE2 instance of QUANT_ONE (movdqa, xmm0/xmm1): quantizes 8 words.
      ;;; %1 = dct memory operand, %2 = mf, %3 = bias.
      ;;; movdqa is an aligned move, so %1 must be 16-byte aligned.
   68     QUANT_ONE dqa, xm, %1, %2, %3
   69 %endmacro
  70
   71 %macro SSSE3_QUANT_1x8 3
      ;;; SSSE3 quantizer for 8 words, same argument order as SSE2_QUANT_1x8:
      ;;; %1 = dct memory operand (read and written back), %2 = mf, %3 = bias.
      ;;; Uses pabsw/psignw in place of the compare/xor/sub sign handling.
      ;;; Clobbers xmm0 and xmm1.
   72     movdqa     xmm1, %1     ; load dct coeffs (kept in xmm1 as the sign source)
   73     pabsw      xmm0, xmm1   ; abs(coeff)
   74     paddusw    xmm0, %3     ; round
   75     pmulhuw    xmm0, %2     ; divide
   76     psignw     xmm0, xmm1   ; restore sign; psignw also yields 0 in lanes where the loaded coeff is 0
   77     movdqa       %1, xmm0   ; store
   78 %endmacro
  79
   80 ;-----------------------------------------------------------------------------
   81 ; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
      ; Quantizes the 4 coefficients at [parm1q] in place, using the same
      ; mf and bias (low 16 bits of each) for every coefficient.
      ; Clobbers mm0, mm1, mm6, mm7. No emms is issued here.
   82 ;-----------------------------------------------------------------------------
   83 cglobal x264_quant_2x2_dc_mmxext
   84     MMX_QUANT_DC_START                  ; mm6 = mf x4, mm7 = bias x4
   85     MMX_QUANT_1x4 [parm1q], mm6, mm7
   86     ret
  87
   88 %macro QUANT_SSE 1
      ;;; Emits three quantizer functions whose names end in the suffix %1.
      ;;; The per-8-word kernel is whatever QUANT_1x8 is %define'd to at the
      ;;; point where this macro is invoked.
      ;;; Each %rep iteration handles 16 bytes (8 words); x is the byte offset.
   89 ;-----------------------------------------------------------------------------
   90 ; void x264_quant_4x4_dc_<%1>( int16_t dct[16], int mf, int bias )
   91 ;-----------------------------------------------------------------------------
   92 cglobal x264_quant_4x4_dc_%1
   93     SSE2_QUANT_DC_START                 ; xmm6 = mf x8, xmm7 = bias x8
   94 %assign x 0
   95 %rep 2                                  ; 2 * 8 words = 16 coefficients
   96     QUANT_1x8 [parm1q+x], xmm6, xmm7
   97 %assign x (x+16)
   98 %endrep
   99     ret
  100
  101 ;-----------------------------------------------------------------------------
  102 ; void x264_quant_4x4_<%1>( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
  103 ;-----------------------------------------------------------------------------
  104 cglobal x264_quant_4x4_%1
  105 %assign x 0
  106 %rep 2                                  ; 2 * 8 words = 16 coefficients
  107     QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]   ; per-coefficient mf and bias read from memory
  108 %assign x (x+16)
  109 %endrep
  110     ret
  111
  112 ;-----------------------------------------------------------------------------
  113 ; void x264_quant_8x8_<%1>( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
  114 ;-----------------------------------------------------------------------------
  115 cglobal x264_quant_8x8_%1
  116 %assign x 0
  117 %rep 8                                  ; 8 * 8 words = 64 coefficients
  118     QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
  119 %assign x (x+16)
  120 %endrep
  121     ret
  122 %endmacro
 123
  124 %define QUANT_1x8 SSE2_QUANT_1x8     ; kernel used by the next QUANT_SSE expansion
  125 QUANT_SSE sse2                       ; emits x264_quant_{4x4_dc,4x4,8x8}_sse2
  126 %ifdef HAVE_SSE3                     ; NOTE(review): guard is named SSE3 but the code below uses SSSE3 instructions (pabsw/psignw) - confirm the build's meaning of HAVE_SSE3
  127 %define QUANT_1x8 SSSE3_QUANT_1x8
  128 QUANT_SSE ssse3                      ; emits x264_quant_{4x4_dc,4x4,8x8}_ssse3
  129 %endif
 130
 131
 132
 133 ;=============================================================================
 134 ; dequant
 135 ;=============================================================================
 136
  137 %macro DEQUANT16_L_1x4 3
      ;;; Dequantize 4 coefficients with a left shift, entirely in 16-bit lanes:
      ;;;   dct = (dct * mf) << i_qbits      (low 16 bits of the product)
  138 ;;; %1      dct[y][x]                 (4 words, read and written back)
  139 ;;; %2,%3   dequant_mf[i_mf][y][x]    (two qwords = 4 dwords)
  140 ;;; mm5     i_qbits                   (shift count)
      ;;; Clobbers mm0, mm1, mm2.
  141
  142     movq     mm1, %2
  143     movq     mm2, %3
  144     movq     mm0, %1
  145     packssdw mm1, mm2     ; 4 dword mf values -> 4 words (signed saturation)
  146     pmullw   mm0, mm1     ; low 16 bits of dct * mf
  147     psllw    mm0, mm5
  148     movq     %1,  mm0
  149 %endmacro
 150
  151 %macro DEQUANT32_R_1x4 3
      ;;; Dequantize 4 coefficients with a rounded right shift, using 32-bit
      ;;; intermediates:
      ;;;   dct = saturate16( (dct * mf + f) >> shift )
  152 ;;; %1      dct[y][x]                 (4 words, read and written back)
  153 ;;; %2,%3   dequant_mf[i_mf][y][x]    (two qwords = 4 dwords)
  154 ;;; mm5     -i_qbits                  (right-shift count)
  155 ;;; mm6     f as dwords               (rounding term)
  156 ;;; mm7     0                         (listed here, but not read by this macro body)
      ;;; Clobbers mm0-mm3.
  157
  158     movq      mm0, %1
  159     movq      mm1, mm0
  160     punpcklwd mm0, mm0      ; coeffs 0,1: each word duplicated into both halves of a dword
  161     punpckhwd mm1, mm1      ; coeffs 2,3: likewise
  162
  163     movq      mm2, mm0
  164     movq      mm3, mm1
  165     pmulhw    mm0, %2       ; high 16 bits of the signed word products
  166     pmulhw    mm1, %3
  167     pmullw    mm2, %2       ; low 16 bits of the word products
  168     pmullw    mm3, %3
  169     pslld     mm0, 16       ; move hi16(dct * mf_low_word) into the upper half of each dword
  170     pslld     mm1, 16
  171     paddd     mm0, mm2      ; (hi << 16) + lo = 32-bit product; assumes each mf dword fits in a signed 16-bit value - TODO confirm
  172     paddd     mm1, mm3
  173
  174     paddd     mm0, mm6      ; + f
  175     paddd     mm1, mm6
  176     psrad     mm0, mm5      ; arithmetic shift right by -i_qbits
  177     psrad     mm1, mm5
  178
  179     packssdw  mm0, mm1      ; back to 4 words with signed saturation
  180     movq      %1,  mm0
  181 %endmacro
 182
  183 ;-----------------------------------------------------------------------------
  184 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
      ;
      ; DEQUANT_WxH emits one dequant function.
      ;   %1  name of the function to define
      ;   %2  number of 4-coefficient groups to process (unrolled with %rep)
      ;   %3  log2 of the number of ints in one dequant_mf[i_mf] table; also the
      ;       amount subtracted from i_qp/6 to get the shift count
      ; Reads its arguments directly from rdi/rsi/edx rather than through the
      ; parmN macros used by the quant functions in this file.
      ; NOTE(review): that matches a calling convention passing the first three
      ; integer args in rdi, rsi, rdx - confirm against amd64inc.asm.
      ; Clobbers eax, ecx, rdx, rsi, rdi and MMX registers; no emms is issued.
  185 ;-----------------------------------------------------------------------------
  186 %macro DEQUANT_WxH 3
  187 cglobal %1
  188 ;   rdi = dct
  189 ;   rsi = dequant_mf
  190 ;   edx = i_qp
  191
  192     imul eax, edx, 0x2b     ; 0x2b/256 = 43/256 approximates 1/6
  193     shr  eax, 8     ; i_qbits = i_qp / 6
  194     lea  ecx, [eax+eax*2]   ; ecx = 3 * (i_qp / 6)
  195     sub  edx, ecx
  196     sub  edx, ecx   ; i_mf = i_qp % 6
  197     shl  edx, %3+2          ; byte offset = i_mf * (1 << %3) entries * 4 bytes
  198     movsxd rdx, edx
  199     add  rsi, rdx   ; dequant_mf[i_mf]
  200
  201     sub  eax, %3            ; i_qbits = i_qp / 6 - %3
  202     jl   .rshift32  ; negative qbits => rightshift
  203
  204 .lshift:
  205     movd mm5, eax           ; mm5 = left-shift count for DEQUANT16_L_1x4
  206
  207 %rep %2
  208     DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
  209     add  rsi, byte 16       ; next 4 mf dwords
  210     add  rdi, byte 8        ; next 4 dct words
  211 %endrep
  212
  213     ret
  214
  215 .rshift32:
  216     neg   eax               ; eax = -i_qbits = right-shift count (> 0 here)
  217     movd  mm5, eax
  218     movq  mm6, [pd_1 GLOBAL]    ; mm6 = {1, 1}
  219     pxor  mm7, mm7          ; mm7 = 0 (the DEQUANT32_R_1x4 body does not read mm7)
  220     pslld mm6, mm5
  221     psrld mm6, 1            ; mm6 = 1 << (shift - 1): rounding term f in each dword
  222
  223 %rep %2
  224     DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
  225     add  rsi, byte 16       ; next 4 mf dwords
  226     add  rdi, byte 8        ; next 4 dct words
  227 %endrep
  228
  229     ret
  230 %endmacro
 231
  232 DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4      ; 4 groups * 4 = 16 coeffs; table stride 1<<4 ints; shift = i_qp/6 - 4
  233 DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6     ; 16 groups * 4 = 64 coeffs; table stride 1<<6 ints; shift = i_qp/6 - 6