git.sesse.net Git - x264/blob - common/x86/quant-a.asm

   1 ;*****************************************************************************
   2 ;* quant-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005-2008 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*          Christian Heine <sennindemokrit@gmx.net>
   8 ;*
   9 ;* This program is free software; you can redistribute it and/or modify
  10 ;* it under the terms of the GNU General Public License as published by
  11 ;* the Free Software Foundation; either version 2 of the License, or
  12 ;* (at your option) any later version.
  13 ;*
  14 ;* This program is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 ;* GNU General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU General Public License
  20 ;* along with this program; if not, write to the Free Software
  21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22 ;*****************************************************************************
  23
  24 %include "x86inc.asm"
  25
  26 SECTION_RODATA
  27 pw_1:     times 8 dw 1     ; eight 16-bit words, each 1 (no reference to it in the code visible here)
  28 pd_1:     times 4 dd 1     ; four 32-bit dwords, each 1; DEQUANT's .rshift32 path shifts this into its rounding term
  29
  30 %macro DQM4 3
     ;;; Emits the 8 words (16 bytes) of one dequant4_scale entry.
     ;;; %1,%2,%3   the three distinct scale values, laid out as
     ;;;            %1,%2,%1,%2 followed by %2,%3,%2,%3
  31     dw %1, %2, %1, %2, %2, %3, %2, %3
  32 %endmacro
  33 %macro DQM8 6
     ;;; Emits the 32 words (64 bytes) of one dequant8_scale entry: three 8-word
     ;;; lines of scale factors built from %1..%6, then one 8-word line of zeros.
     ;;; Within each data line the second group of 4 words repeats the first.
  34     dw %1, %4, %5, %4, %1, %4, %5, %4
  35     dw %4, %2, %6, %2, %4, %2, %6, %2
  36     dw %5, %6, %3, %6, %5, %6, %3, %6
  37     ; last line not used, just padding for power-of-2 stride
  38     times 8 dw 0
  39 %endmacro
  40
  41 dequant4_scale:
     ; Six 16-byte entries (one DQM4 each). x264_dequant_4x4_flat16_* selects an
     ; entry by adding (i_qp % 6) << 4 to this label.
  42     DQM4 10, 13, 16
  43     DQM4 11, 14, 18
  44     DQM4 13, 16, 20
  45     DQM4 14, 18, 23
  46     DQM4 16, 20, 25
  47     DQM4 18, 23, 29
  48
  49 dequant8_scale:
     ; Six 64-byte entries (one DQM8 each). x264_dequant_8x8_flat16_* selects an
     ; entry by adding i_mf << 6 to this label, where i_mf is the remainder mod 6
     ; of i_qp after that function has subtracted 12 from it.
  50     DQM8 20, 18, 32, 19, 25, 24
  51     DQM8 22, 19, 35, 21, 28, 26
  52     DQM8 26, 23, 42, 24, 33, 31
  53     DQM8 28, 25, 45, 26, 35, 33
  54     DQM8 32, 28, 51, 30, 40, 38
  55     DQM8 36, 32, 58, 34, 46, 43
  56
  57 SECTION .text
  58
  59 %macro MMX_QUANT_DC_START 0
     ;;; Loads the scalar mf and bias arguments and replicates their low 16 bits
     ;;; across a whole MMX register.
     ;;; out: mm6 = mf in all 4 words, mm7 = bias in all 4 words
  60     movd       mm6, r1m     ; mf
  61     movd       mm7, r2m     ; bias
  62     pshufw     mm6, mm6, 0  ; selector 0 copies word 0 into all 4 words
  63     pshufw     mm7, mm7, 0
  64 %endmacro
  65
  66 %macro SSE2_QUANT_DC_START 0
     ;;; Loads the scalar mf and bias arguments and replicates their low 16 bits
     ;;; across a whole XMM register.
     ;;; out: xmm6 = mf in all 8 words, xmm7 = bias in all 8 words
  67     movd       xmm6, r1m     ; mf
  68     movd       xmm7, r2m     ; bias
  69     pshuflw    xmm6, xmm6, 0 ; copy word 0 into the 4 low words
  70     pshuflw    xmm7, xmm7, 0
  71     punpcklqdq xmm6, xmm6    ; duplicate the low qword into the high qword
  72     punpcklqdq xmm7, xmm7
  73 %endmacro
  74
  75 %macro QUANT_ONE 5
  76 ;;; %1      mov suffix: q (movq) or dqa (movdqa)
  77 ;;; %2      register-name prefix: m (mm0/mm1) or xm (xmm0/xmm1)
  78 ;;; %3      (mem)       dct[y][x], read and written back in place
     ;;; %4      (mem/reg)   mf[y][x] or mf[0][0] (as uint16_t)
     ;;; %5      (mem/reg)   bias[y][x] or bias[0][0] (as uint16_t)
     ;;; Quantizes one vector of 16-bit coefficients:
     ;;;   coef = sign(coef) * (((abs(coef) + bias) * mf) >> 16)
     ;;; Clobbers registers 0 and 1 of the chosen register file.
  79
  80     mov%1      %2m0, %3     ; load dct coeffs
  81     pxor       %2m1, %2m1
  82     pcmpgtw    %2m1, %2m0   ; sign(coeff)
  83     pxor       %2m0, %2m1
  84     psubw      %2m0, %2m1   ; abs(coeff)
  85     paddusw    %2m0, %5     ; round
  86     pmulhuw    %2m0, %4     ; divide
  87     pxor       %2m0, %2m1   ; restore sign
  88     psubw      %2m0, %2m1
  89     mov%1        %3, %2m0   ; store
  90 %endmacro
  91 %macro MMX_QUANT_1x4 3
     ;;; Quantizes 4 coefficients (one 8-byte MMX vector) via QUANT_ONE.
     ;;; %1 = dct (mem), %2 = mf, %3 = bias.  Clobbers mm0, mm1.
  92     QUANT_ONE q, m, %1, %2, %3
  93 %endmacro
  94 %macro SSE2_QUANT_1x8 3
     ;;; Quantizes 8 coefficients (one 16-byte XMM vector) via QUANT_ONE.
     ;;; %1 = dct (mem), %2 = mf, %3 = bias.  Clobbers xmm0, xmm1.
     ;;; Uses movdqa, so %1 must be 16-byte aligned.
  95     QUANT_ONE dqa, xm, %1, %2, %3
  96 %endmacro
  97
  98 %macro SSSE3_QUANT_1x8 3
     ;;; SSSE3 variant of SSE2_QUANT_1x8: quantizes 8 coefficients in place.
     ;;; %1 = dct (mem, 16-byte aligned), %2 = mf, %3 = bias.  Clobbers xmm0, xmm1.
     ;;; NOTE(review): psignw zeroes any lane whose source coefficient is 0, whereas
     ;;; the SSE2 path yields (bias*mf)>>16 there. The two agree only if that value
     ;;; is 0 -- presumably guaranteed by the caller's mf/bias tables; confirm.
  99     movdqa     xmm1, %1     ; load dct coeffs
 100     pabsw      xmm0, xmm1
 101     paddusw    xmm0, %3     ; round
 102     pmulhuw    xmm0, %2     ; divide
 103     psignw     xmm0, xmm1   ; restore sign
 104     movdqa       %1, xmm0   ; store
 105 %endmacro
 106
 107 ;-----------------------------------------------------------------------------
 108 ; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
 109 ;-----------------------------------------------------------------------------
 110 cglobal x264_quant_2x2_dc_mmxext, 1,1
     ; r0 = dct; mf and bias are fetched through r1m/r2m by MMX_QUANT_DC_START.
     ; The 4 coefficients occupy one 8-byte MMX vector, so one quant step suffices.
     ; cglobal and RET are defined in x86inc.asm (not shown here).
 111     MMX_QUANT_DC_START
 112     MMX_QUANT_1x4 [r0], mm6, mm7
 113     RET
 114
 115 ;-----------------------------------------------------------------------------
 116 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
 117 ;-----------------------------------------------------------------------------
 118 %macro QUANT_DC 6
     ;;; Generates a DC quant function that applies one scalar mf/bias to a block.
     ;;; %1      function name
     ;;; %2      setup macro that broadcasts mf/bias into registers 6 and 7
     ;;; %3      per-vector quant macro (dct, mf, bias)
     ;;; %4      register-name prefix (m or xm) matching %2/%3
     ;;; %5      number of vectors to process (unrolled)
     ;;; %6      byte stride between consecutive vectors
 119 cglobal %1, 1,1
 120     %2
 121 %assign x 0
 122 %rep %5
 123     %3 [r0+x], %4m6, %4m7
 124 %assign x x+%6
 125 %endrep
 126     RET
 127 %endmacro
 128
 129 ;-----------------------------------------------------------------------------
 130 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 131 ;-----------------------------------------------------------------------------
 132 %macro QUANT_AC 4
     ;;; Generates a quant function with per-coefficient mf and bias arrays.
     ;;; %1      function name
     ;;; %2      per-vector quant macro (dct, mf, bias)
     ;;; %3      number of vectors to process (unrolled)
     ;;; %4      byte stride between consecutive vectors; the same offset is
     ;;;         applied to dct (r0), mf (r1) and bias (r2)
 133 cglobal %1, 3,3
 134 %assign x 0
 135 %rep %3
 136     %2 [r0+x], [r1+x], [r2+x]
 137 %assign x x+%4
 138 %endrep
 139     RET
 140 %endmacro
 141
 142 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
     ; MMX: 8 bytes (4 coefficients) per step -> 4 steps for 4x4, 16 steps for 8x8.
 143 QUANT_DC x264_quant_4x4_dc_mmxext, MMX_QUANT_DC_START, MMX_QUANT_1x4, m, 4, 8
 144 QUANT_AC x264_quant_4x4_mmx, MMX_QUANT_1x4, 4, 8
 145 QUANT_AC x264_quant_8x8_mmx, MMX_QUANT_1x4, 16, 8
 146 %endif
 147
     ; SSE2: 16 bytes (8 coefficients) per step -> 2 steps for 4x4, 8 steps for 8x8.
 148 QUANT_DC x264_quant_4x4_dc_sse2, SSE2_QUANT_DC_START, SSE2_QUANT_1x8, xm, 2, 16
 149 QUANT_AC x264_quant_4x4_sse2, SSE2_QUANT_1x8, 2, 16
 150 QUANT_AC x264_quant_8x8_sse2, SSE2_QUANT_1x8, 8, 16
 151
     ; NOTE(review): the guard symbol is HAVE_SSE3, but the guarded code uses the
     ; SSSE3 instructions pabsw/psignw. Presumably the build system defines
     ; HAVE_SSE3 when the assembler accepts SSSE3 -- confirm before renaming.
 152 %ifdef HAVE_SSE3
 153 QUANT_DC x264_quant_4x4_dc_ssse3, SSE2_QUANT_DC_START, SSSE3_QUANT_1x8, xm, 2, 16
 154 QUANT_AC x264_quant_4x4_ssse3, SSSE3_QUANT_1x8, 2, 16
 155 QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
 156 %endif
 157
 158
 159
 160 ;=============================================================================
 161 ; dequant
 162 ;=============================================================================
 163
 164 %macro DEQUANT16_L 3
 165 ;;; %1      dct[y][x]
 166 ;;; %2,%3   dequant_mf[i_mf][y][x]
 167 ;;; m5      i_qbits
     ;;; Left-shift dequant using 16-bit arithmetic: dct = (dct * mf) << i_qbits.
     ;;; Clobbers m0. m0..m7 (and presumably movq) are mapped to MMX or XMM forms
     ;;; by INIT_MMX/INIT_XMM from x86inc.asm, which is not shown here.
 168
 169     movq     m0, %2
 170     packssdw m0, %3     ; narrow the 32-bit mf entries to words (signed saturation)
 171     pmullw   m0, %1     ; low 16 bits of mf * dct
 172     psllw    m0, m5
 173     movq     %1, m0
 174 %endmacro
 175
 176 %macro DEQUANT32_R 3
 177 ;;; %1      dct[y][x]
 178 ;;; %2,%3   dequant_mf[i_mf][y][x]
 179 ;;; m5      -i_qbits
 180 ;;; m6      f
 181 ;;; m7      0
     ;;; Right-shift dequant using 32-bit intermediates:
     ;;;   dct = saturate16((dct * mf + f) >> -i_qbits)
     ;;; Clobbers m0, m1.
 182
 183     movq      m0, %1
 184     movq      m1, m0
 185     punpcklwd m0, m7    ; each dword = (coef in low word, 0 in high word)
 186     punpckhwd m1, m7
 187     pmaddwd   m0, %2    ; signed coef * low word of mf; the high word of mf meets the 0
 188     pmaddwd   m1, %3    ; NOTE(review): correct only while every mf fits in a signed word
 189     paddd     m0, m6    ; add rounding term f
 190     paddd     m1, m6
 191     psrad     m0, m5    ; arithmetic shift keeps the sign
 192     psrad     m1, m5
 193     packssdw  m0, m1    ; back to 16-bit with signed saturation
 194     movq      %1, m0
 195 %endmacro
 196
 197 %macro DEQUANT_LOOP 3
     ;;; Applies a per-vector dequant macro to a whole block, two vectors per step.
     ;;; %1      per-vector macro (dct, mf_lo, mf_hi): DEQUANT16_L or DEQUANT32_R
     ;;; %2      block size in 8-byte units of dct (DEQUANT passes coefficients/4)
     ;;; %3      vector size in 8-byte units (DEQUANT passes 1 for mmx, 2 for sse2)
     ;;; t0      byte offset into dct, counted down to 0. dequant_mf is addressed
     ;;;         at t0*2 because the code steps it twice as fast as dct (the
     ;;;         prototype comment declares it as int against int16_t dct).
     ;;; When the block is exactly two vectors the start offset is 0 and the loop
     ;;; is replaced by straight-line code. Both variants end in a return, so code
     ;;; placed after an invocation is reachable only by a jump.
 198 %if 8*(%2-2*%3)
 199     mov t0d, 8*(%2-2*%3)
 200 %%loop:
 201     %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
 202     %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]
 203     sub t0d, 16*%3
 204     jge %%loop
     ; two-byte return placed directly after the conditional branch
 205     rep ret
 206 %else
 207     %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
 208     %1 [r0     ], [r1      ], [r1+ 8*%3]
 209     ret
 210 %endif
 211 %endmacro
 212
 213 %macro DEQUANT16_FLAT 2-8
     ;;; Dequantizes several dct vectors that all share one vector of scale factors.
     ;;; %1      memory operand holding the scale factors (loaded into m0)
     ;;; %2...   byte offsets from r0 (dct) of the vectors to process, in place
     ;;; m7      left-shift count
     ;;; Computes dct = (dct * scale) << m7 for each offset. The loop variable i
     ;;; counts down from %0-2 to 0, so each offset gets its own register; the
     ;;; final iteration (i == 0) multiplies m0 itself, which is why the scale
     ;;; must stay intact in m0 until then. Clobbers m0 .. m(%0-2).
 214     movq   m0, %1
 215 %assign i %0-2
 216 %rep %0-1
 217 %if i
 218     movq   m %+ i, [r0+%2]
 219     pmullw m %+ i, m0
 220 %else
 221     pmullw m0, [r0+%2]
 222 %endif
 223     psllw  m %+ i, m7
 224     movq   [r0+%2], m %+ i
 225     %assign i i-1
     ; bring the next offset into the %2 position
 226     %rotate 1
 227 %endrep
 228 %endmacro
 229
 230 %ifdef ARCH_X86_64
     ; Scratch-register aliases used by the DEQUANT functions:
     ;   t0 = i_qbits, later the loop offset in DEQUANT_LOOP
     ;   t1 = temporary for the modulo computation
     ;   t2 = i_qp, then the byte offset of the selected table entry
     ; On x86_64 t2 is r2 (the i_qp argument) and t0/t1 are r4/r3, leaving the
     ; r0/r1 pointer arguments untouched.
 231     %define t0  r4
 232     %define t0d r4d
 233     %define t1  r3
 234     %define t1d r3d
 235     %define t2  r2
 236     %define t2d r2d
 237 %else
     ; On x86_32 the aliases reuse r0..r2; DEQUANT fetches its arguments from
     ; r0m/r1m/r2m explicitly once the scratch values are no longer needed.
 238     %define t0  r2
 239     %define t0d r2d
 240     %define t1  r0
 241     %define t1d r0d
 242     %define t2  r1
 243     %define t2d r1d
 244 %endif
 245
 246 ;-----------------------------------------------------------------------------
 247 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
 248 ;-----------------------------------------------------------------------------
 249 %macro DEQUANT 4
     ;;; Generates x264_dequant_NxN_<suffix> and x264_dequant_NxN_flat16_<suffix>.
     ;;; %1      function-name suffix (mmx or sse2)
     ;;; %2      block dimension N (4 or 8)
     ;;; %3      log2 of the coefficient count per block (4 for 4x4, 6 for 8x8)
     ;;; %4      vector size in 8-byte units (1 for mmx, 2 for sse2)
     ;;; Uses the t0/t1/t2 aliases defined above this macro. On x86_64 t0/t1 are
     ;;; r4/r3, beyond the 3 registers named in the cglobal line.
     ;;; NOTE(review): that relies on r3/r4 being caller-saved in the target ABI
     ;;; (register mapping lives in x86inc.asm, not shown) -- confirm.
 250 cglobal x264_dequant_%2x%2_%1, 0,3
 251     movifnidn t2d, r2m
     ; i_qp * 43 >> 8 equals i_qp / 6 for 0 <= i_qp <= 127
 252     imul t0d, t2d, 0x2b
 253     shr  t0d, 8     ; i_qbits = i_qp / 6
 254     lea  t1, [t0*3]
 255     sub  t2d, t1d
 256     sub  t2d, t1d   ; i_mf = i_qp % 6
     ; scale i_mf to a byte offset: (1 << %3) coefficients of 4 bytes each
 257     shl  t2d, %3+2
 258 %ifdef ARCH_X86_64
 259     add  r1, t2     ; dequant_mf[i_mf]
 260 %else
     ; here t2 is r1, so r1 already holds the offset; add the pointer argument
 261     add  r1, r1m    ; dequant_mf[i_mf]
 262     mov  r0, r0m    ; dct
 263 %endif
     ; effective shift is i_qp/6 - %3
 264     sub  t0d, %3
 265     jl   .rshift32  ; negative qbits => rightshift
 266
 267 .lshift:
 268     movd m5, t0d
     ; DEQUANT_LOOP returns from the function; no fall-through into .rshift32
 269     DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
 270
 271 .rshift32:
 272     neg   t0d
 273     movd  m5, t0d
     ; picgetgot and GLOBAL come from x86inc.asm (not shown); t0 is free to be
     ; overwritten here because the shift count is already in m5
 274     picgetgot t0d
 275     movq  m6, [pd_1 GLOBAL]
 276     pxor  m7, m7
     ; f = (1 << -i_qbits) >> 1, the rounding term expected by DEQUANT32_R
 277     pslld m6, m5
 278     psrld m6, 1
 279     DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
 280
     ; flat16 variant: takes the scale factors from the dequant%2_scale table in
     ; this file instead of reading the dequant_mf argument, and only shifts left.
 281 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
 282     movifnidn t2d, r2m
 283 %if %2 == 8
     ; 8x8 only: for i_qp < 12 tail-jump to the general routine above (arguments
     ; are still untouched); otherwise continue with i_qp - 12. Presumably the
     ; shift would be negative below 12 -- confirm against the C reference.
 284     cmp  t2d, 12
 285     jl x264_dequant_%2x%2_%1
 286     sub  t2d, 12
 287 %endif
 288     imul t0d, t2d, 0x2b
 289     shr  t0d, 8     ; i_qbits = i_qp / 6
 290     lea  t1, [t0*3]
 291     sub  t2d, t1d
 292     sub  t2d, t1d   ; i_mf = i_qp % 6
     ; table entries are (1 << %3) bytes each (16 for 4x4, 64 for 8x8)
 293     shl  t2d, %3
 294 %ifdef PIC64
 295     lea  r1, [dequant%2_scale GLOBAL]
 296     add  r1, t2
 297 %else
 298     picgetgot r0
 299     lea  r1, [t2 + dequant%2_scale GLOBAL]
 300 %endif
     ; (re)load the dct pointer into r0 if it is not already there
 301     movifnidn r0d, r0m
 302     movd m7, t0d
     ; Each DEQUANT16_FLAT call pairs one vector of the table entry at r1 with the
     ; dct byte offsets that use those scale factors.
 303 %if %2 == 4
 304 %ifidn %1, mmx
 305     DEQUANT16_FLAT [r1], 0, 16
 306     DEQUANT16_FLAT [r1+8], 8, 24
 307 %else
 308     DEQUANT16_FLAT [r1], 0, 16
 309 %endif
 310 %elifidn %1, mmx
     ; 8x8 with 8-byte vectors: both halves of a 16-byte table line are equal, so
     ; one load serves the two halves of each dct row
 311     DEQUANT16_FLAT [r1], 0, 8, 64, 72
 312     DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
 313     DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
 314     DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
 315 %else
     ; 8x8 with 16-byte vectors: table line 0 -> rows 0,4; line 1 -> rows 1,3,5,7;
     ; line 2 -> rows 2,6
 316     DEQUANT16_FLAT [r1], 0, 64
 317     DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
 318     DEQUANT16_FLAT [r1+32], 32, 96
 319 %endif
 320     ret
 321 %endmacro ; DEQUANT
 322
 323 %ifndef ARCH_X86_64
     ; DEQUANT arguments: suffix, block dimension, log2(coefficients per block),
     ; vector size in 8-byte units. The MMX versions are assembled only when
     ; ARCH_X86_64 is not defined.
 324 INIT_MMX
 325 DEQUANT mmx, 4, 4, 1
 326 DEQUANT mmx, 8, 6, 1
 327 %endif
 328 INIT_XMM
 329 DEQUANT sse2, 4, 4, 2
 330 DEQUANT sse2, 8, 6, 2
 331