git.sesse.net Git - x264/blob - common/x86/quant-a.asm

   1 ;*****************************************************************************
   2 ;* quant-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005-2008 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*          Christian Heine <sennindemokrit@gmx.net>
   8 ;*
   9 ;* This program is free software; you can redistribute it and/or modify
  10 ;* it under the terms of the GNU General Public License as published by
  11 ;* the Free Software Foundation; either version 2 of the License, or
  12 ;* (at your option) any later version.
  13 ;*
  14 ;* This program is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 ;* GNU General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU General Public License
  20 ;* along with this program; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22 ;*****************************************************************************
  23
  24 %include "x86inc.asm"
  25
  26 SECTION_RODATA
     ; Packed constants holding the value 1 in every lane.
     ;   pw_1: eight 16-bit words
     ;   pd_1: four 32-bit dwords; DEQUANT's .rshift32 path shifts this to build
     ;         its rounding term.
  27 pw_1:     times 8 dw 1
  28 pd_1:     times 4 dd 1
  29
  30 %macro DQM4 3
     ;;; Emits one 16-byte entry (8 words) of the flat 4x4 dequant scale table.
     ;;; Words 0-3 are the pattern %1,%2,%1,%2 and words 4-7 are %2,%3,%2,%3.
     ;;; The flat16 4x4 code multiplies the first half into dct byte offsets
     ;;; 0 and 16 and the second half into offsets 8 and 24.
  31     dw %1, %2, %1, %2, %2, %3, %2, %3
  32 %endmacro
  33 %macro DQM8 6
     ;;; Emits one 64-byte entry of the flat 8x8 dequant scale table:
     ;;; three distinct 8-word rows built from the six arguments, followed by
     ;;; one all-zero row.  The flat16 8x8 code reads the rows at byte offsets
     ;;; +0, +16 and +32 of the entry and reuses each for several dct rows.
  34     dw %1, %4, %5, %4, %1, %4, %5, %4
  35     dw %4, %2, %6, %2, %4, %2, %6, %2
  36     dw %5, %6, %3, %6, %5, %6, %3, %6
  37     ; last line not used, just padding for power-of-2 stride
  38     times 8 dw 0
  39 %endmacro
  40
  41 dequant4_scale:
     ; Six 16-byte entries, one per value of i_qp % 6.  The flat16 4x4 routine
     ; indexes this table with (i_qp % 6) << 4.
  42     DQM4 10, 13, 16
  43     DQM4 11, 14, 18
  44     DQM4 13, 16, 20
  45     DQM4 14, 18, 23
  46     DQM4 16, 20, 25
  47     DQM4 18, 23, 29
  48
  49 dequant8_scale:
     ; Six 64-byte entries, one per value of i_qp % 6.  The flat16 8x8 routine
     ; indexes this table with (i_qp % 6) << 6.
  50     DQM8 20, 18, 32, 19, 25, 24
  51     DQM8 22, 19, 35, 21, 28, 26
  52     DQM8 26, 23, 42, 24, 33, 31
  53     DQM8 28, 25, 45, 26, 35, 33
  54     DQM8 32, 28, 51, 30, 40, 38
  55     DQM8 36, 32, 58, 34, 46, 43
  56
  57 SECTION .text
  58
  59 %macro QUANT_DC_START 0
     ;;; Loads the scalar mf and bias arguments and broadcasts the low 16 bits
     ;;; of each to every word lane.
     ;;; Out: m6 = mf in all words, m7 = bias in all words.
     ;;; r1m/r2m are the x86inc.asm argument accessors (defined outside this file).
  60     movd       m6, r1m     ; mf
  61     movd       m7, r2m     ; bias
     ; m0 expands to mm0 only in MMX mode, so this selects the 64-bit path.
  62 %ifidn m0, mm0
     ; MMX: replicate word 0 into all four words.
  63     pshufw     m6, m6, 0
  64     pshufw     m7, m7, 0
  65 %else
     ; XMM: replicate word 0 across the low qword, then copy the low qword
     ; into the high qword.
  66     pshuflw    m6, m6, 0
  67     pshuflw    m7, m7, 0
  68     punpcklqdq m6, m6
  69     punpcklqdq m7, m7
  70 %endif
  71 %endmacro
  72
  73 %macro PABSW_MMX 2
     ;;; Per-word absolute value without SSSE3.
     ;;; %1 = destination register name, %2 = source register name.
     ;;; NOTE: unlike pabsw this destroys the source.  Afterwards the name %1
     ;;; holds |src| and the name %2 holds the per-word sign mask (0 or -1),
     ;;; which is the form PSIGNW_MMX expects.
  74     pxor       %1, %1      ; %1 = 0
  75     pcmpgtw    %1, %2      ; %1 = -1 where the source word is negative
  76     pxor       %2, %1      ; one's complement of the negative words
  77     psubw      %2, %1      ; +1 on those words => two's-complement negate
     ; SWAP comes from x86inc.asm (not shown here); presumably it exchanges the
     ; register-name mappings at assembly time so that %1 names the result.
  78     SWAP       %1, %2
  79 %endmacro
  80
  81 %macro PSIGNW_MMX 2
     ;;; Conditionally negates each word of %1.
     ;;; %2 must be a per-word mask of 0 (keep) or -1 (negate), i.e. the value
     ;;; PABSW_MMX leaves in its source operand -- not the original signed data.
  82     pxor       %1, %2
  83     psubw      %1, %2
  84 %endmacro
  85
  86 %macro PABSW_SSSE3 2
     ;;; Per-word absolute value using the SSSE3 instruction; %2 is left intact.
  87     pabsw      %1, %2
  88 %endmacro
  89
  90 %macro PSIGNW_SSSE3 2
     ;;; Applies the sign of each word of %2 to the matching word of %1 using the
     ;;; SSSE3 instruction.  Here %2 is the original signed data, not a mask.
  91     psignw     %1, %2
  92 %endmacro
  93
  94 %macro QUANT_ONE 3
     ;;; Quantizes one vector of coefficients in place:
     ;;;     dct = sign(dct) * (((|dct| + bias) * mf) >> 16)
     ;;; The add saturates as unsigned 16-bit.  Clobbers m0 and m1.
     ;;; PABSW/PSIGNW must be %defined to a matching MMX or SSSE3 pair before use.
  95 ;;; %1      (m64)       dct[y][x]
  96 ;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
  97 ;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
  98     mova       m1, %1   ; load dct coeffs
  99     PABSW      m0, m1
 100     paddusw    m0, %3   ; round
 101     pmulhuw    m0, %2   ; divide
 102     PSIGNW     m0, m1   ; restore sign
 103     mova       %1, m0   ; store
 104 %endmacro
 105
 106 ;-----------------------------------------------------------------------------
 107 ; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
 108 ;-----------------------------------------------------------------------------
     ;;; QUANT_DC name, nvec
     ;;; Defines a function that quantizes nvec consecutive vectors at dct (r0)
     ;;; with one scalar mf and one scalar bias shared by every coefficient.
     ;;; %1 = function name, %2 = number of regsize-byte vectors to process.
     ;;; The loop is fully unrolled at assembly time.
 109 %macro QUANT_DC 2
     ; Only dct is loaded into a register; mf and bias are fetched by
     ; QUANT_DC_START through r1m/r2m.
 110 cglobal %1, 1,1
 111     QUANT_DC_START
 112 %assign x 0
 113 %rep %2
 114     QUANT_ONE [r0+x], m6, m7
 115 %assign x x+regsize
 116 %endrep
 117     RET
 118 %endmacro
 119
 120 ;-----------------------------------------------------------------------------
 121 ; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 122 ;-----------------------------------------------------------------------------
     ;;; QUANT_AC name, nvec
     ;;; Defines a function that quantizes nvec consecutive vectors at dct (r0)
     ;;; using per-coefficient mf (r1) and bias (r2) arrays laid out like dct.
     ;;; %1 = function name, %2 = number of regsize-byte vectors to process.
     ;;; The loop is fully unrolled at assembly time.
 123 %macro QUANT_AC 2
 124 cglobal %1, 3,3
 125 %assign x 0
 126 %rep %2
 127     QUANT_ONE [r0+x], [r1+x], [r2+x]
 128 %assign x x+regsize
 129 %endrep
 130     RET
 131 %endmacro
 132
 133 INIT_MMX
     ; Instantiate the quant functions.  The second macro argument is the vector
     ; count, i.e. block bytes / regsize (regsize is set by INIT_MMX/INIT_XMM in
     ; x86inc.asm -- presumably 8 and 16 bytes respectively).
     ; PABSW/PSIGNW select the abs/sign helper pair for everything that follows
     ; until they are redefined.
 134 %define PABSW PABSW_MMX
 135 %define PSIGNW PSIGNW_MMX
 136 QUANT_DC x264_quant_2x2_dc_mmxext, 1
 137 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
 138 QUANT_DC x264_quant_4x4_dc_mmxext, 4
 139 QUANT_AC x264_quant_4x4_mmx, 4
 140 QUANT_AC x264_quant_8x8_mmx, 16
 141 %endif
 142
     ; XMM registers, still using the pre-SSSE3 abs/sign emulation.
 143 INIT_XMM
 144 QUANT_DC x264_quant_4x4_dc_sse2, 2
 145 QUANT_AC x264_quant_4x4_sse2, 2
 146 QUANT_AC x264_quant_8x8_sse2, 8
 147
     ; XMM registers with native pabsw/psignw.
 148 %define PABSW PABSW_SSSE3
 149 %define PSIGNW PSIGNW_SSSE3
 150 QUANT_DC x264_quant_4x4_dc_ssse3, 2
 151 QUANT_AC x264_quant_4x4_ssse3, 2
 152 QUANT_AC x264_quant_8x8_ssse3, 8
 153
     ; The 2x2 DC block is a single 64-bit vector, so it uses MMX registers while
     ; keeping the SSSE3 abs/sign definitions from above.
 154 INIT_MMX
 155 QUANT_DC x264_quant_2x2_dc_ssse3, 1
 156
 157
 158
 159 ;=============================================================================
 160 ; dequant
 161 ;=============================================================================
 162
 163 %macro DEQUANT16_L 3
     ;;; Dequantizes one vector of coefficients in place with a left shift:
     ;;;     dct = (dct * dequant_mf) << i_qbits      (16-bit arithmetic)
     ;;; The two dword vectors %2 and %3 are first narrowed to one word vector
     ;;; with signed saturation.  Clobbers m0.
 164 ;;; %1      dct[y][x]
 165 ;;; %2,%3   dequant_mf[i_mf][y][x]
 166 ;;; m5      i_qbits
 167
 168     mova     m0, %2
 169     packssdw m0, %3      ; 2 vectors of int32 mf -> 1 vector of int16
 170     pmullw   m0, %1      ; low 16 bits of dct * mf
 171     psllw    m0, m5
 172     mova     %1, m0
 173 %endmacro
 174
 175 %macro DEQUANT32_R 3
     ;;; Dequantizes one vector of coefficients in place with a rounding right
     ;;; shift, using 32-bit intermediates:
     ;;;     dct = (dct * dequant_mf + f) >> -i_qbits
     ;;; The result is packed back to words with signed saturation.
     ;;; Clobbers m0 and m1.
 176 ;;; %1      dct[y][x]
 177 ;;; %2,%3   dequant_mf[i_mf][y][x]
 178 ;;; m5      -i_qbits
 179 ;;; m6      f
 180 ;;; m7      0
 181
 182     mova      m0, %1
 183     mova      m1, m0
     ; Interleave each coefficient with a zero word, so every dword lane holds
     ; the word pair (coef, 0).
 184     punpcklwd m0, m7
 185     punpckhwd m1, m7
     ; pmaddwd computes coef*lo16(mf) + 0*hi16(mf) per dword lane, i.e. a signed
     ; 16x16 -> 32-bit multiply by the low word of each mf entry.
 186     pmaddwd   m0, %2
 187     pmaddwd   m1, %3
 188     paddd     m0, m6     ; add rounding term
 189     paddd     m1, m6
 190     psrad     m0, m5     ; arithmetic shift keeps the sign
 191     psrad     m1, m5
 192     packssdw  m0, m1
 193     mova      %1, m0
 194 %endmacro
 195
 196 %macro DEQUANT_LOOP 3
     ;;; Applies a per-vector dequant macro to a whole block and returns.
     ;;; %1 = macro to invoke (DEQUANT16_L or DEQUANT32_R)
     ;;; %2 = block size in units of 8 bytes of dct (DEQUANT passes %2*%2/4)
     ;;; %3 = vector size in units of 8 bytes (8*%3 is used as the vector stride)
     ;;; r0 = dct, r1 = dequant_mf[i_mf]; t0 is used as the byte offset into dct.
     ;;; Offsets into r1 are doubled because mf entries are dwords while dct
     ;;; entries are words.  Each step handles two vectors (16*%3 bytes of dct).
     ;;; Both branches end in a return, so code placed after this macro is
     ;;; never reached by fall-through.
     ; Non-zero when the block is larger than two vectors: loop from the last
     ; pair of vectors down to offset 0.
 197 %if 8*(%2-2*%3)
 198     mov t0d, 8*(%2-2*%3)
 199 %%loop:
 200     %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
 201     %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]
 202     sub t0d, 16*%3
 203     jge %%loop           ; the iteration with t0 == 0 is still executed
     ; Two-byte return form placed right after the conditional jump --
     ; presumably the usual branch-predictor workaround for older AMD CPUs.
 204     rep ret
 205 %else
     ; The block is exactly two vectors: no loop needed.
 206     %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
 207     %1 [r0     ], [r1      ], [r1+ 8*%3]
 208     ret
 209 %endif
 210 %endmacro
 211
 212 %macro DEQUANT16_FLAT 2-8
     ;;; Multiplies several dct vectors by one shared scale vector, shifts them
     ;;; left and stores them back:  dct[off] = (dct[off] * scale) << m7
     ;;; %1      memory operand holding the scale vector
     ;;; %2...   1 to 7 byte offsets from r0 (dct) that use this scale
     ;;; m7      shift count
     ;;; Clobbers m0 .. m(N-1) for N offsets; each offset gets its own register.
 213     mova   m0, %1
     ; i counts down from N-1 to 0.  m0 holds the scale, so it can only be used
     ; as a destination for the last offset (i == 0).
 214 %assign i %0-2
 215 %rep %0-1
 216 %if i
 217     mova   m %+ i, [r0+%2]
 218     pmullw m %+ i, m0
 219 %else
 220     pmullw m0, [r0+%2]
 221 %endif
 222     psllw  m %+ i, m7
 223     mova   [r0+%2], m %+ i
 224     %assign i i-1
     ; Shift the parameter list so that %2 names the next offset.
 225     %rotate 1
 226 %endrep
 227 %endmacro
 228
 229 %ifdef ARCH_X86_64
     ; Temporary-register aliases used by the DEQUANT functions.
     ; On x86_64, t2 is r2, the register that i_qp arrives in, and t0/t1 are
     ; registers beyond the three arguments.
 230     %define t0  r4
 231     %define t0d r4d
 232     %define t1  r3
 233     %define t1d r3d
 234     %define t2  r2
 235     %define t2d r2d
 236 %else
     ; On x86_32 the DEQUANT functions are declared with no arguments loaded into
     ; registers, so r0-r2 double as temporaries until the real arguments are
     ; fetched from r0m/r1m/r2m.  t2 aliases r1, which is why DEQUANT can form
     ; the table pointer with "add r1, r1m".
 237     %define t0  r2
 238     %define t0d r2d
 239     %define t1  r0
 240     %define t1d r0d
 241     %define t2  r1
 242     %define t2d r1d
 243 %endif
 244
 245 ;-----------------------------------------------------------------------------
 246 ; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
 247 ;-----------------------------------------------------------------------------
     ;;; DEQUANT cpu, N, log2(N*N), regsize/8
     ;;; Defines two functions for an NxN block:
     ;;;   x264_dequant_NxN_<cpu>         uses the caller's dequant_mf table
     ;;;   x264_dequant_NxN_flat16_<cpu>  ignores the table argument and uses the
     ;;;                                  16-bit dequant<N>_scale table instead
     ;;; %1 = cpu suffix (mmx, sse2)
     ;;; %2 = block width N (4 or 8)
     ;;; %3 = log2 of the number of coefficients (4 or 6 as instantiated)
     ;;; %4 = vector size in 8-byte units, forwarded to DEQUANT_LOOP
     ;;; cglobal, movifnidn, picgetgot and GLOBAL come from x86inc.asm (not shown).
 248 %macro DEQUANT 4
 249 cglobal x264_dequant_%2x%2_%1, 0,3
 250     movifnidn t2d, r2m
     ; 0x2b/256 = 43/256 approximates 1/6.
 251     imul t0d, t2d, 0x2b
 252     shr  t0d, 8     ; i_qbits = i_qp / 6
 253     lea  t1, [t0*3]
 254     sub  t2d, t1d
 255     sub  t2d, t1d   ; i_mf = i_qp % 6
     ; Convert i_mf to a byte offset: 2^%3 entries of 4 bytes per table.
 256     shl  t2d, %3+2
 257 %ifdef ARCH_X86_64
 258     add  r1, t2     ; dequant_mf[i_mf]
 259 %else
     ; t2 is r1 here, so this adds the table pointer to the offset.
 260     add  r1, r1m    ; dequant_mf[i_mf]
 261     mov  r0, r0m    ; dct
 262 %endif
     ; The effective shift is i_qp/6 - %3; the flags from this sub pick the path.
 263     sub  t0d, %3
 264     jl   .rshift32  ; negative qbits => rightshift
 265
 266 .lshift:
 267     movd m5, t0d
     ; DEQUANT_LOOP returns, so this does not fall through into .rshift32.
 268     DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
 269
 270 .rshift32:
 271     neg   t0d
 272     movd  m5, t0d
 273     picgetgot t0d
     ; Rounding term f = (1 << shift) >> 1 in every dword lane.
 274     mova  m6, [pd_1 GLOBAL]
 275     pxor  m7, m7
 276     pslld m6, m5
 277     psrld m6, 1
 278     DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
 279
     ; Flat variant: same i_qp decomposition, but the multiplier comes from the
     ; compact 16-bit dequant%2_scale table and only a left shift is applied.
 280 cglobal x264_dequant_%2x%2_flat16_%1, 0,3
 281     movifnidn t2d, r2m
 282 %if %2 == 8
     ; For 8x8 the shift below is (i_qp - 12) / 6.  When i_qp < 12 the generic
     ; function defined above is used instead, via a tail jump.
 283     cmp  t2d, 12
 284     jl x264_dequant_%2x%2_%1
 285     sub  t2d, 12
 286 %endif
 287     imul t0d, t2d, 0x2b
 288     shr  t0d, 8     ; i_qbits = i_qp / 6
 289     lea  t1, [t0*3]
 290     sub  t2d, t1d
 291     sub  t2d, t1d   ; i_mf = i_qp % 6
     ; Byte offset of the scale-table entry: 16 bytes (4x4) or 64 bytes (8x8).
 292     shl  t2d, %3
 293 %ifdef PIC64
 294     lea  r1, [dequant%2_scale GLOBAL]
 295     add  r1, t2
 296 %else
 297     picgetgot r0
 298     lea  r1, [t2 + dequant%2_scale GLOBAL]
 299 %endif
 300     movifnidn r0d, r0m
 301     movd m7, t0d    ; shift count for DEQUANT16_FLAT
     ; Each DEQUANT16_FLAT call pairs one scale vector with every dct byte offset
     ; that shares its coefficient pattern.
 302 %if %2 == 4
 303 %ifidn %1, mmx
     ; 8-byte vectors: one 4x4 row each.
 304     DEQUANT16_FLAT [r1], 0, 16
 305     DEQUANT16_FLAT [r1+8], 8, 24
 306 %else
     ; 16-byte vectors: two 4x4 rows each.
 307     DEQUANT16_FLAT [r1], 0, 16
 308 %endif
 309 %elifidn %1, mmx
     ; 8x8 with 8-byte vectors: two vectors per 16-byte row.  Both halves of a
     ; DQM8 row are identical, so one 8-byte scale load serves both.
 310     DEQUANT16_FLAT [r1], 0, 8, 64, 72
 311     DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
 312     DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
 313     DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
 314 %else
     ; 8x8 with 16-byte vectors: one vector per row.
 315     DEQUANT16_FLAT [r1], 0, 64
 316     DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
 317     DEQUANT16_FLAT [r1+32], 32, 96
 318 %endif
 319     ret
 320 %endmacro ; DEQUANT
 321
 322 %ifndef ARCH_X86_64
     ; Instantiate DEQUANT as: cpu, block width, log2(coefficient count),
     ; vector size in 8-byte units.  The MMX versions are only built for 32-bit.
 323 INIT_MMX
 324 DEQUANT mmx, 4, 4, 1
 325 DEQUANT mmx, 8, 6, 1
 326 %endif
 327 INIT_XMM
 328 DEQUANT sse2, 4, 4, 2
 329 DEQUANT sse2, 8, 6, 2
 330
 331
 332
 333 ;-----------------------------------------------------------------------------
 334 ; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
 335 ;-----------------------------------------------------------------------------
     ;;; DENOISE_DCT cpu
     ;;; Defines x264_denoise_dct_core_<cpu>.  For every coefficient i < size:
     ;;;     sum[i] += |dct[i]|
     ;;;     dct[i]  = sign(dct[i]) * max( |dct[i]| - offset[i], 0 )
     ;;; dct[0] (the DC coefficient) is saved on entry and restored on exit, so
     ;;; only sum[0] is affected for it.
     ;;; r0 = dct, r1 = sum, r2 = offset, r3 = size (coefficient count),
     ;;; r4 = saved DC.  Each iteration handles regsize coefficients (two
     ;;; vectors), walking from the end of the arrays down to index 0.
     ;;; PABSW/PSIGNW must be %defined to a matching pair before instantiation.
 336 %macro DENOISE_DCT 1
 337 cglobal x264_denoise_dct_core_%1, 4,5
     %ifdef ARCH_X86_64
         ; size is a 32-bit int, but r3 is used below as a full-width counter and
         ; index.  The upper half of the argument register is not guaranteed to
         ; be defined by the caller, so widen it explicitly.
         movsxd    r3, r3d
     %endif
 338     movzx     r4d, word [r0] ; backup DC coefficient
 339     pxor      m7, m7
 340 .loop:
     ; The flags set by this sub are consumed by the jg at the bottom of the
     ; loop; none of the SIMD or mova instructions in between modify them.
 341     sub       r3, regsize
 342     mova      m2, [r0+r3*2+0*regsize]
 343     mova      m3, [r0+r3*2+1*regsize]
 344     PABSW     m0, m2
 345     PABSW     m1, m3
     ; Keep copies of |dct| for the sum[] accumulation further down.
 346     mova      m4, m0
 347     mova      m5, m1
     ; Unsigned saturating subtract clamps at zero.
 348     psubusw   m0, [r2+r3*2+0*regsize]
 349     psubusw   m1, [r2+r3*2+1*regsize]
 350     PSIGNW    m0, m2
 351     PSIGNW    m1, m3
 352     mova      [r0+r3*2+0*regsize], m0
 353     mova      [r0+r3*2+1*regsize], m1
     ; Zero-extend the |dct| words to dwords (m7 = 0) and add them to sum[].
 354     mova      m2, m4
 355     mova      m3, m5
 356     punpcklwd m4, m7
 357     punpckhwd m2, m7
 358     punpcklwd m5, m7
 359     punpckhwd m3, m7
 360     paddd     m4, [r1+r3*4+0*regsize]
 361     paddd     m2, [r1+r3*4+1*regsize]
 362     paddd     m5, [r1+r3*4+2*regsize]
 363     paddd     m3, [r1+r3*4+3*regsize]
 364     mova      [r1+r3*4+0*regsize], m4
 365     mova      [r1+r3*4+1*regsize], m2
 366     mova      [r1+r3*4+2*regsize], m5
 367     mova      [r1+r3*4+3*regsize], m3
 368     jg .loop
 369     mov       [r0], r4w ; restore DC coefficient
 370     RET
 371 %endmacro
 372
 373 %define PABSW PABSW_MMX
     ; Instantiate the denoise functions.  PABSW/PSIGNW are reset to the emulated
     ; versions for the mmx and sse2 builds, then switched to the native SSSE3
     ; instructions for the ssse3 build.  The MMX build is 32-bit only.
 374 %define PSIGNW PSIGNW_MMX
 375 %ifndef ARCH_X86_64
 376 INIT_MMX
 377 DENOISE_DCT mmx
 378 %endif
 379 INIT_XMM
 380 DENOISE_DCT sse2
 381 %define PABSW PABSW_SSSE3
 382 %define PSIGNW PSIGNW_SSSE3
 383 DENOISE_DCT ssse3