; git.sesse.net Git - x264/blob - common/amd64/quant-a.asm

;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorski@gmail.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
;*                                                                           *
;*****************************************************************************

  ; 64-bit code.  The parmN argument aliases, cglobal and GLOBAL used below are
  ; presumably supplied by the include file -- they are not defined here.
  BITS 64

  %include "amd64inc.asm"

  ; ---------------------------------------------------------------------------
  ; Read-only constants
  ; ---------------------------------------------------------------------------
  SECTION .rodata
  pd_1:  dd 1, 1                  ; the value 1 in two doublewords; the dequant
                                  ; code shifts it to form its rounding term

  SECTION .text

  ; ---------------------------------------------------------------------------
  ; Exported entry points
  ; ---------------------------------------------------------------------------

  ; quantization, 15-bit multipliers, plain MMX
  cglobal x264_quant_2x2_dc_core15_mmx
  cglobal x264_quant_4x4_dc_core15_mmx
  cglobal x264_quant_4x4_core15_mmx
  cglobal x264_quant_8x8_core15_mmx

  ; quantization, 15-bit multipliers, SSSE3.  These functions are only
  ; assembled when HAVE_SSE3 is defined, so they are only exported under the
  ; same condition.
  %ifdef HAVE_SSE3
  cglobal x264_quant_4x4_dc_core15_ssse3
  cglobal x264_quant_4x4_core15_ssse3
  cglobal x264_quant_8x8_core15_ssse3
  %endif

  ; quantization, 16-bit multipliers, MMXEXT
  cglobal x264_quant_2x2_dc_core16_mmxext
  cglobal x264_quant_4x4_dc_core16_mmxext
  cglobal x264_quant_4x4_core16_mmxext
  cglobal x264_quant_8x8_core16_mmxext

  ; quantization, 32-bit multipliers, MMXEXT
  cglobal x264_quant_2x2_dc_core32_mmxext
  cglobal x264_quant_4x4_dc_core32_mmxext
  cglobal x264_quant_4x4_core32_mmxext
  cglobal x264_quant_8x8_core32_mmxext

  ; dequantization, MMX
  cglobal x264_dequant_4x4_mmx
  cglobal x264_dequant_8x8_mmx

  %macro MMX_QUANT_AC_START 0
  ;;; Load the scalar arguments of an MMX AC quantizer.
  ;;; parm1q (&dct[0][0]) and parm2q (&quant_mf[0][0]) are left where they are.
  ;;; out:    mm6     i_qbits in the low doubleword
  ;;;         mm7     f in both doublewords
      movd        mm7, parm4d     ; f
      movd        mm6, parm3d     ; i_qbits
      punpckldq   mm7, mm7        ; replicate f into the high doubleword
  %endmacro

  %macro MMX_QUANT15_DC_START 0
  ;;; Load the scalar arguments of an MMX DC quantizer (15-bit multiplier).
  ;;; parm1q (&dct[0][0]) is left where it is.
  ;;; out:    mm5     i_qmf in all four words
  ;;;         mm6     i_qbits in the low doubleword
  ;;;         mm7     f in both doublewords
      movd        mm5, parm2d     ; i_qmf
      punpcklwd   mm5, mm5        ; low word duplicated once ...
      punpcklwd   mm5, mm5        ; ... and again: i_qmf in every word

      movd        mm7, parm4d     ; f
      punpckldq   mm7, mm7        ; f in both doublewords

      movd        mm6, parm3d     ; i_qbits
  %endmacro

  %macro SSE2_QUANT_AC_START 0
  ;;; Load the scalar arguments of an SSE2/SSSE3 AC quantizer.
  ;;; out:    xmm6    i_qbits in the low doubleword
  ;;;         xmm7    f in all four doublewords
      movd       xmm7, parm4d     ; f
      pshufd     xmm7, xmm7, 0    ; broadcast doubleword 0
      movd       xmm6, parm3d     ; i_qbits
  %endmacro

  %macro SSE2_QUANT15_DC_START 0
  ;;; Load the scalar arguments of an SSE2/SSSE3 DC quantizer (15-bit multiplier).
  ;;; out:    xmm5    i_qmf in all eight words
  ;;;         xmm6    i_qbits in the low doubleword
  ;;;         xmm7    f in all four doublewords
      movd       xmm5, parm2d     ; i_qmf
      pshuflw    xmm5, xmm5, 0    ; word 0 copied to words 0-3
      pshufd     xmm5, xmm5, 0    ; doubleword 0 copied everywhere: i_qmf in each word

      movd       xmm7, parm4d     ; f
      pshufd     xmm7, xmm7, 0    ; f in each doubleword

      movd       xmm6, parm3d     ; i_qbits
  %endmacro

  %macro MMX_QUANT15_1x4 4
  ;;; Quantize four int16_t coefficients in place:
  ;;;     coef = sign(coef) * ((abs(coef) * mf + f) >> i_qbits)
  ;;; with the shifted value saturated to int16_t before the sign is restored.
  ;;;
  ;;; %1      (m64)       dct[y][x], read and written
  ;;; %2      (m64/mmx)   four multipliers as int16_t
  ;;;                     (quant_mf[y][x], or quant_mf[0][0] in every word)
  ;;; %3      (mmx)       i_qbits in the low doubleword
  ;;; %4      (mmx)       f in both doublewords
  ;;; trashes mm0-mm2,mm4
      pxor        mm4, mm4
      movq        mm0, %1         ; the four coefficients
      pcmpgtw     mm4, mm0        ; mm4 = all ones where coef < 0
      pxor        mm0, mm4
      psubw       mm0, mm4        ; mm0 = abs(coef)

      movq        mm2, mm0
      pmulhw      mm2, %2         ; high words of abs(coef) * mf
      pmullw      mm0, %2         ; low words of abs(coef) * mf

      movq        mm1, mm0
      punpckhwd   mm1, mm2        ; 32-bit products of coefficients 2 and 3
      punpcklwd   mm0, mm2        ; 32-bit products of coefficients 0 and 1

      paddd       mm0, %4         ; + f
      psrad       mm0, %3         ; >> i_qbits
      paddd       mm1, %4
      psrad       mm1, %3

      packssdw    mm0, mm1        ; back to four int16_t
      pxor        mm0, mm4
      psubw       mm0, mm4        ; negate where the input was negative
      movq        %1, mm0
  %endmacro

 %macro SSSE3_QUANT15_1x8 4
 ;;; Quantize eight int16_t coefficients in place:
 ;;;     coef = sign(coef) * ((abs(coef) * mf + f) >> i_qbits)
 ;;; with the shifted value saturated to int16_t before the sign is applied.
 ;;;
 ;;; %1      (m128)      dct coefficients, read and written with movdqa
 ;;; %2      (m128/xmm)  eight multipliers as int16_t
 ;;; %3      (xmm)       i_qbits in the low doubleword
 ;;; %4      (xmm)       f in all four doublewords
 ;;; trashes xmm0-xmm2,xmm4
     movdqa     xmm4, %1         ; original coefficients, kept for their signs
     pabsw      xmm0, xmm4       ; xmm0 = abs(coef)

     movdqa     xmm2, xmm0
     pmulhw     xmm2, %2         ; high words of abs(coef) * mf
     pmullw     xmm0, %2         ; low words of abs(coef) * mf

     movdqa     xmm1, xmm0
     punpckhwd  xmm1, xmm2       ; 32-bit products of coefficients 4-7
     punpcklwd  xmm0, xmm2       ; 32-bit products of coefficients 0-3

     paddd      xmm0, %4         ; + f
     psrad      xmm0, %3         ; >> i_qbits
     paddd      xmm1, %4
     psrad      xmm1, %3

     packssdw   xmm0, xmm1       ; back to eight int16_t
     psignw     xmm0, xmm4       ; apply the sign of the original coefficient
     movdqa     %1, xmm0
 %endmacro

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes the four coefficients of dct in place, all with the same
 ;   multiplier i_qmf.  The whole block is one 64-bit load/store.
 ;-----------------------------------------------------------------------------
 x264_quant_2x2_dc_core15_mmx:
     MMX_QUANT15_DC_START                        ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f
     MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes all sixteen coefficients of dct in place with the same
 ;   multiplier i_qmf, four coefficients (8 bytes) per unrolled step.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_dc_core15_mmx:
     MMX_QUANT15_DC_START                        ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f

 %rep 4
     MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
 ;       int const quant_mf[4][4], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient multiplier.  Each step
 ;   packs four int multipliers (16 bytes) down to int16_t and processes four
 ;   coefficients (8 bytes).
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_core15_mmx:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 4
     movq        mm5, [parm2q]
     packssdw    mm5, [parm2q+8]                 ; four multipliers as int16_t
     add         parm2q, byte 16                 ; next four multipliers
     MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
 ;       int const quant_mf[8][8], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient multiplier: sixteen
 ;   unrolled steps of four coefficients each.
 ;-----------------------------------------------------------------------------
 x264_quant_8x8_core15_mmx:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 16
     movq        mm5, [parm2q]
     packssdw    mm5, [parm2q+8]                 ; four multipliers as int16_t
     add         parm2q, byte 16                 ; next four multipliers
     MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 %ifdef HAVE_SSE3
 ; The functions in this section use pabsw/psignw and are only assembled when
 ; HAVE_SSE3 is defined.

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes all sixteen coefficients in place with the same multiplier,
 ;   eight coefficients (16 bytes) per macro invocation.  dct is accessed with
 ;   movdqa.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_dc_core15_ssse3:
     SSE2_QUANT15_DC_START                       ; xmm5 = i_qmf, xmm6 = i_qbits, xmm7 = f
     SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
     SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
 ;       int const quant_mf[4][4], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient multiplier.  Each step
 ;   packs eight int multipliers (32 bytes) to int16_t and processes eight
 ;   coefficients (16 bytes).  dct and quant_mf are accessed with movdqa.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_core15_ssse3:
     SSE2_QUANT_AC_START                         ; xmm6 = i_qbits, xmm7 = f
 %assign row 0
 %rep 2
     movdqa      xmm5, [parm2q+32*row]
     packssdw    xmm5, [parm2q+32*row+16]        ; eight multipliers as int16_t
     SSSE3_QUANT15_1x8 [parm1q+16*row], xmm5, xmm6, xmm7
     %assign row row+1
 %endrep
     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
 ;       int const quant_mf[8][8], int const i_qbits, int const f );
 ;
 ;   Same as the 4x4 version, with eight steps of eight coefficients each.
 ;-----------------------------------------------------------------------------
 x264_quant_8x8_core15_ssse3:
     SSE2_QUANT_AC_START                         ; xmm6 = i_qbits, xmm7 = f
 %assign row 0
 %rep 8
     movdqa      xmm5, [parm2q+32*row]
     packssdw    xmm5, [parm2q+32*row+16]        ; eight multipliers as int16_t
     SSSE3_QUANT15_1x8 [parm1q+16*row], xmm5, xmm6, xmm7
     %assign row row+1
 %endrep
     ret
 %endif ; HAVE_SSE3

; ============================================================================

 %macro MMXEXT_QUANT16_DC_START 0
 ;;; Load the scalar arguments of an MMXEXT DC quantizer (16-bit multiplier).
 ;;; parm1q (&dct[0][0]) is left where it is.
 ;;; out:    mm5     i_qmf in all four words
 ;;;         mm6     i_qbits in the low doubleword
 ;;;         mm7     f in both doublewords
     movd        mm5, parm2d     ; i_qmf
     pshufw      mm5, mm5, 0     ; word 0 copied to every word

     movd        mm7, parm4d     ; f
     punpckldq   mm7, mm7        ; f in both doublewords

     movd        mm6, parm3d     ; i_qbits
 %endmacro

 %macro MMXEXT_QUANT16_1x4 4
 ;;; Quantize four int16_t coefficients in place:
 ;;;     coef = sign(coef) * ((abs(coef) * mf + f) >> i_qbits)
 ;;; Same as the 15-bit variant except that the multipliers are treated as
 ;;; unsigned 16-bit values (pmulhuw instead of pmulhw).
 ;;;
 ;;; %1      (m64)       dct[y][x], read and written
 ;;; %2      (m64/mmx)   four multipliers as uint16_t
 ;;;                     (quant_mf[y][x], or quant_mf[0][0] in every word)
 ;;; %3      (mmx)       i_qbits in the low doubleword
 ;;; %4      (mmx)       f in both doublewords
 ;;; trashes mm0-mm2,mm4
     pxor        mm4, mm4
     movq        mm0, %1         ; the four coefficients
     pcmpgtw     mm4, mm0        ; mm4 = all ones where coef < 0
     pxor        mm0, mm4
     psubw       mm0, mm4        ; mm0 = abs(coef)

     movq        mm2, mm0
     pmulhuw     mm2, %2         ; high words of the unsigned products
     pmullw      mm0, %2         ; low words of the products

     movq        mm1, mm0
     punpckhwd   mm1, mm2        ; 32-bit products of coefficients 2 and 3
     punpcklwd   mm0, mm2        ; 32-bit products of coefficients 0 and 1

     paddd       mm0, %4         ; + f
     psrad       mm0, %3         ; >> i_qbits
     paddd       mm1, %4
     psrad       mm1, %3

     packssdw    mm0, mm1        ; back to four int16_t
     pxor        mm0, mm4
     psubw       mm0, mm4        ; negate where the input was negative
     movq        %1, mm0
 %endmacro

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes the four coefficients of dct in place, all with the same
 ;   16-bit multiplier i_qmf.
 ;-----------------------------------------------------------------------------
 x264_quant_2x2_dc_core16_mmxext:
     MMXEXT_QUANT16_DC_START                     ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f
     MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes all sixteen coefficients of dct in place with the same 16-bit
 ;   multiplier i_qmf, four coefficients (8 bytes) per unrolled step.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_dc_core16_mmxext:
     MMXEXT_QUANT16_DC_START                     ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f

 %rep 4
     MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
 ;       int const quant_mf[4][4], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient 16-bit multiplier.
 ;
 ;   The three-instruction shuffle/add/shuffle gathers the low words of four
 ;   int multipliers without the signed saturation packssdw would apply; it
 ;   yields exactly those low words when the high word of every multiplier
 ;   is zero.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_core16_mmxext:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 4
     pshufw      mm5, [parm2q], 10110001b        ; words: hi0 lo0 hi1 lo1
     paddw       mm5, [parm2q+8]                 ;      + lo2 hi2 lo3 hi3
     add         parm2q, byte 16                 ; next four multipliers
     pshufw      mm5, mm5, 10001101b             ; pick words 1,3,0,2 -> mf0 mf1 mf2 mf3
     MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
 ;       int const quant_mf[8][8], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient 16-bit multiplier: sixteen
 ;   unrolled steps of four coefficients each.  The shuffle/add/shuffle
 ;   gathers the low words of four int multipliers (exact when their high
 ;   words are zero).
 ;-----------------------------------------------------------------------------
 x264_quant_8x8_core16_mmxext:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 16
     pshufw      mm5, [parm2q], 10110001b        ; words: hi0 lo0 hi1 lo1
     paddw       mm5, [parm2q+8]                 ;      + lo2 hi2 lo3 hi3
     add         parm2q, byte 16                 ; next four multipliers
     pshufw      mm5, mm5, 10001101b             ; pick words 1,3,0,2 -> mf0 mf1 mf2 mf3
     MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 %macro MMX_QUANT32_DC_START 0
 ;;; Load the scalar arguments of a DC quantizer with a 32-bit multiplier.
 ;;; parm1q (&dct[0][0]) is left where it is.
 ;;; out:    mm5     i_qmf in both doublewords
 ;;;         mm6     i_qbits in the low doubleword
 ;;;         mm7     f in both doublewords
     movd        mm5, parm2d     ; i_qmf
     punpckldq   mm5, mm5        ; i_qmf in both doublewords

     movd        mm7, parm4d     ; f
     punpckldq   mm7, mm7        ; f in both doublewords

     movd        mm6, parm3d     ; i_qbits
 %endmacro

 %macro MMXEXT_QUANT32_1x4 5
 ;;; Quantize four int16_t coefficients in place with 32-bit multipliers:
 ;;;     coef = sign(coef) * ((abs(coef) * mf + f) >> i_qbits)
 ;;; The product is formed from 16-bit partial products and kept modulo 2^32;
 ;;; the shifted value is saturated to int16_t before the sign is restored.
 ;;;
 ;;; %1      (m64)       dct[y][x], read and written
 ;;; %2      (m64/mmx)   multipliers for coefficients 0 and 1, one doubleword each
 ;;; %3      (m64/mmx)   multipliers for coefficients 2 and 3, one doubleword each
 ;;;                     (quant_mf[y][x], or quant_mf[0][0] in every doubleword)
 ;;; %4      (mmx)       i_qbits, used as the shift count
 ;;; %5      (mmx)       f in both doublewords
 ;;; trashes mm0-mm4
     pxor        mm4, mm4
     movq        mm0, %1         ; the four coefficients
     pcmpgtw     mm4, mm0        ; mm4 = all ones where coef < 0
     pxor        mm0, mm4
     psubw       mm0, mm4        ; mm0 = abs(coef)

     movq        mm1, mm0
     punpcklwd   mm0, mm0        ; coefficients 0,1: each word duplicated so it
     punpckhwd   mm1, mm1        ; coefficients 2,3: meets both halves of its mf

     ; coefficients 0 and 1
     movq        mm2, mm0
     pmulhuw     mm0, %2         ; high words of the partial products
     pmullw      mm2, %2         ; low words of the partial products
     pslld       mm0, 16         ; carry of (abs * mf.lo) moved into the upper word
     paddd       mm0, mm2        ; = abs(coef) * mf, low 32 bits
     paddd       mm0, %5         ; + f
     psrad       mm0, %4         ; >> i_qbits

     ; coefficients 2 and 3
     movq        mm3, mm1
     pmulhuw     mm1, %3
     pmullw      mm3, %3
     pslld       mm1, 16
     paddd       mm1, mm3
     paddd       mm1, %5
     psrad       mm1, %4

     packssdw    mm0, mm1        ; back to four int16_t
     pxor        mm0, mm4
     psubw       mm0, mm4        ; negate where the input was negative
     movq        %1, mm0
 %endmacro

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes the four coefficients of dct in place, all with the same
 ;   32-bit multiplier i_qmf.
 ;-----------------------------------------------------------------------------
 x264_quant_2x2_dc_core32_mmxext:
     MMX_QUANT32_DC_START                        ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f
     MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
 ;       int const i_qmf, int const i_qbits, int const f );
 ;
 ;   Quantizes all sixteen coefficients of dct in place with the same 32-bit
 ;   multiplier i_qmf, four coefficients (8 bytes) per unrolled step.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_dc_core32_mmxext:
     MMX_QUANT32_DC_START                        ; mm5 = i_qmf, mm6 = i_qbits, mm7 = f

 %rep 4
     MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
 ;       int const quant_mf[4][4], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient 32-bit multiplier.  The
 ;   multipliers are used straight from memory, two per 8-byte operand.
 ;-----------------------------------------------------------------------------
 x264_quant_4x4_core32_mmxext:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 4
     MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
     add         parm2q, byte 16                 ; next four multipliers
 %endrep

     ret

 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
 ;       int const quant_mf[8][8], int const i_qbits, int const f );
 ;
 ;   Quantizes dct in place with a per-coefficient 32-bit multiplier: sixteen
 ;   unrolled steps of four coefficients each.
 ;-----------------------------------------------------------------------------
 x264_quant_8x8_core32_mmxext:
     MMX_QUANT_AC_START                          ; mm6 = i_qbits, mm7 = f

 %rep 16
     MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
     add         parm1q, byte 8                  ; next four coefficients
     add         parm2q, byte 16                 ; next four multipliers
 %endrep

     ret

;=============================================================================
; dequant
;=============================================================================

 %macro DEQUANT16_L_1x4 3
 ;;; Dequantize four int16_t coefficients in place with a left shift:
 ;;;     coef = (coef * mf) << i_qbits        (16-bit arithmetic)
 ;;;
 ;;; %1      dct[y][x], read and written
 ;;; %2,%3   dequant_mf[i_mf][y][x], two doublewords each
 ;;; mm5     i_qbits (shift count)
 ;;; trashes mm0-mm2

     movq     mm0, %1         ; the four coefficients
     movq     mm1, %2
     movq     mm2, %3
     packssdw mm1, mm2        ; four multipliers as int16_t
     pmullw   mm0, mm1        ; low 16 bits of coef * mf
     psllw    mm0, mm5
     movq     %1,  mm0
 %endmacro

 %macro DEQUANT32_R_1x4 3
 ;;; Dequantize four int16_t coefficients in place with a rounded right shift:
 ;;;     coef = (coef * mf + f) >> shift      (32-bit arithmetic)
 ;;; with the result saturated to int16_t.
 ;;;
 ;;; %1      dct[y][x], read and written
 ;;; %2,%3   dequant_mf[i_mf][y][x], two doublewords each
 ;;; mm5     -i_qbits (the right-shift count)
 ;;; mm6     f in both doublewords
 ;;; trashes mm0-mm3
 ;;;
 ;;; NOTE(review): pmulhw treats the low word of each multiplier as signed, so
 ;;; the 32-bit product looks exact only while bit 15 of the multiplier is
 ;;; clear -- confirm against the range of the dequant_mf tables.

     movq      mm0, %1
     movq      mm1, mm0
     punpcklwd mm0, mm0       ; coefficients 0,1: each word duplicated so it
     punpckhwd mm1, mm1       ; coefficients 2,3: meets both halves of its mf

     ; coefficients 0 and 1
     movq      mm2, mm0
     pmulhw    mm0, %2        ; high words of the partial products
     pmullw    mm2, %2        ; low words of the partial products
     pslld     mm0, 16
     paddd     mm0, mm2       ; 32-bit coef * mf
     paddd     mm0, mm6       ; + f
     psrad     mm0, mm5

     ; coefficients 2 and 3
     movq      mm3, mm1
     pmulhw    mm1, %3
     pmullw    mm3, %3
     pslld     mm1, 16
     paddd     mm1, mm3
     paddd     mm1, mm6
     psrad     mm1, mm5

     packssdw  mm0, mm1       ; back to four int16_t
     movq      %1,  mm0
 %endmacro

 %macro DEQUANT_WxH 3
 ;;; Emits one dequantization function.
 ;;;
 ;;; %1      label of the function to emit
 ;;; %2      number of four-coefficient groups in the block (4 for 4x4, 16 for 8x8)
 ;;; %3      log2 of the number of coefficients in the block (4 for 4x4, 6 for 8x8)
 ;;;
 ;;; Emitted function:
 ;;;     void %1( int16_t dct[], int dequant_mf[6][], int i_qp )
 ;;; e.g. void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
 ;;;
 ;;; Arguments are taken through the parmN aliases like everywhere else in this
 ;;; file.  Temporaries live in eax, r10 and r11, which are never argument
 ;;; registers, so the code does not depend on which physical registers the
 ;;; aliases name.
 ;;; trashes eax, r10, r11, parm1q, parm2q, mm0-mm3, mm5, mm6
 ALIGN 16
 %1:
     mov    r10d, parm3d            ; i_qp
     imul   eax, r10d, 0x2b
     shr    eax, 8                  ; i_qbits = i_qp / 6  ((qp*43)>>8, exact for qp in 0..51)
     lea    r11d, [rax+rax*2]       ; 3 * i_qbits (upper half of rax is already zero)
     sub    r10d, r11d
     sub    r10d, r11d              ; i_mf = i_qp - 6 * i_qbits = i_qp % 6
     shl    r10d, %3+2              ; byte offset of table i_mf: (1<<%3) ints of 4 bytes
     movsxd r10, r10d
     add    parm2q, r10             ; parm2q = dequant_mf[i_mf]

     sub    eax, %3                 ; shift actually applied to the products
     jl     .rshift32               ; negative => rounded right shift in 32 bits

 .lshift:
     movd   mm5, eax                ; left-shift count

 %rep %2
     DEQUANT16_L_1x4 [parm1q], [parm2q], [parm2q+8]
     add    parm2q, byte 16         ; next four multipliers
     add    parm1q, byte 8          ; next four coefficients
 %endrep

     ret

 .rshift32:
     neg    eax
     movd   mm5, eax                ; right-shift count
     movq   mm6, [pd_1 GLOBAL]
     pslld  mm6, mm5
     psrld  mm6, 1                  ; f = 1 << (count - 1) in both doublewords

 %rep %2
     DEQUANT32_R_1x4 [parm1q], [parm2q], [parm2q+8]
     add    parm2q, byte 16         ; next four multipliers
     add    parm1q, byte 8          ; next four coefficients
 %endrep

     ret
 %endmacro

 DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
 DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6