git.sesse.net Git - x264/blob - common/amd64/dct-a.asm

   1 ;*****************************************************************************
   2 ;* dct.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2003 x264 project
   5 ;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
   6 ;*
   7 ;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
   8 ;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23 ;*****************************************************************************
  24
  25 ;*****************************************************************************
  26 ;*                                                                           *
  27 ;*  Revision history:                                                        *
  28 ;*                                                                           *
  29 ;*  2004.04.28  portab all 4x4 function to nasm (CM)                         *
  30 ;*                                                                           *
  31 ;*****************************************************************************
  32
  33 BITS 64
  34
  35 ;=============================================================================
  36 ; Macros and other preprocessor constants
  37 ;=============================================================================
  38
  39 %ifdef __PIC__
  40         %define GLOBAL wrt rip
  41 %else
  42         %define GLOBAL
  43 %endif
  44
  45 %macro cglobal 1
  46         %ifdef PREFIX
  47                 global _%1
  48                 %define %1 _%1
  49         %else
  50                 global %1
  51         %endif
  52 %endmacro
  53
  54 %macro MMX_ZERO 1
  55     pxor    %1, %1
  56 %endmacro
  57
  58 %macro MMX_LOAD_DIFF_4P 5
  59     movd        %1, %4
  60     punpcklbw   %1, %3
  61     movd        %2, %5
  62     punpcklbw   %2, %3
  63     psubw       %1, %2
  64 %endmacro
  65
  66 %macro MMX_SUMSUB_BA 2
  67     paddw   %1, %2
  68     paddw   %2, %2
  69     psubw   %2, %1
  70 %endmacro
  71
  72 %macro MMX_SUMSUB_BADC 4
  73     paddw   %1, %2
  74     paddw   %3, %4
  75     paddw   %2, %2
  76     paddw   %4, %4
  77     psubw   %2, %1
  78     psubw   %4, %3
  79 %endmacro
  80
  81 %macro MMX_SUMSUB2_AB 3
  82     movq    %3, %1
  83     paddw   %1, %1
  84     paddw   %1, %2
  85     psubw   %3, %2
  86     psubw   %3, %2
  87 %endmacro
  88
  89 %macro MMX_SUMSUBD2_AB 4
  90     movq    %4, %1
  91     movq    %3, %2
  92     psraw   %2, 1
  93     psraw   %4, 1
  94     paddw   %1, %2
  95     psubw   %4, %3
  96 %endmacro
  97
  98 %macro SBUTTERFLYwd 3
  99     movq        %3, %1
 100     punpcklwd   %1, %2
 101     punpckhwd   %3, %2
 102 %endmacro
 103
 104 %macro SBUTTERFLYdq 3
 105     movq        %3, %1
 106     punpckldq   %1, %2
 107     punpckhdq   %3, %2
 108 %endmacro
 109
 110 ;-----------------------------------------------------------------------------
 111 ; input ABCD output ADTC
 112 ;-----------------------------------------------------------------------------
 113 %macro MMX_TRANSPOSE 5
 114     SBUTTERFLYwd %1, %2, %5
 115     SBUTTERFLYwd %3, %4, %2
 116     SBUTTERFLYdq %1, %3, %4
 117     SBUTTERFLYdq %5, %2, %3
 118 %endmacro
 119
 120 %macro MMX_STORE_DIFF_4P 5
 121     paddw       %1, %3
 122     psraw       %1, 6
 123     movd        %2, %5
 124     punpcklbw   %2, %4
 125     paddsw      %1, %2
 126     packuswb    %1, %1
 127     movd        %5, %1
 128 %endmacro
 129
 130 ;%macro
 131 ;%endmacro
 132
 133 ;=============================================================================
 134 ; Local Data (Read Only)
 135 ;=============================================================================
 136
 137 %ifdef FORMAT_COFF
 138 SECTION .rodata
 139 %else
 140 SECTION .rodata
 141 %endif
 142
 143 ;-----------------------------------------------------------------------------
 144 ; Various memory constants (trigonometric values or rounding values)
 145 ;-----------------------------------------------------------------------------
 146
 147 ALIGN 16
 148 x264_mmx_1:        dw  1,  1,  1,  1
 149 x264_mmx_32:       dw 32, 32, 32, 32
 150 x264_mmx_PPNN:     dw  1,  1, -1, -1
 151 x264_mmx_PNPN:     dw  1, -1,  1, -1
 152 x264_mmx_PNNP:     dw  1, -1, -1,  1
 153 x264_mmx_PPPN:     dw  1,  1,  1, -1
 154 x264_mmx_PPNP:     dw  1,  1, -1,  1
 155 x264_mmx_2121:     dw  2,  1,  2,  1
 156 x264_mmx_p2n2p1p1: dw  2, -2,  1,  1
 157
 158 ;=============================================================================
 159 ; Code
 160 ;=============================================================================
 161
 162 SECTION .text
 163
 164 cglobal x264_dct4x4dc_mmxext
 165
 166 ALIGN 16
 167 ;-----------------------------------------------------------------------------
 168 ;   void __cdecl dct4x4dc( int16_t d[4][4] )
 169 ;-----------------------------------------------------------------------------
 170 x264_dct4x4dc_mmxext:
 171     movq    mm0,        [rdi+ 0]
 172     movq    mm1,        [rdi+ 8]
 173     movq    mm2,        [rdi+16]
 174     movq    mm3,        [rdi+24]
 175
 176     MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
 177     MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23
 178
 179     MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
 180
 181     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
 182     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 183
 184     MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
 185
 186     movq    mm6,        [x264_mmx_1 GLOBAL]
 187     paddw   mm0,        mm6
 188     paddw   mm4,        mm6
 189     psraw   mm0,        1
 190     movq    [rdi+ 0],   mm0
 191     psraw   mm4,        1
 192     movq    [rdi+ 8],   mm4
 193     paddw   mm1,        mm6
 194     paddw   mm3,        mm6
 195     psraw   mm1,        1
 196     movq    [rdi+16],   mm1
 197     psraw   mm3,        1
 198     movq    [rdi+24],   mm3
 199     ret
 200
 201 cglobal x264_idct4x4dc_mmxext
 202
 203 ALIGN 16
 204 ;-----------------------------------------------------------------------------
 205 ;   void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
 206 ;-----------------------------------------------------------------------------
 207 x264_idct4x4dc_mmxext:
 208     movq    mm0, [rdi+ 0]
 209     movq    mm1, [rdi+ 8]
 210     movq    mm2, [rdi+16]
 211     movq    mm3, [rdi+24]
 212
 213     MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
 214     MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
 215
 216     MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
 217
 218     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
 219     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 220
 221     MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
 222
 223     movq    [rdi+ 0],   mm0
 224     movq    [rdi+ 8],   mm4
 225     movq    [rdi+16],   mm1
 226     movq    [rdi+24],   mm3
 227     ret
 228
 229 cglobal x264_sub4x4_dct_mmxext
 230
 231 ALIGN 16
 232 ;-----------------------------------------------------------------------------
 233 ;   void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 234 ;-----------------------------------------------------------------------------
 235 x264_sub4x4_dct_mmxext:
 236     push    rbx
 237     mov     rax, rsi        ; pix1
 238     movsxd  rbx, edx        ; i_pix1
 239 ;   mov     rcx, rcx        ; pix2
 240     movsxd  rdx, r8d        ; i_pix2
 241
 242     MMX_ZERO    mm7
 243
 244     ; Load 4 lines
 245     MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [rax      ], [rcx]
 246     MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [rax+rbx  ], [rcx+rdx]
 247     MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
 248     add     rax, rbx
 249     add     rcx, rdx
 250     MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
 251
 252     MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
 253
 254     MMX_SUMSUB_BA       mm2, mm3                    ; mm2=s03+s12      mm3=s03-s12
 255     MMX_SUMSUB2_AB      mm0, mm1, mm4               ; mm0=2.d03+d12    mm4=d03-2.d12
 256
 257     ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
 258     MMX_TRANSPOSE       mm2, mm0, mm3, mm4, mm1
 259
 260     MMX_SUMSUB_BADC     mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
 261
 262     MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
 263     MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
 264
 265     ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
 266     MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
 267
 268     movq    [rdi+ 0],   mm1 ; dct
 269     movq    [rdi+ 8],   mm0
 270     movq    [rdi+16],   mm4
 271     movq    [rdi+24],   mm3
 272
 273     pop     rbx
 274     ret
 275
 276 cglobal x264_add4x4_idct_mmxext
 277
 278 ALIGN 16
 279 ;-----------------------------------------------------------------------------
 280 ;   void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
 281 ;-----------------------------------------------------------------------------
 282 x264_add4x4_idct_mmxext:
 283     ; Load dct coeffs
 284     movq    mm0, [rdx+ 0]   ; dct
 285     movq    mm4, [rdx+ 8]
 286     movq    mm3, [rdx+16]
 287     movq    mm1, [rdx+24]
 288
 289     mov     rax, rdi        ; p_dst
 290     movsxd  rcx, esi        ; i_dst
 291     lea     rdx, [rcx+rcx*2]
 292
 293     ; out:mm0, mm1, mm2, mm3
 294     MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
 295
 296     MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
 297     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
 298
 299     MMX_SUMSUB_BADC     mm1, mm2, mm4, mm0              ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13
 300
 301     ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
 302     MMX_TRANSPOSE       mm1, mm4, mm0, mm2, mm3
 303
 304     MMX_SUMSUB_BA       mm3, mm1                        ; mm3=s02  mm1=d02
 305     MMX_SUMSUBD2_AB     mm2, mm0, mm5, mm4              ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
 306
 307     MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
 308
 309     MMX_ZERO            mm7
 310     movq                mm6, [x264_mmx_32 GLOBAL]
 311
 312     MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [rax]
 313     MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [rax+rcx]
 314     MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [rax+rcx*2]
 315     MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [rax+rdx]
 316
 317     ret
 318
 319
 320
 321 ; =============================================================================
 322 ; 8x8 Transform
 323 ; =============================================================================
 324
 325 ; -----------------------------------------------------------------------------
 326 ; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
 327 ; -----------------------------------------------------------------------------
 328 %macro MMX_LOAD_DIFF_8P 7
 329     movq            %1, %5
 330     movq            %2, %1
 331     punpcklbw       %1, %7
 332     punpckhbw       %2, %7
 333     movq            %3, %6
 334     movq            %4, %3
 335     punpcklbw       %3, %7
 336     punpckhbw       %4, %7
 337     psubw           %1, %3
 338     psubw           %2, %4
 339 %endmacro
 340
 341 %macro MMX_LOADSUMSUB 4     ; returns %1=%3+%4, %2=%3-%4
 342     movq            %2, %3
 343     movq            %1, %4
 344     MMX_SUMSUB_BA   %1, %2
 345 %endmacro
 346
 347 %macro MMX_STORE_DIFF_8P 6
 348     movq            %1, %3
 349     movq            %2, %1
 350     punpcklbw       %1, %6
 351     punpckhbw       %2, %6
 352     paddw           %1, %4
 353     paddw           %2, %5
 354     packuswb        %1, %2
 355     movq            %3, %1
 356 %endmacro
 357
 358 cglobal x264_pixel_sub_8x8_mmx
 359 cglobal x264_xdct8_mmxext
 360 cglobal x264_ydct8_mmx
 361 cglobal x264_ydct8_sse2
 362
 363 cglobal x264_xidct8_mmxext
 364 cglobal x264_yidct8_mmx
 365 cglobal x264_yidct8_sse2
 366 cglobal x264_pixel_add_8x8_mmx
 367
 368 ALIGN 16
 369 ;-----------------------------------------------------------------------------
 370 ;   void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
 371 ;-----------------------------------------------------------------------------
 372 x264_pixel_sub_8x8_mmx:
 373 ;   mov     rdi, rdi        ; diff
 374 ;   mov     rsi, rsi        ; pix1
 375     movsxd  rdx, edx        ; i_pix1
 376 ;   mov     rcx, rcx        ; pix2
 377     movsxd  r8,  r8d        ; i_pix2
 378
 379     MMX_ZERO    mm7
 380
 381     %assign disp 0
 382     %rep  8
 383     MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [rsi], [rcx], mm7
 384     movq        [rdi+disp], mm0
 385     movq        [rdi+disp+8], mm1
 386     add         rsi, rdx
 387     add         rcx, r8
 388     %assign disp disp+16
 389     %endrep
 390
 391     ret
 392
 393 ALIGN 16
 394 ;-----------------------------------------------------------------------------
 395 ;   void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
 396 ;-----------------------------------------------------------------------------
 397 x264_xdct8_mmxext:
 398
 399     movq        mm5, [x264_mmx_PPNN GLOBAL]
 400     movq        mm6, [x264_mmx_PNNP GLOBAL]
 401     movq        mm4, [x264_mmx_PPPN GLOBAL]
 402     movq        mm7, [x264_mmx_PPNP GLOBAL]
 403
 404     ;-------------------------------------------------------------------------
 405     ; horizontal dct ( compute 1 row at a time -> 8 loops )
 406     ;-------------------------------------------------------------------------
 407
 408     %assign disp 0
 409     %rep 8
 410
 411     movq        mm0, [rdi+disp]
 412     movq        mm1, [rdi+disp+8]
 413
 414     pshufw      mm2, mm1, 00011011b
 415     movq        mm1, mm0
 416     paddw       mm0, mm2                ; (low)s07/s16/d25/s34(high)
 417     psubw       mm1, mm2                ; (low)d07/d16/d25/d34(high)
 418
 419     pshufw      mm2, mm0, 00011011b     ; (low)s34/s25/s16/s07(high)
 420     pmullw      mm0, mm5                ; (low)s07/s16/-s25/-s34(high)
 421     paddw       mm0, mm2                ; (low)a0/a1/a3/a2(high)
 422
 423     movq        mm3, mm1
 424     psraw       mm1, 1                  ; (low)d07/d16/d25/d34(high) (x>>1)
 425     pshufw      mm2, mm3, 10110001b     ; (low)d16/d07/d34/d25(high)
 426     paddw       mm1, mm3                ; (low)d07/d16/d25/d34(high) (x+(x>>1))
 427     pshufw      mm3, mm2, 00011011b     ; (low)d25/d34/d07/d16(high)
 428     pmullw      mm2, mm5                ; (low)d16/d07/-d34/-d25(high)
 429     pmullw      mm1, mm6                ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
 430     paddw       mm3, mm2
 431     paddw       mm1, mm3                ; (low)a4/a6/a5/a7(high)
 432
 433
 434     pshufw      mm2, mm0, 11001001b     ; (low)a1/a3/a0/a2(high)
 435     pshufw      mm0, mm0, 10011100b     ; (low)a0/a2/a1/a3(high)
 436     pmullw      mm2, [x264_mmx_2121 GLOBAL]
 437     pmullw      mm0, mm5                ; (low)a0/a2/-a1/-a3(high)
 438     psraw       mm2, 1                  ; (low)a1/a3>>1/a0/a2>>1(high)
 439     paddw       mm0, mm2                ; (low)dst0/dst2/dst4/dst6(high)
 440
 441     pshufw      mm1, mm1, 00100111b     ; (low)a7/a6/a5/a4(high)
 442     pshufw      mm2, mm1, 00011011b     ; (low)a4/a5/a6/a7(high)
 443     psraw       mm1, 2                  ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
 444     pmullw      mm2, mm4                ; (low)a4/a5/a6/-a7(high)
 445     pmullw      mm1, mm7                ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
 446     paddw       mm1, mm2                ; (low)dst1/dst3/dst5/dst7(high)
 447
 448     movq        mm2, mm0
 449     punpcklwd   mm0, mm1                ; (low)dst0/dst1/dst2/dst3(high)
 450     punpckhwd   mm2, mm1                ; (low)dst4/dst5/dst6/dst7(high)
 451
 452     movq        [rdi+disp], mm0
 453     movq        [rdi+disp+8], mm2
 454
 455     %assign disp disp+16
 456     %endrep
 457
 458     ret
 459
 460 ALIGN 16
 461 ;-----------------------------------------------------------------------------
 462 ;   void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
 463 ;-----------------------------------------------------------------------------
 464 x264_ydct8_mmx:
 465
 466     ;-------------------------------------------------------------------------
 467     ; vertical dct ( compute 4 columns at a time -> 2 loops )
 468     ;-------------------------------------------------------------------------
 469
 470     %assign disp 0
 471     %rep 2
 472
 473     MMX_LOADSUMSUB  mm2, mm3, [rdi+disp+0*16], [rdi+disp+7*16] ; mm2 = s07, mm3 = d07
 474     MMX_LOADSUMSUB  mm1, mm5, [rdi+disp+1*16], [rdi+disp+6*16] ; mm1 = s16, mm5 = d16
 475     MMX_LOADSUMSUB  mm0, mm6, [rdi+disp+2*16], [rdi+disp+5*16] ; mm0 = s25, mm6 = d25
 476     MMX_LOADSUMSUB  mm4, mm7, [rdi+disp+3*16], [rdi+disp+4*16] ; mm4 = s34, mm7 = d34
 477
 478     MMX_SUMSUB_BA   mm4, mm2        ; mm4 = a0, mm2 = a2
 479     MMX_SUMSUB_BA   mm0, mm1        ; mm0 = a1, mm1 = a3
 480     MMX_SUMSUB_BA   mm0, mm4        ; mm0 = dst0, mm1 = dst4
 481
 482     movq    [rdi+disp+0*16], mm0
 483     movq    [rdi+disp+4*16], mm4
 484
 485     movq    mm0, mm1         ; a3
 486     psraw   mm0, 1           ; a3>>1
 487     paddw   mm0, mm2         ; a2 + (a3>>1)
 488     psraw   mm2, 1           ; a2>>1
 489     psubw   mm2, mm1         ; (a2>>1) - a3
 490
 491     movq    [rdi+disp+2*16], mm0
 492     movq    [rdi+disp+6*16], mm2
 493
 494     movq    mm0, mm6
 495     psraw   mm0, 1
 496     paddw   mm0, mm6         ; d25+(d25>>1)
 497     movq    mm1, mm3
 498     psubw   mm1, mm7         ; a5 = d07-d34-(d25+(d25>>1))
 499     psubw   mm1, mm0
 500
 501     movq    mm0, mm5
 502     psraw   mm0, 1
 503     paddw   mm0, mm5         ; d16+(d16>>1)
 504     movq    mm2, mm3
 505     paddw   mm2, mm7         ; a6 = d07+d34-(d16+(d16>>1))
 506     psubw   mm2, mm0
 507
 508     movq    mm0, mm3
 509     psraw   mm0, 1
 510     paddw   mm0, mm3         ; d07+(d07>>1)
 511     paddw   mm0, mm5
 512     paddw   mm0, mm6         ; a4 = d16+d25+(d07+(d07>>1))
 513
 514     movq    mm3, mm7
 515     psraw   mm3, 1
 516     paddw   mm3, mm7         ; d34+(d34>>1)
 517     paddw   mm3, mm5
 518     psubw   mm3, mm6         ; a7 = d16-d25+(d34+(d34>>1))
 519
 520     movq    mm7, mm3
 521     psraw   mm7, 2
 522     paddw   mm7, mm0         ; a4 + (a7>>2)
 523
 524     movq    mm6, mm2
 525     psraw   mm6, 2
 526     paddw   mm6, mm1         ; a5 + (a6>>2)
 527
 528     psraw   mm0, 2
 529     psraw   mm1, 2
 530     psubw   mm0, mm3         ; (a4>>2) - a7
 531     psubw   mm2, mm1         ; a6 - (a5>>2)
 532
 533     movq    [rdi+disp+1*16], mm7
 534     movq    [rdi+disp+3*16], mm6
 535     movq    [rdi+disp+5*16], mm2
 536     movq    [rdi+disp+7*16], mm0
 537
 538     %assign disp disp+8
 539     %endrep
 540
 541     ret
 542
 543 ALIGN 16
 544 ;-----------------------------------------------------------------------------
 545 ;   void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
 546 ;-----------------------------------------------------------------------------
 547 x264_xidct8_mmxext:
 548
 549     movq        mm4, [x264_mmx_PPNN GLOBAL]
 550     movq        mm5, [x264_mmx_PNPN GLOBAL]
 551     movq        mm6, [x264_mmx_PPNP GLOBAL]
 552     movq        mm7, [x264_mmx_PPPN GLOBAL]
 553
 554     ;-------------------------------------------------------------------------
 555     ; horizontal idct ( compute 1 row at a time -> 8 loops )
 556     ;-------------------------------------------------------------------------
 557
 558     %assign disp 0
 559     %rep 8
 560
 561     pshufw      mm0, [rdi+disp], 11011000b      ; (low)d0,d2,d1,d3(high)
 562     pshufw      mm2, [rdi+disp+8], 11011000b    ; (low)d4,d6,d5,d7(high)
 563     movq        mm1, mm0
 564     punpcklwd   mm0, mm2                ; (low)d0,d4,d2,d6(high)
 565     punpckhwd   mm1, mm2                ; (low)d1,d5,d3,d7(high)
 566
 567     pshufw      mm2, mm0, 10110001b     ; (low)d4,d0,d6,d2(high)
 568     pmullw      mm0, [x264_mmx_p2n2p1p1 GLOBAL]
 569                                         ; (low)2*d0,-2*d4,d2,d6(high)
 570     pmullw      mm2, mm6                ; (low)d4,d0,-d6,d2(high)
 571     psraw       mm0, 1                  ; (low)d0,-d4,d2>>1,d6>>1(high)
 572     paddw       mm0, mm2                ; (low)e0,e2,e4,e6(high)
 573
 574     movq        mm3, mm1                ; (low)d1,d5,d3,d7(high)
 575     psraw       mm1, 1                  ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
 576     pshufw      mm2, mm3, 10110001b     ; (low)d5,d1,d7,d3(high)
 577     paddw       mm1, mm3                ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
 578     pshufw      mm3, mm2, 00011011b     ; (low)d3,d7,d1,d5(high)
 579     pmullw      mm1, mm4                ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
 580     pmullw      mm2, mm5                ; (low)d5,-d1,d7,-d3(high)
 581     paddw       mm1, mm3
 582     paddw       mm1, mm2                ; (low)e7,e5,e3,e1(high)
 583
 584     pshufw      mm2, mm0, 00011011b     ; (low)e6,e4,e2,e0(high)
 585     pmullw      mm0, mm4                ; (low)e0,e2,-e4,-e6(high)
 586     pshufw      mm3, mm1, 00011011b     ; (low)e1,e3,e5,e7(high)
 587     psraw       mm1, 2                  ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
 588     pmullw      mm3, mm6                ; (low)e1,e3,-e5,e7(high)
 589     pmullw      mm1, mm7                ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
 590     paddw       mm0, mm2                ; (low)f0,f2,f4,f6(high)
 591     paddw       mm1, mm3                ; (low)f1,f3,f5,f7(high)
 592
 593     pshufw      mm3, mm0, 00011011b     ; (low)f6,f4,f2,f0(high)
 594     pshufw      mm2, mm1, 00011011b     ; (low)f7,f5,f3,f1(high)
 595     psubw       mm3, mm1
 596     paddw       mm0, mm2
 597
 598     movq        [rdi+disp], mm0
 599     movq        [rdi+disp+8], mm3
 600
 601     %assign disp disp+16
 602     %endrep
 603
 604     ret
 605
 606 ALIGN 16
 607 ;-----------------------------------------------------------------------------
 608 ;   void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
 609 ;-----------------------------------------------------------------------------
 610 x264_yidct8_mmx:
 611
 612     ;-------------------------------------------------------------------------
 613     ; vertical idct ( compute 4 columns at a time -> 2 loops )
 614     ;-------------------------------------------------------------------------
 615
 616     %assign disp 0
 617     %rep 2
 618
 619     movq        mm1, [rdi+disp+1*16]    ; mm1 = d1
 620     movq        mm3, [rdi+disp+3*16]    ; mm3 = d3
 621     movq        mm5, [rdi+disp+5*16]    ; mm5 = d5
 622     movq        mm7, [rdi+disp+7*16]    ; mm7 = d7
 623
 624     movq        mm4, mm7
 625     psraw       mm4, 1
 626     movq        mm0, mm5
 627     psubw       mm0, mm7
 628     psubw       mm0, mm4
 629     psubw       mm0, mm3                ; mm0 = e1
 630
 631     movq        mm6, mm3
 632     psraw       mm6, 1
 633     movq        mm2, mm7
 634     psubw       mm2, mm6
 635     psubw       mm2, mm3
 636     paddw       mm2, mm1                ; mm2 = e3
 637
 638     movq        mm4, mm5
 639     psraw       mm4, 1
 640     paddw       mm4, mm5
 641     paddw       mm4, mm7
 642     psubw       mm4, mm1                ; mm4 = e5
 643
 644     movq        mm6, mm1
 645     psraw       mm6, 1
 646     paddw       mm6, mm1
 647     paddw       mm6, mm5
 648     paddw       mm6, mm3                ; mm6 = e7
 649
 650     movq        mm1, mm0
 651     movq        mm3, mm4
 652     movq        mm5, mm2
 653     movq        mm7, mm6
 654     psraw       mm6, 2
 655     psraw       mm3, 2
 656     psraw       mm5, 2
 657     psraw       mm0, 2
 658     paddw       mm1, mm6                ; mm1 = f1
 659     paddw       mm3, mm2                ; mm3 = f3
 660     psubw       mm5, mm4                ; mm5 = f5
 661     psubw       mm7, mm0                ; mm7 = f7
 662
 663     movq        mm2, [rdi+disp+2*16]    ; mm2 = d2
 664     movq        mm6, [rdi+disp+6*16]    ; mm6 = d6
 665     movq        mm4, mm2
 666     movq        mm0, mm6
 667     psraw       mm4, 1
 668     psraw       mm6, 1
 669     psubw       mm4, mm0                ; mm4 = a4
 670     paddw       mm6, mm2                ; mm6 = a6
 671
 672     movq        mm2, [rdi+disp+0*16]    ; mm2 = d0
 673     movq        mm0, [rdi+disp+4*16]    ; mm0 = d4
 674     MMX_SUMSUB_BA   mm0, mm2                ; mm0 = a0, mm2 = a2
 675
 676     MMX_SUMSUB_BA   mm6, mm0                ; mm6 = f0, mm0 = f6
 677     MMX_SUMSUB_BA   mm4, mm2                ; mm4 = f2, mm2 = f4
 678
 679     MMX_SUMSUB_BA   mm7, mm6                ; mm7 = g0, mm6 = g7
 680     MMX_SUMSUB_BA   mm5, mm4                ; mm5 = g1, mm4 = g6
 681     MMX_SUMSUB_BA   mm3, mm2                ; mm3 = g2, mm2 = g5
 682     MMX_SUMSUB_BA   mm1, mm0                ; mm1 = g3, mm0 = g4
 683
 684     psraw       mm7, 6
 685     psraw       mm6, 6
 686     psraw       mm5, 6
 687     psraw       mm4, 6
 688     psraw       mm3, 6
 689     psraw       mm2, 6
 690     psraw       mm1, 6
 691     psraw       mm0, 6
 692
 693     movq        [rdi+disp+0*16], mm7
 694     movq        [rdi+disp+1*16], mm5
 695     movq        [rdi+disp+2*16], mm3
 696     movq        [rdi+disp+3*16], mm1
 697     movq        [rdi+disp+4*16], mm0
 698     movq        [rdi+disp+5*16], mm2
 699     movq        [rdi+disp+6*16], mm4
 700     movq        [rdi+disp+7*16], mm6
 701
 702     %assign disp disp+8
 703     %endrep
 704
 705     ret
 706
 707 ALIGN 16
 708 ;-----------------------------------------------------------------------------
 709 ;   void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
 710 ;-----------------------------------------------------------------------------
 711 x264_pixel_add_8x8_mmx:
 712 ;   mov     rdi, rdi        ; dst
 713     movsxd  rsi, esi        ; i_dst
 714 ;   mov     rdx, rdx        ; src
 715
 716     MMX_ZERO    mm7
 717
 718     %assign disp 0
 719     %rep 8
 720     MMX_STORE_DIFF_8P   mm0, mm1, [rdi], [rdx+disp], [rdx+disp+8], mm7
 721     add         rdi, rsi
 722     %assign disp disp+16
 723     %endrep
 724     ret
 725