; Source: x264 repository, common/amd64/dct-a.asm (git.sesse.net gitweb blob view)

;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************
  24
;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2004.04.28  ported all 4x4 functions to nasm (CM)                        *
;*                                                                           *
;*****************************************************************************
  32
  33 BITS 64
  34
  35 ;=============================================================================
  36 ; Macros and other preprocessor constants
  37 ;=============================================================================
  38
  39 %ifdef __PIC__
  40         %define GLOBAL wrt rip
  41 %else
  42         %define GLOBAL
  43 %endif
  44
  45 %macro cglobal 1
  46         %ifdef PREFIX
  47                 global _%1
  48                 %define %1 _%1
  49         %else
  50                 global %1
  51         %endif
  52 %endmacro
  53
  54 %macro MMX_ZERO 1
  55     pxor    %1, %1
  56 %endmacro
  57
  58 %macro MMX_LOAD_DIFF_4P 5
  59     movd        %1, %4
  60     punpcklbw   %1, %3
  61     movd        %2, %5
  62     punpcklbw   %2, %3
  63     psubw       %1, %2
  64 %endmacro
  65
  66 %macro MMX_SUMSUB_BA 2
  67     paddw   %1, %2
  68     paddw   %2, %2
  69     psubw   %2, %1
  70 %endmacro
  71
  72 %macro MMX_SUMSUB_BADC 4
  73     paddw   %1, %2
  74     paddw   %3, %4
  75     paddw   %2, %2
  76     paddw   %4, %4
  77     psubw   %2, %1
  78     psubw   %4, %3
  79 %endmacro
  80
  81 %macro MMX_SUMSUB2_AB 3
  82     movq    %3, %1
  83     paddw   %1, %1
  84     paddw   %1, %2
  85     psubw   %3, %2
  86     psubw   %3, %2
  87 %endmacro
  88
  89 %macro MMX_SUMSUBD2_AB 4
  90     movq    %4, %1
  91     movq    %3, %2
  92     psraw   %2, 1
  93     psraw   %4, 1
  94     paddw   %1, %2
  95     psubw   %4, %3
  96 %endmacro
  97
  98 %macro SBUTTERFLYwd 3
  99     movq        %3, %1
 100     punpcklwd   %1, %2
 101     punpckhwd   %3, %2
 102 %endmacro
 103
 104 %macro SBUTTERFLYdq 3
 105     movq        %3, %1
 106     punpckldq   %1, %2
 107     punpckhdq   %3, %2
 108 %endmacro
 109
 110 ;-----------------------------------------------------------------------------
 111 ; input ABCD output ADTC
 112 ;-----------------------------------------------------------------------------
 113 %macro MMX_TRANSPOSE 5
 114     SBUTTERFLYwd %1, %2, %5
 115     SBUTTERFLYwd %3, %4, %2
 116     SBUTTERFLYdq %1, %3, %4
 117     SBUTTERFLYdq %5, %2, %3
 118 %endmacro
 119
 120 %macro MMX_STORE_DIFF_4P 5
 121     paddw       %1, %3
 122     psraw       %1, 6
 123     movd        %2, %5
 124     punpcklbw   %2, %4
 125     paddsw      %1, %2
 126     packuswb    %1, %1
 127     movd        %5, %1
 128 %endmacro
 129
 130 ;%macro
 131 ;%endmacro
 132
 133 ;=============================================================================
 134 ; Local Data (Read Only)
 135 ;=============================================================================
 136
 137 %ifdef FORMAT_COFF
 138 SECTION .rodata
 139 %else
 140 SECTION .rodata
 141 %endif
 142
 143 ;-----------------------------------------------------------------------------
 144 ; Various memory constants (trigonometric values or rounding values)
 145 ;-----------------------------------------------------------------------------
 146
 147 ALIGN 16
 148 x264_mmx_1:        dw  1,  1,  1,  1
 149 x264_mmx_32:       dw 32, 32, 32, 32
 150 x264_mmx_PPNN:     dw  1,  1, -1, -1
 151 x264_mmx_PNPN:     dw  1, -1,  1, -1
 152 x264_mmx_PNNP:     dw  1, -1, -1,  1
 153 x264_mmx_PPPN:     dw  1,  1,  1, -1
 154 x264_mmx_PPNP:     dw  1,  1, -1,  1
 155 x264_mmx_2121:     dw  2,  1,  2,  1
 156 x264_mmx_p2n2p1p1: dw  2, -2,  1,  1
 157
 158 ;=============================================================================
 159 ; Code
 160 ;=============================================================================
 161
 162 SECTION .text
 163
 164 cglobal x264_dct4x4dc_mmxext
 165
 166 ALIGN 16
 167 ;-----------------------------------------------------------------------------
 168 ;   void __cdecl dct4x4dc( int16_t d[4][4] )
 169 ;-----------------------------------------------------------------------------
 170 x264_dct4x4dc_mmxext:
 171     movq    mm0,        [rdi+ 0]
 172     movq    mm1,        [rdi+ 8]
 173     movq    mm2,        [rdi+16]
 174     movq    mm3,        [rdi+24]
 175
 176     MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
 177     MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23
 178
 179     MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
 180
 181     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
 182     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 183
 184     MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
 185
 186     movq    mm6,        [x264_mmx_1 GLOBAL]
 187     paddw   mm0,        mm6
 188     paddw   mm4,        mm6
 189     psraw   mm0,        1
 190     movq    [rdi+ 0],   mm0
 191     psraw   mm4,        1
 192     movq    [rdi+ 8],   mm4
 193     paddw   mm1,        mm6
 194     paddw   mm3,        mm6
 195     psraw   mm1,        1
 196     movq    [rdi+16],   mm1
 197     psraw   mm3,        1
 198     movq    [rdi+24],   mm3
 199     ret
 200
 201 cglobal x264_idct4x4dc_mmxext
 202
 203 ALIGN 16
 204 ;-----------------------------------------------------------------------------
 205 ;   void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
 206 ;-----------------------------------------------------------------------------
 207 x264_idct4x4dc_mmxext:
 208     movq    mm0, [rdi+ 0]
 209     movq    mm1, [rdi+ 8]
 210     movq    mm2, [rdi+16]
 211     movq    mm3, [rdi+24]
 212
 213     MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
 214     MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
 215
 216     MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0
 217
 218     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
 219     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 220
 221     MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
 222
 223     movq    [rdi+ 0],   mm0
 224     movq    [rdi+ 8],   mm4
 225     movq    [rdi+16],   mm1
 226     movq    [rdi+24],   mm3
 227     ret
 228
 229 cglobal x264_sub4x4_dct_mmxext
 230
 231 ALIGN 16
 232 ;-----------------------------------------------------------------------------
 233 ;   void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 234 ;-----------------------------------------------------------------------------
 235 x264_sub4x4_dct_mmxext:
 236     push    rbx
 237     mov     rax, rsi        ; pix1
 238     movsxd  rbx, edx        ; i_pix1
 239 ;   mov     rcx, rcx        ; pix2
 240     movsxd  rdx, r8d        ; i_pix2
 241
 242     MMX_ZERO    mm7
 243
 244     ; Load 4 lines
 245     MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [rax      ], [rcx]
 246     MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [rax+rbx  ], [rcx+rdx]
 247     MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
 248     add     rax, rbx
 249     add     rcx, rdx
 250     MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
 251
 252     MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
 253
 254     MMX_SUMSUB_BA       mm2, mm3                    ; mm2=s03+s12      mm3=s03-s12
 255     MMX_SUMSUB2_AB      mm0, mm1, mm4               ; mm0=2.d03+d12    mm4=d03-2.d12
 256
 257     ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
 258     MMX_TRANSPOSE       mm2, mm0, mm3, mm4, mm1
 259
 260     MMX_SUMSUB_BADC     mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
 261
 262     MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
 263     MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
 264
 265     ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
 266     MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
 267
 268     movq    [rdi+ 0],   mm1 ; dct
 269     movq    [rdi+ 8],   mm0
 270     movq    [rdi+16],   mm4
 271     movq    [rdi+24],   mm3
 272
 273     pop     rbx
 274     ret
 275
 276 cglobal x264_add4x4_idct_mmxext
 277
 278 ALIGN 16
 279 ;-----------------------------------------------------------------------------
 280 ;   void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
 281 ;-----------------------------------------------------------------------------
 282 x264_add4x4_idct_mmxext:
 283     ; Load dct coeffs
 284     movq    mm0, [rdx+ 0]   ; dct
 285     movq    mm4, [rdx+ 8]
 286     movq    mm3, [rdx+16]
 287     movq    mm1, [rdx+24]
 288
 289     mov     rax, rdi        ; p_dst
 290     movsxd  rcx, esi        ; i_dst
 291     lea     rdx, [rcx+rcx*2]
 292
 293     ; out:mm0, mm1, mm2, mm3
 294     MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
 295
 296     MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
 297     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
 298
 299     MMX_SUMSUB_BADC     mm1, mm2, mm4, mm0              ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13
 300
 301     ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
 302     MMX_TRANSPOSE       mm1, mm4, mm0, mm2, mm3
 303
 304     MMX_SUMSUB_BA       mm3, mm1                        ; mm3=s02  mm1=d02
 305     MMX_SUMSUBD2_AB     mm2, mm0, mm5, mm4              ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
 306
 307     MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
 308
 309     MMX_ZERO            mm7
 310     movq                mm6, [x264_mmx_32 GLOBAL]
 311
 312     MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [rax]
 313     MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [rax+rcx]
 314     MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [rax+rcx*2]
 315     MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [rax+rdx]
 316
 317     ret
 318
 319
 320
 321 ; =============================================================================
 322 ; 8x8 Transform
 323 ; =============================================================================
 324
 325 ; -----------------------------------------------------------------------------
 326 ; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
 327 ; -----------------------------------------------------------------------------
 328 %macro MMX_LOAD_DIFF_8P 7
 329     movq            %1, %5
 330     movq            %2, %1
 331     punpcklbw       %1, %7
 332     punpckhbw       %2, %7
 333     movq            %3, %6
 334     movq            %4, %3
 335     punpcklbw       %3, %7
 336     punpckhbw       %4, %7
 337     psubw           %1, %3
 338     psubw           %2, %4
 339 %endmacro
 340
 341 %macro MMX_LOADSUMSUB 4     ; returns %1=%3+%4, %2=%3-%4
 342     movq            %2, %3
 343     movq            %1, %4
 344     MMX_SUMSUB_BA   %1, %2
 345 %endmacro
 346
 347 %macro MMX_STORE_DIFF_8P 6
 348     movq            %1, %3
 349     movq            %2, %1
 350     punpcklbw       %1, %6
 351     punpckhbw       %2, %6
 352     paddw           %1, %4
 353     paddw           %2, %5
 354     packuswb        %1, %2
 355     movq            %3, %1
 356 %endmacro
 357
 358 cglobal x264_pixel_sub_8x8_mmx
 359 cglobal x264_xdct8_mmxext
 360 cglobal x264_ydct8_mmx
 361
 362 cglobal x264_xidct8_mmxext
 363 cglobal x264_yidct8_mmx
 364 cglobal x264_pixel_add_8x8_mmx
 365
 366 ALIGN 16
 367 ;-----------------------------------------------------------------------------
 368 ;   void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
 369 ;-----------------------------------------------------------------------------
 370 x264_pixel_sub_8x8_mmx:
 371 ;   mov     rdi, rdi        ; diff
 372 ;   mov     rsi, rsi        ; pix1
 373     movsxd  rdx, edx        ; i_pix1
 374 ;   mov     rcx, rcx        ; pix2
 375     movsxd  r8,  r8d        ; i_pix2
 376
 377     MMX_ZERO    mm7
 378
 379     %assign disp 0
 380     %rep  8
 381     MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [rsi], [rcx], mm7
 382     movq        [rdi+disp], mm0
 383     movq        [rdi+disp+8], mm1
 384     add         rsi, rdx
 385     add         rcx, r8
 386     %assign disp disp+16
 387     %endrep
 388
 389     ret
 390
 391 ALIGN 16
 392 ;-----------------------------------------------------------------------------
 393 ;   void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
 394 ;-----------------------------------------------------------------------------
 395 x264_xdct8_mmxext:
 396
 397     movq        mm5, [x264_mmx_PPNN GLOBAL]
 398     movq        mm6, [x264_mmx_PNNP GLOBAL]
 399     movq        mm4, [x264_mmx_PPPN GLOBAL]
 400     movq        mm7, [x264_mmx_PPNP GLOBAL]
 401
 402     ;-------------------------------------------------------------------------
 403     ; horizontal dct ( compute 1 row at a time -> 8 loops )
 404     ;-------------------------------------------------------------------------
 405
 406     %assign disp 0
 407     %rep 8
 408
 409     movq        mm0, [rdi+disp]
 410     movq        mm1, [rdi+disp+8]
 411
 412     pshufw      mm2, mm1, 00011011b
 413     movq        mm1, mm0
 414     paddw       mm0, mm2                ; (low)s07/s16/d25/s34(high)
 415     psubw       mm1, mm2                ; (low)d07/d16/d25/d34(high)
 416
 417     pshufw      mm2, mm0, 00011011b     ; (low)s34/s25/s16/s07(high)
 418     pmullw      mm0, mm5                ; (low)s07/s16/-s25/-s34(high)
 419     paddw       mm0, mm2                ; (low)a0/a1/a3/a2(high)
 420
 421     movq        mm3, mm1
 422     psraw       mm1, 1                  ; (low)d07/d16/d25/d34(high) (x>>1)
 423     pshufw      mm2, mm3, 10110001b     ; (low)d16/d07/d34/d25(high)
 424     paddw       mm1, mm3                ; (low)d07/d16/d25/d34(high) (x+(x>>1))
 425     pshufw      mm3, mm2, 00011011b     ; (low)d25/d34/d07/d16(high)
 426     pmullw      mm2, mm5                ; (low)d16/d07/-d34/-d25(high)
 427     pmullw      mm1, mm6                ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
 428     paddw       mm3, mm2
 429     paddw       mm1, mm3                ; (low)a4/a6/a5/a7(high)
 430
 431
 432     pshufw      mm2, mm0, 11001001b     ; (low)a1/a3/a0/a2(high)
 433     pshufw      mm0, mm0, 10011100b     ; (low)a0/a2/a1/a3(high)
 434     pmullw      mm2, [x264_mmx_2121 GLOBAL]
 435     pmullw      mm0, mm5                ; (low)a0/a2/-a1/-a3(high)
 436     psraw       mm2, 1                  ; (low)a1/a3>>1/a0/a2>>1(high)
 437     paddw       mm0, mm2                ; (low)dst0/dst2/dst4/dst6(high)
 438
 439     pshufw      mm1, mm1, 00100111b     ; (low)a7/a6/a5/a4(high)
 440     pshufw      mm2, mm1, 00011011b     ; (low)a4/a5/a6/a7(high)
 441     psraw       mm1, 2                  ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
 442     pmullw      mm2, mm4                ; (low)a4/a5/a6/-a7(high)
 443     pmullw      mm1, mm7                ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
 444     paddw       mm1, mm2                ; (low)dst1/dst3/dst5/dst7(high)
 445
 446     movq        mm2, mm0
 447     punpcklwd   mm0, mm1                ; (low)dst0/dst1/dst2/dst3(high)
 448     punpckhwd   mm2, mm1                ; (low)dst4/dst5/dst6/dst7(high)
 449
 450     movq        [rdi+disp], mm0
 451     movq        [rdi+disp+8], mm2
 452
 453     %assign disp disp+16
 454     %endrep
 455
 456     ret
 457
 458 ALIGN 16
 459 ;-----------------------------------------------------------------------------
 460 ;   void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
 461 ;-----------------------------------------------------------------------------
 462 x264_ydct8_mmx:
 463
 464     ;-------------------------------------------------------------------------
 465     ; vertical dct ( compute 4 columns at a time -> 2 loops )
 466     ;-------------------------------------------------------------------------
 467
 468     %assign disp 0
 469     %rep 2
 470
 471     MMX_LOADSUMSUB  mm2, mm3, [rdi+disp+0*16], [rdi+disp+7*16] ; mm2 = s07, mm3 = d07
 472     MMX_LOADSUMSUB  mm1, mm5, [rdi+disp+1*16], [rdi+disp+6*16] ; mm1 = s16, mm5 = d16
 473     MMX_LOADSUMSUB  mm0, mm6, [rdi+disp+2*16], [rdi+disp+5*16] ; mm0 = s25, mm6 = d25
 474     MMX_LOADSUMSUB  mm4, mm7, [rdi+disp+3*16], [rdi+disp+4*16] ; mm4 = s34, mm7 = d34
 475
 476     MMX_SUMSUB_BA   mm4, mm2        ; mm4 = a0, mm2 = a2
 477     MMX_SUMSUB_BA   mm0, mm1        ; mm0 = a1, mm1 = a3
 478     MMX_SUMSUB_BA   mm0, mm4        ; mm0 = dst0, mm1 = dst4
 479
 480     movq    [rdi+disp+0*16], mm0
 481     movq    [rdi+disp+4*16], mm4
 482
 483     movq    mm0, mm1         ; a3
 484     psraw   mm0, 1           ; a3>>1
 485     paddw   mm0, mm2         ; a2 + (a3>>1)
 486     psraw   mm2, 1           ; a2>>1
 487     psubw   mm2, mm1         ; (a2>>1) - a3
 488
 489     movq    [rdi+disp+2*16], mm0
 490     movq    [rdi+disp+6*16], mm2
 491
 492     movq    mm0, mm6
 493     psraw   mm0, 1
 494     paddw   mm0, mm6         ; d25+(d25>>1)
 495     movq    mm1, mm3
 496     psubw   mm1, mm7         ; a5 = d07-d34-(d25+(d25>>1))
 497     psubw   mm1, mm0
 498
 499     movq    mm0, mm5
 500     psraw   mm0, 1
 501     paddw   mm0, mm5         ; d16+(d16>>1)
 502     movq    mm2, mm3
 503     paddw   mm2, mm7         ; a6 = d07+d34-(d16+(d16>>1))
 504     psubw   mm2, mm0
 505
 506     movq    mm0, mm3
 507     psraw   mm0, 1
 508     paddw   mm0, mm3         ; d07+(d07>>1)
 509     paddw   mm0, mm5
 510     paddw   mm0, mm6         ; a4 = d16+d25+(d07+(d07>>1))
 511
 512     movq    mm3, mm7
 513     psraw   mm3, 1
 514     paddw   mm3, mm7         ; d34+(d34>>1)
 515     paddw   mm3, mm5
 516     psubw   mm3, mm6         ; a7 = d16-d25+(d34+(d34>>1))
 517
 518     movq    mm7, mm3
 519     psraw   mm7, 2
 520     paddw   mm7, mm0         ; a4 + (a7>>2)
 521
 522     movq    mm6, mm2
 523     psraw   mm6, 2
 524     paddw   mm6, mm1         ; a5 + (a6>>2)
 525
 526     psraw   mm0, 2
 527     psraw   mm1, 2
 528     psubw   mm0, mm3         ; (a4>>2) - a7
 529     psubw   mm2, mm1         ; a6 - (a5>>2)
 530
 531     movq    [rdi+disp+1*16], mm7
 532     movq    [rdi+disp+3*16], mm6
 533     movq    [rdi+disp+5*16], mm2
 534     movq    [rdi+disp+7*16], mm0
 535
 536     %assign disp disp+8
 537     %endrep
 538
 539     ret
 540
 541 ALIGN 16
 542 ;-----------------------------------------------------------------------------
 543 ;   void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
 544 ;-----------------------------------------------------------------------------
 545 x264_xidct8_mmxext:
 546
 547     movq        mm4, [x264_mmx_PPNN GLOBAL]
 548     movq        mm5, [x264_mmx_PNPN GLOBAL]
 549     movq        mm6, [x264_mmx_PPNP GLOBAL]
 550     movq        mm7, [x264_mmx_PPPN GLOBAL]
 551
 552     ;-------------------------------------------------------------------------
 553     ; horizontal idct ( compute 1 row at a time -> 8 loops )
 554     ;-------------------------------------------------------------------------
 555
 556     %assign disp 0
 557     %rep 8
 558
 559     pshufw      mm0, [rdi+disp], 11011000b      ; (low)d0,d2,d1,d3(high)
 560     pshufw      mm2, [rdi+disp+8], 11011000b    ; (low)d4,d6,d5,d7(high)
 561     movq        mm1, mm0
 562     punpcklwd   mm0, mm2                ; (low)d0,d4,d2,d6(high)
 563     punpckhwd   mm1, mm2                ; (low)d1,d5,d3,d7(high)
 564
 565     pshufw      mm2, mm0, 10110001b     ; (low)d4,d0,d6,d2(high)
 566     pmullw      mm0, [x264_mmx_p2n2p1p1 GLOBAL]
 567                                         ; (low)2*d0,-2*d4,d2,d6(high)
 568     pmullw      mm2, mm6                ; (low)d4,d0,-d6,d2(high)
 569     psraw       mm0, 1                  ; (low)d0,-d4,d2>>1,d6>>1(high)
 570     paddw       mm0, mm2                ; (low)e0,e2,e4,e6(high)
 571
 572     movq        mm3, mm1                ; (low)d1,d5,d3,d7(high)
 573     psraw       mm1, 1                  ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
 574     pshufw      mm2, mm3, 10110001b     ; (low)d5,d1,d7,d3(high)
 575     paddw       mm1, mm3                ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
 576     pshufw      mm3, mm2, 00011011b     ; (low)d3,d7,d1,d5(high)
 577     pmullw      mm1, mm4                ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
 578     pmullw      mm2, mm5                ; (low)d5,-d1,d7,-d3(high)
 579     paddw       mm1, mm3
 580     paddw       mm1, mm2                ; (low)e7,e5,e3,e1(high)
 581
 582     pshufw      mm2, mm0, 00011011b     ; (low)e6,e4,e2,e0(high)
 583     pmullw      mm0, mm4                ; (low)e0,e2,-e4,-e6(high)
 584     pshufw      mm3, mm1, 00011011b     ; (low)e1,e3,e5,e7(high)
 585     psraw       mm1, 2                  ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
 586     pmullw      mm3, mm6                ; (low)e1,e3,-e5,e7(high)
 587     pmullw      mm1, mm7                ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
 588     paddw       mm0, mm2                ; (low)f0,f2,f4,f6(high)
 589     paddw       mm1, mm3                ; (low)f1,f3,f5,f7(high)
 590
 591     pshufw      mm3, mm0, 00011011b     ; (low)f6,f4,f2,f0(high)
 592     pshufw      mm2, mm1, 00011011b     ; (low)f7,f5,f3,f1(high)
 593     psubw       mm3, mm1
 594     paddw       mm0, mm2
 595
 596     movq        [rdi+disp], mm0
 597     movq        [rdi+disp+8], mm3
 598
 599     %assign disp disp+16
 600     %endrep
 601
 602     ret
 603
 604 ALIGN 16
 605 ;-----------------------------------------------------------------------------
 606 ;   void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
 607 ;-----------------------------------------------------------------------------
 608 x264_yidct8_mmx:
 609
 610     ;-------------------------------------------------------------------------
 611     ; vertical idct ( compute 4 columns at a time -> 2 loops )
 612     ;-------------------------------------------------------------------------
 613
 614     %assign disp 0
 615     %rep 2
 616
 617     movq        mm1, [rdi+disp+1*16]    ; mm1 = d1
 618     movq        mm3, [rdi+disp+3*16]    ; mm3 = d3
 619     movq        mm5, [rdi+disp+5*16]    ; mm5 = d5
 620     movq        mm7, [rdi+disp+7*16]    ; mm7 = d7
 621
 622     movq        mm4, mm7
 623     psraw       mm4, 1
 624     movq        mm0, mm5
 625     psubw       mm0, mm7
 626     psubw       mm0, mm4
 627     psubw       mm0, mm3                ; mm0 = e1
 628
 629     movq        mm6, mm3
 630     psraw       mm6, 1
 631     movq        mm2, mm7
 632     psubw       mm2, mm6
 633     psubw       mm2, mm3
 634     paddw       mm2, mm1                ; mm2 = e3
 635
 636     movq        mm4, mm5
 637     psraw       mm4, 1
 638     paddw       mm4, mm5
 639     paddw       mm4, mm7
 640     psubw       mm4, mm1                ; mm4 = e5
 641
 642     movq        mm6, mm1
 643     psraw       mm6, 1
 644     paddw       mm6, mm1
 645     paddw       mm6, mm5
 646     paddw       mm6, mm3                ; mm6 = e7
 647
 648     movq        mm1, mm0
 649     movq        mm3, mm4
 650     movq        mm5, mm2
 651     movq        mm7, mm6
 652     psraw       mm6, 2
 653     psraw       mm3, 2
 654     psraw       mm5, 2
 655     psraw       mm0, 2
 656     paddw       mm1, mm6                ; mm1 = f1
 657     paddw       mm3, mm2                ; mm3 = f3
 658     psubw       mm5, mm4                ; mm5 = f5
 659     psubw       mm7, mm0                ; mm7 = f7
 660
 661     movq        mm2, [rdi+disp+2*16]    ; mm2 = d2
 662     movq        mm6, [rdi+disp+6*16]    ; mm6 = d6
 663     movq        mm4, mm2
 664     movq        mm0, mm6
 665     psraw       mm4, 1
 666     psraw       mm6, 1
 667     psubw       mm4, mm0                ; mm4 = a4
 668     paddw       mm6, mm2                ; mm6 = a6
 669
 670     movq        mm2, [rdi+disp+0*16]    ; mm2 = d0
 671     movq        mm0, [rdi+disp+4*16]    ; mm0 = d4
 672     MMX_SUMSUB_BA   mm0, mm2                ; mm0 = a0, mm2 = a2
 673
 674     MMX_SUMSUB_BA   mm6, mm0                ; mm6 = f0, mm0 = f6
 675     MMX_SUMSUB_BA   mm4, mm2                ; mm4 = f2, mm2 = f4
 676
 677     MMX_SUMSUB_BA   mm7, mm6                ; mm7 = g0, mm6 = g7
 678     MMX_SUMSUB_BA   mm5, mm4                ; mm5 = g1, mm4 = g6
 679     MMX_SUMSUB_BA   mm3, mm2                ; mm3 = g2, mm2 = g5
 680     MMX_SUMSUB_BA   mm1, mm0                ; mm1 = g3, mm0 = g4
 681
 682     psraw       mm7, 6
 683     psraw       mm6, 6
 684     psraw       mm5, 6
 685     psraw       mm4, 6
 686     psraw       mm3, 6
 687     psraw       mm2, 6
 688     psraw       mm1, 6
 689     psraw       mm0, 6
 690
 691     movq        [rdi+disp+0*16], mm7
 692     movq        [rdi+disp+1*16], mm5
 693     movq        [rdi+disp+2*16], mm3
 694     movq        [rdi+disp+3*16], mm1
 695     movq        [rdi+disp+4*16], mm0
 696     movq        [rdi+disp+5*16], mm2
 697     movq        [rdi+disp+6*16], mm4
 698     movq        [rdi+disp+7*16], mm6
 699
 700     %assign disp disp+8
 701     %endrep
 702
 703     ret
 704
 705 ALIGN 16
 706 ;-----------------------------------------------------------------------------
 707 ;   void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
 708 ;-----------------------------------------------------------------------------
 709 x264_pixel_add_8x8_mmx:
 710 ;   mov     rdi, rdi        ; dst
 711     movsxd  rsi, esi        ; i_dst
 712 ;   mov     rdx, rdx        ; src
 713
 714     MMX_ZERO    mm7
 715
 716     %assign disp 0
 717     %rep 8
 718     MMX_STORE_DIFF_8P   mm0, mm1, [rdi], [rdx+disp], [rdx+disp+8], mm7
 719     add         rdi, rsi
 720     %assign disp disp+16
 721     %endrep
 722     ret
 723