git.sesse.net Git - x264/blob - common/x86/pixel-32.asm

   1 ;*****************************************************************************
   2 ;* pixel-32.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2003-2008 x264 project
   5 ;*
   6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7 ;*          Loren Merritt <lorenm@u.washington.edu>
   8 ;*
   9 ;* This program is free software; you can redistribute it and/or modify
  10 ;* it under the terms of the GNU General Public License as published by
  11 ;* the Free Software Foundation; either version 2 of the License, or
  12 ;* (at your option) any later version.
  13 ;*
  14 ;* This program is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 ;* GNU General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU General Public License
  20 ;* along with this program; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22 ;*****************************************************************************
  23
  24 %include "x86inc.asm"
  25
  26 SECTION .text
  27
  28 %macro SUMSUB_BADC 4 ; a, b, c, d (packed-word registers)
         ; Two in-place butterflies that need no temporary register:
         ;   a <- a + b,  b <- b - a    and    c <- c + d,  d <- d - c
         ; (right-hand sides are the values on entry; paddw/psubw wrap at 16 bits).
  29     paddw %1, %2     ; a = a + b
  30     paddw %3, %4     ; c = c + d
  31     paddw %2, %2     ; b = 2b
  32     paddw %4, %4     ; d = 2d
  33     psubw %2, %1     ; b = 2b - (a + b) = b - a
  34     psubw %4, %3     ; d = 2d - (c + d) = d - c
  35 %endmacro
  36
  37 %macro SBUTTERFLY 5 ; mov suffix, punpck suffix, a, b, tmp
         ; Interleave a with b at the granularity selected by the punpck suffix.
         ; On exit: a = low-half interleave of (a, b), tmp = high-half interleave
         ; of (a, b); b is left unchanged.
  38     mov%1     %5, %3     ; tmp = copy of a, needed for the high half
  39     punpckl%2 %3, %4     ; a   = interleave of the low halves
  40     punpckh%2 %5, %4     ; tmp = interleave of the high halves
  41 %endmacro
  42
  43 %macro TRANSPOSE4x4W 5   ; abcd-t -> adtc
         ; Transposes a 4x4 matrix of words whose rows are in args 1-4 (a, b, c, d),
         ; using arg 5 (t) as an extra register. The transposed rows end up, in
         ; order, in args 1, 4, 5, 3 (a, d, t, c); arg 2 (b) is left holding an
         ; intermediate value and must be treated as clobbered.
  44     SBUTTERFLY q, wd, %1, %2, %5     ; a = a0 b0 a1 b1,  t = a2 b2 a3 b3
  45     SBUTTERFLY q, wd, %3, %4, %2     ; c = c0 d0 c1 d1,  b = c2 d2 c3 d3
  46     SBUTTERFLY q, dq, %1, %3, %4     ; a = a0 b0 c0 d0,  d = a1 b1 c1 d1
  47     SBUTTERFLY q, dq, %5, %2, %3     ; t = a2 b2 c2 d2,  c = a3 b3 c3 d3
  48 %endmacro
  49
  50 %macro ABS1 2     ; mma, tmp
         ; Per-word absolute value of mma, computed as max(x, 0 - x) with signed
         ; word compares. tmp is clobbered (it ends up holding 0 - x).
  51     pxor    %2, %2     ; tmp = 0
  52     psubw   %2, %1     ; tmp = -mma
  53     pmaxsw  %1, %2     ; mma = max(mma, -mma)
  54 %endmacro
  55
  56 %macro ABS2 4     ; mma, mmb, tmp0, tmp1
         ; Per-word absolute value of two registers at once (same method as ABS1,
         ; interleaved). tmp0 and tmp1 are clobbered.
  57     pxor    %3, %3     ; tmp0 = 0
  58     pxor    %4, %4     ; tmp1 = 0
  59     psubw   %3, %1     ; tmp0 = -mma
  60     psubw   %4, %2     ; tmp1 = -mmb
  61     pmaxsw  %1, %3     ; mma = max(mma, -mma)
  62     pmaxsw  %2, %4     ; mmb = max(mmb, -mmb)
  63 %endmacro
  64
  65 %macro LOAD_DIFF_4P 4  ; mmp, mmt, dx, dy
         ; Loads 4 bytes from [eax + ebx*dy + dx] and 4 bytes from
         ; [ecx + edx*dy + dx] and leaves their per-pixel difference (first minus
         ; second) as four signed words in mmp. mmt is clobbered.
         ; No zero register is needed: both operands get the same high byte, which
         ; cancels in the subtraction.
  66     movd        %1, [eax+ebx*%4+%3]     ; 4 bytes p from the first buffer
  67     movd        %2, [ecx+edx*%4+%3]     ; 4 bytes q from the second buffer
  68     punpcklbw   %1, %2                  ; words = p + 256*q
  69     punpcklbw   %2, %2                  ; words = q + 256*q
  70     psubw       %1, %2                  ; words = p - q
  71 %endmacro
  72
  73 %macro LOAD_DIFF_4x8P 1 ; dx
         ; Loads a 4-wide, 8-row block of differences at byte offset dx into
         ; mm0..mm7 (one row per register) via LOAD_DIFF_4P.
         ; Side effects: eax and ecx are advanced by two rows three times, so on
         ; exit eax = entry + 6*ebx and ecx = entry + 6*edx. The caller must
         ; define `spill` as at least 8 bytes of scratch memory.
  74     LOAD_DIFF_4P  mm0, mm7, %1, 0
  75     LOAD_DIFF_4P  mm1, mm7, %1, 1
  76     lea  eax, [eax+2*ebx]
  77     lea  ecx, [ecx+2*edx]
  78     LOAD_DIFF_4P  mm2, mm7, %1, 0
  79     LOAD_DIFF_4P  mm3, mm7, %1, 1
  80     lea  eax, [eax+2*ebx]
  81     lea  ecx, [ecx+2*edx]
  82     LOAD_DIFF_4P  mm4, mm7, %1, 0
  83     LOAD_DIFF_4P  mm5, mm7, %1, 1
  84     lea  eax, [eax+2*ebx]
  85     lea  ecx, [ecx+2*edx]
  86     LOAD_DIFF_4P  mm6, mm7, %1, 0
         ; mm7 is the destination of the last row, so mm6 is parked in memory and
         ; borrowed as the temporary for that one load.
  87     movq [spill], mm6
  88     LOAD_DIFF_4P  mm7, mm6, %1, 1
  89     movq mm6, [spill]
  90 %endmacro
  91
  92 %macro HADAMARD8_1D 8 ; eight packed-word registers, transformed in place
         ; Three stages of SUMSUB_BADC butterflies across the eight arguments,
         ; pairing them at distance 4, then 2, then 1 (in argument order). Each
         ; word lane is processed independently. Differences are computed as
         ; "second minus first" (see SUMSUB_BADC). No extra registers are used.
  93     SUMSUB_BADC %1, %5, %2, %6     ; stage 1: (1,5) (2,6)
  94     SUMSUB_BADC %3, %7, %4, %8     ;          (3,7) (4,8)
  95     SUMSUB_BADC %1, %3, %2, %4     ; stage 2: (1,3) (2,4)
  96     SUMSUB_BADC %5, %7, %6, %8     ;          (5,7) (6,8)
  97     SUMSUB_BADC %1, %2, %3, %4     ; stage 3: (1,2) (3,4)
  98     SUMSUB_BADC %5, %6, %7, %8     ;          (5,6) (7,8)
  99 %endmacro
 100
 101 %macro SUM4x8_MM 0
         ; Leaves in mm0, per word lane, the sum of the absolute values of
         ; mm0..mm7. mm1..mm7 are clobbered. All eight registers are live, so
         ; mm6/mm7 are parked in memory to free two temporaries for ABS2; the
         ; caller must define `spill` as at least 16 bytes of scratch memory.
 102     movq [spill],   mm6
 103     movq [spill+8], mm7
 104     ABS2     mm0, mm1, mm6, mm7
 105     ABS2     mm2, mm3, mm6, mm7
 106     paddw    mm0, mm2              ; mm0 = |mm0| + |mm2|
 107     paddw    mm1, mm3              ; mm1 = |mm1| + |mm3|
 108     movq     mm6, [spill]
 109     movq     mm7, [spill+8]
 110     ABS2     mm4, mm5, mm2, mm3    ; mm2/mm3 are free now and serve as temporaries
 111     ABS2     mm6, mm7, mm2, mm3
 112     paddw    mm4, mm6
 113     paddw    mm5, mm7
 114     paddw    mm0, mm4
 115     paddw    mm1, mm5
 116     paddw    mm0, mm1              ; mm0 = sum over all eight registers
 117 %endmacro
 118
 119 ;-----------------------------------------------------------------------------
 120 ; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
     ;
     ; Stack arguments, in order: pix1, stride1, pix2, stride2.
     ;
     ; Builds the 8x8 block of pix1 - pix2 differences in two 4-column halves,
     ; runs HADAMARD8_1D over it, transposes it in 4x4 word blocks through the
     ; `trans` stack buffer, runs HADAMARD8_1D again, and sums the absolute values
     ; of the results (two SUM4x8_MM passes of eight registers each).
     ;
     ; Returns in eax: (total + 1) >> 1, where total is the 16-bit horizontal sum
     ; of the pavgw-combined partial sums. ecx is left holding total before that
     ; final rounding. ebx is saved and restored; 0x70 bytes of stack are used.
     ; NOTE(review): no emms is executed before ret; presumably the caller is
     ; responsible for clearing MMX state -- confirm against callers.
 121 ;-----------------------------------------------------------------------------
 122 cglobal x264_pixel_sa8d_8x8_mmxext
 123     push   ebx
 124     mov    eax, [esp+ 8]  ; pix1
 125     mov    ebx, [esp+12]  ; stride1
 126     mov    ecx, [esp+16]  ; pix2
 127     mov    edx, [esp+20]  ; stride2
 128     sub    esp, 0x70
         ; Frame layout after the push and the 0x70-byte reservation: saved ebx is
         ; at esp+0x70 and the return address at esp+0x74, so [args+4] is pix1 and
         ; [args+12] is pix2.
 129 %define args  esp+0x74
 130 %define spill esp+0x60 ; +16
 131 %define trans esp+0    ; +96
         ; ---- columns 0-3: differences, first transform pass ----
 132     LOAD_DIFF_4x8P 0
 133     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 134
         ; Transpose mm4-mm7 (mm0 is parked to serve as the fifth register);
         ; TRANSPOSE4x4W returns its rows in args 1, 4, 5, 3.
 135     movq   [spill], mm0
 136     TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
 137     movq   [trans+0x00], mm4
 138     movq   [trans+0x08], mm7
 139     movq   [trans+0x10], mm0
 140     movq   [trans+0x18], mm6
 141     movq   mm0, [spill]
         ; Transpose mm0-mm3, using the now-free mm4 as the fifth register.
 142     TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
 143     movq   [trans+0x20], mm0
 144     movq   [trans+0x28], mm3
 145     movq   [trans+0x30], mm4
 146     movq   [trans+0x38], mm2
 147
         ; ---- columns 4-7 ----
         ; LOAD_DIFF_4x8P advanced eax/ecx, so reload the original pointers.
 148     mov    eax, [args+4]
 149     mov    ecx, [args+12]
 150     LOAD_DIFF_4x8P 4
 151     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 152
 153     movq   [spill], mm7
 154     TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
 155     movq   [trans+0x40], mm0
 156     movq   [trans+0x48], mm3
 157     movq   [trans+0x50], mm7
 158     movq   [trans+0x58], mm2
 159     movq   mm7, [spill]
         ; This last transposed block stays in registers (mm4, mm7, mm0, mm6, in
         ; row order); mm5 is scratch afterwards and is reused for a reload.
 160     TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
 161     movq   mm5, [trans+0x00]
 162     movq   mm1, [trans+0x08]
 163     movq   mm2, [trans+0x10]
 164     movq   mm3, [trans+0x18]
 165
         ; ---- second transform pass, first group of eight rows ----
         ; SUM4x8_MM adds up all of mm0..mm7, so the register order used for the
         ; transform does not matter to it.
 166     HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
 167     SUM4x8_MM
 168     movq   [trans], mm0     ; park the first partial sum (trans+0x00 is free now)
 169
         ; ---- second transform pass, second group of eight rows ----
 170     movq   mm0, [trans+0x20]
 171     movq   mm1, [trans+0x28]
 172     movq   mm2, [trans+0x30]
 173     movq   mm3, [trans+0x38]
 174     movq   mm4, [trans+0x40]
 175     movq   mm5, [trans+0x48]
 176     movq   mm6, [trans+0x50]
 177     movq   mm7, [trans+0x58]
 178
 179     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 180     SUM4x8_MM
 181
         ; [esp] is [trans], i.e. the first partial sum. pavgw combines the two
         ; partial sums per word as (a + b + 1) >> 1.
 182     pavgw  mm0, [esp]
         ; Horizontal add of the four words: first swap the dword halves, then
         ; swap the words inside each dword; every word ends up with the total.
 183     pshufw mm1, mm0, 01001110b
 184     paddw  mm0, mm1
 185     pshufw mm1, mm0, 10110001b
 186     paddw  mm0, mm1
 187     movd   eax, mm0
 188     and    eax, 0xffff      ; keep one 16-bit copy of the total
 189     mov    ecx, eax ; preserve rounding for 16x16
 190     add    eax, 1
 191     shr    eax, 1           ; return (total + 1) >> 1
 192     add    esp, 0x70
 193     pop    ebx
 194     ret
 195 %undef args
 196 %undef spill
 197 %undef trans
 198
 199 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
         ; Horizontally reduces three registers of four words each (args 1-3) so
         ; that the low dword of each holds the total of its four words.
         ; Args 4-7 are temporaries (arg 7 is used as a zero register); arg 8 is
         ; the instruction used for the final dword add (invoked with paddd in
         ; this file).
 200     pxor        %7, %7
         ; Fold the high dword onto the low one (saturating, unsigned): the two
         ; low words become w0+w2 and w1+w3.
 201     pshufw      %4, %1, 01001110b
 202     pshufw      %5, %2, 01001110b
 203     pshufw      %6, %3, 01001110b
 204     paddusw     %1, %4
 205     paddusw     %2, %5
 206     paddusw     %3, %6
         ; Zero-extend those two words to dwords, then add the two dwords.
 207     punpcklwd   %1, %7
 208     punpcklwd   %2, %7
 209     punpcklwd   %3, %7
 210     pshufw      %4, %1, 01001110b
 211     pshufw      %5, %2, 01001110b
 212     pshufw      %6, %3, 01001110b
 213     %8          %1, %4
 214     %8          %2, %5
 215     %8          %3, %6
 216 %endmacro
 217
 218 %macro LOAD_4x8P 1 ; dx
         ; Loads 4 bytes from each of the 8 rows at [eax + dx + row*FENC_STRIDE]
         ; and zero-extends them to words: row k ends up in mm<k> (mm0..mm7).
         ; eax is not modified. FENC_STRIDE is not defined in this file
         ; (presumably it comes from x86inc.asm -- confirm). The caller must
         ; define `spill` as at least 8 bytes of scratch memory.
         ; mm7 is needed as the zero register while unpacking, so row 7 is loaded
         ; first into mm6, unpacked, parked in [spill], and moved into mm7 last.
 219     pxor        mm7, mm7
 220     movd        mm6, [eax+%1+7*FENC_STRIDE]
 221     movd        mm0, [eax+%1+0*FENC_STRIDE]
 222     movd        mm1, [eax+%1+1*FENC_STRIDE]
 223     movd        mm2, [eax+%1+2*FENC_STRIDE]
 224     movd        mm3, [eax+%1+3*FENC_STRIDE]
 225     movd        mm4, [eax+%1+4*FENC_STRIDE]
 226     movd        mm5, [eax+%1+5*FENC_STRIDE]
 227     punpcklbw   mm6, mm7
 228     punpcklbw   mm0, mm7
 229     punpcklbw   mm1, mm7
 230     movq    [spill], mm6           ; row 7, already widened
 231     punpcklbw   mm2, mm7
 232     punpcklbw   mm3, mm7
 233     movd        mm6, [eax+%1+6*FENC_STRIDE]
 234     punpcklbw   mm4, mm7
 235     punpcklbw   mm5, mm7
 236     punpcklbw   mm6, mm7
 237     movq        mm7, [spill]       ; zero register no longer needed: mm7 = row 7
 238 %endmacro
 239
 240 ;-----------------------------------------------------------------------------
 241 ; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
     ;
     ; Transforms the 8x8 block at fenc (rows FENC_STRIDE apart) with
     ; HADAMARD8_1D in both dimensions and, from that single transform, derives
     ; three costs by subtracting edge-derived terms from selected coefficients
     ; before taking absolute values. Results are written as
     ;   res[0] = (v + 2) >> 2,  res[1] = (h + 2) >> 2,  res[2] = (dc + 2) >> 2.
     ; edges is read at byte offsets 0 and 8 (commented "left" below) and 16 and
     ; 24 (commented "top" below).
     ; No registers are pushed; eax, ecx, edx and mm0-mm7 are modified; 0x70
     ; bytes of stack are used.
     ; NOTE(review): no emms is executed before ret; presumably the caller is
     ; responsible for clearing MMX state -- confirm against callers.
 242 ;-----------------------------------------------------------------------------
 243 cglobal x264_intra_sa8d_x3_8x8_core_mmxext
 244     mov    eax, [esp+4]     ; fenc
 245     mov    ecx, [esp+8]     ; edges
 246     sub    esp, 0x70
         ; Nothing was pushed, so the return address is at esp+0x70 and args+0 is
         ; the first argument; [args+8] is res.
         ; `sum` aliases the start of `trans`: each sum slot is only written after
         ; the trans data that lived there has been loaded back into registers.
 247 %define args  esp+0x74
 248 %define spill esp+0x60 ; +16
 249 %define trans esp+0    ; +96
 250 %define sum   esp+0    ; +32
         ; ---- columns 0-3: first transform pass, then 4x4 transposes ----
 251     LOAD_4x8P 0
 252     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 253
 254     movq   [spill], mm0
 255     TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
 256     movq   [trans+0x00], mm4
 257     movq   [trans+0x08], mm7
 258     movq   [trans+0x10], mm0
 259     movq   [trans+0x18], mm6
 260     movq   mm0, [spill]
 261     TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
 262     movq   [trans+0x20], mm0
 263     movq   [trans+0x28], mm3
 264     movq   [trans+0x30], mm4
 265     movq   [trans+0x38], mm2
 266
         ; ---- columns 4-7 (LOAD_4x8P leaves eax untouched, no reload needed) ----
 267     LOAD_4x8P 4
 268     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 269
 270     movq   [spill], mm7
 271     TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm7
 272     movq   [trans+0x40], mm0
 273     movq   [trans+0x48], mm3
 274     movq   [trans+0x50], mm7
 275     movq   [trans+0x58], mm2
 276     movq   mm7, [spill]
         ; Last transposed block stays in registers (mm4, mm7, mm0, mm6).
 277     TRANSPOSE4x4W mm4, mm5, mm6, mm7, mm0
 278     movq   mm5, [trans+0x00]
 279     movq   mm1, [trans+0x08]
 280     movq   mm2, [trans+0x10]
 281     movq   mm3, [trans+0x18]
 282
         ; ---- second transform pass, first group; mm5 is the first output ----
 283     HADAMARD8_1D mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
 284
         ; Sum |x| over every register except mm5, which is handled separately
         ; below. mm5/mm7 are parked so they can serve as ABS2 temporaries.
 285     movq [spill+0], mm5
 286     movq [spill+8], mm7
 287     ABS2     mm0, mm1, mm5, mm7
 288     ABS2     mm2, mm3, mm5, mm7
 289     paddw    mm0, mm2
 290     paddw    mm1, mm3
 291     paddw    mm0, mm1
 292     ABS2     mm4, mm6, mm2, mm3
 293     movq     mm5, [spill+0]
 294     movq     mm7, [spill+8]
 295     paddw    mm0, mm4
 296     paddw    mm0, mm6
 297     ABS1     mm7, mm1
 298     paddw    mm0, mm7 ; 7x4 sum
         ; mm5 is used twice: as-is for the dc partial, and with edges[+8]*8
         ; subtracted for the left partial.
 299     movq     mm6, mm5
 300     movq     mm7, [ecx+8] ; left bottom
 301     psllw    mm7, 3
 302     psubw    mm6, mm7
 303     ABS2     mm5, mm6, mm2, mm3
 304     paddw    mm5, mm0
 305     paddw    mm6, mm0
 306     movq [sum+0], mm5 ; dc
 307     movq [sum+8], mm6 ; left
 308
         ; ---- second transform pass, second group ----
 309     movq   mm0, [trans+0x20]
 310     movq   mm1, [trans+0x28]
 311     movq   mm2, [trans+0x30]
 312     movq   mm3, [trans+0x38]
 313     movq   mm4, [trans+0x40]
 314     movq   mm5, [trans+0x48]
 315     movq   mm6, [trans+0x50]
 316     movq   mm7, [trans+0x58]
 317
 318     HADAMARD8_1D mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
 319
         ; Save word 0 of each of mm0..mm7 as eight consecutive words at
         ; sum+0x10. Each movd writes 4 bytes, but the next store (2 bytes
         ; further on) overwrites the upper 2. The final store spills 2 bytes
         ; into trans+0x20, which has already been loaded above.
 320     movd   [sum+0x10], mm0
 321     movd   [sum+0x12], mm1
 322     movd   [sum+0x14], mm2
 323     movd   [sum+0x16], mm3
 324     movd   [sum+0x18], mm4
 325     movd   [sum+0x1a], mm5
 326     movd   [sum+0x1c], mm6
 327     movd   [sum+0x1e], mm7
 328
         ; Sum |x| over every register except mm0, which is handled separately.
 329     movq [spill],   mm0
 330     movq [spill+8], mm1
 331     ABS2     mm2, mm3, mm0, mm1
 332     ABS2     mm4, mm5, mm0, mm1
 333     paddw    mm2, mm3
 334     paddw    mm4, mm5
 335     paddw    mm2, mm4
 336     movq     mm0, [spill]
 337     movq     mm1, [spill+8]
 338     ABS2     mm6, mm7, mm4, mm5
 339     ABS1     mm1, mm4
 340     paddw    mm2, mm7
 341     paddw    mm1, mm6
 342     paddw    mm2, mm1 ; 7x4 sum
 343     movq     mm1, mm0     ; second copy of mm0: mm0 -> dc variant, mm1 -> left variant
 344
 345     movq     mm7, [ecx+0]
 346     psllw    mm7, 3   ; left top
 347
         ; edx = ((edges word at +0 + edges word at +16) * 4 + 32) & -64; the
         ; word add is done in 16 bits (dx), the rest in 32 bits.
 348     movzx    edx, word [ecx+0]
 349     add      dx,  [ecx+16]
 350     lea      edx, [4*edx+32]
 351     and      edx, -64
 352     movd     mm6, edx ; dc
 353
 354     psubw    mm1, mm7     ; left variant
 355     psubw    mm0, mm6     ; dc variant (mm6 is zero above its low dword)
 356     ABS2     mm0, mm1, mm5, mm6
 357     movq     mm3, [sum+0] ; dc
 358     paddw    mm0, mm2
 359     paddw    mm1, mm2
 360     movq     mm2, mm0     ; copy of the dc-variant sum, basis for v
 361     paddw    mm0, mm3     ; mm0 = dc total (per word, not yet reduced)
 362     paddw    mm1, [sum+8] ; h
 363     psrlq    mm2, 16      ; shift word lane 0 out of the copy
 364     paddw    mm2, mm3
 365
         ; Add |edges*8 - saved word 0 of each second-group output| for the
         ; eight words saved at sum+0x10.
 366     movq     mm3, [ecx+16] ; top left
 367     movq     mm4, [ecx+24] ; top right
 368     psllw    mm3, 3
 369     psllw    mm4, 3
 370     psubw    mm3, [sum+16]
 371     psubw    mm4, [sum+24]
 372     ABS2     mm3, mm4, mm5, mm6
 373     paddw    mm2, mm3
 374     paddw    mm2, mm4 ; v
 375
         ; Reduce each of dc (mm0), h (mm1), v (mm2) to one dword, then store
         ; (x + 2) >> 2 for each into res.
 376     SUM_MM_X3   mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
 377     mov      eax, [args+8]     ; res
 378     movd     ecx, mm2
 379     movd     edx, mm1
 380     add      ecx, 2
 381     add      edx, 2
 382     shr      ecx, 2
 383     shr      edx, 2
 384     mov      [eax+0], ecx ; i8x8_v satd
 385     mov      [eax+4], edx ; i8x8_h satd
 386     movd     ecx, mm0
 387     add      ecx, 2
 388     shr      ecx, 2
 389     mov      [eax+8], ecx ; i8x8_dc satd
 390
 391     add      esp, 0x70
 392     ret
 393 %undef args
 394 %undef spill
 395 %undef trans
 396 %undef sum
 397
 398
 399
 400 ;-----------------------------------------------------------------------------
 401 ; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
 402 ;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
     ;
     ; For each of two horizontally adjacent 4x4 blocks (byte offsets 0 and 4
     ; from pix1/pix2), accumulates over the 16 pixel pairs (a from pix1, b
     ; from pix2) and stores four dwords:
     ;   sums[i][0] = sum of a        sums[i][1] = sum of b
     ;   sums[i][2] = sum of a*a + b*b
     ;   sums[i][3] = sum of a*b
     ; The block at offset 4 (i = 1) is processed first, then offset 0 (i = 0).
     ; ebx and edi are saved and restored; emms is executed before returning.
 403 ;-----------------------------------------------------------------------------
 404 cglobal x264_pixel_ssim_4x4x2_core_mmxext
 405     push      ebx
 406     push      edi
         ; After the two pushes the return address is at esp+8, so the arguments
         ; are at esp+12 (pix1), +16 (stride1), +20 (pix2), +24 (stride2),
         ; +28 (sums).
 407     mov       ebx, [esp+16]     ; stride1
 408     mov       edx, [esp+24]     ; stride2
 409     mov       edi, 4            ; x offset of the current block: 4, then 0
 410     pxor      mm0, mm0          ; zero register for byte->word unpacking
 411 .loop:
 412     mov       eax, [esp+12]
 413     mov       ecx, [esp+20]
 414     add       eax, edi          ; eax = pix1 + x offset
 415     add       ecx, edi          ; ecx = pix2 + x offset
 416     pxor      mm1, mm1          ; per-column word sums of a
 417     pxor      mm2, mm2          ; per-column word sums of b
 418     pxor      mm3, mm3          ; dword sums of a*a + b*b
 419     pxor      mm4, mm4          ; dword sums of a*b
 420 %rep 4
         ; One row of 4 pixels from each source per repetition.
 421     movd      mm5, [eax]
 422     movd      mm6, [ecx]
 423     punpcklbw mm5, mm0
 424     punpcklbw mm6, mm0
 425     paddw     mm1, mm5
 426     paddw     mm2, mm6
 427     movq      mm7, mm5
 428     pmaddwd   mm5, mm5          ; a*a, adjacent pairs added into dwords
 429     pmaddwd   mm7, mm6          ; a*b
 430     pmaddwd   mm6, mm6          ; b*b
 431     paddd     mm3, mm5
 432     paddd     mm4, mm7
 433     paddd     mm3, mm6
 434     add       eax, ebx
 435     add       ecx, edx
 436 %endrep
         ; Output slot: sums + edi*4 bytes, i.e. +16 (sums[1]) when edi is 4 and
         ; +0 (sums[0]) when edi is 0.
 437     mov       eax, [esp+28]
 438     lea       eax, [eax+edi*4]
         ; Horizontal reduction. pshufw with 0xE brings the high dword down to
         ; the low dword so it can be added to the low half.
 439     pshufw    mm5, mm1, 0xE
 440     pshufw    mm6, mm2, 0xE
 441     paddusw   mm1, mm5
 442     paddusw   mm2, mm6
 443     punpcklwd mm1, mm2          ; interleave the partial sums of a and b
 444     pshufw    mm2, mm1, 0xE
 445     pshufw    mm5, mm3, 0xE
 446     pshufw    mm6, mm4, 0xE
 447     paddusw   mm1, mm2          ; word 0 = sum of a, word 1 = sum of b
 448     paddd     mm3, mm5          ; low dword = sum of a*a + b*b
 449     paddd     mm4, mm6          ; low dword = sum of a*b
 450     punpcklwd mm1, mm0          ; zero-extend the two word sums to dwords
 451     punpckldq mm3, mm4          ; pack the two dword sums together
 452     movq  [eax+0], mm1
 453     movq  [eax+8], mm3
 454     sub       edi, 4
 455     jge       .loop             ; runs for edi = 4 and edi = 0, exits at -4
 456     pop       edi
 457     pop       ebx
 458     emms
 459     ret
 460