git.sesse.net Git - x264/blob - common/amd64/mc-a2.asm

   1 ;*****************************************************************************
   2 ;* mc-a2.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005 x264 project
   5 ;*
   6 ;* This program is free software; you can redistribute it and/or modify
   7 ;* it under the terms of the GNU General Public License as published by
   8 ;* the Free Software Foundation; either version 2 of the License, or
   9 ;* (at your option) any later version.
  10 ;*
  11 ;* This program is distributed in the hope that it will be useful,
  12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ;* GNU General Public License for more details.
  15 ;*
  16 ;* You should have received a copy of the GNU General Public License
  17 ;* along with this program; if not, write to the Free Software
  18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  19 ;*****************************************************************************
  20
  21 BITS 64
  22
  23 ;=============================================================================
  24 ; Macros and other preprocessor constants
  25 ;=============================================================================
  26
  27 %include "amd64inc.asm"
  28
  29 ;=============================================================================
  30 ; Read only data
  31 ;=============================================================================
  32
  33 SECTION .rodata align=16
  34
  35 ALIGN 16
  36 pw_1:  times 4 dw 1
  37 pw_16: times 4 dw 16
  38 pw_32: times 4 dw 32
  39
  40 ;=============================================================================
  41 ; Macros
  42 ;=============================================================================
  43
  44 %macro LOAD_ADD 3
  45     movd        %1,     %2
  46     movd        mm7,    %3
  47     punpcklbw   %1,     mm0
  48     punpcklbw   mm7,    mm0
  49     paddw       %1,     mm7
  50 %endmacro
  51
  52 %macro FILT_V 0
  53     psubw       mm1,    mm2         ; a-b
  54     psubw       mm4,    mm5
  55     psubw       mm2,    mm3         ; b-c
  56     psubw       mm5,    mm6
  57     psllw       mm2,    2
  58     psllw       mm5,    2
  59     psubw       mm1,    mm2         ; a-5*b+4*c
  60     psubw       mm4,    mm5
  61     psllw       mm3,    4
  62     psllw       mm6,    4
  63     paddw       mm1,    mm3         ; a-5*b+20*c
  64     paddw       mm4,    mm6
  65 %endmacro
  66
  67 %macro FILT_H 0
  68     psubw       mm1,    mm2         ; a-b
  69     psubw       mm4,    mm5
  70     psraw       mm1,    2           ; (a-b)/4
  71     psraw       mm4,    2
  72     psubw       mm1,    mm2         ; (a-b)/4-b
  73     psubw       mm4,    mm5
  74     paddw       mm1,    mm3         ; (a-b)/4-b+c
  75     paddw       mm4,    mm6
  76     psraw       mm1,    2           ; ((a-b)/4-b+c)/4
  77     psraw       mm4,    2
  78     paddw       mm1,    mm3         ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  79     paddw       mm4,    mm6
  80 %endmacro
  81
  82 %macro FILT_PACK 1
  83     paddw       mm1,    mm7
  84     paddw       mm4,    mm7
  85     psraw       mm1,    %1
  86     psraw       mm4,    %1
  87     packuswb    mm1,    mm4
  88 %endmacro
  89
  90
  91 ;=============================================================================
  92 ; Code
  93 ;=============================================================================
  94
  95 SECTION .text
  96
  97 cglobal x264_hpel_filter_mmxext
  98 cglobal x264_plane_copy_mmxext
  99
 100 ;-----------------------------------------------------------------------------
 101 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 102 ;                               int i_stride, int i_width, int i_height );
 103 ;-----------------------------------------------------------------------------
 104
 105 ALIGN 16
 106 x264_hpel_filter_mmxext :
 107
 108 %ifdef WIN64
 109     push        rdi
 110     pushreg     rdi
 111     push        rsi
 112     pushreg     rsi
 113 %endif
 114     push        rbp
 115     pushreg     rbp
 116     push        rbx
 117     pushreg     rbx
 118     mov         rbp,    rsp
 119     setframe    rbp, 0
 120     endprolog
 121
 122 %ifdef WIN64
 123     mov         rdi,    parm1q
 124     mov         rsi,    parm2q
 125     mov         rdx,    parm3q
 126     mov         rcx,    parm4q
 127     movsxd      r8,     dword [rbp+72]
 128     movsxd      r9,     dword [rbp+80]
 129     mov         ebx,    dword [rbp+88]
 130 %else
 131     mov         ebx,    dword [rbp+24]
 132 %endif
 133     %define     dsth    rdi
 134     %define     dstv    rsi
 135     %define     dstc    rdx
 136     %define     src     rcx
 137     %define     stride  r8
 138     %define     width   r9
 139     %define     height  ebx
 140     %define     stride3 r10
 141     %define     stride5 r11
 142     %define     x       rax
 143     %define     tbuffer rsp + 8
 144
 145     lea         stride3, [stride*3]
 146     lea         stride5, [stride*5]
 147     sub         src,    stride
 148     sub         src,    stride
 149
 150     lea         rax,    [stride*2 + 24]
 151     sub         rsp,    rax
 152
 153     pxor        mm0,    mm0
 154
 155 .loopy:
 156
 157     xor         x,      x
 158 ALIGN 16
 159 .vertical_filter:
 160
 161     prefetcht0  [src + stride5 + 32]
 162
 163     LOAD_ADD    mm1,    [src               ], [src + stride5     ] ; a0
 164     LOAD_ADD    mm2,    [src + stride      ], [src + stride*4    ] ; b0
 165     LOAD_ADD    mm3,    [src + stride*2    ], [src + stride3     ] ; c0
 166     LOAD_ADD    mm4,    [src            + 4], [src + stride5  + 4] ; a1
 167     LOAD_ADD    mm5,    [src + stride   + 4], [src + stride*4 + 4] ; b1
 168     LOAD_ADD    mm6,    [src + stride*2 + 4], [src + stride3  + 4] ; c1
 169
 170     FILT_V
 171
 172     movq        mm7,    [pw_16 GLOBAL]
 173     movq        [tbuffer + x*2],  mm1
 174     movq        [tbuffer + x*2 + 8],  mm4
 175     paddw       mm1,    mm7
 176     paddw       mm4,    mm7
 177     psraw       mm1,    5
 178     psraw       mm4,    5
 179     packuswb    mm1,    mm4
 180     movntq      [dstv + x], mm1
 181
 182     add         x,      8
 183     add         src,    8
 184     cmp         x,      width
 185     jle         .vertical_filter
 186
 187     pshufw      mm2, [tbuffer], 0
 188     movq        [tbuffer - 8], mm2 ; pad left
 189     ; no need to pad right, since vertical_filter already did 4 extra pixels
 190
 191     sub         src,    x
 192     xor         x,      x
 193     movq        mm7,    [pw_32 GLOBAL]
 194 .center_filter:
 195
 196     movq        mm1,    [tbuffer + x*2 - 4 ]
 197     movq        mm2,    [tbuffer + x*2 - 2 ]
 198     movq        mm3,    [tbuffer + x*2     ]
 199     movq        mm4,    [tbuffer + x*2 + 4 ]
 200     movq        mm5,    [tbuffer + x*2 + 6 ]
 201     paddw       mm3,    [tbuffer + x*2 + 2 ] ; c0
 202     paddw       mm2,    mm4                  ; b0
 203     paddw       mm1,    mm5                  ; a0
 204     movq        mm6,    [tbuffer + x*2 + 8 ]
 205     paddw       mm4,    [tbuffer + x*2 + 14] ; a1
 206     paddw       mm5,    [tbuffer + x*2 + 12] ; b1
 207     paddw       mm6,    [tbuffer + x*2 + 10] ; c1
 208
 209     FILT_H
 210     FILT_PACK 6
 211     movntq      [dstc + x], mm1
 212
 213     add         x,      8
 214     cmp         x,      width
 215     jl          .center_filter
 216
 217     lea         src,    [src + stride*2]
 218     xor         x,      x
 219 .horizontal_filter:
 220
 221     movd        mm1,    [src + x - 2]
 222     movd        mm2,    [src + x - 1]
 223     movd        mm3,    [src + x    ]
 224     movd        mm6,    [src + x + 1]
 225     movd        mm4,    [src + x + 2]
 226     movd        mm5,    [src + x + 3]
 227     punpcklbw   mm1,    mm0
 228     punpcklbw   mm2,    mm0
 229     punpcklbw   mm3,    mm0
 230     punpcklbw   mm6,    mm0
 231     punpcklbw   mm4,    mm0
 232     punpcklbw   mm5,    mm0
 233     paddw       mm3,    mm6 ; c0
 234     paddw       mm2,    mm4 ; b0
 235     paddw       mm1,    mm5 ; a0
 236     movd        mm7,    [src + x + 7]
 237     movd        mm6,    [src + x + 6]
 238     punpcklbw   mm7,    mm0
 239     punpcklbw   mm6,    mm0
 240     paddw       mm4,    mm7 ; c1
 241     paddw       mm5,    mm6 ; b1
 242     movd        mm7,    [src + x + 5]
 243     movd        mm6,    [src + x + 4]
 244     punpcklbw   mm7,    mm0
 245     punpcklbw   mm6,    mm0
 246     paddw       mm6,    mm7 ; a1
 247
 248     movq        mm7,    [pw_1 GLOBAL]
 249     FILT_H
 250     FILT_PACK 1
 251     movntq      [dsth + x], mm1
 252
 253     add         x,      8
 254     cmp         x,      width
 255     jl          .horizontal_filter
 256
 257     sub         src,    stride
 258     add         dsth,   stride
 259     add         dstv,   stride
 260     add         dstc,   stride
 261     dec         height
 262     jg          .loopy
 263
 264     mov         rsp,    rbp
 265     pop         rbx
 266     pop         rbp
 267 %ifdef WIN64
 268     pop         rsi
 269     pop         rdi
 270 %endif
 271     ret
 272
 273
 274
 275 ;-----------------------------------------------------------------------------
 276 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
 277 ;                              uint8_t *src, int i_src, int w, int h)
 278 ;-----------------------------------------------------------------------------
 279 ALIGN 16
 280 x264_plane_copy_mmxext:
 281     movsxd parm2q, parm2d
 282     movsxd parm4q, parm4d
 283     add    parm5d, 3
 284     and    parm5d, ~3
 285     sub    parm2q, parm5q
 286     sub    parm4q, parm5q
 287     ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
 288     xchg   rsi, rdx
 289     mov    rax, parm4q
 290 .loopy:
 291     mov    ecx, parm5d
 292     sub    ecx, 64
 293     jl     .endx
 294 .loopx:
 295     prefetchnta [rsi+256]
 296     movq   mm0, [rsi   ]
 297     movq   mm1, [rsi+ 8]
 298     movq   mm2, [rsi+16]
 299     movq   mm3, [rsi+24]
 300     movq   mm4, [rsi+32]
 301     movq   mm5, [rsi+40]
 302     movq   mm6, [rsi+48]
 303     movq   mm7, [rsi+56]
 304     movntq [rdi   ], mm0
 305     movntq [rdi+ 8], mm1
 306     movntq [rdi+16], mm2
 307     movntq [rdi+24], mm3
 308     movntq [rdi+32], mm4
 309     movntq [rdi+40], mm5
 310     movntq [rdi+48], mm6
 311     movntq [rdi+56], mm7
 312     add    rsi, 64
 313     add    rdi, 64
 314     sub    ecx, 64
 315     jge    .loopx
 316 .endx:
 317     prefetchnta [rsi+256]
 318     add    ecx, 64
 319     shr    ecx, 2
 320     rep movsd
 321     add    rdi, rdx
 322     add    rsi, rax
 323     sub    parm6d, 1
 324     jg     .loopy
 325     emms
 326     ret
 327