git.sesse.net Git - x264/blob - common/amd64/mc-a2.asm

   1 ;*****************************************************************************
   2 ;* mc-a2.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005 x264 project
   5 ;*
   6 ;* This program is free software; you can redistribute it and/or modify
   7 ;* it under the terms of the GNU General Public License as published by
   8 ;* the Free Software Foundation; either version 2 of the License, or
   9 ;* (at your option) any later version.
  10 ;*
  11 ;* This program is distributed in the hope that it will be useful,
  12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ;* GNU General Public License for more details.
  15 ;*
  16 ;* You should have received a copy of the GNU General Public License
  17 ;* along with this program; if not, write to the Free Software
  18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  19 ;*****************************************************************************
  20
  21 BITS 64
  22
  23 ;=============================================================================
  24 ; Macros and other preprocessor constants
  25 ;=============================================================================
  26
  27 %include "amd64inc.asm"
  28
  29 ;=============================================================================
  30 ; Read only data
  31 ;=============================================================================
  32
  33 SECTION_RODATA
  34
     ; Rounding constants added before the final arithmetic shift of each filter.
     ; Each label is four 16-bit lanes (8 bytes), i.e. one MMX-register operand.
  35 pw_1:  dw  1,  1,  1,  1      ; used with a shift by 1
  36 pw_16: dw 16, 16, 16, 16      ; used with a shift by 5
  37 pw_32: dw 32, 32, 32, 32      ; used with a shift by 6
  38
  39 ;=============================================================================
  40 ; Macros
  41 ;=============================================================================
  42
  43 %macro LOAD_ADD 3
     ; LOAD_ADD reg, mem_a, mem_b
     ; Loads 4 bytes from each of the two memory operands, widens them to
     ; 16-bit words and leaves the per-lane sum in reg (the first argument).
     ; Expects mm0 to be zero, so that punpcklbw acts as a byte-to-word
     ; zero extension.
     ; Clobbers mm7 (holds the widened second operand).
  44     movd        %1,     %2
  45     movd        mm7,    %3
  46     punpcklbw   %1,     mm0
  47     punpcklbw   mm7,    mm0
  48     paddw       %1,     mm7
  49 %endmacro
  50
  51 %macro FILT_V 0
     ; Computes a - 5*b + 20*c on 16-bit lanes for two independent register
     ; sets, interleaved instruction by instruction:
     ;   set 0: a = mm1, b = mm2, c = mm3  -> result in mm1
     ;   set 1: a = mm4, b = mm5, c = mm6  -> result in mm4
     ; No shift or rounding is applied here.
     ; Clobbers mm2, mm3, mm5, mm6 (they end up holding 4*(b-c) and 16*c).
  52     psubw       mm1,    mm2         ; a-b
  53     psubw       mm4,    mm5
  54     psubw       mm2,    mm3         ; b-c
  55     psubw       mm5,    mm6
  56     psllw       mm2,    2           ; 4*(b-c)
  57     psllw       mm5,    2
  58     psubw       mm1,    mm2         ; a-5*b+4*c
  59     psubw       mm4,    mm5
  60     psllw       mm3,    4           ; 16*c
  61     psllw       mm6,    4
  62     paddw       mm1,    mm3         ; a-5*b+20*c
  63     paddw       mm4,    mm6
  64 %endmacro
  65
  66 %macro FILT_H 0
     ; Computes an approximation of (a - 5*b + 20*c) / 16 on 16-bit lanes for
     ; two independent register sets, interleaved instruction by instruction:
     ;   set 0: a = mm1, b = mm2, c = mm3  -> result in mm1
     ;   set 1: a = mm4, b = mm5, c = mm6  -> result in mm4
     ; The division is done as two arithmetic right shifts by 2 applied to
     ; partial sums, so each shift truncates; the result is not bit-exact
     ; with computing the full sum first and shifting once.
     ; (Presumably this keeps the intermediates inside the 16-bit range.)
     ; mm2, mm3, mm5 and mm6 are only read, not modified.
  67     psubw       mm1,    mm2         ; a-b
  68     psubw       mm4,    mm5
  69     psraw       mm1,    2           ; (a-b)/4
  70     psraw       mm4,    2
  71     psubw       mm1,    mm2         ; (a-b)/4-b
  72     psubw       mm4,    mm5
  73     paddw       mm1,    mm3         ; (a-b)/4-b+c
  74     paddw       mm4,    mm6
  75     psraw       mm1,    2           ; ((a-b)/4-b+c)/4
  76     psraw       mm4,    2
  77     paddw       mm1,    mm3         ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  78     paddw       mm4,    mm6
  79 %endmacro
  80
  81 %macro FILT_PACK 1
     ; FILT_PACK shift
     ; Final rounding and packing step for the two word vectors in mm1 and mm4:
     ; adds mm7 to both (the caller loads the rounding constant into mm7),
     ; shifts both right arithmetically by the macro argument, then packs the
     ; eight words into eight bytes in mm1 with unsigned saturation.
     ; mm4 is left holding its shifted words; mm7 is only read.
  82     paddw       mm1,    mm7
  83     paddw       mm4,    mm7
  84     psraw       mm1,    %1
  85     psraw       mm4,    %1
  86     packuswb    mm1,    mm4
  87 %endmacro
  88
  89
  90 ;=============================================================================
  91 ; Code
  92 ;=============================================================================
  93
  94 SECTION .text
  95
  96 ;-----------------------------------------------------------------------------
  97 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
  98 ;                               int i_stride, int i_width, int i_height );
     ;
     ; For every row, applies the 6-tap kernel (1,-5,20,20,-5,1) three ways:
     ;   dstv : vertically over rows src-2*stride .. src+3*stride, (sum+16)>>5
     ;   dstc : horizontally (FILT_H) over the unrounded vertical sums that are
     ;          kept as 16-bit words in a stack buffer, then (x+32)>>6
     ;   dsth : horizontally (FILT_H) over the current source row, then (x+1)>>1
     ; All outputs are packed to bytes with unsigned saturation and written
     ; 8 bytes at a time with non-temporal stores (movntq). The three
     ; destinations and src all advance by i_stride per row.
     ;
     ; The vertical pass iterates while x <= i_width, so it reads src and writes
     ; dstv/the stack buffer one 8-pixel group further than the other passes.
     ; The horizontal pass reads src from x-2 up to x+10.
     ;
     ; Register roles after setup:
     ;   rdi=dsth  rsi=dstv  rdx=dstc  rcx=src  r8=stride  r9=width  ebx=height
     ;   r10=3*stride  r11=5*stride  rax=x (column offset)  mm0=0
     ; Stack: 2*i_stride+24 bytes below the saved registers; tbuffer = rsp+8,
     ; and the 8 bytes at rsp hold the left padding for the center filter.
     ;
     ; NOTE(review): the function uses MMX registers and movntq but executes
     ; neither emms nor sfence before returning; presumably the caller is
     ; responsible for both - confirm against the call sites.
  99 ;-----------------------------------------------------------------------------
 100 cglobal x264_hpel_filter_mmxext
 101
     ; pushreg/setframe/endprolog are macros that are not defined in this file
     ; (presumably unwind annotations supplied by amd64inc.asm).
 102 %ifdef WIN64
 103     push        rdi                 ; rdi/rsi are callee-saved on WIN64
 104     pushreg     rdi
 105     push        rsi
 106     pushreg     rsi
 107 %endif
 108     push        rbp
 109     pushreg     rbp
 110     push        rbx
 111     pushreg     rbx
 112     mov         rbp,    rsp
 113     setframe    rbp, 0
 114     endprolog
 115
 116 %ifdef WIN64
     ; Move the four register arguments into the registers the body expects.
     ; Stack arguments: 4 pushes (32) + return address (8) + shadow space (32)
     ; puts the 5th argument at rbp+72.
 117     mov         rdi,    parm1q
 118     mov         rsi,    parm2q
 119     mov         rdx,    parm3q
 120     mov         rcx,    parm4q
 121     movsxd      r8,     dword [rbp+72]
 122     movsxd      r9,     dword [rbp+80]
 123     mov         ebx,    dword [rbp+88]
 124 %else
     ; i_stride and i_width arrive as 32-bit ints in r8d/r9d. The upper halves
     ; of r8/r9 are not defined by the calling convention, and both registers
     ; are used as full 64-bit values below (addressing and cmp), so
     ; sign-extend them exactly as the WIN64 path does.
         movsxd      r8,     r8d
         movsxd      r9,     r9d
     ; 7th argument: 2 pushes (16) + return address (8) above rbp.
 125     mov         ebx,    dword [rbp+24]
 126 %endif
 127     %define     dsth    rdi
 128     %define     dstv    rsi
 129     %define     dstc    rdx
 130     %define     src     rcx
 131     %define     stride  r8
 132     %define     width   r9
 133     %define     height  ebx
 134     %define     stride3 r10
 135     %define     stride5 r11
 136     %define     x       rax
 137     %define     tbuffer rsp + 8
 138
 139     lea         stride3, [stride*3]
 140     lea         stride5, [stride*5]
 141     sub         src,    stride
 142     sub         src,    stride      ; src now points two rows above the current row
 143
     ; Reserve the intermediate buffer (rax is free here; x is zeroed later).
 144     lea         rax,    [stride*2 + 24]
 145     sub         rsp,    rax
 146
 147     pxor        mm0,    mm0         ; zero register required by LOAD_ADD / punpcklbw
 148
 149 .loopy:
 150
 151     xor         x,      x
 152 ALIGN 16
 153 .vertical_filter:
 154
 155     prefetcht0  [src + stride5 + 32]
 156
     ; a = rows 0 and 5, b = rows 1 and 4, c = rows 2 and 3 (relative to src);
     ; set 0 covers pixels x..x+3, set 1 covers pixels x+4..x+7.
 157     LOAD_ADD    mm1,    [src               ], [src + stride5     ] ; a0
 158     LOAD_ADD    mm2,    [src + stride      ], [src + stride*4    ] ; b0
 159     LOAD_ADD    mm3,    [src + stride*2    ], [src + stride3     ] ; c0
 160     LOAD_ADD    mm4,    [src            + 4], [src + stride5  + 4] ; a1
 161     LOAD_ADD    mm5,    [src + stride   + 4], [src + stride*4 + 4] ; b1
 162     LOAD_ADD    mm6,    [src + stride*2 + 4], [src + stride3  + 4] ; c1
 163
 164     FILT_V
 165
     ; Keep the unrounded 16-bit sums for the center filter, then round,
     ; shift and pack a copy for dstv. mm7 was clobbered by LOAD_ADD.
 166     movq        mm7,    [pw_16 GLOBAL]
 167     movq        [tbuffer + x*2],  mm1
 168     movq        [tbuffer + x*2 + 8],  mm4
 169     paddw       mm1,    mm7
 170     paddw       mm4,    mm7
 171     psraw       mm1,    5
 172     psraw       mm4,    5
 173     packuswb    mm1,    mm4
 174     movntq      [dstv + x], mm1
 175
 176     add         x,      8
 177     add         src,    8
 178     cmp         x,      width
 179     jle         .vertical_filter    ; "<=": runs one group past the other passes
 180
     ; Replicate the first buffered word into the 4 words left of the buffer,
     ; which the center filter reads at negative offsets when x = 0.
 181     pshufw      mm2, [tbuffer], 0
 182     movq        [tbuffer - 8], mm2 ; pad left
 183     ; no need to pad right, since vertical_filter already did 4 extra pixels
 184
 185     sub         src,    x           ; undo the per-column advance of src
 186     xor         x,      x
 187     movq        mm7,    [pw_32 GLOBAL]
 188 .center_filter:
 189
     ; tbuffer holds words, so pixel offset k is byte offset 2*k.
     ; a = t[-2]+t[3], b = t[-1]+t[2], c = t[0]+t[1], for pixels x..x+3 (set 0)
     ; and x+4..x+7 (set 1).
 190     movq        mm1,    [tbuffer + x*2 - 4 ]
 191     movq        mm2,    [tbuffer + x*2 - 2 ]
 192     movq        mm3,    [tbuffer + x*2     ]
 193     movq        mm4,    [tbuffer + x*2 + 4 ]
 194     movq        mm5,    [tbuffer + x*2 + 6 ]
 195     paddw       mm3,    [tbuffer + x*2 + 2 ] ; c0
 196     paddw       mm2,    mm4                  ; b0
 197     paddw       mm1,    mm5                  ; a0
 198     movq        mm6,    [tbuffer + x*2 + 8 ]
 199     paddw       mm4,    [tbuffer + x*2 + 14] ; a1
 200     paddw       mm5,    [tbuffer + x*2 + 12] ; b1
 201     paddw       mm6,    [tbuffer + x*2 + 10] ; c1
 202
 203     FILT_H
 204     FILT_PACK 6
 205     movntq      [dstc + x], mm1
 206
 207     add         x,      8
 208     cmp         x,      width
 209     jl          .center_filter
 210
 211     lea         src,    [src + stride*2]    ; src = current row
 212     xor         x,      x
 213 .horizontal_filter:
 214
     ; mm4/mm5 serve twice: as the right-hand taps of set 0 (b0, a0) and as
     ; the left-hand taps of set 1 (a1, b1).
 215     movd        mm1,    [src + x - 2]
 216     movd        mm2,    [src + x - 1]
 217     movd        mm3,    [src + x    ]
 218     movd        mm6,    [src + x + 1]
 219     movd        mm4,    [src + x + 2]
 220     movd        mm5,    [src + x + 3]
 221     punpcklbw   mm1,    mm0
 222     punpcklbw   mm2,    mm0
 223     punpcklbw   mm3,    mm0
 224     punpcklbw   mm6,    mm0
 225     punpcklbw   mm4,    mm0
 226     punpcklbw   mm5,    mm0
 227     paddw       mm3,    mm6 ; c0
 228     paddw       mm2,    mm4 ; b0
 229     paddw       mm1,    mm5 ; a0
 230     movd        mm7,    [src + x + 7]
 231     movd        mm6,    [src + x + 6]
 232     punpcklbw   mm7,    mm0
 233     punpcklbw   mm6,    mm0
 234     paddw       mm4,    mm7 ; a1 = p[x+2..] + p[x+7..]
 235     paddw       mm5,    mm6 ; b1
 236     movd        mm7,    [src + x + 5]
 237     movd        mm6,    [src + x + 4]
 238     punpcklbw   mm7,    mm0
 239     punpcklbw   mm6,    mm0
 240     paddw       mm6,    mm7 ; c1 = p[x+4..] + p[x+5..]
 241
     ; mm7 was used as a temporary above, so the rounding constant has to be
     ; reloaded on every iteration.
 242     movq        mm7,    [pw_1 GLOBAL]
 243     FILT_H
 244     FILT_PACK 1
 245     movntq      [dsth + x], mm1
 246
 247     add         x,      8
 248     cmp         x,      width
 249     jl          .horizontal_filter
 250
     ; src is at the current row; stepping back one row leaves it two rows
     ; above the next row, which is what the vertical filter expects.
 251     sub         src,    stride
 252     add         dsth,   stride
 253     add         dstv,   stride
 254     add         dstc,   stride
 255     dec         height
 256     jg          .loopy
 257
 258     mov         rsp,    rbp         ; release the intermediate buffer
 259     pop         rbx
 260     pop         rbp
 261 %ifdef WIN64
 262     pop         rsi
 263     pop         rdi
 264 %endif
 265     ret
 266
 267
 268
 269 ;-----------------------------------------------------------------------------
 270 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
 271 ;                              uint8_t *src, int i_src, int w, int h)
     ;
     ; Copies h rows from src to dst. w is rounded up to a multiple of 4, so up
     ; to 3 bytes beyond w are read and written per row. Each row is copied in
     ; 64-byte chunks through mm0-mm7 with non-temporal stores, and the
     ; remainder (fewer than 64 bytes) with rep movsd.
     ; The row loop tests h only after a row has been copied, so one row is
     ; copied even when h <= 0.
     ;
     ; NOTE(review): the register shuffle below only lines up if parm1q=rdi,
     ; parm2q=rsi, parm3q=rdx and parm4q=rcx; the parmN macros are defined
     ; outside this file - confirm. Unlike x264_hpel_filter_mmxext there is
     ; no WIN64 branch here and rsi/rdi are modified without being saved.
 272 ;-----------------------------------------------------------------------------
 273 cglobal x264_plane_copy_mmxext
 274     movsxd parm2q, parm2d          ; sign-extend i_dst to 64 bits
 275     movsxd parm4q, parm4d          ; sign-extend i_src to 64 bits
 276     add    parm5d, 3
 277     and    parm5d, ~3              ; w rounded up to a multiple of 4
 278     sub    parm2q, parm5q          ; i_dst - w: gap from end of row to next dst row
 279     sub    parm4q, parm5q          ; i_src - w: gap from end of row to next src row
 280     ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
 281     xchg   rsi, rdx                ; rsi takes the old rdx, rdx takes the old rsi
 282     mov    rax, parm4q             ; keep the src row gap; ecx is needed as counter
 283 .loopy:
 284     mov    ecx, parm5d
 285     sub    ecx, 64                 ; ecx = bytes left after one 64-byte chunk
 286     jl     .endx                   ; row shorter than 64 bytes: tail copy only
 287 .loopx:
 288     prefetchnta [rsi+256]
     ; All eight loads are issued before the eight stores.
 289     movq   mm0, [rsi   ]
 290     movq   mm1, [rsi+ 8]
 291     movq   mm2, [rsi+16]
 292     movq   mm3, [rsi+24]
 293     movq   mm4, [rsi+32]
 294     movq   mm5, [rsi+40]
 295     movq   mm6, [rsi+48]
 296     movq   mm7, [rsi+56]
 297     movntq [rdi   ], mm0
 298     movntq [rdi+ 8], mm1
 299     movntq [rdi+16], mm2
 300     movntq [rdi+24], mm3
 301     movntq [rdi+32], mm4
 302     movntq [rdi+40], mm5
 303     movntq [rdi+48], mm6
 304     movntq [rdi+56], mm7
 305     add    rsi, 64
 306     add    rdi, 64
 307     sub    ecx, 64
 308     jge    .loopx                  ; another full chunk remains
 309 .endx:
 310     prefetchnta [rsi+256]
 311     add    ecx, 64                 ; undo the bias: remaining bytes in this row
 312     shr    ecx, 2                  ; bytes -> dwords (w is a multiple of 4)
 313     rep movsd                      ; copies the tail and advances rsi/rdi past it
 314     add    rdi, rdx                ; skip the dst row gap
 315     add    rsi, rax                ; skip the src row gap
 316     sub    parm6d, 1
 317     jg     .loopy                  ; repeat while rows remain
 318     emms                           ; leave MMX state before returning
 319     ret
 320