git.sesse.net Git - x264/blob - common/amd64/mc-a2.asm

   1 ;*****************************************************************************
   2 ;* mc-a2.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005 x264 project
   5 ;*
   6 ;* This program is free software; you can redistribute it and/or modify
   7 ;* it under the terms of the GNU General Public License as published by
   8 ;* the Free Software Foundation; either version 2 of the License, or
   9 ;* (at your option) any later version.
  10 ;*
  11 ;* This program is distributed in the hope that it will be useful,
  12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ;* GNU General Public License for more details.
  15 ;*
  16 ;* You should have received a copy of the GNU General Public License
  17 ;* along with this program; if not, write to the Free Software
  18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  19 ;*****************************************************************************
  20
  21 BITS 64
  22
  23 ;=============================================================================
  24 ; Macros and other preprocessor constants
  25 ;=============================================================================
  26
     ; GLOBAL is appended to every data-label reference in this file, as in
     ; [mmx_dw_one GLOBAL].  Builds that define __PIC__ get "wrt rip"; all other
     ; builds get an empty expansion, leaving the plain label reference.
  27 %ifndef __PIC__
  28     %define GLOBAL
  29 %else
  30     %define GLOBAL wrt rip
  31 %endif
  32
  33 %macro cglobal 1
     ; cglobal name
     ; Exports the symbol given as the single argument.  When PREFIX is defined
     ; the exported symbol gets a leading underscore, and the plain name is
     ; redefined to the underscored one so that later uses of the plain name
     ; in this file refer to the exported symbol.
  34     %ifndef PREFIX
  35         global %1
  36     %else
  37         global _%1
  38         %define %1 _%1
  39     %endif
  40 %endmacro
  41
  42 ;=============================================================================
  43 ; Read only data
  44 ;=============================================================================
  45
  46 SECTION .rodata
  47
     ; 64-bit MMX constants.  The label names do not describe the stored values:
     ;   mmx_dw_one  four words of 16       (added before the ">> 5" shifts)
     ;   mmx_dd_one  two dwords of 512      (added before the ">> 10" shift)
     ;   mmx_dw_20   four words of 20       (filter tap)
     ;   mmx_dw_5    four words of -5       (filter tap, note the sign)
  48 ALIGN 16
  49 mmx_dw_one:
  50     dw 16, 16, 16, 16
  51 mmx_dd_one:
  52     dd 512, 512
  53 mmx_dw_20:
  54     dw 20, 20, 20, 20
  55 mmx_dw_5:
  56     dw -5, -5, -5, -5
  57
     ; Byte offset of the scratch row buffer relative to rsp in
     ; x264_center_filter_mmxext.
  58 %assign tbuffer 0
  59
  60 ;=============================================================================
  61 ; Macros
  62 ;=============================================================================
  63
  64 %macro LOAD_4 9
     ; LOAD_4 d1, d2, d3, d4, m1, m2, m3, m4, z
     ; Loads 32 bits from each of the four source operands (arguments 5 to 8)
     ; into the four MMX registers (arguments 1 to 4), then interleaves the low
     ; four bytes of each register with the low bytes of argument 9.
     ; Every use in this file passes mm0 after it has been cleared with pxor,
     ; so the four loaded bytes become four zero-extended 16-bit words.
  65     movd %1, %5
  66     movd %2, %6
  67     movd %3, %7
  68     movd %4, %8
  69     punpcklbw %1, %9
  70     punpcklbw %2, %9
  71     punpcklbw %3, %9
  72     punpcklbw %4, %9
  73 %endmacro
  74
  75 %macro FILT_2 2
     ; FILT_2 acc, x
     ; Per 16-bit lane: acc = acc - 5 * x, computed as acc - x - 4 * x.
     ; Side effect: x is left holding 4 * (its original value).
  76     psubw %1, %2
  77     psllw %2, 2
  78     psubw %1, %2
  79 %endmacro
  80
  81 %macro FILT_4 3
     ; FILT_4 acc, a, b
     ; Per 16-bit lane: acc = acc + 20 * (a + b), computed as
     ; acc + 4 * (a + b) + 16 * (a + b).
     ; Side effect: a is left holding 16 * (a + b); b is not modified.
  82     paddw %2, %3
  83     psllw %2, 2
  84     paddw %1, %2
  85     psllw %2, 2
  86     paddw %1, %2
  87 %endmacro
  88
  89 %macro FILT_6 4
     ; FILT_6 acc, x, y, r
     ; Per 16-bit lane: acc = (acc - 5 * x + y + r) >> 5, arithmetic shift.
     ; The horizontal filter passes a register holding four words of 16 as r.
     ; Side effect: x is left holding 4 * (its original value); y and r are
     ; not modified, which the horizontal filter relies on when it reuses y.
  90     psubw %1, %2
  91     psllw %2, 2
  92     psubw %1, %2
  93     paddw %1, %3
  94     paddw %1, %4
  95     psraw %1, 5
  96 %endmacro
  97
  98 %macro FILT_ALL 1
     ; FILT_ALL base
     ; Vertical 6-tap filter over four adjacent pixels.  With p0..p5 being the
     ; four bytes at base, base + rcx, base + 2 * rcx, base + rbx,
     ; base + 4 * rcx and base + rdx, each zero-extended to 16 bits, it leaves
     ;     mm1 = p0 - 5 * p1 + 20 * (p2 + p3) - 5 * p4 + p5
     ; with no rounding and no shift.
     ; Expects mm0 = 0, and x264_center_filter_mmxext sets rcx = src_stride,
     ; rbx = 3 * src_stride, rdx = 5 * src_stride so that p0..p5 are six
     ; consecutive rows.  Overwrites mm1 to mm6.
  99     LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
 100     FILT_2      mm1, mm2
 101     movd        mm5, [%1 + 4 * rcx]
 102     movd        mm6, [%1 + rdx]
 103     FILT_4      mm1, mm3, mm4
 104     punpcklbw   mm5, mm0
 105     punpcklbw   mm6, mm0
     ; the next three instructions subtract 5 * p4, then p5 is added
 106     psubw       mm1, mm5
 107     psllw       mm5, 2
 108     psubw       mm1, mm5
 109     paddw       mm1, mm6
 110 %endmacro
 111
 112
 113
 114
 115 ;=============================================================================
 116 ; Code
 117 ;=============================================================================
 118
 119 SECTION .text
 120
     ; Exported entry points.  The center and horizontal filters are defined
     ; below; no definition of x264_vertical_filter_mmxext is visible in this
     ; part of the file.
 121 cglobal x264_center_filter_mmxext
 122 cglobal x264_horizontal_filter_mmxext
 123 cglobal x264_vertical_filter_mmxext
 124
 125 ;-----------------------------------------------------------------------------
 126 ;
 127 ; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
 128 ;                                 uint8_t *dst2, int i_dst2_stride,
 129 ;                                  uint8_t *src, int i_src_stride,
 130 ;                                  int i_width, int i_height );
 131 ;
     ; For each output row:
     ;   1. FILT_ALL applies the taps (1, -5, 20, 20, -5, 1) vertically to
     ;      source rows -2..+3, four pixels at a time; the unscaled 16-bit sums
     ;      are kept in a scratch buffer on the stack.
     ;   2. dst1 receives (sum + 16) >> 5, saturated to 8 bits.
     ;   3. The same taps are applied horizontally to the buffered sums and
     ;      dst2 receives (result + 512) >> 10, saturated to 8 bits.
     ;
     ; Arguments are read from rdi, esi, rdx, ecx, r8, r9d, and i_width and
     ; i_height from the stack.  rbx, rbp and r12-r15 are saved and restored.
     ; MMX registers are used and no emms is executed in this routine.
     ;
     ; The column loops advance by 4 and exit only when the counter equals
     ; i_width, and the first comparison is made at 12, so i_width has to be a
     ; multiple of 4 that is at least 12.  The row loop decrements before
     ; testing, so i_height has to be at least 1.
     ; The scratch buffer is 2 * i_src_stride + 24 bytes, of which
     ; 2 * i_width + 16 bytes are written per row.
 132 ;-----------------------------------------------------------------------------
 133
 134 ALIGN 16
 135 x264_center_filter_mmxext :
 136
 137     push        rbp
 138     push        rbx
 139     push        r12
 140     push        r13
 141     push        r14
 142     push        r15
 143     mov         rbp,    rsp                 ; rbp marks the frame; rsp is restored from it on exit
 144
 145     movsxd      r13,    r9d                 ; src_stride
 146     mov         r12,    r8                  ; src
 147     sub         r12,    r13
 148     sub         r12,    r13                 ; tsrc = src - 2 * src_stride
 149
 150     ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
 151     lea         rax,    [r13 + r13 + 24 + tbuffer]
 152     sub         rsp,    rax
 153
 154     mov         r10,    rdx                 ; dst2
 155     movsxd      r11,    ecx                 ; dst2_stride
 156     mov         r8,     rdi                 ; dst1
 157     movsxd      r9,     esi                 ; dst1_stride
     ; six 8-byte pushes put the return address at [rbp + 48], so the two
     ; stack arguments follow at +56 and +64
 158     movsxd      r14,    dword [rbp + 56]    ; width
 159     movsxd      r15,    dword [rbp + 64]    ; height
 160
     ; row offsets used by FILT_ALL
 161     mov         rcx,    r13                 ; src_stride
 162     lea         rbx,    [r13 + r13 * 2]     ; 3 * src_stride
 163     lea         rdx,    [r13 + r13 * 4]     ; 5 * src_stride
 164
 165     pxor        mm0,    mm0                 ; 0 ---> mm0
 166     movq        mm7,    [mmx_dd_one GLOBAL] ; never read: loopcx2 clears mm7 first and adds the rounding constant from memory
 167
 168 loopcy:
 169
 170     xor         rax,    rax
 171     mov         rsi,    r12             ; tsrc
 172
     ; first four pixels of the row
 173     FILT_ALL    rsi
 174
     ; Buffer layout: bytes 0..7 hold four copies of the first sum (left
     ; padding), and the sum for pixel x lives at byte offset 8 + 2 * x.
 175     pshufw      mm2,    mm1, 0          ; broadcast word 0 of mm1
 176     movq        [rsp + tbuffer],  mm2
 177     movq        [rsp + tbuffer + 8],  mm1
 178     paddw       mm1,    [mmx_dw_one GLOBAL]     ; + 16
 179     psraw       mm1,    5
 180
 181     packuswb    mm1,    mm1
 182     movd        [r8],   mm1             ; dst1[0] = mm1
 183
     ; rax runs 4 ahead of the pixel index from here on, which is why rdi is
     ; biased by -4 and the buffer is addressed with 2 * rax and no +8
 184     add         rax,    8
 185     add         rsi,    4
 186     lea         rdi,    [r8 - 4]        ; rdi = dst1 - 4
 187
 188 loopcx1:
 189
 190     FILT_ALL    rsi
 191
 192     movq        [rsp + tbuffer + 2 * rax],  mm1
 193     paddw       mm1,    [mmx_dw_one GLOBAL]
 194     psraw       mm1,    5
 195     packuswb    mm1,    mm1
 196     movd        [rdi + rax],  mm1   ; dst1[rax - 4] = mm1
 197
 198     add         rsi,    4
 199     add         rax,    4
 200     cmp         rax,    r14         ; cmp rax, width
 201     jnz         loopcx1
 202
     ; last four pixels of the row, plus the right padding
 203     FILT_ALL    rsi
 204
     ; order 7 makes the words of mm2, low to high, words 3, 1, 0, 0 of mm1.
     ; NOTE(review): this is not a plain replication of the last sum, unlike
     ; the left edge; the last three dst2 columns of each row read these
     ; padding words - confirm callers do not depend on those columns.
 205     pshufw      mm2,    mm1,  7
 206     movq        [rsp + tbuffer + 2 * rax],  mm1
 207     movq        [rsp + tbuffer + 2 * rax + 8],  mm2
 208     paddw       mm1,    [mmx_dw_one GLOBAL]
 209     psraw       mm1,    5
 210     packuswb    mm1,    mm1
 211     movd        [rdi + rax],  mm1   ; dst1[rax - 4] = mm1
 212
 213     add         r12,    r13         ; tsrc = tsrc + src_stride
 214
 215     add         r8,     r9          ; dst1 = dst1 + dst1_stride
 216
 217     xor         rax,    rax
 218
     ; Horizontal pass over the buffered sums t[].  With x = rax + lane, the six
     ; loads below fetch t[x-1], t[x], t[x+1], t[x+2], t[x-2], t[x+3].
 219 loopcx2:
 220
 221     movq        mm2,    [rsp + 2 * rax + 2  + 4 + tbuffer]
 222     movq        mm3,    [rsp + 2 * rax + 4  + 4 + tbuffer]
 223     movq        mm4,    [rsp + 2 * rax + 6  + 4 + tbuffer]
 224     movq        mm5,    [rsp + 2 * rax + 8  + 4 + tbuffer]
 225     movq        mm1,    [rsp + 2 * rax      + 4 + tbuffer]
 226     movq        mm6,    [rsp + 2 * rax + 10 + 4 + tbuffer]
 227     paddw       mm2,    mm5         ; t[x-1] + t[x+2], tap -5
 228     paddw       mm3,    mm4         ; t[x]   + t[x+1], tap 20
 229     paddw       mm1,    mm6         ; t[x-2] + t[x+3], tap 1
 230
 231     movq        mm5,    [mmx_dw_20 GLOBAL]
 232     movq        mm4,    [mmx_dw_5 GLOBAL]
 233     movq        mm6,    mm1
 234     pxor        mm7,    mm7
 235
     ; interleave sums with the tap constants so that each pmaddwd lane
     ; produces -5 * (t[x-1] + t[x+2]) + 20 * (t[x] + t[x+1]) as a 32-bit value
 236     punpckhwd   mm5,    mm2
 237     punpcklwd   mm4,    mm3
 238     punpcklwd   mm2,    [mmx_dw_20 GLOBAL]
 239     punpckhwd   mm3,    [mmx_dw_5 GLOBAL]
 240
 241     pcmpgtw     mm7,    mm1         ; mm7 = all ones in lanes where mm1 < 0
 242
 243     pmaddwd     mm2,    mm4         ; outputs 0 and 1
 244     pmaddwd     mm3,    mm5         ; outputs 2 and 3
 245
     ; sign-extend the tap-1 sums to 32 bits using the mask in mm7
 246     punpcklwd   mm1,    mm7
 247     punpckhwd   mm6,    mm7
 248
 249     paddd       mm2,    mm1
 250     paddd       mm3,    mm6
 251
 252     paddd       mm2,    [mmx_dd_one GLOBAL]     ; + 512
 253     paddd       mm3,    [mmx_dd_one GLOBAL]
 254
 255     psrad       mm2,    10
 256     psrad       mm3,    10
 257
 258     packssdw    mm2,    mm3
 259     packuswb    mm2,    mm0         ; saturate to unsigned bytes; only the low four are stored
 260
 261     movd        [r10 + rax], mm2    ; dst2[rax] = mm2
 262
 263     add         rax,    4
 264     cmp         rax,    r14         ; cmp rax, width
 265     jnz         loopcx2
 266
 267     add         r10,    r11         ; dst2 += dst2_stride
 268
 269     dec         r15                 ; height
 270     test        r15,    r15         ; redundant: dec already set ZF
 271     jnz         loopcy
 272
 273     mov         rsp,    rbp         ; release the scratch buffer
 274
 275     pop         r15
 276     pop         r14
 277     pop         r13
 278     pop         r12
 279     pop         rbx
 280     pop         rbp
 281
 282     ret
 283
 284 ;-----------------------------------------------------------------------------
 285 ;
 286 ; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
 287 ;                                     uint8_t *src, int i_src_stride,
 288 ;                                     int i_width, int i_height );
 289 ;
     ; Applies the taps (1, -5, 20, 20, -5, 1) along each row:
     ;   dst[x] = sat8( ( src[x-2] - 5 * src[x-1] + 20 * src[x] + 20 * src[x+1]
     ;                    - 5 * src[x+2] + src[x+3] + 16 ) >> 5 )
     ; where sat8 is the unsigned byte saturation done by packuswb.
     ;
     ; Arguments are read from rdi, esi, rdx, ecx, r8d, r9d.  Only rax, rcx,
     ; rsi, rdi, r8, r10, r11 and MMX registers are written; no emms is
     ; executed in this routine.
     ;
     ; Eight pixels are produced per inner iteration and the loop exits only
     ; when the counter equals i_width, so i_width has to be a non-zero
     ; multiple of 8.  The row counter is decremented before it is tested, so
     ; i_height has to be at least 1.
     ; Each group of eight pixels starting at x reads bytes src[x-2 .. x+10].
 290 ;-----------------------------------------------------------------------------
 291
 292 ALIGN 16
 293 x264_horizontal_filter_mmxext :
 294     movsxd      r10,    esi                  ; dst_stride
 295     movsxd      r11,    ecx                  ; src_stride
 296     movsxd      r8,     r8d                  ; width
 297
 298 ;   mov         rdi,    rdi                  ; dst
 299     mov         rsi,    rdx                  ; src
 300
 301     pxor        mm0,    mm0
 302     movq        mm7,    [mmx_dw_one GLOBAL]  ; four words of 16: rounding term passed to FILT_6
 303
 304     movsxd      rcx,    r9d                  ; height
 305
 306     sub         rsi,    2                    ; the first tap sits two pixels to the left
 307
 308 loophy:
 309
 310     dec         rcx                          ; rows left after this one
 311     xor         rax,    rax
 312
 313 loophx:
 314
 315     prefetchnta [rsi + rax + 48]
 316
     ; Outputs 0..3 are accumulated in mm1 and outputs 4..7 in mm2.  The loads
     ; for the second half are interleaved with the arithmetic of the first.
 317     LOAD_4      mm1,    mm2, mm3, mm4, [rsi + rax], [rsi + rax + 1], [rsi + rax + 2], [rsi + rax + 3], mm0
 318     FILT_2      mm1,    mm2
 319     movd        mm5,    [rsi + rax + 4]
 320     movd        mm6,    [rsi + rax + 5]
 321     FILT_4      mm1,    mm3, mm4
     ; offset 4 is loaded a second time because FILT_6 below leaves mm5 scaled by 4
 322     movd        mm2,    [rsi + rax + 4]
 323     movd        mm3,    [rsi + rax + 6]
 324     punpcklbw   mm5,    mm0
 325     punpcklbw   mm6,    mm0
 326     FILT_6      mm1,    mm5, mm6, mm7
 327     movd        mm4,    [rsi + rax + 7]
 328     movd        mm5,    [rsi + rax + 8]
 329     punpcklbw   mm2,    mm0
 330     punpcklbw   mm3,    mm0                  ; mm2(1), mm3(20), mm6(-5) ready
 331     FILT_2      mm2,    mm6
 332     movd        mm6,    [rsi + rax + 9]
 333     punpcklbw   mm4,    mm0
 334     punpcklbw   mm5,    mm0                  ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
 335     FILT_4      mm2,    mm3, mm4
 336     punpcklbw   mm6,    mm0
 337     FILT_6      mm2,    mm5, mm6, mm7
 338
 339     packuswb    mm1,    mm2                  ; eight saturated bytes
 340     movq        [rdi + rax],  mm1
 341
 342     add         rax,    8
 343     cmp         rax,    r8                   ; cmp rax, width
 344     jnz         loophx
 345
 346     add         rsi,    r11                  ; src_pitch
 347     add         rdi,    r10                  ; dst_pitch
 348
 349     test        rcx,    rcx
 350     jnz         loophy
 351
 352     ret