git.sesse.net Git - x264/blob - common/i386/mc-a.asm

   1 ;*****************************************************************************
   2 ;* mc.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2003 x264 project
   5 ;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
   6 ;*
   7 ;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
   8 ;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23 ;*****************************************************************************
  24
  25 ;*****************************************************************************
  26 ;*                                                                           *
  27 ;*  Revision history:                                                        *
  28 ;*                                                                           *
  29 ;*  2004.05.17 ported mc_copy_w4/8/16 (CM)                                   *
  30 ;*                                                                           *
  31 ;*****************************************************************************
  32
  33 BITS 32
  34
  35 ;=============================================================================
  36 ; Macros and other preprocessor constants
  37 ;=============================================================================
  38
  39 %macro cglobal 1
  40         %ifdef PREFIX
  41                 global _%1
  42                 %define %1 _%1
  43         %else
  44                 global %1
  45         %endif
  46 %endmacro
  47
  48 ;=============================================================================
  49 ; Local Data (Read Only)
  50 ;=============================================================================
  51
  52 %ifdef FORMAT_COFF
  53 SECTION .rodata data
  54 %else
  55 SECTION .rodata data align=16
  56 %endif
  57
  58 ;-----------------------------------------------------------------------------
  59 ; Various memory constants (trigonometric values or rounding values)
  60 ;-----------------------------------------------------------------------------
  61
  62 ALIGN 16
  63
  64 ;=============================================================================
  65 ; Code
  66 ;=============================================================================
  67
  68 SECTION .text
  69
  70 cglobal x264_pixel_avg_w4_mmxext
  71 cglobal x264_pixel_avg_w8_mmxext
  72 cglobal x264_pixel_avg_w16_mmxext
  73 cglobal x264_pixel_avg_w16_sse2
  74
  75 cglobal x264_mc_copy_w4_mmxext
  76 cglobal x264_mc_copy_w8_mmxext
  77 cglobal x264_mc_copy_w16_mmxext
  78 cglobal x264_mc_copy_w16_sse2
  79
  80
  81 ALIGN 16
  82 ;-----------------------------------------------------------------------------
  83 ; void x264_pixel_avg_w4_mmxext( uint8_t *dst,  int i_dst_stride,
  84 ;                                uint8_t *src1, int i_src1_stride,
  85 ;                                uint8_t *src2, int i_src2_stride,
  86 ;                                int i_height );
  87 ;-----------------------------------------------------------------------------
  88 x264_pixel_avg_w4_mmxext:
  89     push        ebp
  90     push        ebx
  91     push        esi
  92     push        edi
  93
  94     mov         edi, [esp+20]       ; dst
  95     mov         ebx, [esp+28]       ; src1
  96     mov         ecx, [esp+36]       ; src2
  97     mov         esi, [esp+24]       ; i_dst_stride
  98     mov         eax, [esp+32]       ; i_src1_stride
  99     mov         edx, [esp+40]       ; i_src2_stride
 100     mov         ebp, [esp+44]       ; i_height
 101 ALIGN 4
 102 .height_loop
 103     movd        mm0, [ebx]
 104     pavgb       mm0, [ecx]
 105     movd        mm1, [ebx+eax]
 106     pavgb       mm1, [ecx+edx]
 107     movd        [edi], mm0
 108     movd        [edi+esi], mm1
 109     dec         ebp
 110     dec         ebp
 111     lea         ebx, [ebx+eax*2]
 112     lea         ecx, [ecx+edx*2]
 113     lea         edi, [edi+esi*2]
 114     jne         .height_loop
 115
 116     pop         edi
 117     pop         esi
 118     pop         ebx
 119     pop         ebp
 120     ret
 121
 122
 123
 124 ALIGN 16
 125 ;-----------------------------------------------------------------------------
 126 ; void x264_pixel_avg_w8_mmxext( uint8_t *dst,  int i_dst_stride,
 127 ;                                uint8_t *src1, int i_src1_stride,
 128 ;                                uint8_t *src2, int i_src2_stride,
 129 ;                                int i_height );
 130 ;-----------------------------------------------------------------------------
 131 x264_pixel_avg_w8_mmxext:
 132     push        ebp
 133     push        ebx
 134     push        esi
 135     push        edi
 136
 137     mov         edi, [esp+20]       ; dst
 138     mov         ebx, [esp+28]       ; src1
 139     mov         ecx, [esp+36]       ; src2
 140     mov         esi, [esp+24]       ; i_dst_stride
 141     mov         eax, [esp+32]       ; i_src1_stride
 142     mov         edx, [esp+40]       ; i_src2_stride
 143     mov         ebp, [esp+44]       ; i_height
 144 ALIGN 4
 145 .height_loop
 146     movq        mm0, [ebx]
 147     pavgb       mm0, [ecx]
 148     movq        [edi], mm0
 149     dec         ebp
 150     lea         ebx, [ebx+eax]
 151     lea         ecx, [ecx+edx]
 152     lea         edi, [edi+esi]
 153     jne         .height_loop
 154
 155     pop         edi
 156     pop         esi
 157     pop         ebx
 158     pop         ebp
 159     ret
 160
 161
 162
 163 ALIGN 16
 164 ;-----------------------------------------------------------------------------
 165 ; void x264_pixel_avg_w16_mmxext( uint8_t *dst,  int i_dst_stride,
 166 ;                                 uint8_t *src1, int i_src1_stride,
 167 ;                                 uint8_t *src2, int i_src2_stride,
 168 ;                                 int i_height );
 169 ;-----------------------------------------------------------------------------
 170 x264_pixel_avg_w16_mmxext:
 171     push        ebp
 172     push        ebx
 173     push        esi
 174     push        edi
 175
 176     mov         edi, [esp+20]       ; dst
 177     mov         ebx, [esp+28]       ; src1
 178     mov         ecx, [esp+36]       ; src2
 179     mov         esi, [esp+24]       ; i_dst_stride
 180     mov         eax, [esp+32]       ; i_src1_stride
 181     mov         edx, [esp+40]       ; i_src2_stride
 182     mov         ebp, [esp+44]       ; i_height
 183 ALIGN 4
 184 .height_loop
 185     movq        mm0, [ebx  ]
 186     movq        mm1, [ebx+8]
 187     pavgb       mm0, [ecx  ]
 188     pavgb       mm1, [ecx+8]
 189     movq        [edi  ], mm0
 190     movq        [edi+8], mm1
 191     dec         ebp
 192     lea         ebx, [ebx+eax]
 193     lea         ecx, [ecx+edx]
 194     lea         edi, [edi+esi]
 195     jne         .height_loop
 196
 197     pop         edi
 198     pop         esi
 199     pop         ebx
 200     pop         ebp
 201     ret
 202
 203 ALIGN 16
 204 ;-----------------------------------------------------------------------------
 205 ; void x264_pixel_avg_w16_sse2( uint8_t *dst,  int i_dst_stride,
 206 ;                               uint8_t *src1, int i_src1_stride,
 207 ;                               uint8_t *src2, int i_src2_stride,
 208 ;                               int i_height );
 209 ;-----------------------------------------------------------------------------
 210 x264_pixel_avg_w16_sse2:
 211     push        ebp
 212     push        ebx
 213     push        esi
 214     push        edi
 215
 216     mov         edi, [esp+20]       ; dst
 217     mov         ebx, [esp+28]       ; src1
 218     mov         ecx, [esp+36]       ; src2
 219     mov         esi, [esp+24]       ; i_dst_stride
 220     mov         eax, [esp+32]       ; i_src1_stride
 221     mov         edx, [esp+40]       ; i_src2_stride
 222     mov         ebp, [esp+44]       ; i_height
 223 ALIGN 4
 224 .height_loop
 225     movdqu      xmm0, [ebx]
 226     pavgb       xmm0, [ecx]
 227     movdqu      [edi], xmm0
 228
 229     dec         ebp
 230     lea         ebx, [ebx+eax]
 231     lea         ecx, [ecx+edx]
 232     lea         edi, [edi+esi]
 233     jne         .height_loop
 234
 235     pop         edi
 236     pop         esi
 237     pop         ebx
 238     pop         ebp
 239     ret
 240
 241
 242
 243 ALIGN 16
 244 ;-----------------------------------------------------------------------------
 245 ;  void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride,
 246 ;                               uint8_t *dst, int i_dst_stride, int i_height )
 247 ;-----------------------------------------------------------------------------
 248 x264_mc_copy_w4_mmxext:
 249     push    ebx
 250     push    esi
 251     push    edi
 252
 253     mov     esi, [esp+16]       ; src
 254     mov     edi, [esp+24]       ; dst
 255     mov     ebx, [esp+20]       ; i_src_stride
 256     mov     edx, [esp+28]       ; i_dst_stride
 257     mov     ecx, [esp+32]       ; i_height
 258 ALIGN 4
 259 .height_loop
 260     mov     eax, [esi]
 261     mov     [edi], eax
 262     mov     eax, [esi+ebx]
 263     mov     [edi+edx], eax
 264     lea     esi, [esi+ebx*2]
 265     lea     edi, [edi+edx*2]
 266     dec     ecx
 267     dec     ecx
 268     jne     .height_loop
 269
 270     pop     edi
 271     pop     esi
 272     pop     ebx
 273     ret
 274
 275 cglobal mc_copy_w8
 276
 277 ALIGN 16
 278 ;-----------------------------------------------------------------------------
 279 ;   void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride,
 280 ;                                uint8_t *dst, int i_dst_stride, int i_height )
 281 ;-----------------------------------------------------------------------------
 282 x264_mc_copy_w8_mmxext:
 283     push    ebx
 284     push    esi
 285     push    edi
 286
 287     mov     esi, [esp+16]       ; src
 288     mov     edi, [esp+24]       ; dst
 289     mov     ebx, [esp+20]       ; i_src_stride
 290     mov     edx, [esp+28]       ; i_dst_stride
 291     mov     ecx, [esp+32]       ; i_height
 292 ALIGN 4
 293 .height_loop
 294     movq    mm0, [esi]
 295     movq    [edi], mm0
 296     movq    mm1, [esi+ebx]
 297     movq    [edi+edx], mm1
 298     movq    mm2, [esi+ebx*2]
 299     movq    [edi+edx*2], mm2
 300     lea     esi, [esi+ebx*2]
 301     lea     edi, [edi+edx*2]
 302     movq    mm3, [esi+ebx]
 303     movq    [edi+edx], mm3
 304     lea     esi, [esi+ebx*2]
 305     lea     edi, [edi+edx*2]
 306
 307     sub     ecx, byte 4
 308     jnz     .height_loop
 309
 310     pop     edi
 311     pop     esi
 312     pop     ebx
 313     ret
 314
 315 cglobal mc_copy_w16
 316
 317 ALIGN 16
 318 ;-----------------------------------------------------------------------------
 319 ;   void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride,
 320 ;                                 uint8_t *dst, int i_dst_stride, int i_height )
 321 ;-----------------------------------------------------------------------------
 322 x264_mc_copy_w16_mmxext:
 323     push    ebx
 324     push    esi
 325     push    edi
 326
 327     mov     esi, [esp+16]       ; src
 328     mov     edi, [esp+24]       ; dst
 329     mov     ebx, [esp+20]       ; i_src_stride
 330     mov     edx, [esp+28]       ; i_dst_stride
 331     mov     ecx, [esp+32]       ; i_height
 332
 333 ALIGN 4
 334 .height_loop
 335     movq    mm0, [esi]
 336     movq    mm1, [esi+8]
 337     movq    [edi], mm0
 338     movq    [edi+8], mm1
 339     movq    mm2, [esi+ebx]
 340     movq    mm3, [esi+ebx+8]
 341     movq    [edi+edx], mm2
 342     movq    [edi+edx+8], mm3
 343     movq    mm4, [esi+ebx*2]
 344     movq    mm5, [esi+ebx*2+8]
 345     movq    [edi+edx*2], mm4
 346     movq    [edi+edx*2+8], mm5
 347     lea     esi, [esi+ebx*2]
 348     lea     edi, [edi+edx*2]
 349     movq    mm6, [esi+ebx]
 350     movq    mm7, [esi+ebx+8]
 351     movq    [edi+edx], mm6
 352     movq    [edi+edx+8], mm7
 353     lea     esi, [esi+ebx*2]
 354     lea     edi, [edi+edx*2]
 355     sub     ecx, byte 4
 356     jnz     .height_loop
 357
 358     pop     edi
 359     pop     esi
 360     pop     ebx
 361     ret
 362
 363
 364 ALIGN 16
 365 ;-----------------------------------------------------------------------------
 366 ;   void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 367 ;-----------------------------------------------------------------------------
 368 x264_mc_copy_w16_sse2:
 369     push    ebx
 370     push    esi
 371     push    edi
 372
 373     mov     esi, [esp+16]       ; src
 374     mov     edi, [esp+24]       ; dst
 375     mov     ebx, [esp+20]       ; i_src_stride
 376     mov     edx, [esp+28]       ; i_dst_stride
 377     mov     ecx, [esp+32]       ; i_height
 378
 379 ALIGN 4
 380 .height_loop
 381     movdqu  xmm0, [esi]
 382     movdqu  xmm1, [esi+ebx]
 383     movdqu  [edi], xmm0
 384     movdqu  [edi+edx], xmm1
 385     dec     ecx
 386     dec     ecx
 387     lea     esi, [esi+ebx*2]
 388     lea     edi, [edi+edx*2]
 389     jnz     .height_loop
 390
 391     pop     edi
 392     pop     esi
 393     pop     ebx
 394     ret