;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
-;* This file is part of FFmpeg.
+;* This file is part of Libav.
;*
-;* FFmpeg is free software; you can redistribute it and/or
+;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* FFmpeg is distributed in the hope that it will be useful,
+;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
-%include "x86inc.asm"
+%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; biweight pred:
;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
-; int log2_denom, int weightd, int weights,
-; int offset);
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+; int height, int log2_denom, int weightd,
+; int weights, int offset);
; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
-; int log2_denom, int weight,
-; int offset);
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
- add r4, r4
- inc r4
- movd m3, r3
- movd m5, r4
- movd m6, r2
+ add r5, r5
+ inc r5
+ movd m3, r4d
+ movd m5, r5d
+ movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
packuswb m0, m1
%endmacro
-%macro WEIGHT_FUNC_DBL_MM 1
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+INIT_MMX mmxext
+cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
- mov r2, %1
-%if %1 == 16
-.nextrow
+.nextrow:
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
-%endif
-%endmacro
-INIT_MMX
-WEIGHT_FUNC_DBL_MM 16
-WEIGHT_FUNC_DBL_MM 8
-
-%macro WEIGHT_FUNC_MM 4
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
+%macro WEIGHT_FUNC_MM 2
+cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
- mov r2, %2
-%if %2 == 16
-.nextrow
+.nextrow:
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
-%endif
%endmacro
-INIT_MMX
-WEIGHT_FUNC_MM 8, 16, 0, mmx2
-WEIGHT_FUNC_MM 8, 8, 0, mmx2
-WEIGHT_FUNC_MM 8, 4, 0, mmx2
-INIT_XMM
-WEIGHT_FUNC_MM 16, 16, 8, sse2
-WEIGHT_FUNC_MM 16, 8, 8, sse2
+INIT_MMX mmxext
+WEIGHT_FUNC_MM 8, 0
+INIT_XMM sse2
+WEIGHT_FUNC_MM 16, 8
-%macro WEIGHT_FUNC_HALF_MM 5
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+%macro WEIGHT_FUNC_HALF_MM 2
+cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
- mov r2, %2/2
+ sar r2d, 1
lea r3, [r1*2]
-%if %2 == mmsize
-.nextrow
+.nextrow:
WEIGHT_OP 0, r1
movh [r0], m0
%if mmsize == 16
movh [r0+r1], m0
%endif
add r0, r3
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
-%endif
%endmacro
-INIT_MMX
-WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
-INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
+INIT_MMX mmxext
+WEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+WEIGHT_FUNC_HALF_MM 8, 8
%macro BIWEIGHT_SETUP 0
- add r6, 1
- or r6, 1
- add r3, 1
- movd m3, r4
- movd m4, r5
- movd m5, r6
- movd m6, r3
+%if ARCH_X86_64
+%define off_regd r7d
+%else
+%define off_regd r3d
+%endif
+ mov off_regd, r7m
+ add off_regd, 1
+ or off_regd, 1
+ add r4, 1
+%if cpuflag(ssse3)
+ movd m4, r5d
+ movd m0, r6d
+%else
+ movd m3, r5d
+ movd m4, r6d
+%endif
+ movd m5, off_regd
+ movd m6, r4d
pslld m5, m6
psrld m5, 1
+%if cpuflag(ssse3)
+ punpcklbw m4, m0
+ pshuflw m4, m4, 0
+ pshuflw m5, m5, 0
+ punpcklqdq m4, m4
+ punpcklqdq m5, m5
+
+%else
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m4, m4, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
+%endif
%endmacro
%macro BIWEIGHT_STEPA 3
packuswb m0, m1
%endmacro
-%macro BIWEIGHT_FUNC_DBL_MM 1
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+INIT_MMX mmxext
+cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
- mov r3, %1
-%if %1 == 16
-.nextrow
+ movifnidn r3d, r3m
+.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
BIWEIGHT_STEPB
mova [r0+8], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
-%endif
-%endmacro
-
-INIT_MMX
-BIWEIGHT_FUNC_DBL_MM 16
-BIWEIGHT_FUNC_DBL_MM 8
-%macro BIWEIGHT_FUNC_MM 4
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+%macro BIWEIGHT_FUNC_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
- mov r3, %2
-%if %2 == 16
-.nextrow
+ movifnidn r3d, r3m
+.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
BIWEIGHT_STEPB
mova [r0], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
-%endif
%endmacro
-INIT_MMX
-BIWEIGHT_FUNC_MM 8, 16, 0, mmx2
-BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
-BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
-INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16, 8, sse2
-BIWEIGHT_FUNC_MM 16, 8, 8, sse2
+INIT_MMX mmxext
+BIWEIGHT_FUNC_MM 8, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_MM 16, 8
-%macro BIWEIGHT_FUNC_HALF_MM 5
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+%macro BIWEIGHT_FUNC_HALF_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
- mov r3, %2/2
+ movifnidn r3d, r3m
+ sar r3, 1
lea r4, [r2*2]
-%if %2 == mmsize
-.nextrow
+.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
BIWEIGHT_STEPB
%endif
add r0, r4
add r1, r4
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
-%endif
%endmacro
-INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
-INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
-
-%macro BIWEIGHT_SSSE3_SETUP 0
- add r6, 1
- or r6, 1
- add r3, 1
- movd m4, r4
- movd m0, r5
- movd m5, r6
- movd m6, r3
- pslld m5, m6
- psrld m5, 1
- punpcklbw m4, m0
- pshuflw m4, m4, 0
- pshuflw m5, m5, 0
- punpcklqdq m4, m4
- punpcklqdq m5, m5
-%endmacro
+INIT_MMX mmxext
+BIWEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8
%macro BIWEIGHT_SSSE3_OP 0
pmaddubsw m0, m4
packuswb m0, m2
%endmacro
-%macro BIWEIGHT_SSSE3_16 1
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
- BIWEIGHT_SSSE3_SETUP
- mov r3, %1
+INIT_XMM ssse3
+cglobal h264_biweight_16, 7, 8, 8
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
-%if %1 == 16
-.nextrow
+.nextrow:
movh m0, [r0]
movh m2, [r0+8]
movh m3, [r1+8]
mova [r0], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_16 16
-BIWEIGHT_SSSE3_16 8
-%macro BIWEIGHT_SSSE3_8 1
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
- BIWEIGHT_SSSE3_SETUP
- mov r3, %1/2
+INIT_XMM ssse3
+cglobal h264_biweight_8, 7, 8, 8
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+ sar r3, 1
lea r4, [r2*2]
-%if %1 == 16
-.nextrow
+.nextrow:
movh m0, [r0]
movh m1, [r1]
movh m2, [r0+r2]
movhps [r0+r2], m0
add r0, r4
add r1, r4
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_8 16
-BIWEIGHT_SSSE3_8 8
-BIWEIGHT_SSSE3_8 4