vaapi_encode: Refactor slightly to allow easier setting of global options

[ffmpeg] / libavcodec / ppc / vp8dsp_altivec.c
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c

index 8bb60aae0b487b7dd0d3c61dd74ccc0e13df0fc9..869fe670e7a702b1724c3aafd07d504fd7a8eec6 100644 (file)
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -1,31 +1,35 @@
-/**
+/*
   * VP8 compatible video decoder
   *
   * Copyright (C) 2010 David Conrad
   *
- * This file is part of FFmpeg.
+ * This file is part of Libav.
   *
- * FFmpeg is free software; you can redistribute it and/or
+ * Libav is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
- * FFmpeg is distributed in the hope that it will be useful,
+ * Libav is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
+ * License along with Libav; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
+#include "config.h"
  #include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
  #include "libavcodec/vp8dsp.h"
-#include "dsputil_altivec.h"
-#include "types_altivec.h"
-#include "util_altivec.h"
+#include "hpeldsp_altivec.h"
  
+#if HAVE_ALTIVEC && HAVE_BIGENDIAN
  #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
  
  // h subpel filter uses msum to multiply+add 4 pixel taps at once
@@ -75,8 +79,8 @@ static const vec_s8 h_subpel_filters_outer[3] =
      dstv = vec_sra(dstv, c7)
  
  static av_always_inline
-void put_vp8_epel_h_altivec_core(uint8_t *dst, int dst_stride,
-                                 uint8_t *src, int src_stride,
+void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
                                   int h, int mx, int w, int is6tap)
  {
      LOAD_H_SUBPEL_FILTER(mx-1);
@@ -161,8 +165,8 @@ static const vec_u8 v_subpel_filters[7] =
      dstv = vec_sra(dstv, c7)
  
  static av_always_inline
-void put_vp8_epel_v_altivec_core(uint8_t *dst, int dst_stride,
-                                 uint8_t *src, int src_stride,
+void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
                                   int h, int my, int w, int is6tap)
  {
      LOAD_V_SUBPEL_FILTER(my-1);
@@ -226,27 +230,27 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, int dst_stride,
  
  #define EPEL_FUNCS(WIDTH, TAPS) \
  static av_noinline \
-void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int h, int mx, int my) \
+void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
  { \
      put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
  } \
  \
  static av_noinline \
-void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int h, int mx, int my) \
+void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
  { \
      put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
  }
  
  #define EPEL_HV(WIDTH, HTAPS, VTAPS) \
-static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, int stride, uint8_t *src, int s, int h, int mx, int my) \
+static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
  { \
      DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
      if (VTAPS == 6) { \
-        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-2*stride, stride, h+5, mx, my); \
-        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16,     16,     h,   mx, my); \
+        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
+        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
      } else { \
-        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,     src-stride, stride, h+4, mx, my); \
-        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16,     16,     h,   mx, my); \
+        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
+        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,      16,      h,   mx, my); \
      } \
  }
  
@@ -266,14 +270,53 @@ EPEL_HV(4,  4,6)
  EPEL_HV(4,  6,4)
  EPEL_HV(4,  4,4)
  
-static void put_vp8_pixels16_altivec(uint8_t *dst, int stride, uint8_t *src, int s, int h, int mx, int my)
+static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
  {
-    put_pixels16_altivec(dst, src, stride, h);
+    register vector unsigned char pixelsv1, pixelsv2;
+    register vector unsigned char pixelsv1B, pixelsv2B;
+    register vector unsigned char pixelsv1C, pixelsv2C;
+    register vector unsigned char pixelsv1D, pixelsv2D;
+
+    register vector unsigned char perm = vec_lvsl(0, src);
+    int i;
+    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
+    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
+    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
+
+// hand-unrolling the loop by 4 gains about 15%
+// minimum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+    for (i = 0; i < h; i += 4) {
+        pixelsv1  = vec_ld( 0, src);
+        pixelsv2  = vec_ld(15, src);
+        pixelsv1B = vec_ld(sstride, src);
+        pixelsv2B = vec_ld(15 + sstride, src);
+        pixelsv1C = vec_ld(sstride2, src);
+        pixelsv2C = vec_ld(15 + sstride2, src);
+        pixelsv1D = vec_ld(sstride3, src);
+        pixelsv2D = vec_ld(15 + sstride3, src);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
+               dstride, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
+               dstride2, (unsigned char*)dst);
+        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
+               dstride3, (unsigned char*)dst);
+        src += sstride4;
+        dst += dstride4;
+    }
  }
  
-av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c)
+#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+
+
+av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
  {
-    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
+#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+    if (!PPC_ALTIVEC(av_get_cpu_flags()))
          return;
  
      c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
@@ -300,4 +343,5 @@ av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c)
      c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
      c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
      c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
+#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
  }