checkasm/vf_blend: Decrease iteration count

[ffmpeg] / libavcodec / dcadsp.c
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c

index 32b149d09c0baacec9896d238973de6ab76468e8..09faee51fb1157388c36a68c3361fbe4d90c9ba3 100644 (file)
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -1,6 +1,5 @@
  /*
- * Copyright (c) 2004 Gildas Bazin
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ * Copyright (C) 2016 foo86
   *
   * This file is part of FFmpeg.
   *
@@ -19,116 +18,399 @@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
  
  #include "dcadsp.h"
  #include "dcamath.h"
  
-static void decode_hf_c(int32_t dst[DCA_SUBBANDS][SAMPLES_PER_SUBBAND],
-                        const int32_t vq_num[DCA_SUBBANDS],
-                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
-                        int32_t scale[DCA_SUBBANDS][2],
-                        intptr_t start, intptr_t end)
+static void decode_hf_c(int32_t **dst, // output rows, indexed dst[subband][sample]
+                        const int32_t *vq_index, // per-subband row number into hf_vq
+                        const int8_t hf_vq[1024][32], // table of 1024 vectors, 32 int8 coefficients each
+                        int32_t scale_factors[32][2], // only element [i][0] is read here
+                        ptrdiff_t sb_start, ptrdiff_t sb_end, // subbands processed: [sb_start, sb_end)
+                        ptrdiff_t ofs, ptrdiff_t len) // len samples are written starting at dst[i][ofs]
+{
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        const int8_t *coeff = hf_vq[vq_index[i]]; // vector selected for this subband
+        int32_t scale = scale_factors[i][0];
+        for (j = 0; j < len; j++) // coeff[j] is read for j < len, so len must not exceed the 32-entry vector
+            dst[i][j + ofs] = clip23(coeff[j] * scale + (1 << 3) >> 4); // '+' binds tighter than '>>': (coeff * scale + 8) >> 4; clip23 is defined outside this hunk
+    }
+}
+
+static void decode_joint_c(int32_t **dst, int32_t **src, // both indexed [subband][sample]; dst receives scaled copies of src
+                           const int32_t *scale_factors, // one scale value per subband
+                           ptrdiff_t sb_start, ptrdiff_t sb_end, // subbands processed: [sb_start, sb_end)
+                           ptrdiff_t ofs, ptrdiff_t len) // samples [ofs, ofs + len) of each row are processed
  {
      int i, j;
  
-    for (j = start; j < end; j++) {
-        const int8_t *ptr = &hf_vq[vq_num[j]][vq_offset];
-        for (i = 0; i < 8; i++)
-            dst[j][i] = ptr[i] * scale[j][0] + 8 >> 4;
+    for (i = sb_start; i < sb_end; i++) {
+        int32_t scale = scale_factors[i];
+        for (j = 0; j < len; j++)
+            dst[i][j + ofs] = clip23(mul17(src[i][j + ofs], scale)); // mul17 and clip23 are defined outside this hunk (presumably dcamath.h - confirm)
      }
  }
  
-static inline void dca_lfe_fir(float *out, const float *in, const float *coefs,
-                               int decifactor)
+static void lfe_fir_float_c(float *pcm_samples, int32_t *lfe_samples, // expands each LFE input sample into `factor` float outputs
+                            const float *filter_coeff, ptrdiff_t npcmblocks, // filter_coeff is read at indices 0..255
+                            int dec_select) // the two wrappers in this file pass 0 or 1
  {
-    float *out2    = out + 2 * decifactor - 1;
-    int num_coeffs = 256 / decifactor;
-    int j, k;
+    // Select decimation factor
+    int factor = 64 << dec_select; // outputs written per LFE sample (64 or 128 for dec_select 0 or 1)
+    int ncoeffs = 8 >> dec_select; // taps per output (8 or 4); (factor / 2) * ncoeffs == 256 either way
+    int nlfesamples = npcmblocks >> (dec_select + 1); // number of LFE input samples consumed
+    int i, j, k;
  
-    /* One decimated sample generates 2*decifactor interpolated ones */
-    for (k = 0; k < decifactor; k++) {
-        float v0 = 0.0;
-        float v1 = 0.0;
-        for (j = 0; j < num_coeffs; j++, coefs++) {
-            v0 += in[-j]                 * *coefs;
-            v1 += in[j + 1 - num_coeffs] * *coefs;
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 or 128 interpolated ones
+        for (j = 0; j < factor / 2; j++) {
+            float a = 0;
+            float b = 0;
+
+            for (k = 0; k < ncoeffs; k++) {
+                a += filter_coeff[      j * ncoeffs + k] * lfe_samples[-k]; // negative index: uses the current input and the ncoeffs - 1 inputs before it
+                b += filter_coeff[255 - j * ncoeffs - k] * lfe_samples[-k]; // same inputs, table walked backwards from its last entry
+            }
+
+            pcm_samples[             j] = a; // first half of this output group
+            pcm_samples[factor / 2 + j] = b; // second half of this output group
          }
-        *out++  = v0;
-        *out2-- = v1;
+
+        lfe_samples++;
+        pcm_samples += factor;
      }
  }
  
-static void dca_qmf_32_subbands(float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], int sb_act,
-                                SynthFilterContext *synth, FFTContext *imdct,
-                                float synth_buf_ptr[512],
-                                int *synth_buf_offset, float synth_buf2[32],
-                                const float window[512], float *samples_out,
-                                float raXin[32], float scale)
+static void lfe_fir0_float_c(float *pcm_samples, int32_t *lfe_samples, // fixed-signature wrapper around lfe_fir_float_c
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 0); // dec_select 0: 64 outputs per input, 8 taps each
+}
+
+static void lfe_fir1_float_c(float *pcm_samples, int32_t *lfe_samples, // fixed-signature wrapper around lfe_fir_float_c
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 1); // dec_select 1: 128 outputs per input, 4 taps each
+}
+
+static void lfe_x96_float_c(float *dst, const float *src, // reads len floats from src, writes 2 * len floats to dst
+                            float *hist, ptrdiff_t len) // *hist is read on entry and set to the last input on exit
  {
+    float prev = *hist; // previous input sample; seeds the first weighted pair
      int i;
-    int subindex;
-
-    for (i = sb_act; i < 32; i++)
-        raXin[i] = 0.0;
-
-    /* Reconstructed channel sample index */
-    for (subindex = 0; subindex < 8; subindex++) {
-        /* Load in one sample from each subband and clear inactive subbands */
-        for (i = 0; i < sb_act; i++) {
-            unsigned sign = (i - 1) & 2;
-            uint32_t v    = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
-            AV_WN32A(&raXin[i], v);
+
+    for (i = 0; i < len; i++) {
+        float a = 0.25f * src[i] + 0.75f * prev; // first output: weighted towards the previous sample
+        float b = 0.75f * src[i] + 0.25f * prev; // second output: weighted towards the current sample
+        prev = src[i];
+        *dst++ = a;
+        *dst++ = b;
+    }
+
+    *hist = prev; // with len == 0 this stores the entry value back unchanged
+}
+
+static void sub_qmf32_float_c(SynthFilterContext *synth, // runs synth->synth_filter_float once per block, 32 float outputs each
+                              FFTContext *imdct,
+                              float *pcm_samples, // receives 32 * npcmblocks samples
+                              int32_t **subband_samples_lo, // rows 0..31 are read, indexed [subband][block]
+                              int32_t **subband_samples_hi, // not referenced in this function
+                              float *hist1, int *offset, float *hist2, // passed through unchanged to the synthesis filter
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
+{
+    LOCAL_ALIGNED(32, float, input, [32]);
+    int i, j;
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++) {
+            if ((i - 1) & 2) // nonzero when i % 4 is 0 or 3: those subbands are negated
+                input[i] = -subband_samples_lo[i][j];
+            else
+                input[i] =  subband_samples_lo[i][j];
          }
  
-        synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
-                                  synth_buf2, window, samples_out, raXin,
-                                  scale);
-        samples_out += 32;
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_float(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input, scale);
+        pcm_samples += 32;
      }
  }
  
-static void dequantize_c(int32_t *samples, uint32_t step_size, uint32_t scale)
+static void sub_qmf64_float_c(SynthFilterContext *synth, // runs synth->synth_filter_float_64 once per block, 64 float outputs each
+                              FFTContext *imdct,
+                              float *pcm_samples, // receives 64 * npcmblocks samples
+                              int32_t **subband_samples_lo, // rows 0..31 are read, indexed [subband][block]
+                              int32_t **subband_samples_hi, // may be NULL; otherwise rows 0..63 are read
+                              float *hist1, int *offset, float *hist2, // passed through unchanged to the synthesis filter
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
  {
-    int64_t step = (int64_t)step_size * scale;
-    int shift, i;
-    int32_t step_scale;
+    LOCAL_ALIGNED(32, float, input, [64]);
+    int i, j;
  
-    if (step > (1 << 23))
-        shift = av_log2(step >> 23) + 1;
-    else
-        shift = 0;
-    step_scale = (int32_t)(step >> shift);
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32); // the loop below never writes input[32..63] in this case, so clear it once
  
-    for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
-        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 22 - shift));
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2) // nonzero when i % 4 is 0 or 3: the summed sample is negated
+                    input[i] = -subband_samples_lo[i][j] - subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j] + subband_samples_hi[i][j];
+            }
+            for (i = 32; i < 64; i++) {
+                if ((i - 1) & 2) // same sign pattern continues into the upper 32 subbands
+                    input[i] = -subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_hi[i][j];
+            }
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2) // nonzero when i % 4 is 0 or 3: those subbands are negated
+                    input[i] = -subband_samples_lo[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j];
+            }
+        }
+
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_float_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input, scale);
+        pcm_samples += 64;
+    }
  }
  
-static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs)
+static void lfe_fir_fixed_c(int32_t *pcm_samples, int32_t *lfe_samples, // integer counterpart of lfe_fir_float_c with the factor fixed at 64
+                            const int32_t *filter_coeff, ptrdiff_t npcmblocks) // filter_coeff is read at indices 0..255
  {
-    dca_lfe_fir(out, in, coefs, 32);
+    // Decimation factor is fixed here: npcmblocks / 2 inputs, 64 outputs each
+    int nlfesamples = npcmblocks >> 1;
+    int i, j, k;
+
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 interpolated ones
+        for (j = 0; j < 32; j++) {
+            int64_t a = 0; // 64-bit accumulators for the sums of 32-bit products
+            int64_t b = 0;
+
+            for (k = 0; k < 8; k++) {
+                a += (int64_t)filter_coeff[      j * 8 + k] * lfe_samples[-k]; // negative index: uses the current input and the 7 inputs before it
+                b += (int64_t)filter_coeff[255 - j * 8 - k] * lfe_samples[-k]; // same inputs, table walked backwards from its last entry
+            }
+
+            pcm_samples[     j] = clip23(norm23(a)); // norm23 and clip23 are defined outside this hunk (presumably dcamath.h - confirm)
+            pcm_samples[32 + j] = clip23(norm23(b));
+        }
+
+        lfe_samples++;
+        pcm_samples += 64;
+    }
  }
  
-static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs)
+static void lfe_x96_fixed_c(int32_t *dst, const int32_t *src, // reads len samples from src, writes 2 * len samples to dst
+                            int32_t *hist, ptrdiff_t len) // *hist is read on entry and set to the last input on exit
  {
-    dca_lfe_fir(out, in, coefs, 64);
+    int32_t prev = *hist; // previous input sample; seeds the first weighted pair
+    int i;
+
+    for (i = 0; i < len; i++) {
+        int64_t a = INT64_C(2097471) * src[i] + INT64_C(6291137) * prev; // the two weights sum to 8388608 == 1 << 23
+        int64_t b = INT64_C(6291137) * src[i] + INT64_C(2097471) * prev; // same weights swapped: favours the current sample
+        prev = src[i];
+        *dst++ = clip23(norm23(a)); // norm23 and clip23 are defined outside this hunk (presumably dcamath.h - confirm)
+        *dst++ = clip23(norm23(b));
+    }
+
+    *hist = prev; // with len == 0 this stores the entry value back unchanged
+}
+
+static void sub_qmf32_fixed_c(SynthFilterContext *synth, // runs synth->synth_filter_fixed once per block, 32 int32 outputs each
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples, // receives 32 * npcmblocks samples
+                              int32_t **subband_samples_lo, // rows 0..31 are read, indexed [subband][block]
+                              int32_t **subband_samples_hi, // not referenced in this function
+                              int32_t *hist1, int *offset, int32_t *hist2, // passed through unchanged to the synthesis filter
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED(32, int32_t, input, [32]);
+    int i, j;
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++)
+            input[i] = subband_samples_lo[i][j]; // copied as-is: no sign flipping, unlike the float variant
+
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_fixed(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input);
+        pcm_samples += 32;
+    }
+}
+
+static void sub_qmf64_fixed_c(SynthFilterContext *synth, // runs synth->synth_filter_fixed_64 once per block, 64 int32 outputs each
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples, // receives 64 * npcmblocks samples
+                              int32_t **subband_samples_lo, // rows 0..31 are read, indexed [subband][block]
+                              int32_t **subband_samples_hi, // may be NULL; otherwise rows 0..63 are read
+                              int32_t *hist1, int *offset, int32_t *hist2, // passed through unchanged to the synthesis filter
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED(32, int32_t, input, [64]);
+    int i, j;
+
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32); // the loop below never writes input[32..63] in this case, so clear it once
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j] + subband_samples_hi[i][j]; // plain int32 addition, no clipping here
+            for (i = 32; i < 64; i++)
+                input[i] = subband_samples_hi[i][j];
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j];
+        }
+
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_fixed_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input);
+        pcm_samples += 64;
+    }
+}
+
+static void decor_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len) // accumulates a scaled copy of src into dst over len samples
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += src[i] * coeff + (1 << 2) >> 3; // '+' binds tighter than '>>': dst[i] += (src[i] * coeff + 4) >> 3
+}
+
+static void dmix_sub_xch_c(int32_t *dst1, int32_t *dst2, // the same scaled src value is subtracted from both destinations
+                           const int32_t *src, ptrdiff_t len)
+{
+    int i;
+
+    for (i = 0; i < len; i++) {
+        int32_t cs = mul23(src[i], 5931520 /* M_SQRT1_2 * (1 << 23) */); // computed once, reused for both outputs; mul23 is defined outside this hunk
+        dst1[i] -= cs;
+        dst2[i] -= cs;
+    }
+}
+
+static void dmix_sub_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len) // subtracts a scaled copy of src from dst over len samples
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul15(src[i], coeff); // mul15 is defined outside this hunk (presumably dcamath.h - confirm)
+}
+
+static void dmix_add_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len) // adds a scaled copy of src to dst over len samples
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += mul15(src[i], coeff); // mirror of dmix_sub_c: same mul15 scaling, opposite sign
+}
+
+static void dmix_scale_c(int32_t *dst, int scale, ptrdiff_t len) // rescales len samples of dst in place
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul15(dst[i], scale); // mul15 is defined outside this hunk (presumably dcamath.h - confirm)
+}
+
+static void dmix_scale_inv_c(int32_t *dst, int scale_inv, ptrdiff_t len) // rescales len samples of dst in place by scale_inv
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul16(dst[i], scale_inv); // uses mul16 where dmix_scale_c uses mul15; both are defined outside this hunk
+}
+
+static void filter0(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len) // helper for assemble_freq_bands_c: dst -= mul22(src, coeff)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul22(src[i], coeff); // mul22 is defined outside this hunk (presumably dcamath.h - confirm)
+}
+
+static void filter1(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len) // helper for assemble_freq_bands_c: dst -= mul23(src, coeff)
+{
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul23(src[i], coeff); // same as filter0 except mul23 replaces mul22
+}
+
+static void assemble_freq_bands_c(int32_t *dst, int32_t *src0, int32_t *src1, // filters src0/src1 against each other in place, then interleaves them into dst
+                                  const int32_t *coeff, ptrdiff_t len) // coeff[0..19] are read; dst receives 2 * len samples
+{
+    int i;
+
+    filter0(src0, src1, coeff[0], len); // four alternating passes; each one modifies its first argument in place
+    filter0(src1, src0, coeff[1], len);
+    filter0(src0, src1, coeff[2], len);
+    filter0(src1, src0, coeff[3], len);
+
+    for (i = 0; i < 8; i++, src0--) { // src0 steps back one sample per pass, so src0[-7] and later must be valid memory
+        filter1(src0, src1, coeff[i +  4], len);
+        filter1(src1, src0, coeff[i + 12], len);
+        filter1(src0, src1, coeff[i +  4], len); // same coefficient as the first call of this pass
+    }
+
+    for (i = 0; i < len; i++) {
+        *dst++ = *src1++;
+        *dst++ = *++src0; // src0 ended the loop 8 below its start; pre-increment makes the first read land at start - 7
+    }
  }
  
  av_cold void ff_dcadsp_init(DCADSPContext *s)
  {
-    s->lfe_fir[0]      = dca_lfe_fir0_c;
-    s->lfe_fir[1]      = dca_lfe_fir1_c;
-    s->qmf_32_subbands = dca_qmf_32_subbands;
-    s->decode_hf       = decode_hf_c;
-    s->dequantize      = dequantize_c;
+    s->decode_hf     = decode_hf_c; // every pointer below is set to a C implementation defined in this file
+    s->decode_joint  = decode_joint_c;
+
+    s->lfe_fir_float[0] = lfe_fir0_float_c; // array index equals the dec_select value the wrapper passes on
+    s->lfe_fir_float[1] = lfe_fir1_float_c;
+    s->lfe_x96_float    = lfe_x96_float_c;
+    s->sub_qmf_float[0] = sub_qmf32_float_c; // [0] = 32-subband variant, [1] = 64-subband variant
+    s->sub_qmf_float[1] = sub_qmf64_float_c;
+
+    s->lfe_fir_fixed    = lfe_fir_fixed_c; // single entry: the fixed-point LFE filter has no dec_select variants
+    s->lfe_x96_fixed    = lfe_x96_fixed_c;
+    s->sub_qmf_fixed[0] = sub_qmf32_fixed_c; // [0] = 32-subband variant, [1] = 64-subband variant
+    s->sub_qmf_fixed[1] = sub_qmf64_fixed_c;
+
+    s->decor   = decor_c;
+
+    s->dmix_sub_xch   = dmix_sub_xch_c;
+    s->dmix_sub       = dmix_sub_c;
+    s->dmix_add       = dmix_add_c;
+    s->dmix_scale     = dmix_scale_c;
+    s->dmix_scale_inv = dmix_scale_inv_c;
+
+    s->assemble_freq_bands = assemble_freq_bands_c; // last C default; the x86 hook that follows presumably overrides some pointers - confirm
  
-    if (ARCH_AARCH64)
-        ff_dcadsp_init_aarch64(s);
-    if (ARCH_ARM)
-        ff_dcadsp_init_arm(s);
      if (ARCH_X86)
          ff_dcadsp_init_x86(s);
  }