git.sesse.net Git - ffmpeg/blob - libavcodec/sbcdsp.c

   1 /*
   2  * Bluetooth low-complexity, subband codec (SBC)
   3  *
   4  * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
   5  * Copyright (C) 2012-2013  Intel Corporation
   6  * Copyright (C) 2008-2010  Nokia Corporation
   7  * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
   8  * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
   9  * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
  10  *
  11  * This file is part of FFmpeg.
  12  *
  13  * FFmpeg is free software; you can redistribute it and/or
  14  * modify it under the terms of the GNU Lesser General Public
  15  * License as published by the Free Software Foundation; either
  16  * version 2.1 of the License, or (at your option) any later version.
  17  *
  18  * FFmpeg is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21  * Lesser General Public License for more details.
  22  *
  23  * You should have received a copy of the GNU Lesser General Public
  24  * License along with FFmpeg; if not, write to the Free Software
  25  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  26  */
  27
  28 /**
  29  * @file
  30  * SBC basic "building bricks"
  31  */
  32
  33 #include <stdint.h>
  34 #include <limits.h>
  35 #include <string.h>
  36 #include "libavutil/common.h"
  37 #include "libavutil/intmath.h"
  38 #include "libavutil/intreadwrite.h"
  39 #include "sbc.h"
  40 #include "sbcdsp.h"
  41 #include "sbcdsp_data.h"
  42
  43 /*
  44  * A reference C code of analysis filter with SIMD-friendly tables
  45  * reordering and code layout. This code can be used to develop platform
  46  * specific SIMD optimizations. Also it may be used as some kind of test
  47  * for compiler autovectorization capabilities (who knows, if the compiler
  48  * is very good at this stuff, hand optimized assembly may be not strictly
  49  * needed for some platform).
  50  *
  51  * Note: It is also possible to make a simple variant of analysis filter,
  52  * which needs only a single constants table without taking care about
  53  * even/odd cases. This simple variant of filter can be implemented without
  54  * input data permutation. The only thing that would be lost is the
  55  * possibility to use pairwise SIMD multiplications. But for some simple
  56  * CPU cores without SIMD extensions it can be useful. If anybody is
  57  * interested in implementing such a variant of the filter, source code from
  58  * bluez versions 4.26/4.27 can be used as a reference and the history of
  59  * the changes in git repository done around that time may be worth checking.
  60  */
  61
  62 static av_always_inline void sbc_analyze_simd(const int16_t *in, int32_t *out,
  63                                               const int16_t *consts,
  64                                               unsigned subbands)
  65 {
  66     int32_t t1[8];
  67     int16_t t2[8];
  68     int i, j, hop = 0;
  69
  70     /* rounding coefficient */
  71     for (i = 0; i < subbands; i++)
  72         t1[i] = 1 << (SBC_PROTO_FIXED_SCALE - 1);
  73
  74     /* low pass polyphase filter */
  75     for (hop = 0; hop < 10*subbands; hop += 2*subbands)
  76         for (i = 0; i < 2*subbands; i++)
  77             t1[i >> 1] += in[hop + i] * consts[hop + i];
  78
  79     /* scaling */
  80     for (i = 0; i < subbands; i++)
  81         t2[i] = t1[i] >> SBC_PROTO_FIXED_SCALE;
  82
  83     memset(t1, 0, sizeof(t1));
  84
  85     /* do the cos transform */
  86     for (i = 0; i < subbands/2; i++)
  87         for (j = 0; j < 2*subbands; j++)
  88             t1[j>>1] += t2[i * 2 + (j&1)] * consts[10*subbands + i*2*subbands + j];
  89
  90     for (i = 0; i < subbands; i++)
  91         out[i] = t1[i] >> (SBC_COS_TABLE_FIXED_SCALE - SCALE_OUT_BITS);
  92 }
  93
  94 static void sbc_analyze_4_simd(const int16_t *in, int32_t *out,
  95                                const int16_t *consts)
  96 {
  97     sbc_analyze_simd(in, out, consts, 4);
  98 }
  99
 100 static void sbc_analyze_8_simd(const int16_t *in, int32_t *out,
 101                                const int16_t *consts)
 102 {
 103     sbc_analyze_simd(in, out, consts, 8);
 104 }
 105
 106 static inline void sbc_analyze_4b_4s_simd(SBCDSPContext *s,
 107                                           int16_t *x, int32_t *out, int out_stride)
 108 {
 109     /* Analyze blocks */
 110     s->sbc_analyze_4(x + 12, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd);
 111     out += out_stride;
 112     s->sbc_analyze_4(x + 8, out, ff_sbcdsp_analysis_consts_fixed4_simd_even);
 113     out += out_stride;
 114     s->sbc_analyze_4(x + 4, out, ff_sbcdsp_analysis_consts_fixed4_simd_odd);
 115     out += out_stride;
 116     s->sbc_analyze_4(x + 0, out, ff_sbcdsp_analysis_consts_fixed4_simd_even);
 117 }
 118
 119 static inline void sbc_analyze_4b_8s_simd(SBCDSPContext *s,
 120                                           int16_t *x, int32_t *out, int out_stride)
 121 {
 122     /* Analyze blocks */
 123     s->sbc_analyze_8(x + 24, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
 124     out += out_stride;
 125     s->sbc_analyze_8(x + 16, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
 126     out += out_stride;
 127     s->sbc_analyze_8(x + 8, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
 128     out += out_stride;
 129     s->sbc_analyze_8(x + 0, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
 130 }
 131
 132 static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
 133                                                int16_t *x, int32_t *out,
 134                                                int out_stride);
 135
 136 static inline void sbc_analyze_1b_8s_simd_odd(SBCDSPContext *s,
 137                                               int16_t *x, int32_t *out,
 138                                               int out_stride)
 139 {
 140     s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
 141     s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_even;
 142 }
 143
 144 static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
 145                                                int16_t *x, int32_t *out,
 146                                                int out_stride)
 147 {
 148     s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
 149     s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
 150 }
 151
 152 /*
 153  * Input data processing functions. The data is endian converted if needed,
 154  * channels are deinterleaved and audio samples are reordered for use in
 155  * SIMD-friendly analysis filter function. The results are put into "X"
 156  * array, getting appended to the previous data (or it is better to say
 157  * prepended, as the buffer is filled from top to bottom). Old data is
 158  * discarded when needed, but availability of (10 * nrof_subbands)
 159  * contiguous samples is always guaranteed for the input to the analysis
 160  * filter. This is achieved by copying a sufficient part of old data
 161  * to the top of the buffer on buffer wraparound.
 162  */
 163
 164 static int sbc_enc_process_input_4s(int position, const uint8_t *pcm,
 165                                     int16_t X[2][SBC_X_BUFFER_SIZE],
 166                                     int nsamples, int nchannels)
 167 {
 168     int c;
 169
 170     /* handle X buffer wraparound */
 171     if (position < nsamples) {
 172         for (c = 0; c < nchannels; c++)
 173             memcpy(&X[c][SBC_X_BUFFER_SIZE - 40], &X[c][position],
 174                             36 * sizeof(int16_t));
 175         position = SBC_X_BUFFER_SIZE - 40;
 176     }
 177
 178     /* copy/permutate audio samples */
 179     for (; nsamples >= 8; nsamples -= 8, pcm += 16 * nchannels) {
 180         position -= 8;
 181         for (c = 0; c < nchannels; c++) {
 182             int16_t *x = &X[c][position];
 183             x[0] = AV_RN16(pcm + 14*nchannels + 2*c);
 184             x[1] = AV_RN16(pcm +  6*nchannels + 2*c);
 185             x[2] = AV_RN16(pcm + 12*nchannels + 2*c);
 186             x[3] = AV_RN16(pcm +  8*nchannels + 2*c);
 187             x[4] = AV_RN16(pcm +  0*nchannels + 2*c);
 188             x[5] = AV_RN16(pcm +  4*nchannels + 2*c);
 189             x[6] = AV_RN16(pcm +  2*nchannels + 2*c);
 190             x[7] = AV_RN16(pcm + 10*nchannels + 2*c);
 191         }
 192     }
 193
 194     return position;
 195 }
 196
 197 static int sbc_enc_process_input_8s(int position, const uint8_t *pcm,
 198                                     int16_t X[2][SBC_X_BUFFER_SIZE],
 199                                     int nsamples, int nchannels)
 200 {
 201     int c;
 202
 203     /* handle X buffer wraparound */
 204     if (position < nsamples) {
 205         for (c = 0; c < nchannels; c++)
 206             memcpy(&X[c][SBC_X_BUFFER_SIZE - 72], &X[c][position],
 207                             72 * sizeof(int16_t));
 208         position = SBC_X_BUFFER_SIZE - 72;
 209     }
 210
 211     if (position % 16 == 8) {
 212         position -= 8;
 213         nsamples -= 8;
 214         for (c = 0; c < nchannels; c++) {
 215             int16_t *x = &X[c][position];
 216             x[0] = AV_RN16(pcm + 14*nchannels + 2*c);
 217             x[2] = AV_RN16(pcm + 12*nchannels + 2*c);
 218             x[3] = AV_RN16(pcm +  0*nchannels + 2*c);
 219             x[4] = AV_RN16(pcm + 10*nchannels + 2*c);
 220             x[5] = AV_RN16(pcm +  2*nchannels + 2*c);
 221             x[6] = AV_RN16(pcm +  8*nchannels + 2*c);
 222             x[7] = AV_RN16(pcm +  4*nchannels + 2*c);
 223             x[8] = AV_RN16(pcm +  6*nchannels + 2*c);
 224         }
 225         pcm += 16 * nchannels;
 226     }
 227
 228     /* copy/permutate audio samples */
 229     for (; nsamples >= 16; nsamples -= 16, pcm += 32 * nchannels) {
 230         position -= 16;
 231         for (c = 0; c < nchannels; c++) {
 232             int16_t *x = &X[c][position];
 233             x[0]  = AV_RN16(pcm + 30*nchannels + 2*c);
 234             x[1]  = AV_RN16(pcm + 14*nchannels + 2*c);
 235             x[2]  = AV_RN16(pcm + 28*nchannels + 2*c);
 236             x[3]  = AV_RN16(pcm + 16*nchannels + 2*c);
 237             x[4]  = AV_RN16(pcm + 26*nchannels + 2*c);
 238             x[5]  = AV_RN16(pcm + 18*nchannels + 2*c);
 239             x[6]  = AV_RN16(pcm + 24*nchannels + 2*c);
 240             x[7]  = AV_RN16(pcm + 20*nchannels + 2*c);
 241             x[8]  = AV_RN16(pcm + 22*nchannels + 2*c);
 242             x[9]  = AV_RN16(pcm +  6*nchannels + 2*c);
 243             x[10] = AV_RN16(pcm + 12*nchannels + 2*c);
 244             x[11] = AV_RN16(pcm +  0*nchannels + 2*c);
 245             x[12] = AV_RN16(pcm + 10*nchannels + 2*c);
 246             x[13] = AV_RN16(pcm +  2*nchannels + 2*c);
 247             x[14] = AV_RN16(pcm +  8*nchannels + 2*c);
 248             x[15] = AV_RN16(pcm +  4*nchannels + 2*c);
 249         }
 250     }
 251
 252     if (nsamples == 8) {
 253         position -= 8;
 254         for (c = 0; c < nchannels; c++) {
 255             int16_t *x = &X[c][position];
 256             x[-7] = AV_RN16(pcm + 14*nchannels + 2*c);
 257             x[1]  = AV_RN16(pcm +  6*nchannels + 2*c);
 258             x[2]  = AV_RN16(pcm + 12*nchannels + 2*c);
 259             x[3]  = AV_RN16(pcm +  0*nchannels + 2*c);
 260             x[4]  = AV_RN16(pcm + 10*nchannels + 2*c);
 261             x[5]  = AV_RN16(pcm +  2*nchannels + 2*c);
 262             x[6]  = AV_RN16(pcm +  8*nchannels + 2*c);
 263             x[7]  = AV_RN16(pcm +  4*nchannels + 2*c);
 264         }
 265     }
 266
 267     return position;
 268 }
 269
 270 static void sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
 271                                   uint32_t scale_factor[2][8],
 272                                   int blocks, int channels, int subbands)
 273 {
 274     int ch, sb, blk;
 275     for (ch = 0; ch < channels; ch++) {
 276         for (sb = 0; sb < subbands; sb++) {
 277             uint32_t x = 1 << SCALE_OUT_BITS;
 278             for (blk = 0; blk < blocks; blk++) {
 279                 int32_t tmp = FFABS(sb_sample_f[blk][ch][sb]);
 280                 if (tmp != 0)
 281                     x |= tmp - 1;
 282             }
 283             scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x);
 284         }
 285     }
 286 }
 287
 288 static int sbc_calc_scalefactors_j(int32_t sb_sample_f[16][2][8],
 289                                    uint32_t scale_factor[2][8],
 290                                    int blocks, int subbands)
 291 {
 292     int blk, joint = 0;
 293     int32_t tmp0, tmp1;
 294     uint32_t x, y;
 295
 296     /* last subband does not use joint stereo */
 297     int sb = subbands - 1;
 298     x = 1 << SCALE_OUT_BITS;
 299     y = 1 << SCALE_OUT_BITS;
 300     for (blk = 0; blk < blocks; blk++) {
 301         tmp0 = FFABS(sb_sample_f[blk][0][sb]);
 302         tmp1 = FFABS(sb_sample_f[blk][1][sb]);
 303         if (tmp0 != 0)
 304             x |= tmp0 - 1;
 305         if (tmp1 != 0)
 306             y |= tmp1 - 1;
 307     }
 308     scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - ff_clz(x);
 309     scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - ff_clz(y);
 310
 311     /* the rest of subbands can use joint stereo */
 312     while (--sb >= 0) {
 313         int32_t sb_sample_j[16][2];
 314         x = 1 << SCALE_OUT_BITS;
 315         y = 1 << SCALE_OUT_BITS;
 316         for (blk = 0; blk < blocks; blk++) {
 317             tmp0 = sb_sample_f[blk][0][sb];
 318             tmp1 = sb_sample_f[blk][1][sb];
 319             sb_sample_j[blk][0] = (tmp0 >> 1) + (tmp1 >> 1);
 320             sb_sample_j[blk][1] = (tmp0 >> 1) - (tmp1 >> 1);
 321             tmp0 = FFABS(tmp0);
 322             tmp1 = FFABS(tmp1);
 323             if (tmp0 != 0)
 324                 x |= tmp0 - 1;
 325             if (tmp1 != 0)
 326                 y |= tmp1 - 1;
 327         }
 328         scale_factor[0][sb] = (31 - SCALE_OUT_BITS) -
 329             ff_clz(x);
 330         scale_factor[1][sb] = (31 - SCALE_OUT_BITS) -
 331             ff_clz(y);
 332         x = 1 << SCALE_OUT_BITS;
 333         y = 1 << SCALE_OUT_BITS;
 334         for (blk = 0; blk < blocks; blk++) {
 335             tmp0 = FFABS(sb_sample_j[blk][0]);
 336             tmp1 = FFABS(sb_sample_j[blk][1]);
 337             if (tmp0 != 0)
 338                 x |= tmp0 - 1;
 339             if (tmp1 != 0)
 340                 y |= tmp1 - 1;
 341         }
 342         x = (31 - SCALE_OUT_BITS) - ff_clz(x);
 343         y = (31 - SCALE_OUT_BITS) - ff_clz(y);
 344
 345         /* decide whether to use joint stereo for this subband */
 346         if ((scale_factor[0][sb] + scale_factor[1][sb]) > x + y) {
 347             joint |= 1 << (subbands - 1 - sb);
 348             scale_factor[0][sb] = x;
 349             scale_factor[1][sb] = y;
 350             for (blk = 0; blk < blocks; blk++) {
 351                 sb_sample_f[blk][0][sb] = sb_sample_j[blk][0];
 352                 sb_sample_f[blk][1][sb] = sb_sample_j[blk][1];
 353             }
 354         }
 355     }
 356
 357     /* bitmask with the information about subbands using joint stereo */
 358     return joint;
 359 }
 360
 361 /*
 362  * Detect CPU features and setup function pointers
 363  */
 364 av_cold void ff_sbcdsp_init(SBCDSPContext *s)
 365 {
 366     /* Default implementation for analyze functions */
 367     s->sbc_analyze_4 = sbc_analyze_4_simd;
 368     s->sbc_analyze_8 = sbc_analyze_8_simd;
 369     s->sbc_analyze_4s = sbc_analyze_4b_4s_simd;
 370     if (s->increment == 1)
 371         s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
 372     else
 373         s->sbc_analyze_8s = sbc_analyze_4b_8s_simd;
 374
 375     /* Default implementation for input reordering / deinterleaving */
 376     s->sbc_enc_process_input_4s = sbc_enc_process_input_4s;
 377     s->sbc_enc_process_input_8s = sbc_enc_process_input_8s;
 378
 379     /* Default implementation for scale factors calculation */
 380     s->sbc_calc_scalefactors = sbc_calc_scalefactors;
 381     s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
 382
 383     if (ARCH_ARM)
 384         ff_sbcdsp_init_arm(s);
 385     if (ARCH_X86)
 386         ff_sbcdsp_init_x86(s);
 387 }