git.sesse.net Git - ffmpeg/blob - libavcodec/mips/aaccoder_mips.c

   1 /*
   2  * Copyright (c) 2012
   3  *      MIPS Technologies, Inc., California.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14  *    contributors may be used to endorse or promote products derived from
  15  *    this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  * Author:  Stanislav Ocovaj (socovaj@mips.com)
  30  *          Szabolcs Pal     (sabolc@mips.com)
  31  *
  32  * AAC coefficients encoder optimized for MIPS floating-point architecture
  33  *
  34  * This file is part of FFmpeg.
  35  *
  36  * FFmpeg is free software; you can redistribute it and/or
  37  * modify it under the terms of the GNU Lesser General Public
  38  * License as published by the Free Software Foundation; either
  39  * version 2.1 of the License, or (at your option) any later version.
  40  *
  41  * FFmpeg is distributed in the hope that it will be useful,
  42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  44  * Lesser General Public License for more details.
  45  *
  46  * You should have received a copy of the GNU Lesser General Public
  47  * License along with FFmpeg; if not, write to the Free Software
  48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  49  */
  50
  51 /**
  52  * @file
  53  * Reference: libavcodec/aaccoder.c
  54  */
  55
  56 #include "libavutil/libm.h"
  57
  58 #include <float.h>
  59 #include "libavutil/mathematics.h"
  60 #include "libavcodec/avcodec.h"
  61 #include "libavcodec/put_bits.h"
  62 #include "libavcodec/aac.h"
  63 #include "libavcodec/aacenc.h"
  64 #include "libavcodec/aactab.h"
  65 #include "libavcodec/aacenctab.h"
  66 #include "libavcodec/aacenc_utils.h"
  67
  68 #if HAVE_INLINE_ASM
  69 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * One entry of a band coding path.
 *
 * NOTE(review): no use of this type is visible in this part of the file; the
 * field descriptions below are inferred from the names only -- confirm
 * against the reference implementation (libavcodec/aaccoder.c).
 */
typedef struct BandCodingPath {
    int prev_idx;   /* presumably the index of the preceding path entry */
    float cost;     /* presumably the cost accumulated along the path */
    int run;        /* presumably the run length associated with the entry */
} BandCodingPath;
  75
/**
 * Lookup table with 81 (3^4) entries.
 *
 * Writing the index as four base-3 digits (idx = 27*a + 9*b + 3*c + d), the
 * entry is the number of digits that are nonzero, i.e. a value in 0..4.
 *
 * NOTE(review): presumably the number of sign bits that accompany an unsigned
 * quad codebook index; the code using this table is not visible in this part
 * of the file -- confirm.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};
  87
/**
 * Lookup table with 64 (8 * 8) entries.
 *
 * For idx = 8*a + b the entry is (a != 0) + (b != 0), i.e. the number of
 * nonzero components of the pair (0, 1 or 2).
 *
 * NOTE(review): presumably the number of sign bits that accompany an unsigned
 * pair codebook index with values 0..7; the code using this table is not
 * visible in this part of the file -- confirm.
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};
  98
/**
 * Lookup table with 169 (13 * 13) entries.
 *
 * For idx = 13*a + b the entry is (a != 0) + (b != 0), i.e. the number of
 * nonzero components of the pair (0, 1 or 2).
 *
 * NOTE(review): presumably the number of sign bits that accompany an unsigned
 * pair codebook index with values 0..12; the code using this table is not
 * visible in this part of the file -- confirm.
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
 114
/**
 * Lookup table with 289 (17 * 17) entries.
 *
 * For idx = 17*a + b the entry is (a != 0) + (b != 0), i.e. the number of
 * nonzero components of the pair (0, 1 or 2).
 *
 * NOTE(review): presumably the number of sign bits that accompany an escape
 * codebook index with values 0..16; the code using this table is not visible
 * in this part of the file -- confirm.
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
 134
 135 /**
 136  * Functions developed from template function and optimized for quantizing and encoding band
 137  */
 138 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
 139                                                      PutBitContext *pb, const float *in, float *out,
 140                                                      const float *scaled, int size, int scale_idx,
 141                                                      int cb, const float lambda, const float uplim,
 142                                                      int *bits, float *energy, const float ROUNDING)
 143 {
 144     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 145     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 146     int i;
 147     int qc1, qc2, qc3, qc4;
 148     float qenergy = 0.0f;
 149
 150     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
 151     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
 152     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 153
 154     abs_pow34_v(s->scoefs, in, size);
 155     scaled = s->scoefs;
 156     for (i = 0; i < size; i += 4) {
 157         int curidx;
 158         int *in_int = (int *)&in[i];
 159         int t0, t1, t2, t3, t4, t5, t6, t7;
 160         const float *vec;
 161
 162         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 163         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 164         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 165         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 166
 167         __asm__ volatile (
 168             ".set push                      \n\t"
 169             ".set noreorder                 \n\t"
 170
 171             "slt    %[qc1], $zero,  %[qc1]  \n\t"
 172             "slt    %[qc2], $zero,  %[qc2]  \n\t"
 173             "slt    %[qc3], $zero,  %[qc3]  \n\t"
 174             "slt    %[qc4], $zero,  %[qc4]  \n\t"
 175             "lw     %[t0],  0(%[in_int])    \n\t"
 176             "lw     %[t1],  4(%[in_int])    \n\t"
 177             "lw     %[t2],  8(%[in_int])    \n\t"
 178             "lw     %[t3],  12(%[in_int])   \n\t"
 179             "srl    %[t0],  %[t0],  31      \n\t"
 180             "srl    %[t1],  %[t1],  31      \n\t"
 181             "srl    %[t2],  %[t2],  31      \n\t"
 182             "srl    %[t3],  %[t3],  31      \n\t"
 183             "subu   %[t4],  $zero,  %[qc1]  \n\t"
 184             "subu   %[t5],  $zero,  %[qc2]  \n\t"
 185             "subu   %[t6],  $zero,  %[qc3]  \n\t"
 186             "subu   %[t7],  $zero,  %[qc4]  \n\t"
 187             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 188             "movn   %[qc2], %[t5],  %[t1]   \n\t"
 189             "movn   %[qc3], %[t6],  %[t2]   \n\t"
 190             "movn   %[qc4], %[t7],  %[t3]   \n\t"
 191
 192             ".set pop                       \n\t"
 193
 194             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 195               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 196               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 197               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 198             : [in_int]"r"(in_int)
 199             : "memory"
 200         );
 201
 202         curidx = qc1;
 203         curidx *= 3;
 204         curidx += qc2;
 205         curidx *= 3;
 206         curidx += qc3;
 207         curidx *= 3;
 208         curidx += qc4;
 209         curidx += 40;
 210
 211         put_bits(pb, p_bits[curidx], p_codes[curidx]);
 212
 213         if (out || energy) {
 214             float e1,e2,e3,e4;
 215             vec = &p_vec[curidx*4];
 216             e1 = vec[0] * IQ;
 217             e2 = vec[1] * IQ;
 218             e3 = vec[2] * IQ;
 219             e4 = vec[3] * IQ;
 220             if (out) {
 221                 out[i+0] = e1;
 222                 out[i+1] = e2;
 223                 out[i+2] = e3;
 224                 out[i+3] = e4;
 225             }
 226             if (energy)
 227                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 228         }
 229     }
 230     if (energy)
 231         *energy = qenergy;
 232 }
 233
 234 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
 235                                                      PutBitContext *pb, const float *in, float *out,
 236                                                      const float *scaled, int size, int scale_idx,
 237                                                      int cb, const float lambda, const float uplim,
 238                                                      int *bits, float *energy, const float ROUNDING)
 239 {
 240     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 241     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 242     int i;
 243     int qc1, qc2, qc3, qc4;
 244     float qenergy = 0.0f;
 245
 246     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
 247     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
 248     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 249
 250     abs_pow34_v(s->scoefs, in, size);
 251     scaled = s->scoefs;
 252     for (i = 0; i < size; i += 4) {
 253         int curidx, sign, count;
 254         int *in_int = (int *)&in[i];
 255         uint8_t v_bits;
 256         unsigned int v_codes;
 257         int t0, t1, t2, t3, t4;
 258         const float *vec;
 259
 260         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 261         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 262         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 263         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 264
 265         __asm__ volatile (
 266             ".set push                              \n\t"
 267             ".set noreorder                         \n\t"
 268
 269             "ori    %[t4],      $zero,      2       \n\t"
 270             "ori    %[sign],    $zero,      0       \n\t"
 271             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
 272             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
 273             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
 274             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
 275             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
 276             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
 277             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
 278             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
 279             "lw     %[t0],      0(%[in_int])        \n\t"
 280             "lw     %[t1],      4(%[in_int])        \n\t"
 281             "lw     %[t2],      8(%[in_int])        \n\t"
 282             "lw     %[t3],      12(%[in_int])       \n\t"
 283             "slt    %[t0],      %[t0],      $zero   \n\t"
 284             "movn   %[sign],    %[t0],      %[qc1]  \n\t"
 285             "slt    %[t1],      %[t1],      $zero   \n\t"
 286             "slt    %[t2],      %[t2],      $zero   \n\t"
 287             "slt    %[t3],      %[t3],      $zero   \n\t"
 288             "sll    %[t0],      %[sign],    1       \n\t"
 289             "or     %[t0],      %[t0],      %[t1]   \n\t"
 290             "movn   %[sign],    %[t0],      %[qc2]  \n\t"
 291             "slt    %[t4],      $zero,      %[qc1]  \n\t"
 292             "slt    %[t1],      $zero,      %[qc2]  \n\t"
 293             "slt    %[count],   $zero,      %[qc3]  \n\t"
 294             "sll    %[t0],      %[sign],    1       \n\t"
 295             "or     %[t0],      %[t0],      %[t2]   \n\t"
 296             "movn   %[sign],    %[t0],      %[qc3]  \n\t"
 297             "slt    %[t2],      $zero,      %[qc4]  \n\t"
 298             "addu   %[count],   %[count],   %[t4]   \n\t"
 299             "addu   %[count],   %[count],   %[t1]   \n\t"
 300             "sll    %[t0],      %[sign],    1       \n\t"
 301             "or     %[t0],      %[t0],      %[t3]   \n\t"
 302             "movn   %[sign],    %[t0],      %[qc4]  \n\t"
 303             "addu   %[count],   %[count],   %[t2]   \n\t"
 304
 305             ".set pop                               \n\t"
 306
 307             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 308               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 309               [sign]"=&r"(sign), [count]"=&r"(count),
 310               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 311               [t4]"=&r"(t4)
 312             : [in_int]"r"(in_int)
 313             : "memory"
 314         );
 315
 316         curidx = qc1;
 317         curidx *= 3;
 318         curidx += qc2;
 319         curidx *= 3;
 320         curidx += qc3;
 321         curidx *= 3;
 322         curidx += qc4;
 323
 324         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
 325         v_bits  = p_bits[curidx] + count;
 326         put_bits(pb, v_bits, v_codes);
 327
 328         if (out || energy) {
 329             float e1,e2,e3,e4;
 330             vec = &p_vec[curidx*4];
 331             e1 = copysignf(vec[0] * IQ, in[i+0]);
 332             e2 = copysignf(vec[1] * IQ, in[i+1]);
 333             e3 = copysignf(vec[2] * IQ, in[i+2]);
 334             e4 = copysignf(vec[3] * IQ, in[i+3]);
 335             if (out) {
 336                 out[i+0] = e1;
 337                 out[i+1] = e2;
 338                 out[i+2] = e3;
 339                 out[i+3] = e4;
 340             }
 341             if (energy)
 342                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 343         }
 344     }
 345     if (energy)
 346         *energy = qenergy;
 347 }
 348
 349 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
 350                                                      PutBitContext *pb, const float *in, float *out,
 351                                                      const float *scaled, int size, int scale_idx,
 352                                                      int cb, const float lambda, const float uplim,
 353                                                      int *bits, float *energy, const float ROUNDING)
 354 {
 355     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 356     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 357     int i;
 358     int qc1, qc2, qc3, qc4;
 359     float qenergy = 0.0f;
 360
 361     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
 362     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
 363     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 364
 365     abs_pow34_v(s->scoefs, in, size);
 366     scaled = s->scoefs;
 367     for (i = 0; i < size; i += 4) {
 368         int curidx, curidx2;
 369         int *in_int = (int *)&in[i];
 370         uint8_t v_bits;
 371         unsigned int v_codes;
 372         int t0, t1, t2, t3, t4, t5, t6, t7;
 373         const float *vec1, *vec2;
 374
 375         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 376         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 377         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 378         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 379
 380         __asm__ volatile (
 381             ".set push                      \n\t"
 382             ".set noreorder                 \n\t"
 383
 384             "ori    %[t4],  $zero,  4       \n\t"
 385             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
 386             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
 387             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
 388             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
 389             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 390             "movn   %[qc2], %[t4],  %[t1]   \n\t"
 391             "movn   %[qc3], %[t4],  %[t2]   \n\t"
 392             "movn   %[qc4], %[t4],  %[t3]   \n\t"
 393             "lw     %[t0],  0(%[in_int])    \n\t"
 394             "lw     %[t1],  4(%[in_int])    \n\t"
 395             "lw     %[t2],  8(%[in_int])    \n\t"
 396             "lw     %[t3],  12(%[in_int])   \n\t"
 397             "srl    %[t0],  %[t0],  31      \n\t"
 398             "srl    %[t1],  %[t1],  31      \n\t"
 399             "srl    %[t2],  %[t2],  31      \n\t"
 400             "srl    %[t3],  %[t3],  31      \n\t"
 401             "subu   %[t4],  $zero,  %[qc1]  \n\t"
 402             "subu   %[t5],  $zero,  %[qc2]  \n\t"
 403             "subu   %[t6],  $zero,  %[qc3]  \n\t"
 404             "subu   %[t7],  $zero,  %[qc4]  \n\t"
 405             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 406             "movn   %[qc2], %[t5],  %[t1]   \n\t"
 407             "movn   %[qc3], %[t6],  %[t2]   \n\t"
 408             "movn   %[qc4], %[t7],  %[t3]   \n\t"
 409
 410             ".set pop                       \n\t"
 411
 412             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 413               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 414               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 415               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 416             : [in_int]"r"(in_int)
 417             : "memory"
 418         );
 419
 420         curidx = 9 * qc1;
 421         curidx += qc2 + 40;
 422
 423         curidx2 = 9 * qc3;
 424         curidx2 += qc4 + 40;
 425
 426         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
 427         v_bits  = p_bits[curidx] + p_bits[curidx2];
 428         put_bits(pb, v_bits, v_codes);
 429
 430         if (out || energy) {
 431             float e1,e2,e3,e4;
 432             vec1 = &p_vec[curidx*2 ];
 433             vec2 = &p_vec[curidx2*2];
 434             e1 = vec1[0] * IQ;
 435             e2 = vec1[1] * IQ;
 436             e3 = vec2[0] * IQ;
 437             e4 = vec2[1] * IQ;
 438             if (out) {
 439                 out[i+0] = e1;
 440                 out[i+1] = e2;
 441                 out[i+2] = e3;
 442                 out[i+3] = e4;
 443             }
 444             if (energy)
 445                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 446         }
 447     }
 448     if (energy)
 449         *energy = qenergy;
 450 }
 451
 452 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
 453                                                       PutBitContext *pb, const float *in, float *out,
 454                                                       const float *scaled, int size, int scale_idx,
 455                                                       int cb, const float lambda, const float uplim,
 456                                                       int *bits, float *energy, const float ROUNDING)
 457 {
 458     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 459     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 460     int i;
 461     int qc1, qc2, qc3, qc4;
 462     float qenergy = 0.0f;
 463
 464     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
 465     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
 466     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 467
 468     abs_pow34_v(s->scoefs, in, size);
 469     scaled = s->scoefs;
 470     for (i = 0; i < size; i += 4) {
 471         int curidx1, curidx2, sign1, count1, sign2, count2;
 472         int *in_int = (int *)&in[i];
 473         uint8_t v_bits;
 474         unsigned int v_codes;
 475         int t0, t1, t2, t3, t4;
 476         const float *vec1, *vec2;
 477
 478         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 479         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 480         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 481         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 482
 483         __asm__ volatile (
 484             ".set push                              \n\t"
 485             ".set noreorder                         \n\t"
 486
 487             "ori    %[t4],      $zero,      7       \n\t"
 488             "ori    %[sign1],   $zero,      0       \n\t"
 489             "ori    %[sign2],   $zero,      0       \n\t"
 490             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
 491             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
 492             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
 493             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
 494             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
 495             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
 496             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
 497             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
 498             "lw     %[t0],      0(%[in_int])        \n\t"
 499             "lw     %[t1],      4(%[in_int])        \n\t"
 500             "lw     %[t2],      8(%[in_int])        \n\t"
 501             "lw     %[t3],      12(%[in_int])       \n\t"
 502             "slt    %[t0],      %[t0],      $zero   \n\t"
 503             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
 504             "slt    %[t2],      %[t2],      $zero   \n\t"
 505             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
 506             "slt    %[t1],      %[t1],      $zero   \n\t"
 507             "sll    %[t0],      %[sign1],   1       \n\t"
 508             "or     %[t0],      %[t0],      %[t1]   \n\t"
 509             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
 510             "slt    %[t3],      %[t3],      $zero   \n\t"
 511             "sll    %[t0],      %[sign2],   1       \n\t"
 512             "or     %[t0],      %[t0],      %[t3]   \n\t"
 513             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
 514             "slt    %[count1],  $zero,      %[qc1]  \n\t"
 515             "slt    %[t1],      $zero,      %[qc2]  \n\t"
 516             "slt    %[count2],  $zero,      %[qc3]  \n\t"
 517             "slt    %[t2],      $zero,      %[qc4]  \n\t"
 518             "addu   %[count1],  %[count1],  %[t1]   \n\t"
 519             "addu   %[count2],  %[count2],  %[t2]   \n\t"
 520
 521             ".set pop                               \n\t"
 522
 523             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 524               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 525               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 526               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 527               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 528               [t4]"=&r"(t4)
 529             : [in_int]"r"(in_int)
 530             : "t0", "t1", "t2", "t3", "t4",
 531               "memory"
 532         );
 533
 534         curidx1  = 8 * qc1;
 535         curidx1 += qc2;
 536
 537         v_codes = (p_codes[curidx1] << count1) | sign1;
 538         v_bits  = p_bits[curidx1] + count1;
 539         put_bits(pb, v_bits, v_codes);
 540
 541         curidx2  = 8 * qc3;
 542         curidx2 += qc4;
 543
 544         v_codes = (p_codes[curidx2] << count2) | sign2;
 545         v_bits  = p_bits[curidx2] + count2;
 546         put_bits(pb, v_bits, v_codes);
 547
 548         if (out || energy) {
 549             float e1,e2,e3,e4;
 550             vec1 = &p_vec[curidx1*2];
 551             vec2 = &p_vec[curidx2*2];
 552             e1 = copysignf(vec1[0] * IQ, in[i+0]);
 553             e2 = copysignf(vec1[1] * IQ, in[i+1]);
 554             e3 = copysignf(vec2[0] * IQ, in[i+2]);
 555             e4 = copysignf(vec2[1] * IQ, in[i+3]);
 556             if (out) {
 557                 out[i+0] = e1;
 558                 out[i+1] = e2;
 559                 out[i+2] = e3;
 560                 out[i+3] = e4;
 561             }
 562             if (energy)
 563                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 564         }
 565     }
 566     if (energy)
 567         *energy = qenergy;
 568 }
 569
 570 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
 571                                                        PutBitContext *pb, const float *in, float *out,
 572                                                        const float *scaled, int size, int scale_idx,
 573                                                        int cb, const float lambda, const float uplim,
 574                                                        int *bits, float *energy, const float ROUNDING)
 575 {
 576     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 577     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 578     int i;
 579     int qc1, qc2, qc3, qc4;
 580     float qenergy = 0.0f;
 581
 582     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
 583     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
 584     float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
 585
 586     abs_pow34_v(s->scoefs, in, size);
 587     scaled = s->scoefs;
 588     for (i = 0; i < size; i += 4) {
 589         int curidx1, curidx2, sign1, count1, sign2, count2;
 590         int *in_int = (int *)&in[i];
 591         uint8_t v_bits;
 592         unsigned int v_codes;
 593         int t0, t1, t2, t3, t4;
 594         const float *vec1, *vec2;
 595
 596         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 597         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 598         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 599         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 600
 601         __asm__ volatile (
 602             ".set push                              \n\t"
 603             ".set noreorder                         \n\t"
 604
 605             "ori    %[t4],      $zero,      12      \n\t"
 606             "ori    %[sign1],   $zero,      0       \n\t"
 607             "ori    %[sign2],   $zero,      0       \n\t"
 608             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
 609             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
 610             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
 611             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
 612             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
 613             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
 614             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
 615             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
 616             "lw     %[t0],      0(%[in_int])        \n\t"
 617             "lw     %[t1],      4(%[in_int])        \n\t"
 618             "lw     %[t2],      8(%[in_int])        \n\t"
 619             "lw     %[t3],      12(%[in_int])       \n\t"
 620             "slt    %[t0],      %[t0],      $zero   \n\t"
 621             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
 622             "slt    %[t2],      %[t2],      $zero   \n\t"
 623             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
 624             "slt    %[t1],      %[t1],      $zero   \n\t"
 625             "sll    %[t0],      %[sign1],   1       \n\t"
 626             "or     %[t0],      %[t0],      %[t1]   \n\t"
 627             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
 628             "slt    %[t3],      %[t3],      $zero   \n\t"
 629             "sll    %[t0],      %[sign2],   1       \n\t"
 630             "or     %[t0],      %[t0],      %[t3]   \n\t"
 631             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
 632             "slt    %[count1],  $zero,      %[qc1]  \n\t"
 633             "slt    %[t1],      $zero,      %[qc2]  \n\t"
 634             "slt    %[count2],  $zero,      %[qc3]  \n\t"
 635             "slt    %[t2],      $zero,      %[qc4]  \n\t"
 636             "addu   %[count1],  %[count1],  %[t1]   \n\t"
 637             "addu   %[count2],  %[count2],  %[t2]   \n\t"
 638
 639             ".set pop                               \n\t"
 640
 641             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 642               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 643               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 644               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 645               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 646               [t4]"=&r"(t4)
 647             : [in_int]"r"(in_int)
 648             : "memory"
 649         );
 650
 651         curidx1  = 13 * qc1;
 652         curidx1 += qc2;
 653
 654         v_codes = (p_codes[curidx1] << count1) | sign1;
 655         v_bits  = p_bits[curidx1] + count1;
 656         put_bits(pb, v_bits, v_codes);
 657
 658         curidx2  = 13 * qc3;
 659         curidx2 += qc4;
 660
 661         v_codes = (p_codes[curidx2] << count2) | sign2;
 662         v_bits  = p_bits[curidx2] + count2;
 663         put_bits(pb, v_bits, v_codes);
 664
 665         if (out || energy) {
 666             float e1,e2,e3,e4;
 667             vec1 = &p_vec[curidx1*2];
 668             vec2 = &p_vec[curidx2*2];
 669             e1 = copysignf(vec1[0] * IQ, in[i+0]);
 670             e2 = copysignf(vec1[1] * IQ, in[i+1]);
 671             e3 = copysignf(vec2[0] * IQ, in[i+2]);
 672             e4 = copysignf(vec2[1] * IQ, in[i+3]);
 673             if (out) {
 674                 out[i+0] = e1;
 675                 out[i+1] = e2;
 676                 out[i+2] = e3;
 677                 out[i+3] = e4;
 678             }
 679             if (energy)
 680                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 681         }
 682     }
 683     if (energy)
 684         *energy = qenergy;
 685 }
 686
     /**
      * Quantize one band, write its codewords to the bitstream and optionally
      * reconstruct it. This is the variant stored at index 11 of
      * quantize_and_encode_band_cost_arr.
      *
      * Coefficients are processed four at a time (two pairs per iteration), so
      * every iteration reads in[i..i+3] and, when requested, writes out[i..i+3].
      *
      * @param s         encoder context; s->scoefs is filled by abs_pow34_v()
      * @param pb        bitstream writer that receives the codewords via put_bits()
      * @param in        input coefficients; their raw bit patterns are also read
      *                  as ints inside the asm to obtain the sign bits
      * @param out       if non-NULL, receives the dequantized coefficients
      * @param scaled    value passed by the caller is ignored: the pointer is
      *                  reassigned to s->scoefs before use
      * @param size      number of coefficients to process
      * @param scale_idx index used for the Q34 and IQ table lookups
      * @param cb        codebook number; the tables of entry cb-1 are used
      * @param lambda    not referenced in this function
      * @param uplim     not referenced in this function
      * @param bits      not referenced in this function
      * @param energy    if non-NULL, receives the sum of the squared dequantized
      *                  coefficients
      * @param ROUNDING  offset added before the float-to-int truncation
      */
 687 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
 688                                                    PutBitContext *pb, const float *in, float *out,
 689                                                    const float *scaled, int size, int scale_idx,
 690                                                    int cb, const float lambda, const float uplim,
 691                                                    int *bits, float *energy, const float ROUNDING)
 692 {
         /* Table lookups driven by scale_idx; presumably the forward quantizer
          * scale and its inverse -- the tables are not visible here. */
 693     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 694     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 695     int i;
 696     int qc1, qc2, qc3, qc4;
 697     float qenergy = 0.0f;
 698
         /* Per-codebook tables: codeword lengths, codewords and value vectors. */
 699     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
 700     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
 701     float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
 702
         /* The caller-supplied 'scaled' is discarded in favour of s->scoefs.
          * Presumably abs_pow34_v() stores |in|^(3/4) there -- helper not
          * visible here, confirm. */
 703     abs_pow34_v(s->scoefs, in, size);
 704     scaled = s->scoefs;
 705
         /* NOTE(review): the dispatch table only routes index 11 to this
          * function, so this first branch looks unreachable through the table --
          * confirm whether any direct caller passes cb < 11. */
 706     if (cb < 11) {
 707         for (i = 0; i < size; i += 4) {
 708             int curidx, curidx2, sign1, count1, sign2, count2;
                 /* float bits reinterpreted as ints so the asm can test the sign bit */
 709             int *in_int = (int *)&in[i];
 710             uint8_t v_bits;
 711             unsigned int v_codes;
 712             int t0, t1, t2, t3, t4;
 713             const float *vec1, *vec2;
 714
 715             qc1 = scaled[i  ] * Q34 + ROUNDING;
 716             qc2 = scaled[i+1] * Q34 + ROUNDING;
 717             qc3 = scaled[i+2] * Q34 + ROUNDING;
 718             qc4 = scaled[i+3] * Q34 + ROUNDING;
 719
 720             __asm__ volatile (
 721                 ".set push                                  \n\t"
 722                 ".set noreorder                             \n\t"
 723
                     /* t4 = 16 (upper limit); both sign accumulators start at 0 */
 724                 "ori        %[t4],      $zero,      16      \n\t"
 725                 "ori        %[sign1],   $zero,      0       \n\t"
 726                 "ori        %[sign2],   $zero,      0       \n\t"
                     /* clamp every qc to at most 16 */
 727                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
 728                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
 729                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
 730                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
 731                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
 732                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
 733                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
 734                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
                     /* load the raw bit patterns of in[i..i+3] */
 735                 "lw         %[t0],      0(%[in_int])        \n\t"
 736                 "lw         %[t1],      4(%[in_int])        \n\t"
 737                 "lw         %[t2],      8(%[in_int])        \n\t"
 738                 "lw         %[t3],      12(%[in_int])       \n\t"
                     /* Build the sign fields: for each non-zero qc, append one bit
                      * that is 1 when the matching input has its sign bit set.
                      * sign1 covers qc1/qc2, sign2 covers qc3/qc4. */
 739                 "slt        %[t0],      %[t0],      $zero   \n\t"
 740                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
 741                 "slt        %[t2],      %[t2],      $zero   \n\t"
 742                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
 743                 "slt        %[t1],      %[t1],      $zero   \n\t"
 744                 "sll        %[t0],      %[sign1],   1       \n\t"
 745                 "or         %[t0],      %[t0],      %[t1]   \n\t"
 746                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
 747                 "slt        %[t3],      %[t3],      $zero   \n\t"
 748                 "sll        %[t0],      %[sign2],   1       \n\t"
 749                 "or         %[t0],      %[t0],      %[t3]   \n\t"
 750                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
                     /* count1/count2 = number of strictly positive values per pair */
 751                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
 752                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
 753                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
 754                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
 755                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
 756                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
 757
 758                 ".set pop                                   \n\t"
 759
 760                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 761                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 762                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 763                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 764                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 765                   [t4]"=&r"(t4)
 766                 : [in_int]"r"(in_int)
 767                 : "memory"
 768             );
 769
                 /* one table index per pair: 17 * first + second */
 770             curidx = 17 * qc1;
 771             curidx += qc2;
 772             curidx2 = 17 * qc3;
 773             curidx2 += qc4;
 774
                 /* codeword followed by the sign bits of the non-zero values */
 775             v_codes = (p_codes[curidx] << count1) | sign1;
 776             v_bits  = p_bits[curidx] + count1;
 777             put_bits(pb, v_bits, v_codes);
 778
 779             v_codes = (p_codes[curidx2] << count2) | sign2;
 780             v_bits  = p_bits[curidx2] + count2;
 781             put_bits(pb, v_bits, v_codes);
 782
 783             if (out || energy) {
 784                 float e1,e2,e3,e4;
                     /* reconstruction: codebook vector entry * IQ, carrying the
                      * sign of the matching input coefficient */
 785                 vec1 = &p_vectors[curidx*2 ];
 786                 vec2 = &p_vectors[curidx2*2];
 787                 e1 = copysignf(vec1[0] * IQ, in[i+0]);
 788                 e2 = copysignf(vec1[1] * IQ, in[i+1]);
 789                 e3 = copysignf(vec2[0] * IQ, in[i+2]);
 790                 e4 = copysignf(vec2[1] * IQ, in[i+3]);
 791                 if (out) {
 792                     out[i+0] = e1;
 793                     out[i+1] = e2;
 794                     out[i+2] = e3;
 795                     out[i+3] = e4;
 796                 }
 797                 if (energy)
 798                     qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 799             }
 800         }
 801     } else {
             /* cb >= 11: same pair coding as above, plus escape codewords for
              * values whose codebook vector entry is 64.0f. */
 802         for (i = 0; i < size; i += 4) {
 803             int curidx, curidx2, sign1, count1, sign2, count2;
                 /* float bits reinterpreted as ints so the asm can test the sign bit */
 804             int *in_int = (int *)&in[i];
 805             uint8_t v_bits;
 806             unsigned int v_codes;
                 /* c1..c4 keep the magnitudes from before the clamp to 16 */
 807             int c1, c2, c3, c4;
 808             int t0, t1, t2, t3, t4;
 809
 810             qc1 = scaled[i  ] * Q34 + ROUNDING;
 811             qc2 = scaled[i+1] * Q34 + ROUNDING;
 812             qc3 = scaled[i+2] * Q34 + ROUNDING;
 813             qc4 = scaled[i+3] * Q34 + ROUNDING;
 814
 815             __asm__ volatile (
 816                 ".set push                                  \n\t"
 817                 ".set noreorder                             \n\t"
 818
                     /* t4 = 16 (upper limit); both sign accumulators start at 0 */
 819                 "ori        %[t4],      $zero,      16      \n\t"
 820                 "ori        %[sign1],   $zero,      0       \n\t"
 821                 "ori        %[sign2],   $zero,      0       \n\t"
                     /* c = (qc << 18, saturating) >> 18, taken before the clamp.
                      * NOTE(review): shll_s.w is the MIPS DSP ASE saturating shift;
                      * for non-negative qc this should cap c at 8191
                      * (0x7FFFFFFF >> 18) -- confirm against the DSP ASE manual. */
 822                 "shll_s.w   %[c1],      %[qc1],     18      \n\t"
 823                 "shll_s.w   %[c2],      %[qc2],     18      \n\t"
 824                 "shll_s.w   %[c3],      %[qc3],     18      \n\t"
 825                 "shll_s.w   %[c4],      %[qc4],     18      \n\t"
 826                 "srl        %[c1],      %[c1],      18      \n\t"
 827                 "srl        %[c2],      %[c2],      18      \n\t"
 828                 "srl        %[c3],      %[c3],      18      \n\t"
 829                 "srl        %[c4],      %[c4],      18      \n\t"
                     /* clamp every qc to at most 16 */
 830                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
 831                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
 832                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
 833                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
 834                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
 835                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
 836                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
 837                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
                     /* load the raw bit patterns of in[i..i+3] */
 838                 "lw         %[t0],      0(%[in_int])        \n\t"
 839                 "lw         %[t1],      4(%[in_int])        \n\t"
 840                 "lw         %[t2],      8(%[in_int])        \n\t"
 841                 "lw         %[t3],      12(%[in_int])       \n\t"
                     /* Build the sign fields: for each non-zero qc, append one bit
                      * that is 1 when the matching input has its sign bit set.
                      * sign1 covers qc1/qc2, sign2 covers qc3/qc4. */
 842                 "slt        %[t0],      %[t0],      $zero   \n\t"
 843                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
 844                 "slt        %[t2],      %[t2],      $zero   \n\t"
 845                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
 846                 "slt        %[t1],      %[t1],      $zero   \n\t"
 847                 "sll        %[t0],      %[sign1],   1       \n\t"
 848                 "or         %[t0],      %[t0],      %[t1]   \n\t"
 849                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
 850                 "slt        %[t3],      %[t3],      $zero   \n\t"
 851                 "sll        %[t0],      %[sign2],   1       \n\t"
 852                 "or         %[t0],      %[t0],      %[t3]   \n\t"
 853                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
                     /* count1/count2 = number of strictly positive values per pair */
 854                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
 855                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
 856                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
 857                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
 858                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
 859                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
 860
 861                 ".set pop                                   \n\t"
 862
 863                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 864                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 865                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 866                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 867                   [c1]"=&r"(c1), [c2]"=&r"(c2),
 868                   [c3]"=&r"(c3), [c4]"=&r"(c4),
 869                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 870                   [t4]"=&r"(t4)
 871                 : [in_int]"r"(in_int)
 872                 : "memory"
 873             );
 874
                 /* one table index per pair: 17 * first + second */
 875             curidx = 17 * qc1;
 876             curidx += qc2;
 877
 878             curidx2 = 17 * qc3;
 879             curidx2 += qc4;
 880
                 /* Bitstream order matters: pair 1 codeword, its escapes,
                  * then pair 2 codeword, its escapes. */
 881             v_codes = (p_codes[curidx] << count1) | sign1;
 882             v_bits  = p_bits[curidx] + count1;
 883             put_bits(pb, v_bits, v_codes);
 884
                 /* Escape codeword: with len = av_log2(c), a (len - 3)-bit prefix
                  * holding (1 << (len - 3)) - 2 is followed by the low len bits
                  * of c, for 2 * len - 3 bits in total. */
 885             if (p_vectors[curidx*2  ] == 64.0f) {
 886                 int len = av_log2(c1);
 887                 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
 888                 put_bits(pb, len * 2 - 3, v_codes);
 889             }
 890             if (p_vectors[curidx*2+1] == 64.0f) {
 891                 int len = av_log2(c2);
 892                 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
 893                 put_bits(pb, len*2-3, v_codes);
 894             }
 895
 896             v_codes = (p_codes[curidx2] << count2) | sign2;
 897             v_bits  = p_bits[curidx2] + count2;
 898             put_bits(pb, v_bits, v_codes);
 899
 900             if (p_vectors[curidx2*2  ] == 64.0f) {
 901                 int len = av_log2(c3);
 902                 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
 903                 put_bits(pb, len* 2 - 3, v_codes);
 904             }
 905             if (p_vectors[curidx2*2+1] == 64.0f) {
 906                 int len = av_log2(c4);
 907                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
 908                 put_bits(pb, len * 2 - 3, v_codes);
 909             }
 910
 911             if (out || energy) {
 912                 float e1, e2, e3, e4;
                     /* reconstruction from the pre-clamp magnitude:
                      * c * cbrt(c) = c^(4/3), scaled by IQ, with the input's sign */
 913                 e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
 914                 e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
 915                 e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
 916                 e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
 917                 if (out) {
 918                     out[i+0] = e1;
 919                     out[i+1] = e2;
 920                     out[i+2] = e3;
 921                     out[i+3] = e4;
 922                 }
 923                 if (energy)
 924                     qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
 925             }
 926         }
 927     }
         /* report the accumulated energy of the reconstructed band, if requested */
 928     if (energy)
 929         *energy = qenergy;
 930 }
 931
 932 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
 933                                                          PutBitContext *pb, const float *in, float *out,
 934                                                          const float *scaled, int size, int scale_idx,
 935                                                          int cb, const float lambda, const float uplim,
 936                                                          int *bits, float *energy, const float ROUNDING) {
 937     av_assert0(0);
 938 }
 939
 940 static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
 941                                                          PutBitContext *pb, const float *in, float *out,
 942                                                          const float *scaled, int size, int scale_idx,
 943                                                          int cb, const float lambda, const float uplim,
 944                                                          int *bits, float *energy, const float ROUNDING) {
 945     int i;
 946     if (bits)
 947         *bits = 0;
 948     if (out) {
 949         for (i = 0; i < size; i += 4) {
 950            out[i  ] = 0.0f;
 951            out[i+1] = 0.0f;
 952            out[i+2] = 0.0f;
 953            out[i+3] = 0.0f;
 954         }
 955     }
 956     if (energy)
 957         *energy = 0.0f;
 958 }
 959
 960 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
 961                                                          PutBitContext *pb, const float *in, float *out,
 962                                                          const float *scaled, int size, int scale_idx,
 963                                                          int cb, const float lambda, const float uplim,
 964                                                          int *bits, float *energy, const float ROUNDING) = {
 965     quantize_and_encode_band_cost_ZERO_mips,
 966     quantize_and_encode_band_cost_SQUAD_mips,
 967     quantize_and_encode_band_cost_SQUAD_mips,
 968     quantize_and_encode_band_cost_UQUAD_mips,
 969     quantize_and_encode_band_cost_UQUAD_mips,
 970     quantize_and_encode_band_cost_SPAIR_mips,
 971     quantize_and_encode_band_cost_SPAIR_mips,
 972     quantize_and_encode_band_cost_UPAIR7_mips,
 973     quantize_and_encode_band_cost_UPAIR7_mips,
 974     quantize_and_encode_band_cost_UPAIR12_mips,
 975     quantize_and_encode_band_cost_UPAIR12_mips,
 976     quantize_and_encode_band_cost_ESC_mips,
 977     quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
 978     quantize_and_encode_band_cost_ZERO_mips,
 979     quantize_and_encode_band_cost_ZERO_mips,
 980     quantize_and_encode_band_cost_ZERO_mips,
 981 };
 982
 983 #define quantize_and_encode_band_cost(                                       \
 984                                 s, pb, in, out, scaled, size, scale_idx, cb, \
 985                                 lambda, uplim, bits, energy, ROUNDING)       \
 986     quantize_and_encode_band_cost_arr[cb](                                   \
 987                                 s, pb, in, out, scaled, size, scale_idx, cb, \
 988                                 lambda, uplim, bits, energy, ROUNDING)
 989
 990 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
 991                                           const float *in, float *out, int size, int scale_idx,
 992                                           int cb, const float lambda, int rtz)
 993 {
 994     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
 995                                   INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
 996 }
 997
 998 /**
 999  * Functions derived from the template function and optimized for computing the number of bits
1000  */
1001 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
1002                                         PutBitContext *pb, const float *in,
1003                                         const float *scaled, int size, int scale_idx,
1004                                         int cb, const float lambda, const float uplim,
1005                                         int *bits)
1006 {
1007     return 0;
1008 }
1009
1010 static float get_band_numbits_NONE_mips(struct AACEncContext *s,
1011                                         PutBitContext *pb, const float *in,
1012                                         const float *scaled, int size, int scale_idx,
1013                                         int cb, const float lambda, const float uplim,
1014                                         int *bits)
1015 {
1016     av_assert0(0);
1017     return 0;
1018 }
1019
     /**
      * Sum the codeword lengths needed for a band when four coefficients
      * share one table entry and each quantized value is reduced to
      * -1, 0 or +1. Nothing is written to the bitstream.
      *
      * @param in        input coefficients; only their sign bits are used
      * @param scaled    scaled coefficients that are multiplied by Q34
      * @param size      number of coefficients; processed four per iteration
      * @param scale_idx index used for the Q34 table lookup
      * @param cb        codebook number; lengths come from entry cb-1
      * @return total of the looked-up lengths, converted to float
      *
      * s, pb, lambda, uplim and bits are not referenced in this function.
      */
1020 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
1021                                          PutBitContext *pb, const float *in,
1022                                          const float *scaled, int size, int scale_idx,
1023                                          int cb, const float lambda, const float uplim,
1024                                          int *bits)
1025 {
1026     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1027     int i;
1028     int qc1, qc2, qc3, qc4;
1029     int curbits = 0;
1030
1031     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1032
1033     for (i = 0; i < size; i += 4) {
1034         int curidx;
             /* float bits reinterpreted as ints so the asm can read the sign bit */
1035         int *in_int = (int *)&in[i];
1036         int t0, t1, t2, t3, t4, t5, t6, t7;
1037
1038         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1039         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1040         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1041         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1042
1043         __asm__ volatile (
1044             ".set push                      \n\t"
1045             ".set noreorder                 \n\t"
1046
                 /* qc = (qc > 0): collapse every value to 0 or 1 */
1047             "slt    %[qc1], $zero,  %[qc1]  \n\t"
1048             "slt    %[qc2], $zero,  %[qc2]  \n\t"
1049             "slt    %[qc3], $zero,  %[qc3]  \n\t"
1050             "slt    %[qc4], $zero,  %[qc4]  \n\t"
                 /* load the raw bit patterns of in[i..i+3] and keep the sign bit */
1051             "lw     %[t0],  0(%[in_int])    \n\t"
1052             "lw     %[t1],  4(%[in_int])    \n\t"
1053             "lw     %[t2],  8(%[in_int])    \n\t"
1054             "lw     %[t3],  12(%[in_int])   \n\t"
1055             "srl    %[t0],  %[t0],  31      \n\t"
1056             "srl    %[t1],  %[t1],  31      \n\t"
1057             "srl    %[t2],  %[t2],  31      \n\t"
1058             "srl    %[t3],  %[t3],  31      \n\t"
                 /* t4..t7 = -qc; selected below where the sign bit was set */
1059             "subu   %[t4],  $zero,  %[qc1]  \n\t"
1060             "subu   %[t5],  $zero,  %[qc2]  \n\t"
1061             "subu   %[t6],  $zero,  %[qc3]  \n\t"
1062             "subu   %[t7],  $zero,  %[qc4]  \n\t"
1063             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1064             "movn   %[qc2], %[t5],  %[t1]   \n\t"
1065             "movn   %[qc3], %[t6],  %[t2]   \n\t"
1066             "movn   %[qc4], %[t7],  %[t3]   \n\t"
1067
1068             ".set pop                       \n\t"
1069
1070             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1071               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1072               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1073               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1074             : [in_int]"r"(in_int)
1075             : "memory"
1076         );
1077
             /* Base-3 index over the four values: 27*qc1 + 9*qc2 + 3*qc3 + qc4.
              * The +40 moves the all -1 case (-40) to index 0. */
1078         curidx = qc1;
1079         curidx *= 3;
1080         curidx += qc2;
1081         curidx *= 3;
1082         curidx += qc3;
1083         curidx *= 3;
1084         curidx += qc4;
1085         curidx += 40;
1086
1087         curbits += p_bits[curidx];
1088     }
1089     return curbits;
1090 }
1091
     /**
      * Sum the bits needed for a band when four coefficients share one table
      * entry and each quantized value is clamped to at most 2. The sign bits
      * are accounted for through the uquad_sign_bits[] table. Nothing is
      * written to the bitstream.
      *
      * @param scaled    scaled coefficients that are multiplied by Q34
      * @param size      number of coefficients; processed four per iteration
      * @param scale_idx index used for the Q34 table lookup
      * @param cb        codebook number; lengths come from entry cb-1
      * @return total of the looked-up lengths, converted to float
      *
      * s, pb, in, lambda, uplim and bits are not referenced in this function.
      */
1092 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
1093                                          PutBitContext *pb, const float *in,
1094                                          const float *scaled, int size, int scale_idx,
1095                                          int cb, const float lambda, const float uplim,
1096                                          int *bits)
1097 {
1098     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1099     int i;
1100     int curbits = 0;
1101     int qc1, qc2, qc3, qc4;
1102
1103     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1104
1105     for (i = 0; i < size; i += 4) {
1106         int curidx;
1107         int t0, t1, t2, t3, t4;
1108
1109         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1110         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1111         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1112         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1113
1114         __asm__ volatile (
1115             ".set push                      \n\t"
1116             ".set noreorder                 \n\t"
1117
                 /* t4 = 2; any qc greater than 2 is replaced by 2 */
1118             "ori    %[t4],  $zero,  2       \n\t"
1119             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1120             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1121             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1122             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1123             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1124             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1125             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1126             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1127
1128             ".set pop                       \n\t"
1129
1130             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1131               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1132               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1133               [t4]"=&r"(t4)
1134         );
1135
             /* base-3 index over the four values: 27*qc1 + 9*qc2 + 3*qc3 + qc4 */
1136         curidx = qc1;
1137         curidx *= 3;
1138         curidx += qc2;
1139         curidx *= 3;
1140         curidx += qc3;
1141         curidx *= 3;
1142         curidx += qc4;
1143
             /* codeword length plus the per-entry sign-bit count from the table */
1144         curbits += p_bits[curidx];
1145         curbits += uquad_sign_bits[curidx];
1146     }
1147     return curbits;
1148 }
1149
     /**
      * Sum the codeword lengths needed for a band when two coefficients share
      * one table entry, each quantized value is clamped to at most 4 and the
      * input's sign is folded into the value. Nothing is written to the
      * bitstream.
      *
      * @param in        input coefficients; only their sign bits are used
      * @param scaled    scaled coefficients that are multiplied by Q34
      * @param size      number of coefficients; two pairs per iteration
      * @param scale_idx index used for the Q34 table lookup
      * @param cb        codebook number; lengths come from entry cb-1
      * @return total of the looked-up lengths, converted to float
      *
      * s, pb, lambda, uplim and bits are not referenced in this function.
      */
1150 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1151                                          PutBitContext *pb, const float *in,
1152                                          const float *scaled, int size, int scale_idx,
1153                                          int cb, const float lambda, const float uplim,
1154                                          int *bits)
1155 {
1156     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1157     int i;
1158     int qc1, qc2, qc3, qc4;
1159     int curbits = 0;
1160
1161     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1162
1163     for (i = 0; i < size; i += 4) {
1164         int curidx, curidx2;
             /* float bits reinterpreted as ints so the asm can read the sign bit */
1165         int *in_int = (int *)&in[i];
1166         int t0, t1, t2, t3, t4, t5, t6, t7;
1167
1168         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1169         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1170         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1171         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1172
1173         __asm__ volatile (
1174             ".set push                      \n\t"
1175             ".set noreorder                 \n\t"
1176
                 /* t4 = 4; any qc greater than 4 is replaced by 4 */
1177             "ori    %[t4],  $zero,  4       \n\t"
1178             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1179             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1180             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1181             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1182             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1183             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1184             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1185             "movn   %[qc4], %[t4],  %[t3]   \n\t"
                 /* load the raw bit patterns of in[i..i+3] and keep the sign bit */
1186             "lw     %[t0],  0(%[in_int])    \n\t"
1187             "lw     %[t1],  4(%[in_int])    \n\t"
1188             "lw     %[t2],  8(%[in_int])    \n\t"
1189             "lw     %[t3],  12(%[in_int])   \n\t"
1190             "srl    %[t0],  %[t0],  31      \n\t"
1191             "srl    %[t1],  %[t1],  31      \n\t"
1192             "srl    %[t2],  %[t2],  31      \n\t"
1193             "srl    %[t3],  %[t3],  31      \n\t"
                 /* t4 is reused here: t4..t7 = -qc, selected below where the
                  * sign bit was set */
1194             "subu   %[t4],  $zero,  %[qc1]  \n\t"
1195             "subu   %[t5],  $zero,  %[qc2]  \n\t"
1196             "subu   %[t6],  $zero,  %[qc3]  \n\t"
1197             "subu   %[t7],  $zero,  %[qc4]  \n\t"
1198             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1199             "movn   %[qc2], %[t5],  %[t1]   \n\t"
1200             "movn   %[qc3], %[t6],  %[t2]   \n\t"
1201             "movn   %[qc4], %[t7],  %[t3]   \n\t"
1202
1203             ".set pop                       \n\t"
1204
1205             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1206               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1207               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1208               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1209             : [in_int]"r"(in_int)
1210             : "memory"
1211         );
1212
             /* Index per pair: 9 * first + second. The +40 moves the
              * (-4, -4) case (-40) to index 0. */
1213         curidx  = 9 * qc1;
1214         curidx += qc2 + 40;
1215
1216         curidx2  = 9 * qc3;
1217         curidx2 += qc4 + 40;
1218
1219         curbits += p_bits[curidx] + p_bits[curidx2];
1220     }
1221     return curbits;
1222 }
1223
     /**
      * Sum the bits needed for a band when two coefficients share one table
      * entry and each quantized value is clamped to at most 7. The sign bits
      * are accounted for through the upair7_sign_bits[] table. Nothing is
      * written to the bitstream.
      *
      * @param scaled    scaled coefficients that are multiplied by Q34
      * @param size      number of coefficients; two pairs per iteration
      * @param scale_idx index used for the Q34 table lookup
      * @param cb        codebook number; lengths come from entry cb-1
      * @return total of the looked-up lengths, converted to float
      *
      * s, pb, in, lambda, uplim and bits are not referenced in this function.
      */
1224 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1225                                           PutBitContext *pb, const float *in,
1226                                           const float *scaled, int size, int scale_idx,
1227                                           int cb, const float lambda, const float uplim,
1228                                           int *bits)
1229 {
1230     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1231     int i;
1232     int qc1, qc2, qc3, qc4;
1233     int curbits = 0;
1234
1235     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1236
1237     for (i = 0; i < size; i += 4) {
1238         int curidx, curidx2;
1239         int t0, t1, t2, t3, t4;
1240
1241         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1242         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1243         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1244         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1245
1246         __asm__ volatile (
1247             ".set push                      \n\t"
1248             ".set noreorder                 \n\t"
1249
                 /* t4 = 7; any qc greater than 7 is replaced by 7 */
1250             "ori    %[t4],  $zero,  7       \n\t"
1251             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1252             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1253             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1254             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1255             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1256             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1257             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1258             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1259
1260             ".set pop                       \n\t"
1261
1262             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1263               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1264               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1265               [t4]"=&r"(t4)
1266         );
1267
             /* index per pair: 8 * first + second */
1268         curidx  = 8 * qc1;
1269         curidx += qc2;
1270
1271         curidx2  = 8 * qc3;
1272         curidx2 += qc4;
1273
             /* codeword length plus the per-entry sign-bit count, for both pairs */
1274         curbits += p_bits[curidx] +
1275                    upair7_sign_bits[curidx] +
1276                    p_bits[curidx2] +
1277                    upair7_sign_bits[curidx2];
1278     }
1279     return curbits;
1280 }
1281
     /**
      * Sum the bits needed for a band when two coefficients share one table
      * entry and each quantized value is clamped to at most 12. The sign bits
      * are accounted for through the upair12_sign_bits[] table. Nothing is
      * written to the bitstream.
      *
      * @param scaled    scaled coefficients that are multiplied by Q34
      * @param size      number of coefficients; two pairs per iteration
      * @param scale_idx index used for the Q34 table lookup
      * @param cb        codebook number; lengths come from entry cb-1
      * @return total of the looked-up lengths, converted to float
      *
      * s, pb, in, lambda, uplim and bits are not referenced in this function.
      */
1282 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1283                                            PutBitContext *pb, const float *in,
1284                                            const float *scaled, int size, int scale_idx,
1285                                            int cb, const float lambda, const float uplim,
1286                                            int *bits)
1287 {
1288     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1289     int i;
1290     int qc1, qc2, qc3, qc4;
1291     int curbits = 0;
1292
1293     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1294
1295     for (i = 0; i < size; i += 4) {
1296         int curidx, curidx2;
1297         int t0, t1, t2, t3, t4;
1298
1299         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1300         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1301         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1302         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1303
1304         __asm__ volatile (
1305             ".set push                      \n\t"
1306             ".set noreorder                 \n\t"
1307
                 /* t4 = 12; any qc greater than 12 is replaced by 12 */
1308             "ori    %[t4],  $zero,  12      \n\t"
1309             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1310             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1311             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1312             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1313             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1314             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1315             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1316             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1317
1318             ".set pop                       \n\t"
1319
1320             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1321               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1322               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1323               [t4]"=&r"(t4)
1324         );
1325
             /* index per pair: 13 * first + second */
1326         curidx  = 13 * qc1;
1327         curidx += qc2;
1328
1329         curidx2  = 13 * qc3;
1330         curidx2 += qc4;
1331
             /* codeword length plus the per-entry sign-bit count, for both pairs */
1332         curbits += p_bits[curidx] +
1333                    p_bits[curidx2] +
1334                    upair12_sign_bits[curidx] +
1335                    upair12_sign_bits[curidx2];
1336     }
1337     return curbits;
1338 }
1339
1340 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1341                                        PutBitContext *pb, const float *in,
1342                                        const float *scaled, int size, int scale_idx,
1343                                        int cb, const float lambda, const float uplim,
1344                                        int *bits)
1345 {
1346     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1347     int i;
1348     int qc1, qc2, qc3, qc4;
1349     int curbits = 0;
1350
1351     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1352
1353     for (i = 0; i < size; i += 4) {
1354         int curidx, curidx2;
1355         int cond0, cond1, cond2, cond3;
1356         int c1, c2, c3, c4;
1357         int t4, t5;
1358
1359         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1360         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1361         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1362         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1363
1364         __asm__ volatile (
1365             ".set push                                  \n\t"
1366             ".set noreorder                             \n\t"
1367
1368             "ori        %[t4],      $zero,  15          \n\t"
1369             "ori        %[t5],      $zero,  16          \n\t"
1370             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1371             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1372             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1373             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1374             "srl        %[c1],      %[c1],  18          \n\t"
1375             "srl        %[c2],      %[c2],  18          \n\t"
1376             "srl        %[c3],      %[c3],  18          \n\t"
1377             "srl        %[c4],      %[c4],  18          \n\t"
1378             "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
1379             "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
1380             "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
1381             "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
1382             "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
1383             "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
1384             "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
1385             "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
1386             "ori        %[t5],      $zero,  31          \n\t"
1387             "clz        %[c1],      %[c1]               \n\t"
1388             "clz        %[c2],      %[c2]               \n\t"
1389             "clz        %[c3],      %[c3]               \n\t"
1390             "clz        %[c4],      %[c4]               \n\t"
1391             "subu       %[c1],      %[t5],  %[c1]       \n\t"
1392             "subu       %[c2],      %[t5],  %[c2]       \n\t"
1393             "subu       %[c3],      %[t5],  %[c3]       \n\t"
1394             "subu       %[c4],      %[t5],  %[c4]       \n\t"
1395             "sll        %[c1],      %[c1],  1           \n\t"
1396             "sll        %[c2],      %[c2],  1           \n\t"
1397             "sll        %[c3],      %[c3],  1           \n\t"
1398             "sll        %[c4],      %[c4],  1           \n\t"
1399             "addiu      %[c1],      %[c1],  -3          \n\t"
1400             "addiu      %[c2],      %[c2],  -3          \n\t"
1401             "addiu      %[c3],      %[c3],  -3          \n\t"
1402             "addiu      %[c4],      %[c4],  -3          \n\t"
1403             "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1404             "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1405             "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1406             "subu       %[cond3],   $zero,  %[cond3]    \n\t"
1407             "and        %[c1],      %[c1],  %[cond0]    \n\t"
1408             "and        %[c2],      %[c2],  %[cond1]    \n\t"
1409             "and        %[c3],      %[c3],  %[cond2]    \n\t"
1410             "and        %[c4],      %[c4],  %[cond3]    \n\t"
1411
1412             ".set pop                                   \n\t"
1413
1414             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1415               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1416               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1417               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1418               [c1]"=&r"(c1), [c2]"=&r"(c2),
1419               [c3]"=&r"(c3), [c4]"=&r"(c4),
1420               [t4]"=&r"(t4), [t5]"=&r"(t5)
1421         );
1422
1423         curidx = 17 * qc1;
1424         curidx += qc2;
1425
1426         curidx2 = 17 * qc3;
1427         curidx2 += qc4;
1428
1429         curbits += p_bits[curidx];
1430         curbits += esc_sign_bits[curidx];
1431         curbits += p_bits[curidx2];
1432         curbits += esc_sign_bits[curidx2];
1433
1434         curbits += c1;
1435         curbits += c2;
1436         curbits += c3;
1437         curbits += c4;
1438     }
1439     return curbits;
1440 }
1441
1442 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1443                                              PutBitContext *pb, const float *in,
1444                                              const float *scaled, int size, int scale_idx,
1445                                              int cb, const float lambda, const float uplim,
1446                                              int *bits) = {
1447     get_band_numbits_ZERO_mips,
1448     get_band_numbits_SQUAD_mips,
1449     get_band_numbits_SQUAD_mips,
1450     get_band_numbits_UQUAD_mips,
1451     get_band_numbits_UQUAD_mips,
1452     get_band_numbits_SPAIR_mips,
1453     get_band_numbits_SPAIR_mips,
1454     get_band_numbits_UPAIR7_mips,
1455     get_band_numbits_UPAIR7_mips,
1456     get_band_numbits_UPAIR12_mips,
1457     get_band_numbits_UPAIR12_mips,
1458     get_band_numbits_ESC_mips,
1459     get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
1460     get_band_numbits_ZERO_mips,
1461     get_band_numbits_ZERO_mips,
1462     get_band_numbits_ZERO_mips,
1463 };
1464
/*
 * Call the bit-count routine registered for codebook cb in
 * get_band_numbits_arr[], forwarding every argument unchanged.
 * cb appears twice in the expansion (table index and call argument), so the
 * expression passed for it must not have side effects.
 */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,            \
                         lambda, uplim, bits)                               \
    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, cb,        \
                             lambda, uplim, bits)
1471
/**
 * Bit-count-only variant of the band cost: forwards to the per-codebook
 * routine selected by get_band_numbits() with a NULL PutBitContext.
 *
 * energy and rtz are accepted for signature compatibility but are not used
 * here; bits is handed to the selected routine as-is.
 *
 * @return the value returned by the selected get_band_numbits_*() routine
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits, float *energy, int rtz)
{
    const float numbits = get_band_numbits(s, NULL, in, scaled, size,
                                           scale_idx, cb, lambda, uplim, bits);

    return numbits;
}
1479
1480 /**
1481  * Functions developed from template function and optimized for getting the band cost
1482  */
1483 #if HAVE_MIPSFPU
1484 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1485                                      PutBitContext *pb, const float *in,
1486                                      const float *scaled, int size, int scale_idx,
1487                                      int cb, const float lambda, const float uplim,
1488                                      int *bits, float *energy)
1489 {
1490     int i;
1491     float cost = 0;
1492
1493     for (i = 0; i < size; i += 4) {
1494         cost += in[i  ] * in[i  ];
1495         cost += in[i+1] * in[i+1];
1496         cost += in[i+2] * in[i+2];
1497         cost += in[i+3] * in[i+3];
1498     }
1499     if (bits)
1500         *bits = 0;
1501     if (energy)
1502         *energy = 0.0f;
1503     return cost * lambda;
1504 }
1505
1506 static float get_band_cost_NONE_mips(struct AACEncContext *s,
1507                                      PutBitContext *pb, const float *in,
1508                                      const float *scaled, int size, int scale_idx,
1509                                      int cb, const float lambda, const float uplim,
1510                                      int *bits, float *energy)
1511 {
1512     av_assert0(0);
1513     return 0;
1514 }
1515
1516 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1517                                       PutBitContext *pb, const float *in,
1518                                       const float *scaled, int size, int scale_idx,
1519                                       int cb, const float lambda, const float uplim,
1520                                       int *bits, float *energy)
1521 {
1522     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1523     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1524     int i;
1525     float cost = 0;
1526     float qenergy = 0.0f;
1527     int qc1, qc2, qc3, qc4;
1528     int curbits = 0;
1529
1530     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1531     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1532
1533     for (i = 0; i < size; i += 4) {
1534         const float *vec;
1535         int curidx;
1536         int   *in_int = (int   *)&in[i];
1537         float *in_pos = (float *)&in[i];
1538         float di0, di1, di2, di3;
1539         int t0, t1, t2, t3, t4, t5, t6, t7;
1540
1541         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1542         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1543         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1544         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1545
1546         __asm__ volatile (
1547             ".set push                                  \n\t"
1548             ".set noreorder                             \n\t"
1549
1550             "slt        %[qc1], $zero,  %[qc1]          \n\t"
1551             "slt        %[qc2], $zero,  %[qc2]          \n\t"
1552             "slt        %[qc3], $zero,  %[qc3]          \n\t"
1553             "slt        %[qc4], $zero,  %[qc4]          \n\t"
1554             "lw         %[t0],  0(%[in_int])            \n\t"
1555             "lw         %[t1],  4(%[in_int])            \n\t"
1556             "lw         %[t2],  8(%[in_int])            \n\t"
1557             "lw         %[t3],  12(%[in_int])           \n\t"
1558             "srl        %[t0],  %[t0],  31              \n\t"
1559             "srl        %[t1],  %[t1],  31              \n\t"
1560             "srl        %[t2],  %[t2],  31              \n\t"
1561             "srl        %[t3],  %[t3],  31              \n\t"
1562             "subu       %[t4],  $zero,  %[qc1]          \n\t"
1563             "subu       %[t5],  $zero,  %[qc2]          \n\t"
1564             "subu       %[t6],  $zero,  %[qc3]          \n\t"
1565             "subu       %[t7],  $zero,  %[qc4]          \n\t"
1566             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1567             "movn       %[qc2], %[t5],  %[t1]           \n\t"
1568             "movn       %[qc3], %[t6],  %[t2]           \n\t"
1569             "movn       %[qc4], %[t7],  %[t3]           \n\t"
1570
1571             ".set pop                                   \n\t"
1572
1573             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1574               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1575               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1576               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1577             : [in_int]"r"(in_int)
1578             : "memory"
1579         );
1580
1581         curidx = qc1;
1582         curidx *= 3;
1583         curidx += qc2;
1584         curidx *= 3;
1585         curidx += qc3;
1586         curidx *= 3;
1587         curidx += qc4;
1588         curidx += 40;
1589
1590         curbits += p_bits[curidx];
1591         vec     = &p_codes[curidx*4];
1592
1593         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1594                 +  vec[2]*vec[2] + vec[3]*vec[3];
1595
1596         __asm__ volatile (
1597             ".set push                                  \n\t"
1598             ".set noreorder                             \n\t"
1599
1600             "lwc1       $f0,    0(%[in_pos])            \n\t"
1601             "lwc1       $f1,    0(%[vec])               \n\t"
1602             "lwc1       $f2,    4(%[in_pos])            \n\t"
1603             "lwc1       $f3,    4(%[vec])               \n\t"
1604             "lwc1       $f4,    8(%[in_pos])            \n\t"
1605             "lwc1       $f5,    8(%[vec])               \n\t"
1606             "lwc1       $f6,    12(%[in_pos])           \n\t"
1607             "lwc1       $f7,    12(%[vec])              \n\t"
1608             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1609             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1610             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1611             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1612
1613             ".set pop                                   \n\t"
1614
1615             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1616               [di2]"=&f"(di2), [di3]"=&f"(di3)
1617             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1618               [IQ]"f"(IQ)
1619             : "$f0", "$f1", "$f2", "$f3",
1620               "$f4", "$f5", "$f6", "$f7",
1621               "memory"
1622         );
1623
1624         cost += di0 * di0 + di1 * di1
1625                 + di2 * di2 + di3 * di3;
1626     }
1627
1628     if (bits)
1629         *bits = curbits;
1630     if (energy)
1631         *energy = qenergy * (IQ*IQ);
1632     return cost * lambda + curbits;
1633 }
1634
1635 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1636                                       PutBitContext *pb, const float *in,
1637                                       const float *scaled, int size, int scale_idx,
1638                                       int cb, const float lambda, const float uplim,
1639                                       int *bits, float *energy)
1640 {
1641     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1642     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1643     int i;
1644     float cost = 0;
1645     float qenergy = 0.0f;
1646     int curbits = 0;
1647     int qc1, qc2, qc3, qc4;
1648
1649     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1650     float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1651
1652     for (i = 0; i < size; i += 4) {
1653         const float *vec;
1654         int curidx;
1655         float *in_pos = (float *)&in[i];
1656         float di0, di1, di2, di3;
1657         int t0, t1, t2, t3, t4;
1658
1659         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1660         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1661         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1662         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1663
1664         __asm__ volatile (
1665             ".set push                                  \n\t"
1666             ".set noreorder                             \n\t"
1667
1668             "ori        %[t4],  $zero,  2               \n\t"
1669             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1670             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1671             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1672             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1673             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1674             "movn       %[qc2], %[t4],  %[t1]           \n\t"
1675             "movn       %[qc3], %[t4],  %[t2]           \n\t"
1676             "movn       %[qc4], %[t4],  %[t3]           \n\t"
1677
1678             ".set pop                                   \n\t"
1679
1680             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1681               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1682               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1683               [t4]"=&r"(t4)
1684         );
1685
1686         curidx = qc1;
1687         curidx *= 3;
1688         curidx += qc2;
1689         curidx *= 3;
1690         curidx += qc3;
1691         curidx *= 3;
1692         curidx += qc4;
1693
1694         curbits += p_bits[curidx];
1695         curbits += uquad_sign_bits[curidx];
1696         vec     = &p_codes[curidx*4];
1697
1698         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1699                 +  vec[2]*vec[2] + vec[3]*vec[3];
1700
1701         __asm__ volatile (
1702             ".set push                                  \n\t"
1703             ".set noreorder                             \n\t"
1704
1705             "lwc1       %[di0], 0(%[in_pos])            \n\t"
1706             "lwc1       %[di1], 4(%[in_pos])            \n\t"
1707             "lwc1       %[di2], 8(%[in_pos])            \n\t"
1708             "lwc1       %[di3], 12(%[in_pos])           \n\t"
1709             "abs.s      %[di0], %[di0]                  \n\t"
1710             "abs.s      %[di1], %[di1]                  \n\t"
1711             "abs.s      %[di2], %[di2]                  \n\t"
1712             "abs.s      %[di3], %[di3]                  \n\t"
1713             "lwc1       $f0,    0(%[vec])               \n\t"
1714             "lwc1       $f1,    4(%[vec])               \n\t"
1715             "lwc1       $f2,    8(%[vec])               \n\t"
1716             "lwc1       $f3,    12(%[vec])              \n\t"
1717             "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1718             "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1719             "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1720             "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1721
1722             ".set pop                                   \n\t"
1723
1724             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1725               [di2]"=&f"(di2), [di3]"=&f"(di3)
1726             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1727               [IQ]"f"(IQ)
1728             : "$f0", "$f1", "$f2", "$f3",
1729               "memory"
1730         );
1731
1732         cost += di0 * di0 + di1 * di1
1733                 + di2 * di2 + di3 * di3;
1734     }
1735
1736     if (bits)
1737         *bits = curbits;
1738     if (energy)
1739         *energy = qenergy * (IQ*IQ);
1740     return cost * lambda + curbits;
1741 }
1742
1743 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1744                                       PutBitContext *pb, const float *in,
1745                                       const float *scaled, int size, int scale_idx,
1746                                       int cb, const float lambda, const float uplim,
1747                                       int *bits, float *energy)
1748 {
1749     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1750     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1751     int i;
1752     float cost = 0;
1753     float qenergy = 0.0f;
1754     int qc1, qc2, qc3, qc4;
1755     int curbits = 0;
1756
1757     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1758     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1759
1760     for (i = 0; i < size; i += 4) {
1761         const float *vec, *vec2;
1762         int curidx, curidx2;
1763         int   *in_int = (int   *)&in[i];
1764         float *in_pos = (float *)&in[i];
1765         float di0, di1, di2, di3;
1766         int t0, t1, t2, t3, t4, t5, t6, t7;
1767
1768         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1769         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1770         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1771         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1772
1773         __asm__ volatile (
1774             ".set push                                  \n\t"
1775             ".set noreorder                             \n\t"
1776
1777             "ori        %[t4],  $zero,  4               \n\t"
1778             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1779             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1780             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1781             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1782             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1783             "movn       %[qc2], %[t4],  %[t1]           \n\t"
1784             "movn       %[qc3], %[t4],  %[t2]           \n\t"
1785             "movn       %[qc4], %[t4],  %[t3]           \n\t"
1786             "lw         %[t0],  0(%[in_int])            \n\t"
1787             "lw         %[t1],  4(%[in_int])            \n\t"
1788             "lw         %[t2],  8(%[in_int])            \n\t"
1789             "lw         %[t3],  12(%[in_int])           \n\t"
1790             "srl        %[t0],  %[t0],  31              \n\t"
1791             "srl        %[t1],  %[t1],  31              \n\t"
1792             "srl        %[t2],  %[t2],  31              \n\t"
1793             "srl        %[t3],  %[t3],  31              \n\t"
1794             "subu       %[t4],  $zero,  %[qc1]          \n\t"
1795             "subu       %[t5],  $zero,  %[qc2]          \n\t"
1796             "subu       %[t6],  $zero,  %[qc3]          \n\t"
1797             "subu       %[t7],  $zero,  %[qc4]          \n\t"
1798             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1799             "movn       %[qc2], %[t5],  %[t1]           \n\t"
1800             "movn       %[qc3], %[t6],  %[t2]           \n\t"
1801             "movn       %[qc4], %[t7],  %[t3]           \n\t"
1802
1803             ".set pop                                   \n\t"
1804
1805             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1806               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1807               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1808               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1809             : [in_int]"r"(in_int)
1810             : "memory"
1811         );
1812
1813         curidx = 9 * qc1;
1814         curidx += qc2 + 40;
1815
1816         curidx2 = 9 * qc3;
1817         curidx2 += qc4 + 40;
1818
1819         curbits += p_bits[curidx];
1820         curbits += p_bits[curidx2];
1821
1822         vec     = &p_codes[curidx*2];
1823         vec2    = &p_codes[curidx2*2];
1824
1825         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1826                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
1827
1828         __asm__ volatile (
1829             ".set push                                  \n\t"
1830             ".set noreorder                             \n\t"
1831
1832             "lwc1       $f0,    0(%[in_pos])            \n\t"
1833             "lwc1       $f1,    0(%[vec])               \n\t"
1834             "lwc1       $f2,    4(%[in_pos])            \n\t"
1835             "lwc1       $f3,    4(%[vec])               \n\t"
1836             "lwc1       $f4,    8(%[in_pos])            \n\t"
1837             "lwc1       $f5,    0(%[vec2])              \n\t"
1838             "lwc1       $f6,    12(%[in_pos])           \n\t"
1839             "lwc1       $f7,    4(%[vec2])              \n\t"
1840             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1841             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1842             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1843             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1844
1845             ".set pop                                   \n\t"
1846
1847             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1848               [di2]"=&f"(di2), [di3]"=&f"(di3)
1849             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1850               [vec2]"r"(vec2), [IQ]"f"(IQ)
1851             : "$f0", "$f1", "$f2", "$f3",
1852               "$f4", "$f5", "$f6", "$f7",
1853               "memory"
1854         );
1855
1856         cost += di0 * di0 + di1 * di1
1857                 + di2 * di2 + di3 * di3;
1858     }
1859
1860     if (bits)
1861         *bits = curbits;
1862     if (energy)
1863         *energy = qenergy * (IQ*IQ);
1864     return cost * lambda + curbits;
1865 }
1866
1867 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1868                                        PutBitContext *pb, const float *in,
1869                                        const float *scaled, int size, int scale_idx,
1870                                        int cb, const float lambda, const float uplim,
1871                                        int *bits, float *energy)
1872 {
1873     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1874     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1875     int i;
1876     float cost = 0;
1877     float qenergy = 0.0f;
1878     int qc1, qc2, qc3, qc4;
1879     int curbits = 0;
1880
1881     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1882     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1883
1884     for (i = 0; i < size; i += 4) {
1885         const float *vec, *vec2;
1886         int curidx, curidx2, sign1, count1, sign2, count2;
1887         int   *in_int = (int   *)&in[i];
1888         float *in_pos = (float *)&in[i];
1889         float di0, di1, di2, di3;
1890         int t0, t1, t2, t3, t4;
1891
1892         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1893         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1894         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1895         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1896
1897         __asm__ volatile (
1898             ".set push                                          \n\t"
1899             ".set noreorder                                     \n\t"
1900
1901             "ori        %[t4],      $zero,      7               \n\t"
1902             "ori        %[sign1],   $zero,      0               \n\t"
1903             "ori        %[sign2],   $zero,      0               \n\t"
1904             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
1905             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
1906             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
1907             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
1908             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
1909             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
1910             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
1911             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
1912             "lw         %[t0],      0(%[in_int])                \n\t"
1913             "lw         %[t1],      4(%[in_int])                \n\t"
1914             "lw         %[t2],      8(%[in_int])                \n\t"
1915             "lw         %[t3],      12(%[in_int])               \n\t"
1916             "slt        %[t0],      %[t0],      $zero           \n\t"
1917             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
1918             "slt        %[t2],      %[t2],      $zero           \n\t"
1919             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
1920             "slt        %[t1],      %[t1],      $zero           \n\t"
1921             "sll        %[t0],      %[sign1],   1               \n\t"
1922             "or         %[t0],      %[t0],      %[t1]           \n\t"
1923             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
1924             "slt        %[t3],      %[t3],      $zero           \n\t"
1925             "sll        %[t0],      %[sign2],   1               \n\t"
1926             "or         %[t0],      %[t0],      %[t3]           \n\t"
1927             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
1928             "slt        %[count1],  $zero,      %[qc1]          \n\t"
1929             "slt        %[t1],      $zero,      %[qc2]          \n\t"
1930             "slt        %[count2],  $zero,      %[qc3]          \n\t"
1931             "slt        %[t2],      $zero,      %[qc4]          \n\t"
1932             "addu       %[count1],  %[count1],  %[t1]           \n\t"
1933             "addu       %[count2],  %[count2],  %[t2]           \n\t"
1934
1935             ".set pop                                           \n\t"
1936
1937             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1938               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1939               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1940               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1941               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1942               [t4]"=&r"(t4)
1943             : [in_int]"r"(in_int)
1944             : "memory"
1945         );
1946
1947         curidx = 8 * qc1;
1948         curidx += qc2;
1949
1950         curidx2 = 8 * qc3;
1951         curidx2 += qc4;
1952
1953         curbits += p_bits[curidx];
1954         curbits += upair7_sign_bits[curidx];
1955         vec     = &p_codes[curidx*2];
1956
1957         curbits += p_bits[curidx2];
1958         curbits += upair7_sign_bits[curidx2];
1959         vec2    = &p_codes[curidx2*2];
1960
1961         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
1962                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
1963
1964         __asm__ volatile (
1965             ".set push                                          \n\t"
1966             ".set noreorder                                     \n\t"
1967
1968             "lwc1       %[di0],     0(%[in_pos])                \n\t"
1969             "lwc1       %[di1],     4(%[in_pos])                \n\t"
1970             "lwc1       %[di2],     8(%[in_pos])                \n\t"
1971             "lwc1       %[di3],     12(%[in_pos])               \n\t"
1972             "abs.s      %[di0],     %[di0]                      \n\t"
1973             "abs.s      %[di1],     %[di1]                      \n\t"
1974             "abs.s      %[di2],     %[di2]                      \n\t"
1975             "abs.s      %[di3],     %[di3]                      \n\t"
1976             "lwc1       $f0,        0(%[vec])                   \n\t"
1977             "lwc1       $f1,        4(%[vec])                   \n\t"
1978             "lwc1       $f2,        0(%[vec2])                  \n\t"
1979             "lwc1       $f3,        4(%[vec2])                  \n\t"
1980             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1981             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1982             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1983             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1984
1985             ".set pop                                           \n\t"
1986
1987             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1988               [di2]"=&f"(di2), [di3]"=&f"(di3)
1989             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1990               [vec2]"r"(vec2), [IQ]"f"(IQ)
1991             : "$f0", "$f1", "$f2", "$f3",
1992               "memory"
1993         );
1994
1995         cost += di0 * di0 + di1 * di1
1996                 + di2 * di2 + di3 * di3;
1997     }
1998
1999     if (bits)
2000         *bits = curbits;
2001     if (energy)
2002         *energy = qenergy * (IQ*IQ);
2003     return cost * lambda + curbits;
2004 }
2005
2006 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
2007                                         PutBitContext *pb, const float *in,
2008                                         const float *scaled, int size, int scale_idx,
2009                                         int cb, const float lambda, const float uplim,
2010                                         int *bits, float *energy)
2011 {
2012     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
2013     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
2014     int i;
2015     float cost = 0;
2016     float qenergy = 0.0f;
2017     int qc1, qc2, qc3, qc4;
2018     int curbits = 0;
2019
2020     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
2021     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
2022
2023     for (i = 0; i < size; i += 4) {
2024         const float *vec, *vec2;
2025         int curidx, curidx2;
2026         int sign1, count1, sign2, count2;
2027         int   *in_int = (int   *)&in[i];
2028         float *in_pos = (float *)&in[i];
2029         float di0, di1, di2, di3;
2030         int t0, t1, t2, t3, t4;
2031
2032         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
2033         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
2034         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
2035         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
2036
2037         __asm__ volatile (
2038             ".set push                                          \n\t"
2039             ".set noreorder                                     \n\t"
2040
2041             "ori        %[t4],      $zero,      12              \n\t"
2042             "ori        %[sign1],   $zero,      0               \n\t"
2043             "ori        %[sign2],   $zero,      0               \n\t"
2044             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
2045             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
2046             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
2047             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
2048             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
2049             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
2050             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
2051             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
2052             "lw         %[t0],      0(%[in_int])                \n\t"
2053             "lw         %[t1],      4(%[in_int])                \n\t"
2054             "lw         %[t2],      8(%[in_int])                \n\t"
2055             "lw         %[t3],      12(%[in_int])               \n\t"
2056             "slt        %[t0],      %[t0],      $zero           \n\t"
2057             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
2058             "slt        %[t2],      %[t2],      $zero           \n\t"
2059             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
2060             "slt        %[t1],      %[t1],      $zero           \n\t"
2061             "sll        %[t0],      %[sign1],   1               \n\t"
2062             "or         %[t0],      %[t0],      %[t1]           \n\t"
2063             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
2064             "slt        %[t3],      %[t3],      $zero           \n\t"
2065             "sll        %[t0],      %[sign2],   1               \n\t"
2066             "or         %[t0],      %[t0],      %[t3]           \n\t"
2067             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
2068             "slt        %[count1],  $zero,      %[qc1]          \n\t"
2069             "slt        %[t1],      $zero,      %[qc2]          \n\t"
2070             "slt        %[count2],  $zero,      %[qc3]          \n\t"
2071             "slt        %[t2],      $zero,      %[qc4]          \n\t"
2072             "addu       %[count1],  %[count1],  %[t1]           \n\t"
2073             "addu       %[count2],  %[count2],  %[t2]           \n\t"
2074
2075             ".set pop                                           \n\t"
2076
2077             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2078               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2079               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
2080               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
2081               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
2082               [t4]"=&r"(t4)
2083             : [in_int]"r"(in_int)
2084             : "memory"
2085         );
2086
2087         curidx = 13 * qc1;
2088         curidx += qc2;
2089
2090         curidx2 = 13 * qc3;
2091         curidx2 += qc4;
2092
2093         curbits += p_bits[curidx];
2094         curbits += p_bits[curidx2];
2095         curbits += upair12_sign_bits[curidx];
2096         curbits += upair12_sign_bits[curidx2];
2097         vec     = &p_codes[curidx*2];
2098         vec2    = &p_codes[curidx2*2];
2099
2100         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
2101                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
2102
2103         __asm__ volatile (
2104             ".set push                                          \n\t"
2105             ".set noreorder                                     \n\t"
2106
2107             "lwc1       %[di0],     0(%[in_pos])                \n\t"
2108             "lwc1       %[di1],     4(%[in_pos])                \n\t"
2109             "lwc1       %[di2],     8(%[in_pos])                \n\t"
2110             "lwc1       %[di3],     12(%[in_pos])               \n\t"
2111             "abs.s      %[di0],     %[di0]                      \n\t"
2112             "abs.s      %[di1],     %[di1]                      \n\t"
2113             "abs.s      %[di2],     %[di2]                      \n\t"
2114             "abs.s      %[di3],     %[di3]                      \n\t"
2115             "lwc1       $f0,        0(%[vec])                   \n\t"
2116             "lwc1       $f1,        4(%[vec])                   \n\t"
2117             "lwc1       $f2,        0(%[vec2])                  \n\t"
2118             "lwc1       $f3,        4(%[vec2])                  \n\t"
2119             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
2120             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
2121             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
2122             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
2123
2124             ".set pop                                           \n\t"
2125
2126             : [di0]"=&f"(di0), [di1]"=&f"(di1),
2127               [di2]"=&f"(di2), [di3]"=&f"(di3)
2128             : [in_pos]"r"(in_pos), [vec]"r"(vec),
2129               [vec2]"r"(vec2), [IQ]"f"(IQ)
2130             : "$f0", "$f1", "$f2", "$f3",
2131               "memory"
2132         );
2133
2134         cost += di0 * di0 + di1 * di1
2135                 + di2 * di2 + di3 * di3;
2136     }
2137
2138     if (bits)
2139         *bits = curbits;
2140     if (energy)
2141         *energy = qenergy * (IQ*IQ);
2142     return cost * lambda + curbits;
2143 }
2144
2145 static float get_band_cost_ESC_mips(struct AACEncContext *s,
2146                                     PutBitContext *pb, const float *in,
2147                                     const float *scaled, int size, int scale_idx,
2148                                     int cb, const float lambda, const float uplim,
2149                                     int *bits, float *energy)
2150 {
2151     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
2152     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
2153     const float CLIPPED_ESCAPE = 165140.0f * IQ;
2154     int i;
2155     float cost = 0;
2156     float qenergy = 0.0f;
2157     int qc1, qc2, qc3, qc4;
2158     int curbits = 0;
2159
2160     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
2161     float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
2162
2163     for (i = 0; i < size; i += 4) {
2164         const float *vec, *vec2;
2165         int curidx, curidx2;
2166         float t1, t2, t3, t4, V;
2167         float di1, di2, di3, di4;
2168         int cond0, cond1, cond2, cond3;
2169         int c1, c2, c3, c4;
2170         int t6, t7;
2171
2172         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
2173         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
2174         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
2175         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
2176
2177         __asm__ volatile (
2178             ".set push                                  \n\t"
2179             ".set noreorder                             \n\t"
2180
2181             "ori        %[t6],      $zero,  15          \n\t"
2182             "ori        %[t7],      $zero,  16          \n\t"
2183             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
2184             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
2185             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
2186             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
2187             "srl        %[c1],      %[c1],  18          \n\t"
2188             "srl        %[c2],      %[c2],  18          \n\t"
2189             "srl        %[c3],      %[c3],  18          \n\t"
2190             "srl        %[c4],      %[c4],  18          \n\t"
2191             "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
2192             "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
2193             "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
2194             "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
2195             "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
2196             "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
2197             "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
2198             "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
2199
2200             ".set pop                                   \n\t"
2201
2202             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2203               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2204               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2205               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2206               [c1]"=&r"(c1), [c2]"=&r"(c2),
2207               [c3]"=&r"(c3), [c4]"=&r"(c4),
2208               [t6]"=&r"(t6), [t7]"=&r"(t7)
2209         );
2210
2211         curidx = 17 * qc1;
2212         curidx += qc2;
2213
2214         curidx2 = 17 * qc3;
2215         curidx2 += qc4;
2216
2217         curbits += p_bits[curidx];
2218         curbits += esc_sign_bits[curidx];
2219         vec     = &p_codes[curidx*2];
2220
2221         curbits += p_bits[curidx2];
2222         curbits += esc_sign_bits[curidx2];
2223         vec2     = &p_codes[curidx2*2];
2224
2225         curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2226         curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2227         curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2228         curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2229
2230         t1 = fabsf(in[i  ]);
2231         t2 = fabsf(in[i+1]);
2232         t3 = fabsf(in[i+2]);
2233         t4 = fabsf(in[i+3]);
2234
2235         if (cond0) {
2236             if (t1 >= CLIPPED_ESCAPE) {
2237                 di1 = t1 - CLIPPED_ESCAPE;
2238                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2239             } else {
2240                 di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
2241                 qenergy += V*V;
2242             }
2243         } else {
2244             di1 = t1 - (V = vec[0] * IQ);
2245             qenergy += V*V;
2246         }
2247
2248         if (cond1) {
2249             if (t2 >= CLIPPED_ESCAPE) {
2250                 di2 = t2 - CLIPPED_ESCAPE;
2251                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2252             } else {
2253                 di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
2254                 qenergy += V*V;
2255             }
2256         } else {
2257             di2 = t2 - (V = vec[1] * IQ);
2258             qenergy += V*V;
2259         }
2260
2261         if (cond2) {
2262             if (t3 >= CLIPPED_ESCAPE) {
2263                 di3 = t3 - CLIPPED_ESCAPE;
2264                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2265             } else {
2266                 di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
2267                 qenergy += V*V;
2268             }
2269         } else {
2270             di3 = t3 - (V = vec2[0] * IQ);
2271             qenergy += V*V;
2272         }
2273
2274         if (cond3) {
2275             if (t4 >= CLIPPED_ESCAPE) {
2276                 di4 = t4 - CLIPPED_ESCAPE;
2277                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
2278             } else {
2279                 di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
2280                 qenergy += V*V;
2281             }
2282         } else {
2283             di4 = t4 - (V = vec2[1]*IQ);
2284             qenergy += V*V;
2285         }
2286
2287         cost += di1 * di1 + di2 * di2
2288                 + di3 * di3 + di4 * di4;
2289     }
2290
2291     if (bits)
2292         *bits = curbits;
2293     return cost * lambda + curbits;
2294 }
2295
2296 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2297                                           PutBitContext *pb, const float *in,
2298                                           const float *scaled, int size, int scale_idx,
2299                                           int cb, const float lambda, const float uplim,
2300                                           int *bits, float *energy) = {
2301     get_band_cost_ZERO_mips,
2302     get_band_cost_SQUAD_mips,
2303     get_band_cost_SQUAD_mips,
2304     get_band_cost_UQUAD_mips,
2305     get_band_cost_UQUAD_mips,
2306     get_band_cost_SPAIR_mips,
2307     get_band_cost_SPAIR_mips,
2308     get_band_cost_UPAIR7_mips,
2309     get_band_cost_UPAIR7_mips,
2310     get_band_cost_UPAIR12_mips,
2311     get_band_cost_UPAIR12_mips,
2312     get_band_cost_ESC_mips,
2313     get_band_cost_NONE_mips, /* cb 12 doesn't exist */
2314     get_band_cost_ZERO_mips,
2315     get_band_cost_ZERO_mips,
2316     get_band_cost_ZERO_mips,
2317 };
2318
/*
 * Call the cost function registered for codebook cb in get_band_cost_arr.
 * Note that cb appears twice in the expansion (table index and argument),
 * so it must not be an expression with side effects.
 */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,           \
                      lambda, uplim, bits, energy)                      \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,       \
                          lambda, uplim, bits, energy)
2325
/**
 * Estimate the cost of coding a band with codebook cb.
 *
 * Forwards to get_band_cost() with a NULL PutBitContext. The rtz argument is
 * accepted but not used by this implementation.
 *
 * @return the value produced by the selected band-cost function
 */
static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
                                int *bits, float *energy, int rtz)
{
    const float band_cost = get_band_cost(s, NULL, in, scaled, size, scale_idx,
                                          cb, lambda, uplim, bits, energy);

    return band_cost;
}
2333
2334 #include "libavcodec/aacenc_quantization_misc.h"
2335
2336 #include "libavcodec/aaccoder_twoloop.h"
2337
2338 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
2339 {
2340     int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
2341     uint8_t nextband0[128], nextband1[128];
2342     float M[128], S[128];
2343     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2344     const float lambda = s->lambda;
2345     const float mslambda = FFMIN(1.0f, lambda / 120.f);
2346     SingleChannelElement *sce0 = &cpe->ch[0];
2347     SingleChannelElement *sce1 = &cpe->ch[1];
2348     if (!cpe->common_window)
2349         return;
2350
2351     /** Scout out next nonzero bands */
2352     ff_init_nextband_map(sce0, nextband0);
2353     ff_init_nextband_map(sce1, nextband1);
2354
2355     prev_mid = sce0->sf_idx[0];
2356     prev_side = sce1->sf_idx[0];
2357     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2358         start = 0;
2359         for (g = 0;  g < sce0->ics.num_swb; g++) {
2360             float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
2361             if (!cpe->is_mask[w*16+g])
2362                 cpe->ms_mask[w*16+g] = 0;
2363             if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
2364                 float Mmax = 0.0f, Smax = 0.0f;
2365
2366                 /* Must compute mid/side SF and book for the whole window group */
2367                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2368                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
2369                         M[i] = (sce0->coeffs[start+(w+w2)*128+i]
2370                               + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
2371                         S[i] =  M[i]
2372                               - sce1->coeffs[start+(w+w2)*128+i];
2373                     }
2374                     abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
2375                     abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
2376                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
2377                         Mmax = FFMAX(Mmax, M34[i]);
2378                         Smax = FFMAX(Smax, S34[i]);
2379                     }
2380                 }
2381
2382                 for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
2383                     float dist1 = 0.0f, dist2 = 0.0f;
2384                     int B0 = 0, B1 = 0;
2385                     int minidx;
2386                     int mididx, sididx;
2387                     int midcb, sidcb;
2388
2389                     minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
2390                     mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
2391                     sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
2392                     if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
2393                         && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
2394                             || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
2395                         /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
2396                         continue;
2397                     }
2398
2399                     midcb = find_min_book(Mmax, mididx);
2400                     sidcb = find_min_book(Smax, sididx);
2401
2402                     /* No CB can be zero */
2403                     midcb = FFMAX(1,midcb);
2404                     sidcb = FFMAX(1,sidcb);
2405
2406                     for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2407                         FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2408                         FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2409                         float minthr = FFMIN(band0->threshold, band1->threshold);
2410                         int b1,b2,b3,b4;
2411                         for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
2412                             M[i] = (sce0->coeffs[start+(w+w2)*128+i]
2413                                   + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
2414                             S[i] =  M[i]
2415                                   - sce1->coeffs[start+(w+w2)*128+i];
2416                         }
2417
2418                         abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
2419                         abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
2420                         abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
2421                         abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
2422                         dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
2423                                                     L34,
2424                                                     sce0->ics.swb_sizes[g],
2425                                                     sce0->sf_idx[w*16+g],
2426                                                     sce0->band_type[w*16+g],
2427                                                     lambda / band0->threshold, INFINITY, &b1, NULL, 0);
2428                         dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
2429                                                     R34,
2430                                                     sce1->ics.swb_sizes[g],
2431                                                     sce1->sf_idx[w*16+g],
2432                                                     sce1->band_type[w*16+g],
2433                                                     lambda / band1->threshold, INFINITY, &b2, NULL, 0);
2434                         dist2 += quantize_band_cost(s, M,
2435                                                     M34,
2436                                                     sce0->ics.swb_sizes[g],
2437                                                     mididx,
2438                                                     midcb,
2439                                                     lambda / minthr, INFINITY, &b3, NULL, 0);
2440                         dist2 += quantize_band_cost(s, S,
2441                                                     S34,
2442                                                     sce1->ics.swb_sizes[g],
2443                                                     sididx,
2444                                                     sidcb,
2445                                                     mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
2446                         B0 += b1+b2;
2447                         B1 += b3+b4;
2448                         dist1 -= b1+b2;
2449                         dist2 -= b3+b4;
2450                     }
2451                     cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
2452                     if (cpe->ms_mask[w*16+g]) {
2453                         if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
2454                             sce0->sf_idx[w*16+g] = mididx;
2455                             sce1->sf_idx[w*16+g] = sididx;
2456                             sce0->band_type[w*16+g] = midcb;
2457                             sce1->band_type[w*16+g] = sidcb;
2458                         } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
2459                             /* ms_mask unneeded, and it confuses some decoders */
2460                             cpe->ms_mask[w*16+g] = 0;
2461                         }
2462                         break;
2463                     } else if (B1 > B0) {
2464                         /* More boost won't fix this */
2465                         break;
2466                     }
2467                 }
2468             }
2469             if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
2470                 prev_mid = sce0->sf_idx[w*16+g];
2471             if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
2472                 prev_side = sce1->sf_idx[w*16+g];
2473             start += sce0->ics.swb_sizes[g];
2474         }
2475     }
2476 }
2477 #endif /*HAVE_MIPSFPU */
2478
2479 #include "libavcodec/aaccoder_trellis.h"
2480
2481 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
2482 #endif /* HAVE_INLINE_ASM */
2483
2484 void ff_aac_coder_init_mips(AACEncContext *c) {
2485 #if HAVE_INLINE_ASM
2486 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
2487     AACCoefficientsEncoder *e = c->coder;
2488     int option = c->options.coder;
2489
2490     if (option == 2) {
2491         e->quantize_and_encode_band = quantize_and_encode_band_mips;
2492         e->encode_window_bands_info = codebook_trellis_rate;
2493 #if HAVE_MIPSFPU
2494         e->search_for_quantizers    = search_for_quantizers_twoloop;
2495 #endif /* HAVE_MIPSFPU */
2496     }
2497 #if HAVE_MIPSFPU
2498     e->search_for_ms            = search_for_ms_mips;
2499 #endif /* HAVE_MIPSFPU */
2500 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
2501 #endif /* HAVE_INLINE_ASM */
2502 }