git.sesse.net Git - ffmpeg/blob - libavcodec/mips/aaccoder_mips.c

   1 /*
   2  * Copyright (c) 2012
   3  *      MIPS Technologies, Inc., California.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14  *    contributors may be used to endorse or promote products derived from
  15  *    this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  * Author:  Stanislav Ocovaj (socovaj@mips.com)
  30  *          Szabolcs Pal     (sabolc@mips.com)
  31  *
  32  * AAC coefficients encoder optimized for MIPS floating-point architecture
  33  *
  34  * This file is part of FFmpeg.
  35  *
  36  * FFmpeg is free software; you can redistribute it and/or
  37  * modify it under the terms of the GNU Lesser General Public
  38  * License as published by the Free Software Foundation; either
  39  * version 2.1 of the License, or (at your option) any later version.
  40  *
  41  * FFmpeg is distributed in the hope that it will be useful,
  42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  44  * Lesser General Public License for more details.
  45  *
  46  * You should have received a copy of the GNU Lesser General Public
  47  * License along with FFmpeg; if not, write to the Free Software
  48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  49  */
  50
  51 /**
  52  * @file
  53  * Reference: libavcodec/aaccoder.c
  54  */
  55
  56 #include "libavutil/libm.h"
  57
  58 #include <float.h>
  59 #include "libavutil/mathematics.h"
  60 #include "libavcodec/avcodec.h"
  61 #include "libavcodec/put_bits.h"
  62 #include "libavcodec/aac.h"
  63 #include "libavcodec/aacenc.h"
  64 #include "libavcodec/aactab.h"
  65
  66 #if HAVE_INLINE_ASM
   /*
    * Record with three plain fields: an index, a float cost and a run counter.
    * NOTE(review): going by the field names this is presumably one node of a
    * band codebook path search (index of the previous node, accumulated cost,
    * run length so far) - confirm against the code that uses this struct.
    */
   67 typedef struct BandCodingPath {
   68     int prev_idx;
   69     float cost;
   70     int run;
   71 } BandCodingPath;
  72
   /*
    * 64-entry lookup table: 5 for indices 0..30, 10 for indices 31..62 and
    * 15 for index 63.
    * NOTE(review): presumably the number of bits needed to signal a run of
    * the given length for long windows - confirm at the point of use.
    */
   73 static const uint8_t run_value_bits_long[64] = {
   74      5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
   75      5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
   76     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
   77     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
   78 };
  79
   /*
    * 16-entry lookup table: 3 for indices 0..6, 6 for indices 7..14 and 9 for
    * index 15.
    * NOTE(review): presumably the short-window counterpart of
    * run_value_bits_long - confirm at the point of use.
    */
   80 static const uint8_t run_value_bits_short[16] = {
   81     3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
   82 };
  83
   /* Selector for the two tables above: [0] is the long table, [1] the short one. */
   84 static const uint8_t * const run_value_bits[2] = {
   85     run_value_bits_long, run_value_bits_short
   86 };
  87
   /*
    * 81 (= 3^4) entries. Entry i is the number of non-zero digits of i when i
    * is written as four base-3 digits. This is the same index/count pairing
    * that quantize_and_encode_band_cost_UQUAD_mips() computes as curidx and
    * count (one sign bit per non-zero coefficient).
    */
   88 static const uint8_t uquad_sign_bits[81] = {
   89     0, 1, 1, 1, 2, 2, 1, 2, 2,
   90     1, 2, 2, 2, 3, 3, 2, 3, 3,
   91     1, 2, 2, 2, 3, 3, 2, 3, 3,
   92     1, 2, 2, 2, 3, 3, 2, 3, 3,
   93     2, 3, 3, 3, 4, 4, 3, 4, 4,
   94     2, 3, 3, 3, 4, 4, 3, 4, 4,
   95     1, 2, 2, 2, 3, 3, 2, 3, 3,
   96     2, 3, 3, 3, 4, 4, 3, 4, 4,
   97     2, 3, 3, 3, 4, 4, 3, 4, 4
   98 };
  99
  /*
   * 64 (= 8 * 8) entries. Entry 8*a + b is the number of non-zero values among
   * a and b (0, 1 or 2), matching the curidx/count pairs computed in
   * quantize_and_encode_band_cost_UPAIR7_mips().
   */
  100 static const uint8_t upair7_sign_bits[64] = {
  101     0, 1, 1, 1, 1, 1, 1, 1,
  102     1, 2, 2, 2, 2, 2, 2, 2,
  103     1, 2, 2, 2, 2, 2, 2, 2,
  104     1, 2, 2, 2, 2, 2, 2, 2,
  105     1, 2, 2, 2, 2, 2, 2, 2,
  106     1, 2, 2, 2, 2, 2, 2, 2,
  107     1, 2, 2, 2, 2, 2, 2, 2,
  108     1, 2, 2, 2, 2, 2, 2, 2,
  109 };
 110
  /*
   * 169 (= 13 * 13) entries. Entry 13*a + b is the number of non-zero values
   * among a and b (0, 1 or 2), matching the curidx/count pairs computed in
   * quantize_and_encode_band_cost_UPAIR12_mips().
   */
  111 static const uint8_t upair12_sign_bits[169] = {
  112     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  113     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  114     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  115     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  116     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  117     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  118     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  119     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  120     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  121     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  122     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  123     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  124     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  125 };
 126
  /*
   * 289 (= 17 * 17) entries. Entry 17*a + b is the number of non-zero values
   * among a and b (0, 1 or 2), i.e. the same 17-per-row indexing that
   * quantize_and_encode_band_cost_ESC_mips() uses for curidx.
   */
  127 static const uint8_t esc_sign_bits[289] = {
  128     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  129     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  130     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  131     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  132     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  133     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  134     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  135     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  136     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  137     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  138     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  139     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  140     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  141     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  142     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  143     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  144     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  145 };
 146
 147 static void abs_pow34_v(float *out, const float *in, const int size) {
 148 #ifndef USE_REALLY_FULL_SEARCH
 149     int i;
 150     float a, b, c, d;
 151     float ax, bx, cx, dx;
 152
 153     for (i = 0; i < size; i += 4) {
 154         a = fabsf(in[i  ]);
 155         b = fabsf(in[i+1]);
 156         c = fabsf(in[i+2]);
 157         d = fabsf(in[i+3]);
 158
 159         ax = sqrtf(a);
 160         bx = sqrtf(b);
 161         cx = sqrtf(c);
 162         dx = sqrtf(d);
 163
 164         a = a * ax;
 165         b = b * bx;
 166         c = c * cx;
 167         d = d * dx;
 168
 169         out[i  ] = sqrtf(a);
 170         out[i+1] = sqrtf(b);
 171         out[i+2] = sqrtf(c);
 172         out[i+3] = sqrtf(d);
 173     }
 174 #endif /* USE_REALLY_FULL_SEARCH */
 175 }
 176
 177 static float find_max_val(int group_len, int swb_size, const float *scaled) {
 178     float maxval = 0.0f;
 179     int w2, i;
 180     for (w2 = 0; w2 < group_len; w2++) {
 181         for (i = 0; i < swb_size; i++) {
 182             maxval = FFMAX(maxval, scaled[w2*128+i]);
 183         }
 184     }
 185     return maxval;
 186 }
 187
 188 static int find_min_book(float maxval, int sf) {
 189     float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
 190     float Q34 = sqrtf(Q * sqrtf(Q));
 191     int qmaxval, cb;
 192     qmaxval = maxval * Q34 + 0.4054f;
 193     if      (qmaxval ==  0) cb = 0;
 194     else if (qmaxval ==  1) cb = 1;
 195     else if (qmaxval ==  2) cb = 3;
 196     else if (qmaxval <=  4) cb = 5;
 197     else if (qmaxval <=  7) cb = 7;
 198     else if (qmaxval <= 12) cb = 9;
 199     else                    cb = 11;
 200     return cb;
 201 }
 202
 203 /**
  204  * Functions derived from the template function and optimized for quantizing and encoding a band.
 205  */
  /**
   * Quantize a band to values in {-1, 0, 1} and write it to the bitstream,
   * one codeword per group of four coefficients.
   *
   * @param s         encoder context; s->scoefs is overwritten with |in|^(3/4)
   * @param pb        bit writer that receives the codewords
   * @param in        input coefficients, consumed four at a time
   * @param scaled    value passed in is ignored: it is replaced by s->scoefs
   * @param size      number of coefficients in the band
   * @param scale_idx index used to look up the scale Q34 in ff_aac_pow34sf_tab
   * @param cb        codebook number; row cb-1 of the code/length tables is used
   * @param lambda    not used in this function
   * @param uplim     not used in this function
   * @param bits      not used in this function
   */
  206 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
  207                                                      PutBitContext *pb, const float *in,
  208                                                      const float *scaled, int size, int scale_idx,
  209                                                      int cb, const float lambda, const float uplim,
  210                                                      int *bits)
  211 {
          /* Factor applied to the |x|^(3/4) values before they are truncated. */
  212     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  213     int i;
  214     int qc1, qc2, qc3, qc4;
  215
          /* Code lengths and code words for codebook cb (table rows start at 0). */
  216     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  217     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  218
          /* Recompute |in|^(3/4) into the context scratch buffer and use that. */
  219     abs_pow34_v(s->scoefs, in, size);
  220     scaled = s->scoefs;
  221     for (i = 0; i < size; i += 4) {
  222         int curidx;
              /* Address of in[i..i+3]; the asm loads the raw words to read the sign bits. */
  223         int *in_int = (int *)&in[i];
  224
              /* Scale, add the 0.4054f offset and truncate (float -> int conversion). */
  225         qc1 = scaled[i  ] * Q34 + 0.4054f;
  226         qc2 = scaled[i+1] * Q34 + 0.4054f;
  227         qc3 = scaled[i+2] * Q34 + 0.4054f;
  228         qc4 = scaled[i+3] * Q34 + 0.4054f;
  229
  230         __asm__ volatile (
                  /* Save the assembler options and keep the instruction order as written. */
  231             ".set push                      \n\t"
  232             ".set noreorder                 \n\t"
  233
                  /* qcN = (0 < qcN): every positive value becomes 1, everything else 0. */
  234             "slt    %[qc1], $zero,  %[qc1]  \n\t"
  235             "slt    %[qc2], $zero,  %[qc2]  \n\t"
  236             "slt    %[qc3], $zero,  %[qc3]  \n\t"
  237             "slt    %[qc4], $zero,  %[qc4]  \n\t"
                  /* Load the raw 32-bit words of in[i..i+3]. */
  238             "lw     $t0,    0(%[in_int])    \n\t"
  239             "lw     $t1,    4(%[in_int])    \n\t"
  240             "lw     $t2,    8(%[in_int])    \n\t"
  241             "lw     $t3,    12(%[in_int])   \n\t"
                  /* Keep only the top (sign) bit of each word. */
  242             "srl    $t0,    $t0,    31      \n\t"
  243             "srl    $t1,    $t1,    31      \n\t"
  244             "srl    $t2,    $t2,    31      \n\t"
  245             "srl    $t3,    $t3,    31      \n\t"
                  /* t4..t7 = -qcN. */
  246             "subu   $t4,    $zero,  %[qc1]  \n\t"
  247             "subu   $t5,    $zero,  %[qc2]  \n\t"
  248             "subu   $t6,    $zero,  %[qc3]  \n\t"
  249             "subu   $t7,    $zero,  %[qc4]  \n\t"
                  /* Negate qcN where the matching input word had its sign bit set. */
  250             "movn   %[qc1], $t4,    $t0     \n\t"
  251             "movn   %[qc2], $t5,    $t1     \n\t"
  252             "movn   %[qc3], $t6,    $t2     \n\t"
  253             "movn   %[qc4], $t7,    $t3     \n\t"
  254
  255             ".set pop                       \n\t"
  256
  257             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  258               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
  259             : [in_int]"r"(in_int)
  260             : "t0", "t1", "t2", "t3",
  261               "t4", "t5", "t6", "t7",
  262               "memory"
  263         );
  264
              /*
               * Base-3 index built from (qc1, qc2, qc3, qc4); the final +40
               * (= 27 + 9 + 3 + 1) moves each digit from -1..1 to 0..2.
               */
  265         curidx = qc1;
  266         curidx *= 3;
  267         curidx += qc2;
  268         curidx *= 3;
  269         curidx += qc3;
  270         curidx *= 3;
  271         curidx += qc4;
  272         curidx += 40;
  273
  274         put_bits(pb, p_bits[curidx], p_codes[curidx]);
  275     }
  276 }
 277
  /**
   * Quantize a band to magnitudes 0..2 and write it to the bitstream: one
   * codeword per group of four coefficients, followed by one sign bit for
   * every non-zero coefficient of the group.
   *
   * @param s         encoder context; s->scoefs is overwritten with |in|^(3/4)
   * @param pb        bit writer that receives the codewords
   * @param in        input coefficients, consumed four at a time
   * @param scaled    value passed in is ignored: it is replaced by s->scoefs
   * @param size      number of coefficients in the band
   * @param scale_idx index used to look up the scale Q34 in ff_aac_pow34sf_tab
   * @param cb        codebook number; row cb-1 of the code/length tables is used
   * @param lambda    not used in this function
   * @param uplim     not used in this function
   * @param bits      not used in this function
   */
  278 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
  279                                                      PutBitContext *pb, const float *in,
  280                                                      const float *scaled, int size, int scale_idx,
  281                                                      int cb, const float lambda, const float uplim,
  282                                                      int *bits)
  283 {
          /* Factor applied to the |x|^(3/4) values before they are truncated. */
  284     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  285     int i;
  286     int qc1, qc2, qc3, qc4;
  287
          /* Code lengths and code words for codebook cb (table rows start at 0). */
  288     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  289     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  290
          /* Recompute |in|^(3/4) into the context scratch buffer and use that. */
  291     abs_pow34_v(s->scoefs, in, size);
  292     scaled = s->scoefs;
  293     for (i = 0; i < size; i += 4) {
  294         int curidx, sign, count;
              /* Address of in[i..i+3]; the asm loads the raw words to read the signs. */
  295         int *in_int = (int *)&in[i];
  296         uint8_t v_bits;
  297         unsigned int v_codes;
  298
              /* Scale, add the 0.4054f offset and truncate (float -> int conversion). */
  299         qc1 = scaled[i  ] * Q34 + 0.4054f;
  300         qc2 = scaled[i+1] * Q34 + 0.4054f;
  301         qc3 = scaled[i+2] * Q34 + 0.4054f;
  302         qc4 = scaled[i+3] * Q34 + 0.4054f;
  303
  304         __asm__ volatile (
                  /* Save the assembler options and keep the instruction order as written. */
  305             ".set push                              \n\t"
  306             ".set noreorder                         \n\t"
  307
                  /* t4 = 2 (largest magnitude kept), sign = 0. */
  308             "ori    $t4,        $zero,      2       \n\t"
  309             "ori    %[sign],    $zero,      0       \n\t"
                  /* tN = (2 < qcN), then clamp: qcN = 2 where it was larger. */
  310             "slt    $t0,        $t4,        %[qc1]  \n\t"
  311             "slt    $t1,        $t4,        %[qc2]  \n\t"
  312             "slt    $t2,        $t4,        %[qc3]  \n\t"
  313             "slt    $t3,        $t4,        %[qc4]  \n\t"
  314             "movn   %[qc1],     $t4,        $t0     \n\t"
  315             "movn   %[qc2],     $t4,        $t1     \n\t"
  316             "movn   %[qc3],     $t4,        $t2     \n\t"
  317             "movn   %[qc4],     $t4,        $t3     \n\t"
                  /* Load the raw 32-bit words of in[i..i+3]. */
  318             "lw     $t0,        0(%[in_int])        \n\t"
  319             "lw     $t1,        4(%[in_int])        \n\t"
  320             "lw     $t2,        8(%[in_int])        \n\t"
  321             "lw     $t3,        12(%[in_int])       \n\t"
                  /*
                   * Two interleaved computations follow.
                   * sign:  "slt tN, tN, $zero" gives 1 when input N has its
                   *        top bit set. For each coefficient in order, sign
                   *        becomes (sign << 1) | tN, but only when qcN is
                   *        non-zero (movn), so sign ends up holding one bit
                   *        per non-zero coefficient, earliest one highest.
                   * count: sum of (0 < qcN), i.e. the number of non-zero
                   *        coefficients in the group.
                   */
  322             "slt    $t0,        $t0,        $zero   \n\t"
  323             "movn   %[sign],    $t0,        %[qc1]  \n\t"
  324             "slt    $t1,        $t1,        $zero   \n\t"
  325             "slt    $t2,        $t2,        $zero   \n\t"
  326             "slt    $t3,        $t3,        $zero   \n\t"
  327             "sll    $t0,        %[sign],    1       \n\t"
  328             "or     $t0,        $t0,        $t1     \n\t"
  329             "movn   %[sign],    $t0,        %[qc2]  \n\t"
  330             "slt    $t4,        $zero,      %[qc1]  \n\t"
  331             "slt    $t1,        $zero,      %[qc2]  \n\t"
  332             "slt    %[count],   $zero,      %[qc3]  \n\t"
  333             "sll    $t0,        %[sign],    1       \n\t"
  334             "or     $t0,        $t0,        $t2     \n\t"
  335             "movn   %[sign],    $t0,        %[qc3]  \n\t"
  336             "slt    $t2,        $zero,      %[qc4]  \n\t"
  337             "addu   %[count],   %[count],   $t4     \n\t"
  338             "addu   %[count],   %[count],   $t1     \n\t"
  339             "sll    $t0,        %[sign],    1       \n\t"
  340             "or     $t0,        $t0,        $t3     \n\t"
  341             "movn   %[sign],    $t0,        %[qc4]  \n\t"
  342             "addu   %[count],   %[count],   $t2     \n\t"
  343
  344             ".set pop                               \n\t"
  345
  346             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  347               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  348               [sign]"=&r"(sign), [count]"=&r"(count)
  349             : [in_int]"r"(in_int)
  350             : "t0", "t1", "t2", "t3", "t4",
  351               "memory"
  352         );
  353
              /* Base-3 index built from the four magnitudes (each 0..2). */
  354         curidx = qc1;
  355         curidx *= 3;
  356         curidx += qc2;
  357         curidx *= 3;
  358         curidx += qc3;
  359         curidx *= 3;
  360         curidx += qc4;
  361
              /* Append the low count sign bits to the codeword and write both at once. */
  362         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
  363         v_bits  = p_bits[curidx] + count;
  364         put_bits(pb, v_bits, v_codes);
  365     }
  366 }
 367
  /**
   * Quantize a band to values in -4..4 and write it to the bitstream.
   * Coefficients are coded as pairs; the codewords of two consecutive pairs
   * are concatenated and written with a single put_bits() call.
   *
   * @param s         encoder context; s->scoefs is overwritten with |in|^(3/4)
   * @param pb        bit writer that receives the codewords
   * @param in        input coefficients, consumed four at a time
   * @param scaled    value passed in is ignored: it is replaced by s->scoefs
   * @param size      number of coefficients in the band
   * @param scale_idx index used to look up the scale Q34 in ff_aac_pow34sf_tab
   * @param cb        codebook number; row cb-1 of the code/length tables is used
   * @param lambda    not used in this function
   * @param uplim     not used in this function
   * @param bits      not used in this function
   */
  368 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
  369                                                      PutBitContext *pb, const float *in,
  370                                                      const float *scaled, int size, int scale_idx,
  371                                                      int cb, const float lambda, const float uplim,
  372                                                      int *bits)
  373 {
          /* Factor applied to the |x|^(3/4) values before they are truncated. */
  374     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  375     int i;
  376     int qc1, qc2, qc3, qc4;
  377
          /* Code lengths and code words for codebook cb (table rows start at 0). */
  378     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  379     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  380
          /* Recompute |in|^(3/4) into the context scratch buffer and use that. */
  381     abs_pow34_v(s->scoefs, in, size);
  382     scaled = s->scoefs;
  383     for (i = 0; i < size; i += 4) {
  384         int curidx, curidx2;
              /* Address of in[i..i+3]; the asm loads the raw words to read the sign bits. */
  385         int *in_int = (int *)&in[i];
  386         uint8_t v_bits;
  387         unsigned int v_codes;
  388
              /* Scale, add the 0.4054f offset and truncate (float -> int conversion). */
  389         qc1 = scaled[i  ] * Q34 + 0.4054f;
  390         qc2 = scaled[i+1] * Q34 + 0.4054f;
  391         qc3 = scaled[i+2] * Q34 + 0.4054f;
  392         qc4 = scaled[i+3] * Q34 + 0.4054f;
  393
  394         __asm__ volatile (
                  /* Save the assembler options and keep the instruction order as written. */
  395             ".set push                      \n\t"
  396             ".set noreorder                 \n\t"
  397
                  /* t4 = 4; tN = (4 < qcN), then clamp: qcN = 4 where it was larger. */
  398             "ori    $t4,    $zero,  4       \n\t"
  399             "slt    $t0,    $t4,    %[qc1]  \n\t"
  400             "slt    $t1,    $t4,    %[qc2]  \n\t"
  401             "slt    $t2,    $t4,    %[qc3]  \n\t"
  402             "slt    $t3,    $t4,    %[qc4]  \n\t"
  403             "movn   %[qc1], $t4,    $t0     \n\t"
  404             "movn   %[qc2], $t4,    $t1     \n\t"
  405             "movn   %[qc3], $t4,    $t2     \n\t"
  406             "movn   %[qc4], $t4,    $t3     \n\t"
                  /* Load the raw 32-bit words of in[i..i+3]. */
  407             "lw     $t0,    0(%[in_int])    \n\t"
  408             "lw     $t1,    4(%[in_int])    \n\t"
  409             "lw     $t2,    8(%[in_int])    \n\t"
  410             "lw     $t3,    12(%[in_int])   \n\t"
                  /* Keep only the top (sign) bit of each word. */
  411             "srl    $t0,    $t0,    31      \n\t"
  412             "srl    $t1,    $t1,    31      \n\t"
  413             "srl    $t2,    $t2,    31      \n\t"
  414             "srl    $t3,    $t3,    31      \n\t"
                  /* t4..t7 = -qcN. */
  415             "subu   $t4,    $zero,  %[qc1]  \n\t"
  416             "subu   $t5,    $zero,  %[qc2]  \n\t"
  417             "subu   $t6,    $zero,  %[qc3]  \n\t"
  418             "subu   $t7,    $zero,  %[qc4]  \n\t"
                  /* Negate qcN where the matching input word had its sign bit set. */
  419             "movn   %[qc1], $t4,    $t0     \n\t"
  420             "movn   %[qc2], $t5,    $t1     \n\t"
  421             "movn   %[qc3], $t6,    $t2     \n\t"
  422             "movn   %[qc4], $t7,    $t3     \n\t"
  423
  424             ".set pop                       \n\t"
  425
  426             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  427               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
  428             : [in_int]"r"(in_int)
  429             : "t0", "t1", "t2", "t3",
  430               "t4", "t5", "t6", "t7",
  431               "memory"
  432         );
  433
              /* Index of the first pair: 9 * qc1 + qc2, with +40 (= 9 * 4 + 4) moving -4..4 to 0..8. */
  434         curidx = 9 * qc1;
  435         curidx += qc2 + 40;
  436
              /* Index of the second pair, built the same way. */
  437         curidx2 = 9 * qc3;
  438         curidx2 += qc4 + 40;
  439
              /* First pair's codeword in the high bits, second pair's right after it. */
  440         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
  441         v_bits  = p_bits[curidx] + p_bits[curidx2];
  442         put_bits(pb, v_bits, v_codes);
  443     }
  444 }
 445
  /**
   * Quantize a band to magnitudes 0..7 and write it to the bitstream.
   * Coefficients are coded as pairs: each pair's codeword is followed by one
   * sign bit per non-zero coefficient of that pair. Two pairs are handled per
   * loop iteration, each with its own put_bits() call.
   *
   * @param s         encoder context; s->scoefs is overwritten with |in|^(3/4)
   * @param pb        bit writer that receives the codewords
   * @param in        input coefficients, consumed four at a time
   * @param scaled    value passed in is ignored: it is replaced by s->scoefs
   * @param size      number of coefficients in the band
   * @param scale_idx index used to look up the scale Q34 in ff_aac_pow34sf_tab
   * @param cb        codebook number; row cb-1 of the code/length tables is used
   * @param lambda    not used in this function
   * @param uplim     not used in this function
   * @param bits      not used in this function
   */
  446 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
  447                                                       PutBitContext *pb, const float *in,
  448                                                       const float *scaled, int size, int scale_idx,
  449                                                       int cb, const float lambda, const float uplim,
  450                                                       int *bits)
  451 {
          /* Factor applied to the |x|^(3/4) values before they are truncated. */
  452     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  453     int i;
  454     int qc1, qc2, qc3, qc4;
  455
          /* Code lengths and code words for codebook cb (table rows start at 0). */
  456     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
  457     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  458
          /* Recompute |in|^(3/4) into the context scratch buffer and use that. */
  459     abs_pow34_v(s->scoefs, in, size);
  460     scaled = s->scoefs;
  461     for (i = 0; i < size; i += 4) {
  462         int curidx, sign1, count1, sign2, count2;
              /* Address of in[i..i+3]; the asm loads the raw words to read the signs. */
  463         int *in_int = (int *)&in[i];
  464         uint8_t v_bits;
  465         unsigned int v_codes;
  466
              /* Scale, add the 0.4054f offset and truncate (float -> int conversion). */
  467         qc1 = scaled[i  ] * Q34 + 0.4054f;
  468         qc2 = scaled[i+1] * Q34 + 0.4054f;
  469         qc3 = scaled[i+2] * Q34 + 0.4054f;
  470         qc4 = scaled[i+3] * Q34 + 0.4054f;
  471
  472         __asm__ volatile (
                  /* Save the assembler options and keep the instruction order as written. */
  473             ".set push                              \n\t"
  474             ".set noreorder                         \n\t"
  475
                  /* t4 = 7 (largest magnitude kept), sign1 = sign2 = 0. */
  476             "ori    $t4,        $zero,      7       \n\t"
  477             "ori    %[sign1],   $zero,      0       \n\t"
  478             "ori    %[sign2],   $zero,      0       \n\t"
                  /* tN = (7 < qcN), then clamp: qcN = 7 where it was larger. */
  479             "slt    $t0,        $t4,        %[qc1]  \n\t"
  480             "slt    $t1,        $t4,        %[qc2]  \n\t"
  481             "slt    $t2,        $t4,        %[qc3]  \n\t"
  482             "slt    $t3,        $t4,        %[qc4]  \n\t"
  483             "movn   %[qc1],     $t4,        $t0     \n\t"
  484             "movn   %[qc2],     $t4,        $t1     \n\t"
  485             "movn   %[qc3],     $t4,        $t2     \n\t"
  486             "movn   %[qc4],     $t4,        $t3     \n\t"
                  /* Load the raw 32-bit words of in[i..i+3]. */
  487             "lw     $t0,        0(%[in_int])        \n\t"
  488             "lw     $t1,        4(%[in_int])        \n\t"
  489             "lw     $t2,        8(%[in_int])        \n\t"
  490             "lw     $t3,        12(%[in_int])       \n\t"
                  /*
                   * Sign bits, built separately for pair 1 (qc1, qc2) and
                   * pair 2 (qc3, qc4): "slt tN, tN, $zero" gives 1 when input
                   * N has its top bit set; signK becomes (signK << 1) | tN
                   * only when qcN is non-zero (movn), so each signK holds one
                   * bit per non-zero coefficient of its pair.
                   */
  491             "slt    $t0,        $t0,        $zero   \n\t"
  492             "movn   %[sign1],   $t0,        %[qc1]  \n\t"
  493             "slt    $t2,        $t2,        $zero   \n\t"
  494             "movn   %[sign2],   $t2,        %[qc3]  \n\t"
  495             "slt    $t1,        $t1,        $zero   \n\t"
  496             "sll    $t0,        %[sign1],   1       \n\t"
  497             "or     $t0,        $t0,        $t1     \n\t"
  498             "movn   %[sign1],   $t0,        %[qc2]  \n\t"
  499             "slt    $t3,        $t3,        $zero   \n\t"
  500             "sll    $t0,        %[sign2],   1       \n\t"
  501             "or     $t0,        $t0,        $t3     \n\t"
  502             "movn   %[sign2],   $t0,        %[qc4]  \n\t"
                  /* countK = number of non-zero coefficients in pair K. */
  503             "slt    %[count1],  $zero,      %[qc1]  \n\t"
  504             "slt    $t1,        $zero,      %[qc2]  \n\t"
  505             "slt    %[count2],  $zero,      %[qc3]  \n\t"
  506             "slt    $t2,        $zero,      %[qc4]  \n\t"
  507             "addu   %[count1],  %[count1],  $t1     \n\t"
  508             "addu   %[count2],  %[count2],  $t2     \n\t"
  509
  510             ".set pop                               \n\t"
  511
  512             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  513               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  514               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  515               [sign2]"=&r"(sign2), [count2]"=&r"(count2)
  516             : [in_int]"r"(in_int)
  517             : "t0", "t1", "t2", "t3", "t4",
  518               "memory"
  519         );
  520
              /* First pair: table index 8 * qc1 + qc2, codeword followed by its sign bits. */
  521         curidx  = 8 * qc1;
  522         curidx += qc2;
  523
  524         v_codes = (p_codes[curidx] << count1) | sign1;
  525         v_bits  = p_bits[curidx] + count1;
  526         put_bits(pb, v_bits, v_codes);
  527
              /* Second pair, same scheme. */
  528         curidx  = 8 * qc3;
  529         curidx += qc4;
  530
  531         v_codes = (p_codes[curidx] << count2) | sign2;
  532         v_bits  = p_bits[curidx] + count2;
  533         put_bits(pb, v_bits, v_codes);
  534     }
  535 }
 536
  /**
   * Quantize a band to magnitudes 0..12 and write it to the bitstream.
   * Coefficients are coded as pairs: each pair's codeword is followed by one
   * sign bit per non-zero coefficient of that pair. Two pairs are handled per
   * loop iteration, each with its own put_bits() call.
   *
   * @param s         encoder context; s->scoefs is overwritten with |in|^(3/4)
   * @param pb        bit writer that receives the codewords
   * @param in        input coefficients, consumed four at a time
   * @param scaled    value passed in is ignored: it is replaced by s->scoefs
   * @param size      number of coefficients in the band
   * @param scale_idx index used to look up the scale Q34 in ff_aac_pow34sf_tab
   * @param cb        codebook number; row cb-1 of the code/length tables is used
   * @param lambda    not used in this function
   * @param uplim     not used in this function
   * @param bits      not used in this function
   */
  537 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
  538                                                        PutBitContext *pb, const float *in,
  539                                                        const float *scaled, int size, int scale_idx,
  540                                                        int cb, const float lambda, const float uplim,
  541                                                        int *bits)
  542 {
          /* Factor applied to the |x|^(3/4) values before they are truncated. */
  543     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  544     int i;
  545     int qc1, qc2, qc3, qc4;
  546
          /* Code lengths and code words for codebook cb (table rows start at 0). */
  547     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
  548     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  549
          /* Recompute |in|^(3/4) into the context scratch buffer and use that. */
  550     abs_pow34_v(s->scoefs, in, size);
  551     scaled = s->scoefs;
  552     for (i = 0; i < size; i += 4) {
  553         int curidx, sign1, count1, sign2, count2;
              /* Address of in[i..i+3]; the asm loads the raw words to read the signs. */
  554         int *in_int = (int *)&in[i];
  555         uint8_t v_bits;
  556         unsigned int v_codes;
  557
              /* Scale, add the 0.4054f offset and truncate (float -> int conversion). */
  558         qc1 = scaled[i  ] * Q34 + 0.4054f;
  559         qc2 = scaled[i+1] * Q34 + 0.4054f;
  560         qc3 = scaled[i+2] * Q34 + 0.4054f;
  561         qc4 = scaled[i+3] * Q34 + 0.4054f;
  562
  563         __asm__ volatile (
                  /* Save the assembler options and keep the instruction order as written. */
  564             ".set push                              \n\t"
  565             ".set noreorder                         \n\t"
  566
                  /* t4 = 12 (largest magnitude kept), sign1 = sign2 = 0. */
  567             "ori    $t4,        $zero,      12      \n\t"
  568             "ori    %[sign1],   $zero,      0       \n\t"
  569             "ori    %[sign2],   $zero,      0       \n\t"
                  /* tN = (12 < qcN), then clamp: qcN = 12 where it was larger. */
  570             "slt    $t0,        $t4,        %[qc1]  \n\t"
  571             "slt    $t1,        $t4,        %[qc2]  \n\t"
  572             "slt    $t2,        $t4,        %[qc3]  \n\t"
  573             "slt    $t3,        $t4,        %[qc4]  \n\t"
  574             "movn   %[qc1],     $t4,        $t0     \n\t"
  575             "movn   %[qc2],     $t4,        $t1     \n\t"
  576             "movn   %[qc3],     $t4,        $t2     \n\t"
  577             "movn   %[qc4],     $t4,        $t3     \n\t"
                  /* Load the raw 32-bit words of in[i..i+3]. */
  578             "lw     $t0,        0(%[in_int])        \n\t"
  579             "lw     $t1,        4(%[in_int])        \n\t"
  580             "lw     $t2,        8(%[in_int])        \n\t"
  581             "lw     $t3,        12(%[in_int])       \n\t"
                  /*
                   * Sign bits, built separately for pair 1 (qc1, qc2) and
                   * pair 2 (qc3, qc4): "slt tN, tN, $zero" gives 1 when input
                   * N has its top bit set; signK becomes (signK << 1) | tN
                   * only when qcN is non-zero (movn), so each signK holds one
                   * bit per non-zero coefficient of its pair.
                   */
  582             "slt    $t0,        $t0,        $zero   \n\t"
  583             "movn   %[sign1],   $t0,        %[qc1]  \n\t"
  584             "slt    $t2,        $t2,        $zero   \n\t"
  585             "movn   %[sign2],   $t2,        %[qc3]  \n\t"
  586             "slt    $t1,        $t1,        $zero   \n\t"
  587             "sll    $t0,        %[sign1],   1       \n\t"
  588             "or     $t0,        $t0,        $t1     \n\t"
  589             "movn   %[sign1],   $t0,        %[qc2]  \n\t"
  590             "slt    $t3,        $t3,        $zero   \n\t"
  591             "sll    $t0,        %[sign2],   1       \n\t"
  592             "or     $t0,        $t0,        $t3     \n\t"
  593             "movn   %[sign2],   $t0,        %[qc4]  \n\t"
                  /* countK = number of non-zero coefficients in pair K. */
  594             "slt    %[count1],  $zero,      %[qc1]  \n\t"
  595             "slt    $t1,        $zero,      %[qc2]  \n\t"
  596             "slt    %[count2],  $zero,      %[qc3]  \n\t"
  597             "slt    $t2,        $zero,      %[qc4]  \n\t"
  598             "addu   %[count1],  %[count1],  $t1     \n\t"
  599             "addu   %[count2],  %[count2],  $t2     \n\t"
  600
  601             ".set pop                               \n\t"
  602
  603             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  604               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  605               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  606               [sign2]"=&r"(sign2), [count2]"=&r"(count2)
  607             : [in_int]"r"(in_int)
  608             : "t0", "t1", "t2", "t3", "t4",
  609               "memory"
  610         );
  611
              /* First pair: table index 13 * qc1 + qc2, codeword followed by its sign bits. */
  612         curidx  = 13 * qc1;
  613         curidx += qc2;
  614
  615         v_codes = (p_codes[curidx] << count1) | sign1;
  616         v_bits  = p_bits[curidx] + count1;
  617         put_bits(pb, v_bits, v_codes);
  618
              /* Second pair, same scheme. */
  619         curidx  = 13 * qc3;
  620         curidx += qc4;
  621
  622         v_codes = (p_codes[curidx] << count2) | sign2;
  623         v_bits  = p_bits[curidx] + count2;
  624         put_bits(pb, v_bits, v_codes);
  625     }
  626 }
 627
 628 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
 629                                                    PutBitContext *pb, const float *in,
 630                                                    const float *scaled, int size, int scale_idx,
 631                                                    int cb, const float lambda, const float uplim,
 632                                                    int *bits)
 633 {
 634     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 635     int i;
 636     int qc1, qc2, qc3, qc4;
 637
 638     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
 639     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
 640     float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
 641
 642     abs_pow34_v(s->scoefs, in, size);
 643     scaled = s->scoefs;
 644
 645     if (cb < 11) {
 646         for (i = 0; i < size; i += 4) {
 647             int curidx, curidx2, sign1, count1, sign2, count2;
 648             int *in_int = (int *)&in[i];
 649             uint8_t v_bits;
 650             unsigned int v_codes;
 651
 652             qc1 = scaled[i  ] * Q34 + 0.4054f;
 653             qc2 = scaled[i+1] * Q34 + 0.4054f;
 654             qc3 = scaled[i+2] * Q34 + 0.4054f;
 655             qc4 = scaled[i+3] * Q34 + 0.4054f;
 656
 657             __asm__ volatile (
 658                 ".set push                                  \n\t"
 659                 ".set noreorder                             \n\t"
 660
 661                 "ori        $t4,        $zero,      16      \n\t"
 662                 "ori        %[sign1],   $zero,      0       \n\t"
 663                 "ori        %[sign2],   $zero,      0       \n\t"
 664                 "slt        $t0,        $t4,        %[qc1]  \n\t"
 665                 "slt        $t1,        $t4,        %[qc2]  \n\t"
 666                 "slt        $t2,        $t4,        %[qc3]  \n\t"
 667                 "slt        $t3,        $t4,        %[qc4]  \n\t"
 668                 "movn       %[qc1],     $t4,        $t0     \n\t"
 669                 "movn       %[qc2],     $t4,        $t1     \n\t"
 670                 "movn       %[qc3],     $t4,        $t2     \n\t"
 671                 "movn       %[qc4],     $t4,        $t3     \n\t"
 672                 "lw         $t0,        0(%[in_int])        \n\t"
 673                 "lw         $t1,        4(%[in_int])        \n\t"
 674                 "lw         $t2,        8(%[in_int])        \n\t"
 675                 "lw         $t3,        12(%[in_int])       \n\t"
 676                 "slt        $t0,        $t0,        $zero   \n\t"
 677                 "movn       %[sign1],   $t0,        %[qc1]  \n\t"
 678                 "slt        $t2,        $t2,        $zero   \n\t"
 679                 "movn       %[sign2],   $t2,        %[qc3]  \n\t"
 680                 "slt        $t1,        $t1,        $zero   \n\t"
 681                 "sll        $t0,        %[sign1],   1       \n\t"
 682                 "or         $t0,        $t0,        $t1     \n\t"
 683                 "movn       %[sign1],   $t0,        %[qc2]  \n\t"
 684                 "slt        $t3,        $t3,        $zero   \n\t"
 685                 "sll        $t0,        %[sign2],   1       \n\t"
 686                 "or         $t0,        $t0,        $t3     \n\t"
 687                 "movn       %[sign2],   $t0,        %[qc4]  \n\t"
 688                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
 689                 "slt        $t1,        $zero,      %[qc2]  \n\t"
 690                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
 691                 "slt        $t2,        $zero,      %[qc4]  \n\t"
 692                 "addu       %[count1],  %[count1],  $t1     \n\t"
 693                 "addu       %[count2],  %[count2],  $t2     \n\t"
 694
 695                 ".set pop                                   \n\t"
 696
 697                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 698                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 699                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 700                   [sign2]"=&r"(sign2), [count2]"=&r"(count2)
 701                 : [in_int]"r"(in_int)
 702                 : "t0", "t1", "t2", "t3", "t4",
 703                   "memory"
 704             );
 705
 706             curidx = 17 * qc1;
 707             curidx += qc2;
 708             curidx2 = 17 * qc3;
 709             curidx2 += qc4;
 710
 711             v_codes = (p_codes[curidx] << count1) | sign1;
 712             v_bits  = p_bits[curidx] + count1;
 713             put_bits(pb, v_bits, v_codes);
 714
 715             v_codes = (p_codes[curidx2] << count2) | sign2;
 716             v_bits  = p_bits[curidx2] + count2;
 717             put_bits(pb, v_bits, v_codes);
 718         }
 719     } else {
 720         for (i = 0; i < size; i += 4) {
 721             int curidx, curidx2, sign1, count1, sign2, count2;
 722             int *in_int = (int *)&in[i];
 723             uint8_t v_bits;
 724             unsigned int v_codes;
 725             int c1, c2, c3, c4;
 726
 727             qc1 = scaled[i  ] * Q34 + 0.4054f;
 728             qc2 = scaled[i+1] * Q34 + 0.4054f;
 729             qc3 = scaled[i+2] * Q34 + 0.4054f;
 730             qc4 = scaled[i+3] * Q34 + 0.4054f;
 731
 732             __asm__ volatile (
 733                 ".set push                                  \n\t"
 734                 ".set noreorder                             \n\t"
 735
 736                 "ori        $t4,        $zero,      16      \n\t"
 737                 "ori        %[sign1],   $zero,      0       \n\t"
 738                 "ori        %[sign2],   $zero,      0       \n\t"
 739                 "shll_s.w   %[c1],      %[qc1],     18      \n\t"
 740                 "shll_s.w   %[c2],      %[qc2],     18      \n\t"
 741                 "shll_s.w   %[c3],      %[qc3],     18      \n\t"
 742                 "shll_s.w   %[c4],      %[qc4],     18      \n\t"
 743                 "srl        %[c1],      %[c1],      18      \n\t"
 744                 "srl        %[c2],      %[c2],      18      \n\t"
 745                 "srl        %[c3],      %[c3],      18      \n\t"
 746                 "srl        %[c4],      %[c4],      18      \n\t"
 747                 "slt        $t0,        $t4,        %[qc1]  \n\t"
 748                 "slt        $t1,        $t4,        %[qc2]  \n\t"
 749                 "slt        $t2,        $t4,        %[qc3]  \n\t"
 750                 "slt        $t3,        $t4,        %[qc4]  \n\t"
 751                 "movn       %[qc1],     $t4,        $t0     \n\t"
 752                 "movn       %[qc2],     $t4,        $t1     \n\t"
 753                 "movn       %[qc3],     $t4,        $t2     \n\t"
 754                 "movn       %[qc4],     $t4,        $t3     \n\t"
 755                 "lw         $t0,        0(%[in_int])        \n\t"
 756                 "lw         $t1,        4(%[in_int])        \n\t"
 757                 "lw         $t2,        8(%[in_int])        \n\t"
 758                 "lw         $t3,        12(%[in_int])       \n\t"
 759                 "slt        $t0,        $t0,        $zero   \n\t"
 760                 "movn       %[sign1],   $t0,        %[qc1]  \n\t"
 761                 "slt        $t2,        $t2,        $zero   \n\t"
 762                 "movn       %[sign2],   $t2,        %[qc3]  \n\t"
 763                 "slt        $t1,        $t1,        $zero   \n\t"
 764                 "sll        $t0,        %[sign1],   1       \n\t"
 765                 "or         $t0,        $t0,        $t1     \n\t"
 766                 "movn       %[sign1],   $t0,        %[qc2]  \n\t"
 767                 "slt        $t3,        $t3,        $zero   \n\t"
 768                 "sll        $t0,        %[sign2],   1       \n\t"
 769                 "or         $t0,        $t0,        $t3     \n\t"
 770                 "movn       %[sign2],   $t0,        %[qc4]  \n\t"
 771                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
 772                 "slt        $t1,        $zero,      %[qc2]  \n\t"
 773                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
 774                 "slt        $t2,        $zero,      %[qc4]  \n\t"
 775                 "addu       %[count1],  %[count1],  $t1     \n\t"
 776                 "addu       %[count2],  %[count2],  $t2     \n\t"
 777
 778                 ".set pop                                   \n\t"
 779
 780                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 781                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 782                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 783                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 784                   [c1]"=&r"(c1), [c2]"=&r"(c2),
 785                   [c3]"=&r"(c3), [c4]"=&r"(c4)
 786                 : [in_int]"r"(in_int)
 787                 : "t0", "t1", "t2", "t3", "t4",
 788                   "memory"
 789             );
 790
 791             curidx = 17 * qc1;
 792             curidx += qc2;
 793
 794             curidx2 = 17 * qc3;
 795             curidx2 += qc4;
 796
 797             v_codes = (p_codes[curidx] << count1) | sign1;
 798             v_bits  = p_bits[curidx] + count1;
 799             put_bits(pb, v_bits, v_codes);
 800
 801             if (p_vectors[curidx*2  ] == 64.0f) {
 802                 int len = av_log2(c1);
 803                 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
 804                 put_bits(pb, len * 2 - 3, v_codes);
 805             }
 806             if (p_vectors[curidx*2+1] == 64.0f) {
 807                 int len = av_log2(c2);
 808                 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
 809                 put_bits(pb, len*2-3, v_codes);
 810             }
 811
 812             v_codes = (p_codes[curidx2] << count2) | sign2;
 813             v_bits  = p_bits[curidx2] + count2;
 814             put_bits(pb, v_bits, v_codes);
 815
 816             if (p_vectors[curidx2*2  ] == 64.0f) {
 817                 int len = av_log2(c3);
 818                 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
 819                 put_bits(pb, len* 2 - 3, v_codes);
 820             }
 821             if (p_vectors[curidx2*2+1] == 64.0f) {
 822                 int len = av_log2(c4);
 823                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
 824                 put_bits(pb, len * 2 - 3, v_codes);
 825             }
 826         }
 827     }
 828 }
 829
 /**
  * Dispatch table of the MIPS quantize-and-encode routines, indexed by cb
  * (the quantize_and_encode_band_cost macro calls entry [cb]).
  *
  * Slot 0 is NULL: this table has no routine for cb == 0, so dispatching with
  * cb == 0 would call through a null function pointer. Slots 1-10 come in
  * pairs that share one routine, and slot 11 is the ESC routine. The table
  * holds 12 entries and nothing here range-checks cb.
  */
 830 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
 831                                                          PutBitContext *pb, const float *in,
 832                                                          const float *scaled, int size, int scale_idx,
 833                                                          int cb, const float lambda, const float uplim,
 834                                                          int *bits) = {
 835     NULL,
 836     quantize_and_encode_band_cost_SQUAD_mips,
 837     quantize_and_encode_band_cost_SQUAD_mips,
 838     quantize_and_encode_band_cost_UQUAD_mips,
 839     quantize_and_encode_band_cost_UQUAD_mips,
 840     quantize_and_encode_band_cost_SPAIR_mips,
 841     quantize_and_encode_band_cost_SPAIR_mips,
 842     quantize_and_encode_band_cost_UPAIR7_mips,
 843     quantize_and_encode_band_cost_UPAIR7_mips,
 844     quantize_and_encode_band_cost_UPAIR12_mips,
 845     quantize_and_encode_band_cost_UPAIR12_mips,
 846     quantize_and_encode_band_cost_ESC_mips,
 847 };
 848
 /*
  * Call the routine stored at quantize_and_encode_band_cost_arr[cb], passing
  * every argument through unchanged. The cb argument is expanded twice (once
  * as the table index, once as a call argument), so it must be free of side
  * effects. No NULL or range check is made on the selected slot.
  */
 849 #define quantize_and_encode_band_cost(                                  \
 850                                 s, pb, in, scaled, size, scale_idx, cb, \
 851                                 lambda, uplim, bits)                    \
 852     quantize_and_encode_band_cost_arr[cb](                              \
 853                                 s, pb, in, scaled, size, scale_idx, cb, \
 854                                 lambda, uplim, bits)
 855
 856 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
 857                                           const float *in, int size, int scale_idx,
 858                                           int cb, const float lambda)
 859 {
 860     quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
 861                                   INFINITY, NULL);
 862 }
 863
 864 /**
 865  * Functions developed from template function and optimized for getting the number of bits
 866  */
 867 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
 868                                         PutBitContext *pb, const float *in,
 869                                         const float *scaled, int size, int scale_idx,
 870                                         int cb, const float lambda, const float uplim,
 871                                         int *bits)
 872 {
 873     return 0;
 874 }
 875
 /**
  * Count the bits needed to code one band with a signed quad codebook
  * (slots cb = 1 and 2 of get_band_numbits_arr), without writing anything.
  *
  * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f), collapsed to 0 or 1,
  * then negated when the matching in[i] word has bit 31 set. Four such values
  * in {-1, 0, 1} form a base-3 index (biased by 40) into
  * ff_aac_spectral_bits[cb-1], whose entry is added to the running total.
  *
  * @param s          unused
  * @param pb         unused
  * @param in         band coefficients; only bit 31 of each 32-bit word is read
  * @param scaled     values to quantize, read four per iteration
  * @param size       coefficient count; the loop advances by 4 (presumably
  *                   always a multiple of 4 -- TODO confirm with callers)
  * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
  * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
  * @param lambda     unused
  * @param uplim      unused
  * @param bits       unused
  * @return accumulated bit count (summed as int, returned as float)
  */
 876 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
 877                                          PutBitContext *pb, const float *in,
 878                                          const float *scaled, int size, int scale_idx,
 879                                          int cb, const float lambda, const float uplim,
 880                                          int *bits)
 881 {
 882     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 883     int i;
 884     int qc1, qc2, qc3, qc4;
 885     int curbits = 0;
 886
 887     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 888
 889     for (i = 0; i < size; i += 4) {
 890         int curidx;
             /* in[] is re-read as integers so the asm can test bit 31 directly */
 891         int *in_int = (int *)&in[i];
 892
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
 893         qc1 = scaled[i  ] * Q34 + 0.4054f;
 894         qc2 = scaled[i+1] * Q34 + 0.4054f;
 895         qc3 = scaled[i+2] * Q34 + 0.4054f;
 896         qc4 = scaled[i+3] * Q34 + 0.4054f;
 897
 898         __asm__ volatile (
 899             ".set push                      \n\t"
 900             ".set noreorder                 \n\t"
 901
                 /* qcN = (0 < qcN): every positive value collapses to 1 */
 902             "slt    %[qc1], $zero,  %[qc1]  \n\t"
 903             "slt    %[qc2], $zero,  %[qc2]  \n\t"
 904             "slt    %[qc3], $zero,  %[qc3]  \n\t"
 905             "slt    %[qc4], $zero,  %[qc4]  \n\t"
                 /* t0..t3 = raw 32-bit words of in[i..i+3] */
 906             "lw     $t0,    0(%[in_int])    \n\t"
 907             "lw     $t1,    4(%[in_int])    \n\t"
 908             "lw     $t2,    8(%[in_int])    \n\t"
 909             "lw     $t3,    12(%[in_int])   \n\t"
                 /* keep only bit 31 of each word (1 when the input is negative) */
 910             "srl    $t0,    $t0,    31      \n\t"
 911             "srl    $t1,    $t1,    31      \n\t"
 912             "srl    $t2,    $t2,    31      \n\t"
 913             "srl    $t3,    $t3,    31      \n\t"
                 /* t4..t7 = -qcN */
 914             "subu   $t4,    $zero,  %[qc1]  \n\t"
 915             "subu   $t5,    $zero,  %[qc2]  \n\t"
 916             "subu   $t6,    $zero,  %[qc3]  \n\t"
 917             "subu   $t7,    $zero,  %[qc4]  \n\t"
                 /* qcN = -qcN where the sign flag is non-zero */
 918             "movn   %[qc1], $t4,    $t0     \n\t"
 919             "movn   %[qc2], $t5,    $t1     \n\t"
 920             "movn   %[qc3], $t6,    $t2     \n\t"
 921             "movn   %[qc4], $t7,    $t3     \n\t"
 922
 923             ".set pop                       \n\t"
 924
 925             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 926               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
 927             : [in_int]"r"(in_int)
 928             : "t0", "t1", "t2", "t3",
 929               "t4", "t5", "t6", "t7",
 930               "memory"
 931         );
 932
             /* base-3 index over four values in -1..1; +40 moves -40..40 to 0..80 */
 933         curidx = qc1;
 934         curidx *= 3;
 935         curidx += qc2;
 936         curidx *= 3;
 937         curidx += qc3;
 938         curidx *= 3;
 939         curidx += qc4;
 940         curidx += 40;
 941
 942         curbits += p_bits[curidx];
 943     }
 944     return curbits;
 945 }
 946
 /**
  * Count the bits needed to code one band with an unsigned quad codebook
  * (slots cb = 3 and 4 of get_band_numbits_arr), without writing anything.
  *
  * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f), capped at 2. Four
  * values form a base-3 index that selects one entry of
  * ff_aac_spectral_bits[cb-1] and one entry of uquad_sign_bits[]; both are
  * added to the running total. in[] is never read here.
  *
  * No lower bound is applied to qc, so scaled[] is presumably non-negative
  * -- TODO confirm with callers.
  *
  * @param s          unused
  * @param pb         unused
  * @param in         unused
  * @param scaled     values to quantize, read four per iteration
  * @param size       coefficient count; the loop advances by 4
  * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
  * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
  * @param lambda     unused
  * @param uplim      unused
  * @param bits       unused
  * @return accumulated bit count (summed as int, returned as float)
  */
 947 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
 948                                          PutBitContext *pb, const float *in,
 949                                          const float *scaled, int size, int scale_idx,
 950                                          int cb, const float lambda, const float uplim,
 951                                          int *bits)
 952 {
 953     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 954     int i;
 955     int curbits = 0;
 956     int qc1, qc2, qc3, qc4;
 957
 958     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 959
 960     for (i = 0; i < size; i += 4) {
 961         int curidx;
 962
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
 963         qc1 = scaled[i  ] * Q34 + 0.4054f;
 964         qc2 = scaled[i+1] * Q34 + 0.4054f;
 965         qc3 = scaled[i+2] * Q34 + 0.4054f;
 966         qc4 = scaled[i+3] * Q34 + 0.4054f;
 967
 968         __asm__ volatile (
 969             ".set push                      \n\t"
 970             ".set noreorder                 \n\t"
 971
                 /* t4 = 2, the cap applied to every quantized value */
 972             "ori    $t4,    $zero,  2       \n\t"
                 /* t0..t3 = (2 < qcN) */
 973             "slt    $t0,    $t4,    %[qc1]  \n\t"
 974             "slt    $t1,    $t4,    %[qc2]  \n\t"
 975             "slt    $t2,    $t4,    %[qc3]  \n\t"
 976             "slt    $t3,    $t4,    %[qc4]  \n\t"
                 /* qcN = 2 wherever it exceeded the cap */
 977             "movn   %[qc1], $t4,    $t0     \n\t"
 978             "movn   %[qc2], $t4,    $t1     \n\t"
 979             "movn   %[qc3], $t4,    $t2     \n\t"
 980             "movn   %[qc4], $t4,    $t3     \n\t"
 981
 982             ".set pop                       \n\t"
 983
 984             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 985               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
 986             :
 987             : "t0", "t1", "t2", "t3", "t4"
 988         );
 989
             /* base-3 index built from the four capped values */
 990         curidx = qc1;
 991         curidx *= 3;
 992         curidx += qc2;
 993         curidx *= 3;
 994         curidx += qc3;
 995         curidx *= 3;
 996         curidx += qc4;
 997
             /* codeword length plus the per-entry amount from uquad_sign_bits[] */
 998         curbits += p_bits[curidx];
 999         curbits += uquad_sign_bits[curidx];
1000     }
1001     return curbits;
1002 }
1003
/**
 * Count the bits needed to code one band with a signed pair codebook
 * (slots cb = 5 and 6 of get_band_numbits_arr), without writing anything.
 *
 * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f), capped at 4, then
 * negated when the matching in[i] word has bit 31 set. Each iteration handles
 * two pairs; a pair (a, b) maps to index 9 * a + b + 40 into
 * ff_aac_spectral_bits[cb-1].
 *
 * @param s          unused
 * @param pb         unused
 * @param in         band coefficients; only bit 31 of each 32-bit word is read
 * @param scaled     values to quantize, read four per iteration
 * @param size       coefficient count; the loop advances by 4
 * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
 * @param lambda     unused
 * @param uplim      unused
 * @param bits       unused
 * @return accumulated bit count (summed as int, returned as float)
 */
1004 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1005                                          PutBitContext *pb, const float *in,
1006                                          const float *scaled, int size, int scale_idx,
1007                                          int cb, const float lambda, const float uplim,
1008                                          int *bits)
1009 {
1010     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1011     int i;
1012     int qc1, qc2, qc3, qc4;
1013     int curbits = 0;
1014
1015     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1016
1017     for (i = 0; i < size; i += 4) {
1018         int curidx, curidx2;
             /* in[] is re-read as integers so the asm can test bit 31 directly */
1019         int *in_int = (int *)&in[i];
1020
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
1021         qc1 = scaled[i  ] * Q34 + 0.4054f;
1022         qc2 = scaled[i+1] * Q34 + 0.4054f;
1023         qc3 = scaled[i+2] * Q34 + 0.4054f;
1024         qc4 = scaled[i+3] * Q34 + 0.4054f;
1025
1026         __asm__ volatile (
1027             ".set push                      \n\t"
1028             ".set noreorder                 \n\t"
1029
                 /* t4 = 4, the cap applied to every quantized value */
1030             "ori    $t4,    $zero,  4       \n\t"
                 /* t0..t3 = (4 < qcN) */
1031             "slt    $t0,    $t4,    %[qc1]  \n\t"
1032             "slt    $t1,    $t4,    %[qc2]  \n\t"
1033             "slt    $t2,    $t4,    %[qc3]  \n\t"
1034             "slt    $t3,    $t4,    %[qc4]  \n\t"
                 /* qcN = 4 wherever it exceeded the cap */
1035             "movn   %[qc1], $t4,    $t0     \n\t"
1036             "movn   %[qc2], $t4,    $t1     \n\t"
1037             "movn   %[qc3], $t4,    $t2     \n\t"
1038             "movn   %[qc4], $t4,    $t3     \n\t"
                 /* t0..t3 = raw 32-bit words of in[i..i+3] */
1039             "lw     $t0,    0(%[in_int])    \n\t"
1040             "lw     $t1,    4(%[in_int])    \n\t"
1041             "lw     $t2,    8(%[in_int])    \n\t"
1042             "lw     $t3,    12(%[in_int])   \n\t"
                 /* keep only bit 31 of each word (1 when the input is negative) */
1043             "srl    $t0,    $t0,    31      \n\t"
1044             "srl    $t1,    $t1,    31      \n\t"
1045             "srl    $t2,    $t2,    31      \n\t"
1046             "srl    $t3,    $t3,    31      \n\t"
                 /* t4..t7 = -qcN; $t4 stops holding the cap from here on */
1047             "subu   $t4,    $zero,  %[qc1]  \n\t"
1048             "subu   $t5,    $zero,  %[qc2]  \n\t"
1049             "subu   $t6,    $zero,  %[qc3]  \n\t"
1050             "subu   $t7,    $zero,  %[qc4]  \n\t"
                 /* qcN = -qcN where the sign flag is non-zero */
1051             "movn   %[qc1], $t4,    $t0     \n\t"
1052             "movn   %[qc2], $t5,    $t1     \n\t"
1053             "movn   %[qc3], $t6,    $t2     \n\t"
1054             "movn   %[qc4], $t7,    $t3     \n\t"
1055
1056             ".set pop                       \n\t"
1057
1058             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1059               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1060             : [in_int]"r"(in_int)
1061             : "t0", "t1", "t2", "t3",
1062               "t4", "t5", "t6", "t7",
1063               "memory"
1064         );
1065
             /* pair index over values in -4..4; +40 moves -40..40 to 0..80 */
1066         curidx  = 9 * qc1;
1067         curidx += qc2 + 40;
1068
1069         curidx2  = 9 * qc3;
1070         curidx2 += qc4 + 40;
1071
1072         curbits += p_bits[curidx] + p_bits[curidx2];
1073     }
1074     return curbits;
1075 }
1076
/**
 * Count the bits needed to code one band with the unsigned pair codebooks
 * in slots cb = 7 and 8 of get_band_numbits_arr, without writing anything.
 *
 * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f), capped at 7. Each
 * iteration handles two pairs; a pair (a, b) maps to index 8 * a + b, which
 * selects one entry of ff_aac_spectral_bits[cb-1] and one entry of
 * upair7_sign_bits[]. in[] is never read here.
 *
 * No lower bound is applied to qc, so scaled[] is presumably non-negative
 * -- TODO confirm with callers.
 *
 * @param s          unused
 * @param pb         unused
 * @param in         unused
 * @param scaled     values to quantize, read four per iteration
 * @param size       coefficient count; the loop advances by 4
 * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
 * @param lambda     unused
 * @param uplim      unused
 * @param bits       unused
 * @return accumulated bit count (summed as int, returned as float)
 */
1077 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1078                                           PutBitContext *pb, const float *in,
1079                                           const float *scaled, int size, int scale_idx,
1080                                           int cb, const float lambda, const float uplim,
1081                                           int *bits)
1082 {
1083     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1084     int i;
1085     int qc1, qc2, qc3, qc4;
1086     int curbits = 0;
1087
1088     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1089
1090     for (i = 0; i < size; i += 4) {
1091         int curidx, curidx2;
1092
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
1093         qc1 = scaled[i  ] * Q34 + 0.4054f;
1094         qc2 = scaled[i+1] * Q34 + 0.4054f;
1095         qc3 = scaled[i+2] * Q34 + 0.4054f;
1096         qc4 = scaled[i+3] * Q34 + 0.4054f;
1097
1098         __asm__ volatile (
1099             ".set push                      \n\t"
1100             ".set noreorder                 \n\t"
1101
                 /* t4 = 7, the cap applied to every quantized value */
1102             "ori    $t4,    $zero,  7       \n\t"
                 /* t0..t3 = (7 < qcN) */
1103             "slt    $t0,    $t4,    %[qc1]  \n\t"
1104             "slt    $t1,    $t4,    %[qc2]  \n\t"
1105             "slt    $t2,    $t4,    %[qc3]  \n\t"
1106             "slt    $t3,    $t4,    %[qc4]  \n\t"
                 /* qcN = 7 wherever it exceeded the cap */
1107             "movn   %[qc1], $t4,    $t0     \n\t"
1108             "movn   %[qc2], $t4,    $t1     \n\t"
1109             "movn   %[qc3], $t4,    $t2     \n\t"
1110             "movn   %[qc4], $t4,    $t3     \n\t"
1111
1112             ".set pop                       \n\t"
1113
1114             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1115               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1116             :
1117             : "t0", "t1", "t2", "t3", "t4"
1118         );
1119
             /* pair index over values capped at 7 */
1120         curidx  = 8 * qc1;
1121         curidx += qc2;
1122
1123         curidx2  = 8 * qc3;
1124         curidx2 += qc4;
1125
             /* codeword lengths plus the per-entry amounts from upair7_sign_bits[] */
1126         curbits += p_bits[curidx] +
1127                    upair7_sign_bits[curidx] +
1128                    p_bits[curidx2] +
1129                    upair7_sign_bits[curidx2];
1130     }
1131     return curbits;
1132 }
1133
/**
 * Count the bits needed to code one band with the unsigned pair codebooks
 * in slots cb = 9 and 10 of get_band_numbits_arr, without writing anything.
 *
 * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f), capped at 12. Each
 * iteration handles two pairs; a pair (a, b) maps to index 13 * a + b, which
 * selects one entry of ff_aac_spectral_bits[cb-1] and one entry of
 * upair12_sign_bits[]. in[] is never read here.
 *
 * No lower bound is applied to qc, so scaled[] is presumably non-negative
 * -- TODO confirm with callers.
 *
 * @param s          unused
 * @param pb         unused
 * @param in         unused
 * @param scaled     values to quantize, read four per iteration
 * @param size       coefficient count; the loop advances by 4
 * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
 * @param lambda     unused
 * @param uplim      unused
 * @param bits       unused
 * @return accumulated bit count (summed as int, returned as float)
 */
1134 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1135                                            PutBitContext *pb, const float *in,
1136                                            const float *scaled, int size, int scale_idx,
1137                                            int cb, const float lambda, const float uplim,
1138                                            int *bits)
1139 {
1140     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1141     int i;
1142     int qc1, qc2, qc3, qc4;
1143     int curbits = 0;
1144
1145     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1146
1147     for (i = 0; i < size; i += 4) {
1148         int curidx, curidx2;
1149
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
1150         qc1 = scaled[i  ] * Q34 + 0.4054f;
1151         qc2 = scaled[i+1] * Q34 + 0.4054f;
1152         qc3 = scaled[i+2] * Q34 + 0.4054f;
1153         qc4 = scaled[i+3] * Q34 + 0.4054f;
1154
1155         __asm__ volatile (
1156             ".set push                      \n\t"
1157             ".set noreorder                 \n\t"
1158
                 /* t4 = 12, the cap applied to every quantized value */
1159             "ori    $t4,    $zero,  12      \n\t"
                 /* t0..t3 = (12 < qcN) */
1160             "slt    $t0,    $t4,    %[qc1]  \n\t"
1161             "slt    $t1,    $t4,    %[qc2]  \n\t"
1162             "slt    $t2,    $t4,    %[qc3]  \n\t"
1163             "slt    $t3,    $t4,    %[qc4]  \n\t"
                 /* qcN = 12 wherever it exceeded the cap */
1164             "movn   %[qc1], $t4,    $t0     \n\t"
1165             "movn   %[qc2], $t4,    $t1     \n\t"
1166             "movn   %[qc3], $t4,    $t2     \n\t"
1167             "movn   %[qc4], $t4,    $t3     \n\t"
1168
1169             ".set pop                       \n\t"
1170
1171             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1172               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1173             :
1174             : "t0", "t1", "t2", "t3", "t4"
1175         );
1176
             /* pair index over values capped at 12 */
1177         curidx  = 13 * qc1;
1178         curidx += qc2;
1179
1180         curidx2  = 13 * qc3;
1181         curidx2 += qc4;
1182
             /* codeword lengths plus the per-entry amounts from upair12_sign_bits[] */
1183         curbits += p_bits[curidx] +
1184                    p_bits[curidx2] +
1185                    upair12_sign_bits[curidx] +
1186                    upair12_sign_bits[curidx2];
1187     }
1188     return curbits;
1189 }
1190
/**
 * Count the bits needed to code one band with the escape codebook
 * (slot cb = 11 of get_band_numbits_arr), without writing anything.
 *
 * Per coefficient: qc = (int)(scaled[i] * Q34 + 0.4054f). A value above 15 is
 * replaced by 16 for the table lookup and additionally costs
 * 2 * floor(log2(c)) - 3 bits, where c is qc after the saturating
 * shift-left / shift-right by 18 performed in the asm. Each iteration handles
 * two pairs; a pair (a, b) maps to index 17 * a + b, which selects one entry
 * of ff_aac_spectral_bits[cb-1] and one entry of esc_sign_bits[].
 * in[] is never read here.
 *
 * No lower bound is applied to qc, so scaled[] is presumably non-negative
 * -- TODO confirm with callers.
 *
 * @param s          unused
 * @param pb         unused
 * @param in         unused
 * @param scaled     values to quantize, read four per iteration
 * @param size       coefficient count; the loop advances by 4
 * @param scale_idx  selects the Q34 factor from ff_aac_pow34sf_tab
 * @param cb         codebook number; indexes ff_aac_spectral_bits[cb-1]
 * @param lambda     unused
 * @param uplim      unused
 * @param bits       unused
 * @return accumulated bit count (summed as int, returned as float)
 */
1191 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1192                                        PutBitContext *pb, const float *in,
1193                                        const float *scaled, int size, int scale_idx,
1194                                        int cb, const float lambda, const float uplim,
1195                                        int *bits)
1196 {
1197     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1198     int i;
1199     int qc1, qc2, qc3, qc4;
1200     int curbits = 0;
1201
1202     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1203
1204     for (i = 0; i < size; i += 4) {
1205         int curidx, curidx2;
1206         int cond0, cond1, cond2, cond3;
1207         int c1, c2, c3, c4;
1208
             /* float -> int conversion truncates; 0.4054f is the rounding bias */
1209         qc1 = scaled[i  ] * Q34 + 0.4054f;
1210         qc2 = scaled[i+1] * Q34 + 0.4054f;
1211         qc3 = scaled[i+2] * Q34 + 0.4054f;
1212         qc4 = scaled[i+3] * Q34 + 0.4054f;
1213
1214         __asm__ volatile (
1215             ".set push                                  \n\t"
1216             ".set noreorder                             \n\t"
1217
                 /* t4 = 15 (escape threshold), t5 = 16 (value stored for escapes) */
1218             "ori        $t4,        $zero,  15          \n\t"
1219             "ori        $t5,        $zero,  16          \n\t"
                 /* cN = qcN shifted left by 18 with saturation (DSP shll_s.w) ... */
1220             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1221             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1222             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1223             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
                 /* ... then logically right by 18, leaving at most 14 low bits */
1224             "srl        %[c1],      %[c1],  18          \n\t"
1225             "srl        %[c2],      %[c2],  18          \n\t"
1226             "srl        %[c3],      %[c3],  18          \n\t"
1227             "srl        %[c4],      %[c4],  18          \n\t"
                 /* condN = (15 < qcN): 1 when the value needs an escape */
1228             "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1229             "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1230             "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1231             "slt        %[cond3],   $t4,    %[qc4]      \n\t"
                 /* qcN = 16 wherever condN is set */
1232             "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1233             "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1234             "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1235             "movn       %[qc4],     $t5,    %[cond3]    \n\t"
                 /* t5 = 31; cN = 31 - clz(cN), the position of the top set bit */
1236             "ori        $t5,        $zero,  31          \n\t"
1237             "clz        %[c1],      %[c1]               \n\t"
1238             "clz        %[c2],      %[c2]               \n\t"
1239             "clz        %[c3],      %[c3]               \n\t"
1240             "clz        %[c4],      %[c4]               \n\t"
1241             "subu       %[c1],      $t5,    %[c1]       \n\t"
1242             "subu       %[c2],      $t5,    %[c2]       \n\t"
1243             "subu       %[c3],      $t5,    %[c3]       \n\t"
1244             "subu       %[c4],      $t5,    %[c4]       \n\t"
                 /* cN = 2 * cN - 3 */
1245             "sll        %[c1],      %[c1],  1           \n\t"
1246             "sll        %[c2],      %[c2],  1           \n\t"
1247             "sll        %[c3],      %[c3],  1           \n\t"
1248             "sll        %[c4],      %[c4],  1           \n\t"
1249             "addiu      %[c1],      %[c1],  -3          \n\t"
1250             "addiu      %[c2],      %[c2],  -3          \n\t"
1251             "addiu      %[c3],      %[c3],  -3          \n\t"
1252             "addiu      %[c4],      %[c4],  -3          \n\t"
                 /* condN = -condN: 0 stays 0, 1 becomes an all-ones mask */
1253             "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1254             "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1255             "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1256             "subu       %[cond3],   $zero,  %[cond3]    \n\t"
                 /* keep the escape length only for escaped values, else 0 */
1257             "and        %[c1],      %[c1],  %[cond0]    \n\t"
1258             "and        %[c2],      %[c2],  %[cond1]    \n\t"
1259             "and        %[c3],      %[c3],  %[cond2]    \n\t"
1260             "and        %[c4],      %[c4],  %[cond3]    \n\t"
1261
1262             ".set pop                                   \n\t"
1263
1264             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1265               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1266               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1267               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1268               [c1]"=&r"(c1), [c2]"=&r"(c2),
1269               [c3]"=&r"(c3), [c4]"=&r"(c4)
1270             :
1271             : "t4", "t5"
1272         );
1273
             /* pair index over values capped at 16 */
1274         curidx = 17 * qc1;
1275         curidx += qc2;
1276
1277         curidx2 = 17 * qc3;
1278         curidx2 += qc4;
1279
             /* codeword lengths plus the per-entry amounts from esc_sign_bits[] */
1280         curbits += p_bits[curidx];
1281         curbits += esc_sign_bits[curidx];
1282         curbits += p_bits[curidx2];
1283         curbits += esc_sign_bits[curidx2];
1284
             /* escape lengths computed in the asm (0 for values that are not escaped) */
1285         curbits += c1;
1286         curbits += c2;
1287         curbits += c3;
1288         curbits += c4;
1289     }
1290     return curbits;
1291 }
1292
/**
 * Dispatch table of the bit-counting routines, indexed by cb (the
 * get_band_numbits macro calls entry [cb]).
 *
 * Slot 0 is the ZERO routine, slots 1-10 come in pairs that share one
 * routine, and slot 11 is the ESC routine. The table holds 12 entries and
 * nothing here range-checks cb.
 */
1293 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1294                                              PutBitContext *pb, const float *in,
1295                                              const float *scaled, int size, int scale_idx,
1296                                              int cb, const float lambda, const float uplim,
1297                                              int *bits) = {
1298     get_band_numbits_ZERO_mips,
1299     get_band_numbits_SQUAD_mips,
1300     get_band_numbits_SQUAD_mips,
1301     get_band_numbits_UQUAD_mips,
1302     get_band_numbits_UQUAD_mips,
1303     get_band_numbits_SPAIR_mips,
1304     get_band_numbits_SPAIR_mips,
1305     get_band_numbits_UPAIR7_mips,
1306     get_band_numbits_UPAIR7_mips,
1307     get_band_numbits_UPAIR12_mips,
1308     get_band_numbits_UPAIR12_mips,
1309     get_band_numbits_ESC_mips,
1310 };
1311
/*
 * Call the routine stored at get_band_numbits_arr[cb], passing every argument
 * through unchanged, and yield its float result. The cb argument is expanded
 * twice (once as the table index, once as a call argument), so it must be
 * free of side effects. No range check is made on cb.
 */
1312 #define get_band_numbits(                                  \
1313                                 s, pb, in, scaled, size, scale_idx, cb, \
1314                                 lambda, uplim, bits)                    \
1315     get_band_numbits_arr[cb](                              \
1316                                 s, pb, in, scaled, size, scale_idx, cb, \
1317                                 lambda, uplim, bits)
1318
1319 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
1320                                      const float *scaled, int size, int scale_idx,
1321                                      int cb, const float lambda, const float uplim,
1322                                      int *bits)
1323 {
1324     return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
1325 }
1326
1327 /**
1328  * Functions developed from template function and optimized for getting the band cost
1329  */
1330 #if HAVE_MIPSFPU
1331 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1332                                      PutBitContext *pb, const float *in,
1333                                      const float *scaled, int size, int scale_idx,
1334                                      int cb, const float lambda, const float uplim,
1335                                      int *bits)
1336 {
1337     int i;
1338     float cost = 0;
1339
1340     for (i = 0; i < size; i += 4) {
1341         cost += in[i  ] * in[i  ];
1342         cost += in[i+1] * in[i+1];
1343         cost += in[i+2] * in[i+2];
1344         cost += in[i+3] * in[i+3];
1345     }
1346     if (bits)
1347         *bits = 0;
1348     return cost * lambda;
1349 }
1350
1351 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1352                                       PutBitContext *pb, const float *in,
1353                                       const float *scaled, int size, int scale_idx,
1354                                       int cb, const float lambda, const float uplim,
1355                                       int *bits)
1356 {
1357     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1358     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1359     int i;
1360     float cost = 0;
1361     int qc1, qc2, qc3, qc4;
1362     int curbits = 0;
1363
1364     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1365     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1366
1367     for (i = 0; i < size; i += 4) {
1368         const float *vec;
1369         int curidx;
1370         int   *in_int = (int   *)&in[i];
1371         float *in_pos = (float *)&in[i];
1372         float di0, di1, di2, di3;
1373
1374         qc1 = scaled[i  ] * Q34 + 0.4054f;
1375         qc2 = scaled[i+1] * Q34 + 0.4054f;
1376         qc3 = scaled[i+2] * Q34 + 0.4054f;
1377         qc4 = scaled[i+3] * Q34 + 0.4054f;
1378
1379         __asm__ volatile (
1380             ".set push                                  \n\t"
1381             ".set noreorder                             \n\t"
1382
1383             "slt        %[qc1], $zero,  %[qc1]          \n\t"
1384             "slt        %[qc2], $zero,  %[qc2]          \n\t"
1385             "slt        %[qc3], $zero,  %[qc3]          \n\t"
1386             "slt        %[qc4], $zero,  %[qc4]          \n\t"
1387             "lw         $t0,    0(%[in_int])            \n\t"
1388             "lw         $t1,    4(%[in_int])            \n\t"
1389             "lw         $t2,    8(%[in_int])            \n\t"
1390             "lw         $t3,    12(%[in_int])           \n\t"
1391             "srl        $t0,    $t0,    31              \n\t"
1392             "srl        $t1,    $t1,    31              \n\t"
1393             "srl        $t2,    $t2,    31              \n\t"
1394             "srl        $t3,    $t3,    31              \n\t"
1395             "subu       $t4,    $zero,  %[qc1]          \n\t"
1396             "subu       $t5,    $zero,  %[qc2]          \n\t"
1397             "subu       $t6,    $zero,  %[qc3]          \n\t"
1398             "subu       $t7,    $zero,  %[qc4]          \n\t"
1399             "movn       %[qc1], $t4,    $t0             \n\t"
1400             "movn       %[qc2], $t5,    $t1             \n\t"
1401             "movn       %[qc3], $t6,    $t2             \n\t"
1402             "movn       %[qc4], $t7,    $t3             \n\t"
1403
1404             ".set pop                                   \n\t"
1405
1406             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1407               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1408             : [in_int]"r"(in_int)
1409             : "t0", "t1", "t2", "t3",
1410               "t4", "t5", "t6", "t7",
1411               "memory"
1412         );
1413
1414         curidx = qc1;
1415         curidx *= 3;
1416         curidx += qc2;
1417         curidx *= 3;
1418         curidx += qc3;
1419         curidx *= 3;
1420         curidx += qc4;
1421         curidx += 40;
1422
1423         curbits += p_bits[curidx];
1424         vec     = &p_codes[curidx*4];
1425
1426         __asm__ volatile (
1427             ".set push                                  \n\t"
1428             ".set noreorder                             \n\t"
1429
1430             "lwc1       $f0,    0(%[in_pos])            \n\t"
1431             "lwc1       $f1,    0(%[vec])               \n\t"
1432             "lwc1       $f2,    4(%[in_pos])            \n\t"
1433             "lwc1       $f3,    4(%[vec])               \n\t"
1434             "lwc1       $f4,    8(%[in_pos])            \n\t"
1435             "lwc1       $f5,    8(%[vec])               \n\t"
1436             "lwc1       $f6,    12(%[in_pos])           \n\t"
1437             "lwc1       $f7,    12(%[vec])              \n\t"
1438             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1439             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1440             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1441             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1442
1443             ".set pop                                   \n\t"
1444
1445             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1446               [di2]"=&f"(di2), [di3]"=&f"(di3)
1447             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1448               [IQ]"f"(IQ)
1449             : "$f0", "$f1", "$f2", "$f3",
1450               "$f4", "$f5", "$f6", "$f7",
1451               "memory"
1452         );
1453
1454         cost += di0 * di0 + di1 * di1
1455                 + di2 * di2 + di3 * di3;
1456     }
1457
1458     if (bits)
1459         *bits = curbits;
1460     return cost * lambda + curbits;
1461 }
1462
/**
 * Estimate the rate-distortion cost of coding one band with a four-value
 * ("quad") codebook whose entries are compared against coefficient
 * magnitudes.
 *
 * Coefficients are processed four at a time. Each scaled[] value is
 * multiplied by Q34, biased by 0.4054f, converted to int and clamped to at
 * most 2. The four results form a base-3 index that selects the code
 * length (p_bits), the sign-bit count (uquad_sign_bits) and the codebook
 * vector (p_codes). The distortion of each coefficient is
 * |in[i]| - vec[k] * IQ, and the squared distortions are accumulated.
 *
 * s, pb and uplim are not referenced by this function.
 *
 * @param in        input coefficients, in[0..size-1] is read
 * @param scaled    values to quantize, same indexing as in
 * @param size      number of coefficients; the loop steps by 4 and reads
 *                  i..i+3, so a multiple of 4 is expected
 * @param scale_idx scalefactor index used to look up Q34 and IQ
 * @param cb        codebook number; tables are indexed with cb - 1
 * @param lambda    weight applied to the accumulated distortion
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return accumulated distortion * lambda + accumulated bit count
 */
1463 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1464                                       PutBitContext *pb, const float *in,
1465                                       const float *scaled, int size, int scale_idx,
1466                                       int cb, const float lambda, const float uplim,
1467                                       int *bits)
1468 {
1469     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1470     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1471     int i;
1472     float cost = 0;
1473     int curbits = 0;
1474     int qc1, qc2, qc3, qc4;
1475
1476     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1477     float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1478
1479     for (i = 0; i < size; i += 4) {
1480         const float *vec;
1481         int curidx;
1482         float *in_pos = (float *)&in[i];
1483         float di0, di1, di2, di3;
1484
             /* Quantize: the float result is truncated by the int assignment. */
1485         qc1 = scaled[i  ] * Q34 + 0.4054f;
1486         qc2 = scaled[i+1] * Q34 + 0.4054f;
1487         qc3 = scaled[i+2] * Q34 + 0.4054f;
1488         qc4 = scaled[i+3] * Q34 + 0.4054f;
1489
             /*
              * qcN = min(qcN, 2): $t4 holds the limit, slt raises a flag when
              * the limit is less than qcN and movn then overwrites qcN with it.
              */
1490         __asm__ volatile (
1491             ".set push                                  \n\t"
1492             ".set noreorder                             \n\t"
1493
1494             "ori        $t4,    $zero,  2               \n\t"
1495             "slt        $t0,    $t4,    %[qc1]          \n\t"
1496             "slt        $t1,    $t4,    %[qc2]          \n\t"
1497             "slt        $t2,    $t4,    %[qc3]          \n\t"
1498             "slt        $t3,    $t4,    %[qc4]          \n\t"
1499             "movn       %[qc1], $t4,    $t0             \n\t"
1500             "movn       %[qc2], $t4,    $t1             \n\t"
1501             "movn       %[qc3], $t4,    $t2             \n\t"
1502             "movn       %[qc4], $t4,    $t3             \n\t"
1503
1504             ".set pop                                   \n\t"
1505
1506             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1507               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1508             :
1509             : "t0", "t1", "t2", "t3", "t4"
1510         );
1511
             /* Base-3 index: ((qc1 * 3 + qc2) * 3 + qc3) * 3 + qc4. */
1512         curidx = qc1;
1513         curidx *= 3;
1514         curidx += qc2;
1515         curidx *= 3;
1516         curidx += qc3;
1517         curidx *= 3;
1518         curidx += qc4;
1519
1520         curbits += p_bits[curidx];
1521         curbits += uquad_sign_bits[curidx];
             /* Each codebook entry holds four floats. */
1522         vec     = &p_codes[curidx*4];
1523
             /*
              * diN = |in[i+N]| - vec[N] * IQ.
              * nmsub.s fd, fr, fs, ft computes fd = fr - fs * ft.
              */
1524         __asm__ volatile (
1525             ".set push                                  \n\t"
1526             ".set noreorder                             \n\t"
1527
1528             "lwc1       %[di0], 0(%[in_pos])            \n\t"
1529             "lwc1       %[di1], 4(%[in_pos])            \n\t"
1530             "lwc1       %[di2], 8(%[in_pos])            \n\t"
1531             "lwc1       %[di3], 12(%[in_pos])           \n\t"
1532             "abs.s      %[di0], %[di0]                  \n\t"
1533             "abs.s      %[di1], %[di1]                  \n\t"
1534             "abs.s      %[di2], %[di2]                  \n\t"
1535             "abs.s      %[di3], %[di3]                  \n\t"
1536             "lwc1       $f0,    0(%[vec])               \n\t"
1537             "lwc1       $f1,    4(%[vec])               \n\t"
1538             "lwc1       $f2,    8(%[vec])               \n\t"
1539             "lwc1       $f3,    12(%[vec])              \n\t"
1540             "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1541             "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1542             "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1543             "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1544
1545             ".set pop                                   \n\t"
1546
1547             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1548               [di2]"=&f"(di2), [di3]"=&f"(di3)
1549             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1550               [IQ]"f"(IQ)
1551             : "$f0", "$f1", "$f2", "$f3",
1552               "memory"
1553         );
1554
1555         cost += di0 * di0 + di1 * di1
1556                 + di2 * di2 + di3 * di3;
1557     }
1558
1559     if (bits)
1560         *bits = curbits;
1561     return cost * lambda + curbits;
1562 }
1563
/**
 * Estimate the rate-distortion cost of coding one band with a two-value
 * ("pair") codebook whose entries carry the coefficient sign.
 *
 * Four coefficients, i.e. two codebook pairs, are processed per iteration.
 * Each scaled[] value is multiplied by Q34, biased by 0.4054f, converted
 * to int and clamped to at most 4; it is then negated when the sign bit of
 * the matching in[] value is set. A pair (a, b) selects table entry
 * 9 * a + b + 40; with both values in [-4, 4] that index lies in [0, 80].
 * The distortion of each coefficient is in[i] - vec[k] * IQ, and the
 * squared distortions are accumulated.
 *
 * s, pb and uplim are not referenced by this function.
 *
 * @param in        input coefficients, in[0..size-1] is read
 * @param scaled    values to quantize, same indexing as in
 * @param size      number of coefficients; the loop steps by 4 and reads
 *                  i..i+3, so a multiple of 4 is expected
 * @param scale_idx scalefactor index used to look up Q34 and IQ
 * @param cb        codebook number; tables are indexed with cb - 1
 * @param lambda    weight applied to the accumulated distortion
 * @param bits      if non-NULL, receives the accumulated bit count
 * @return accumulated distortion * lambda + accumulated bit count
 */
1564 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1565                                       PutBitContext *pb, const float *in,
1566                                       const float *scaled, int size, int scale_idx,
1567                                       int cb, const float lambda, const float uplim,
1568                                       int *bits)
1569 {
1570     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1571     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1572     int i;
1573     float cost = 0;
1574     int qc1, qc2, qc3, qc4;
1575     int curbits = 0;
1576
1577     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1578     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1579
1580     for (i = 0; i < size; i += 4) {
1581         const float *vec, *vec2;
1582         int curidx, curidx2;
             /* in_int and in_pos alias the same four floats; in_int is used to
              * read their raw bit patterns in the first asm block. */
1583         int   *in_int = (int   *)&in[i];
1584         float *in_pos = (float *)&in[i];
1585         float di0, di1, di2, di3;
1586
             /* Quantize: the float result is truncated by the int assignment. */
1587         qc1 = scaled[i  ] * Q34 + 0.4054f;
1588         qc2 = scaled[i+1] * Q34 + 0.4054f;
1589         qc3 = scaled[i+2] * Q34 + 0.4054f;
1590         qc4 = scaled[i+3] * Q34 + 0.4054f;
1591
             /*
              * Step 1: qcN = min(qcN, 4) via slt/movn against the limit in $t4.
              * Step 2: load the raw bits of in[i..i+3], isolate the sign bit
              *         with srl 31, compute -qcN into $t4..$t7 ($t4 is reused
              *         once the clamp is done) and select it with movn when
              *         the sign bit is set.
              */
1592         __asm__ volatile (
1593             ".set push                                  \n\t"
1594             ".set noreorder                             \n\t"
1595
1596             "ori        $t4,    $zero,  4               \n\t"
1597             "slt        $t0,    $t4,    %[qc1]          \n\t"
1598             "slt        $t1,    $t4,    %[qc2]          \n\t"
1599             "slt        $t2,    $t4,    %[qc3]          \n\t"
1600             "slt        $t3,    $t4,    %[qc4]          \n\t"
1601             "movn       %[qc1], $t4,    $t0             \n\t"
1602             "movn       %[qc2], $t4,    $t1             \n\t"
1603             "movn       %[qc3], $t4,    $t2             \n\t"
1604             "movn       %[qc4], $t4,    $t3             \n\t"
1605             "lw         $t0,    0(%[in_int])            \n\t"
1606             "lw         $t1,    4(%[in_int])            \n\t"
1607             "lw         $t2,    8(%[in_int])            \n\t"
1608             "lw         $t3,    12(%[in_int])           \n\t"
1609             "srl        $t0,    $t0,    31              \n\t"
1610             "srl        $t1,    $t1,    31              \n\t"
1611             "srl        $t2,    $t2,    31              \n\t"
1612             "srl        $t3,    $t3,    31              \n\t"
1613             "subu       $t4,    $zero,  %[qc1]          \n\t"
1614             "subu       $t5,    $zero,  %[qc2]          \n\t"
1615             "subu       $t6,    $zero,  %[qc3]          \n\t"
1616             "subu       $t7,    $zero,  %[qc4]          \n\t"
1617             "movn       %[qc1], $t4,    $t0             \n\t"
1618             "movn       %[qc2], $t5,    $t1             \n\t"
1619             "movn       %[qc3], $t6,    $t2             \n\t"
1620             "movn       %[qc4], $t7,    $t3             \n\t"
1621
1622             ".set pop                                   \n\t"
1623
1624             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1625               [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1626             : [in_int]"r"(in_int)
1627             : "t0", "t1", "t2", "t3",
1628               "t4", "t5", "t6", "t7",
1629               "memory"
1630         );
1631
             /* Signed pair index; +40 (= 9 * 4 + 4) makes it non-negative. */
1632         curidx = 9 * qc1;
1633         curidx += qc2 + 40;
1634
1635         curidx2 = 9 * qc3;
1636         curidx2 += qc4 + 40;
1637
1638         curbits += p_bits[curidx];
1639         curbits += p_bits[curidx2];
1640
             /* Each codebook entry holds two floats. */
1641         vec     = &p_codes[curidx*2];
1642         vec2    = &p_codes[curidx2*2];
1643
             /*
              * di0/di1 = in[i+0..1] - vec[0..1]  * IQ
              * di2/di3 = in[i+2..3] - vec2[0..1] * IQ
              * nmsub.s fd, fr, fs, ft computes fd = fr - fs * ft.
              */
1644         __asm__ volatile (
1645             ".set push                                  \n\t"
1646             ".set noreorder                             \n\t"
1647
1648             "lwc1       $f0,    0(%[in_pos])            \n\t"
1649             "lwc1       $f1,    0(%[vec])               \n\t"
1650             "lwc1       $f2,    4(%[in_pos])            \n\t"
1651             "lwc1       $f3,    4(%[vec])               \n\t"
1652             "lwc1       $f4,    8(%[in_pos])            \n\t"
1653             "lwc1       $f5,    0(%[vec2])              \n\t"
1654             "lwc1       $f6,    12(%[in_pos])           \n\t"
1655             "lwc1       $f7,    4(%[vec2])              \n\t"
1656             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1657             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1658             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1659             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1660
1661             ".set pop                                   \n\t"
1662
1663             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1664               [di2]"=&f"(di2), [di3]"=&f"(di3)
1665             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1666               [vec2]"r"(vec2), [IQ]"f"(IQ)
1667             : "$f0", "$f1", "$f2", "$f3",
1668               "$f4", "$f5", "$f6", "$f7",
1669               "memory"
1670         );
1671
1672         cost += di0 * di0 + di1 * di1
1673                 + di2 * di2 + di3 * di3;
1674     }
1675
1676     if (bits)
1677         *bits = curbits;
1678     return cost * lambda + curbits;
1679 }
1680
1681 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1682                                        PutBitContext *pb, const float *in,
1683                                        const float *scaled, int size, int scale_idx,
1684                                        int cb, const float lambda, const float uplim,
1685                                        int *bits)
1686 {
1687     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1688     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1689     int i;
1690     float cost = 0;
1691     int qc1, qc2, qc3, qc4;
1692     int curbits = 0;
1693
1694     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1695     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1696
1697     for (i = 0; i < size; i += 4) {
1698         const float *vec, *vec2;
1699         int curidx, curidx2, sign1, count1, sign2, count2;
1700         int   *in_int = (int   *)&in[i];
1701         float *in_pos = (float *)&in[i];
1702         float di0, di1, di2, di3;
1703
1704         qc1 = scaled[i  ] * Q34 + 0.4054f;
1705         qc2 = scaled[i+1] * Q34 + 0.4054f;
1706         qc3 = scaled[i+2] * Q34 + 0.4054f;
1707         qc4 = scaled[i+3] * Q34 + 0.4054f;
1708
1709         __asm__ volatile (
1710             ".set push                                          \n\t"
1711             ".set noreorder                                     \n\t"
1712
1713             "ori        $t4,        $zero,      7               \n\t"
1714             "ori        %[sign1],   $zero,      0               \n\t"
1715             "ori        %[sign2],   $zero,      0               \n\t"
1716             "slt        $t0,        $t4,        %[qc1]          \n\t"
1717             "slt        $t1,        $t4,        %[qc2]          \n\t"
1718             "slt        $t2,        $t4,        %[qc3]          \n\t"
1719             "slt        $t3,        $t4,        %[qc4]          \n\t"
1720             "movn       %[qc1],     $t4,        $t0             \n\t"
1721             "movn       %[qc2],     $t4,        $t1             \n\t"
1722             "movn       %[qc3],     $t4,        $t2             \n\t"
1723             "movn       %[qc4],     $t4,        $t3             \n\t"
1724             "lw         $t0,        0(%[in_int])                \n\t"
1725             "lw         $t1,        4(%[in_int])                \n\t"
1726             "lw         $t2,        8(%[in_int])                \n\t"
1727             "lw         $t3,        12(%[in_int])               \n\t"
1728             "slt        $t0,        $t0,        $zero           \n\t"
1729             "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1730             "slt        $t2,        $t2,        $zero           \n\t"
1731             "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1732             "slt        $t1,        $t1,        $zero           \n\t"
1733             "sll        $t0,        %[sign1],   1               \n\t"
1734             "or         $t0,        $t0,        $t1             \n\t"
1735             "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1736             "slt        $t3,        $t3,        $zero           \n\t"
1737             "sll        $t0,        %[sign2],   1               \n\t"
1738             "or         $t0,        $t0,        $t3             \n\t"
1739             "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1740             "slt        %[count1],  $zero,      %[qc1]          \n\t"
1741             "slt        $t1,        $zero,      %[qc2]          \n\t"
1742             "slt        %[count2],  $zero,      %[qc3]          \n\t"
1743             "slt        $t2,        $zero,      %[qc4]          \n\t"
1744             "addu       %[count1],  %[count1],  $t1             \n\t"
1745             "addu       %[count2],  %[count2],  $t2             \n\t"
1746
1747             ".set pop                                           \n\t"
1748
1749             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1750               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1751               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1752               [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1753             : [in_int]"r"(in_int)
1754             : "t0", "t1", "t2", "t3", "t4",
1755               "memory"
1756         );
1757
1758         curidx = 8 * qc1;
1759         curidx += qc2;
1760
1761         curidx2 = 8 * qc3;
1762         curidx2 += qc4;
1763
1764         curbits += p_bits[curidx];
1765         curbits += upair7_sign_bits[curidx];
1766         vec     = &p_codes[curidx*2];
1767
1768         curbits += p_bits[curidx2];
1769         curbits += upair7_sign_bits[curidx2];
1770         vec2    = &p_codes[curidx2*2];
1771
1772         __asm__ volatile (
1773             ".set push                                          \n\t"
1774             ".set noreorder                                     \n\t"
1775
1776             "lwc1       %[di0],     0(%[in_pos])                \n\t"
1777             "lwc1       %[di1],     4(%[in_pos])                \n\t"
1778             "lwc1       %[di2],     8(%[in_pos])                \n\t"
1779             "lwc1       %[di3],     12(%[in_pos])               \n\t"
1780             "abs.s      %[di0],     %[di0]                      \n\t"
1781             "abs.s      %[di1],     %[di1]                      \n\t"
1782             "abs.s      %[di2],     %[di2]                      \n\t"
1783             "abs.s      %[di3],     %[di3]                      \n\t"
1784             "lwc1       $f0,        0(%[vec])                   \n\t"
1785             "lwc1       $f1,        4(%[vec])                   \n\t"
1786             "lwc1       $f2,        0(%[vec2])                  \n\t"
1787             "lwc1       $f3,        4(%[vec2])                  \n\t"
1788             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1789             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1790             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1791             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1792
1793             ".set pop                                           \n\t"
1794
1795             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1796               [di2]"=&f"(di2), [di3]"=&f"(di3)
1797             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1798               [vec2]"r"(vec2), [IQ]"f"(IQ)
1799             : "$f0", "$f1", "$f2", "$f3",
1800               "memory"
1801         );
1802
1803         cost += di0 * di0 + di1 * di1
1804                 + di2 * di2 + di3 * di3;
1805     }
1806
1807     if (bits)
1808         *bits = curbits;
1809     return cost * lambda + curbits;
1810 }
1811
1812 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1813                                         PutBitContext *pb, const float *in,
1814                                         const float *scaled, int size, int scale_idx,
1815                                         int cb, const float lambda, const float uplim,
1816                                         int *bits)
1817 {
1818     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1819     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1820     int i;
1821     float cost = 0;
1822     int qc1, qc2, qc3, qc4;
1823     int curbits = 0;
1824
1825     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1826     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1827
1828     for (i = 0; i < size; i += 4) {
1829         const float *vec, *vec2;
1830         int curidx, curidx2;
1831         int sign1, count1, sign2, count2;
1832         int   *in_int = (int   *)&in[i];
1833         float *in_pos = (float *)&in[i];
1834         float di0, di1, di2, di3;
1835
1836         qc1 = scaled[i  ] * Q34 + 0.4054f;
1837         qc2 = scaled[i+1] * Q34 + 0.4054f;
1838         qc3 = scaled[i+2] * Q34 + 0.4054f;
1839         qc4 = scaled[i+3] * Q34 + 0.4054f;
1840
1841         __asm__ volatile (
1842             ".set push                                          \n\t"
1843             ".set noreorder                                     \n\t"
1844
1845             "ori        $t4,        $zero,      12              \n\t"
1846             "ori        %[sign1],   $zero,      0               \n\t"
1847             "ori        %[sign2],   $zero,      0               \n\t"
1848             "slt        $t0,        $t4,        %[qc1]          \n\t"
1849             "slt        $t1,        $t4,        %[qc2]          \n\t"
1850             "slt        $t2,        $t4,        %[qc3]          \n\t"
1851             "slt        $t3,        $t4,        %[qc4]          \n\t"
1852             "movn       %[qc1],     $t4,        $t0             \n\t"
1853             "movn       %[qc2],     $t4,        $t1             \n\t"
1854             "movn       %[qc3],     $t4,        $t2             \n\t"
1855             "movn       %[qc4],     $t4,        $t3             \n\t"
1856             "lw         $t0,        0(%[in_int])                \n\t"
1857             "lw         $t1,        4(%[in_int])                \n\t"
1858             "lw         $t2,        8(%[in_int])                \n\t"
1859             "lw         $t3,        12(%[in_int])               \n\t"
1860             "slt        $t0,        $t0,        $zero           \n\t"
1861             "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1862             "slt        $t2,        $t2,        $zero           \n\t"
1863             "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1864             "slt        $t1,        $t1,        $zero           \n\t"
1865             "sll        $t0,        %[sign1],   1               \n\t"
1866             "or         $t0,        $t0,        $t1             \n\t"
1867             "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1868             "slt        $t3,        $t3,        $zero           \n\t"
1869             "sll        $t0,        %[sign2],   1               \n\t"
1870             "or         $t0,        $t0,        $t3             \n\t"
1871             "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1872             "slt        %[count1],  $zero,      %[qc1]          \n\t"
1873             "slt        $t1,        $zero,      %[qc2]          \n\t"
1874             "slt        %[count2],  $zero,      %[qc3]          \n\t"
1875             "slt        $t2,        $zero,      %[qc4]          \n\t"
1876             "addu       %[count1],  %[count1],  $t1             \n\t"
1877             "addu       %[count2],  %[count2],  $t2             \n\t"
1878
1879             ".set pop                                           \n\t"
1880
1881             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1882               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1883               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1884               [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1885             : [in_int]"r"(in_int)
1886             : "t0", "t1", "t2", "t3", "t4",
1887               "memory"
1888         );
1889
1890         curidx = 13 * qc1;
1891         curidx += qc2;
1892
1893         curidx2 = 13 * qc3;
1894         curidx2 += qc4;
1895
1896         curbits += p_bits[curidx];
1897         curbits += p_bits[curidx2];
1898         curbits += upair12_sign_bits[curidx];
1899         curbits += upair12_sign_bits[curidx2];
1900         vec     = &p_codes[curidx*2];
1901         vec2    = &p_codes[curidx2*2];
1902
1903         __asm__ volatile (
1904             ".set push                                          \n\t"
1905             ".set noreorder                                     \n\t"
1906
1907             "lwc1       %[di0],     0(%[in_pos])                \n\t"
1908             "lwc1       %[di1],     4(%[in_pos])                \n\t"
1909             "lwc1       %[di2],     8(%[in_pos])                \n\t"
1910             "lwc1       %[di3],     12(%[in_pos])               \n\t"
1911             "abs.s      %[di0],     %[di0]                      \n\t"
1912             "abs.s      %[di1],     %[di1]                      \n\t"
1913             "abs.s      %[di2],     %[di2]                      \n\t"
1914             "abs.s      %[di3],     %[di3]                      \n\t"
1915             "lwc1       $f0,        0(%[vec])                   \n\t"
1916             "lwc1       $f1,        4(%[vec])                   \n\t"
1917             "lwc1       $f2,        0(%[vec2])                  \n\t"
1918             "lwc1       $f3,        4(%[vec2])                  \n\t"
1919             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1920             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1921             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1922             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1923
1924             ".set pop                                           \n\t"
1925
1926             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1927               [di2]"=&f"(di2), [di3]"=&f"(di3)
1928             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1929               [vec2]"r"(vec2), [IQ]"f"(IQ)
1930             : "$f0", "$f1", "$f2", "$f3",
1931               "memory"
1932         );
1933
1934         cost += di0 * di0 + di1 * di1
1935                 + di2 * di2 + di3 * di3;
1936     }
1937
1938     if (bits)
1939         *bits = curbits;
1940     return cost * lambda + curbits;
1941 }
1942
1943 static float get_band_cost_ESC_mips(struct AACEncContext *s,
1944                                     PutBitContext *pb, const float *in,
1945                                     const float *scaled, int size, int scale_idx,
1946                                     int cb, const float lambda, const float uplim,
1947                                     int *bits)
1948 {
1949     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1950     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1951     const float CLIPPED_ESCAPE = 165140.0f * IQ;
1952     int i;
1953     float cost = 0;
1954     int qc1, qc2, qc3, qc4;
1955     int curbits = 0;
1956
1957     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1958     float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
1959
1960     for (i = 0; i < size; i += 4) {
1961         const float *vec, *vec2;
1962         int curidx, curidx2;
1963         float t1, t2, t3, t4;
1964         float di1, di2, di3, di4;
1965         int cond0, cond1, cond2, cond3;
1966         int c1, c2, c3, c4;
1967
1968         qc1 = scaled[i  ] * Q34 + 0.4054f;
1969         qc2 = scaled[i+1] * Q34 + 0.4054f;
1970         qc3 = scaled[i+2] * Q34 + 0.4054f;
1971         qc4 = scaled[i+3] * Q34 + 0.4054f;
1972
1973         __asm__ volatile (
1974             ".set push                                  \n\t"
1975             ".set noreorder                             \n\t"
1976
1977             "ori        $t4,        $zero,  15          \n\t"
1978             "ori        $t5,        $zero,  16          \n\t"
1979             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1980             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1981             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1982             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1983             "srl        %[c1],      %[c1],  18          \n\t"
1984             "srl        %[c2],      %[c2],  18          \n\t"
1985             "srl        %[c3],      %[c3],  18          \n\t"
1986             "srl        %[c4],      %[c4],  18          \n\t"
1987             "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1988             "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1989             "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1990             "slt        %[cond3],   $t4,    %[qc4]      \n\t"
1991             "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1992             "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1993             "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1994             "movn       %[qc4],     $t5,    %[cond3]    \n\t"
1995
1996             ".set pop                                   \n\t"
1997
1998             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1999               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2000               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2001               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2002               [c1]"=&r"(c1), [c2]"=&r"(c2),
2003               [c3]"=&r"(c3), [c4]"=&r"(c4)
2004             :
2005             : "t4", "t5"
2006         );
2007
2008         curidx = 17 * qc1;
2009         curidx += qc2;
2010
2011         curidx2 = 17 * qc3;
2012         curidx2 += qc4;
2013
2014         curbits += p_bits[curidx];
2015         curbits += esc_sign_bits[curidx];
2016         vec     = &p_codes[curidx*2];
2017
2018         curbits += p_bits[curidx2];
2019         curbits += esc_sign_bits[curidx2];
2020         vec2     = &p_codes[curidx2*2];
2021
2022         curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2023         curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2024         curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2025         curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2026
2027         t1 = fabsf(in[i  ]);
2028         t2 = fabsf(in[i+1]);
2029         t3 = fabsf(in[i+2]);
2030         t4 = fabsf(in[i+3]);
2031
2032         if (cond0) {
2033             if (t1 >= CLIPPED_ESCAPE) {
2034                 di1 = t1 - CLIPPED_ESCAPE;
2035             } else {
2036                 di1 = t1 - c1 * cbrtf(c1) * IQ;
2037             }
2038         } else
2039             di1 = t1 - vec[0] * IQ;
2040
2041         if (cond1) {
2042             if (t2 >= CLIPPED_ESCAPE) {
2043                 di2 = t2 - CLIPPED_ESCAPE;
2044             } else {
2045                 di2 = t2 - c2 * cbrtf(c2) * IQ;
2046             }
2047         } else
2048             di2 = t2 - vec[1] * IQ;
2049
2050         if (cond2) {
2051             if (t3 >= CLIPPED_ESCAPE) {
2052                 di3 = t3 - CLIPPED_ESCAPE;
2053             } else {
2054                 di3 = t3 - c3 * cbrtf(c3) * IQ;
2055             }
2056         } else
2057             di3 = t3 - vec2[0] * IQ;
2058
2059         if (cond3) {
2060             if (t4 >= CLIPPED_ESCAPE) {
2061                 di4 = t4 - CLIPPED_ESCAPE;
2062             } else {
2063                 di4 = t4 - c4 * cbrtf(c4) * IQ;
2064             }
2065         } else
2066             di4 = t4 - vec2[1]*IQ;
2067
2068         cost += di1 * di1 + di2 * di2
2069                 + di3 * di3 + di4 * di4;
2070     }
2071
2072     if (bits)
2073         *bits = curbits;
2074     return cost * lambda + curbits;
2075 }
2076
/*
 * Band cost estimators, indexed directly by the codebook number cb
 * (see the get_band_cost() macro). Consecutive codebooks that share an
 * estimator appear as repeated entries.
 */
2077 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2078                                           PutBitContext *pb, const float *in,
2079                                           const float *scaled, int size, int scale_idx,
2080                                           int cb, const float lambda, const float uplim,
2081                                           int *bits) = {
2082     get_band_cost_ZERO_mips,      /* cb  0 */
2083     get_band_cost_SQUAD_mips,     /* cb  1 */
2084     get_band_cost_SQUAD_mips,     /* cb  2 */
2085     get_band_cost_UQUAD_mips,     /* cb  3 */
2086     get_band_cost_UQUAD_mips,     /* cb  4 */
2087     get_band_cost_SPAIR_mips,     /* cb  5 */
2088     get_band_cost_SPAIR_mips,     /* cb  6 */
2089     get_band_cost_UPAIR7_mips,    /* cb  7 */
2090     get_band_cost_UPAIR7_mips,    /* cb  8 */
2091     get_band_cost_UPAIR12_mips,   /* cb  9 */
2092     get_band_cost_UPAIR12_mips,   /* cb 10 */
2093     get_band_cost_ESC_mips,       /* cb 11 */
2094 };
2095
/*
 * Call the cost estimator registered for codebook cb in
 * get_band_cost_arr[], forwarding every argument unchanged.
 * cb is expanded twice (as table index and as call argument), so it must
 * not have side effects. No range check is performed on cb.
 */
2096 #define get_band_cost(                                  \
2097                                 s, pb, in, scaled, size, scale_idx, cb, \
2098                                 lambda, uplim, bits)                    \
2099     get_band_cost_arr[cb](                              \
2100                                 s, pb, in, scaled, size, scale_idx, cb, \
2101                                 lambda, uplim, bits)
2102
/*
 * Return the cost reported by the band-cost function selected by cb.
 *
 * All arguments are forwarded to get_band_cost(); the PutBitContext slot is
 * always NULL here. If bits is non-NULL it is handed through so the callee
 * can report the bit count.
 */
static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
                                int *bits)
{
    float band_cost;

    band_cost = get_band_cost(s, NULL, in, scaled, size, scale_idx, cb,
                              lambda, uplim, bits);
    return band_cost;
}
2110
/*
 * Choose scalefactor indices (sce->sf_idx) and codebooks (sce->band_type)
 * for one channel with a two-level iterative search.
 *
 * The inner loop moves all scalefactors up or down by a shrinking step,
 * steering the estimated bit count towards destbits. The outer loop then
 * lowers the scalefactor of bands whose distortion exceeds their limit and
 * repeats, at most 10 times, for as long as any scalefactor changed.
 *
 * avctx   bit_rate, sample_rate and channels are read for the bit budget
 * s       encoder context; s->psy band data is read and s->scoefs is handed
 *         to abs_pow34_v() (presumably as its output buffer)
 * sce     channel element; zeroes, sf_idx and band_type are updated
 * lambda  not referenced by this function
 */
2111 static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2112                                                AACEncContext *s,
2113                                                SingleChannelElement *sce,
2114                                                const float lambda)
2115 {
2116     int start = 0, i, w, w2, g;
         /* bit budget: bit_rate * 1024 / sample_rate / channels (capped to 5800 below) */
2117     int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2118     float dists[128] = { 0 }, uplims[128];
2119     float maxvals[128];
2120     int fflag, minscaler;
2121     int its  = 0;
         /* despite its name, allz becomes non-zero once any band is NOT zeroed */
2122     int allz = 0;
2123     float minthr = INFINITY;
2124
2125     destbits = FFMIN(destbits, 5800);
         /*
          * Pass 1: per window group and band, sum the psy thresholds of the
          * group's windows, flag windows whose energy does not exceed the
          * threshold (or whose threshold is 0) as zero, and track the smallest
          * summed threshold among bands that keep at least one window.
          */
2126     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2127         for (g = 0;  g < sce->ics.num_swb; g++) {
2128             int nz = 0;
2129             float uplim = 0.0f;
2130             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2131                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2132                 uplim += band->threshold;
2133                 if (band->energy <= band->threshold || band->threshold == 0.0f) {
2134                     sce->zeroes[(w+w2)*16+g] = 1;
2135                     continue;
2136                 }
2137                 nz = 1;
2138             }
2139             uplims[w*16+g] = uplim *512;
                 /* the group's first window carries the group-level zero flag */
2140             sce->zeroes[w*16+g] = !nz;
2141             if (nz)
2142                 minthr = FFMIN(minthr, uplim);
2143             allz |= nz;
2144         }
2145     }
         /*
          * Pass 2: initial scalefactors. Zeroed bands get SCALE_ONE_POS, the
          * others SCALE_ONE_POS plus 4*log2(uplims/minthr), limited to +59.
          */
2146     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2147         for (g = 0;  g < sce->ics.num_swb; g++) {
2148             if (sce->zeroes[w*16+g]) {
2149                 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2150                 continue;
2151             }
2152             sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2153         }
2154     }
2155
         /* every band was zeroed: nothing left to search */
2156     if (!allz)
2157         return;
         /* presumably fills s->scoefs with |coeff|^(3/4) for all 1024 coefficients - confirm */
2158     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2159
         /* per group/band value from find_max_val(), used later to pick a codebook */
2160     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2161         start = w*128;
2162         for (g = 0;  g < sce->ics.num_swb; g++) {
2163             const float *scaled = s->scoefs + start;
2164             maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2165             start += sce->ics.swb_sizes[g];
2166         }
2167     }
2168
         /* outer loop: repeated while a scalefactor changed, at most 10 times */
2169     do {
2170         int tbits, qstep;
2171         minscaler = sce->sf_idx[0];
             /* first outer iteration starts with step 32, later ones with step 1 */
2172         qstep = its ? 1 : 32;
             /* inner loop: global scalefactor shift with a halving step */
2173         do {
2174             int prev = -1;
2175             tbits = 0;
2176             fflag = 0;
2177
                 /* coarse steps: only the bit count is needed, distortion is not computed */
2178             if (qstep > 1) {
2179                 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2180                     start = w*128;
2181                     for (g = 0;  g < sce->ics.num_swb; g++) {
2182                         const float *coefs = sce->coeffs + start;
2183                         const float *scaled = s->scoefs + start;
2184                         int bits = 0;
2185                         int cb;
2186
2187                         if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2188                             start += sce->ics.swb_sizes[g];
2189                             continue;
2190                         }
2191                         minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2192                         cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2193                         for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                                 /* b receives the callee's bit output but is not read here */
2194                             int b;
2195                             bits += quantize_band_cost_bits(s, coefs + w2*128,
2196                                                             scaled + w2*128,
2197                                                             sce->ics.swb_sizes[g],
2198                                                             sce->sf_idx[w*16+g],
2199                                                             cb,
2200                                                             1.0f,
2201                                                             INFINITY,
2202                                                             &b);
2203                         }
                             /* add the cost of the scalefactor delta to the previous coded band */
2204                         if (prev != -1) {
2205                             bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2206                         }
2207                         tbits += bits;
2208                         start += sce->ics.swb_sizes[g];
2209                         prev = sce->sf_idx[w*16+g];
2210                     }
2211                 }
2212             }
                 /* step 1: also record a per-band distortion figure in dists[] */
2213             else {
2214                 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2215                     start = w*128;
2216                     for (g = 0;  g < sce->ics.num_swb; g++) {
2217                         const float *coefs = sce->coeffs + start;
2218                         const float *scaled = s->scoefs + start;
2219                         int bits = 0;
2220                         int cb;
2221                         float dist = 0.0f;
2222
2223                         if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2224                             start += sce->ics.swb_sizes[g];
2225                             continue;
2226                         }
2227                         minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2228                         cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2229                         for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2230                             int b;
2231                             dist += quantize_band_cost(s, coefs + w2*128,
2232                                                        scaled + w2*128,
2233                                                        sce->ics.swb_sizes[g],
2234                                                        sce->sf_idx[w*16+g],
2235                                                        cb,
2236                                                        1.0f,
2237                                                        INFINITY,
2238                                                        &b);
2239                             bits += b;
2240                         }
                             /*
                              * NOTE(review): the cost is requested with lambda = 1.0f; if every
                              * estimator returns distortion * lambda + bits (as the ESC one in
                              * this file does), subtracting bits leaves the distortion - confirm.
                              */
2241                         dists[w*16+g] = dist - bits;
2242                         if (prev != -1) {
2243                             bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2244                         }
2245                         tbits += bits;
2246                         start += sce->ics.swb_sizes[g];
2247                         prev = sce->sf_idx[w*16+g];
2248                     }
2249                 }
2250             }
                 /* over budget: raise every scalefactor by qstep; otherwise lower them */
2251             if (tbits > destbits) {
2252                 for (i = 0; i < 128; i++)
2253                     if (sce->sf_idx[i] < 218 - qstep)
2254                         sce->sf_idx[i] += qstep;
2255             } else {
2256                 for (i = 0; i < 128; i++)
2257                     if (sce->sf_idx[i] > 60 - qstep)
2258                         sce->sf_idx[i] -= qstep;
2259             }
2260             qstep >>= 1;
                 /* step exhausted but still more than 2% over budget: keep stepping by 1 */
2261             if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2262                 qstep = 1;
2263         } while (qstep);
2264
2265         fflag = 0;
2266         minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
             /*
              * Per-band correction: where the recorded distortion exceeds the
              * band's limit, lower the scalefactor by 1, or by 2 when
              * find_min_book() returns 0 for the value one step lower.
              */
2267         for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2268             for (g = 0; g < sce->ics.num_swb; g++) {
2269                 int prevsc = sce->sf_idx[w*16+g];
2270                 if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2271                     if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2272                         sce->sf_idx[w*16+g]--;
2273                     else
2274                         sce->sf_idx[w*16+g]-=2;
2275                 }
                     /* keep the value within SCALE_MAX_DIFF of minscaler and at most 219 */
2276                 sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2277                 sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
                     /* any change requests another outer iteration */
2278                 if (sce->sf_idx[w*16+g] != prevsc)
2279                     fflag = 1;
2280                 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2281             }
2282         }
2283         its++;
2284     } while (fflag && its < 10);
2285 }
2286
2287 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
2288                                const float lambda)
2289 {
2290     int start = 0, i, w, w2, g;
2291     float M[128], S[128];
2292     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2293     SingleChannelElement *sce0 = &cpe->ch[0];
2294     SingleChannelElement *sce1 = &cpe->ch[1];
2295     if (!cpe->common_window)
2296         return;
2297     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2298         for (g = 0;  g < sce0->ics.num_swb; g++) {
2299             if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2300                 float dist1 = 0.0f, dist2 = 0.0f;
2301                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2302                     FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2303                     FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2304                     float minthr = FFMIN(band0->threshold, band1->threshold);
2305                     float maxthr = FFMAX(band0->threshold, band1->threshold);
2306                     for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2307                         M[i  ] = (sce0->coeffs[start+w2*128+i  ]
2308                                 + sce1->coeffs[start+w2*128+i  ]) * 0.5;
2309                         M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2310                                 + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2311                         M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2312                                 + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2313                         M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2314                                 + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2315
2316                         S[i  ] =  M[i  ]
2317                                 - sce1->coeffs[start+w2*128+i  ];
2318                         S[i+1] =  M[i+1]
2319                                 - sce1->coeffs[start+w2*128+i+1];
2320                         S[i+2] =  M[i+2]
2321                                 - sce1->coeffs[start+w2*128+i+2];
2322                         S[i+3] =  M[i+3]
2323                                 - sce1->coeffs[start+w2*128+i+3];
2324                    }
2325                     abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2326                     abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2327                     abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
2328                     abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
2329                     dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2330                                                 L34,
2331                                                 sce0->ics.swb_sizes[g],
2332                                                 sce0->sf_idx[(w+w2)*16+g],
2333                                                 sce0->band_type[(w+w2)*16+g],
2334                                                 lambda / band0->threshold, INFINITY, NULL);
2335                     dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2336                                                 R34,
2337                                                 sce1->ics.swb_sizes[g],
2338                                                 sce1->sf_idx[(w+w2)*16+g],
2339                                                 sce1->band_type[(w+w2)*16+g],
2340                                                 lambda / band1->threshold, INFINITY, NULL);
2341                     dist2 += quantize_band_cost(s, M,
2342                                                 M34,
2343                                                 sce0->ics.swb_sizes[g],
2344                                                 sce0->sf_idx[(w+w2)*16+g],
2345                                                 sce0->band_type[(w+w2)*16+g],
2346                                                 lambda / maxthr, INFINITY, NULL);
2347                     dist2 += quantize_band_cost(s, S,
2348                                                 S34,
2349                                                 sce1->ics.swb_sizes[g],
2350                                                 sce1->sf_idx[(w+w2)*16+g],
2351                                                 sce1->band_type[(w+w2)*16+g],
2352                                                 lambda / minthr, INFINITY, NULL);
2353                 }
2354                 cpe->ms_mask[w*16+g] = dist2 < dist1;
2355             }
2356             start += sce0->ics.swb_sizes[g];
2357         }
2358     }
2359 }
2360 #endif /*HAVE_MIPSFPU */
2361
/*
 * Select the codebook for every scalefactor band of one window group and
 * write the resulting section data (codebook number plus run length) to
 * s->pb.
 *
 * A table path[band][codebook] is filled front to back: for each band and
 * codebook the cheaper of "continue the current run of this codebook" and
 * "start a new run coming from the cheapest codebook of the previous band"
 * is stored. The cheapest end state is then traced back into a list of
 * (codebook, run) sections, which are emitted in band order.
 *
 * s          encoder context; s->scoefs is handed to abs_pow34_v()
 *            (presumably as its output buffer), s->pb receives the bits
 * sce        channel element; band_type and zeroes are rewritten for the
 *            bands of this group
 * win        index of the first window of the group
 * group_len  number of windows in the group
 * lambda     not referenced by this function
 */
2362 static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2363                                        int win, int group_len, const float lambda)
2364 {
2365     BandCodingPath path[120][12];
2366     int w, swb, cb, start, size;
2367     int i, j;
2368     const int max_sfb  = sce->ics.max_sfb;
         /* run lengths use 5 bits with a single window, 3 bits otherwise */
2369     const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
         /* all-ones run value; emitted repeatedly for runs of run_esc or more */
2370     const int run_esc  = (1 << run_bits) - 1;
2371     int idx, ppos, count;
2372     int stackrun[120], stackcb[120], stack_len;
         /* cheapest cost / codebook among the states of the band just processed */
2373     float next_minbits = INFINITY;
2374     int next_mincb = 0;
2375
2376     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2377     start = win*128;
         /* initial states: every codebook starts with the cost run_bits + 4 */
2378     for (cb = 0; cb < 12; cb++) {
2379         path[0][cb].cost     = run_bits+4;
2380         path[0][cb].prev_idx = -1;
2381         path[0][cb].run      = 0;
2382     }
2383     for (swb = 0; swb < max_sfb; swb++) {
2384         size = sce->ics.swb_sizes[swb];
             /* zeroed band: only codebook 0 is a candidate */
2385         if (sce->zeroes[win*16 + swb]) {
2386             float cost_stay_here = path[swb][0].cost;
2387             float cost_get_here  = next_minbits + run_bits + 4;
                 /* extending the run costs run_bits more when run_value_bits changes between run and run+1 */
2388             if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2389                 != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2390                 cost_stay_here += run_bits;
2391             if (cost_get_here < cost_stay_here) {
2392                 path[swb+1][0].prev_idx = next_mincb;
2393                 path[swb+1][0].cost     = cost_get_here;
2394                 path[swb+1][0].run      = 1;
2395             } else {
2396                 path[swb+1][0].prev_idx = 0;
2397                 path[swb+1][0].cost     = cost_stay_here;
2398                 path[swb+1][0].run      = path[swb][0].run + 1;
2399             }
2400             next_minbits = path[swb+1][0].cost;
2401             next_mincb = 0;
                 /* every other codebook gets the fixed cost 61450 (presumably a "never pick" value) */
2402             for (cb = 1; cb < 12; cb++) {
2403                 path[swb+1][cb].cost = 61450;
2404                 path[swb+1][cb].prev_idx = -1;
2405                 path[swb+1][cb].run = 0;
2406             }
2407         } else {
2408             float minbits = next_minbits;
2409             int mincb = next_mincb;
                 /* codebooks below the band's current band_type are not considered */
2410             int startcb = sce->band_type[win*16+swb];
2411             next_minbits = INFINITY;
2412             next_mincb = 0;
2413             for (cb = 0; cb < startcb; cb++) {
2414                 path[swb+1][cb].cost = 61450;
2415                 path[swb+1][cb].prev_idx = -1;
2416                 path[swb+1][cb].run = 0;
2417             }
2418             for (cb = startcb; cb < 12; cb++) {
2419                 float cost_stay_here, cost_get_here;
2420                 float bits = 0.0f;
                     /* bits for this band with codebook cb, summed over the group's windows */
2421                 for (w = 0; w < group_len; w++) {
2422                     bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2423                                                     s->scoefs + start + w*128, size,
2424                                                     sce->sf_idx[(win+w)*16+swb], cb,
2425                                                     0, INFINITY, NULL);
2426                 }
2427                 cost_stay_here = path[swb][cb].cost + bits;
2428                 cost_get_here  = minbits            + bits + run_bits + 4;
2429                 if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2430                     != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2431                     cost_stay_here += run_bits;
2432                 if (cost_get_here < cost_stay_here) {
2433                     path[swb+1][cb].prev_idx = mincb;
2434                     path[swb+1][cb].cost     = cost_get_here;
2435                     path[swb+1][cb].run      = 1;
2436                 } else {
2437                     path[swb+1][cb].prev_idx = cb;
2438                     path[swb+1][cb].cost     = cost_stay_here;
2439                     path[swb+1][cb].run      = path[swb][cb].run + 1;
2440                 }
2441                 if (path[swb+1][cb].cost < next_minbits) {
2442                     next_minbits = path[swb+1][cb].cost;
2443                     next_mincb = cb;
2444                 }
2445             }
2446         }
2447         start += sce->ics.swb_sizes[swb];
2448     }
2449
         /* pick the cheapest end state, then walk the runs backwards onto a stack */
2450     stack_len = 0;
2451     idx       = 0;
2452     for (cb = 1; cb < 12; cb++)
2453         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2454             idx = cb;
2455     ppos = max_sfb;
2456     while (ppos > 0) {
2457         av_assert1(idx >= 0);
2458         cb = idx;
2459         stackrun[stack_len] = path[ppos][cb].run;
2460         stackcb [stack_len] = cb;
             /* the predecessor codebook is the prev_idx stored at the first band of this run */
2461         idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2462         ppos -= path[ppos][cb].run;
2463         stack_len++;
2464     }
2465
         /* emit the sections in band order (the stack holds them last-to-first) */
2466     start = 0;
2467     for (i = stack_len - 1; i >= 0; i--) {
2468         put_bits(&s->pb, 4, stackcb[i]);
2469         count = stackrun[i];
             /* bands assigned codebook 0 are flagged as zeroed, all others are cleared */
2470         memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2471         for (j = 0; j < count; j++) {
2472             sce->band_type[win*16 + start] =  stackcb[i];
2473             start++;
2474         }
             /* run length: one run_esc value per full run_esc chunk, then the remainder */
2475         while (count >= run_esc) {
2476             put_bits(&s->pb, run_bits, run_esc);
2477             count -= run_esc;
2478         }
2479         put_bits(&s->pb, run_bits, count);
2480     }
2481 }
2482 #endif /* HAVE_INLINE_ASM */
2483
2484 void ff_aac_coder_init_mips(AACEncContext *c) {
2485 #if HAVE_INLINE_ASM
2486     AACCoefficientsEncoder *e = c->coder;
2487     int option = c->options.aac_coder;
2488
2489     if (option == 2) {
2490         e->quantize_and_encode_band = quantize_and_encode_band_mips;
2491         e->encode_window_bands_info = codebook_trellis_rate_mips;
2492 #if HAVE_MIPSFPU
2493         e->search_for_quantizers    = search_for_quantizers_twoloop_mips;
2494         e->search_for_ms            = search_for_ms_mips;
2495 #endif /* HAVE_MIPSFPU */
2496     }
2497 #endif /* HAVE_INLINE_ASM */
2498 }