]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/aaccoder_mips.c
Merge commit '2d40968dd3ff17b12f7c80dbfad409b14418e267'
[ffmpeg] / libavcodec / mips / aaccoder_mips.c
1 /*
2  * Copyright (c) 2012
3  *      MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  *    contributors may be used to endorse or promote products derived from
15  *    this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author:  Stanislav Ocovaj (socovaj@mips.com)
30  *          Szabolcs Pal     (sabolc@mips.com)
31  *
32  * AAC coefficients encoder optimized for MIPS floating-point architecture
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50
51 /**
52  * @file
53  * Reference: libavcodec/aaccoder.c
54  */
55
56 #include "libavutil/libm.h"
57
58 #include <float.h>
59 #include "libavutil/mathematics.h"
60 #include "libavcodec/avcodec.h"
61 #include "libavcodec/put_bits.h"
62 #include "libavcodec/aac.h"
63 #include "libavcodec/aacenc.h"
64 #include "libavcodec/aactab.h"
65
66 #if HAVE_INLINE_ASM
/**
 * One node of a band coding path.
 *
 * NOTE(review): no use of this structure is visible in this part of the
 * file; the field descriptions below are inferred from the names only and
 * should be confirmed against the code that fills them in.
 */
typedef struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of this path */
    int   run;      /* presumably the run length associated with this entry */
} BandCodingPath;
72
/*
 * 64-entry lookup table: entries 0..30 hold 5, entries 31..62 hold 10 and
 * entry 63 holds 15.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5, /*  0.. 7 */
     5,  5,  5,  5,  5,  5,  5,  5, /*  8..15 */
     5,  5,  5,  5,  5,  5,  5,  5, /* 16..23 */
     5,  5,  5,  5,  5,  5,  5, 10, /* 24..31 */
    10, 10, 10, 10, 10, 10, 10, 10, /* 32..39 */
    10, 10, 10, 10, 10, 10, 10, 10, /* 40..47 */
    10, 10, 10, 10, 10, 10, 10, 10, /* 48..55 */
    10, 10, 10, 10, 10, 10, 10, 15  /* 56..63 */
};
79
/*
 * 16-entry lookup table: entries 0..6 hold 3, entries 7..14 hold 6 and
 * entry 15 holds 9.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, /* 0.. 7 */
    6, 6, 6, 6, 6, 6, 6, 9  /* 8..15 */
};
83
84 static const uint8_t * const run_value_bits[2] = {
85     run_value_bits_long, run_value_bits_short
86 };
87
/*
 * 81-entry table indexed by a four-digit base-3 number
 * (index = 27*a + 9*b + 3*c + d, each digit in 0..2).
 * Every entry equals the number of non-zero digits in its index.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2, /* a = 0, b = 0 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 0, b = 1 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 0, b = 2 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 1, b = 0 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 1, b = 1 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 1, b = 2 */
    1, 2, 2, 2, 3, 3, 2, 3, 3, /* a = 2, b = 0 */
    2, 3, 3, 3, 4, 4, 3, 4, 4, /* a = 2, b = 1 */
    2, 3, 3, 3, 4, 4, 3, 4, 4  /* a = 2, b = 2 */
};
99
/*
 * 8x8 table indexed by 8*first + second (both in 0..7).
 * Every entry equals the number of non-zero values in the pair.
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1, /* first == 0 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 1 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 2 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 3 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 4 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 5 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 6 */
    1, 2, 2, 2, 2, 2, 2, 2, /* first == 7 */
};
110
/*
 * 13x13 table indexed by 13*first + second (both in 0..12).
 * Every entry equals the number of non-zero values in the pair.
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* first ==  0 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  1 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  2 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  3 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  4 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  5 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  6 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  7 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  8 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  9 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 10 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 11 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  /* first == 12 */
};
126
/*
 * 17x17 table indexed by 17*first + second (both in 0..16).
 * Every entry equals the number of non-zero values in the pair.
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* first ==  0 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  1 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  2 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  3 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  4 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  5 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  6 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  7 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  8 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first ==  9 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 10 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 11 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 12 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 13 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 14 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* first == 15 */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  /* first == 16 */
};
146
/*
 * Offsets added to a scaled coefficient before it is truncated to an integer
 * (see the "scaled[i] * Q34 + ROUND_STANDARD" expressions in this file).
 */
#define ROUND_STANDARD 0.4054f /* offset used by the quantizers visible in this file */
#define ROUND_TO_ZERO 0.1054f  /* smaller offset; NOTE(review): no user visible in this part of the file */
149
150 static void abs_pow34_v(float *out, const float *in, const int size) {
151 #ifndef USE_REALLY_FULL_SEARCH
152     int i;
153     float a, b, c, d;
154     float ax, bx, cx, dx;
155
156     for (i = 0; i < size; i += 4) {
157         a = fabsf(in[i  ]);
158         b = fabsf(in[i+1]);
159         c = fabsf(in[i+2]);
160         d = fabsf(in[i+3]);
161
162         ax = sqrtf(a);
163         bx = sqrtf(b);
164         cx = sqrtf(c);
165         dx = sqrtf(d);
166
167         a = a * ax;
168         b = b * bx;
169         c = c * cx;
170         d = d * dx;
171
172         out[i  ] = sqrtf(a);
173         out[i+1] = sqrtf(b);
174         out[i+2] = sqrtf(c);
175         out[i+3] = sqrtf(d);
176     }
177 #endif /* USE_REALLY_FULL_SEARCH */
178 }
179
/**
 * Return the largest value found in the first swb_size entries of each of
 * group_len rows of `scaled`, where consecutive rows start 128 floats apart.
 *
 * The running maximum starts at 0.0f, so the result is never below zero
 * for ordinary (non-NaN) input, and 0.0f is returned when either count is
 * not positive.
 *
 * @param group_len  number of rows to scan
 * @param swb_size   number of entries scanned at the start of each row
 * @param scaled     base of the coefficient buffer
 * @return the maximum value encountered, or 0.0f
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float peak = 0.0f;
    int row;

    for (row = 0; row < group_len; row++) {
        const int row_start = row * 128;
        int col;

        for (col = 0; col < swb_size; col++)
            peak = FFMAX(peak, scaled[row_start + col]);
    }

    return peak;
}
190
191 static int find_min_book(float maxval, int sf) {
192     float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
193     float Q34 = sqrtf(Q * sqrtf(Q));
194     int qmaxval, cb;
195     qmaxval = maxval * Q34 + 0.4054f;
196     if      (qmaxval ==  0) cb = 0;
197     else if (qmaxval ==  1) cb = 1;
198     else if (qmaxval ==  2) cb = 3;
199     else if (qmaxval <=  4) cb = 5;
200     else if (qmaxval <=  7) cb = 7;
201     else if (qmaxval <= 12) cb = 9;
202     else                    cb = 11;
203     return cb;
204 }
205
206 /**
207  * Functions developed from template function and optimized for quantizing and encoding band
208  */
209 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
210                                                      PutBitContext *pb, const float *in, float *out,
211                                                      const float *scaled, int size, int scale_idx,
212                                                      int cb, const float lambda, const float uplim,
213                                                      int *bits, const float ROUNDING)
214 {
215     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
216     int i;
217     int qc1, qc2, qc3, qc4;
218
219     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
220     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
221
222     abs_pow34_v(s->scoefs, in, size);
223     scaled = s->scoefs;
224     for (i = 0; i < size; i += 4) {
225         int curidx;
226         int *in_int = (int *)&in[i];
227         int t0, t1, t2, t3, t4, t5, t6, t7;
228
229         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
230         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
231         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
232         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
233
234         __asm__ volatile (
235             ".set push                      \n\t"
236             ".set noreorder                 \n\t"
237
238             "slt    %[qc1], $zero,  %[qc1]  \n\t"
239             "slt    %[qc2], $zero,  %[qc2]  \n\t"
240             "slt    %[qc3], $zero,  %[qc3]  \n\t"
241             "slt    %[qc4], $zero,  %[qc4]  \n\t"
242             "lw     %[t0],  0(%[in_int])    \n\t"
243             "lw     %[t1],  4(%[in_int])    \n\t"
244             "lw     %[t2],  8(%[in_int])    \n\t"
245             "lw     %[t3],  12(%[in_int])   \n\t"
246             "srl    %[t0],  %[t0],  31      \n\t"
247             "srl    %[t1],  %[t1],  31      \n\t"
248             "srl    %[t2],  %[t2],  31      \n\t"
249             "srl    %[t3],  %[t3],  31      \n\t"
250             "subu   %[t4],  $zero,  %[qc1]  \n\t"
251             "subu   %[t5],  $zero,  %[qc2]  \n\t"
252             "subu   %[t6],  $zero,  %[qc3]  \n\t"
253             "subu   %[t7],  $zero,  %[qc4]  \n\t"
254             "movn   %[qc1], %[t4],  %[t0]   \n\t"
255             "movn   %[qc2], %[t5],  %[t1]   \n\t"
256             "movn   %[qc3], %[t6],  %[t2]   \n\t"
257             "movn   %[qc4], %[t7],  %[t3]   \n\t"
258
259             ".set pop                       \n\t"
260
261             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
262               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
263               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
264               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
265             : [in_int]"r"(in_int)
266             : "memory"
267         );
268
269         curidx = qc1;
270         curidx *= 3;
271         curidx += qc2;
272         curidx *= 3;
273         curidx += qc3;
274         curidx *= 3;
275         curidx += qc4;
276         curidx += 40;
277
278         put_bits(pb, p_bits[curidx], p_codes[curidx]);
279     }
280 }
281
282 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
283                                                      PutBitContext *pb, const float *in, float *out,
284                                                      const float *scaled, int size, int scale_idx,
285                                                      int cb, const float lambda, const float uplim,
286                                                      int *bits, const float ROUNDING)
287 {
288     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
289     int i;
290     int qc1, qc2, qc3, qc4;
291
292     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
293     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
294
295     abs_pow34_v(s->scoefs, in, size);
296     scaled = s->scoefs;
297     for (i = 0; i < size; i += 4) {
298         int curidx, sign, count;
299         int *in_int = (int *)&in[i];
300         uint8_t v_bits;
301         unsigned int v_codes;
302         int t0, t1, t2, t3, t4;
303
304         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
305         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
306         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
307         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
308
309         __asm__ volatile (
310             ".set push                              \n\t"
311             ".set noreorder                         \n\t"
312
313             "ori    %[t4],      $zero,      2       \n\t"
314             "ori    %[sign],    $zero,      0       \n\t"
315             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
316             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
317             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
318             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
319             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
320             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
321             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
322             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
323             "lw     %[t0],      0(%[in_int])        \n\t"
324             "lw     %[t1],      4(%[in_int])        \n\t"
325             "lw     %[t2],      8(%[in_int])        \n\t"
326             "lw     %[t3],      12(%[in_int])       \n\t"
327             "slt    %[t0],      %[t0],      $zero   \n\t"
328             "movn   %[sign],    %[t0],      %[qc1]  \n\t"
329             "slt    %[t1],      %[t1],      $zero   \n\t"
330             "slt    %[t2],      %[t2],      $zero   \n\t"
331             "slt    %[t3],      %[t3],      $zero   \n\t"
332             "sll    %[t0],      %[sign],    1       \n\t"
333             "or     %[t0],      %[t0],      %[t1]   \n\t"
334             "movn   %[sign],    %[t0],      %[qc2]  \n\t"
335             "slt    %[t4],      $zero,      %[qc1]  \n\t"
336             "slt    %[t1],      $zero,      %[qc2]  \n\t"
337             "slt    %[count],   $zero,      %[qc3]  \n\t"
338             "sll    %[t0],      %[sign],    1       \n\t"
339             "or     %[t0],      %[t0],      %[t2]   \n\t"
340             "movn   %[sign],    %[t0],      %[qc3]  \n\t"
341             "slt    %[t2],      $zero,      %[qc4]  \n\t"
342             "addu   %[count],   %[count],   %[t4]   \n\t"
343             "addu   %[count],   %[count],   %[t1]   \n\t"
344             "sll    %[t0],      %[sign],    1       \n\t"
345             "or     %[t0],      %[t0],      %[t3]   \n\t"
346             "movn   %[sign],    %[t0],      %[qc4]  \n\t"
347             "addu   %[count],   %[count],   %[t2]   \n\t"
348
349             ".set pop                               \n\t"
350
351             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
352               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
353               [sign]"=&r"(sign), [count]"=&r"(count),
354               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
355               [t4]"=&r"(t4)
356             : [in_int]"r"(in_int)
357             : "memory"
358         );
359
360         curidx = qc1;
361         curidx *= 3;
362         curidx += qc2;
363         curidx *= 3;
364         curidx += qc3;
365         curidx *= 3;
366         curidx += qc4;
367
368         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
369         v_bits  = p_bits[curidx] + count;
370         put_bits(pb, v_bits, v_codes);
371     }
372 }
373
374 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
375                                                      PutBitContext *pb, const float *in, float *out,
376                                                      const float *scaled, int size, int scale_idx,
377                                                      int cb, const float lambda, const float uplim,
378                                                      int *bits, const float ROUNDING)
379 {
380     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
381     int i;
382     int qc1, qc2, qc3, qc4;
383
384     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
385     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
386
387     abs_pow34_v(s->scoefs, in, size);
388     scaled = s->scoefs;
389     for (i = 0; i < size; i += 4) {
390         int curidx, curidx2;
391         int *in_int = (int *)&in[i];
392         uint8_t v_bits;
393         unsigned int v_codes;
394         int t0, t1, t2, t3, t4, t5, t6, t7;
395
396         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
397         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
398         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
399         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
400
401         __asm__ volatile (
402             ".set push                      \n\t"
403             ".set noreorder                 \n\t"
404
405             "ori    %[t4],  $zero,  4       \n\t"
406             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
407             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
408             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
409             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
410             "movn   %[qc1], %[t4],  %[t0]   \n\t"
411             "movn   %[qc2], %[t4],  %[t1]   \n\t"
412             "movn   %[qc3], %[t4],  %[t2]   \n\t"
413             "movn   %[qc4], %[t4],  %[t3]   \n\t"
414             "lw     %[t0],  0(%[in_int])    \n\t"
415             "lw     %[t1],  4(%[in_int])    \n\t"
416             "lw     %[t2],  8(%[in_int])    \n\t"
417             "lw     %[t3],  12(%[in_int])   \n\t"
418             "srl    %[t0],  %[t0],  31      \n\t"
419             "srl    %[t1],  %[t1],  31      \n\t"
420             "srl    %[t2],  %[t2],  31      \n\t"
421             "srl    %[t3],  %[t3],  31      \n\t"
422             "subu   %[t4],  $zero,  %[qc1]  \n\t"
423             "subu   %[t5],  $zero,  %[qc2]  \n\t"
424             "subu   %[t6],  $zero,  %[qc3]  \n\t"
425             "subu   %[t7],  $zero,  %[qc4]  \n\t"
426             "movn   %[qc1], %[t4],  %[t0]   \n\t"
427             "movn   %[qc2], %[t5],  %[t1]   \n\t"
428             "movn   %[qc3], %[t6],  %[t2]   \n\t"
429             "movn   %[qc4], %[t7],  %[t3]   \n\t"
430
431             ".set pop                       \n\t"
432
433             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
434               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
435               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
436               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
437             : [in_int]"r"(in_int)
438             : "memory"
439         );
440
441         curidx = 9 * qc1;
442         curidx += qc2 + 40;
443
444         curidx2 = 9 * qc3;
445         curidx2 += qc4 + 40;
446
447         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
448         v_bits  = p_bits[curidx] + p_bits[curidx2];
449         put_bits(pb, v_bits, v_codes);
450     }
451 }
452
453 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
454                                                       PutBitContext *pb, const float *in, float *out,
455                                                       const float *scaled, int size, int scale_idx,
456                                                       int cb, const float lambda, const float uplim,
457                                                       int *bits, const float ROUNDING)
458 {
459     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
460     int i;
461     int qc1, qc2, qc3, qc4;
462
463     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
464     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
465
466     abs_pow34_v(s->scoefs, in, size);
467     scaled = s->scoefs;
468     for (i = 0; i < size; i += 4) {
469         int curidx, sign1, count1, sign2, count2;
470         int *in_int = (int *)&in[i];
471         uint8_t v_bits;
472         unsigned int v_codes;
473         int t0, t1, t2, t3, t4;
474
475         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
476         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
477         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
478         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
479
480         __asm__ volatile (
481             ".set push                              \n\t"
482             ".set noreorder                         \n\t"
483
484             "ori    %[t4],      $zero,      7       \n\t"
485             "ori    %[sign1],   $zero,      0       \n\t"
486             "ori    %[sign2],   $zero,      0       \n\t"
487             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
488             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
489             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
490             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
491             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
492             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
493             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
494             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
495             "lw     %[t0],      0(%[in_int])        \n\t"
496             "lw     %[t1],      4(%[in_int])        \n\t"
497             "lw     %[t2],      8(%[in_int])        \n\t"
498             "lw     %[t3],      12(%[in_int])       \n\t"
499             "slt    %[t0],      %[t0],      $zero   \n\t"
500             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
501             "slt    %[t2],      %[t2],      $zero   \n\t"
502             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
503             "slt    %[t1],      %[t1],      $zero   \n\t"
504             "sll    %[t0],      %[sign1],   1       \n\t"
505             "or     %[t0],      %[t0],      %[t1]   \n\t"
506             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
507             "slt    %[t3],      %[t3],      $zero   \n\t"
508             "sll    %[t0],      %[sign2],   1       \n\t"
509             "or     %[t0],      %[t0],      %[t3]   \n\t"
510             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
511             "slt    %[count1],  $zero,      %[qc1]  \n\t"
512             "slt    %[t1],      $zero,      %[qc2]  \n\t"
513             "slt    %[count2],  $zero,      %[qc3]  \n\t"
514             "slt    %[t2],      $zero,      %[qc4]  \n\t"
515             "addu   %[count1],  %[count1],  %[t1]   \n\t"
516             "addu   %[count2],  %[count2],  %[t2]   \n\t"
517
518             ".set pop                               \n\t"
519
520             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
521               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
522               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
523               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
524               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
525               [t4]"=&r"(t4)
526             : [in_int]"r"(in_int)
527             : "t0", "t1", "t2", "t3", "t4",
528               "memory"
529         );
530
531         curidx  = 8 * qc1;
532         curidx += qc2;
533
534         v_codes = (p_codes[curidx] << count1) | sign1;
535         v_bits  = p_bits[curidx] + count1;
536         put_bits(pb, v_bits, v_codes);
537
538         curidx  = 8 * qc3;
539         curidx += qc4;
540
541         v_codes = (p_codes[curidx] << count2) | sign2;
542         v_bits  = p_bits[curidx] + count2;
543         put_bits(pb, v_bits, v_codes);
544     }
545 }
546
547 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
548                                                        PutBitContext *pb, const float *in, float *out,
549                                                        const float *scaled, int size, int scale_idx,
550                                                        int cb, const float lambda, const float uplim,
551                                                        int *bits, const float ROUNDING)
552 {
553     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
554     int i;
555     int qc1, qc2, qc3, qc4;
556
557     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
558     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
559
560     abs_pow34_v(s->scoefs, in, size);
561     scaled = s->scoefs;
562     for (i = 0; i < size; i += 4) {
563         int curidx, sign1, count1, sign2, count2;
564         int *in_int = (int *)&in[i];
565         uint8_t v_bits;
566         unsigned int v_codes;
567         int t0, t1, t2, t3, t4;
568
569         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
570         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
571         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
572         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
573
574         __asm__ volatile (
575             ".set push                              \n\t"
576             ".set noreorder                         \n\t"
577
578             "ori    %[t4],      $zero,      12      \n\t"
579             "ori    %[sign1],   $zero,      0       \n\t"
580             "ori    %[sign2],   $zero,      0       \n\t"
581             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
582             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
583             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
584             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
585             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
586             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
587             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
588             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
589             "lw     %[t0],      0(%[in_int])        \n\t"
590             "lw     %[t1],      4(%[in_int])        \n\t"
591             "lw     %[t2],      8(%[in_int])        \n\t"
592             "lw     %[t3],      12(%[in_int])       \n\t"
593             "slt    %[t0],      %[t0],      $zero   \n\t"
594             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
595             "slt    %[t2],      %[t2],      $zero   \n\t"
596             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
597             "slt    %[t1],      %[t1],      $zero   \n\t"
598             "sll    %[t0],      %[sign1],   1       \n\t"
599             "or     %[t0],      %[t0],      %[t1]   \n\t"
600             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
601             "slt    %[t3],      %[t3],      $zero   \n\t"
602             "sll    %[t0],      %[sign2],   1       \n\t"
603             "or     %[t0],      %[t0],      %[t3]   \n\t"
604             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
605             "slt    %[count1],  $zero,      %[qc1]  \n\t"
606             "slt    %[t1],      $zero,      %[qc2]  \n\t"
607             "slt    %[count2],  $zero,      %[qc3]  \n\t"
608             "slt    %[t2],      $zero,      %[qc4]  \n\t"
609             "addu   %[count1],  %[count1],  %[t1]   \n\t"
610             "addu   %[count2],  %[count2],  %[t2]   \n\t"
611
612             ".set pop                               \n\t"
613
614             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
615               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
616               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
617               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
618               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
619               [t4]"=&r"(t4)
620             : [in_int]"r"(in_int)
621             : "memory"
622         );
623
624         curidx  = 13 * qc1;
625         curidx += qc2;
626
627         v_codes = (p_codes[curidx] << count1) | sign1;
628         v_bits  = p_bits[curidx] + count1;
629         put_bits(pb, v_bits, v_codes);
630
631         curidx  = 13 * qc3;
632         curidx += qc4;
633
634         v_codes = (p_codes[curidx] << count2) | sign2;
635         v_bits  = p_bits[curidx] + count2;
636         put_bits(pb, v_bits, v_codes);
637     }
638 }
639
640 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
641                                                    PutBitContext *pb, const float *in, float *out,
642                                                    const float *scaled, int size, int scale_idx,
643                                                    int cb, const float lambda, const float uplim,
644                                                    int *bits, const float ROUNDING)
645 {
646     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
647     int i;
648     int qc1, qc2, qc3, qc4;
649
650     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
651     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
652     float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
653
654     abs_pow34_v(s->scoefs, in, size);
655     scaled = s->scoefs;
656
657     if (cb < 11) {
658         for (i = 0; i < size; i += 4) {
659             int curidx, curidx2, sign1, count1, sign2, count2;
660             int *in_int = (int *)&in[i];
661             uint8_t v_bits;
662             unsigned int v_codes;
663             int t0, t1, t2, t3, t4;
664
665             qc1 = scaled[i  ] * Q34 + ROUNDING;
666             qc2 = scaled[i+1] * Q34 + ROUNDING;
667             qc3 = scaled[i+2] * Q34 + ROUNDING;
668             qc4 = scaled[i+3] * Q34 + ROUNDING;
669
670             __asm__ volatile (
671                 ".set push                                  \n\t"
672                 ".set noreorder                             \n\t"
673
674                 "ori        %[t4],      $zero,      16      \n\t"
675                 "ori        %[sign1],   $zero,      0       \n\t"
676                 "ori        %[sign2],   $zero,      0       \n\t"
677                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
678                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
679                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
680                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
681                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
682                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
683                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
684                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
685                 "lw         %[t0],      0(%[in_int])        \n\t"
686                 "lw         %[t1],      4(%[in_int])        \n\t"
687                 "lw         %[t2],      8(%[in_int])        \n\t"
688                 "lw         %[t3],      12(%[in_int])       \n\t"
689                 "slt        %[t0],      %[t0],      $zero   \n\t"
690                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
691                 "slt        %[t2],      %[t2],      $zero   \n\t"
692                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
693                 "slt        %[t1],      %[t1],      $zero   \n\t"
694                 "sll        %[t0],      %[sign1],   1       \n\t"
695                 "or         %[t0],      %[t0],      %[t1]   \n\t"
696                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
697                 "slt        %[t3],      %[t3],      $zero   \n\t"
698                 "sll        %[t0],      %[sign2],   1       \n\t"
699                 "or         %[t0],      %[t0],      %[t3]   \n\t"
700                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
701                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
702                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
703                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
704                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
705                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
706                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
707
708                 ".set pop                                   \n\t"
709
710                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
711                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
712                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
713                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
714                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
715                   [t4]"=&r"(t4)
716                 : [in_int]"r"(in_int)
717                 : "memory"
718             );
719
720             curidx = 17 * qc1;
721             curidx += qc2;
722             curidx2 = 17 * qc3;
723             curidx2 += qc4;
724
725             v_codes = (p_codes[curidx] << count1) | sign1;
726             v_bits  = p_bits[curidx] + count1;
727             put_bits(pb, v_bits, v_codes);
728
729             v_codes = (p_codes[curidx2] << count2) | sign2;
730             v_bits  = p_bits[curidx2] + count2;
731             put_bits(pb, v_bits, v_codes);
732         }
733     } else {
734         for (i = 0; i < size; i += 4) {
735             int curidx, curidx2, sign1, count1, sign2, count2;
736             int *in_int = (int *)&in[i];
737             uint8_t v_bits;
738             unsigned int v_codes;
739             int c1, c2, c3, c4;
740             int t0, t1, t2, t3, t4;
741
742             qc1 = scaled[i  ] * Q34 + ROUNDING;
743             qc2 = scaled[i+1] * Q34 + ROUNDING;
744             qc3 = scaled[i+2] * Q34 + ROUNDING;
745             qc4 = scaled[i+3] * Q34 + ROUNDING;
746
747             __asm__ volatile (
748                 ".set push                                  \n\t"
749                 ".set noreorder                             \n\t"
750
751                 "ori        %[t4],      $zero,      16      \n\t"
752                 "ori        %[sign1],   $zero,      0       \n\t"
753                 "ori        %[sign2],   $zero,      0       \n\t"
754                 "shll_s.w   %[c1],      %[qc1],     18      \n\t"
755                 "shll_s.w   %[c2],      %[qc2],     18      \n\t"
756                 "shll_s.w   %[c3],      %[qc3],     18      \n\t"
757                 "shll_s.w   %[c4],      %[qc4],     18      \n\t"
758                 "srl        %[c1],      %[c1],      18      \n\t"
759                 "srl        %[c2],      %[c2],      18      \n\t"
760                 "srl        %[c3],      %[c3],      18      \n\t"
761                 "srl        %[c4],      %[c4],      18      \n\t"
762                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
763                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
764                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
765                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
766                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
767                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
768                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
769                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
770                 "lw         %[t0],      0(%[in_int])        \n\t"
771                 "lw         %[t1],      4(%[in_int])        \n\t"
772                 "lw         %[t2],      8(%[in_int])        \n\t"
773                 "lw         %[t3],      12(%[in_int])       \n\t"
774                 "slt        %[t0],      %[t0],      $zero   \n\t"
775                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
776                 "slt        %[t2],      %[t2],      $zero   \n\t"
777                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
778                 "slt        %[t1],      %[t1],      $zero   \n\t"
779                 "sll        %[t0],      %[sign1],   1       \n\t"
780                 "or         %[t0],      %[t0],      %[t1]   \n\t"
781                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
782                 "slt        %[t3],      %[t3],      $zero   \n\t"
783                 "sll        %[t0],      %[sign2],   1       \n\t"
784                 "or         %[t0],      %[t0],      %[t3]   \n\t"
785                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
786                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
787                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
788                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
789                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
790                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
791                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
792
793                 ".set pop                                   \n\t"
794
795                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
796                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
797                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
798                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
799                   [c1]"=&r"(c1), [c2]"=&r"(c2),
800                   [c3]"=&r"(c3), [c4]"=&r"(c4),
801                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
802                   [t4]"=&r"(t4)
803                 : [in_int]"r"(in_int)
804                 : "memory"
805             );
806
807             curidx = 17 * qc1;
808             curidx += qc2;
809
810             curidx2 = 17 * qc3;
811             curidx2 += qc4;
812
813             v_codes = (p_codes[curidx] << count1) | sign1;
814             v_bits  = p_bits[curidx] + count1;
815             put_bits(pb, v_bits, v_codes);
816
817             if (p_vectors[curidx*2  ] == 64.0f) {
818                 int len = av_log2(c1);
819                 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
820                 put_bits(pb, len * 2 - 3, v_codes);
821             }
822             if (p_vectors[curidx*2+1] == 64.0f) {
823                 int len = av_log2(c2);
824                 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
825                 put_bits(pb, len*2-3, v_codes);
826             }
827
828             v_codes = (p_codes[curidx2] << count2) | sign2;
829             v_bits  = p_bits[curidx2] + count2;
830             put_bits(pb, v_bits, v_codes);
831
832             if (p_vectors[curidx2*2  ] == 64.0f) {
833                 int len = av_log2(c3);
834                 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
835                 put_bits(pb, len* 2 - 3, v_codes);
836             }
837             if (p_vectors[curidx2*2+1] == 64.0f) {
838                 int len = av_log2(c4);
839                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
840                 put_bits(pb, len * 2 - 3, v_codes);
841             }
842         }
843     }
844 }
845
846 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
847                                                          PutBitContext *pb, const float *in, float *out,
848                                                          const float *scaled, int size, int scale_idx,
849                                                          int cb, const float lambda, const float uplim,
850                                                          int *bits, const float ROUNDING) {
851     av_assert0(0);
852 }
853
854 static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
855                                                          PutBitContext *pb, const float *in, float *out,
856                                                          const float *scaled, int size, int scale_idx,
857                                                          int cb, const float lambda, const float uplim,
858                                                          int *bits, const float ROUNDING) {
859     int i;
860     if (bits)
861         *bits = 0;
862     if (out) {
863         for (i = 0; i < size; i += 4) {
864            out[i  ] = 0.0f;
865            out[i+1] = 0.0f;
866            out[i+2] = 0.0f;
867            out[i+3] = 0.0f;
868         }
869     }
870 }
871
872 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
873                                                          PutBitContext *pb, const float *in, float *out,
874                                                          const float *scaled, int size, int scale_idx,
875                                                          int cb, const float lambda, const float uplim,
876                                                          int *bits, const float ROUNDING) = {
877     quantize_and_encode_band_cost_ZERO_mips,
878     quantize_and_encode_band_cost_SQUAD_mips,
879     quantize_and_encode_band_cost_SQUAD_mips,
880     quantize_and_encode_band_cost_UQUAD_mips,
881     quantize_and_encode_band_cost_UQUAD_mips,
882     quantize_and_encode_band_cost_SPAIR_mips,
883     quantize_and_encode_band_cost_SPAIR_mips,
884     quantize_and_encode_band_cost_UPAIR7_mips,
885     quantize_and_encode_band_cost_UPAIR7_mips,
886     quantize_and_encode_band_cost_UPAIR12_mips,
887     quantize_and_encode_band_cost_UPAIR12_mips,
888     quantize_and_encode_band_cost_ESC_mips,
889     quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
890     quantize_and_encode_band_cost_ZERO_mips,
891     quantize_and_encode_band_cost_ZERO_mips,
892     quantize_and_encode_band_cost_ZERO_mips,
893 };
894
/*
 * Forward a quantize-and-encode request to the routine registered for
 * codebook cb in quantize_and_encode_band_cost_arr. Note that cb is
 * evaluated twice (table index and argument).
 */
#define quantize_and_encode_band_cost(s, pb, in, out, scaled, size,          \
                                      scale_idx, cb, lambda, uplim,          \
                                      bits, ROUNDING)                        \
    quantize_and_encode_band_cost_arr[(cb)]((s), (pb), (in), (out),          \
                                            (scaled), (size), (scale_idx),   \
                                            (cb), (lambda), (uplim),         \
                                            (bits), (ROUNDING))
901
902 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
903                                           const float *in, float *out, int size, int scale_idx,
904                                           int cb, const float lambda, int rtz)
905 {
906     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
907                                   INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
908 }
909
910 /**
911  * Functions developed from template function and optimized for getting the number of bits
912  */
913 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
914                                         PutBitContext *pb, const float *in,
915                                         const float *scaled, int size, int scale_idx,
916                                         int cb, const float lambda, const float uplim,
917                                         int *bits)
918 {
919     return 0;
920 }
921
922 static float get_band_numbits_NONE_mips(struct AACEncContext *s,
923                                         PutBitContext *pb, const float *in,
924                                         const float *scaled, int size, int scale_idx,
925                                         int cb, const float lambda, const float uplim,
926                                         int *bits)
927 {
928     av_assert0(0);
929     return 0;
930 }
931
932 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
933                                          PutBitContext *pb, const float *in,
934                                          const float *scaled, int size, int scale_idx,
935                                          int cb, const float lambda, const float uplim,
936                                          int *bits)
937 {
938     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
939     int i;
940     int qc1, qc2, qc3, qc4;
941     int curbits = 0;
942
943     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
944
945     for (i = 0; i < size; i += 4) {
946         int curidx;
947         int *in_int = (int *)&in[i];
948         int t0, t1, t2, t3, t4, t5, t6, t7;
949
950         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
951         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
952         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
953         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
954
955         __asm__ volatile (
956             ".set push                      \n\t"
957             ".set noreorder                 \n\t"
958
959             "slt    %[qc1], $zero,  %[qc1]  \n\t"
960             "slt    %[qc2], $zero,  %[qc2]  \n\t"
961             "slt    %[qc3], $zero,  %[qc3]  \n\t"
962             "slt    %[qc4], $zero,  %[qc4]  \n\t"
963             "lw     %[t0],  0(%[in_int])    \n\t"
964             "lw     %[t1],  4(%[in_int])    \n\t"
965             "lw     %[t2],  8(%[in_int])    \n\t"
966             "lw     %[t3],  12(%[in_int])   \n\t"
967             "srl    %[t0],  %[t0],  31      \n\t"
968             "srl    %[t1],  %[t1],  31      \n\t"
969             "srl    %[t2],  %[t2],  31      \n\t"
970             "srl    %[t3],  %[t3],  31      \n\t"
971             "subu   %[t4],  $zero,  %[qc1]  \n\t"
972             "subu   %[t5],  $zero,  %[qc2]  \n\t"
973             "subu   %[t6],  $zero,  %[qc3]  \n\t"
974             "subu   %[t7],  $zero,  %[qc4]  \n\t"
975             "movn   %[qc1], %[t4],  %[t0]   \n\t"
976             "movn   %[qc2], %[t5],  %[t1]   \n\t"
977             "movn   %[qc3], %[t6],  %[t2]   \n\t"
978             "movn   %[qc4], %[t7],  %[t3]   \n\t"
979
980             ".set pop                       \n\t"
981
982             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
983               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
984               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
985               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
986             : [in_int]"r"(in_int)
987             : "memory"
988         );
989
990         curidx = qc1;
991         curidx *= 3;
992         curidx += qc2;
993         curidx *= 3;
994         curidx += qc3;
995         curidx *= 3;
996         curidx += qc4;
997         curidx += 40;
998
999         curbits += p_bits[curidx];
1000     }
1001     return curbits;
1002 }
1003
1004 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
1005                                          PutBitContext *pb, const float *in,
1006                                          const float *scaled, int size, int scale_idx,
1007                                          int cb, const float lambda, const float uplim,
1008                                          int *bits)
1009 {
1010     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1011     int i;
1012     int curbits = 0;
1013     int qc1, qc2, qc3, qc4;
1014
1015     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1016
1017     for (i = 0; i < size; i += 4) {
1018         int curidx;
1019         int t0, t1, t2, t3, t4;
1020
1021         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1022         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1023         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1024         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1025
1026         __asm__ volatile (
1027             ".set push                      \n\t"
1028             ".set noreorder                 \n\t"
1029
1030             "ori    %[t4],  $zero,  2       \n\t"
1031             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1032             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1033             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1034             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1035             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1036             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1037             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1038             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1039
1040             ".set pop                       \n\t"
1041
1042             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1043               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1044               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1045               [t4]"=&r"(t4)
1046         );
1047
1048         curidx = qc1;
1049         curidx *= 3;
1050         curidx += qc2;
1051         curidx *= 3;
1052         curidx += qc3;
1053         curidx *= 3;
1054         curidx += qc4;
1055
1056         curbits += p_bits[curidx];
1057         curbits += uquad_sign_bits[curidx];
1058     }
1059     return curbits;
1060 }
1061
1062 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1063                                          PutBitContext *pb, const float *in,
1064                                          const float *scaled, int size, int scale_idx,
1065                                          int cb, const float lambda, const float uplim,
1066                                          int *bits)
1067 {
1068     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1069     int i;
1070     int qc1, qc2, qc3, qc4;
1071     int curbits = 0;
1072
1073     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1074
1075     for (i = 0; i < size; i += 4) {
1076         int curidx, curidx2;
1077         int *in_int = (int *)&in[i];
1078         int t0, t1, t2, t3, t4, t5, t6, t7;
1079
1080         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1081         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1082         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1083         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1084
1085         __asm__ volatile (
1086             ".set push                      \n\t"
1087             ".set noreorder                 \n\t"
1088
1089             "ori    %[t4],  $zero,  4       \n\t"
1090             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1091             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1092             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1093             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1094             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1095             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1096             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1097             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1098             "lw     %[t0],  0(%[in_int])    \n\t"
1099             "lw     %[t1],  4(%[in_int])    \n\t"
1100             "lw     %[t2],  8(%[in_int])    \n\t"
1101             "lw     %[t3],  12(%[in_int])   \n\t"
1102             "srl    %[t0],  %[t0],  31      \n\t"
1103             "srl    %[t1],  %[t1],  31      \n\t"
1104             "srl    %[t2],  %[t2],  31      \n\t"
1105             "srl    %[t3],  %[t3],  31      \n\t"
1106             "subu   %[t4],  $zero,  %[qc1]  \n\t"
1107             "subu   %[t5],  $zero,  %[qc2]  \n\t"
1108             "subu   %[t6],  $zero,  %[qc3]  \n\t"
1109             "subu   %[t7],  $zero,  %[qc4]  \n\t"
1110             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1111             "movn   %[qc2], %[t5],  %[t1]   \n\t"
1112             "movn   %[qc3], %[t6],  %[t2]   \n\t"
1113             "movn   %[qc4], %[t7],  %[t3]   \n\t"
1114
1115             ".set pop                       \n\t"
1116
1117             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1118               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1119               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1120               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1121             : [in_int]"r"(in_int)
1122             : "memory"
1123         );
1124
1125         curidx  = 9 * qc1;
1126         curidx += qc2 + 40;
1127
1128         curidx2  = 9 * qc3;
1129         curidx2 += qc4 + 40;
1130
1131         curbits += p_bits[curidx] + p_bits[curidx2];
1132     }
1133     return curbits;
1134 }
1135
1136 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1137                                           PutBitContext *pb, const float *in,
1138                                           const float *scaled, int size, int scale_idx,
1139                                           int cb, const float lambda, const float uplim,
1140                                           int *bits)
1141 {
1142     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1143     int i;
1144     int qc1, qc2, qc3, qc4;
1145     int curbits = 0;
1146
1147     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1148
1149     for (i = 0; i < size; i += 4) {
1150         int curidx, curidx2;
1151         int t0, t1, t2, t3, t4;
1152
1153         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1154         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1155         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1156         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1157
1158         __asm__ volatile (
1159             ".set push                      \n\t"
1160             ".set noreorder                 \n\t"
1161
1162             "ori    %[t4],  $zero,  7       \n\t"
1163             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1164             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1165             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1166             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1167             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1168             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1169             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1170             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1171
1172             ".set pop                       \n\t"
1173
1174             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1175               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1176               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1177               [t4]"=&r"(t4)
1178         );
1179
1180         curidx  = 8 * qc1;
1181         curidx += qc2;
1182
1183         curidx2  = 8 * qc3;
1184         curidx2 += qc4;
1185
1186         curbits += p_bits[curidx] +
1187                    upair7_sign_bits[curidx] +
1188                    p_bits[curidx2] +
1189                    upair7_sign_bits[curidx2];
1190     }
1191     return curbits;
1192 }
1193
1194 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1195                                            PutBitContext *pb, const float *in,
1196                                            const float *scaled, int size, int scale_idx,
1197                                            int cb, const float lambda, const float uplim,
1198                                            int *bits)
1199 {
1200     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1201     int i;
1202     int qc1, qc2, qc3, qc4;
1203     int curbits = 0;
1204
1205     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1206
1207     for (i = 0; i < size; i += 4) {
1208         int curidx, curidx2;
1209         int t0, t1, t2, t3, t4;
1210
1211         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1212         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1213         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1214         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1215
1216         __asm__ volatile (
1217             ".set push                      \n\t"
1218             ".set noreorder                 \n\t"
1219
1220             "ori    %[t4],  $zero,  12      \n\t"
1221             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
1222             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
1223             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
1224             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
1225             "movn   %[qc1], %[t4],  %[t0]   \n\t"
1226             "movn   %[qc2], %[t4],  %[t1]   \n\t"
1227             "movn   %[qc3], %[t4],  %[t2]   \n\t"
1228             "movn   %[qc4], %[t4],  %[t3]   \n\t"
1229
1230             ".set pop                       \n\t"
1231
1232             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1233               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1234               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1235               [t4]"=&r"(t4)
1236         );
1237
1238         curidx  = 13 * qc1;
1239         curidx += qc2;
1240
1241         curidx2  = 13 * qc3;
1242         curidx2 += qc4;
1243
1244         curbits += p_bits[curidx] +
1245                    p_bits[curidx2] +
1246                    upair12_sign_bits[curidx] +
1247                    upair12_sign_bits[curidx2];
1248     }
1249     return curbits;
1250 }
1251
1252 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1253                                        PutBitContext *pb, const float *in,
1254                                        const float *scaled, int size, int scale_idx,
1255                                        int cb, const float lambda, const float uplim,
1256                                        int *bits)
1257 {
1258     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1259     int i;
1260     int qc1, qc2, qc3, qc4;
1261     int curbits = 0;
1262
1263     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1264
1265     for (i = 0; i < size; i += 4) {
1266         int curidx, curidx2;
1267         int cond0, cond1, cond2, cond3;
1268         int c1, c2, c3, c4;
1269         int t4, t5;
1270
1271         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1272         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1273         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1274         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1275
1276         __asm__ volatile (
1277             ".set push                                  \n\t"
1278             ".set noreorder                             \n\t"
1279
1280             "ori        %[t4],      $zero,  15          \n\t"
1281             "ori        %[t5],      $zero,  16          \n\t"
1282             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1283             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1284             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1285             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1286             "srl        %[c1],      %[c1],  18          \n\t"
1287             "srl        %[c2],      %[c2],  18          \n\t"
1288             "srl        %[c3],      %[c3],  18          \n\t"
1289             "srl        %[c4],      %[c4],  18          \n\t"
1290             "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
1291             "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
1292             "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
1293             "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
1294             "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
1295             "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
1296             "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
1297             "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
1298             "ori        %[t5],      $zero,  31          \n\t"
1299             "clz        %[c1],      %[c1]               \n\t"
1300             "clz        %[c2],      %[c2]               \n\t"
1301             "clz        %[c3],      %[c3]               \n\t"
1302             "clz        %[c4],      %[c4]               \n\t"
1303             "subu       %[c1],      %[t5],  %[c1]       \n\t"
1304             "subu       %[c2],      %[t5],  %[c2]       \n\t"
1305             "subu       %[c3],      %[t5],  %[c3]       \n\t"
1306             "subu       %[c4],      %[t5],  %[c4]       \n\t"
1307             "sll        %[c1],      %[c1],  1           \n\t"
1308             "sll        %[c2],      %[c2],  1           \n\t"
1309             "sll        %[c3],      %[c3],  1           \n\t"
1310             "sll        %[c4],      %[c4],  1           \n\t"
1311             "addiu      %[c1],      %[c1],  -3          \n\t"
1312             "addiu      %[c2],      %[c2],  -3          \n\t"
1313             "addiu      %[c3],      %[c3],  -3          \n\t"
1314             "addiu      %[c4],      %[c4],  -3          \n\t"
1315             "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1316             "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1317             "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1318             "subu       %[cond3],   $zero,  %[cond3]    \n\t"
1319             "and        %[c1],      %[c1],  %[cond0]    \n\t"
1320             "and        %[c2],      %[c2],  %[cond1]    \n\t"
1321             "and        %[c3],      %[c3],  %[cond2]    \n\t"
1322             "and        %[c4],      %[c4],  %[cond3]    \n\t"
1323
1324             ".set pop                                   \n\t"
1325
1326             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1327               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1328               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1329               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1330               [c1]"=&r"(c1), [c2]"=&r"(c2),
1331               [c3]"=&r"(c3), [c4]"=&r"(c4),
1332               [t4]"=&r"(t4), [t5]"=&r"(t5)
1333         );
1334
1335         curidx = 17 * qc1;
1336         curidx += qc2;
1337
1338         curidx2 = 17 * qc3;
1339         curidx2 += qc4;
1340
1341         curbits += p_bits[curidx];
1342         curbits += esc_sign_bits[curidx];
1343         curbits += p_bits[curidx2];
1344         curbits += esc_sign_bits[curidx2];
1345
1346         curbits += c1;
1347         curbits += c2;
1348         curbits += c3;
1349         curbits += c4;
1350     }
1351     return curbits;
1352 }
1353
1354 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1355                                              PutBitContext *pb, const float *in,
1356                                              const float *scaled, int size, int scale_idx,
1357                                              int cb, const float lambda, const float uplim,
1358                                              int *bits) = {
1359     get_band_numbits_ZERO_mips,
1360     get_band_numbits_SQUAD_mips,
1361     get_band_numbits_SQUAD_mips,
1362     get_band_numbits_UQUAD_mips,
1363     get_band_numbits_UQUAD_mips,
1364     get_band_numbits_SPAIR_mips,
1365     get_band_numbits_SPAIR_mips,
1366     get_band_numbits_UPAIR7_mips,
1367     get_band_numbits_UPAIR7_mips,
1368     get_band_numbits_UPAIR12_mips,
1369     get_band_numbits_UPAIR12_mips,
1370     get_band_numbits_ESC_mips,
1371     get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
1372     get_band_numbits_ZERO_mips,
1373     get_band_numbits_ZERO_mips,
1374     get_band_numbits_ZERO_mips,
1375 };
1376
/*
 * Forward a bit-count request to the routine registered for codebook cb in
 * get_band_numbits_arr. Note that cb is evaluated twice (table index and
 * argument).
 */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,        \
                         lambda, uplim, bits)                           \
    get_band_numbits_arr[(cb)]((s), (pb), (in), (scaled), (size),       \
                               (scale_idx), (cb), (lambda), (uplim),    \
                               (bits))
1383
/**
 * Return the number of bits needed to code a band with codebook cb.
 *
 * Thin front end to get_band_numbits(); no bitstream writer is supplied
 * (pb = NULL) and every other argument is forwarded unchanged.
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    const float nbits = get_band_numbits(s, NULL, in, scaled, size, scale_idx,
                                         cb, lambda, uplim, bits);

    return nbits;
}
1391
1392 /**
1393  * Functions developed from template function and optimized for getting the band cost
1394  */
1395 #if HAVE_MIPSFPU
1396 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1397                                      PutBitContext *pb, const float *in,
1398                                      const float *scaled, int size, int scale_idx,
1399                                      int cb, const float lambda, const float uplim,
1400                                      int *bits)
1401 {
1402     int i;
1403     float cost = 0;
1404
1405     for (i = 0; i < size; i += 4) {
1406         cost += in[i  ] * in[i  ];
1407         cost += in[i+1] * in[i+1];
1408         cost += in[i+2] * in[i+2];
1409         cost += in[i+3] * in[i+3];
1410     }
1411     if (bits)
1412         *bits = 0;
1413     return cost * lambda;
1414 }
1415
1416 static float get_band_cost_NONE_mips(struct AACEncContext *s,
1417                                      PutBitContext *pb, const float *in,
1418                                      const float *scaled, int size, int scale_idx,
1419                                      int cb, const float lambda, const float uplim,
1420                                      int *bits)
1421 {
1422     av_assert0(0);
1423     return 0;
1424 }
1425
1426 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1427                                       PutBitContext *pb, const float *in,
1428                                       const float *scaled, int size, int scale_idx,
1429                                       int cb, const float lambda, const float uplim,
1430                                       int *bits)
1431 {
1432     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1433     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1434     int i;
1435     float cost = 0;
1436     int qc1, qc2, qc3, qc4;
1437     int curbits = 0;
1438
1439     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1440     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1441
1442     for (i = 0; i < size; i += 4) {
1443         const float *vec;
1444         int curidx;
1445         int   *in_int = (int   *)&in[i];
1446         float *in_pos = (float *)&in[i];
1447         float di0, di1, di2, di3;
1448         int t0, t1, t2, t3, t4, t5, t6, t7;
1449
1450         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1451         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1452         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1453         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1454
1455         __asm__ volatile (
1456             ".set push                                  \n\t"
1457             ".set noreorder                             \n\t"
1458
1459             "slt        %[qc1], $zero,  %[qc1]          \n\t"
1460             "slt        %[qc2], $zero,  %[qc2]          \n\t"
1461             "slt        %[qc3], $zero,  %[qc3]          \n\t"
1462             "slt        %[qc4], $zero,  %[qc4]          \n\t"
1463             "lw         %[t0],  0(%[in_int])            \n\t"
1464             "lw         %[t1],  4(%[in_int])            \n\t"
1465             "lw         %[t2],  8(%[in_int])            \n\t"
1466             "lw         %[t3],  12(%[in_int])           \n\t"
1467             "srl        %[t0],  %[t0],  31              \n\t"
1468             "srl        %[t1],  %[t1],  31              \n\t"
1469             "srl        %[t2],  %[t2],  31              \n\t"
1470             "srl        %[t3],  %[t3],  31              \n\t"
1471             "subu       %[t4],  $zero,  %[qc1]          \n\t"
1472             "subu       %[t5],  $zero,  %[qc2]          \n\t"
1473             "subu       %[t6],  $zero,  %[qc3]          \n\t"
1474             "subu       %[t7],  $zero,  %[qc4]          \n\t"
1475             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1476             "movn       %[qc2], %[t5],  %[t1]           \n\t"
1477             "movn       %[qc3], %[t6],  %[t2]           \n\t"
1478             "movn       %[qc4], %[t7],  %[t3]           \n\t"
1479
1480             ".set pop                                   \n\t"
1481
1482             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1483               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1484               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1485               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1486             : [in_int]"r"(in_int)
1487             : "memory"
1488         );
1489
1490         curidx = qc1;
1491         curidx *= 3;
1492         curidx += qc2;
1493         curidx *= 3;
1494         curidx += qc3;
1495         curidx *= 3;
1496         curidx += qc4;
1497         curidx += 40;
1498
1499         curbits += p_bits[curidx];
1500         vec     = &p_codes[curidx*4];
1501
1502         __asm__ volatile (
1503             ".set push                                  \n\t"
1504             ".set noreorder                             \n\t"
1505
1506             "lwc1       $f0,    0(%[in_pos])            \n\t"
1507             "lwc1       $f1,    0(%[vec])               \n\t"
1508             "lwc1       $f2,    4(%[in_pos])            \n\t"
1509             "lwc1       $f3,    4(%[vec])               \n\t"
1510             "lwc1       $f4,    8(%[in_pos])            \n\t"
1511             "lwc1       $f5,    8(%[vec])               \n\t"
1512             "lwc1       $f6,    12(%[in_pos])           \n\t"
1513             "lwc1       $f7,    12(%[vec])              \n\t"
1514             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1515             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1516             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1517             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1518
1519             ".set pop                                   \n\t"
1520
1521             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1522               [di2]"=&f"(di2), [di3]"=&f"(di3)
1523             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1524               [IQ]"f"(IQ)
1525             : "$f0", "$f1", "$f2", "$f3",
1526               "$f4", "$f5", "$f6", "$f7",
1527               "memory"
1528         );
1529
1530         cost += di0 * di0 + di1 * di1
1531                 + di2 * di2 + di3 * di3;
1532     }
1533
1534     if (bits)
1535         *bits = curbits;
1536     return cost * lambda + curbits;
1537 }
1538
1539 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1540                                       PutBitContext *pb, const float *in,
1541                                       const float *scaled, int size, int scale_idx,
1542                                       int cb, const float lambda, const float uplim,
1543                                       int *bits)
1544 {
1545     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1546     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1547     int i;
1548     float cost = 0;
1549     int curbits = 0;
1550     int qc1, qc2, qc3, qc4;
1551
1552     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1553     float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1554
1555     for (i = 0; i < size; i += 4) {
1556         const float *vec;
1557         int curidx;
1558         float *in_pos = (float *)&in[i];
1559         float di0, di1, di2, di3;
1560         int t0, t1, t2, t3, t4;
1561
1562         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1563         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1564         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1565         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1566
1567         __asm__ volatile (
1568             ".set push                                  \n\t"
1569             ".set noreorder                             \n\t"
1570
1571             "ori        %[t4],  $zero,  2               \n\t"
1572             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1573             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1574             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1575             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1576             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1577             "movn       %[qc2], %[t4],  %[t1]           \n\t"
1578             "movn       %[qc3], %[t4],  %[t2]           \n\t"
1579             "movn       %[qc4], %[t4],  %[t3]           \n\t"
1580
1581             ".set pop                                   \n\t"
1582
1583             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1584               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1585               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1586               [t4]"=&r"(t4)
1587         );
1588
1589         curidx = qc1;
1590         curidx *= 3;
1591         curidx += qc2;
1592         curidx *= 3;
1593         curidx += qc3;
1594         curidx *= 3;
1595         curidx += qc4;
1596
1597         curbits += p_bits[curidx];
1598         curbits += uquad_sign_bits[curidx];
1599         vec     = &p_codes[curidx*4];
1600
1601         __asm__ volatile (
1602             ".set push                                  \n\t"
1603             ".set noreorder                             \n\t"
1604
1605             "lwc1       %[di0], 0(%[in_pos])            \n\t"
1606             "lwc1       %[di1], 4(%[in_pos])            \n\t"
1607             "lwc1       %[di2], 8(%[in_pos])            \n\t"
1608             "lwc1       %[di3], 12(%[in_pos])           \n\t"
1609             "abs.s      %[di0], %[di0]                  \n\t"
1610             "abs.s      %[di1], %[di1]                  \n\t"
1611             "abs.s      %[di2], %[di2]                  \n\t"
1612             "abs.s      %[di3], %[di3]                  \n\t"
1613             "lwc1       $f0,    0(%[vec])               \n\t"
1614             "lwc1       $f1,    4(%[vec])               \n\t"
1615             "lwc1       $f2,    8(%[vec])               \n\t"
1616             "lwc1       $f3,    12(%[vec])              \n\t"
1617             "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1618             "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1619             "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1620             "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1621
1622             ".set pop                                   \n\t"
1623
1624             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1625               [di2]"=&f"(di2), [di3]"=&f"(di3)
1626             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1627               [IQ]"f"(IQ)
1628             : "$f0", "$f1", "$f2", "$f3",
1629               "memory"
1630         );
1631
1632         cost += di0 * di0 + di1 * di1
1633                 + di2 * di2 + di3 * di3;
1634     }
1635
1636     if (bits)
1637         *bits = curbits;
1638     return cost * lambda + curbits;
1639 }
1640
1641 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1642                                       PutBitContext *pb, const float *in,
1643                                       const float *scaled, int size, int scale_idx,
1644                                       int cb, const float lambda, const float uplim,
1645                                       int *bits)
1646 {
1647     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1648     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1649     int i;
1650     float cost = 0;
1651     int qc1, qc2, qc3, qc4;
1652     int curbits = 0;
1653
1654     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1655     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1656
1657     for (i = 0; i < size; i += 4) {
1658         const float *vec, *vec2;
1659         int curidx, curidx2;
1660         int   *in_int = (int   *)&in[i];
1661         float *in_pos = (float *)&in[i];
1662         float di0, di1, di2, di3;
1663         int t0, t1, t2, t3, t4, t5, t6, t7;
1664
1665         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1666         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1667         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1668         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1669
1670         __asm__ volatile (
1671             ".set push                                  \n\t"
1672             ".set noreorder                             \n\t"
1673
1674             "ori        %[t4],  $zero,  4               \n\t"
1675             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
1676             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
1677             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
1678             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
1679             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1680             "movn       %[qc2], %[t4],  %[t1]           \n\t"
1681             "movn       %[qc3], %[t4],  %[t2]           \n\t"
1682             "movn       %[qc4], %[t4],  %[t3]           \n\t"
1683             "lw         %[t0],  0(%[in_int])            \n\t"
1684             "lw         %[t1],  4(%[in_int])            \n\t"
1685             "lw         %[t2],  8(%[in_int])            \n\t"
1686             "lw         %[t3],  12(%[in_int])           \n\t"
1687             "srl        %[t0],  %[t0],  31              \n\t"
1688             "srl        %[t1],  %[t1],  31              \n\t"
1689             "srl        %[t2],  %[t2],  31              \n\t"
1690             "srl        %[t3],  %[t3],  31              \n\t"
1691             "subu       %[t4],  $zero,  %[qc1]          \n\t"
1692             "subu       %[t5],  $zero,  %[qc2]          \n\t"
1693             "subu       %[t6],  $zero,  %[qc3]          \n\t"
1694             "subu       %[t7],  $zero,  %[qc4]          \n\t"
1695             "movn       %[qc1], %[t4],  %[t0]           \n\t"
1696             "movn       %[qc2], %[t5],  %[t1]           \n\t"
1697             "movn       %[qc3], %[t6],  %[t2]           \n\t"
1698             "movn       %[qc4], %[t7],  %[t3]           \n\t"
1699
1700             ".set pop                                   \n\t"
1701
1702             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1703               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1704               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1705               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1706             : [in_int]"r"(in_int)
1707             : "memory"
1708         );
1709
1710         curidx = 9 * qc1;
1711         curidx += qc2 + 40;
1712
1713         curidx2 = 9 * qc3;
1714         curidx2 += qc4 + 40;
1715
1716         curbits += p_bits[curidx];
1717         curbits += p_bits[curidx2];
1718
1719         vec     = &p_codes[curidx*2];
1720         vec2    = &p_codes[curidx2*2];
1721
1722         __asm__ volatile (
1723             ".set push                                  \n\t"
1724             ".set noreorder                             \n\t"
1725
1726             "lwc1       $f0,    0(%[in_pos])            \n\t"
1727             "lwc1       $f1,    0(%[vec])               \n\t"
1728             "lwc1       $f2,    4(%[in_pos])            \n\t"
1729             "lwc1       $f3,    4(%[vec])               \n\t"
1730             "lwc1       $f4,    8(%[in_pos])            \n\t"
1731             "lwc1       $f5,    0(%[vec2])              \n\t"
1732             "lwc1       $f6,    12(%[in_pos])           \n\t"
1733             "lwc1       $f7,    4(%[vec2])              \n\t"
1734             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1735             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1736             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1737             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1738
1739             ".set pop                                   \n\t"
1740
1741             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1742               [di2]"=&f"(di2), [di3]"=&f"(di3)
1743             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1744               [vec2]"r"(vec2), [IQ]"f"(IQ)
1745             : "$f0", "$f1", "$f2", "$f3",
1746               "$f4", "$f5", "$f6", "$f7",
1747               "memory"
1748         );
1749
1750         cost += di0 * di0 + di1 * di1
1751                 + di2 * di2 + di3 * di3;
1752     }
1753
1754     if (bits)
1755         *bits = curbits;
1756     return cost * lambda + curbits;
1757 }
1758
1759 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1760                                        PutBitContext *pb, const float *in,
1761                                        const float *scaled, int size, int scale_idx,
1762                                        int cb, const float lambda, const float uplim,
1763                                        int *bits)
1764 {
1765     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1766     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1767     int i;
1768     float cost = 0;
1769     int qc1, qc2, qc3, qc4;
1770     int curbits = 0;
1771
1772     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1773     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1774
1775     for (i = 0; i < size; i += 4) {
1776         const float *vec, *vec2;
1777         int curidx, curidx2, sign1, count1, sign2, count2;
1778         int   *in_int = (int   *)&in[i];
1779         float *in_pos = (float *)&in[i];
1780         float di0, di1, di2, di3;
1781         int t0, t1, t2, t3, t4;
1782
1783         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1784         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1785         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1786         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1787
1788         __asm__ volatile (
1789             ".set push                                          \n\t"
1790             ".set noreorder                                     \n\t"
1791
1792             "ori        %[t4],      $zero,      7               \n\t"
1793             "ori        %[sign1],   $zero,      0               \n\t"
1794             "ori        %[sign2],   $zero,      0               \n\t"
1795             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
1796             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
1797             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
1798             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
1799             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
1800             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
1801             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
1802             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
1803             "lw         %[t0],      0(%[in_int])                \n\t"
1804             "lw         %[t1],      4(%[in_int])                \n\t"
1805             "lw         %[t2],      8(%[in_int])                \n\t"
1806             "lw         %[t3],      12(%[in_int])               \n\t"
1807             "slt        %[t0],      %[t0],      $zero           \n\t"
1808             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
1809             "slt        %[t2],      %[t2],      $zero           \n\t"
1810             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
1811             "slt        %[t1],      %[t1],      $zero           \n\t"
1812             "sll        %[t0],      %[sign1],   1               \n\t"
1813             "or         %[t0],      %[t0],      %[t1]           \n\t"
1814             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
1815             "slt        %[t3],      %[t3],      $zero           \n\t"
1816             "sll        %[t0],      %[sign2],   1               \n\t"
1817             "or         %[t0],      %[t0],      %[t3]           \n\t"
1818             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
1819             "slt        %[count1],  $zero,      %[qc1]          \n\t"
1820             "slt        %[t1],      $zero,      %[qc2]          \n\t"
1821             "slt        %[count2],  $zero,      %[qc3]          \n\t"
1822             "slt        %[t2],      $zero,      %[qc4]          \n\t"
1823             "addu       %[count1],  %[count1],  %[t1]           \n\t"
1824             "addu       %[count2],  %[count2],  %[t2]           \n\t"
1825
1826             ".set pop                                           \n\t"
1827
1828             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1829               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1830               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1831               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1832               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1833               [t4]"=&r"(t4)
1834             : [in_int]"r"(in_int)
1835             : "memory"
1836         );
1837
1838         curidx = 8 * qc1;
1839         curidx += qc2;
1840
1841         curidx2 = 8 * qc3;
1842         curidx2 += qc4;
1843
1844         curbits += p_bits[curidx];
1845         curbits += upair7_sign_bits[curidx];
1846         vec     = &p_codes[curidx*2];
1847
1848         curbits += p_bits[curidx2];
1849         curbits += upair7_sign_bits[curidx2];
1850         vec2    = &p_codes[curidx2*2];
1851
1852         __asm__ volatile (
1853             ".set push                                          \n\t"
1854             ".set noreorder                                     \n\t"
1855
1856             "lwc1       %[di0],     0(%[in_pos])                \n\t"
1857             "lwc1       %[di1],     4(%[in_pos])                \n\t"
1858             "lwc1       %[di2],     8(%[in_pos])                \n\t"
1859             "lwc1       %[di3],     12(%[in_pos])               \n\t"
1860             "abs.s      %[di0],     %[di0]                      \n\t"
1861             "abs.s      %[di1],     %[di1]                      \n\t"
1862             "abs.s      %[di2],     %[di2]                      \n\t"
1863             "abs.s      %[di3],     %[di3]                      \n\t"
1864             "lwc1       $f0,        0(%[vec])                   \n\t"
1865             "lwc1       $f1,        4(%[vec])                   \n\t"
1866             "lwc1       $f2,        0(%[vec2])                  \n\t"
1867             "lwc1       $f3,        4(%[vec2])                  \n\t"
1868             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1869             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1870             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1871             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1872
1873             ".set pop                                           \n\t"
1874
1875             : [di0]"=&f"(di0), [di1]"=&f"(di1),
1876               [di2]"=&f"(di2), [di3]"=&f"(di3)
1877             : [in_pos]"r"(in_pos), [vec]"r"(vec),
1878               [vec2]"r"(vec2), [IQ]"f"(IQ)
1879             : "$f0", "$f1", "$f2", "$f3",
1880               "memory"
1881         );
1882
1883         cost += di0 * di0 + di1 * di1
1884                 + di2 * di2 + di3 * di3;
1885     }
1886
1887     if (bits)
1888         *bits = curbits;
1889     return cost * lambda + curbits;
1890 }
1891
1892 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1893                                         PutBitContext *pb, const float *in,
1894                                         const float *scaled, int size, int scale_idx,
1895                                         int cb, const float lambda, const float uplim,
1896                                         int *bits)
1897 {
1898     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1899     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1900     int i;
1901     float cost = 0;
1902     int qc1, qc2, qc3, qc4;
1903     int curbits = 0;
1904
1905     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1906     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1907
1908     for (i = 0; i < size; i += 4) {
1909         const float *vec, *vec2;
1910         int curidx, curidx2;
1911         int sign1, count1, sign2, count2;
1912         int   *in_int = (int   *)&in[i];
1913         float *in_pos = (float *)&in[i];
1914         float di0, di1, di2, di3;
1915         int t0, t1, t2, t3, t4;
1916
1917         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
1918         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
1919         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
1920         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
1921
1922         __asm__ volatile (
1923             ".set push                                          \n\t"
1924             ".set noreorder                                     \n\t"
1925
1926             "ori        %[t4],      $zero,      12              \n\t"
1927             "ori        %[sign1],   $zero,      0               \n\t"
1928             "ori        %[sign2],   $zero,      0               \n\t"
1929             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
1930             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
1931             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
1932             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
1933             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
1934             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
1935             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
1936             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
1937             "lw         %[t0],      0(%[in_int])                \n\t"
1938             "lw         %[t1],      4(%[in_int])                \n\t"
1939             "lw         %[t2],      8(%[in_int])                \n\t"
1940             "lw         %[t3],      12(%[in_int])               \n\t"
1941             "slt        %[t0],      %[t0],      $zero           \n\t"
1942             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
1943             "slt        %[t2],      %[t2],      $zero           \n\t"
1944             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
1945             "slt        %[t1],      %[t1],      $zero           \n\t"
1946             "sll        %[t0],      %[sign1],   1               \n\t"
1947             "or         %[t0],      %[t0],      %[t1]           \n\t"
1948             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
1949             "slt        %[t3],      %[t3],      $zero           \n\t"
1950             "sll        %[t0],      %[sign2],   1               \n\t"
1951             "or         %[t0],      %[t0],      %[t3]           \n\t"
1952             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
1953             "slt        %[count1],  $zero,      %[qc1]          \n\t"
1954             "slt        %[t1],      $zero,      %[qc2]          \n\t"
1955             "slt        %[count2],  $zero,      %[qc3]          \n\t"
1956             "slt        %[t2],      $zero,      %[qc4]          \n\t"
1957             "addu       %[count1],  %[count1],  %[t1]           \n\t"
1958             "addu       %[count2],  %[count2],  %[t2]           \n\t"
1959
1960             ".set pop                                           \n\t"
1961
1962             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1963               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1964               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1965               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1966               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1967               [t4]"=&r"(t4)
1968             : [in_int]"r"(in_int)
1969             : "memory"
1970         );
1971
1972         curidx = 13 * qc1;
1973         curidx += qc2;
1974
1975         curidx2 = 13 * qc3;
1976         curidx2 += qc4;
1977
1978         curbits += p_bits[curidx];
1979         curbits += p_bits[curidx2];
1980         curbits += upair12_sign_bits[curidx];
1981         curbits += upair12_sign_bits[curidx2];
1982         vec     = &p_codes[curidx*2];
1983         vec2    = &p_codes[curidx2*2];
1984
1985         __asm__ volatile (
1986             ".set push                                          \n\t"
1987             ".set noreorder                                     \n\t"
1988
1989             "lwc1       %[di0],     0(%[in_pos])                \n\t"
1990             "lwc1       %[di1],     4(%[in_pos])                \n\t"
1991             "lwc1       %[di2],     8(%[in_pos])                \n\t"
1992             "lwc1       %[di3],     12(%[in_pos])               \n\t"
1993             "abs.s      %[di0],     %[di0]                      \n\t"
1994             "abs.s      %[di1],     %[di1]                      \n\t"
1995             "abs.s      %[di2],     %[di2]                      \n\t"
1996             "abs.s      %[di3],     %[di3]                      \n\t"
1997             "lwc1       $f0,        0(%[vec])                   \n\t"
1998             "lwc1       $f1,        4(%[vec])                   \n\t"
1999             "lwc1       $f2,        0(%[vec2])                  \n\t"
2000             "lwc1       $f3,        4(%[vec2])                  \n\t"
2001             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
2002             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
2003             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
2004             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
2005
2006             ".set pop                                           \n\t"
2007
2008             : [di0]"=&f"(di0), [di1]"=&f"(di1),
2009               [di2]"=&f"(di2), [di3]"=&f"(di3)
2010             : [in_pos]"r"(in_pos), [vec]"r"(vec),
2011               [vec2]"r"(vec2), [IQ]"f"(IQ)
2012             : "$f0", "$f1", "$f2", "$f3",
2013               "memory"
2014         );
2015
2016         cost += di0 * di0 + di1 * di1
2017                 + di2 * di2 + di3 * di3;
2018     }
2019
2020     if (bits)
2021         *bits = curbits;
2022     return cost * lambda + curbits;
2023 }
2024
2025 static float get_band_cost_ESC_mips(struct AACEncContext *s,
2026                                     PutBitContext *pb, const float *in,
2027                                     const float *scaled, int size, int scale_idx,
2028                                     int cb, const float lambda, const float uplim,
2029                                     int *bits)
2030 {
2031     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
2032     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
2033     const float CLIPPED_ESCAPE = 165140.0f * IQ;
2034     int i;
2035     float cost = 0;
2036     int qc1, qc2, qc3, qc4;
2037     int curbits = 0;
2038
2039     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
2040     float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
2041
2042     for (i = 0; i < size; i += 4) {
2043         const float *vec, *vec2;
2044         int curidx, curidx2;
2045         float t1, t2, t3, t4;
2046         float di1, di2, di3, di4;
2047         int cond0, cond1, cond2, cond3;
2048         int c1, c2, c3, c4;
2049         int t6, t7;
2050
2051         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
2052         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
2053         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
2054         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
2055
2056         __asm__ volatile (
2057             ".set push                                  \n\t"
2058             ".set noreorder                             \n\t"
2059
2060             "ori        %[t6],      $zero,  15          \n\t"
2061             "ori        %[t7],      $zero,  16          \n\t"
2062             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
2063             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
2064             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
2065             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
2066             "srl        %[c1],      %[c1],  18          \n\t"
2067             "srl        %[c2],      %[c2],  18          \n\t"
2068             "srl        %[c3],      %[c3],  18          \n\t"
2069             "srl        %[c4],      %[c4],  18          \n\t"
2070             "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
2071             "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
2072             "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
2073             "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
2074             "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
2075             "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
2076             "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
2077             "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
2078
2079             ".set pop                                   \n\t"
2080
2081             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2082               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2083               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2084               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2085               [c1]"=&r"(c1), [c2]"=&r"(c2),
2086               [c3]"=&r"(c3), [c4]"=&r"(c4),
2087               [t6]"=&r"(t6), [t7]"=&r"(t7)
2088         );
2089
2090         curidx = 17 * qc1;
2091         curidx += qc2;
2092
2093         curidx2 = 17 * qc3;
2094         curidx2 += qc4;
2095
2096         curbits += p_bits[curidx];
2097         curbits += esc_sign_bits[curidx];
2098         vec     = &p_codes[curidx*2];
2099
2100         curbits += p_bits[curidx2];
2101         curbits += esc_sign_bits[curidx2];
2102         vec2     = &p_codes[curidx2*2];
2103
2104         curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2105         curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2106         curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2107         curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2108
2109         t1 = fabsf(in[i  ]);
2110         t2 = fabsf(in[i+1]);
2111         t3 = fabsf(in[i+2]);
2112         t4 = fabsf(in[i+3]);
2113
2114         if (cond0) {
2115             if (t1 >= CLIPPED_ESCAPE) {
2116                 di1 = t1 - CLIPPED_ESCAPE;
2117             } else {
2118                 di1 = t1 - c1 * cbrtf(c1) * IQ;
2119             }
2120         } else
2121             di1 = t1 - vec[0] * IQ;
2122
2123         if (cond1) {
2124             if (t2 >= CLIPPED_ESCAPE) {
2125                 di2 = t2 - CLIPPED_ESCAPE;
2126             } else {
2127                 di2 = t2 - c2 * cbrtf(c2) * IQ;
2128             }
2129         } else
2130             di2 = t2 - vec[1] * IQ;
2131
2132         if (cond2) {
2133             if (t3 >= CLIPPED_ESCAPE) {
2134                 di3 = t3 - CLIPPED_ESCAPE;
2135             } else {
2136                 di3 = t3 - c3 * cbrtf(c3) * IQ;
2137             }
2138         } else
2139             di3 = t3 - vec2[0] * IQ;
2140
2141         if (cond3) {
2142             if (t4 >= CLIPPED_ESCAPE) {
2143                 di4 = t4 - CLIPPED_ESCAPE;
2144             } else {
2145                 di4 = t4 - c4 * cbrtf(c4) * IQ;
2146             }
2147         } else
2148             di4 = t4 - vec2[1]*IQ;
2149
2150         cost += di1 * di1 + di2 * di2
2151                 + di3 * di3 + di4 * di4;
2152     }
2153
2154     if (bits)
2155         *bits = curbits;
2156     return cost * lambda + curbits;
2157 }
2158
2159 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2160                                           PutBitContext *pb, const float *in,
2161                                           const float *scaled, int size, int scale_idx,
2162                                           int cb, const float lambda, const float uplim,
2163                                           int *bits) = {
2164     get_band_cost_ZERO_mips,
2165     get_band_cost_SQUAD_mips,
2166     get_band_cost_SQUAD_mips,
2167     get_band_cost_UQUAD_mips,
2168     get_band_cost_UQUAD_mips,
2169     get_band_cost_SPAIR_mips,
2170     get_band_cost_SPAIR_mips,
2171     get_band_cost_UPAIR7_mips,
2172     get_band_cost_UPAIR7_mips,
2173     get_band_cost_UPAIR12_mips,
2174     get_band_cost_UPAIR12_mips,
2175     get_band_cost_ESC_mips,
2176     get_band_cost_NONE_mips, /* cb 12 doesn't exist */
2177     get_band_cost_ZERO_mips,
2178     get_band_cost_ZERO_mips,
2179     get_band_cost_ZERO_mips,
2180 };
2181
/*
 * Call the cost estimator registered for codebook cb in get_band_cost_arr.
 * Note that cb is expanded twice (table index and argument).
 */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,               \
                      lambda, uplim, bits)                                  \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,           \
                          lambda, uplim, bits)
2188
2189 static float quantize_band_cost(struct AACEncContext *s, const float *in,
2190                                 const float *scaled, int size, int scale_idx,
2191                                 int cb, const float lambda, const float uplim,
2192                                 int *bits)
2193 {
2194     return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
2195 }
2196
2197 static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2198                                                AACEncContext *s,
2199                                                SingleChannelElement *sce,
2200                                                const float lambda)
2201 {
2202     int start = 0, i, w, w2, g;
2203     int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2204     float dists[128] = { 0 }, uplims[128];
2205     float maxvals[128];
2206     int fflag, minscaler;
2207     int its  = 0;
2208     int allz = 0;
2209     float minthr = INFINITY;
2210
2211     destbits = FFMIN(destbits, 5800);
2212     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2213         for (g = 0;  g < sce->ics.num_swb; g++) {
2214             int nz = 0;
2215             float uplim = 0.0f;
2216             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2217                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2218                 uplim += band->threshold;
2219                 if (band->energy <= band->threshold || band->threshold == 0.0f) {
2220                     sce->zeroes[(w+w2)*16+g] = 1;
2221                     continue;
2222                 }
2223                 nz = 1;
2224             }
2225             uplims[w*16+g] = uplim *512;
2226             sce->zeroes[w*16+g] = !nz;
2227             if (nz)
2228                 minthr = FFMIN(minthr, uplim);
2229             allz |= nz;
2230         }
2231     }
2232     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2233         for (g = 0;  g < sce->ics.num_swb; g++) {
2234             if (sce->zeroes[w*16+g]) {
2235                 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2236                 continue;
2237             }
2238             sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2239         }
2240     }
2241
2242     if (!allz)
2243         return;
2244     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2245
2246     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2247         start = w*128;
2248         for (g = 0;  g < sce->ics.num_swb; g++) {
2249             const float *scaled = s->scoefs + start;
2250             maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2251             start += sce->ics.swb_sizes[g];
2252         }
2253     }
2254
2255     do {
2256         int tbits, qstep;
2257         minscaler = sce->sf_idx[0];
2258         qstep = its ? 1 : 32;
2259         do {
2260             int prev = -1;
2261             tbits = 0;
2262             fflag = 0;
2263
2264             if (qstep > 1) {
2265                 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2266                     start = w*128;
2267                     for (g = 0;  g < sce->ics.num_swb; g++) {
2268                         const float *coefs = sce->coeffs + start;
2269                         const float *scaled = s->scoefs + start;
2270                         int bits = 0;
2271                         int cb;
2272
2273                         if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2274                             start += sce->ics.swb_sizes[g];
2275                             continue;
2276                         }
2277                         minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2278                         cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2279                         for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2280                             int b;
2281                             bits += quantize_band_cost_bits(s, coefs + w2*128,
2282                                                             scaled + w2*128,
2283                                                             sce->ics.swb_sizes[g],
2284                                                             sce->sf_idx[w*16+g],
2285                                                             cb,
2286                                                             1.0f,
2287                                                             INFINITY,
2288                                                             &b);
2289                         }
2290                         if (prev != -1) {
2291                             bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2292                         }
2293                         tbits += bits;
2294                         start += sce->ics.swb_sizes[g];
2295                         prev = sce->sf_idx[w*16+g];
2296                     }
2297                 }
2298             }
2299             else {
2300                 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2301                     start = w*128;
2302                     for (g = 0;  g < sce->ics.num_swb; g++) {
2303                         const float *coefs = sce->coeffs + start;
2304                         const float *scaled = s->scoefs + start;
2305                         int bits = 0;
2306                         int cb;
2307                         float dist = 0.0f;
2308
2309                         if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2310                             start += sce->ics.swb_sizes[g];
2311                             continue;
2312                         }
2313                         minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2314                         cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2315                         for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2316                             int b;
2317                             dist += quantize_band_cost(s, coefs + w2*128,
2318                                                        scaled + w2*128,
2319                                                        sce->ics.swb_sizes[g],
2320                                                        sce->sf_idx[w*16+g],
2321                                                        cb,
2322                                                        1.0f,
2323                                                        INFINITY,
2324                                                        &b);
2325                             bits += b;
2326                         }
2327                         dists[w*16+g] = dist - bits;
2328                         if (prev != -1) {
2329                             bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2330                         }
2331                         tbits += bits;
2332                         start += sce->ics.swb_sizes[g];
2333                         prev = sce->sf_idx[w*16+g];
2334                     }
2335                 }
2336             }
2337             if (tbits > destbits) {
2338                 for (i = 0; i < 128; i++)
2339                     if (sce->sf_idx[i] < 218 - qstep)
2340                         sce->sf_idx[i] += qstep;
2341             } else {
2342                 for (i = 0; i < 128; i++)
2343                     if (sce->sf_idx[i] > 60 - qstep)
2344                         sce->sf_idx[i] -= qstep;
2345             }
2346             qstep >>= 1;
2347             if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2348                 qstep = 1;
2349         } while (qstep);
2350
2351         fflag = 0;
2352         minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2353         for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2354             for (g = 0; g < sce->ics.num_swb; g++) {
2355                 int prevsc = sce->sf_idx[w*16+g];
2356                 if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2357                     if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2358                         sce->sf_idx[w*16+g]--;
2359                     else
2360                         sce->sf_idx[w*16+g]-=2;
2361                 }
2362                 sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2363                 sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2364                 if (sce->sf_idx[w*16+g] != prevsc)
2365                     fflag = 1;
2366                 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2367             }
2368         }
2369         its++;
2370     } while (fflag && its < 10);
2371 }
2372
2373 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
2374 {
2375     int start = 0, i, w, w2, g;
2376     float M[128], S[128];
2377     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2378     SingleChannelElement *sce0 = &cpe->ch[0];
2379     SingleChannelElement *sce1 = &cpe->ch[1];
2380     if (!cpe->common_window)
2381         return;
2382     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2383         for (g = 0;  g < sce0->ics.num_swb; g++) {
2384             if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2385                 float dist1 = 0.0f, dist2 = 0.0f;
2386                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2387                     FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2388                     FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2389                     float minthr = FFMIN(band0->threshold, band1->threshold);
2390                     float maxthr = FFMAX(band0->threshold, band1->threshold);
2391                     for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2392                         M[i  ] = (sce0->coeffs[start+w2*128+i  ]
2393                                 + sce1->coeffs[start+w2*128+i  ]) * 0.5;
2394                         M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2395                                 + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2396                         M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2397                                 + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2398                         M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2399                                 + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2400
2401                         S[i  ] =  M[i  ]
2402                                 - sce1->coeffs[start+w2*128+i  ];
2403                         S[i+1] =  M[i+1]
2404                                 - sce1->coeffs[start+w2*128+i+1];
2405                         S[i+2] =  M[i+2]
2406                                 - sce1->coeffs[start+w2*128+i+2];
2407                         S[i+3] =  M[i+3]
2408                                 - sce1->coeffs[start+w2*128+i+3];
2409                    }
2410                     abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2411                     abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2412                     abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
2413                     abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
2414                     dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2415                                                 L34,
2416                                                 sce0->ics.swb_sizes[g],
2417                                                 sce0->sf_idx[(w+w2)*16+g],
2418                                                 sce0->band_type[(w+w2)*16+g],
2419                                                 s->lambda / band0->threshold, INFINITY, NULL);
2420                     dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2421                                                 R34,
2422                                                 sce1->ics.swb_sizes[g],
2423                                                 sce1->sf_idx[(w+w2)*16+g],
2424                                                 sce1->band_type[(w+w2)*16+g],
2425                                                 s->lambda / band1->threshold, INFINITY, NULL);
2426                     dist2 += quantize_band_cost(s, M,
2427                                                 M34,
2428                                                 sce0->ics.swb_sizes[g],
2429                                                 sce0->sf_idx[(w+w2)*16+g],
2430                                                 sce0->band_type[(w+w2)*16+g],
2431                                                 s->lambda / maxthr, INFINITY, NULL);
2432                     dist2 += quantize_band_cost(s, S,
2433                                                 S34,
2434                                                 sce1->ics.swb_sizes[g],
2435                                                 sce1->sf_idx[(w+w2)*16+g],
2436                                                 sce1->band_type[(w+w2)*16+g],
2437                                                 s->lambda / minthr, INFINITY, NULL);
2438                 }
2439                 cpe->ms_mask[w*16+g] = dist2 < dist1;
2440             }
2441             start += sce0->ics.swb_sizes[g];
2442         }
2443     }
2444 }
2445 #endif /*HAVE_MIPSFPU */
2446
2447 static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2448                                        int win, int group_len, const float lambda)
2449 {
2450     BandCodingPath path[120][12];
2451     int w, swb, cb, start, size;
2452     int i, j;
2453     const int max_sfb  = sce->ics.max_sfb;
2454     const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2455     const int run_esc  = (1 << run_bits) - 1;
2456     int idx, ppos, count;
2457     int stackrun[120], stackcb[120], stack_len;
2458     float next_minbits = INFINITY;
2459     int next_mincb = 0;
2460
2461     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2462     start = win*128;
2463     for (cb = 0; cb < 12; cb++) {
2464         path[0][cb].cost     = run_bits+4;
2465         path[0][cb].prev_idx = -1;
2466         path[0][cb].run      = 0;
2467     }
2468     for (swb = 0; swb < max_sfb; swb++) {
2469         size = sce->ics.swb_sizes[swb];
2470         if (sce->zeroes[win*16 + swb]) {
2471             float cost_stay_here = path[swb][0].cost;
2472             float cost_get_here  = next_minbits + run_bits + 4;
2473             if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2474                 != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2475                 cost_stay_here += run_bits;
2476             if (cost_get_here < cost_stay_here) {
2477                 path[swb+1][0].prev_idx = next_mincb;
2478                 path[swb+1][0].cost     = cost_get_here;
2479                 path[swb+1][0].run      = 1;
2480             } else {
2481                 path[swb+1][0].prev_idx = 0;
2482                 path[swb+1][0].cost     = cost_stay_here;
2483                 path[swb+1][0].run      = path[swb][0].run + 1;
2484             }
2485             next_minbits = path[swb+1][0].cost;
2486             next_mincb = 0;
2487             for (cb = 1; cb < 12; cb++) {
2488                 path[swb+1][cb].cost = 61450;
2489                 path[swb+1][cb].prev_idx = -1;
2490                 path[swb+1][cb].run = 0;
2491             }
2492         } else {
2493             float minbits = next_minbits;
2494             int mincb = next_mincb;
2495             int startcb = sce->band_type[win*16+swb];
2496             next_minbits = INFINITY;
2497             next_mincb = 0;
2498             for (cb = 0; cb < startcb; cb++) {
2499                 path[swb+1][cb].cost = 61450;
2500                 path[swb+1][cb].prev_idx = -1;
2501                 path[swb+1][cb].run = 0;
2502             }
2503             for (cb = startcb; cb < 12; cb++) {
2504                 float cost_stay_here, cost_get_here;
2505                 float bits = 0.0f;
2506                 for (w = 0; w < group_len; w++) {
2507                     bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2508                                                     s->scoefs + start + w*128, size,
2509                                                     sce->sf_idx[(win+w)*16+swb], cb,
2510                                                     0, INFINITY, NULL);
2511                 }
2512                 cost_stay_here = path[swb][cb].cost + bits;
2513                 cost_get_here  = minbits            + bits + run_bits + 4;
2514                 if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2515                     != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2516                     cost_stay_here += run_bits;
2517                 if (cost_get_here < cost_stay_here) {
2518                     path[swb+1][cb].prev_idx = mincb;
2519                     path[swb+1][cb].cost     = cost_get_here;
2520                     path[swb+1][cb].run      = 1;
2521                 } else {
2522                     path[swb+1][cb].prev_idx = cb;
2523                     path[swb+1][cb].cost     = cost_stay_here;
2524                     path[swb+1][cb].run      = path[swb][cb].run + 1;
2525                 }
2526                 if (path[swb+1][cb].cost < next_minbits) {
2527                     next_minbits = path[swb+1][cb].cost;
2528                     next_mincb = cb;
2529                 }
2530             }
2531         }
2532         start += sce->ics.swb_sizes[swb];
2533     }
2534
2535     stack_len = 0;
2536     idx       = 0;
2537     for (cb = 1; cb < 12; cb++)
2538         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2539             idx = cb;
2540     ppos = max_sfb;
2541     while (ppos > 0) {
2542         av_assert1(idx >= 0);
2543         cb = idx;
2544         stackrun[stack_len] = path[ppos][cb].run;
2545         stackcb [stack_len] = cb;
2546         idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2547         ppos -= path[ppos][cb].run;
2548         stack_len++;
2549     }
2550
2551     start = 0;
2552     for (i = stack_len - 1; i >= 0; i--) {
2553         put_bits(&s->pb, 4, stackcb[i]);
2554         count = stackrun[i];
2555         memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2556         for (j = 0; j < count; j++) {
2557             sce->band_type[win*16 + start] =  stackcb[i];
2558             start++;
2559         }
2560         while (count >= run_esc) {
2561             put_bits(&s->pb, run_bits, run_esc);
2562             count -= run_esc;
2563         }
2564         put_bits(&s->pb, run_bits, count);
2565     }
2566 }
2567 #endif /* HAVE_INLINE_ASM */
2568
2569 void ff_aac_coder_init_mips(AACEncContext *c) {
2570 #if HAVE_INLINE_ASM
2571     AACCoefficientsEncoder *e = c->coder;
2572     int option = c->options.aac_coder;
2573
2574     if (option == 2) {
2575 // Disabled due to failure with fate-aac-pns-encode
2576 //         e->quantize_and_encode_band = quantize_and_encode_band_mips;
2577 //         e->encode_window_bands_info = codebook_trellis_rate_mips;
2578 #if HAVE_MIPSFPU
2579         e->search_for_quantizers    = search_for_quantizers_twoloop_mips;
2580         e->search_for_ms            = search_for_ms_mips;
2581 #endif /* HAVE_MIPSFPU */
2582     }
2583 #endif /* HAVE_INLINE_ASM */
2584 }