/*****************************************************************************
 * quant-c.c: msa quantization and level-run
 *****************************************************************************
 * Copyright (C) 2015 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"

#if !HIGH_BIT_DEPTH
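/* Dequantize a 4x4 block of coefficients in place.  pi_dequant_mf[i_qp%6]
 * supplies the scale factors and i_qp/6 - 4 the shift: a non-negative
 * shift scales the products up, a negative one rounds and shifts down. */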
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

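/* Dequantize an 8x8 block of coefficients in place.  Same scheme as the
 * 4x4 version, with a 64-entry scale table and a shift of i_qp/6 - 6. */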
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}

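/* Dequantize a 4x4 DC block in place: all 16 coefficients share the single
 * scale factor pi_dequant_mf[i_qp%6][0] and a shift of i_qp/6 - 6. */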
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

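/* Quantize a 4x4 block in place:
 * dct[i] = sign(dct[i]) * ( ( abs(dct[i]) + bias[i] ) * mf[i] >> 16 ).
 * Returns 1 if any quantized coefficient is non-zero, else 0. */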
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

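    /* Negate the quantized magnitudes, then select the negated lanes where
     * the input coefficient was non-positive to restore the signs. */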
    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

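/* Quantize an 8x8 block in place, handled as two 32-coefficient halves
 * with the same bias/multiply/shift scheme as the 4x4 version.
 * Returns 1 if any quantized coefficient is non-zero, else 0. */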
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;

    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
                 dct_h0, dct_h1, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );

    return !!non_zero;
}

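/* Quantize a 4x4 DC block in place using a single scale factor and bias.
 * Returns 1 if any quantized coefficient is non-zero, else 0. */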
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );

    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

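/* Return the index of the last non-zero coefficient of a 64-element block.
 * Compares against zero are reduced to a 64-bit bitmask whose leading-ones
 * count equals the number of trailing zero coefficients. */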
static int32_t avc_coeff_last64_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 vec0, vec1, vec2, vec3;
    v4i32 out0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    tmp_h0 = __msa_ceqi_h( src0, 0 );
    tmp_h1 = __msa_ceqi_h( src1, 0 );
    tmp_h2 = __msa_ceqi_h( src2, 0 );
    tmp_h3 = __msa_ceqi_h( src3, 0 );
    tmp_h4 = __msa_ceqi_h( src4, 0 );
    tmp_h5 = __msa_ceqi_h( src5, 0 );
    tmp_h6 = __msa_ceqi_h( src6, 0 );
    tmp_h7 = __msa_ceqi_h( src7, 0 );

    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
                 tmp0, tmp1, tmp2, tmp3 );

    tmp0 = tmp0 & mask;
    tmp1 = tmp1 & mask;
    tmp2 = tmp2 & mask;
    tmp3 = tmp3 & mask;

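    /* Pack and horizontally add until each coefficient contributes one bit
     * of a 64-bit mask that flags the zero coefficients. */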
    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
    u_res = __msa_copy_u_w( out0, 0 );

    return ( 63 - u_res );
}

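/* Return the index of the last non-zero coefficient of a 16-element block,
 * using the same bitmask reduction as the 64-element version. */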
static int32_t avc_coeff_last16_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1;
    v8u16 tmp_h0;
    v16u8 tmp0;
    v8i16 out0, out1;
    v16i8 res0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH2( p_src, 8, src0, src1 );

    out0 = __msa_ceqi_h( src0, 0 );
    out1 = __msa_ceqi_h( src1, 0 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
    tmp0 = tmp0 & mask;
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    out0 = __msa_nloc_h( ( v8i16 ) res0 );
    u_res = __msa_copy_u_h( out0, 0 );

    return ( 15 - u_res );
}

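/* Exported wrappers: these match the prototypes expected by x264's
 * quantization function table and simply call the MSA kernels above. */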
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}

int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_non_zero, i_non_zero_acc = 0;

    for( int32_t j = 0; j < 4; j++ )
    {
        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );

        i_non_zero_acc |= ( !!i_non_zero ) << j;
    }

    return i_non_zero_acc;
}

int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
}

int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    return avc_coeff_last64_msa( p_src );
}

int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    return avc_coeff_last16_msa( p_src );
}
#endif