/*****************************************************************************
 * quant-c.c: msa quantization and level-run
 *****************************************************************************
 * Copyright (C) 2015 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"

#if !HIGH_BIT_DEPTH
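
/* Dequantize a 4x4 block in place. For i_qp >= 24 (q_bits >= 0) the scaled
 * coefficients fit in halfwords, so the multiply and left shift stay on
 * 16-bit vectors; otherwise the products are widened to 32 bits, rounded
 * with q_bits_add and shifted right before being repacked. */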
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}
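
/* Dequantize an 8x8 block in place: same two paths as the 4x4 version, with
 * q_bits = i_qp / 6 - 6 and eight rows of coefficients per pass. */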
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}
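
/* Dequantize the 4x4 DC block: every coefficient is scaled by the single
 * matrix entry pi_dequant_mf[i_qp % 6][0], splatted across the vector. */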
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}
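
/* Quantize a 4x4 block in place: level = ( ( |coef| + bias ) * mf ) >> 16,
 * with the original sign restored through the clei/bmnz masks. Returns 1 if
 * any quantized level is nonzero, 0 otherwise. */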
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}
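
/* Quantize an 8x8 block in place, handled as two batches of 32 coefficients
 * with the same per-coefficient formula as the 4x4 version. */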
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;

    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
                 dct_h0, dct_h1, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );

    return !!non_zero;
}
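
/* Quantize the 4x4 DC block: one multiplier and one bias, splatted across
 * all lanes. */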
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );

    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}
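
/* Index of the last nonzero coefficient among 64: build a 64-bit mask with
 * one bit per coefficient (set where the coefficient is zero), count its
 * leading ones with nloc and subtract the count from 63 (-1 if all zero). */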
static int32_t avc_coeff_last64_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 vec0, vec1, vec2, vec3;
    v4i32 out0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    tmp_h0 = __msa_ceqi_h( src0, 0 );
    tmp_h1 = __msa_ceqi_h( src1, 0 );
    tmp_h2 = __msa_ceqi_h( src2, 0 );
    tmp_h3 = __msa_ceqi_h( src3, 0 );
    tmp_h4 = __msa_ceqi_h( src4, 0 );
    tmp_h5 = __msa_ceqi_h( src5, 0 );
    tmp_h6 = __msa_ceqi_h( src6, 0 );
    tmp_h7 = __msa_ceqi_h( src7, 0 );

    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
                 tmp0, tmp1, tmp2, tmp3 );

    tmp0 = tmp0 & mask;
    tmp1 = tmp1 & mask;
    tmp2 = tmp2 & mask;
    tmp3 = tmp3 & mask;

    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
    u_res = __msa_copy_u_w( out0, 0 );

    return ( 63 - u_res );
}
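
/* Same scheme as coeff_last64, reduced to a 16-bit zero mask counted with
 * nloc_h: returns 15 minus the number of trailing zero coefficients. */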
static int32_t avc_coeff_last16_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1;
    v8u16 tmp_h0;
    v16u8 tmp0;
    v8i16 out0, out1;
    v16i8 res0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH2( p_src, 8, src0, src1 );

    out0 = __msa_ceqi_h( src0, 0 );
    out1 = __msa_ceqi_h( src1, 0 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
    tmp0 = tmp0 & mask;
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    out0 = __msa_nloc_h( ( v8i16 ) res0 );
    u_res = __msa_copy_u_h( out0, 0 );

    return ( 15 - u_res );
}
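
/* Exported wrappers; these are the entry points the encoder's quant
 * function tables can install when MSA is available. */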
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}

int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_non_zero, i_non_zero_acc = 0;

    for( int32_t j = 0; j < 4; j++ )
    {
        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );

        i_non_zero_acc |= ( !!i_non_zero ) << j;
    }

    return i_non_zero_acc;
}

int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
}

int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    return avc_coeff_last64_msa( p_src );
}

int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    return avc_coeff_last16_msa( p_src );
}
#endif