/*****************************************************************************
 * mc-c.c: msa motion compensation
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Neha Rana <neha.rana@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "macros.h"
#include "mc.h"
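
/* Byte-shuffle tables for the VSHF-based filters below: the three luma masks
 * pair the 6-tap inputs as (x, x+5), (x+1, x+4) and (x+2, x+3) so the H.264
 * kernel (1, -5, 20, 20, -5, 1) reduces to three dot products per output;
 * the chroma masks split interleaved UV source bytes into per-plane pairs
 * for the bilinear filter. */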
static const uint8_t pu_luma_mask_arr[16 * 8] =
{
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};
static const uint8_t pu_chroma_mask_arr[16 * 5] =
{
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height );
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src, intptr_t i_src_stride,
                          int32_t i_height );
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height );
void x264_memzero_aligned_msa( void *p_dst, size_t n );
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                               uint8_t *p_pix2, intptr_t i_pix2_stride,
                               uint8_t *p_pix3, intptr_t i_pix3_stride,
                               int32_t i_weight );
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );

weight_fn_t x264_mc_weight_wtab_msa[6] =
{
    x264_mc_weight_w4_msa,
    x264_mc_weight_w4_msa,
    x264_mc_weight_w8_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w20_msa,
};
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight );
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight );
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height );
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf );

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width, int32_t i_width,
                                           int32_t i_height );
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height );
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height );
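
/* Horizontal luma half-pel filter for 16-pixel-wide blocks.  Each row is
 * shuffled into the three tap pairings, accumulated with HADD/DPADD dot
 * products against +1/-5/+20, then rounded ( >> 5 ), saturated and packed
 * back to bytes.  Four rows are produced per iteration; the second loop
 * mops up the i_height % 4 remainder one row at a time. */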
static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    v16u8 dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b( -5 );
    v16i8 plus20b = __msa_ldi_b( 20 );

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src2, src3 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src0, src1, src2, src3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res0, res1, res2, res3 );

        LD_SB2( p_src, 8, src4, src5 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src6, src7 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src4, src5, src6, src7 );
        VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res4, res5, res6, res7 );
        SRARI_H4_SH( res0, res1, res2, res3, 5 );
        SRARI_H4_SH( res4, res5, res6, res7, 5 );
        SAT_SH4_SH( res0, res1, res2, res3, 7 );
        SAT_SH4_SH( res4, res5, res6, res7, 7 );
        PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                     vec0, vec1, vec2, vec3 );
        XORI_B4_128_SB( vec0, vec1, vec2, vec3 );

        ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;

        XORI_B2_128_SB( src0, src1 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        res0 = __msa_hadd_s_h( vec0, vec0 );
        DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
        res1 = __msa_hadd_s_h( vec3, vec3 );
        DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
        SRARI_H2_SH( res0, res1, 5 );
        SAT_SH2_SH( res0, res1, 7 );
        dst0 = PCKEV_XORI128_UB( res0, res1 );
        ST_UB( dst0, p_dst );
        p_dst += i_dst_stride;
    }
}
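
/* Vertical luma half-pel filter for 16-wide blocks.  The tap pairs are
 * packed into halfword constants (0xfb01 = {1, -5}, 0x1414 = {20, 20},
 * 0x01fb = {-5, 1}) so each output row is three DPADD_SH3_SH dot products
 * over byte-interleaved row pairs; a five-row history is kept in the
 * src*_r/src*_l registers and rotated at the bottom of each loop. */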
static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    const int16_t i_filt_const0 = 0xfb01;
    const int16_t i_filt_const1 = 0x1414;
    const int16_t i_filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16u8 res0, res1, res2, res3;
    v16i8 filt0, filt1, filt2;

    u_h4w = i_height % 4;
    filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
    filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
    filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );

    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
    p_src += ( 5 * i_src_stride );

    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
    ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
                src10_r, src21_r, src32_r, src43_r );
    ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
                src10_l, src21_l, src32_l, src43_l );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
        p_src += ( 4 * i_src_stride );

        XORI_B4_128_SB( src5, src6, src7, src8 );
        ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
                    src54_r, src65_r, src76_r, src87_r );
        ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
                    src54_l, src65_l, src76_l, src87_l );
        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
                               filt0, filt1, filt2 );
        out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
                               filt0, filt1, filt2 );
        out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
                               filt0, filt1, filt2 );
        out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
                               filt0, filt1, filt2 );
        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
                               filt0, filt1, filt2 );
        out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
                               filt0, filt1, filt2 );
        out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
                               filt0, filt1, filt2 );
        out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
                               filt0, filt1, filt2 );
        SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
        SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
        SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
        SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
        PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                     out3_r, res0, res1, res2, res3 );
        XORI_B4_128_UB( res0, res1, res2, res3 );

        ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        src5 = LD_SB( p_src );
        p_src += ( i_src_stride );
        src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
        ILVRL_B2_SB( src5, src4, src54_r, src54_l );
        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
                               filt0, filt1, filt2 );
        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
                               filt0, filt1, filt2 );
        SRARI_H2_SH( out0_r, out0_l, 5 );
        SAT_SH2_SH( out0_r, out0_l, 7 );
        out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
        res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
        ST_UB( res0, p_dst );
        p_dst += i_dst_stride;

        src10_r = src21_r;
        src21_r = src32_r;
        src32_r = src43_r;
        src43_r = src54_r;
        src10_l = src21_l;
        src21_l = src32_l;
        src32_l = src43_l;
        src43_l = src54_l;
        src4 = src5;
    }
}
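
/* Center (hz+vt) luma half-pel filter for 8-wide blocks: rows are first
 * filtered horizontally into 16-bit intermediates (AVC_HORZ_FILTER_SH),
 * then the same 6-tap kernel is applied vertically across a sliding
 * window of six intermediate rows (AVC_CALC_DPADD_H_6PIX_2COEFF_SH). */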
static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    uint64_t u_out0;
    v16i8 tmp0;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;
    v16u8 out0, out1;

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
    p_src += ( 5 * i_src_stride );

    hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
    hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
    hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
    hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
    hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        XORI_B4_128_SB( src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );

        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
        hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
        hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
        hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
                                                hz_out3, hz_out4, hz_out5 );
        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
                                                hz_out4, hz_out5, hz_out6 );
        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
                                                hz_out5, hz_out6, hz_out7 );
        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
                                                hz_out6, hz_out7, hz_out8 );
        out0 = PCKEV_XORI128_UB( dst0, dst1 );
        out1 = PCKEV_XORI128_UB( dst2, dst3 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );

        p_dst += ( 4 * i_dst_stride );

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        src0 = LD_SB( p_src );
        p_src += i_src_stride;

        src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );

        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
                                                hz_out2, hz_out3,
                                                hz_out4, hz_out5 );

        tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
        tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
        u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
        SD( u_out0, p_dst );
        p_dst += i_dst_stride;

        hz_out0 = hz_out1;
        hz_out1 = hz_out2;
        hz_out2 = hz_out3;
        hz_out3 = hz_out4;
        hz_out4 = hz_out5;
    }
}
static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                  uint8_t *p_dst, int32_t i_dst_stride,
                                  int32_t i_height )
{
    uint32_t u_multiple8_cnt;

    for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
    {
        avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                             i_height );

        p_src += 8;
        p_dst += 8;
    }
}
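
/* Interleaved (NV12-style) chroma MC.  The hv kernels compute the H.264
 * bilinear filter: a horizontal byte dot product against the paired
 * (8-dx, dx) coefficients followed by vertical halfword multiplies with
 * the (8-dy, dy) pair, rounded with >> 6.  U and V samples are separated
 * from the interleaved source by the shuffle masks and written out to
 * distinct planes. */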
static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint16_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8i16 res0, res1;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
    v16i8 mask;

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src3, src4 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
    SRARI_H2_UH( res_vt0, res_vt2, 6 );
    SAT_UH2_UH( res_vt0, res_vt2, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out1, p_dst_u );

    SH( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint16_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8i16 res0, res1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );

    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src5, src6 );
    VSHF_B2_UB( src2, src3, src3, src4,
                ( mask + 1 ), ( mask + 1 ), src7, src8 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
    DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0,
                 res_hz1, res_hz2, res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
    SRARI_H2_UH( res_vt0, res_vt1, 6 );
    SAT_UH2_UH( res_vt0, res_vt1, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out1, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out2, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out3, p_dst_u );

    DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
    SRARI_H2_UH( res_vt0, res_vt1, 6 );
    SAT_UH2_UH( res_vt0, res_vt1, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out1, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    if( 2 == i_height )
    {
        avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
    else if( 4 == i_height )
    {
        avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
}
static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v4i32 res0, res1;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
    v16i8 mask;

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src3, src4 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
    SRARI_H2_UH( res_vt0, res_vt2, 6 );
    SAT_UH2_UH( res_vt0, res_vt2, 7 );
    PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );

    u_out0 = __msa_copy_u_w( res0, 0 );
    u_out1 = __msa_copy_u_w( res0, 1 );
    u_out2 = __msa_copy_u_w( res1, 0 );
    u_out3 = __msa_copy_u_w( res1, 1 );
    SW( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SW( u_out1, p_dst_u );
    SW( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SW( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
                                                  int32_t i_src_stride,
                                                  uint8_t *p_dst_u,
                                                  uint8_t *p_dst_v,
                                                  int32_t i_dst_stride,
                                                  uint32_t u_coef_hor0,
                                                  uint32_t u_coef_hor1,
                                                  uint32_t u_coef_ver0,
                                                  uint32_t u_coef_ver1,
                                                  int32_t i_height )
{
    uint32_t u_row;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v4i32 res0, res1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    src0 = LD_UB( p_src );
    p_src += i_src_stride;

    for( u_row = ( i_height >> 2 ); u_row--; )
    {
        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
        p_src += ( 4 * i_src_stride );

        VSHF_B2_UB( src0, src1, src1, src2,
                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
        VSHF_B2_UB( src2, src3, src3, src4,
                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
        VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
        VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
        DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                     res_hz3 );
        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );
        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
        SRARI_H2_UH( res_vt0, res_vt1, 6 );
        SAT_UH2_UH( res_vt0, res_vt1, 7 );
        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
        p_dst_u += ( 4 * i_dst_stride );

        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                     res_hz3 );
        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );
        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
        SRARI_H2_UH( res_vt0, res_vt1, 6 );
        SAT_UH2_UH( res_vt0, res_vt1, 7 );
        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
        p_dst_v += ( 4 * i_dst_stride );

        src0 = src4;
    }
}
static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    if( 2 == i_height )
    {
        avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
    else
    {
        avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
                                              p_dst_u, p_dst_v, i_dst_stride,
                                              u_coef_hor0, u_coef_hor1,
                                              u_coef_ver0, u_coef_ver1,
                                              i_height );
    }
}
static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    uint32_t u_row;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
    v16i8 coeff_hz_vec0, coeff_hz_vec1;
    v16i8 tmp0, tmp1;
    v16u8 coeff_hz_vec;
    v8u16 coeff_vt_vec0, coeff_vt_vec1;

    coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    LD_UB2( p_src, 16, src0, src13 );
    p_src += i_src_stride;

    VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
    DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );

    for( u_row = ( i_height >> 2 ); u_row--; )
    {
        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
        LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
        p_src += ( 4 * i_src_stride );

        VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
        VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
        DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                     res_hz4 );
        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );

        res_vt0 += ( res_hz0 * coeff_vt_vec1 );
        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
        res_vt3 += ( res_hz3 * coeff_vt_vec1 );

        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
        ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
        p_dst_u += ( 4 * i_dst_stride );

        res_hz0 = res_hz4;

        VSHF_B2_UB( src1, src5, src2, src6,
                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
        VSHF_B2_UB( src3, src7, src4, src8,
                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                     res_hz4 );
        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );

        res_vt0 += ( res_hz5 * coeff_vt_vec1 );
        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
        res_vt3 += ( res_hz3 * coeff_vt_vec1 );

        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
        ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
        p_dst_v += ( 4 * i_dst_stride );

        res_hz5 = res_hz4;
    }
}
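
/* Explicit weighted prediction over one reference:
 * out = clip( ( pix * i_weight + offset' ) >> i_log2_denom, 0, 255 ),
 * where offset' is i_offset_in << i_log2_denom plus the usual rounding
 * bias when i_log2_denom is non-zero.  The variants differ only in the
 * block width they handle. */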
static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *p_dst, int32_t i_dst_stride,
                                     int32_t i_log2_denom, int32_t i_weight,
                                     int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v16u8 zero = { 0 };
    v16u8 src0, src1;
    v4i32 dst0, dst1;
    v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
    v8i16 vec0, vec1;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    u_load0 = LW( p_src );
    p_src += i_src_stride;
    u_load1 = LW( p_src );

    src0 = ( v16u8 ) __msa_fill_w( u_load0 );
    src1 = ( v16u8 ) __msa_fill_w( u_load1 );

    ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
    MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
    ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
    MAXI_SH2_SH( vec0, vec1, 0 );

    tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
    tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );

    SAT_UH2_UH( tp0, tp1, 7 );
    PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );

    u_out0 = __msa_copy_u_w( dst0, 0 );
    u_out1 = __msa_copy_u_w( dst1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
                                             int32_t i_src_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_weight,
                                             int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 wgt, denom, offset;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src += 4 * i_src_stride;

        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        int32_t i_height, int32_t i_log2_denom,
                                        int32_t i_weight, int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                 i_log2_denom, i_weight, i_offset_in );
    }
    else
    {
        avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
                                         p_dst, i_dst_stride,
                                         i_height, i_log2_denom,
                                         i_weight, i_offset_in );
    }
}
static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        int32_t i_height, int32_t i_log2_denom,
                                        int32_t i_weight, int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 wgt, denom, offset;
    v16i8 out0, out1;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride,
                                         int32_t i_height, int32_t i_log2_denom,
                                         int32_t i_weight, int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8u16 wgt, denom, offset;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp2, temp4, temp6 );
        ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp1, temp3, temp5, temp7 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
              temp4, temp5, temp6, temp7 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
                     temp4, temp5, temp6, temp7 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
        PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                     dst0, dst1, dst2, dst3 );

        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
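
/* Bi-directional weighted prediction, "nw" (no offset applied) flavour:
 * out = clip( ( p1 * w1 + p2 * w2 ) >> ( i_log2_denom + 1 ), 0, 255 )
 * via signed 16-bit multiplies and a rounding arithmetic shift; the
 * i_offset_in argument is accepted for interface symmetry but unused
 * on these paths. */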
static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v8i16 src1_wgt, src2_wgt;
    v16u8 in0, in1, in2, in3;
    v8i16 temp0, temp1, temp2, temp3;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );
    u_load0 = LW( p_src1_in );
    u_load1 = LW( p_src1_in + i_src1_stride );
    in0 = ( v16u8 ) __msa_fill_w( u_load0 );
    in1 = ( v16u8 ) __msa_fill_w( u_load1 );
    u_load0 = LW( p_src2_in );
    u_load1 = LW( p_src2_in + i_src2_stride );
    in2 = ( v16u8 ) __msa_fill_w( u_load0 );
    in3 = ( v16u8 ) __msa_fill_w( u_load1 );
    ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
                temp0, temp1, temp2, temp3 );
    temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
    temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
    SRAR_H2_SH( temp0, temp1, denom );
    CLIP_SH2_0_255( temp0, temp1 );
    PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
                                                  int32_t i_src1_stride,
                                                  uint8_t *p_src2_in,
                                                  int32_t i_src2_stride,
                                                  uint8_t *p_dst,
                                                  int32_t i_dst_stride,
                                                  int32_t i_height,
                                                  int32_t i_log2_denom,
                                                  int32_t i_src1_weight,
                                                  int32_t i_src2_weight,
                                                  int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );
    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src1_in += ( 4 * i_src1_stride );
        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );
        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src2_in += ( 4 * i_src2_stride );
        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
        src7 = ( v16u8 ) __msa_fill_w( u_load3 );
        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
                    temp4, temp5, temp6, temp7 );
        temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
        temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
        temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
        temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
        SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
        CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
                                             int32_t i_src1_stride,
                                             uint8_t *p_src2_in,
                                             int32_t i_src2_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_src1_weight,
                                             int32_t i_src2_weight,
                                             int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
                                      p_src2_in, i_src2_stride,
                                      p_dst, i_dst_stride,
                                      i_log2_denom, i_src1_weight,
                                      i_src2_weight, i_offset_in );
    }
    else
    {
        avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
                                              p_src2_in, i_src2_stride,
                                              p_dst, i_dst_stride,
                                              i_height, i_log2_denom,
                                              i_src1_weight, i_src2_weight,
                                              i_offset_in );
    }
}
static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
                                             int32_t i_src1_stride,
                                             uint8_t *p_src2_in,
                                             int32_t i_src2_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_src1_weight,
                                             int32_t i_src2_weight,
                                             int32_t i_offset_in )
{
    uint32_t u_cnt;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 res0, res1, res2, res3;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );
        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
        p_src2_in += ( 4 * i_src2_stride );
        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
                    res0, res1, res2, res3 );
        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
        SRAR_H4_SH( res0, res1, res2, res3, denom );
        CLIP_SH4_0_255( res0, res1, res2, res3 );
        PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
                     dst0, dst1, dst2, dst3 );
        ST8x1_UB( dst0, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst1, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst2, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst3, p_dst );
        p_dst += i_dst_stride;
    }
}
static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
                                              int32_t i_src1_stride,
                                              uint8_t *p_src2_in,
                                              int32_t i_src2_stride,
                                              uint8_t *p_dst,
                                              int32_t i_dst_stride,
                                              int32_t i_height,
                                              int32_t i_log2_denom,
                                              int32_t i_src1_weight,
                                              int32_t i_src2_weight,
                                              int32_t i_offset_in )
{
    uint32_t u_cnt;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );
        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
        p_src2_in += ( 4 * i_src2_stride );
        ILVRL_B2_SH( zero, src0, temp1, temp0 );
        ILVRL_B2_SH( zero, src1, temp3, temp2 );
        ILVRL_B2_SH( zero, src2, temp5, temp4 );
        ILVRL_B2_SH( zero, src3, temp7, temp6 );
        ILVRL_B2_SH( zero, dst0, res1, res0 );
        ILVRL_B2_SH( zero, dst1, res3, res2 );
        ILVRL_B2_SH( zero, dst2, res5, res4 );
        ILVRL_B2_SH( zero, dst3, res7, res6 );
        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
        res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
        res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
        res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
        res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
        SRAR_H4_SH( res0, res1, res2, res3, denom );
        SRAR_H4_SH( res4, res5, res6, res7, denom );
        CLIP_SH4_0_255( res0, res1, res2, res3 );
        CLIP_SH4_0_255( res4, res5, res6, res7 );
        PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
                     dst0, dst1, dst2, dst3 );
        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
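
/* Bi-directional weighted prediction with offset.  The two weights are
 * interleaved byte-wise so a single DPADD_U_H computes p1*w1 + p2*w2,
 * seeded with the rounding offset ( ( i_offset_in + 1 ) | 1 ) <<
 * i_log2_denom; the sum is then shifted by i_log2_denom + 1, clamped at
 * zero and saturated to 8 bits. */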
static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
                                       int32_t i_src1_stride,
                                       uint8_t *p_src2_in,
                                       int32_t i_src2_stride,
                                       uint8_t *p_dst, int32_t i_dst_stride,
                                       int32_t i_log2_denom,
                                       int32_t i_src1_weight,
                                       int32_t i_src2_weight,
                                       int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16i8 in0, in1, in2, in3;
    v8u16 temp0, temp1, denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    u_load0 = LW( p_src1_in );
    u_load1 = LW( p_src1_in + i_src1_stride );
    in0 = ( v16i8 ) __msa_fill_w( u_load0 );
    in1 = ( v16i8 ) __msa_fill_w( u_load1 );

    u_load0 = LW( p_src2_in );
    u_load1 = LW( p_src2_in + i_src2_stride );
    in2 = ( v16i8 ) __msa_fill_w( u_load0 );
    in3 = ( v16i8 ) __msa_fill_w( u_load1 );

    ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );

    temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
    temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );

    temp0 >>= denom;
    temp1 >>= denom;

    MAXI_SH2_UH( temp0, temp1, 0 );
    SAT_UH2_UH( temp0, temp1, 7 );
    PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );

    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
                                               int32_t i_src1_stride,
                                               uint8_t *p_src2_in,
                                               int32_t i_src2_stride,
                                               uint8_t *p_dst,
                                               int32_t i_dst_stride,
                                               int32_t i_height,
                                               int32_t i_log2_denom,
                                               int32_t i_src1_weight,
                                               int32_t i_src2_weight,
                                               int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3;
    v8u16 res0, res1, res2, res3;
    v8u16 denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src1_in += ( 4 * i_src1_stride );

        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );

        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src2_in += ( 4 * i_src2_stride );

        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
        src7 = ( v16u8 ) __msa_fill_w( u_load3 );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp1, temp2, temp3 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        SRA_4V( res0, res1, res2, res3, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_height,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
                                   p_src2_in, i_src2_stride,
                                   p_dst, i_dst_stride,
                                   i_log2_denom, i_src1_weight,
                                   i_src2_weight, i_offset_in );
    }
    else
    {
        avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
                                           p_src2_in, i_src2_stride,
                                           p_dst, i_dst_stride,
                                           i_height, i_log2_denom,
                                           i_src1_weight,
                                           i_src2_weight, i_offset_in );
    }
}
static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_height,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3;
    v8u16 res0, res1, res2, res3;
    v8u16 denom, offset;
    v16i8 out0, out1;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );

        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
        p_src2_in += ( 4 * i_src2_stride );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp1, temp2, temp3 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        SRA_4V( res0, res1, res2, res3, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
                                           int32_t i_src1_stride,
                                           uint8_t *p_src2_in,
                                           int32_t i_src2_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride,
                                           int32_t i_height,
                                           int32_t i_log2_denom,
                                           int32_t i_src1_weight,
                                           int32_t i_src2_weight,
                                           int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8u16 denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );

        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
        p_src2_in += ( 4 * i_src2_stride );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp2, temp4, temp6 );
        ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp1, temp3, temp5, temp7 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
                     res4, res5, res6, res7 );
        ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
              res4, res5, res6, res7 );
        SRA_4V( res0, res1, res2, res3, denom );
        SRA_4V( res4, res5, res6, res7, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        MAXI_SH4_UH( res4, res5, res6, res7, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        SAT_UH4_UH( res4, res5, res6, res7, 7 );
        PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
                     temp0, temp1, temp2, temp3 );
        ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
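
/* Straight block copies backing the mc_copy/get_ref fast paths: width 4
 * moves words through GP registers, width 8 extracts doublewords from
 * vector loads, and 16-and-wider widths stream whole vectors, with the
 * row loops unrolled by 12, 8, 4 or 2 depending on i_height. */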
static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_dst, int32_t i_dst_stride,
                             int32_t i_height )
{
    int32_t i_cnt;
    uint32_t u_src0, u_src1;

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        u_src0 = LW( p_src );
        p_src += i_src_stride;
        u_src1 = LW( p_src );
        p_src += i_src_stride;

        SW( u_src0, p_dst );
        p_dst += i_dst_stride;
        SW( u_src1, p_dst );
        p_dst += i_dst_stride;
    }
}
static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_dst, int32_t i_dst_stride,
                             int32_t i_height )
{
    int32_t i_cnt;
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if( 0 == i_height % 12 )
    {
        for( i_cnt = ( i_height / 12 ); i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );

            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 8 )
    {
        for( i_cnt = i_height >> 3; i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 4 )
    {
        for( i_cnt = ( i_height / 4 ); i_cnt--; )
        {
            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );
            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 2 )
    {
        for( i_cnt = ( i_height / 2 ); i_cnt--; )
        {
            LD_UB2( p_src, i_src_stride, src0, src1 );
            p_src += ( 2 * i_src_stride );
            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );

            SD( u_out0, p_dst );
            p_dst += i_dst_stride;
            SD( u_out1, p_dst );
            p_dst += i_dst_stride;
        }
    }
}
static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
                                   uint8_t *p_dst, int32_t i_dst_stride,
                                   int32_t i_height, int32_t i_width )
{
    int32_t i_cnt, i_loop_cnt;
    uint8_t *p_src_tmp, *p_dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for( i_cnt = ( i_width >> 4 ); i_cnt--; )
    {
        p_src_tmp = p_src;
        p_dst_tmp = p_dst;

        for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
        {
            LD_UB8( p_src_tmp, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src_tmp += ( 8 * i_src_stride );

            ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
                    p_dst_tmp, i_dst_stride );
            p_dst_tmp += ( 8 * i_dst_stride );
        }

        p_src += 16;
        p_dst += 16;
    }
}
static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
                              uint8_t *p_dst, int32_t i_dst_stride,
                              int32_t i_height )
{
    int32_t i_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if( 0 == i_height % 12 )
    {
        for( i_cnt = ( i_height / 12 ); i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );
            ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
                    p_dst, i_dst_stride );
            p_dst += ( 8 * i_dst_stride );

            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );
            ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 8 )
    {
        copy_16multx8mult_msa( p_src, i_src_stride,
                               p_dst, i_dst_stride, i_height, 16 );
    }
    else if( 0 == i_height % 4 )
    {
        for( i_cnt = ( i_height >> 2 ); i_cnt--; )
        {
            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );

            ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
}
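
/* Rounding average of two reference blocks, ( a + b + 1 ) >> 1 per byte
 * via AVER_UB, i.e. the unweighted bi-prediction case. */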
static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                uint8_t *p_src2, int32_t i_src2_stride,
                                uint8_t *p_dst, int32_t i_dst_stride,
                                int32_t i_height )
{
    int32_t i_cnt;
    uint32_t u_out0, u_out1;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1;

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        LD_UB2( p_src1, i_src1_stride, src0, src1 );
        p_src1 += ( 2 * i_src1_stride );
        LD_UB2( p_src2, i_src2_stride, src2, src3 );
        p_src2 += ( 2 * i_src2_stride );

        AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );

        u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
        u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
        SW( u_out0, p_dst );
        p_dst += i_dst_stride;
        SW( u_out1, p_dst );
        p_dst += i_dst_stride;
    }
}
static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                uint8_t *p_src2, int32_t i_src2_stride,
                                uint8_t *p_dst, int32_t i_dst_stride,
                                int32_t i_height )
{
    int32_t i_cnt;
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3;

    for( i_cnt = ( i_height / 4 ); i_cnt--; )
    {
        LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
        p_src1 += ( 4 * i_src1_stride );
        LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
        p_src2 += ( 4 * i_src2_stride );

        AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
                     dst0, dst1, dst2, dst3 );

        u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
        u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
        u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
        u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
        SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}

static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                 uint8_t *p_src2, int32_t i_src2_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    int32_t i_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for( i_cnt = ( i_height / 8 ); i_cnt--; )
    {
        LD_UB8( p_src1, i_src1_stride,
                src0, src1, src2, src3, src4, src5, src6, src7 );
        p_src1 += ( 8 * i_src1_stride );
        LD_UB8( p_src2, i_src2_stride,
                dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
        p_src2 += ( 8 * i_src2_stride );

        AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                     dst0, dst1, dst2, dst3 );
        AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                     dst4, dst5, dst6, dst7 );

        ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                p_dst, i_dst_stride );
        p_dst += ( 8 * i_dst_stride );
    }
}
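
/* Zero i_height rows of 16 bytes each starting at p_src; two rows are
 * cleared per iteration, so i_height is expected to be even. */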
static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
                                     int32_t i_height )
{
    int32_t i_cnt;
    v16u8 zero = { 0 };

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        ST_UB( zero, p_src );
        p_src += i_stride;
        ST_UB( zero, p_src );
        p_src += i_stride;
    }
}
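
/* Interleave two planes into one (e.g. U and V into NV12 chroma): the
 * vector paths handle 16- and 8-pixel groups per row, scalar tails cover
 * the remaining columns and the last i_height % 4 rows. */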
static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
                                       uint8_t *p_src1, int32_t i_src1_stride,
                                       uint8_t *p_dst, int32_t i_dst_stride,
                                       int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
    v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;

    i_w_mul8 = i_width - i_width % 8;
    i_h4w = i_height - i_height % 4;

    for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
    {
        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
            ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
                    p_dst, i_dst_stride );
            ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
                    ( p_dst + 16 ), i_dst_stride );
            p_src0 += 16;
            p_src1 += 16;
            p_dst += 32;
        }

        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
        {
            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
                    p_dst, i_dst_stride );
            p_src0 += 8;
            p_src1 += 8;
            p_dst += 16;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst[0] = p_src0[0];
            p_dst[1] = p_src1[0];
            p_dst[i_dst_stride] = p_src0[i_src0_stride];
            p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
            p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
            p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
            p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
            p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
            p_src0 += 1;
            p_src1 += 1;
            p_dst += 2;
        }

        p_src0 += ( ( 4 * i_src0_stride ) - i_width );
        p_src1 += ( ( 4 * i_src1_stride ) - i_width );
        p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
    }

    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
    {
        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            src0 = LD_UB( p_src0 );
            src4 = LD_UB( p_src1 );
            ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
            ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 );
            p_src0 += 16;
            p_src1 += 16;
            p_dst += 32;
        }

        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
        {
            src0 = LD_UB( p_src0 );
            src4 = LD_UB( p_src1 );
            vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
                                                 ( v16i8 ) src0 );
            ST_UB( vec_ilv_r0, p_dst );
            p_src0 += 8;
            p_src1 += 8;
            p_dst += 16;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst[0] = p_src0[0];
            p_dst[1] = p_src1[0];
            p_src0 += 1;
            p_src1 += 1;
            p_dst += 2;
        }

        p_src0 += ( i_src0_stride - i_width );
        p_src1 += ( i_src1_stride - i_width );
        p_dst += ( i_dst_stride - ( i_width * 2 ) );
    }
}
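
/* Inverse of the above: split interleaved UVUV... data into two planes
 * with pack-even/pack-odd byte operations. Despite the i_h4w name, rows
 * are grouped eight at a time here, with 4-pixel and scalar tails. */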
static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst0, int32_t dst0_stride,
                                         uint8_t *p_dst1, int32_t dst1_stride,
                                         int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
    uint32_t u_res_w0, u_res_w1;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
    v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
    uint8_t *p_dst;

    i_w_mul8 = i_width - i_width % 8;
    i_w_mul4 = i_width - i_width % 4;
    i_h4w = i_height - i_height % 8;

    for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
    {
        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
        {
            LD_UB8( p_src, i_src_stride,
                    in0, in1, in2, in3, in4, in5, in6, in7 );

            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
            ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
            p_dst = p_dst0 + 4 * dst0_stride;
            ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
            ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
            p_dst = p_dst1 + 4 * dst1_stride;
            ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
            p_src += 16;
            p_dst0 += 8;
            p_dst1 += 8;
        }

        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
        {
            LD_UB8( p_src, i_src_stride,
                    in0, in1, in2, in3, in4, in5, in6, in7 );

            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
            ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
            p_dst = p_dst0 + 4 * dst0_stride;
            ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
            ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
            p_dst = p_dst1 + 4 * dst1_stride;
            ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
            p_src += 8;
            p_dst0 += 4;
            p_dst1 += 4;
        }

        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0[0] = p_src[0];
            p_dst1[0] = p_src[1];
            p_dst0[dst0_stride] = p_src[i_src_stride];
            p_dst1[dst1_stride] = p_src[i_src_stride + 1];
            p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
            p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
            p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
            p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
            p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
            p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
            p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
            p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
            p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
            p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
            p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
            p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
            p_src += 2;
            p_dst0 += 1;
            p_dst1 += 1;
        }

        p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
        p_dst0 += ( ( 8 * dst0_stride ) - i_width );
        p_dst1 += ( ( 8 * dst1_stride ) - i_width );
    }

    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
    {
        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
        {
            in0 = LD_UB( p_src );

            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            ST8x1_UB( vec_pckev0, p_dst0 );
            ST8x1_UB( vec_pckod0, p_dst1 );
            p_src += 16;
            p_dst0 += 8;
            p_dst1 += 8;
        }

        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
        {
            in0 = LD_UB( p_src );

            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
            SW( u_res_w0, p_dst0 );
            u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
            SW( u_res_w1, p_dst1 );
            p_src += 8;
            p_dst0 += 4;
            p_dst1 += 4;
        }

        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0[0] = p_src[0];
            p_dst1[0] = p_src[1];
            p_src += 2;
            p_dst0 += 1;
            p_dst1 += 1;
        }

        p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
        p_dst0 += ( ( dst0_stride ) - i_width );
        p_dst1 += ( ( dst1_stride ) - i_width );
    }
}
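
/* Split packed 24-bit RGB into three planes; each shuffle mask picks every
 * third byte out of a 24-byte group (8 pixels) for one destination plane. */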
static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
                                             int32_t i_src_stride,
                                             uint8_t *p_dst0,
                                             int32_t i_dst0_stride,
                                             uint8_t *p_dst1,
                                             int32_t i_dst1_stride,
                                             uint8_t *p_dst2,
                                             int32_t i_dst2_stride,
                                             int32_t i_width,
                                             int32_t i_height )
{
    uint8_t *p_src_orig = p_src;
    uint8_t *p_dst0_orig = p_dst0;
    uint8_t *p_dst1_orig = p_dst1;
    uint8_t *p_dst2_orig = p_dst2;
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 temp0, temp1, temp2, temp3;
    v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 };

    i_w_mul8 = i_width - i_width % 8;
    i_h_mul4 = i_height - i_height % 4;

    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );

            VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
            ST8x1_UB( temp0, p_dst0 );
            ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
            ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
            ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );

            VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
            ST8x1_UB( temp0, p_dst1 );
            ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
            ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
            ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );

            VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
            ST8x1_UB( temp0, p_dst2 );
            ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
            ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
            ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );

            p_src += 24;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
            p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + i_dst0_stride] =
                p_src_orig[0 + i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + i_dst1_stride] =
                p_src_orig[1 + i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + i_dst2_stride] =
                p_src_orig[2 + i_src_stride + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
                p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
                p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
                p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
                p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
                p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
                p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
        }

        p_src_orig += ( 4 * i_src_stride );
        p_dst0_orig += ( 4 * i_dst0_stride );
        p_dst1_orig += ( 4 * i_dst1_stride );
        p_dst2_orig += ( 4 * i_dst2_stride );
    }

    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
        {
            in0 = LD_SB( p_src );
            in4 = LD_SB( p_src + 16 );
            temp0 = __msa_vshf_b( mask0, in4, in0 );
            ST8x1_UB( temp0, p_dst0 );
            temp0 = __msa_vshf_b( mask1, in4, in0 );
            ST8x1_UB( temp0, p_dst1 );
            temp0 = __msa_vshf_b( mask2, in4, in0 );
            ST8x1_UB( temp0, p_dst2 );

            p_src += 24;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
        }

        p_src_orig += ( i_src_stride );
        p_dst0_orig += ( i_dst0_stride );
        p_dst1_orig += ( i_dst1_stride );
        p_dst2_orig += ( i_dst2_stride );
    }
}
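
/* RGBA variant: with a fixed four-byte pixel the planes separate with
 * pack-even/pack-odd halfword and byte operations instead of byte
 * shuffles; the fourth (alpha) channel is discarded. */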
static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst0,
                                              int32_t i_dst0_stride,
                                              uint8_t *p_dst1,
                                              int32_t i_dst1_stride,
                                              uint8_t *p_dst2,
                                              int32_t i_dst2_stride,
                                              int32_t i_width,
                                              int32_t i_height )
{
    uint8_t *p_src_orig = p_src;
    uint8_t *p_dst0_orig = p_dst0;
    uint8_t *p_dst1_orig = p_dst1;
    uint8_t *p_dst2_orig = p_dst2;
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;

    i_w_mul8 = i_width - i_width % 8;
    i_h_mul4 = i_height - i_height % 4;

    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
            LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
            LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );

            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
            PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
            temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
            PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
            temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
            PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
            temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
            PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
            in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
            PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
            in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
            PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
            in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
            ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
            ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
            ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );

            p_src += 64;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
        }

        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );

            PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );

            PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );

            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
            PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
            in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
            PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
            in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
            PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
            in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );

            ST8x1_UB( in0, p_dst0 );
            ST8x1_UB( in4, p_dst0 + i_dst0_stride );
            ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
            ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );

            ST8x1_UB( in1, p_dst2 );
            ST8x1_UB( in5, p_dst2 + i_dst2_stride );
            ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
            ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );

            ST8x1_UB( in2, p_dst1 );
            ST8x1_UB( in6, p_dst1 + i_dst1_stride );
            ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
            ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );

            p_src += 32;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];

            p_dst0_orig[i_dst0_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width];
            p_dst1_orig[i_dst1_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[i_dst2_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width + 2];

            p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width];
            p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];

            p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width];
            p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
        }

        p_src_orig += ( 4 * i_src_stride );
        p_dst0_orig += ( 4 * i_dst0_stride );
        p_dst1_orig += ( 4 * i_dst1_stride );
        p_dst2_orig += ( 4 * i_dst2_stride );
    }

    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_SB4( p_src, 16, in0, in4, in8, in12 );

            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
            /* one 16-byte store per plane covers the 16 pixels loaded */
            ST_SB( in0, p_dst0 );
            ST_SB( in1, p_dst2 );
            ST_SB( in2, p_dst1 );

            p_src += 64;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
        }

        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
        {
            in0 = LD_SB( p_src );
            in4 = LD_SB( p_src + 16 );

            temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
            ST8x1_UB( in0, p_dst0 );
            ST8x1_UB( in1, p_dst2 );
            ST8x1_UB( in2, p_dst1 );

            p_src += 32;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
        }

        p_src_orig += ( i_src_stride );
        p_dst0_orig += ( i_dst0_stride );
        p_dst1_orig += ( i_dst1_stride );
        p_dst2_orig += ( i_dst2_stride );
    }
}
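
/* Interleave the first 8 bytes of each U and V source row into 16 output
 * bytes (NV12 order), four rows per iteration plus a single-row tail. */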
static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
                                         uint8_t *p_src1, int32_t i_src1_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride,
                                         int32_t i_height )
{
    int32_t i_loop_height, i_h4w;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3;

    i_h4w = i_height % 4;
    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 );
        p_src0 += ( 4 * i_src0_stride );
        LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 );
        p_src1 += ( 4 * i_src1_stride );
        ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3,
                    ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 );
        ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3,
                p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( i_loop_height = i_h4w; i_loop_height--; )
    {
        in0 = LD_UB( p_src0 );
        p_src0 += ( i_src0_stride );
        in1 = LD_UB( p_src1 );
        p_src1 += ( i_src1_stride );
        ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 );
        ST_UB( ilvr_vec0, p_dst );
        p_dst += ( i_dst_stride );
    }
}
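
/* Build the four half-resolution planes used by lookahead: each output
 * pixel averages a 2x2 (or half-pel shifted 2x2) source neighbourhood,
 * computed with pack-even/pack-odd plus vector averaging. */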
static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst0, int32_t dst0_stride,
                                        uint8_t *p_dst1, int32_t dst1_stride,
                                        uint8_t *p_dst2, int32_t dst2_stride,
                                        uint8_t *p_dst3, int32_t dst3_stride,
                                        int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w16_mul;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
    v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
    v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 res0, res1;

    i_w16_mul = i_width - i_width % 16;
    for( i_loop_height = i_height; i_loop_height--; )
    {
        LD_UB3( p_src, i_src_stride, src0, src1, src2 );
        p_src += 16;
        for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
        {
            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
            p_src += 16;
            LD_UB3( p_src, i_src_stride, src6, src7, src8 );
            p_src += 16;
            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST_UB( res0, p_dst0 );
            ST_UB( res1, p_dst2 );

            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
            SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
            SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
                         pckev_vec0, pckev_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
                                                  ( v16i8 ) sld1_vec2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST_UB( res0, p_dst1 );
            ST_UB( res1, p_dst3 );

            src0 = src6;
            src1 = src7;
            src2 = src8;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
            p_dst3 += 16;
        }

        for( i_loop_width = i_w16_mul; i_loop_width < i_width;
             i_loop_width += 8 )
        {
            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
            p_src += 16;
            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST8x1_UB( res0, p_dst0 );
            ST8x1_UB( res1, p_dst2 );

            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
            SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
            SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
                         pckev_vec0, pckev_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
                                                  ( v16i8 ) sld1_vec2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST8x1_UB( res0, p_dst1 );
            ST8x1_UB( res1, p_dst3 );

            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
            p_dst3 += 8;
        }

        p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
        p_dst0 += ( dst0_stride - i_width );
        p_dst1 += ( dst1_stride - i_width );
        p_dst2 += ( dst2_stride - i_width );
        p_dst3 += ( dst3_stride - i_width );
    }
}
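
/* Exported wrappers: x264's mc interface passes (dst, src) in that order,
 * while the local helpers take (src, dst), hence the argument swap. */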
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height )
{
    copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}

void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height )
{
    copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}

void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height )
{
    copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}
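
/* pixel_avg wrappers: i_weight == 32 selects the plain rounding average;
 * other values use the weighted bi-prediction kernels with weights
 * i_weight and 64 - i_weight, the out-of-range case falling back to the
 * _nw kernels (presumably the variants that handle the full explicit
 * weight range). */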
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                               uint8_t *p_pix2, intptr_t pix2_stride,
                               uint8_t *p_pix3, intptr_t pix3_stride,
                               int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                             p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
                                          p_pix3, pix3_stride,
                                          p_pix1, pix1_stride,
                                          16, 5, i_weight,
                                          ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
                                       p_pix3, pix3_stride,
                                       p_pix1, pix1_stride,
                                       16, 5, i_weight,
                                       ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                             p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
                                          p_pix3, pix3_stride,
                                          p_pix1, pix1_stride,
                                          8, 5, i_weight,
                                          ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
                                       p_pix3, pix3_stride,
                                       p_pix1, pix1_stride,
                                       8, 5, i_weight,
                                       ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 16, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 16, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 8, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 8, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 4 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 4, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 4, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 16, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 16, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 8, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 8, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 4 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 4, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 4, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 2 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
                                   p_pix3, pix3_stride,
                                   p_pix1, pix1_stride, 5, i_weight,
                                   ( 64 - i_weight ), 0 );
    }
}
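
/* Zero an aligned buffer: the bulk is cleared 32 bytes per iteration via
 * the 16-byte store helper, any sub-32-byte tail with plain memset(). */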
void x264_memzero_aligned_msa( void *p_dst, size_t n )
{
    uint32_t u_tot32_mul_lines = n >> 5;
    uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );

    memset_zero_16width_msa( p_dst, 16, ( n / 16 ) );

    if( u_remaining )
    {
        memset( ( uint8_t * ) p_dst + ( u_tot32_mul_lines << 5 ), 0,
                u_remaining );
    }
}
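
/* Weighted-prediction wrappers: unpack the x264_weight_t scale/offset/denom
 * fields and hand off to the opscale kernels; w20 is composed of a 16-wide
 * pass followed by a 4-wide pass. */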
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                 i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height )
{
    x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
                            pWeight, i_height );
    x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
                           pWeight, i_height );
}
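
/* Quarter-pel luma MC: i_qpel_idx encodes the 1/4-pel phase; the two
 * half-pel planes given by x264_hpel_ref0/ref1 are averaged when an odd
 * phase component is present (i_qpel_idx & 5), then weighting is applied
 * in place if the caller supplied a weight function. */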
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx;
    int32_t i_offset;
    uint8_t *p_src1;

    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;

    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );

        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, i_dst_stride, i_height );
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }

        if( pWeight->weightfn )
        {
            if( 16 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, i_dst_stride,
                                        p_dst, i_dst_stride,
                                        pWeight, i_height );
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                       pWeight, i_height );
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                       pWeight, i_height );
            }
        }
    }
    else if( pWeight->weightfn )
    {
        if( 16 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_height );
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_height );
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_height );
        }
    }
    else
    {
        if( 16 == i_width )
        {
            copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                              i_height );
        }
        else if( 8 == i_width )
        {
            copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
        else if( 4 == i_width )
        {
            copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
    }
}
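
/* Chroma MC on interleaved (NV12) data with 1/8-pel bilinear filtering:
 * i_d8x/i_d8y are the fractional MV parts, and the complementary
 * coefficient pairs (d, 8 - d) drive the hv kernels, dispatched on block
 * width (2/4/8). */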
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height )
{
    int32_t i_d8x = m_vx & 0x07;
    int32_t i_d8y = m_vy & 0x07;
    int32_t i_coeff_horiz1 = ( 8 - i_d8x );
    int32_t i_coeff_vert1 = ( 8 - i_d8y );
    int32_t i_coeff_horiz0 = i_d8x;
    int32_t i_coeff_vert0 = i_d8y;

    p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;

    if( 2 == i_width )
    {
        avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 4 == i_width )
    {
        avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 8 == i_width )
    {
        avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
}
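
/* Half-pel plane generation in 16-column stripes: the vertical, centre
 * (hv) and horizontal 6-tap filters share the same source window; the
 * p_buf scratch argument appears to be unused by this implementation. */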
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf )
{
    for( int32_t i = 0; i < ( i_width / 16 ); i++ )
    {
        avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                             p_dst_v - 2, i_stride, i_height );
        avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                              p_dstc, i_stride, i_height );
        avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );

        p_src += 16;
        p_dst_v += 16;
        p_dsth += 16;
        p_dstc += 16;
    }
}

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height )
{
    plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
                               p_dst, i_dst_stride, i_width, i_height );
}

void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
                                 p_dst1, i_dst_stride1, i_width, i_height );
}

void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width,
                                           int32_t i_width,
                                           int32_t i_height )
{
    if( 3 == i_src_width )
    {
        plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
                                         p_dst0, i_dst_stride0,
                                         p_dst1, i_dst_stride1,
                                         p_dst2, i_dst_stride2,
                                         i_width, i_height );
    }
    else if( 4 == i_src_width )
    {
        plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
                                          p_dst0, i_dst_stride0,
                                          p_dst1, i_dst_stride1,
                                          p_dst2, i_dst_stride2,
                                          i_width, i_height );
    }
}

void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height )
{
    store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
                                 p_dst, i_dst_stride, i_height );
}

void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
                                 ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
                                 8, i_height );
}

void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
                                 ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
                                 8, i_height );
}

void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height )
{
    frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
                                p_dst1, i_dst_stride, p_dst2, i_dst_stride,
                                p_dst3, i_dst_stride, i_width, i_height );
}
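
/* get_ref: like mc_luma, but when no averaging or weighting is needed it
 * returns a pointer straight into the reference plane and reports the
 * reference stride through *p_dst_stride instead of copying. The scalar
 * i_h4w tails process the rows the 4-row vector helpers leave over. */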
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx, i_cnt, i_h4w;
    int32_t i_offset;
    uint8_t *p_src1, *src1_org;

    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;

    i_h4w = i_height - i_height % 4;

    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );

        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride,
                                 p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 16, i_src_stride,
                                p_src2 + 16, i_src_stride,
                                p_dst + 16, *p_dst_stride, i_h4w );

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
                v16u8 dst_vec0, dst_vec1;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
                src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );

                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );

                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 12 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 8, i_src_stride,
                                p_src2 + 8, i_src_stride,
                                p_dst + 8, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );

                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
            }
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );

                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );

                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }

        if( pWeight->weightfn )
        {
            int32_t i_log2_denom;
            int32_t i_offset_val;
            int32_t i_weight;

            i_log2_denom = pWeight->i_denom;
            i_offset_val = pWeight->i_offset;
            i_weight = pWeight->i_scale;

            if( 16 == i_width || 12 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt, offset_val0;
                    v8i16 denom;
                    v16i8 zero = { 0 };
                    /* widen the rounding offset into a per-row local so
                     * i_offset_val itself stays unmodified across rows */
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                  ( v8i16 ) offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 20 == i_width )
            {
                x264_mc_weight_w20_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                          offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                }
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint64_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
        }
    }
    else if( pWeight->weightfn )
    {
        int32_t i_offset_val, i_log2_denom, i_weight;

        i_log2_denom = pWeight->i_denom;
        i_offset_val = pWeight->i_offset;
        i_weight = pWeight->i_scale;

        i_h4w = i_height - i_height % 4;

        src1_org = p_src1;

        if( 16 == i_width || 12 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                temp0 = LW( p_src1 + 16 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t u_temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t u_temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                u_temp0 = LW( p_src1 );
                p_src1 += i_src_stride;

                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
    }
    else
    {
        *p_dst_stride = i_src_stride;
        return p_src1;
    }

    return p_dst;
}
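
/* Entry point: plug the MSA implementations into the motion-compensation
 * function table when the CPU reports MSA support. */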
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_MSA )
    {
        pf->mc_luma = x264_mc_luma_msa;
        pf->mc_chroma = x264_mc_chroma_msa;
        pf->get_ref = x264_get_ref_msa;

        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_msa;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
        pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
        pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
        pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;

        pf->weight = x264_mc_weight_wtab_msa;
        pf->offsetadd = x264_mc_weight_wtab_msa;
        pf->offsetsub = x264_mc_weight_wtab_msa;

        pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
        pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;

        pf->store_interleave_chroma = x264_store_interleave_chroma_msa;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa;

        pf->plane_copy_interleave = x264_plane_copy_interleave_msa;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa;

        pf->hpel_filter = x264_hpel_filter_msa;

        pf->memcpy_aligned = memcpy;
        pf->memzero_aligned = x264_memzero_aligned_msa;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
    }
}