/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264chroma_mips.h"
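/*
 * Reader's note (commentary, not from the original author): these kernels
 * implement H.264 chroma motion compensation.  For a fractional offset
 * (x, y) with 0 <= x, y < 8, the bilinear filter from the spec is,
 * per output pixel,
 *
 *     out = ((8 - x) * (8 - y) * A + x * (8 - y) * B +
 *            (8 - x) * y * C + x * y * D + 32) >> 6
 *
 * where A..D are the four neighbouring source pixels.  When only one of
 * x and y is non-zero this reduces to the 1-D form
 *
 *     out = ((8 - d) * P0 + d * P1 + 4) >> 3
 *
 * which is what the _hz_ (horizontal) and _vt_ (vertical) kernels below
 * compute; the _hv_ kernels handle the full 2-D case.
 */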
static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
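/*
 * Commentary: each 16-byte row above is a control vector for VSHF.B;
 * index values 0..15 select bytes from one source register and 16..31
 * from the other, so two input rows can be merged in a single shuffle.
 * The patterns pair each pixel with its right neighbour so that one
 * DOTP_UB against the interleaved (coeff0, coeff1) vector evaluates the
 * 2-tap filter.  The rows differ only in how many rows/pixels end up
 * packed into one register; offsets 0, 32, 48 and 64 are the ones
 * referenced below.
 */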
static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
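/*
 * Note on the rounding pattern used throughout the 1-D kernels: the dot
 * product yields v = (8 - d) * P0 + d * P1; shifting left by 3 and then
 * srari(.., 6) computes (8 * v + 32) >> 6, which equals the spec's
 * (v + 4) >> 3.  sat_u(.., 7) clamps each halfword to the unsigned
 * 8-bit range before the even bytes are packed.  Scalar equivalent of
 * one output pixel, for reference:
 *
 *     dst[i] = av_clip_uint8(((8 - d) * p0 + d * p1 + 4) >> 3);
 */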
static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST2x4_UB(res, 0, dst, stride);
}
static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 src0, src1;
    v8u16 res_r;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST4x2_UB(res, dst, stride);
}
static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out;
    v8u16 res0_r, res1_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
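/*
 * For 4-wide blocks the shuffle packs two input rows into each vector
 * (the mask draws half its bytes from the second source register), so a
 * single pair of dot products covers all four output rows here.
 */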
static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST4x8_UB(out0, out1, dst, stride);
}
static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST8x4_UB(out0, out1, dst, stride);
}
static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);
        src += (4 * stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST8x4_UB(out0, out1, dst, stride);
        dst += (4 * stride);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = LD_UB(src);
            src += stride;

            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);

            ST8x1_UB(res0, dst);
            dst += stride;
        }
    }
}
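/*
 * The tail loop above handles heights that are not a multiple of four by
 * filtering one 8-pixel row at a time, with the same shift/round/saturate
 * sequence applied per row.
 */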
static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}
static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
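/*
 * The vertical kernels interleave row i with row i + 1 (ILVR_B*), which
 * places each vertical pixel pair side by side; the same interleaved
 * (coeff0, coeff1) dot product as in the horizontal case then evaluates
 * the 2-tap vertical filter.
 */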
static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST2x4_UB(res, 0, dst, stride);
}
static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8u16 res_r;
    v4i32 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST4x2_UB(res, dst, stride);
}
static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST4x8_UB(out0, out1, dst, stride);
}
static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST8x4_UB(out0, out1, dst, stride);
}
static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res_vert;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, src_stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
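/*
 * The 2-D (_hv_) kernels first run the horizontal 2-tap filter at 16-bit
 * precision, then weight neighbouring row results with the vertical
 * coefficients (MUL2/MUL4 plus an add) and round once with
 * srari(.., 6), i.e. (sum + 32) >> 6.  No << 3 is needed here: the
 * combined tap weights already sum to 64, carrying the full 6-bit scale.
 */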
static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST2x4_UB(res, 0, dst, dst_stride);
}
static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_UB4(src, src_stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST2x4_UB(res, 0, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST2x4_UB(res, 0, dst, dst_stride);
}
static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
                              coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0,
                              coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0,
                              coef_hor1, coef_ver0, coef_ver1);
    }
}
static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);
    LD_UB3(src, src_stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST4x2_UB(res, dst, dst_stride);
}
static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride,
                                          uint32_t coef_hor0,
                                          uint32_t coef_hor1,
                                          uint32_t coef_ver0,
                                          uint32_t coef_ver1,
                                          int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
    v4i32 res0, res1;

    mask = LD_SB(&chroma_mask_arr[0]);

    src0 = LD_UB(src);
    src += src_stride;

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                    res_hz3);
        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);
        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
        SRARI_H2_UH(res_vt0, res_vt1, 6);
        SAT_UH2_UH(res_vt0, res_vt1, 7);
        PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);

        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
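/*
 * The 4x4multiple and 8-wide 2-D loops keep a one-row history across
 * iterations (src0 = src4 here, res_hz0 = res_hz4 in the 8-wide loop
 * below), so each source row is fetched -- and, in the 8-wide case,
 * horizontally filtered -- only once.
 */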
static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
                              coef_hor1, coef_ver0, coef_ver1);
    } else {
        avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
                                      coef_hor0, coef_hor1, coef_ver0,
                                      coef_ver1, height);
    }
}
static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += src_stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                    res_hz4);
        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);

        res_vt0 += (res_hz0 * coeff_vt_vec1);
        res_vt1 += (res_hz1 * coeff_vt_vec1);
        res_vt2 += (res_hz2 * coeff_vt_vec1);
        res_vt3 += (res_hz3 * coeff_vt_vec1);

        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        res_hz0 = res_hz4;
    }
}
static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    uint32_t load0, load1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, src_stride, src0, src1);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b(res, dst_data);

    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
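/*
 * The _and_aver_dst_ variants back the "avg" MC entry points: the filter
 * result is averaged with the bytes already present in dst using
 * AVER_UB / __msa_aver_u_b, which rounds up, i.e. (a + b + 1) >> 1 per
 * byte -- matching the scalar fallback at the end of this file.
 */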
static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 res_r;
    v16i8 res, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST2x4_UB(dst0, 0, dst, dst_stride);
}
static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 res0_r, res1_r;
    v16u8 res0, res1, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_UB(&chroma_mask_arr[64]);

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);

    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
    DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);

    res0_r <<= 3;
    res1_r <<= 3;

    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);

    ST2x4_UB(dst0, 0, dst, dst_stride);
    dst += (4 * dst_stride);
    ST2x4_UB(dst4, 0, dst, dst_stride);
}
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    }
}
static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 res, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, src_stride, src0, src1);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);

    ST4x2_UB(dst_data, dst, dst_stride);
}
static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       uint32_t coeff0,
                                                       uint32_t coeff1,
                                                       int32_t height)
{
    uint32_t load0, load1;
    uint32_t row;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0 = { 0 };
    v16u8 dst1 = { 0 };
    v8u16 res0_r, res1_r;
    v16u8 res0, res1, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_UB(&chroma_mask_arr[0]);

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        load0 = LW(dst);
        load1 = LW(dst + dst_stride);

        INSERT_W2_UB(load0, load1, dst0);

        load0 = LW(dst + 2 * dst_stride);
        load1 = LW(dst + 3 * dst_stride);

        INSERT_W2_UB(load0, load1, dst1);

        VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
        DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);

        res0_r <<= 3;
        res1_r <<= 3;

        SRARI_H2_UH(res0_r, res1_r, 6);
        SAT_UH2_UH(res0_r, res1_r, 7);
        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);

        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else {
        avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
                                                   dst, dst_stride,
                                                   coeff0, coeff1, height);
    }
}
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    uint32_t load0, load1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, src_stride, src0, src1, src2);
    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);

    load0 = LW(dst + 2 * dst_stride);
    load1 = LW(dst + 3 * dst_stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

    ST2x4_UB(res, 0, dst, dst_stride);
    dst += (4 * dst_stride);
}
static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1, load2, load3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data0 = { 0 };
    v16u8 dst_data1 = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    LW4(dst, dst_stride, load0, load1, load2, load3);

    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);

    LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);

    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);

    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);

    ST2x4_UB(res, 0, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               tmp0, tmp1, tmp2, tmp3);

    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);

    ST2x4_UB(res, 0, dst, dst_stride);
}
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    }
}
static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, src_stride, src0, src1, src2);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);
    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

    ST4x2_UB(res, dst, dst_stride);
}
static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  uint32_t coeff0,
                                                  uint32_t coeff1,
                                                  int32_t height)
{
    uint32_t load0, load1, row;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v16u8 dst1 = { 0 };
    v8u16 res0_r, res1_r;
    v16u8 res0, res1;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    src0 = LD_SB(src);
    src += src_stride;

    for (row = (height >> 2); row--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        load0 = LW(dst);
        load1 = LW(dst + dst_stride);

        INSERT_W2_UB(load0, load1, dst0);
        load0 = LW(dst + 2 * dst_stride);
        load1 = LW(dst + 3 * dst_stride);
        INSERT_W2_UB(load0, load1, dst1);

        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   tmp0, tmp1, tmp2, tmp3);
        ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
        DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);

        res0_r <<= 3;
        res1_r <<= 3;

        SRARI_H2_UH(res0_r, res1_r, 6);
        SAT_UH2_UH(res0_r, res1_r, 7);
        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);

        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else {
        avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
                                              coeff0, coeff1, height);
    }
}
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    src0 = LD_UB(src);
    src += src_stride;

    for (row = height >> 2; row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   src0, src1, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);

        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0, dst1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, src_stride, src0, src1, src2);
    LD_UB2(dst, dst_stride, dst0, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST2x4_UB(dst0, 0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_UB4(src, src_stride, src5, src6, src7, src8);

    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);

    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST2x4_UB(dst0, 0, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst4 = __msa_aver_u_b((v16u8) res, dst4);

    ST2x4_UB(dst4, 0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    }
}
static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v16u8 dst0, dst1;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB3(src, src_stride, src0, src1, src2);
    LD_UB2(dst, dst_stride, dst0, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST4x2_UB(dst0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  uint32_t coef_hor0,
                                                  uint32_t coef_hor1,
                                                  uint32_t coef_ver0,
                                                  uint32_t coef_ver1,
                                                  int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
    v16u8 res0, res1;

    mask = LD_SB(&chroma_mask_arr[0]);

    src0 = LD_UB(src);
    src += src_stride;

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                    res_hz3);
        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);
        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
        SRARI_H2_UH(res_vt0, res_vt1, 6);
        SAT_UH2_UH(res_vt0, res_vt1, 7);
        PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);

        dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
        dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);

        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);

        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else {
        avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
                                              coef_hor0, coef_hor1,
                                              coef_ver0, coef_ver1, height);
    }
}
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2;
    v8u16 res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += src_stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                    res_hz4);
        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);

        res_vt0 += (res_hz0 * coeff_vt_vec1);
        res_vt1 += (res_hz1 * coeff_vt_vec1);
        res_vt2 += (res_hz2 * coeff_vt_vec1);
        res_vt3 += (res_hz3 * coeff_vt_vec1);

        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);

        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        res_hz0 = res_hz4;
    }
}
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}
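/*
 * copy_width8_msa and the avg_width* helpers below service the full-pel
 * case (x == 0 && y == 0): no filtering, just an 8- or 4-byte row copy,
 * or a rounded average against the existing dst for the "avg" variants.
 */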
static void avg_width4_msa(uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}
static void avg_width8_msa(uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_8w_msa(src, stride, dst,
                             stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width8_msa(src, stride, dst, stride, height);
    }
}
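/*
 * Dispatch in the entry points mirrors the math described at the top of
 * this file: x and y are the 1/8-pel fractional offsets (0..7); both
 * non-zero selects a 2-D kernel, exactly one non-zero selects a 1-D
 * kernel, and zero/zero is a plain copy (or, for the ff_avg_* versions,
 * a rounded average).
 */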
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_4w_msa(src, stride, dst,
                             stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint32_t *) dst) = *((uint32_t *) src);

            src += stride;
            dst += stride;
        }
    }
}
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, stride, dst,
                             stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint16_t *) dst) = *((uint16_t *) src);

            src += stride;
            dst += stride;
        }
    }
}
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst,
                                          stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        avg_width8_msa(src, stride, dst, stride, height);
    }
}
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst,
                                          stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        avg_width4_msa(src, stride, dst, stride, height);
    }
}
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst,
                                          stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;

            src += stride;
            dst += stride;
        }
    }
}