/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264chroma_mips.h"
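/*
 * H.264 chroma MC is a bilinear interpolation of the four neighbouring
 * chroma samples.  For 1/8-pel fractional offsets (dx, dy):
 *
 *   pred = ((8 - dx) * (8 - dy) * A + dx * (8 - dy) * B +
 *           (8 - dx) * dy * C + dx * dy * D + 32) >> 6
 *
 * The kernels below cover the separable cases (horizontal-only "hz",
 * vertical-only "vt") and the full 2-D case ("hv") for block widths
 * 2, 4 and 8, using MSA vector intrinsics.
 */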
static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
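/*
 * Each 16-byte row of chroma_mask_arr is a VSHF.B pattern that gathers
 * horizontally adjacent byte pairs; indices >= 16 select from the second
 * source vector.  A u8 dot product of the shuffled bytes with the
 * interleaved {coeff0, coeff1} vector then evaluates the horizontal
 * filter.  The rows are read below at byte offsets 0, 32, 48 and 64,
 * depending on the block width and on how many rows share one vector.
 */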
static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
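/*
 * The "res <<= 3" followed by __msa_srari_h(..., 6) used by the 1-D
 * kernels computes (sum * 8 + 32) >> 6 == (sum + 4) >> 3, i.e. the 1-D
 * rounding of the formula above; __msa_sat_u_h(..., 7) then clamps each
 * halfword to the unsigned 8-bit range before packing back to bytes.
 */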
static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST2x4_UB(res, 0, dst, stride);
}
static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
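/* The 4-wide horizontal kernels shuffle two picture rows into one vector,
 * so every dot product below filters a pair of rows at once. */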
static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 src0, src1;
    v8u16 res_r;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST4x2_UB(res, dst, stride);
}
static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out;
    v8u16 res0_r, res1_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);

    res0_r <<= 3;
    res1_r <<= 3;

    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST4x8_UB(out0, out1, dst, stride);
}
static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
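/* The 8-wide horizontal kernels use the mask row at offset 32, whose
 * sixteen indices all fall in the first source vector: one shuffled
 * vector holds the eight filtered pairs of a single row. */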
static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST8x4_UB(out0, out1, dst, stride);
}
static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);
        src += (4 * stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST8x4_UB(out0, out1, dst, stride);
        dst += (4 * stride);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = LD_UB(src);
            src += stride;

            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);

            ST8x1_UB(res0, dst);
            dst += stride;
        }
    }
}
static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}
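/*
 * Vertical kernels: ILVR.B interleaves each row with the row below it,
 * placing vertically adjacent samples side by side, so the same
 * {coeff0, coeff1} dot product used horizontally also evaluates the
 * vertical filter.
 */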
static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST2x4_UB(res, 0, dst, stride);
}
static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v4i32 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST4x2_UB(res, dst, stride);
}
static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);

    res0_r <<= 3;
    res1_r <<= 3;

    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST4x8_UB(out0, out1, dst, stride);
}
static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST8x4_UB(out0, out1, dst, stride);
}
static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
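/*
 * 2-D "hv" kernels: the horizontal dot product is kept at 16-bit
 * precision (res_hz*), multiplied by the vertical coefficients and
 * summed, and only the combined value is rounded with srari(..., 6),
 * matching the single "+ 32 >> 6" rounding of the 2-D formula.
 */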
static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res_vert;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST2x4_UB(res, 0, dst, stride);
}
static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}
static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v4i32 res;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);
    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST4x2_UB(res, dst, stride);
}
static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v4i32 res0, res1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    ST4x8_UB(res0, res1, dst, stride);
}
static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}
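/* In the 8-wide hv kernels each row fills a whole vector, so the
 * horizontal result of the topmost row (res_hz0) is computed once up
 * front and serves as the "row above" term of the vertical filter. */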
static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    LD_UB4(src, stride, src1, src2, src3, src4);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);

    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);

    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    ST8x4_UB(out0, out1, dst, stride);
}
static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}
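/*
 * The *_and_aver_dst_* variants below implement the "avg" form of the MC
 * functions: the filtered result is combined with the bytes already in
 * dst using __msa_aver_u_b (rounding byte average) before being stored.
 */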
static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b(res, dst_data);

    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST2x4_UB(dst0, 0, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 res, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);

    ST4x2_UB(dst_data, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 out, dst_data = { 0 };
    v16i8 mask;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);

    res0_r <<= 3;
    res1_r <<= 3;

    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst_data);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST4x8_UB(out0, out1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST8x4_UB(dst0, dst1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}
static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    uint32_t load0, load1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, src_stride, src0, src1, src2);
    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);

    load0 = LW(dst + 2 * dst_stride);
    load1 = LW(dst + 3 * dst_stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

    ST2x4_UB(res, 0, dst, dst_stride);
    dst += (4 * dst_stride);
}
static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1, load2, load3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data0 = { 0 };
    v16u8 dst_data1 = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    LW4(dst, dst_stride, load0, load1, load2, load3);

    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);

    LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);

    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);

    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);

    ST2x4_UB(res, 0, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               tmp0, tmp1, tmp2, tmp3);

    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);

    ST2x4_UB(res, 0, dst, dst_stride);
}
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    }
}
static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coeff0, uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v16u8 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, src_stride, src0, src1, src2);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    INSERT_W2_UB(load0, load1, dst_data);
    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

    ST4x2_UB(res, dst, dst_stride);
}
static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  uint32_t coeff0,
                                                  uint32_t coeff1,
                                                  int32_t height)
{
    uint32_t load0, load1, row;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16u8 res0, res1;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    src0 = LD_SB(src);
    src += src_stride;

    for (row = (height >> 2); row--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        load0 = LW(dst);
        load1 = LW(dst + dst_stride);

        INSERT_W2_UB(load0, load1, dst0);
        load0 = LW(dst + 2 * dst_stride);
        load1 = LW(dst + 3 * dst_stride);
        INSERT_W2_UB(load0, load1, dst1);

        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   tmp0, tmp1, tmp2, tmp3);
        ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
        DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);

        res0_r <<= 3;
        res1_r <<= 3;

        SRARI_H2_UH(res0_r, res1_r, 6);
        SAT_UH2_UH(res0_r, res1_r, 7);
        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);

        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
                                           coeff0, coeff1);
    } else {
        avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
                                              coeff0, coeff1, height);
    }
}
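/* The 8-wide vertical averaging loop below carries src4 over as the next
 * iteration's src0, so every source row is loaded exactly once; the four
 * dst rows are packed pairwise with PCKEV_D2_UB to line up with the two
 * packed result vectors. */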
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coeff0, uint32_t coeff1,
                                              int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    src0 = LD_UB(src);
    src += src_stride;

    for (row = height >> 2; row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
                   src0, src1, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0, dst1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, src_stride, src0, src1, src2);
    LD_UB2(dst, dst_stride, dst0, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

    SH(out0, dst);
    dst += dst_stride;
    SH(out1, dst);
}
static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST2x4_UB(dst0, 0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_UB4(src, src_stride, src5, src6, src7, src8);

    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);

    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST2x4_UB(dst0, 0, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst4 = __msa_aver_u_b((v16u8) res, dst4);

    ST2x4_UB(dst4, 0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    }
}
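/* The 4-wide hv averaging variants below gather two dst rows into one
 * vector with __msa_insve_w before taking the byte average. */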
static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v16u8 dst0, dst1;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB3(src, src_stride, src0, src1, src2);
    LD_UB2(dst, dst_stride, dst0, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST4x2_UB(dst0, dst, dst_stride);
}
static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  uint32_t coef_hor0,
                                                  uint32_t coef_hor1,
                                                  uint32_t coef_ver0,
                                                  uint32_t coef_ver1,
                                                  int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 res0, res1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    src0 = LD_UB(src);
    src += src_stride;

    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                    res_hz3);
        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);
        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
        SRARI_H2_UH(res_vt0, res_vt1, 6);
        SAT_UH2_UH(res_vt0, res_vt1, 7);
        PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);

        dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
        dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);

        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);

        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
    }
}
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
                                           coef_hor0, coef_hor1,
                                           coef_ver0, coef_ver1);
    } else {
        avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
                                              coef_hor0, coef_hor1,
                                              coef_ver0, coef_ver1, height);
    }
}
1713 static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
1714 uint8_t *dst, int32_t dst_stride,
1722 v16u8 src0, src1, src2, src3, src4, out0, out1;
1723 v8u16 res_hz0, res_hz1, res_hz2;
1724 v8u16 res_hz3, res_hz4;
1725 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1726 v16u8 dst0, dst1, dst2, dst3;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += src_stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
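
    /* The horizontal result of the topmost row (res_hz0) is computed once
     * outside the loop; each iteration combines it with the next row's
     * result for the vertical filter and hands res_hz4 over as the next
     * res_hz0, so no row is filtered horizontally twice. */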
    for (row = (height >> 2); row--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                    res_hz4);
        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
             res_vt3);

        res_vt0 += (res_hz0 * coeff_vt_vec1);
        res_vt1 += (res_hz1 * coeff_vt_vec1);
        res_vt2 += (res_hz2 * coeff_vt_vec1);
        res_vt3 += (res_hz3 * coeff_vt_vec1);

        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);

        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        res_hz0 = res_hz4;  /* carry the bottom row's horizontal result */
    }
}
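/* Straight 8-byte-wide block copy, used by the put entry point when
 * x == y == 0 and no filtering is required. */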
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
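
    /* A height that is a multiple of 12 is copied as 8 + 4 rows per
     * iteration; other heights fall back to 8-, 4- or 2-row steps. */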
    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}
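/* 4-byte-wide rounding average of src and dst rows, used by the avg entry
 * point when x == y == 0. */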
static void avg_width4_msa(uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}
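/* 8-byte-wide rounding average of src and dst rows; the loop assumes
 * height is a multiple of 4. */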
static void avg_width8_msa(uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
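/* Public entry points. x and y are the fractional parts of the chroma
 * motion vector (0..7). Each output pixel is the H.264 bilinear
 * interpolation
 *
 *     ((8 - x) * (8 - y) * A + x * (8 - y) * B +
 *      (8 - x) * y * C + x * y * D + 32) >> 6
 *
 * of its four neighbouring source pixels A..D, which is why the helpers
 * take the tap pairs (x, 8 - x) and (y, 8 - y). When x or y is zero the
 * corresponding pass degenerates and a cheaper horizontal-only,
 * vertical-only or plain copy/average path is taken instead. */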
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width8_msa(src, stride, dst, stride, height);
    }
}
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint32_t *) dst) = *((uint32_t *) src);

            src += stride;
            dst += stride;
        }
    }
}
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint16_t *) dst) = *((uint16_t *) src);

            src += stride;
            dst += stride;
        }
    }
}
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        avg_width8_msa(src, stride, dst, stride, height);
    }
}
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        avg_width4_msa(src, stride, dst, stride, height);
    }
}
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
                                          stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
                                          stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;

            src += stride;
            dst += stride;
        }
    }
}