2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "qpeldsp_mips.h"
24 #define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
26 v16u8 out, tmp0, tmp1; \
27 v16u8 data0, data1, data2, data3, data4, data5; \
29 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
30 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
32 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
33 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
34 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
35 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
36 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
37 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
38 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
39 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
40 sum0_r *= (v8u16) (coef0); \
41 sum0_l *= (v8u16) (coef0); \
42 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
43 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
44 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
45 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
46 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
47 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
48 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
49 res_r = (v8i16) (sum0_r - sum3_r); \
50 res_l = (v8i16) (sum0_l - sum3_l); \
51 SRARI_H2_SH(res_r, res_l, 5); \
52 CLIP_SH2_0_255(res_r, res_l); \
53 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
58 #define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
59 mask0, mask1, mask2, mask3, \
60 coef0, coef1, coef2) \
63 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
64 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
65 v8i16 res0_r, res1_r; \
67 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
68 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
69 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
70 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
71 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
72 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
73 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
74 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
75 res0_r = (v8i16) (sum0_r - sum3_r); \
76 res1_r = (v8i16) (sum4_r - sum7_r); \
77 SRARI_H2_SH(res0_r, res1_r, 5); \
78 CLIP_SH2_0_255(res0_r, res1_r); \
79 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
84 #define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
85 mask0, mask1, mask2, mask3, \
86 coef0, coef1, coef2) \
90 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
92 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
93 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
94 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
95 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
96 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
97 res0_r = (v8i16) (sum0_r - sum3_r); \
98 res0_r = __msa_srari_h(res0_r, 5); \
99 res0_r = CLIP_SH_0_255(res0_r); \
100 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
105 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
106 mask2, mask3, coef0, \
111 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
113 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
114 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
115 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
116 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
117 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
118 res0_r = (v8i16) (sum0_r - sum3_r); \
121 res0_r = CLIP_SH_0_255(res0_r); \
122 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
127 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
128 coef0, coef1, coef2) \
130 v16u8 out, tmp0, tmp1; \
131 v16u8 data0, data1, data2, data3, data4, data5; \
132 v8i16 res_r, res_l; \
133 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
134 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
136 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
137 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
138 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
139 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
140 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
141 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
142 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
143 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
144 sum0_r *= (v8u16) (coef0); \
145 sum0_l *= (v8u16) (coef0); \
146 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
147 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
148 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
149 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
150 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
151 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
152 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
153 res_r = (v8i16) (sum0_r - sum3_r); \
154 res_l = (v8i16) (sum0_l - sum3_l); \
159 CLIP_SH2_0_255(res_r, res_l); \
160 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
165 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
166 mask0, mask1, mask2, mask3, \
167 coef0, coef1, coef2) \
170 v8i16 res0_r, res1_r; \
171 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
172 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
174 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
175 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
176 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
177 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
178 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
179 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
180 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
181 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
182 res0_r = (v8i16) (sum0_r - sum3_r); \
183 res1_r = (v8i16) (sum4_r - sum7_r); \
188 CLIP_SH2_0_255(res0_r, res1_r); \
189 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
194 #define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
195 inp4, inp5, inp6, inp7, \
196 coef0, coef1, coef2) \
199 v8i16 res_r, res_l; \
200 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
201 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
203 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
204 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
205 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
206 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
207 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
208 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
209 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
210 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
211 res_r = (v8i16) (sum0_r - sum3_r); \
212 res_l = (v8i16) (sum0_l - sum3_l); \
213 SRARI_H2_SH(res_r, res_l, 5); \
214 CLIP_SH2_0_255(res_r, res_l); \
215 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
220 #define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
221 inp04, inp05, inp06, inp07, \
222 inp10, inp11, inp12, inp13, \
223 inp14, inp15, inp16, inp17, \
224 coef0, coef1, coef2) \
228 v8u16 sum00, sum01, sum02, sum03; \
229 v8u16 sum10, sum11, sum12, sum13; \
231 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
232 sum00, sum10, sum03, sum13); \
233 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
234 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
235 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
236 sum02, sum12, sum01, sum11); \
237 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
238 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
239 val0 = (v8i16) (sum00 - sum03); \
240 val1 = (v8i16) (sum10 - sum13); \
241 SRARI_H2_SH(val0, val1, 5); \
242 CLIP_SH2_0_255(val0, val1); \
243 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
248 #define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
249 inp4, inp5, inp6, inp7, \
250 coef0, coef1, coef2) \
253 v8i16 res_r, res_l; \
254 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
255 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
257 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
258 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
259 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
260 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
261 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
262 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
263 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
264 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
265 res_r = (v8i16) (sum0_r - sum3_r); \
266 res_l = (v8i16) (sum0_l - sum3_l); \
271 CLIP_SH2_0_255(res_r, res_l); \
272 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
277 #define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
278 inp04, inp05, inp06, inp07, \
279 inp10, inp11, inp12, inp13, \
280 inp14, inp15, inp16, inp17, \
281 coef0, coef1, coef2) \
285 v8u16 sum00, sum01, sum02, sum03; \
286 v8u16 sum10, sum11, sum12, sum13; \
288 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
289 sum00, sum10, sum03, sum13); \
290 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
291 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
292 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
293 sum02, sum12, sum01, sum11); \
294 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
295 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
296 val0 = (v8i16) (sum00 - sum03); \
297 val1 = (v8i16) (sum10 - sum13); \
302 CLIP_SH2_0_255(val0, val1); \
303 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
308 static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
315 v16u8 inp0, inp1, inp2, inp3;
317 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321 v16u8 const20 = (v16u8) __msa_ldi_b(20);
322 v16u8 const6 = (v16u8) __msa_ldi_b(6);
323 v16u8 const3 = (v16u8) __msa_ldi_b(3);
325 for (loop_count = (height >> 2); loop_count--;) {
326 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327 src += (4 * src_stride);
328 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
329 mask0, mask1, mask2, mask3,
330 const20, const6, const3);
331 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
332 mask0, mask1, mask2, mask3,
333 const20, const6, const3);
334 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
336 AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
337 ST8x4_UB(res0, res1, dst, dst_stride);
338 dst += (4 * dst_stride);
342 static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
349 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
351 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352 v16u8 const6 = (v16u8) __msa_ldi_b(6);
353 v16u8 const3 = (v16u8) __msa_ldi_b(3);
354 v8u16 const20 = (v8u16) __msa_ldi_h(20);
356 for (loop_count = (height >> 2); loop_count--;) {
357 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359 src += (4 * src_stride);
360 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
361 const20, const6, const3);
362 res = __msa_aver_u_b(inp0, res);
366 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
367 const20, const6, const3);
368 res = __msa_aver_u_b(inp2, res);
372 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
373 const20, const6, const3);
374 res = __msa_aver_u_b(inp4, res);
378 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
379 const20, const6, const3);
380 res = __msa_aver_u_b(inp6, res);
386 static void horiz_mc_qpel_8width_msa(const uint8_t *src,
393 v16u8 inp0, inp1, inp2, inp3;
395 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399 v16u8 const20 = (v16u8) __msa_ldi_b(20);
400 v16u8 const6 = (v16u8) __msa_ldi_b(6);
401 v16u8 const3 = (v16u8) __msa_ldi_b(3);
403 for (loop_count = (height >> 2); loop_count--;) {
404 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405 src += (4 * src_stride);
406 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
407 mask0, mask1, mask2, mask3,
408 const20, const6, const3);
409 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
410 mask0, mask1, mask2, mask3,
411 const20, const6, const3);
412 ST8x4_UB(res0, res1, dst, dst_stride);
413 dst += (4 * dst_stride);
417 static void horiz_mc_qpel_16width_msa(const uint8_t *src,
424 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
426 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427 v8u16 const20 = (v8u16) __msa_ldi_h(20);
428 v16u8 const6 = (v16u8) __msa_ldi_b(6);
429 v16u8 const3 = (v16u8) __msa_ldi_b(3);
431 for (loop_count = (height >> 2); loop_count--;) {
432 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434 src += (4 * src_stride);
435 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
436 const20, const6, const3);
440 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
441 const20, const6, const3);
445 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
446 const20, const6, const3);
450 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
451 const20, const6, const3);
457 static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
464 v16u8 inp0, inp1, inp2, inp3;
466 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470 v16u8 const20 = (v16u8) __msa_ldi_b(20);
471 v16u8 const6 = (v16u8) __msa_ldi_b(6);
472 v16u8 const3 = (v16u8) __msa_ldi_b(3);
474 for (loop_count = (height >> 2); loop_count--;) {
475 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476 src += (4 * src_stride);
477 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
478 mask0, mask1, mask2, mask3,
479 const20, const6, const3);
480 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
481 mask0, mask1, mask2, mask3,
482 const20, const6, const3);
483 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
484 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
485 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
487 AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
488 ST8x4_UB(res0, res1, dst, dst_stride);
489 dst += (4 * dst_stride);
493 static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
500 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
502 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503 v8u16 const20 = (v8u16) __msa_ldi_h(20);
504 v16u8 const6 = (v16u8) __msa_ldi_b(6);
505 v16u8 const3 = (v16u8) __msa_ldi_b(3);
507 for (loop_count = (height >> 2); loop_count--;) {
508 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510 src += (4 * src_stride);
511 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
512 const20, const6, const3);
513 res = __msa_aver_u_b(res, inp1);
517 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
518 const20, const6, const3);
519 res = __msa_aver_u_b(res, inp3);
523 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
524 const20, const6, const3);
525 res = __msa_aver_u_b(res, inp5);
529 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
530 const20, const6, const3);
531 res = __msa_aver_u_b(res, inp7);
537 static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
544 v16u8 inp0, inp1, inp2, inp3;
546 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550 v16u8 const20 = (v16u8) __msa_ldi_b(20);
551 v16u8 const6 = (v16u8) __msa_ldi_b(6);
552 v16u8 const3 = (v16u8) __msa_ldi_b(3);
554 for (loop_count = (height >> 2); loop_count--;) {
555 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556 src += (4 * src_stride);
557 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
558 mask2, mask3, const20,
560 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
561 mask2, mask3, const20,
563 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565 res0 = __msa_ave_u_b(inp0, res0);
566 res1 = __msa_ave_u_b(inp2, res1);
567 ST8x4_UB(res0, res1, dst, dst_stride);
568 dst += (4 * dst_stride);
572 static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
579 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
581 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582 v8u16 const20 = (v8u16) __msa_ldi_h(20);
583 v16u8 const6 = (v16u8) __msa_ldi_b(6);
584 v16u8 const3 = (v16u8) __msa_ldi_b(3);
586 for (loop_count = (height >> 2); loop_count--;) {
587 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589 src += (4 * src_stride);
590 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
591 const20, const6, const3);
592 res = __msa_ave_u_b(inp0, res);
596 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
597 const20, const6, const3);
598 res = __msa_ave_u_b(inp2, res);
602 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
603 const20, const6, const3);
604 res = __msa_ave_u_b(inp4, res);
608 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
609 const20, const6, const3);
610 res = __msa_ave_u_b(inp6, res);
616 static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
623 v16u8 inp0, inp1, inp2, inp3;
625 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629 v16u8 const20 = (v16u8) __msa_ldi_b(20);
630 v16u8 const6 = (v16u8) __msa_ldi_b(6);
631 v16u8 const3 = (v16u8) __msa_ldi_b(3);
633 for (loop_count = (height >> 2); loop_count--;) {
634 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635 src += (4 * src_stride);
636 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
637 mask2, mask3, const20,
639 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
640 mask2, mask3, const20,
642 ST8x4_UB(res0, res1, dst, dst_stride);
643 dst += (4 * dst_stride);
647 static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
654 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
656 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657 v16u8 const6 = (v16u8) __msa_ldi_b(6);
658 v16u8 const3 = (v16u8) __msa_ldi_b(3);
659 v8u16 const20 = (v8u16) __msa_ldi_h(20);
661 for (loop_count = (height >> 2); loop_count--;) {
662 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664 src += (4 * src_stride);
665 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
666 const20, const6, const3);
670 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
671 const20, const6, const3);
675 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
676 const20, const6, const3);
680 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
681 const20, const6, const3);
687 static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
694 v16u8 inp0, inp1, inp2, inp3;
696 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700 v16u8 const20 = (v16u8) __msa_ldi_b(20);
701 v16u8 const6 = (v16u8) __msa_ldi_b(6);
702 v16u8 const3 = (v16u8) __msa_ldi_b(3);
704 for (loop_count = (height >> 2); loop_count--;) {
705 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706 src += (4 * src_stride);
707 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
708 mask2, mask3, const20,
710 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
711 mask2, mask3, const20,
713 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
714 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
715 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717 res0 = __msa_ave_u_b(inp0, res0);
718 res1 = __msa_ave_u_b(inp2, res1);
719 ST8x4_UB(res0, res1, dst, dst_stride);
720 dst += (4 * dst_stride);
724 static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
731 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
733 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734 v16u8 const6 = (v16u8) __msa_ldi_b(6);
735 v16u8 const3 = (v16u8) __msa_ldi_b(3);
736 v8u16 const20 = (v8u16) __msa_ldi_h(20);
738 for (loop_count = (height >> 2); loop_count--;) {
739 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741 src += (4 * src_stride);
742 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
743 const20, const6, const3);
744 res = __msa_ave_u_b(res, inp1);
748 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
749 const20, const6, const3);
750 res = __msa_ave_u_b(res, inp3);
754 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
755 const20, const6, const3);
756 res = __msa_ave_u_b(res, inp5);
760 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
761 const20, const6, const3);
762 res = __msa_ave_u_b(res, inp7);
768 static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
775 v16u8 inp0, inp1, inp2, inp3;
776 v16u8 dst0, dst1, dst2, dst3;
778 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782 v16u8 const20 = (v16u8) __msa_ldi_b(20);
783 v16u8 const6 = (v16u8) __msa_ldi_b(6);
784 v16u8 const3 = (v16u8) __msa_ldi_b(3);
786 for (loop_count = (height >> 2); loop_count--;) {
787 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788 src += (4 * src_stride);
789 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
790 mask0, mask1, mask2, mask3,
791 const20, const6, const3);
792 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
793 mask0, mask1, mask2, mask3,
794 const20, const6, const3);
795 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
796 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
800 AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
801 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
802 ST8x4_UB(res0, res1, dst, dst_stride);
803 dst += (4 * dst_stride);
807 static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
814 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
817 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818 v16u8 const6 = (v16u8) __msa_ldi_b(6);
819 v16u8 const3 = (v16u8) __msa_ldi_b(3);
820 v8u16 const20 = (v8u16) __msa_ldi_h(20);
822 for (loop_count = (height >> 2); loop_count--;) {
823 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825 src += (4 * src_stride);
826 res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
827 const20, const6, const3);
828 res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
829 const20, const6, const3);
830 LD_UB2(dst, dst_stride, dst0, dst1);
831 AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
832 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
833 ST_UB2(res0, res1, dst, dst_stride);
834 dst += (2 * dst_stride);
836 res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
837 const20, const6, const3);
838 res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
839 const20, const6, const3);
840 LD_UB2(dst, dst_stride, dst0, dst1);
841 AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
842 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
843 ST_UB2(res0, res1, dst, dst_stride);
844 dst += (2 * dst_stride);
848 static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
855 v16u8 inp0, inp1, inp2, inp3;
856 v16u8 dst0, dst1, dst2, dst3;
858 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862 v16u8 const20 = (v16u8) __msa_ldi_b(20);
863 v16u8 const6 = (v16u8) __msa_ldi_b(6);
864 v16u8 const3 = (v16u8) __msa_ldi_b(3);
866 for (loop_count = (height >> 2); loop_count--;) {
867 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868 src += (4 * src_stride);
869 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
870 mask0, mask1, mask2, mask3,
871 const20, const6, const3);
872 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
873 mask0, mask1, mask2, mask3,
874 const20, const6, const3);
875 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
878 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
879 ST8x4_UB(res0, res1, dst, dst_stride);
880 dst += (4 * dst_stride);
884 static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
891 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
894 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895 v16u8 const6 = (v16u8) __msa_ldi_b(6);
896 v16u8 const3 = (v16u8) __msa_ldi_b(3);
897 v8u16 const20 = (v8u16) __msa_ldi_h(20);
899 for (loop_count = (height >> 2); loop_count--;) {
900 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902 src += (4 * src_stride);
903 res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
904 const20, const6, const3);
905 res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
906 const20, const6, const3);
907 LD_UB2(dst, dst_stride, dst0, dst1);
908 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
909 ST_UB2(res0, res1, dst, dst_stride);
910 dst += (2 * dst_stride);
912 res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
913 const20, const6, const3);
914 res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
915 const20, const6, const3);
916 LD_UB2(dst, dst_stride, dst0, dst1);
917 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
918 ST_UB2(res0, res1, dst, dst_stride);
919 dst += (2 * dst_stride);
923 static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
930 v16u8 inp0, inp1, inp2, inp3;
931 v16u8 dst0, dst1, dst2, dst3;
933 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937 v16u8 const20 = (v16u8) __msa_ldi_b(20);
938 v16u8 const6 = (v16u8) __msa_ldi_b(6);
939 v16u8 const3 = (v16u8) __msa_ldi_b(3);
941 for (loop_count = (height >> 2); loop_count--;) {
942 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943 src += (4 * src_stride);
944 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
945 mask0, mask1, mask2, mask3,
946 const20, const6, const3);
947 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
948 mask0, mask1, mask2, mask3,
949 const20, const6, const3);
950 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
951 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
952 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
953 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
957 AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
958 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
959 ST8x4_UB(res0, res1, dst, dst_stride);
960 dst += (4 * dst_stride);
964 static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
971 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972 v16u8 res0, res1, dst0, dst1;
973 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974 v16u8 const6 = (v16u8) __msa_ldi_b(6);
975 v16u8 const3 = (v16u8) __msa_ldi_b(3);
976 v8u16 const20 = (v8u16) __msa_ldi_h(20);
978 for (loop_count = (height >> 2); loop_count--;) {
979 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981 src += (4 * src_stride);
982 res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
983 const20, const6, const3);
984 res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
985 const20, const6, const3);
986 LD_UB2(dst, dst_stride, dst0, dst1);
987 AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
988 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
989 ST_UB2(res0, res1, dst, dst_stride);
990 dst += (2 * dst_stride);
991 res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
992 const20, const6, const3);
993 res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
994 const20, const6, const3);
995 LD_UB2(dst, dst_stride, dst0, dst1);
996 AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
997 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
998 ST_UB2(res0, res1, dst, dst_stride);
999 dst += (2 * dst_stride);
1004 static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
1009 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010 v16u8 tmp0, tmp1, res0, res1;
1011 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1015 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016 src += (4 * src_stride);
1017 LD_UB2(src, src_stride, inp4, inp5);
1018 src += (2 * src_stride);
1019 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1020 inp1, inp2, inp3, inp4,
1021 inp1, inp0, inp0, inp1,
1022 inp2, inp3, inp4, inp5,
1023 const20, const6, const3);
1024 LD_UB2(src, src_stride, inp6, inp7);
1025 src += (2 * src_stride);
1026 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1027 inp3, inp4, inp5, inp6,
1028 inp3, inp2, inp1, inp0,
1029 inp4, inp5, inp6, inp7,
1030 const20, const6, const3);
1031 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1033 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1034 ST8x4_UB(res0, res1, dst, dst_stride);
1035 dst += (4 * dst_stride);
1038 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1039 inp5, inp6, inp7, inp8,
1040 inp5, inp4, inp3, inp2,
1041 inp6, inp7, inp8, inp8,
1042 const20, const6, const3);
1043 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1044 inp7, inp8, inp8, inp7,
1045 inp7, inp6, inp5, inp4,
1046 inp8, inp8, inp7, inp6,
1047 const20, const6, const3);
1048 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1049 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1050 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1051 ST8x4_UB(res0, res1, dst, dst_stride);
1052 dst += (4 * dst_stride);
1055 static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
1060 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1061 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1063 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1064 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1065 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1067 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1068 src += (5 * src_stride);
1069 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1070 inp1, inp2, inp3, inp4,
1071 const20, const6, const3);
1072 res0 = __msa_aver_u_b(res0, inp0);
1078 res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1079 inp2, inp3, inp4, inp5,
1080 const20, const6, const3);
1081 res0 = __msa_aver_u_b(res0, inp1);
1087 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1088 inp3, inp4, inp5, inp6,
1089 const20, const6, const3);
1090 res0 = __msa_aver_u_b(res0, inp2);
1096 res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1097 inp4, inp5, inp6, inp7,
1098 const20, const6, const3);
1099 res0 = __msa_aver_u_b(res0, inp3);
1103 LD_UB2(src, src_stride, inp8, inp9);
1104 src += (2 * src_stride);
1105 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1106 inp5, inp6, inp7, inp8,
1107 const20, const6, const3);
1108 res0 = __msa_aver_u_b(res0, inp4);
1112 res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1113 inp6, inp7, inp8, inp9,
1114 const20, const6, const3);
1115 res0 = __msa_aver_u_b(res0, inp5);
1119 LD_UB2(src, src_stride, inp10, inp11);
1120 src += (2 * src_stride);
1121 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1122 inp7, inp8, inp9, inp10,
1123 const20, const6, const3);
1124 res0 = __msa_aver_u_b(res0, inp6);
1128 res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1129 inp8, inp9, inp10, inp11,
1130 const20, const6, const3);
1131 res0 = __msa_aver_u_b(res0, inp7);
1135 LD_UB2(src, src_stride, inp12, inp13);
1136 src += (2 * src_stride);
1137 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1138 inp9, inp10, inp11, inp12,
1139 const20, const6, const3);
1140 res0 = __msa_aver_u_b(res0, inp8);
1144 res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1145 inp10, inp11, inp12, inp13,
1146 const20, const6, const3);
1147 res0 = __msa_aver_u_b(res0, inp9);
1151 LD_UB2(src, src_stride, inp14, inp15);
1152 src += (2 * src_stride);
1153 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1154 inp11, inp12, inp13, inp14,
1155 const20, const6, const3);
1156 res0 = __msa_aver_u_b(res0, inp10);
1160 res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1161 inp12, inp13, inp14, inp15,
1162 const20, const6, const3);
1163 res0 = __msa_aver_u_b(res0, inp11);
1168 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1169 inp13, inp14, inp15, inp16,
1170 const20, const6, const3);
1171 res0 = __msa_aver_u_b(res0, inp12);
1175 res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1176 inp14, inp15, inp16, inp16,
1177 const20, const6, const3);
1178 res0 = __msa_aver_u_b(res0, inp13);
1182 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1183 inp15, inp16, inp16, inp15,
1184 const20, const6, const3);
1185 res0 = __msa_aver_u_b(res0, inp14);
1189 res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1190 inp16, inp16, inp15, inp14,
1191 const20, const6, const3);
1192 res0 = __msa_aver_u_b(res0, inp15);
1196 static void vert_mc_qpel_8x8_msa(const uint8_t *src,
1201 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1203 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1204 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1205 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1207 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1208 src += (4 * src_stride);
1209 LD_UB2(src, src_stride, inp4, inp5);
1210 src += (2 * src_stride);
1211 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1212 inp1, inp2, inp3, inp4,
1213 inp1, inp0, inp0, inp1,
1214 inp2, inp3, inp4, inp5,
1215 const20, const6, const3);
1216 LD_UB2(src, src_stride, inp6, inp7);
1217 src += (2 * src_stride);
1218 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1219 inp3, inp4, inp5, inp6,
1220 inp3, inp2, inp1, inp0,
1221 inp4, inp5, inp6, inp7,
1222 const20, const6, const3);
1223 ST8x4_UB(res0, res1, dst, dst_stride);
1224 dst += (4 * dst_stride);
1227 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1228 inp5, inp6, inp7, inp8,
1229 inp5, inp4, inp3, inp2,
1230 inp6, inp7, inp8, inp8,
1231 const20, const6, const3);
1232 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1233 inp7, inp8, inp8, inp7,
1234 inp7, inp6, inp5, inp4,
1235 inp8, inp8, inp7, inp6,
1236 const20, const6, const3);
1237 ST8x4_UB(res0, res1, dst, dst_stride);
1238 dst += (4 * dst_stride);
1241 static void vert_mc_qpel_16x16_msa(const uint8_t *src,
1246 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1247 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1249 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1250 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1251 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1253 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1254 src += (4 * src_stride);
1257 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1258 inp1, inp2, inp3, inp4,
1259 const20, const6, const3);
1265 res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1266 inp2, inp3, inp4, inp5,
1267 const20, const6, const3);
1273 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1274 inp3, inp4, inp5, inp6,
1275 const20, const6, const3);
1281 res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1282 inp4, inp5, inp6, inp7,
1283 const20, const6, const3);
1289 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1290 inp5, inp6, inp7, inp8,
1291 const20, const6, const3);
1297 res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1298 inp6, inp7, inp8, inp9,
1299 const20, const6, const3);
1305 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1306 inp7, inp8, inp9, inp10,
1307 const20, const6, const3);
1313 res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1314 inp8, inp9, inp10, inp11,
1315 const20, const6, const3);
1321 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1322 inp9, inp10, inp11, inp12,
1323 const20, const6, const3);
1329 res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1330 inp10, inp11, inp12, inp13,
1331 const20, const6, const3);
1337 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1338 inp11, inp12, inp13, inp14,
1339 const20, const6, const3);
1345 res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1346 inp12, inp13, inp14, inp15,
1347 const20, const6, const3);
1352 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1353 inp13, inp14, inp15, inp16,
1354 const20, const6, const3);
1358 res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1359 inp14, inp15, inp16, inp16,
1360 const20, const6, const3);
1364 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1365 inp15, inp16, inp16, inp15,
1366 const20, const6, const3);
1370 res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1371 inp16, inp16, inp15, inp14,
1372 const20, const6, const3);
1377 static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
1382 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1383 v16u8 tmp0, tmp1, res0, res1;
1384 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1385 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1386 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1388 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1389 src += (4 * src_stride);
1390 LD_UB2(src, src_stride, inp4, inp5);
1391 src += (2 * src_stride);
1392 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1393 inp1, inp2, inp3, inp4,
1394 inp1, inp0, inp0, inp1,
1395 inp2, inp3, inp4, inp5,
1396 const20, const6, const3);
1398 LD_UB2(src, src_stride, inp6, inp7);
1399 src += (2 * src_stride);
1400 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1401 inp3, inp4, inp5, inp6,
1402 inp3, inp2, inp1, inp0,
1403 inp4, inp5, inp6, inp7,
1404 const20, const6, const3);
1405 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1406 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1407 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1408 ST8x4_UB(res0, res1, dst, dst_stride);
1409 dst += (4 * dst_stride);
1412 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1413 inp5, inp6, inp7, inp8,
1414 inp5, inp4, inp3, inp2,
1415 inp6, inp7, inp8, inp8,
1416 const20, const6, const3);
1417 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1418 inp7, inp8, inp8, inp7,
1419 inp7, inp6, inp5, inp4,
1420 inp8, inp8, inp7, inp6,
1421 const20, const6, const3);
1422 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1423 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1424 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1425 ST8x4_UB(res0, res1, dst, dst_stride);
1428 static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
1433 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1434 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1436 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1437 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1438 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1440 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1441 src += (4 * src_stride);
1444 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1445 inp1, inp2, inp3, inp4,
1446 const20, const6, const3);
1447 res0 = __msa_aver_u_b(res0, inp1);
1453 res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1454 inp2, inp3, inp4, inp5,
1455 const20, const6, const3);
1456 res0 = __msa_aver_u_b(res0, inp2);
1462 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1463 inp3, inp4, inp5, inp6,
1464 const20, const6, const3);
1465 res0 = __msa_aver_u_b(res0, inp3);
1471 res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1472 inp4, inp5, inp6, inp7,
1473 const20, const6, const3);
1474 res0 = __msa_aver_u_b(res0, inp4);
1480 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1481 inp5, inp6, inp7, inp8,
1482 const20, const6, const3);
1483 res0 = __msa_aver_u_b(res0, inp5);
1489 res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1490 inp6, inp7, inp8, inp9,
1491 const20, const6, const3);
1492 res0 = __msa_aver_u_b(res0, inp6);
1498 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1499 inp7, inp8, inp9, inp10,
1500 const20, const6, const3);
1501 res0 = __msa_aver_u_b(res0, inp7);
1507 res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1508 inp8, inp9, inp10, inp11,
1509 const20, const6, const3);
1510 res0 = __msa_aver_u_b(res0, inp8);
1516 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1517 inp9, inp10, inp11, inp12,
1518 const20, const6, const3);
1519 res0 = __msa_aver_u_b(res0, inp9);
1525 res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1526 inp10, inp11, inp12, inp13,
1527 const20, const6, const3);
1528 res0 = __msa_aver_u_b(res0, inp10);
1534 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1535 inp11, inp12, inp13, inp14,
1536 const20, const6, const3);
1537 res0 = __msa_aver_u_b(res0, inp11);
1543 res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1544 inp12, inp13, inp14, inp15,
1545 const20, const6, const3);
1546 res0 = __msa_aver_u_b(res0, inp12);
1551 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1552 inp13, inp14, inp15, inp16,
1553 const20, const6, const3);
1554 res0 = __msa_aver_u_b(res0, inp13);
1558 res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1559 inp14, inp15, inp16, inp16,
1560 const20, const6, const3);
1561 res0 = __msa_aver_u_b(res0, inp14);
1565 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1566 inp15, inp16, inp16, inp15,
1567 const20, const6, const3);
1568 res0 = __msa_aver_u_b(res0, inp15);
1572 res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1573 inp16, inp16, inp15, inp14,
1574 const20, const6, const3);
1575 res0 = __msa_aver_u_b(res0, inp16);
1579 static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
1584 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1585 v16u8 tmp0, tmp1, res0, res1;
1586 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1587 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1588 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1590 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1591 src += (4 * src_stride);
1592 LD_UB2(src, src_stride, inp4, inp5);
1593 src += (2 * src_stride);
1594 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1595 inp1, inp2, inp3, inp4,
1596 inp1, inp0, inp0, inp1,
1597 inp2, inp3, inp4, inp5,
1598 const20, const6, const3);
1599 LD_UB2(src, src_stride, inp6, inp7);
1600 src += (2 * src_stride);
1601 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1602 inp3, inp4, inp5, inp6,
1603 inp3, inp2, inp1, inp0,
1604 inp4, inp5, inp6, inp7,
1605 const20, const6, const3);
1606 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1607 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1608 res0 = __msa_ave_u_b(res0, tmp0);
1609 res1 = __msa_ave_u_b(res1, tmp1);
1610 ST8x4_UB(res0, res1, dst, dst_stride);
1611 dst += (4 * dst_stride);
1614 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1615 inp5, inp6, inp7, inp8,
1616 inp5, inp4, inp3, inp2,
1617 inp6, inp7, inp8, inp8,
1618 const20, const6, const3);
1619 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1620 inp7, inp8, inp8, inp7,
1621 inp7, inp6, inp5, inp4,
1622 inp8, inp8, inp7, inp6,
1623 const20, const6, const3);
1624 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1625 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1626 res0 = __msa_ave_u_b(res0, tmp0);
1627 res1 = __msa_ave_u_b(res1, tmp1);
1628 ST8x4_UB(res0, res1, dst, dst_stride);
1629 dst += (4 * dst_stride);
1632 static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
1637 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1638 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1640 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1641 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1642 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1644 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1645 src += (5 * src_stride);
1646 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1647 inp1, inp2, inp3, inp4,
1648 const20, const6, const3);
1649 res0 = __msa_ave_u_b(res0, inp0);
1655 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1656 inp2, inp3, inp4, inp5,
1657 const20, const6, const3);
1658 res0 = __msa_ave_u_b(res0, inp1);
1664 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1665 inp3, inp4, inp5, inp6,
1666 const20, const6, const3);
1667 res0 = __msa_ave_u_b(res0, inp2);
1673 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1674 inp4, inp5, inp6, inp7,
1675 const20, const6, const3);
1676 res0 = __msa_ave_u_b(res0, inp3);
1682 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1683 inp5, inp6, inp7, inp8,
1684 const20, const6, const3);
1685 res0 = __msa_ave_u_b(res0, inp4);
1691 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1692 inp6, inp7, inp8, inp9,
1693 const20, const6, const3);
1694 res0 = __msa_ave_u_b(res0, inp5);
1700 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1701 inp7, inp8, inp9, inp10,
1702 const20, const6, const3);
1703 res0 = __msa_ave_u_b(res0, inp6);
1709 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1710 inp8, inp9, inp10, inp11,
1711 const20, const6, const3);
1712 res0 = __msa_ave_u_b(res0, inp7);
1718 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1719 inp9, inp10, inp11, inp12,
1720 const20, const6, const3);
1721 res0 = __msa_ave_u_b(res0, inp8);
1727 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1728 inp10, inp11, inp12, inp13,
1729 const20, const6, const3);
1730 res0 = __msa_ave_u_b(res0, inp9);
1736 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1737 inp11, inp12, inp13, inp14,
1738 const20, const6, const3);
1739 res0 = __msa_ave_u_b(res0, inp10);
1745 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1746 inp12, inp13, inp14, inp15,
1747 const20, const6, const3);
1748 res0 = __msa_ave_u_b(res0, inp11);
1753 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1754 inp13, inp14, inp15, inp16,
1755 const20, const6, const3);
1756 res0 = __msa_ave_u_b(res0, inp12);
1760 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1761 inp14, inp15, inp16, inp16,
1762 const20, const6, const3);
1763 res0 = __msa_ave_u_b(res0, inp13);
1767 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1768 inp15, inp16, inp16, inp15,
1769 const20, const6, const3);
1770 res0 = __msa_ave_u_b(res0, inp14);
1774 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1775 inp16, inp16, inp15, inp14,
1776 const20, const6, const3);
1777 res0 = __msa_ave_u_b(res0, inp15);
1782 static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
1787 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1789 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1790 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1791 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1793 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1794 src += (4 * src_stride);
1795 LD_UB2(src, src_stride, inp4, inp5);
1796 src += (2 * src_stride);
1797 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1798 inp1, inp2, inp3, inp4,
1799 inp1, inp0, inp0, inp1,
1800 inp2, inp3, inp4, inp5,
1801 const20, const6, const3);
1802 LD_UB2(src, src_stride, inp6, inp7);
1803 src += (2 * src_stride);
1804 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1805 inp3, inp4, inp5, inp6,
1806 inp3, inp2, inp1, inp0,
1807 inp4, inp5, inp6, inp7,
1808 const20, const6, const3);
1809 ST8x4_UB(res0, res1, dst, dst_stride);
1810 dst += (4 * dst_stride);
1813 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1814 inp5, inp6, inp7, inp8,
1815 inp5, inp4, inp3, inp2,
1816 inp6, inp7, inp8, inp8,
1817 const20, const6, const3);
1818 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1819 inp7, inp8, inp8, inp7,
1820 inp7, inp6, inp5, inp4,
1821 inp8, inp8, inp7, inp6,
1822 const20, const6, const3);
1823 ST8x4_UB(res0, res1, dst, dst_stride);
1824 dst += (4 * dst_stride);
1827 static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
1832 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1833 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1835 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1836 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1837 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1839 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1840 src += (5 * src_stride);
1841 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1842 inp1, inp2, inp3, inp4,
1843 const20, const6, const3);
1849 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1850 inp2, inp3, inp4, inp5,
1851 const20, const6, const3);
1857 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1858 inp3, inp4, inp5, inp6,
1859 const20, const6, const3);
1865 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1866 inp4, inp5, inp6, inp7,
1867 const20, const6, const3);
1873 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1874 inp5, inp6, inp7, inp8,
1875 const20, const6, const3);
1881 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1882 inp6, inp7, inp8, inp9,
1883 const20, const6, const3);
1889 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1890 inp7, inp8, inp9, inp10,
1891 const20, const6, const3);
1897 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1898 inp8, inp9, inp10, inp11,
1899 const20, const6, const3);
1905 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1906 inp9, inp10, inp11, inp12,
1907 const20, const6, const3);
1913 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1914 inp10, inp11, inp12, inp13,
1915 const20, const6, const3);
1921 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1922 inp11, inp12, inp13, inp14,
1923 const20, const6, const3);
1929 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1930 inp12, inp13, inp14, inp15,
1931 const20, const6, const3);
1936 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1937 inp13, inp14, inp15, inp16,
1938 const20, const6, const3);
1942 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1943 inp14, inp15, inp16, inp16,
1944 const20, const6, const3);
1948 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1949 inp15, inp16, inp16, inp15,
1950 const20, const6, const3);
1954 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1955 inp16, inp16, inp15, inp14,
1956 const20, const6, const3);
1960 static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
1965 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1966 v16u8 tmp0, tmp1, res0, res1;
1967 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1968 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1969 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1971 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1972 src += (4 * src_stride);
1973 LD_UB2(src, src_stride, inp4, inp5);
1974 src += (2 * src_stride);
1975 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1976 inp1, inp2, inp3, inp4,
1977 inp1, inp0, inp0, inp1,
1978 inp2, inp3, inp4, inp5,
1979 const20, const6, const3);
1980 LD_UB2(src, src_stride, inp6, inp7);
1981 src += (2 * src_stride);
1982 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1983 inp3, inp4, inp5, inp6,
1984 inp3, inp2, inp1, inp0,
1985 inp4, inp5, inp6, inp7,
1986 const20, const6, const3);
1987 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1988 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1989 res0 = __msa_ave_u_b(res0, tmp0);
1990 res1 = __msa_ave_u_b(res1, tmp1);
1991 ST8x4_UB(res0, res1, dst, dst_stride);
1992 dst += (4 * dst_stride);
1995 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1996 inp5, inp6, inp7, inp8,
1997 inp5, inp4, inp3, inp2,
1998 inp6, inp7, inp8, inp8,
1999 const20, const6, const3);
2000 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2001 inp7, inp8, inp8, inp7,
2002 inp7, inp6, inp5, inp4,
2003 inp8, inp8, inp7, inp6,
2004 const20, const6, const3);
2005 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2006 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2007 res0 = __msa_ave_u_b(res0, tmp0);
2008 res1 = __msa_ave_u_b(res1, tmp1);
2009 ST8x4_UB(res0, res1, dst, dst_stride);
2012 static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
2017 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2018 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2020 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2021 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2022 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2024 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2025 src += (5 * src_stride);
2026 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
2027 inp1, inp2, inp3, inp4,
2028 const20, const6, const3);
2029 res0 = __msa_ave_u_b(res0, inp1);
2035 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
2036 inp2, inp3, inp4, inp5,
2037 const20, const6, const3);
2038 res0 = __msa_ave_u_b(res0, inp2);
2044 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
2045 inp3, inp4, inp5, inp6,
2046 const20, const6, const3);
2047 res0 = __msa_ave_u_b(res0, inp3);
2053 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
2054 inp4, inp5, inp6, inp7,
2055 const20, const6, const3);
2056 res0 = __msa_ave_u_b(res0, inp4);
2062 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
2063 inp5, inp6, inp7, inp8,
2064 const20, const6, const3);
2065 res0 = __msa_ave_u_b(res0, inp5);
2071 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
2072 inp6, inp7, inp8, inp9,
2073 const20, const6, const3);
2074 res0 = __msa_ave_u_b(res0, inp6);
2080 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
2081 inp7, inp8, inp9, inp10,
2082 const20, const6, const3);
2083 res0 = __msa_ave_u_b(res0, inp7);
2089 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
2090 inp8, inp9, inp10, inp11,
2091 const20, const6, const3);
2092 res0 = __msa_ave_u_b(res0, inp8);
2098 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
2099 inp9, inp10, inp11, inp12,
2100 const20, const6, const3);
2101 res0 = __msa_ave_u_b(res0, inp9);
2107 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
2108 inp10, inp11, inp12, inp13,
2109 const20, const6, const3);
2110 res0 = __msa_ave_u_b(res0, inp10);
2116 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
2117 inp11, inp12, inp13, inp14,
2118 const20, const6, const3);
2119 res0 = __msa_ave_u_b(res0, inp11);
2125 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
2126 inp12, inp13, inp14, inp15,
2127 const20, const6, const3);
2128 res0 = __msa_ave_u_b(res0, inp12);
2133 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
2134 inp13, inp14, inp15, inp16,
2135 const20, const6, const3);
2136 res0 = __msa_ave_u_b(res0, inp13);
2140 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
2141 inp14, inp15, inp16, inp16,
2142 const20, const6, const3);
2143 res0 = __msa_ave_u_b(res0, inp14);
2147 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
2148 inp15, inp16, inp16, inp15,
2149 const20, const6, const3);
2150 res0 = __msa_ave_u_b(res0, inp15);
2154 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
2155 inp16, inp16, inp15, inp14,
2156 const20, const6, const3);
2157 res0 = __msa_ave_u_b(res0, inp16);
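/* Vertical qpel MC for the avg output mode, 8x8: the rounded filter result
 * is averaged with the co-located source rows (src0 phase) and then with
 * the pixels already in the destination. */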
2161 static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
2162 int32_t src_stride,
2163 uint8_t *dst,
2164 int32_t dst_stride)
2165 {
2166 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2167 v16u8 dst0, dst1, dst2, dst3;
2168 v16u8 tmp0, tmp1, res0, res1;
2169 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2170 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2171 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2173 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2174 src += (4 * src_stride);
2175 LD_UB2(src, src_stride, inp4, inp5);
2176 src += (2 * src_stride);
2177 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2178 inp1, inp2, inp3, inp4,
2179 inp1, inp0, inp0, inp1,
2180 inp2, inp3, inp4, inp5,
2181 const20, const6, const3);
2183 LD_UB2(src, src_stride, inp6, inp7);
2184 src += (2 * src_stride);
2185 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2186 inp3, inp4, inp5, inp6,
2187 inp3, inp2, inp1, inp0,
2188 inp4, inp5, inp6, inp7,
2189 const20, const6, const3);
2191 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2192 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2193 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2194 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2195 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2196 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2197 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2198 ST8x4_UB(res0, res1, dst, dst_stride);
2199 dst += (4 * dst_stride);
2202 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2203 inp5, inp6, inp7, inp8,
2204 inp5, inp4, inp3, inp2,
2205 inp6, inp7, inp8, inp8,
2206 const20, const6, const3);
2207 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2208 inp7, inp8, inp8, inp7,
2209 inp7, inp6, inp5, inp4,
2210 inp8, inp8, inp7, inp6,
2211 const20, const6, const3);
2213 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2214 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2215 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2216 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2217 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2218 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2219 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2220 ST8x4_UB(res0, res1, dst, dst_stride);
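/* 16x16 version of the above: two full-width rows are filtered per step,
 * averaged with the matching source rows and then with the destination. */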
2223 static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
2224 int32_t src_stride,
2225 uint8_t *dst,
2226 int32_t dst_stride)
2227 {
2228 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2229 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2230 v16u8 res0, res1, dst0, dst1;
2231 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2232 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2233 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2235 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2236 src += (5 * src_stride);
2237 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2238 inp1, inp2, inp3, inp4,
2239 const20, const6, const3);
2243 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2244 inp2, inp3, inp4, inp5,
2245 const20, const6, const3);
2247 LD_UB2(dst, dst_stride, dst0, dst1);
2248 AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
2249 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2250 ST_UB2(res0, res1, dst, dst_stride);
2251 dst += (2 * dst_stride);
2255 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2256 inp3, inp4, inp5, inp6,
2257 const20, const6, const3);
2261 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2262 inp4, inp5, inp6, inp7,
2263 const20, const6, const3);
2265 LD_UB2(dst, dst_stride, dst0, dst1);
2266 AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
2267 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2268 ST_UB2(res0, res1, dst, dst_stride);
2269 dst += (2 * dst_stride);
2271 LD_UB2(src, src_stride, inp8, inp9);
2272 src += (2 * src_stride);
2273 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2274 inp5, inp6, inp7, inp8,
2275 const20, const6, const3);
2276 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2277 inp6, inp7, inp8, inp9,
2278 const20, const6, const3);
2280 LD_UB2(dst, dst_stride, dst0, dst1);
2281 AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
2282 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2283 ST_UB2(res0, res1, dst, dst_stride);
2284 dst += (2 * dst_stride);
2286 LD_UB2(src, src_stride, inp10, inp11);
2287 src += (2 * src_stride);
2288 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2289 inp7, inp8, inp9, inp10,
2290 const20, const6, const3);
2291 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2292 inp8, inp9, inp10, inp11,
2293 const20, const6, const3);
2295 LD_UB2(dst, dst_stride, dst0, dst1);
2296 AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
2297 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2298 ST_UB2(res0, res1, dst, dst_stride);
2299 dst += (2 * dst_stride);
2301 LD_UB2(src, src_stride, inp12, inp13);
2302 src += (2 * src_stride);
2303 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2304 inp9, inp10, inp11, inp12,
2305 const20, const6, const3);
2306 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2307 inp10, inp11, inp12, inp13,
2308 const20, const6, const3);
2309 LD_UB2(dst, dst_stride, dst0, dst1);
2310 AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
2311 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2312 ST_UB2(res0, res1, dst, dst_stride);
2313 dst += (2 * dst_stride);
2315 LD_UB2(src, src_stride, inp14, inp15);
2316 src += (2 * src_stride);
2317 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2318 inp11, inp12, inp13, inp14,
2319 const20, const6, const3);
2320 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2321 inp12, inp13, inp14, inp15,
2322 const20, const6, const3);
2324 LD_UB2(dst, dst_stride, dst0, dst1);
2325 AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2326 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2327 ST_UB2(res0, res1, dst, dst_stride);
2328 dst += (2 * dst_stride);
2331 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2332 inp13, inp14, inp15, inp16,
2333 const20, const6, const3);
2334 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2335 inp14, inp15, inp16, inp16,
2336 const20, const6, const3);
2337 LD_UB2(dst, dst_stride, dst0, dst1);
2338 AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2339 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2340 ST_UB2(res0, res1, dst, dst_stride);
2341 dst += (2 * dst_stride);
2343 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2344 inp15, inp16, inp16, inp15,
2345 const20, const6, const3);
2346 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2347 inp16, inp16, inp15, inp14,
2348 const20, const6, const3);
2349 LD_UB2(dst, dst_stride, dst0, dst1);
2350 AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2351 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2352 ST_UB2(res0, res1, dst, dst_stride);
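/* Vertical qpel MC, 8x8, avg output mode without source averaging: only the
 * rounded filter output is averaged with the existing destination rows. */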
2355 static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
2356 int32_t src_stride,
2357 uint8_t *dst,
2358 int32_t dst_stride)
2359 {
2360 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2361 v16u8 dst0, dst1, dst2, dst3;
2362 v16u8 res0, res1;
2363 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2364 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2365 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2367 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2368 src += (4 * src_stride);
2369 LD_UB2(src, src_stride, inp4, inp5);
2370 src += (2 * src_stride);
2371 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2372 inp1, inp2, inp3, inp4,
2373 inp1, inp0, inp0, inp1,
2374 inp2, inp3, inp4, inp5,
2375 const20, const6, const3);
2376 LD_UB2(src, src_stride, inp6, inp7);
2377 src += (2 * src_stride);
2378 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2379 inp3, inp4, inp5, inp6,
2380 inp3, inp2, inp1, inp0,
2381 inp4, inp5, inp6, inp7,
2382 const20, const6, const3);
2383 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2384 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2385 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2386 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2387 ST8x4_UB(res0, res1, dst, dst_stride);
2388 dst += (4 * dst_stride);
2391 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2392 inp5, inp6, inp7, inp8,
2393 inp5, inp4, inp3, inp2,
2394 inp6, inp7, inp8, inp8,
2395 const20, const6, const3);
2396 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2397 inp7, inp8, inp8, inp7,
2398 inp7, inp6, inp5, inp4,
2399 inp8, inp8, inp7, inp6,
2400 const20, const6, const3);
2401 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2402 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2403 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2404 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2405 ST8x4_UB(res0, res1, dst, dst_stride);
2406 dst += (4 * dst_stride);
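/* 16x16 counterpart: filter two rows per step and average the result with
 * the destination only. */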
2409 static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
2410 int32_t src_stride,
2411 uint8_t *dst,
2412 int32_t dst_stride)
2413 {
2414 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2415 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2416 v16u8 res0, res1, dst0, dst1;
2417 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2418 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2419 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2421 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2422 src += (5 * src_stride);
2423 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2424 inp1, inp2, inp3, inp4,
2425 const20, const6, const3);
2428 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2429 inp2, inp3, inp4, inp5,
2430 const20, const6, const3);
2431 LD_UB2(dst, dst_stride, dst0, dst1);
2432 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2433 ST_UB2(res0, res1, dst, dst_stride);
2434 dst += (2 * dst_stride);
2438 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2439 inp3, inp4, inp5, inp6,
2440 const20, const6, const3);
2443 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2444 inp4, inp5, inp6, inp7,
2445 const20, const6, const3);
2446 LD_UB2(dst, dst_stride, dst0, dst1);
2447 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2448 ST_UB2(res0, res1, dst, dst_stride);
2449 dst += (2 * dst_stride);
2453 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2454 inp5, inp6, inp7, inp8,
2455 const20, const6, const3);
2458 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2459 inp6, inp7, inp8, inp9,
2460 const20, const6, const3);
2461 LD_UB2(dst, dst_stride, dst0, dst1);
2462 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2463 ST_UB2(res0, res1, dst, dst_stride);
2464 dst += (2 * dst_stride);
2468 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2469 inp7, inp8, inp9, inp10,
2470 const20, const6, const3);
2473 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2474 inp8, inp9, inp10, inp11,
2475 const20, const6, const3);
2476 LD_UB2(dst, dst_stride, dst0, dst1);
2477 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2478 ST_UB2(res0, res1, dst, dst_stride);
2479 dst += (2 * dst_stride);
2483 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2484 inp9, inp10, inp11, inp12,
2485 const20, const6, const3);
2488 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2489 inp10, inp11, inp12, inp13,
2490 const20, const6, const3);
2491 LD_UB2(dst, dst_stride, dst0, dst1);
2492 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2493 ST_UB2(res0, res1, dst, dst_stride);
2494 dst += (2 * dst_stride);
2498 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2499 inp11, inp12, inp13, inp14,
2500 const20, const6, const3);
2503 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2504 inp12, inp13, inp14, inp15,
2505 const20, const6, const3);
2506 LD_UB2(dst, dst_stride, dst0, dst1);
2507 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2508 ST_UB2(res0, res1, dst, dst_stride);
2509 dst += (2 * dst_stride);
2512 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2513 inp13, inp14, inp15, inp16,
2514 const20, const6, const3);
2515 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2516 inp14, inp15, inp16, inp16,
2517 const20, const6, const3);
2518 LD_UB2(dst, dst_stride, dst0, dst1);
2519 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2520 ST_UB2(res0, res1, dst, dst_stride);
2521 dst += (2 * dst_stride);
2523 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2524 inp15, inp16, inp16, inp15,
2525 const20, const6, const3);
2526 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2527 inp16, inp16, inp15, inp14,
2528 const20, const6, const3);
2529 LD_UB2(dst, dst_stride, dst0, dst1);
2530 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2531 ST_UB2(res0, res1, dst, dst_stride);
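/* Vertical qpel MC, 8x8, avg output mode with src1 averaging: the rounded
 * filter result is averaged with the source rows one line below, then with
 * the destination. */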
2534 static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
2535 int32_t src_stride,
2536 uint8_t *dst,
2537 int32_t dst_stride)
2538 {
2539 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2540 v16u8 dst0, dst1, dst2, dst3;
2541 v16u8 tmp0, tmp1, res0, res1;
2542 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2543 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2544 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2546 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2547 src += (4 * src_stride);
2548 LD_UB2(src, src_stride, inp4, inp5);
2549 src += (2 * src_stride);
2550 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2551 inp1, inp2, inp3, inp4,
2552 inp1, inp0, inp0, inp1,
2553 inp2, inp3, inp4, inp5,
2554 const20, const6, const3);
2555 LD_UB2(src, src_stride, inp6, inp7);
2556 src += (2 * src_stride);
2557 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2558 inp3, inp4, inp5, inp6,
2559 inp3, inp2, inp1, inp0,
2560 inp4, inp5, inp6, inp7,
2561 const20, const6, const3);
2562 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2563 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2564 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
2565 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2566 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2567 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2568 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2569 ST8x4_UB(res0, res1, dst, dst_stride);
2570 dst += (4 * dst_stride);
2573 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2574 inp5, inp6, inp7, inp8,
2575 inp5, inp4, inp3, inp2,
2576 inp6, inp7, inp8, inp8,
2577 const20, const6, const3);
2578 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2579 inp7, inp8, inp8, inp7,
2580 inp7, inp6, inp5, inp4,
2581 inp8, inp8, inp7, inp6,
2582 const20, const6, const3);
2583 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2584 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2585 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2586 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2587 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2588 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2589 AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2590 ST8x4_UB(res0, res1, dst, dst_stride);
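/* 16x16 version: per pair of rows, average with the next source rows and
 * then with the destination. */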
2593 static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
2594 int32_t src_stride,
2595 uint8_t *dst,
2596 int32_t dst_stride)
2597 {
2598 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2599 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2600 v16u8 res0, res1, dst0, dst1;
2601 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2602 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2603 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2605 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2606 src += (5 * src_stride);
2607 res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2608 inp1, inp2, inp3, inp4,
2609 const20, const6, const3);
2612 res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2613 inp2, inp3, inp4, inp5,
2614 const20, const6, const3);
2615 LD_UB2(dst, dst_stride, dst0, dst1);
2616 AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
2617 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2618 ST_UB2(res0, res1, dst, dst_stride);
2619 dst += (2 * dst_stride);
2623 res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2624 inp3, inp4, inp5, inp6,
2625 const20, const6, const3);
2628 res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2629 inp4, inp5, inp6, inp7,
2630 const20, const6, const3);
2631 LD_UB2(dst, dst_stride, dst0, dst1);
2632 AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
2633 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2634 ST_UB2(res0, res1, dst, dst_stride);
2635 dst += (2 * dst_stride);
2639 res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2640 inp5, inp6, inp7, inp8,
2641 const20, const6, const3);
2644 res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2645 inp6, inp7, inp8, inp9,
2646 const20, const6, const3);
2647 LD_UB2(dst, dst_stride, dst0, dst1);
2648 AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
2649 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2650 ST_UB2(res0, res1, dst, dst_stride);
2651 dst += (2 * dst_stride);
2655 res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2656 inp7, inp8, inp9, inp10,
2657 const20, const6, const3);
2660 res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2661 inp8, inp9, inp10, inp11,
2662 const20, const6, const3);
2663 LD_UB2(dst, dst_stride, dst0, dst1);
2664 AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
2665 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2666 ST_UB2(res0, res1, dst, dst_stride);
2667 dst += (2 * dst_stride);
2671 res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2672 inp9, inp10, inp11, inp12,
2673 const20, const6, const3);
2676 res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2677 inp10, inp11, inp12, inp13,
2678 const20, const6, const3);
2679 LD_UB2(dst, dst_stride, dst0, dst1);
2680 AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
2681 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2682 ST_UB2(res0, res1, dst, dst_stride);
2683 dst += (2 * dst_stride);
2687 res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2688 inp11, inp12, inp13, inp14,
2689 const20, const6, const3);
2692 res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2693 inp12, inp13, inp14, inp15,
2694 const20, const6, const3);
2695 LD_UB2(dst, dst_stride, dst0, dst1);
2696 AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2697 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2698 ST_UB2(res0, res1, dst, dst_stride);
2699 dst += (2 * dst_stride);
2702 res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2703 inp13, inp14, inp15, inp16,
2704 const20, const6, const3);
2705 res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2706 inp14, inp15, inp16, inp16,
2707 const20, const6, const3);
2708 LD_UB2(dst, dst_stride, dst0, dst1);
2709 AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2710 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2711 ST_UB2(res0, res1, dst, dst_stride);
2712 dst += (2 * dst_stride);
2714 res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2715 inp15, inp16, inp16, inp15,
2716 const20, const6, const3);
2717 res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2718 inp16, inp16, inp15, inp14,
2719 const20, const6, const3);
2720 LD_UB2(dst, dst_stride, dst0, dst1);
2721 AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2722 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2723 ST_UB2(res0, res1, dst, dst_stride);
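/* Horizontal pass shared by the no-rounding hv (diagonal) cases: every row
 * is filtered and averaged with the unshifted source pixels (src0 phase);
 * height + 1 rows are produced so the vertical pass has its extra row. */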
2726 static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
2727 int32_t src_stride,
2728 uint8_t *dst,
2729 int32_t dst_stride,
2730 int32_t height)
2731 {
2732 int32_t loop_count;
2733 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2734 v16u8 res;
2735 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2736 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2737 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2738 v8u16 const20 = (v8u16) __msa_ldi_h(20);
2740 for (loop_count = (height >> 2); loop_count--;) {
2741 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2742 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2743 src += (4 * src_stride);
2744 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2745 const20, const6, const3);
2746 res = __msa_ave_u_b(inp0, res);
2750 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2751 const20, const6, const3);
2752 res = __msa_ave_u_b(inp2, res);
2756 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2757 const20, const6, const3);
2758 res = __msa_ave_u_b(inp4, res);
2762 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2763 const20, const6, const3);
2764 res = __msa_ave_u_b(inp6, res);
2769 LD_UB2(src, 1, inp0, inp1);
2770 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2771 const20, const6, const3);
2772 res = __msa_ave_u_b(inp0, res);
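/* 16x16 hv cases are handled in two passes: the horizontal helper above
 * fills a 16-column temporary buffer, then the matching vertical routine
 * reads it back with a stride of 16. */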
2776 static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
2777 int32_t src_stride,
2778 uint8_t *dst,
2779 int32_t dst_stride)
2780 {
2781 uint8_t buff[272];
2783 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
2784 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
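/* The 8x8 hv cases are fused into a single pass: horizontally filtered row
 * pairs are kept in the horiz0..horiz8 registers and fed directly into the
 * vertical filter, so no intermediate buffer is needed. */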
2787 static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
2788 int32_t src_stride,
2789 uint8_t *dst,
2790 int32_t dst_stride)
2791 {
2792 v16u8 inp0, inp1, inp2, inp3;
2793 v16u8 res0, res1, avg0, avg1;
2794 v16u8 horiz0, horiz1, horiz2, horiz3;
2795 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2796 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2797 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2798 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2799 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2800 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2801 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2802 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2804 LD_UB2(src, src_stride, inp0, inp1);
2805 src += (2 * src_stride);
2806 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2807 mask2, mask3, const20,
2808 const6, const3);
2809 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2810 horiz0 = __msa_ave_u_b(inp0, res0);
2811 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2812 LD_UB2(src, src_stride, inp2, inp3);
2813 src += (2 * src_stride);
2814 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2815 mask2, mask3, const20,
2816 const6, const3);
2817 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2818 horiz2 = __msa_ave_u_b(inp2, res1);
2819 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2820 LD_UB2(src, src_stride, inp0, inp1);
2821 src += (2 * src_stride);
2822 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2823 mask2, mask3, const20,
2824 const6, const3);
2825 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2826 horiz4 = __msa_ave_u_b(inp0, res0);
2827 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2828 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2829 horiz1, horiz2, horiz3, horiz4,
2830 horiz1, horiz0, horiz0, horiz1,
2831 horiz2, horiz3, horiz4, horiz5,
2832 const20, const6, const3);
2833 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2834 res0 = __msa_ave_u_b(avg0, res0);
2835 ST8x2_UB(res0, dst, dst_stride);
2836 dst += (2 * dst_stride);
2838 LD_UB2(src, src_stride, inp2, inp3);
2839 src += (2 * src_stride);
2840 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2841 mask2, mask3, const20,
2842 const6, const3);
2843 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2844 horiz6 = __msa_ave_u_b(inp2, res1);
2845 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2847 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2848 mask2, mask3, const20,
2849 const6, const3);
2850 horiz8 = __msa_ave_u_b(inp0, res0);
2851 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2852 horiz3, horiz4, horiz5, horiz6,
2853 horiz3, horiz2, horiz1, horiz0,
2854 horiz4, horiz5, horiz6, horiz7,
2855 const20, const6, const3);
2856 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2857 res1 = __msa_ave_u_b(avg1, res1);
2858 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2859 horiz5, horiz6, horiz7, horiz8,
2860 horiz5, horiz4, horiz3, horiz2,
2861 horiz6, horiz7, horiz8, horiz8,
2862 const20, const6, const3);
2863 ST8x2_UB(res1, dst, dst_stride);
2864 dst += 2 * dst_stride;
2866 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
2867 res0 = __msa_ave_u_b(avg0, res0);
2868 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
2869 horiz7, horiz8, horiz8, horiz7,
2870 horiz7, horiz6, horiz5, horiz4,
2871 horiz8, horiz8, horiz7, horiz6,
2872 const20, const6, const3);
2873 ST8x2_UB(res0, dst, dst_stride);
2874 dst += 2 * dst_stride;
2876 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
2877 res1 = __msa_ave_u_b(avg1, res1);
2878 ST8x2_UB(res1, dst, dst_stride);
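/* Horizontal pass without source averaging, used by the hv cases whose
 * horizontal phase is the plain filtered value. */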
2881 static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
2882 int32_t src_stride,
2883 uint8_t *dst,
2884 int32_t dst_stride,
2885 int32_t height)
2886 {
2887 int32_t loop_count;
2888 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2889 v16u8 res;
2890 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2891 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2892 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2893 v8u16 const20 = (v8u16) __msa_ldi_h(20);
2895 for (loop_count = (height >> 2); loop_count--;) {
2896 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2897 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2898 src += (4 * src_stride);
2899 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2900 const20, const6, const3);
2904 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2905 const20, const6, const3);
2909 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2910 const20, const6, const3);
2914 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2915 const20, const6, const3);
2920 LD_UB2(src, 1, inp0, inp1);
2921 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2922 const20, const6, const3);
2926 static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
2927 int32_t src_stride,
2928 uint8_t *dst,
2929 int32_t dst_stride)
2930 {
2931 uint8_t buff[272];
2933 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
2934 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
2937 static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
2938 int32_t src_stride,
2939 uint8_t *dst,
2940 int32_t dst_stride)
2941 {
2942 v16u8 inp0, inp1, inp2, inp3;
2943 v16u8 res0, res1, avg0, avg1;
2944 v16u8 horiz0, horiz1, horiz2, horiz3;
2945 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2946 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2947 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2948 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2949 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2950 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2951 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2952 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2954 LD_UB2(src, src_stride, inp0, inp1);
2955 src += (2 * src_stride);
2956 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2957 mask2, mask3, const20,
2958 const6, const3);
2959 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2961 LD_UB2(src, src_stride, inp2, inp3);
2962 src += (2 * src_stride);
2963 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2964 mask2, mask3, const20,
2965 const6, const3);
2966 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2967 LD_UB2(src, src_stride, inp0, inp1);
2968 src += (2 * src_stride);
2969 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2970 mask2, mask3, const20,
2971 const6, const3);
2972 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2973 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2974 horiz1, horiz2, horiz3, horiz4,
2975 horiz1, horiz0, horiz0, horiz1,
2976 horiz2, horiz3, horiz4, horiz5,
2977 const20, const6, const3);
2978 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2979 res0 = __msa_ave_u_b(avg0, res0);
2980 ST8x2_UB(res0, dst, dst_stride);
2981 dst += (2 * dst_stride);
2983 LD_UB2(src, src_stride, inp2, inp3);
2984 src += (2 * src_stride);
2985 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2986 mask2, mask3, const20,
2987 const6, const3);
2988 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2990 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2991 mask2, mask3, const20,
2992 const6, const3);
2993 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2994 horiz3, horiz4, horiz5, horiz6,
2995 horiz3, horiz2, horiz1, horiz0,
2996 horiz4, horiz5, horiz6, horiz7,
2997 const20, const6, const3);
2998 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2999 res1 = __msa_ave_u_b(avg1, res1);
3000 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3001 res0 = __msa_ave_u_b(avg0, res0);
3002 ST8x2_UB(res1, dst, dst_stride);
3003 dst += (2 * dst_stride);
3005 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3006 horiz5, horiz6, horiz7, horiz8,
3007 horiz5, horiz4, horiz3, horiz2,
3008 horiz6, horiz7, horiz8, horiz8,
3009 const20, const6, const3);
3010 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3011 res0 = __msa_ave_u_b(avg0, res0);
3012 ST8x2_UB(res0, dst, dst_stride);
3013 dst += (2 * dst_stride);
3015 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3016 horiz7, horiz8, horiz8, horiz7,
3017 horiz7, horiz6, horiz5, horiz4,
3018 horiz8, horiz8, horiz7, horiz6,
3019 const20, const6, const3);
3020 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3021 res1 = __msa_ave_u_b(avg1, res1);
3022 ST8x2_UB(res1, dst, dst_stride);
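/* Horizontal pass averaged with the source shifted one pixel to the right
 * (src1 phase), again producing the extra row for the vertical pass. */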
3025 static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
3026 int32_t src_stride,
3027 uint8_t *dst,
3028 int32_t dst_stride,
3029 int32_t height)
3030 {
3031 int32_t loop_count;
3032 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3033 v16u8 res;
3034 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3035 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3036 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3037 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3039 for (loop_count = (height >> 2); loop_count--;) {
3040 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3041 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3042 src += (4 * src_stride);
3043 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3044 const20, const6, const3);
3045 res = __msa_ave_u_b(res, inp1);
3049 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
3050 const20, const6, const3);
3051 res = __msa_ave_u_b(res, inp3);
3055 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
3056 const20, const6, const3);
3057 res = __msa_ave_u_b(res, inp5);
3061 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
3062 const20, const6, const3);
3063 res = __msa_ave_u_b(res, inp7);
3068 LD_UB2(src, 1, inp0, inp1);
3069 res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3070 const20, const6, const3);
3071 res = __msa_ave_u_b(inp1, res);
3075 static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
3076 int32_t src_stride,
3077 uint8_t *dst,
3078 int32_t dst_stride)
3079 {
3080 uint8_t buff[272];
3082 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3083 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3086 static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
3087 int32_t src_stride,
3088 uint8_t *dst,
3089 int32_t dst_stride)
3090 {
3091 v16u8 inp0, inp1, inp2, inp3;
3092 v16u8 res0, res1, avg0, avg1;
3093 v16u8 horiz0, horiz1, horiz2, horiz3;
3094 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3095 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3096 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3097 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3098 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3099 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3100 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3101 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3103 LD_UB2(src, src_stride, inp0, inp1);
3104 src += (2 * src_stride);
3105 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3106 mask2, mask3, const20,
3107 const6, const3);
3108 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3110 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3111 horiz0 = __msa_ave_u_b(inp0, res0);
3112 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3113 LD_UB2(src, src_stride, inp2, inp3);
3114 src += (2 * src_stride);
3115 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3116 mask2, mask3, const20,
3117 const6, const3);
3118 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3120 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3121 horiz2 = __msa_ave_u_b(inp2, res1);
3122 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3123 LD_UB2(src, src_stride, inp0, inp1);
3124 src += (2 * src_stride);
3125 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3126 mask2, mask3, const20,
3127 const6, const3);
3128 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3130 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3131 horiz4 = __msa_ave_u_b(inp0, res0);
3132 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3133 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3134 horiz1, horiz2, horiz3, horiz4,
3135 horiz1, horiz0, horiz0, horiz1,
3136 horiz2, horiz3, horiz4, horiz5,
3137 const20, const6, const3);
3138 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3139 res0 = __msa_ave_u_b(avg0, res0);
3140 ST8x2_UB(res0, dst, dst_stride);
3141 dst += (2 * dst_stride);
3143 LD_UB2(src, src_stride, inp2, inp3);
3144 src += (2 * src_stride);
3145 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3146 mask2, mask3, const20,
3147 const6, const3);
3148 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3150 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3151 horiz6 = __msa_ave_u_b(inp2, res1);
3152 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3154 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3155 mask2, mask3, const20,
3156 const6, const3);
3157 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3158 horiz8 = __msa_ave_u_b(inp0, res0);
3159 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3160 horiz3, horiz4, horiz5, horiz6,
3161 horiz3, horiz2, horiz1, horiz0,
3162 horiz4, horiz5, horiz6, horiz7,
3163 const20, const6, const3);
3164 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3165 res1 = __msa_ave_u_b(avg1, res1);
3166 ST8x2_UB(res1, dst, dst_stride);
3167 dst += (2 * dst_stride);
3169 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3170 horiz5, horiz6, horiz7, horiz8,
3171 horiz5, horiz4, horiz3, horiz2,
3172 horiz6, horiz7, horiz8, horiz8,
3173 const20, const6, const3);
3174 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3175 res0 = __msa_ave_u_b(avg0, res0);
3176 ST8x2_UB(res0, dst, dst_stride);
3177 dst += (2 * dst_stride);
3179 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3180 horiz7, horiz8, horiz8, horiz7,
3181 horiz7, horiz6, horiz5, horiz4,
3182 horiz8, horiz8, horiz7, horiz6,
3183 const20, const6, const3);
3184 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3185 res1 = __msa_ave_u_b(avg1, res1);
3186 ST8x2_UB(res1, dst, dst_stride);
3189 static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
3190 int32_t src_stride,
3191 uint8_t *dst,
3192 int32_t dst_stride)
3193 {
3194 uint8_t buff[272];
3196 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3197 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3200 static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
3201 int32_t src_stride,
3202 uint8_t *dst,
3203 int32_t dst_stride)
3204 {
3205 v16u8 inp0, inp1, inp2, inp3;
3206 v16u8 res0, res1;
3207 v16u8 horiz0, horiz1, horiz2, horiz3;
3208 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3209 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3210 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3211 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3212 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3213 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3214 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3215 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3217 LD_UB2(src, src_stride, inp0, inp1);
3218 src += (2 * src_stride);
3219 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3220 mask2, mask3, const20,
3221 const6, const3);
3222 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3223 horiz0 = __msa_ave_u_b(inp0, res0);
3224 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3225 LD_UB2(src, src_stride, inp2, inp3);
3226 src += (2 * src_stride);
3227 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3228 mask2, mask3, const20,
3229 const6, const3);
3230 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3231 horiz2 = __msa_ave_u_b(inp2, res1);
3232 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3233 LD_UB2(src, src_stride, inp0, inp1);
3234 src += (2 * src_stride);
3235 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3236 mask2, mask3, const20,
3237 const6, const3);
3238 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3239 horiz4 = __msa_ave_u_b(inp0, res0);
3240 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3241 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3242 horiz1, horiz2, horiz3, horiz4,
3243 horiz1, horiz0, horiz0, horiz1,
3244 horiz2, horiz3, horiz4, horiz5,
3245 const20, const6, const3);
3247 LD_UB2(src, src_stride, inp2, inp3);
3248 src += (2 * src_stride);
3249 ST8x2_UB(res0, dst, dst_stride);
3250 dst += 2 * dst_stride;
3252 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3253 mask2, mask3, const20,
3254 const6, const3);
3255 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3256 horiz6 = __msa_ave_u_b(inp2, res1);
3257 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3259 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3260 mask2, mask3, const20,
3261 const6, const3);
3262 horiz8 = __msa_ave_u_b(inp0, res0);
3263 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3264 horiz3, horiz4, horiz5, horiz6,
3265 horiz3, horiz2, horiz1, horiz0,
3266 horiz4, horiz5, horiz6, horiz7,
3267 const20, const6, const3);
3268 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3269 horiz5, horiz6, horiz7, horiz8,
3270 horiz5, horiz4, horiz3, horiz2,
3271 horiz6, horiz7, horiz8, horiz8,
3272 const20, const6, const3);
3273 ST8x2_UB(res1, dst, dst_stride);
3274 dst += 2 * dst_stride;
3276 ST8x2_UB(res0, dst, dst_stride);
3277 dst += (2 * dst_stride);
3279 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3280 horiz7, horiz8, horiz8, horiz7,
3281 horiz7, horiz6, horiz5, horiz4,
3282 horiz8, horiz8, horiz7, horiz6,
3283 const20, const6, const3);
3284 ST8x2_UB(res1, dst, dst_stride);
3287 static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
3288 int32_t src_stride,
3289 uint8_t *dst,
3290 int32_t dst_stride)
3291 {
3292 uint8_t buff[272];
3294 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3295 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3298 static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
3299 int32_t src_stride,
3300 uint8_t *dst,
3301 int32_t dst_stride)
3302 {
3303 v16u8 inp0, inp1, inp2, inp3;
3304 v16u8 res0, res1;
3305 v16u8 horiz0, horiz1, horiz2, horiz3;
3306 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3307 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3308 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3309 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3310 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3311 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3312 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3313 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3315 LD_UB2(src, src_stride, inp0, inp1);
3316 src += (2 * src_stride);
3317 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3318 mask2, mask3, const20,
3319 const6, const3);
3320 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3321 LD_UB2(src, src_stride, inp2, inp3);
3322 src += (2 * src_stride);
3323 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3324 mask2, mask3, const20,
3325 const6, const3);
3326 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3327 LD_UB2(src, src_stride, inp0, inp1);
3328 src += (2 * src_stride);
3329 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3330 mask2, mask3, const20,
3331 const6, const3);
3332 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3333 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3334 horiz1, horiz2, horiz3, horiz4,
3335 horiz1, horiz0, horiz0, horiz1,
3336 horiz2, horiz3, horiz4, horiz5,
3337 const20, const6, const3);
3338 LD_UB2(src, src_stride, inp2, inp3);
3339 src += (2 * src_stride);
3340 ST8x2_UB(res0, dst, dst_stride);
3341 dst += 2 * dst_stride;
3343 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3344 mask2, mask3, const20,
3345 const6, const3);
3346 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3348 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3349 mask2, mask3, const20,
3350 const6, const3);
3351 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3352 horiz3, horiz4, horiz5, horiz6,
3353 horiz3, horiz2, horiz1, horiz0,
3354 horiz4, horiz5, horiz6, horiz7,
3355 const20, const6, const3);
3356 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3357 horiz5, horiz6, horiz7, horiz8,
3358 horiz5, horiz4, horiz3, horiz2,
3359 horiz6, horiz7, horiz8, horiz8,
3360 const20, const6, const3);
3361 ST8x2_UB(res1, dst, dst_stride);
3362 dst += 2 * dst_stride;
3365 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3366 horiz7, horiz8, horiz8, horiz7,
3367 horiz7, horiz6, horiz5, horiz4,
3368 horiz8, horiz8, horiz7, horiz6,
3369 const20, const6, const3);
3370 ST8x2_UB(res0, dst, dst_stride);
3371 dst += 2 * dst_stride;
3372 ST8x2_UB(res1, dst, dst_stride);
3375 static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
3376 int32_t src_stride,
3377 uint8_t *dst,
3378 int32_t dst_stride)
3379 {
3380 uint8_t buff[272];
3382 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3383 vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3386 static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
3387 int32_t src_stride,
3388 uint8_t *dst,
3389 int32_t dst_stride)
3390 {
3391 v16u8 inp0, inp1, inp2, inp3;
3392 v16u8 res0, res1;
3393 v16u8 horiz0, horiz1, horiz2, horiz3;
3394 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3395 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3396 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3397 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3398 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3399 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3400 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3401 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3403 LD_UB2(src, src_stride, inp0, inp1);
3404 src += (2 * src_stride);
3405 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3406 mask2, mask3, const20,
3407 const6, const3);
3408 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3410 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3411 horiz0 = __msa_ave_u_b(inp0, res0);
3412 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3413 LD_UB2(src, src_stride, inp2, inp3);
3414 src += (2 * src_stride);
3415 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3416 mask2, mask3, const20,
3417 const6, const3);
3418 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3420 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3421 horiz2 = __msa_ave_u_b(inp2, res1);
3422 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3423 LD_UB2(src, src_stride, inp0, inp1);
3424 src += (2 * src_stride);
3425 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3426 mask2, mask3, const20,
3427 const6, const3);
3428 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3430 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3431 horiz4 = __msa_ave_u_b(inp0, res0);
3432 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3433 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3434 horiz1, horiz2, horiz3, horiz4,
3435 horiz1, horiz0, horiz0, horiz1,
3436 horiz2, horiz3, horiz4, horiz5,
3437 const20, const6, const3);
3438 LD_UB2(src, src_stride, inp2, inp3);
3439 src += (2 * src_stride);
3440 ST8x2_UB(res0, dst, dst_stride);
3441 dst += 2 * dst_stride;
3443 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3444 mask2, mask3, const20,
3445 const6, const3);
3446 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3448 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3449 horiz6 = __msa_ave_u_b(inp2, res1);
3450 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3452 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3453 mask2, mask3, const20,
3454 const6, const3);
3455 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3456 horiz8 = __msa_ave_u_b(inp0, res0);
3457 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3458 horiz3, horiz4, horiz5, horiz6,
3459 horiz3, horiz2, horiz1, horiz0,
3460 horiz4, horiz5, horiz6, horiz7,
3461 const20, const6, const3);
3462 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3463 horiz5, horiz6, horiz7, horiz8,
3464 horiz5, horiz4, horiz3, horiz2,
3465 horiz6, horiz7, horiz8, horiz8,
3466 const20, const6, const3);
3467 ST8x2_UB(res1, dst, dst_stride);
3468 dst += 2 * dst_stride;
3470 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3471 horiz7, horiz8, horiz8, horiz7,
3472 horiz7, horiz6, horiz5, horiz4,
3473 horiz8, horiz8, horiz7, horiz6,
3474 const20, const6, const3);
3475 ST8x2_UB(res0, dst, dst_stride);
3476 dst += 2 * dst_stride;
3477 ST8x2_UB(res1, dst, dst_stride);
3480 static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
3481 int32_t src_stride,
3482 uint8_t *dst,
3483 int32_t dst_stride)
3484 {
3485 uint8_t buff[272];
3487 hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3488 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3491 static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
3492 int32_t src_stride,
3493 uint8_t *dst,
3494 int32_t dst_stride)
3495 {
3496 v16u8 inp0, inp1, inp2, inp3;
3497 v16u8 res0, res1, avg0, avg1;
3498 v16u8 horiz0, horiz1, horiz2, horiz3;
3499 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3500 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3501 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3502 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3503 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3504 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3505 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3506 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3508 LD_UB2(src, src_stride, inp0, inp1);
3509 src += (2 * src_stride);
3510 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3511 mask2, mask3, const20,
3512 const6, const3);
3513 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3514 horiz0 = __msa_ave_u_b(inp0, res0);
3515 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3516 LD_UB2(src, src_stride, inp2, inp3);
3517 src += (2 * src_stride);
3518 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3519 mask2, mask3, const20,
3520 const6, const3);
3521 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3522 horiz2 = __msa_ave_u_b(inp2, res1);
3523 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3524 LD_UB2(src, src_stride, inp0, inp1);
3525 src += (2 * src_stride);
3526 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3527 mask2, mask3, const20,
3528 const6, const3);
3529 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3530 horiz4 = __msa_ave_u_b(inp0, res0);
3531 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3532 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3533 horiz1, horiz2, horiz3, horiz4,
3534 horiz1, horiz0, horiz0, horiz1,
3535 horiz2, horiz3, horiz4, horiz5,
3536 const20, const6, const3);
3537 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3538 res0 = __msa_ave_u_b(avg0, res0);
3539 ST8x2_UB(res0, dst, dst_stride);
3540 dst += (2 * dst_stride);
3542 LD_UB2(src, src_stride, inp2, inp3);
3543 src += (2 * src_stride);
3544 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3545 mask2, mask3, const20,
3546 const6, const3);
3547 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3548 horiz6 = __msa_ave_u_b(inp2, res1);
3549 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3551 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3552 mask2, mask3, const20,
3553 const6, const3);
3554 horiz8 = __msa_ave_u_b(inp0, res0);
3555 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3556 horiz3, horiz4, horiz5, horiz6,
3557 horiz3, horiz2, horiz1, horiz0,
3558 horiz4, horiz5, horiz6, horiz7,
3559 const20, const6, const3);
3560 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3561 res1 = __msa_ave_u_b(avg1, res1);
3562 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3563 horiz5, horiz6, horiz7, horiz8,
3564 horiz5, horiz4, horiz3, horiz2,
3565 horiz6, horiz7, horiz8, horiz8,
3566 const20, const6, const3);
3567 ST8x2_UB(res1, dst, dst_stride);
3568 dst += 2 * dst_stride;
3570 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3571 res0 = __msa_ave_u_b(avg0, res0);
3573 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3574 horiz7, horiz8, horiz8, horiz7,
3575 horiz7, horiz6, horiz5, horiz4,
3576 horiz8, horiz8, horiz7, horiz6,
3577 const20, const6, const3);
3578 ST8x2_UB(res0, dst, dst_stride);
3579 dst += 2 * dst_stride;
3581 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3582 res1 = __msa_ave_u_b(avg1, res1);
3583 ST8x2_UB(res1, dst, dst_stride);
3586 static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
3587 int32_t src_stride,
3588 uint8_t *dst,
3589 int32_t dst_stride)
3590 {
3591 uint8_t buff[272];
3593 hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3594 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3597 static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
3598 int32_t src_stride,
3599 uint8_t *dst,
3600 int32_t dst_stride)
3601 {
3602 v16u8 inp0, inp1, inp2, inp3;
3603 v16u8 res0, res1, avg0, avg1;
3604 v16u8 horiz0, horiz1, horiz2, horiz3;
3605 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3606 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3607 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3608 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3609 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3610 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3611 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3612 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3614 LD_UB2(src, src_stride, inp0, inp1);
3615 src += (2 * src_stride);
3616 horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3617 mask2, mask3, const20,
3618 const6, const3);
3619 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3620 LD_UB2(src, src_stride, inp2, inp3);
3621 src += (2 * src_stride);
3622 horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3623 mask2, mask3, const20,
3624 const6, const3);
3625 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3626 LD_UB2(src, src_stride, inp0, inp1);
3627 src += (2 * src_stride);
3628 horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3629 mask2, mask3, const20,
3630 const6, const3);
3631 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3632 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3633 horiz1, horiz2, horiz3, horiz4,
3634 horiz1, horiz0, horiz0, horiz1,
3635 horiz2, horiz3, horiz4, horiz5,
3636 const20, const6, const3);
3637 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3638 res0 = __msa_ave_u_b(avg0, res0);
3639 LD_UB2(src, src_stride, inp2, inp3);
3640 src += (2 * src_stride);
3641 ST8x2_UB(res0, dst, dst_stride);
3642 dst += 2 * dst_stride;
3644 horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3645 mask2, mask3, const20,
3646 const6, const3);
3647 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3648 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3649 horiz3, horiz4, horiz5, horiz6,
3650 horiz3, horiz2, horiz1, horiz0,
3651 horiz4, horiz5, horiz6, horiz7,
3652 const20, const6, const3);
3653 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3654 res1 = __msa_ave_u_b(avg1, res1);
3656 horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3657 mask2, mask3, const20,
3658 const6, const3);
3659 ST8x2_UB(res1, dst, dst_stride);
3660 dst += 2 * dst_stride;
3662 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3663 horiz5, horiz6, horiz7, horiz8,
3664 horiz5, horiz4, horiz3, horiz2,
3665 horiz6, horiz7, horiz8, horiz8,
3666 const20, const6, const3);
3667 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3668 res0 = __msa_ave_u_b(avg0, res0);
3669 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3670 horiz7, horiz8, horiz8, horiz7,
3671 horiz7, horiz6, horiz5, horiz4,
3672 horiz8, horiz8, horiz7, horiz6,
3673 const20, const6, const3);
3674 ST8x2_UB(res0, dst, dst_stride);
3675 dst += 2 * dst_stride;
3677 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3678 res1 = __msa_ave_u_b(avg1, res1);
3679 ST8x2_UB(res1, dst, dst_stride);
3682 static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
3683 int32_t src_stride,
3684 uint8_t *dst,
3685 int32_t dst_stride)
3686 {
3687 uint8_t buff[272];
3689 hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3690 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3693 static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
3694 int32_t src_stride,
3695 uint8_t *dst,
3696 int32_t dst_stride)
3697 {
3698 v16u8 inp0, inp1, inp2, inp3;
3699 v16u8 res0, res1, avg0, avg1;
3700 v16u8 horiz0, horiz1, horiz2, horiz3;
3701 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3702 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3703 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3704 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3705 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3706 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3707 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3708 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3710 LD_UB2(src, src_stride, inp0, inp1);
3711 src += (2 * src_stride);
3712 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3713 mask2, mask3, const20,
3714 const6, const3);
3715 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3717 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3718 horiz0 = __msa_ave_u_b(inp0, res0);
3719 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3720 LD_UB2(src, src_stride, inp2, inp3);
3721 src += (2 * src_stride);
3722 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3723 mask2, mask3, const20,
3724 const6, const3);
3725 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3727 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3728 horiz2 = __msa_ave_u_b(inp2, res1);
3729 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3730 LD_UB2(src, src_stride, inp0, inp1);
3731 src += (2 * src_stride);
3732 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3733 mask2, mask3, const20,
3734 const6, const3);
3736 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3737 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3738 horiz4 = __msa_ave_u_b(inp0, res0);
3739 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3740 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3741 horiz1, horiz2, horiz3, horiz4,
3742 horiz1, horiz0, horiz0, horiz1,
3743 horiz2, horiz3, horiz4, horiz5,
3744 const20, const6, const3);
3745 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3746 res0 = __msa_ave_u_b(avg0, res0);
3747 ST8x2_UB(res0, dst, dst_stride);
3748 dst += (2 * dst_stride);
3750 LD_UB2(src, src_stride, inp2, inp3);
3751 src += (2 * src_stride);
3752 res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3753 mask2, mask3, const20,
3754 const6, const3);
3755 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3757 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3758 horiz6 = __msa_ave_u_b(inp2, res1);
3759 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3760 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3761 horiz3, horiz4, horiz5, horiz6,
3762 horiz3, horiz2, horiz1, horiz0,
3763 horiz4, horiz5, horiz6, horiz7,
3764 const20, const6, const3);
3765 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3766 res1 = __msa_ave_u_b(avg1, res1);
3767 ST8x2_UB(res1, dst, dst_stride);
3768 dst += (2 * dst_stride);
3771 res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3772 mask2, mask3, const20,
3773 const6, const3);
3774 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3775 horiz8 = __msa_ave_u_b(inp0, res0);
3776 res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3777 horiz5, horiz6, horiz7, horiz8,
3778 horiz5, horiz4, horiz3, horiz2,
3779 horiz6, horiz7, horiz8, horiz8,
3780 const20, const6, const3);
3781 res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3782 horiz7, horiz8, horiz8, horiz7,
3783 horiz7, horiz6, horiz5, horiz4,
3784 horiz8, horiz8, horiz7, horiz6,
3785 const20, const6, const3);
3786 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3787 res0 = __msa_ave_u_b(avg0, res0);
3788 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3789 res1 = __msa_ave_u_b(avg1, res1);
3790 ST8x4_UB(res0, res1, dst, dst_stride);
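/* Horizontal pass for the rounded H+V cases: each row is filtered with the
 * 8-tap half-pel kernel implied by const20/const6/const3, i.e.
 * (-1, 3, -6, 20, 20, -6, 3, -1), and the result is averaged with the
 * unshifted source pixels (src0).  One extra row is produced after the loop
 * as the bottom input row of the vertical pass.
 *
 * Scalar sketch of one interior output pixel (illustrative only; the shuffle
 * masks mirror the outer taps at the block edges):
 *
 *     sum  = 20 * (s[0] + s[1]) - 6 * (s[-1] + s[2])
 *          +  3 * (s[-2] + s[3]) - (s[-3] + s[4]);
 *     hpel = av_clip_uint8((sum + 16) >> 5);
 *     out  = (hpel + s[0] + 1) >> 1;        // the "aver ... src0" average
 */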
3793 static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
3800 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3802 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3803 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3804 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3805 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3807 for (loop_count = (height >> 2); loop_count--;) {
3808 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3809 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3810 src += (4 * src_stride);
3811 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3812 const20, const6, const3);
3813 res = __msa_aver_u_b(inp0, res);
3817 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3818 const20, const6, const3);
3819 res = __msa_aver_u_b(inp2, res);
3823 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3824 const20, const6, const3);
3825 res = __msa_aver_u_b(inp4, res);
3829 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3830 const20, const6, const3);
3831 res = __msa_aver_u_b(inp6, res);
3836 LD_UB2(src, 1, inp0, inp1);
3837 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3838 res = __msa_aver_u_b(inp0, res);
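/* Rounded quarter-pel H+V cases.  The 16x16 versions run a horizontal pass
 * into the intermediate buffer "buff" and a vertical pass from it; "src0" /
 * "src1" in the names tell whether the filtered result is averaged with the
 * unshifted source or with the source shifted by one pixel in that
 * direction. */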
3842 static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
3849 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3850 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
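/* 8x8 equivalent of the function above, done entirely in registers: two
 * 8-pixel rows per vector, horiz0..horiz8 hold the nine filtered and averaged
 * rows, and each pair of vertical-filter output rows is averaged with the
 * matching horiz rows before the store. */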
3853 static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
3858 v16u8 inp0, inp1, inp2, inp3;
3859 v16u8 res0, res1, avg0, avg1;
3860 v16u8 horiz0, horiz1, horiz2, horiz3;
3861 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3862 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3863 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3864 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3865 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3866 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3867 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3868 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3870 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3871 src += (4 * src_stride);
3872 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3873 const20, const6, const3);
3874 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3875 const20, const6, const3);
3876 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3877 horiz0 = __msa_aver_u_b(inp0, res0);
3878 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3879 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3880 horiz2 = __msa_aver_u_b(inp2, res1);
3881 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3882 LD_UB2(src, src_stride, inp0, inp1);
3883 src += (2 * src_stride);
3884 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3885 const20, const6, const3);
3886 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3887 horiz4 = __msa_aver_u_b(inp0, res0);
3888 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3889 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3890 horiz1, horiz2, horiz3, horiz4,
3891 horiz1, horiz0, horiz0, horiz1,
3892 horiz2, horiz3, horiz4, horiz5,
3893 const20, const6, const3);
3894 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3895 res0 = __msa_aver_u_b(avg0, res0);
3896 ST8x2_UB(res0, dst, dst_stride);
3897 dst += (2 * dst_stride);
3899 LD_UB2(src, src_stride, inp2, inp3);
3900 src += (2 * src_stride);
3901 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3902 const20, const6, const3);
3903 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3904 horiz6 = __msa_aver_u_b(inp2, res1);
3905 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3906 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3907 horiz3, horiz4, horiz5, horiz6,
3908 horiz3, horiz2, horiz1, horiz0,
3909 horiz4, horiz5, horiz6, horiz7,
3910 const20, const6, const3);
3911 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3912 res1 = __msa_aver_u_b(avg1, res1);
3915 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3916 const20, const6, const3);
3917 horiz8 = __msa_aver_u_b(inp0, res0);
3918 ST8x2_UB(res1, dst, dst_stride);
3919 dst += 2 * dst_stride;
3921 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3922 horiz5, horiz6, horiz7, horiz8,
3923 horiz5, horiz4, horiz3, horiz2,
3924 horiz6, horiz7, horiz8, horiz8,
3925 const20, const6, const3);
3926 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3927 res0 = __msa_aver_u_b(avg0, res0);
3928 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3929 horiz7, horiz8, horiz8, horiz7,
3930 horiz7, horiz6, horiz5, horiz4,
3931 horiz8, horiz8, horiz7, horiz6,
3932 const20, const6, const3);
3933 ST8x2_UB(res0, dst, dst_stride);
3934 dst += 2 * dst_stride;
3935 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3936 res1 = __msa_aver_u_b(avg1, res1);
3937 ST8x2_UB(res1, dst, dst_stride);
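/* Horizontal pass without the extra source average: plain half-pel filtering
 * of height rows plus one trailing row (the bottom tap row of the vertical
 * pass).  Used by the *_v_* and the plain hv cases. */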
3940 static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
3947 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3949 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3950 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3951 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3952 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3954 for (loop_count = (height >> 2); loop_count--;) {
3955 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3956 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3957 src += (4 * src_stride);
3958 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3959 const20, const6, const3);
3963 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3964 const20, const6, const3);
3968 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3969 const20, const6, const3);
3973 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3974 const20, const6, const3);
3979 LD_UB2(src, 1, inp0, inp1);
3980 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
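/* aver_v cases: only the vertical filter output is averaged with the
 * horizontally filtered rows (the same-numbered row for src0, the next row
 * down for src1). */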
3984 static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
3991 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3992 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3995 static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
4000 v16u8 inp0, inp1, inp2, inp3;
4001 v16u8 res0, res1, avg0, avg1;
4002 v16u8 horiz0, horiz1, horiz2, horiz3;
4003 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4004 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4005 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4006 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4007 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4008 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4009 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4010 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4012 LD_UB2(src, src_stride, inp0, inp1);
4013 src += (2 * src_stride);
4014 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4015 mask0, mask1, mask2, mask3,
4016 const20, const6, const3);
4017 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4018 LD_UB2(src, src_stride, inp2, inp3);
4019 src += (2 * src_stride);
4020 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4021 mask0, mask1, mask2, mask3,
4022 const20, const6, const3);
4023 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4024 LD_UB2(src, src_stride, inp0, inp1);
4025 src += (2 * src_stride);
4026 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4027 mask0, mask1, mask2, mask3,
4028 const20, const6, const3);
4029 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4030 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4031 horiz1, horiz2, horiz3, horiz4,
4032 horiz1, horiz0, horiz0, horiz1,
4033 horiz2, horiz3, horiz4, horiz5,
4034 const20, const6, const3);
4035 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4036 res0 = __msa_aver_u_b(avg0, res0);
4037 ST8x2_UB(res0, dst, dst_stride);
4038 dst += (2 * dst_stride);
4040 LD_UB2(src, src_stride, inp2, inp3);
4041 src += (2 * src_stride);
4042 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4043 mask0, mask1, mask2, mask3,
4044 const20, const6, const3);
4045 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4046 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4047 horiz3, horiz4, horiz5, horiz6,
4048 horiz3, horiz2, horiz1, horiz0,
4049 horiz4, horiz5, horiz6, horiz7,
4050 const20, const6, const3);
4052 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4053 mask0, mask1, mask2, mask3,
4054 const20, const6, const3);
4055 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4056 res1 = __msa_aver_u_b(avg1, res1);
4057 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4058 horiz5, horiz6, horiz7, horiz8,
4059 horiz5, horiz4, horiz3, horiz2,
4060 horiz6, horiz7, horiz8, horiz8,
4061 const20, const6, const3);
4062 ST8x2_UB(res1, dst, dst_stride);
4063 dst += 2 * dst_stride;
4065 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4066 res0 = __msa_aver_u_b(avg0, res0);
4067 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4068 horiz7, horiz8, horiz8, horiz7,
4069 horiz7, horiz6, horiz5, horiz4,
4070 horiz8, horiz8, horiz7, horiz6,
4071 const20, const6, const3);
4072 ST8x2_UB(res0, dst, dst_stride);
4073 dst += 2 * dst_stride;
4074 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4075 res1 = __msa_aver_u_b(avg1, res1);
4076 ST8x2_UB(res1, dst, dst_stride);
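/* Same as hv_mc_qpel_aver_horiz_src0_16x16_msa above, but the filtered rows
 * are averaged with the pixels one column to the right (src + 1). */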
4079 static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
4086 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4088 v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
4089 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4090 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4091 v8u16 const20 = (v8u16) __msa_ldi_h(20);
4093 for (loop_count = (height >> 2); loop_count--;) {
4094 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4095 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4096 src += (4 * src_stride);
4097 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4098 const20, const6, const3);
4099 res = __msa_aver_u_b(res, inp1);
4103 res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4104 const20, const6, const3);
4105 res = __msa_aver_u_b(res, inp3);
4109 res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4110 const20, const6, const3);
4111 res = __msa_aver_u_b(res, inp5);
4115 res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4116 const20, const6, const3);
4117 res = __msa_aver_u_b(res, inp7);
4122 LD_UB2(src, 1, inp0, inp1);
4123 res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4124 res = __msa_aver_u_b(inp1, res);
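/* aver_hv, src10: horizontal average with src + 1, vertical average with the
 * same-numbered row. */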
4128 static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
4135 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4136 vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4139 static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
4144 v16u8 inp0, inp1, inp2, inp3;
4145 v16u8 res0, res1, avg0, avg1;
4146 v16u8 horiz0, horiz1, horiz2, horiz3;
4147 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4148 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4149 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4150 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4151 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4152 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4153 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4154 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4156 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4157 src += (4 * src_stride);
4158 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4159 const20, const6, const3);
4160 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4161 const20, const6, const3);
4162 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4164 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4165 horiz0 = __msa_aver_u_b(inp0, res0);
4166 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4167 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4169 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4170 horiz2 = __msa_aver_u_b(inp2, res1);
4171 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4172 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4173 src += (4 * src_stride);
4174 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4175 const20, const6, const3);
4176 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4177 const20, const6, const3);
4178 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4180 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4181 horiz4 = __msa_aver_u_b(inp0, res0);
4182 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4183 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4185 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4186 horiz6 = __msa_aver_u_b(inp2, res1);
4187 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4188 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4189 horiz1, horiz2, horiz3, horiz4,
4190 horiz1, horiz0, horiz0, horiz1,
4191 horiz2, horiz3, horiz4, horiz5,
4192 const20, const6, const3);
4193 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4194 res0 = __msa_aver_u_b(avg0, res0);
4195 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4196 horiz3, horiz4, horiz5, horiz6,
4197 horiz3, horiz2, horiz1, horiz0,
4198 horiz4, horiz5, horiz6, horiz7,
4199 const20, const6, const3);
4200 ST8x2_UB(res0, dst, dst_stride);
4201 dst += 2 * dst_stride;
4204 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4205 const20, const6, const3);
4206 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4207 res1 = __msa_aver_u_b(avg1, res1);
4208 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4209 horiz8 = __msa_aver_u_b(inp0, res0);
4210 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4211 horiz5, horiz6, horiz7, horiz8,
4212 horiz5, horiz4, horiz3, horiz2,
4213 horiz6, horiz7, horiz8, horiz8,
4214 const20, const6, const3);
4215 ST8x2_UB(res1, dst, dst_stride);
4216 dst += 2 * dst_stride;
4218 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4219 res0 = __msa_aver_u_b(avg0, res0);
4220 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4221 horiz7, horiz8, horiz8, horiz7,
4222 horiz7, horiz6, horiz5, horiz4,
4223 horiz8, horiz8, horiz7, horiz6,
4224 const20, const6, const3);
4225 ST8x2_UB(res0, dst, dst_stride);
4226 dst += 2 * dst_stride;
4228 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4229 res1 = __msa_aver_u_b(avg1, res1);
4230 ST8x2_UB(res1, dst, dst_stride);
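/* aver_h cases: only the horizontal pass averages with the source; the
 * vertical pass stores the plain filter output. */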
4233 static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
4240 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4241 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4244 static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
4249 v16u8 inp0, inp1, inp2, inp3;
4251 v16u8 horiz0, horiz1, horiz2, horiz3;
4252 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4253 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4254 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4255 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4256 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4257 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4258 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4259 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4261 LD_UB2(src, src_stride, inp0, inp1);
4262 src += (2 * src_stride);
4263 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4264 const20, const6, const3);
4265 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4266 horiz0 = __msa_aver_u_b(inp0, res0);
4267 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4269 LD_UB2(src, src_stride, inp2, inp3);
4270 src += (2 * src_stride);
4271 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4272 const20, const6, const3);
4273 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4274 horiz2 = __msa_aver_u_b(inp2, res1);
4275 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4276 LD_UB2(src, src_stride, inp0, inp1);
4277 src += (2 * src_stride);
4278 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4279 const20, const6, const3);
4280 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4281 horiz4 = __msa_aver_u_b(inp0, res0);
4282 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4283 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4284 horiz1, horiz2, horiz3, horiz4,
4285 horiz1, horiz0, horiz0, horiz1,
4286 horiz2, horiz3, horiz4, horiz5,
4287 const20, const6, const3);
4288 ST8x2_UB(res0, dst, dst_stride);
4289 dst += (2 * dst_stride);
4291 LD_UB2(src, src_stride, inp2, inp3);
4292 src += (2 * src_stride);
4293 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4294 const20, const6, const3);
4295 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4296 horiz6 = __msa_aver_u_b(inp2, res1);
4297 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4298 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4299 horiz3, horiz4, horiz5, horiz6,
4300 horiz3, horiz2, horiz1, horiz0,
4301 horiz4, horiz5, horiz6, horiz7,
4302 const20, const6, const3);
4304 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4305 const20, const6, const3);
4306 horiz8 = __msa_aver_u_b(inp0, res0);
4307 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4308 horiz5, horiz6, horiz7, horiz8,
4309 horiz5, horiz4, horiz3, horiz2,
4310 horiz6, horiz7, horiz8, horiz8,
4311 const20, const6, const3);
4312 ST8x2_UB(res1, dst, dst_stride);
4313 dst += 2 * dst_stride;
4315 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4316 horiz7, horiz8, horiz8, horiz7,
4317 horiz7, horiz6, horiz5, horiz4,
4318 horiz8, horiz8, horiz7, horiz6,
4319 const20, const6, const3);
4320 ST8x2_UB(res0, dst, dst_stride);
4321 dst += 2 * dst_stride;
4322 ST8x2_UB(res1, dst, dst_stride);
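/* Centre case: plain half-pel filtering in both directions, with no source
 * averaging at all. */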
4325 static void hv_mc_qpel_16x16_msa(const uint8_t *src,
4332 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4333 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4336 static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
4337 uint8_t *dst, int32_t dst_stride)
4339 v16u8 inp0, inp1, inp2, inp3;
4341 v16u8 horiz0, horiz1, horiz2, horiz3;
4342 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4343 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4344 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4345 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4346 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4347 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4348 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4349 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4351 LD_UB2(src, src_stride, inp0, inp1);
4352 src += (2 * src_stride);
4353 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4354 mask0, mask1, mask2, mask3,
4355 const20, const6, const3);
4356 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4357 LD_UB2(src, src_stride, inp2, inp3);
4358 src += (2 * src_stride);
4359 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4360 mask0, mask1, mask2, mask3,
4361 const20, const6, const3);
4362 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4363 LD_UB2(src, src_stride, inp0, inp1);
4364 src += (2 * src_stride);
4365 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4366 mask0, mask1, mask2, mask3,
4367 const20, const6, const3);
4368 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4369 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4370 horiz1, horiz2, horiz3, horiz4,
4371 horiz1, horiz0, horiz0, horiz1,
4372 horiz2, horiz3, horiz4, horiz5,
4373 const20, const6, const3);
4374 ST8x2_UB(res0, dst, dst_stride);
4375 dst += (2 * dst_stride);
4377 LD_UB2(src, src_stride, inp2, inp3);
4378 src += (2 * src_stride);
4379 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4380 mask0, mask1, mask2, mask3,
4381 const20, const6, const3);
4382 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4383 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4384 horiz3, horiz4, horiz5, horiz6,
4385 horiz3, horiz2, horiz1, horiz0,
4386 horiz4, horiz5, horiz6, horiz7,
4387 const20, const6, const3);
4389 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4390 mask0, mask1, mask2, mask3,
4391 const20, const6, const3);
4392 ST8x2_UB(res1, dst, dst_stride);
4393 dst += 2 * dst_stride;
4395 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4396 horiz5, horiz6, horiz7, horiz8,
4397 horiz5, horiz4, horiz3, horiz2,
4398 horiz6, horiz7, horiz8, horiz8,
4399 const20, const6, const3);
4400 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4401 horiz7, horiz8, horiz8, horiz7,
4402 horiz7, horiz6, horiz5, horiz4,
4403 horiz8, horiz8, horiz7, horiz6,
4404 const20, const6, const3);
4405 ST8x2_UB(res0, dst, dst_stride);
4406 dst += 2 * dst_stride;
4407 ST8x2_UB(res1, dst, dst_stride);
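/* aver_h with the source shifted one column to the right (src + 1). */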
4410 static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
4417 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4418 vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4421 static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
4426 v16u8 inp0, inp1, inp2, inp3;
4428 v16u8 horiz0, horiz1, horiz2, horiz3;
4429 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4430 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4431 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4432 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4433 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4434 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4435 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4436 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4438 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4439 src += (4 * src_stride);
4441 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4442 const20, const6, const3);
4443 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4444 const20, const6, const3);
4445 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4447 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4448 horiz0 = __msa_aver_u_b(inp0, res0);
4449 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4450 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4452 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4453 horiz2 = __msa_aver_u_b(inp2, res1);
4454 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4455 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4456 src += (4 * src_stride);
4457 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4458 const20, const6, const3);
4459 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4460 const20, const6, const3);
4461 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4463 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4464 horiz4 = __msa_aver_u_b(inp0, res0);
4465 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4466 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4468 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4469 horiz6 = __msa_aver_u_b(inp2, res1);
4470 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4472 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4473 const20, const6, const3);
4474 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4475 horiz8 = __msa_aver_u_b(inp0, res0);
4476 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4477 horiz1, horiz2, horiz3, horiz4,
4478 horiz1, horiz0, horiz0, horiz1,
4479 horiz2, horiz3, horiz4, horiz5,
4480 const20, const6, const3);
4481 ST8x2_UB(res0, dst, dst_stride);
4482 dst += (2 * dst_stride);
4484 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4485 horiz3, horiz4, horiz5, horiz6,
4486 horiz3, horiz2, horiz1, horiz0,
4487 horiz4, horiz5, horiz6, horiz7,
4488 const20, const6, const3);
4489 ST8x2_UB(res1, dst, dst_stride);
4490 dst += (2 * dst_stride);
4492 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4493 horiz5, horiz6, horiz7, horiz8,
4494 horiz5, horiz4, horiz3, horiz2,
4495 horiz6, horiz7, horiz8, horiz8,
4496 const20, const6, const3);
4497 ST8x2_UB(res0, dst, dst_stride);
4498 dst += (2 * dst_stride);
4500 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4501 horiz7, horiz8, horiz8, horiz7,
4502 horiz7, horiz6, horiz5, horiz4,
4503 horiz8, horiz8, horiz7, horiz6,
4504 const20, const6, const3);
4505 ST8x2_UB(res1, dst, dst_stride);
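/* aver_hv, src01: horizontal average with the unshifted source, vertical
 * average with the next row down. */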
4508 static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
4515 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4516 vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4519 static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
4524 v16u8 inp0, inp1, inp2, inp3;
4525 v16u8 res0, res1, avg0, avg1;
4526 v16u8 horiz0, horiz1, horiz2, horiz3;
4527 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4528 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4529 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4530 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4531 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4532 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4533 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4534 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4536 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4537 src += (4 * src_stride);
4539 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4540 const20, const6, const3);
4541 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4542 const20, const6, const3);
4543 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4544 horiz0 = __msa_aver_u_b(inp0, res0);
4545 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4546 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4547 horiz2 = __msa_aver_u_b(inp2, res1);
4548 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4549 LD_UB2(src, src_stride, inp0, inp1);
4550 src += (2 * src_stride);
4552 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4553 const20, const6, const3);
4554 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4555 horiz4 = __msa_aver_u_b(inp0, res0);
4556 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4557 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4558 horiz1, horiz2, horiz3, horiz4,
4559 horiz1, horiz0, horiz0, horiz1,
4560 horiz2, horiz3, horiz4, horiz5,
4561 const20, const6, const3);
4562 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4563 res0 = __msa_aver_u_b(avg0, res0);
4564 ST8x2_UB(res0, dst, dst_stride);
4565 dst += (2 * dst_stride);
4567 LD_UB2(src, src_stride, inp2, inp3);
4568 src += (2 * src_stride);
4569 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4570 const20, const6, const3);
4571 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4572 horiz6 = __msa_aver_u_b(inp2, res1);
4573 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4575 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4576 const20, const6, const3);
4577 horiz8 = __msa_aver_u_b(inp0, res0);
4578 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4579 horiz3, horiz4, horiz5, horiz6,
4580 horiz3, horiz2, horiz1, horiz0,
4581 horiz4, horiz5, horiz6, horiz7,
4582 const20, const6, const3);
4583 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4584 res1 = __msa_aver_u_b(avg1, res1);
4585 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4586 horiz5, horiz6, horiz7, horiz8,
4587 horiz5, horiz4, horiz3, horiz2,
4588 horiz6, horiz7, horiz8, horiz8,
4589 const20, const6, const3);
4590 ST8x2_UB(res1, dst, dst_stride);
4591 dst += 2 * dst_stride;
4593 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4594 res0 = __msa_aver_u_b(avg0, res0);
4595 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4596 horiz7, horiz8, horiz8, horiz7,
4597 horiz7, horiz6, horiz5, horiz4,
4598 horiz8, horiz8, horiz7, horiz6,
4599 const20, const6, const3);
4600 ST8x2_UB(res0, dst, dst_stride);
4601 dst += 2 * dst_stride;
4603 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4604 res1 = __msa_aver_u_b(avg1, res1);
4605 ST8x2_UB(res1, dst, dst_stride);
4606 dst += (2 * dst_stride);
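/* aver_v with the next row down (src1). */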
4609 static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
4616 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4617 vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4620 static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
4625 v16u8 inp0, inp1, inp2, inp3;
4626 v16u8 res0, res1, avg0, avg1;
4627 v16u8 horiz0, horiz1, horiz2, horiz3;
4628 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4629 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4630 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4631 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4632 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4633 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4634 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4635 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4637 LD_UB2(src, src_stride, inp0, inp1);
4638 src += (2 * src_stride);
4639 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4640 mask0, mask1, mask2, mask3,
4641 const20, const6, const3);
4642 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4643 LD_UB2(src, src_stride, inp2, inp3);
4644 src += (2 * src_stride);
4645 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4646 mask0, mask1, mask2, mask3,
4647 const20, const6, const3);
4648 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4649 LD_UB2(src, src_stride, inp0, inp1);
4650 src += (2 * src_stride);
4651 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4652 mask0, mask1, mask2, mask3,
4653 const20, const6, const3);
4654 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4656 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4657 horiz1, horiz2, horiz3, horiz4,
4658 horiz1, horiz0, horiz0, horiz1,
4659 horiz2, horiz3, horiz4, horiz5,
4660 const20, const6, const3);
4661 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4662 res0 = __msa_aver_u_b(avg0, res0);
4663 ST8x2_UB(res0, dst, dst_stride);
4664 dst += (2 * dst_stride);
4666 LD_UB2(src, src_stride, inp2, inp3);
4667 src += (2 * src_stride);
4668 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4669 mask0, mask1, mask2, mask3,
4670 const20, const6, const3);
4671 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4672 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4673 horiz3, horiz4, horiz5, horiz6,
4674 horiz3, horiz2, horiz1, horiz0,
4675 horiz4, horiz5, horiz6, horiz7,
4676 const20, const6, const3);
4678 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4679 mask0, mask1, mask2, mask3,
4680 const20, const6, const3);
4681 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4682 res1 = __msa_aver_u_b(avg1, res1);
4683 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4684 horiz5, horiz6, horiz7, horiz8,
4685 horiz5, horiz4, horiz3, horiz2,
4686 horiz6, horiz7, horiz8, horiz8,
4687 const20, const6, const3);
4688 ST8x2_UB(res1, dst, dst_stride);
4689 dst += 2 * dst_stride;
4690 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4691 res0 = __msa_aver_u_b(avg0, res0);
4693 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4694 horiz7, horiz8, horiz8, horiz7,
4695 horiz7, horiz6, horiz5, horiz4,
4696 horiz8, horiz8, horiz7, horiz6,
4697 const20, const6, const3);
4698 ST8x2_UB(res0, dst, dst_stride);
4699 dst += 2 * dst_stride;
4700 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4701 res1 = __msa_aver_u_b(avg1, res1);
4702 ST8x2_UB(res1, dst, dst_stride);
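/* aver_hv, src11: horizontal average with src + 1, vertical average with the
 * next row down. */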
4705 static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
4712 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4713 vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4716 static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
4718 uint8_t *dst, int32_t dst_stride)
4720 v16u8 inp0, inp1, inp2, inp3;
4721 v16u8 res0, res1, avg0, avg1;
4722 v16u8 horiz0, horiz1, horiz2, horiz3;
4723 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4724 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4725 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4726 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4727 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4728 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4729 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4730 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4732 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4733 src += (4 * src_stride);
4734 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4735 mask0, mask1, mask2, mask3,
4736 const20, const6, const3);
4737 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4739 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4740 horiz0 = __msa_aver_u_b(inp0, res0);
4741 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4742 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4743 const20, const6, const3);
4744 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4746 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4747 horiz2 = __msa_aver_u_b(inp2, res1);
4748 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4749 LD_UB2(src, src_stride, inp0, inp1);
4750 src += (2 * src_stride);
4751 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4752 const20, const6, const3);
4753 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4755 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4756 horiz4 = __msa_aver_u_b(inp0, res0);
4757 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4758 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4759 horiz1, horiz2, horiz3, horiz4,
4760 horiz1, horiz0, horiz0, horiz1,
4761 horiz2, horiz3, horiz4, horiz5,
4762 const20, const6, const3);
4763 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4764 res0 = __msa_aver_u_b(avg0, res0);
4765 LD_UB2(src, src_stride, inp2, inp3);
4766 src += (2 * src_stride);
4767 ST8x2_UB(res0, dst, dst_stride);
4768 dst += 2 * dst_stride;
4770 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4771 const20, const6, const3);
4772 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4774 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4775 horiz6 = __msa_aver_u_b(inp2, res1);
4776 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4777 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4778 horiz3, horiz4, horiz5, horiz6,
4779 horiz3, horiz2, horiz1, horiz0,
4780 horiz4, horiz5, horiz6, horiz7,
4781 const20, const6, const3);
4782 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4783 res1 = __msa_aver_u_b(avg1, res1);
4785 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4786 const20, const6, const3);
4787 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4788 horiz8 = __msa_aver_u_b(inp0, res0);
4789 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4790 horiz5, horiz6, horiz7, horiz8,
4791 horiz5, horiz4, horiz3, horiz2,
4792 horiz6, horiz7, horiz8, horiz8,
4793 const20, const6, const3);
4794 ST8x2_UB(res1, dst, dst_stride);
4795 dst += 2 * dst_stride;
4797 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4798 res0 = __msa_aver_u_b(avg0, res0);
4799 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4800 horiz7, horiz8, horiz8, horiz7,
4801 horiz7, horiz6, horiz5, horiz4,
4802 horiz8, horiz8, horiz7, horiz6,
4803 const20, const6, const3);
4804 ST8x2_UB(res0, dst, dst_stride);
4805 dst += 2 * dst_stride;
4807 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4808 res1 = __msa_aver_u_b(avg1, res1);
4809 ST8x2_UB(res1, dst, dst_stride);
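/* The hv_mc_qpel_avg_dst_* family computes the same interpolations as above
 * but additionally averages each result with the pixels already in dst
 * (roughly dst[x] = (dst[x] + mc[x] + 1) >> 1), which in the 8x8 bodies shows
 * up as one extra __msa_aver_u_b() against dst0/dst1 before every store. */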
4812 static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
4819 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4820 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4823 static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
4828 v16u8 inp0, inp1, inp2, inp3;
4829 v16u8 res0, res1, avg0, avg1;
4830 v16u8 horiz0, horiz1, horiz2, horiz3;
4831 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4833 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4834 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4835 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4836 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4837 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4838 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4839 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4841 LD_UB2(src, src_stride, inp0, inp1);
4842 src += (2 * src_stride);
4843 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4844 const20, const6, const3);
4845 LD_UB2(src, src_stride, inp2, inp3);
4846 src += (2 * src_stride);
4847 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4848 horiz0 = __msa_aver_u_b(inp0, res0);
4849 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4850 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4851 const20, const6, const3);
4852 LD_UB2(src, src_stride, inp0, inp1);
4853 src += (2 * src_stride);
4854 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4855 horiz2 = __msa_aver_u_b(inp2, res1);
4856 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4857 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4858 const20, const6, const3);
4859 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4860 horiz4 = __msa_aver_u_b(inp0, res0);
4861 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4862 LD_UB2(dst, dst_stride, dst0, dst1);
4863 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4864 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4865 horiz1, horiz2, horiz3, horiz4,
4866 horiz1, horiz0, horiz0, horiz1,
4867 horiz2, horiz3, horiz4, horiz5,
4868 const20, const6, const3);
4869 res0 = __msa_aver_u_b(avg0, res0);
4870 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4871 res0 = __msa_aver_u_b(avg0, res0);
4872 ST8x2_UB(res0, dst, dst_stride);
4873 dst += (2 * dst_stride);
4875 LD_UB2(src, src_stride, inp2, inp3);
4876 src += (2 * src_stride);
4877 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4878 const20, const6, const3);
4879 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4880 horiz6 = __msa_aver_u_b(inp2, res1);
4881 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4882 LD_UB2(dst, dst_stride, dst0, dst1);
4883 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4884 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4885 horiz3, horiz4, horiz5, horiz6,
4886 horiz3, horiz2, horiz1, horiz0,
4887 horiz4, horiz5, horiz6, horiz7,
4888 const20, const6, const3);
4889 res1 = __msa_aver_u_b(avg1, res1);
4890 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4891 res1 = __msa_aver_u_b(avg1, res1);
4892 ST8x2_UB(res1, dst, dst_stride);
4893 dst += (2 * dst_stride);
4896 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4897 const20, const6, const3);
4898 horiz8 = __msa_aver_u_b(inp0, res0);
4899 LD_UB2(dst, dst_stride, dst0, dst1);
4900 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4901 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4902 horiz5, horiz6, horiz7, horiz8,
4903 horiz5, horiz4, horiz3, horiz2,
4904 horiz6, horiz7, horiz8, horiz8,
4905 const20, const6, const3);
4906 res0 = __msa_aver_u_b(avg0, res0);
4907 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4908 res0 = __msa_aver_u_b(avg0, res0);
4909 ST8x2_UB(res0, dst, dst_stride);
4910 dst += (2 * dst_stride);
4912 LD_UB2(dst, dst_stride, dst0, dst1);
4913 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4914 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4915 horiz7, horiz8, horiz8, horiz7,
4916 horiz7, horiz6, horiz5, horiz4,
4917 horiz8, horiz8, horiz7, horiz6,
4918 const20, const6, const3);
4919 res1 = __msa_aver_u_b(avg1, res1);
4920 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4921 res1 = __msa_aver_u_b(avg1, res1);
4922 ST8x2_UB(res1, dst, dst_stride);
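/* avg_dst + vertical source average with the same-numbered row. */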
4925 static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
4932 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4933 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4936 static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
4941 v16u8 inp0, inp1, inp2, inp3;
4942 v16u8 res0, res1, avg0, avg1;
4943 v16u8 horiz0, horiz1, horiz2, horiz3;
4944 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4946 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4947 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4948 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4949 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4950 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4951 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4952 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4954 LD_UB2(src, src_stride, inp0, inp1);
4955 src += (2 * src_stride);
4956 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4957 mask0, mask1, mask2, mask3,
4958 const20, const6, const3);
4959 LD_UB2(src, src_stride, inp2, inp3);
4960 src += (2 * src_stride);
4961 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4962 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4963 mask0, mask1, mask2, mask3,
4964 const20, const6, const3);
4965 LD_UB2(src, src_stride, inp0, inp1);
4966 src += (2 * src_stride);
4967 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4968 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4969 mask0, mask1, mask2, mask3,
4970 const20, const6, const3);
4971 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4972 LD_UB2(dst, dst_stride, dst0, dst1);
4973 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4974 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4975 horiz1, horiz2, horiz3, horiz4,
4976 horiz1, horiz0, horiz0, horiz1,
4977 horiz2, horiz3, horiz4, horiz5,
4978 const20, const6, const3);
4979 res0 = __msa_aver_u_b(avg0, res0);
4980 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4981 res0 = __msa_aver_u_b(avg0, res0);
4982 ST8x2_UB(res0, dst, dst_stride);
4983 dst += (2 * dst_stride);
4985 LD_UB2(src, src_stride, inp2, inp3);
4986 src += (2 * src_stride);
4987 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4988 mask0, mask1, mask2, mask3,
4989 const20, const6, const3);
4990 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4991 LD_UB2(dst, dst_stride, dst0, dst1);
4992 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4993 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4994 horiz3, horiz4, horiz5, horiz6,
4995 horiz3, horiz2, horiz1, horiz0,
4996 horiz4, horiz5, horiz6, horiz7,
4997 const20, const6, const3);
4998 res1 = __msa_aver_u_b(avg1, res1);
4999 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5000 res1 = __msa_aver_u_b(avg1, res1);
5001 ST8x2_UB(res1, dst, dst_stride);
5002 dst += (2 * dst_stride);
5005 horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5006 mask0, mask1, mask2, mask3,
5007 const20, const6, const3);
5008 LD_UB2(dst, dst_stride, dst0, dst1);
5009 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5010 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5011 horiz5, horiz6, horiz7, horiz8,
5012 horiz5, horiz4, horiz3, horiz2,
5013 horiz6, horiz7, horiz8, horiz8,
5014 const20, const6, const3);
5015 res0 = __msa_aver_u_b(avg0, res0);
5016 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5017 res0 = __msa_aver_u_b(avg0, res0);
5018 ST8x2_UB(res0, dst, dst_stride);
5019 dst += (2 * dst_stride);
5021 LD_UB2(dst, dst_stride, dst0, dst1);
5022 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5023 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5024 horiz7, horiz8, horiz8, horiz7,
5025 horiz7, horiz6, horiz5, horiz4,
5026 horiz8, horiz8, horiz7, horiz6,
5027 const20, const6, const3);
5028 res1 = __msa_aver_u_b(avg1, res1);
5029 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5030 res1 = __msa_aver_u_b(avg1, res1);
5031 ST8x2_UB(res1, dst, dst_stride);
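/* avg_dst + horizontal average with src + 1 and vertical average with the
 * same-numbered row. */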
5034 static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
5041 hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5042 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
5045 static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
5050 v16u8 inp0, inp1, inp2, inp3;
5051 v16u8 res0, res1, avg0, avg1;
5052 v16u8 horiz0, horiz1, horiz2, horiz3;
5053 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5055 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5056 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5057 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5058 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5059 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5060 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5061 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5063 LD_UB2(src, src_stride, inp0, inp1);
5064 src += (2 * src_stride);
5065 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5066 const20, const6, const3);
5068 LD_UB2(src, src_stride, inp2, inp3);
5069 src += (2 * src_stride);
5070 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5072 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5073 horiz0 = __msa_aver_u_b(inp0, res0);
5074 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5075 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5076 const20, const6, const3);
5077 LD_UB2(src, src_stride, inp0, inp1);
5078 src += (2 * src_stride);
5079 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5081 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5082 horiz2 = __msa_aver_u_b(inp2, res1);
5083 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5084 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5085 const20, const6, const3);
5087 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5089 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5090 horiz4 = __msa_aver_u_b(inp0, res0);
5091 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5092 LD_UB2(dst, dst_stride, dst0, dst1);
5093 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5094 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5095 horiz1, horiz2, horiz3, horiz4,
5096 horiz1, horiz0, horiz0, horiz1,
5097 horiz2, horiz3, horiz4, horiz5,
5098 const20, const6, const3);
5099 res0 = __msa_aver_u_b(avg0, res0);
5100 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5101 res0 = __msa_aver_u_b(avg0, res0);
5102 ST8x2_UB(res0, dst, dst_stride);
5103 dst += (2 * dst_stride);
5105 LD_UB2(src, src_stride, inp2, inp3);
5106 src += (2 * src_stride);
5107 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5108 const20, const6, const3);
5110 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5112 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5113 horiz6 = __msa_aver_u_b(inp2, res1);
5114 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5115 LD_UB2(dst, dst_stride, dst0, dst1);
5116 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5117 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5118 horiz3, horiz4, horiz5, horiz6,
5119 horiz3, horiz2, horiz1, horiz0,
5120 horiz4, horiz5, horiz6, horiz7,
5121 const20, const6, const3);
5122 res1 = __msa_aver_u_b(avg1, res1);
5123 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5124 res1 = __msa_aver_u_b(avg1, res1);
5125 ST8x2_UB(res1, dst, dst_stride);
5126 dst += (2 * dst_stride);
5129 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5130 const20, const6, const3);
5131 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5132 horiz8 = __msa_aver_u_b(inp0, res0);
5133 LD_UB2(dst, dst_stride, dst0, dst1);
5134 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5135 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5136 horiz5, horiz6, horiz7, horiz8,
5137 horiz5, horiz4, horiz3, horiz2,
5138 horiz6, horiz7, horiz8, horiz8,
5139 const20, const6, const3);
5140 res0 = __msa_aver_u_b(avg0, res0);
5141 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5142 res0 = __msa_aver_u_b(avg0, res0);
5143 ST8x2_UB(res0, dst, dst_stride);
5144 dst += (2 * dst_stride);
5146 LD_UB2(dst, dst_stride, dst0, dst1);
5147 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5148 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5149 horiz7, horiz8, horiz8, horiz7,
5150 horiz7, horiz6, horiz5, horiz4,
5151 horiz8, horiz8, horiz7, horiz6,
5152 const20, const6, const3);
5153 res1 = __msa_aver_u_b(avg1, res1);
5154 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5155 res1 = __msa_aver_u_b(avg1, res1);
5156 ST8x2_UB(res1, dst, dst_stride);
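/* avg_dst + horizontal average with the unshifted source; no vertical source
 * average. */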
5159 static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
5166 hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
5167 vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
5170 static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
5175 v16u8 inp0, inp1, inp2, inp3;
5176 v16u8 res0, res1, avg0, avg1;
5177 v16u8 horiz0, horiz1, horiz2, horiz3;
5178 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5180 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5181 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5182 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5183 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5184 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5185 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5186 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5188 LD_UB2(src, src_stride, inp0, inp1);
5189 src += (2 * src_stride);
5190 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5191 const20, const6, const3);
5192 LD_UB2(src, src_stride, inp2, inp3);
5193 src += (2 * src_stride);
5194 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5195 horiz0 = __msa_aver_u_b(inp0, res0);
5196 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5197 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5198 const20, const6, const3);
5199 LD_UB2(src, src_stride, inp0, inp1);
5200 src += (2 * src_stride);
5201 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5202 horiz2 = __msa_aver_u_b(inp2, res1);
5203 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5204 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5205 const20, const6, const3);
5206 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5207 horiz4 = __msa_aver_u_b(inp0, res0);
5208 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5209 LD_UB2(dst, dst_stride, dst0, dst1);
5210 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5211 horiz1, horiz2, horiz3, horiz4,
5212 horiz1, horiz0, horiz0, horiz1,
5213 horiz2, horiz3, horiz4, horiz5,
5214 const20, const6, const3);
5215 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5216 res0 = __msa_aver_u_b(avg0, res0);
5217 ST8x2_UB(res0, dst, dst_stride);
5218 dst += (2 * dst_stride);
5220 LD_UB2(src, src_stride, inp2, inp3);
5221 src += (2 * src_stride);
5222 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5223 const20, const6, const3);
5224 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5225 horiz6 = __msa_aver_u_b(inp2, res1);
5226 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5227 LD_UB2(dst, dst_stride, dst0, dst1);
5228 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5229 horiz3, horiz4, horiz5, horiz6,
5230 horiz3, horiz2, horiz1, horiz0,
5231 horiz4, horiz5, horiz6, horiz7,
5232 const20, const6, const3);
5233 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5234 res1 = __msa_aver_u_b(avg1, res1);
5235 ST8x2_UB(res1, dst, dst_stride);
5236 dst += (2 * dst_stride);
5239 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5240 const20, const6, const3);
5241 horiz8 = __msa_aver_u_b(inp0, res0);
5242 LD_UB2(dst, dst_stride, dst0, dst1);
5243 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5244 horiz5, horiz6, horiz7, horiz8,
5245 horiz5, horiz4, horiz3, horiz2,
5246 horiz6, horiz7, horiz8, horiz8,
5247 const20, const6, const3);
5248 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5249 res0 = __msa_aver_u_b(avg0, res0);
5250 ST8x2_UB(res0, dst, dst_stride);
5251 dst += (2 * dst_stride);
5253 LD_UB2(dst, dst_stride, dst0, dst1);
5254 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5255 horiz7, horiz8, horiz8, horiz7,
5256 horiz7, horiz6, horiz5, horiz4,
5257 horiz8, horiz8, horiz7, horiz6,
5258 const20, const6, const3);
5259 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5260 res1 = __msa_aver_u_b(avg1, res1);
5261 ST8x2_UB(res1, dst, dst_stride);
5262 dst += (2 * dst_stride);
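/* avg_dst centre case: plain H+V half-pel filtering, averaged only with
 * dst. */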
5265 static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
5266 uint8_t *dst, int32_t dst_stride)
5270 hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
5271 vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
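/* 8x8 avg_dst centre case: all nine horizontally filtered rows are computed
 * up front, then each pair of output rows is vertically filtered and averaged
 * with the two corresponding dst rows before the store. */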
5275 static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
5276 uint8_t *dst, int32_t dst_stride)
5278 v16u8 inp0, inp1, inp2, inp3;
5279 v16u8 res0, res1, avg0, avg1;
5280 v16u8 horiz0, horiz1, horiz2, horiz3;
5281 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5283 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5284 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5285 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5286 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5287 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5288 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5289 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5291 LD_UB2(src, src_stride, inp0, inp1);
5292 src += (2 * src_stride);
5293 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5294 mask0, mask1, mask2, mask3,
5295 const20, const6, const3);
5296 LD_UB2(src, src_stride, inp2, inp3);
5297 src += (2 * src_stride);
5298 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5299 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5300 mask0, mask1, mask2, mask3,
5301 const20, const6, const3);
5302 LD_UB2(src, src_stride, inp0, inp1);
5303 src += (2 * src_stride);
5304 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5305 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5306 mask0, mask1, mask2, mask3,
5307 const20, const6, const3);
5308 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5309 LD_UB2(src, src_stride, inp2, inp3);
5310 src += (2 * src_stride);
5311 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5312 mask0, mask1, mask2, mask3,
5313 const20, const6, const3);
5314 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    inp0 = LD_UB(src);  /* assumed: load the ninth source row for the final row */
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5317 mask0, mask1, mask2, mask3,
5318 const20, const6, const3);
5319 LD_UB2(dst, dst_stride, dst0, dst1);
5320 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5321 horiz1, horiz2, horiz3, horiz4,
5322 horiz1, horiz0, horiz0, horiz1,
5323 horiz2, horiz3, horiz4, horiz5,
5324 const20, const6, const3);
5325 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5326 res0 = __msa_aver_u_b(avg0, res0);
5327 ST8x2_UB(res0, dst, dst_stride);
5328 dst += (2 * dst_stride);
5330 LD_UB2(dst, dst_stride, dst0, dst1);
5331 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5332 horiz3, horiz4, horiz5, horiz6,
5333 horiz3, horiz2, horiz1, horiz0,
5334 horiz4, horiz5, horiz6, horiz7,
5335 const20, const6, const3);
5336 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5337 res1 = __msa_aver_u_b(avg1, res1);
5338 ST8x2_UB(res1, dst, dst_stride);
5339 dst += (2 * dst_stride);
5341 LD_UB2(dst, dst_stride, dst0, dst1);
5342 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5343 horiz5, horiz6, horiz7, horiz8,
5344 horiz5, horiz4, horiz3, horiz2,
5345 horiz6, horiz7, horiz8, horiz8,
5346 const20, const6, const3);
5347 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5348 res0 = __msa_aver_u_b(avg0, res0);
5349 ST8x2_UB(res0, dst, dst_stride);
5350 dst += (2 * dst_stride);
5352 LD_UB2(dst, dst_stride, dst0, dst1);
5353 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5354 horiz7, horiz8, horiz8, horiz7,
5355 horiz7, horiz6, horiz5, horiz4,
5356 horiz8, horiz8, horiz7, horiz6,
5357 const20, const6, const3);
5358 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5359 res1 = __msa_aver_u_b(avg1, res1);
    ST8x2_UB(res1, dst, dst_stride);
}
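
/* Same two-pass 16x16 scheme, but the horizontal pass additionally averages
 * its result with the neighbouring source pixel at x + 1 (the "h_src1"
 * rounding case) before the vertical pass and the destination average. */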
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    uint8_t buff[272];  /* horizontally filtered rows, assumed 16 x 17 bytes */

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
}
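
/* Register-resident 8x8 version of the h_src1 case: each pair of
 * horizontally filtered rows is averaged with the same source rows advanced
 * by one pixel (SLDI_B2_UB by 1) before being fed to the vertical filter,
 * whose output is then averaged with dst. */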
static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
5379 v16u8 inp0, inp1, inp2, inp3;
5380 v16u8 res0, res1, avg0, avg1;
5381 v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
5384 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5385 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5386 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5387 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5388 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5389 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5390 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5392 LD_UB2(src, src_stride, inp0, inp1);
5393 src += (2 * src_stride);
5394 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5395 const20, const6, const3);
5396 LD_UB2(src, src_stride, inp2, inp3);
5397 src += (2 * src_stride);
5398 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5400 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5401 horiz0 = __msa_aver_u_b(inp0, res0);
5402 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5403 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5404 const20, const6, const3);
5405 LD_UB2(src, src_stride, inp0, inp1);
5406 src += (2 * src_stride);
5407 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5409 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5410 horiz2 = __msa_aver_u_b(inp2, res1);
5411 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5412 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5413 const20, const6, const3);
5415 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5417 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5418 horiz4 = __msa_aver_u_b(inp0, res0);
5419 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5420 LD_UB2(dst, dst_stride, dst0, dst1);
5421 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5422 horiz1, horiz2, horiz3, horiz4,
5423 horiz1, horiz0, horiz0, horiz1,
5424 horiz2, horiz3, horiz4, horiz5,
5425 const20, const6, const3);
5426 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427 res0 = __msa_aver_u_b(avg0, res0);
5428 ST8x2_UB(res0, dst, dst_stride);
5429 dst += (2 * dst_stride);
5431 LD_UB2(src, src_stride, inp2, inp3);
5432 src += (2 * src_stride);
5433 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5434 const20, const6, const3);
5436 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5438 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5439 horiz6 = __msa_aver_u_b(inp2, res1);
5440 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5441 LD_UB2(dst, dst_stride, dst0, dst1);
5442 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5443 horiz3, horiz4, horiz5, horiz6,
5444 horiz3, horiz2, horiz1, horiz0,
5445 horiz4, horiz5, horiz6, horiz7,
5446 const20, const6, const3);
5447 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5448 res1 = __msa_aver_u_b(avg1, res1);
5449 ST8x2_UB(res1, dst, dst_stride);
5450 dst += (2 * dst_stride);
    inp0 = LD_UB(src);  /* assumed: load the ninth source row for the final row */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5454 const20, const6, const3);
5455 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5456 horiz8 = __msa_aver_u_b(inp0, res0);
5457 LD_UB2(dst, dst_stride, dst0, dst1);
5458 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5459 horiz5, horiz6, horiz7, horiz8,
5460 horiz5, horiz4, horiz3, horiz2,
5461 horiz6, horiz7, horiz8, horiz8,
5462 const20, const6, const3);
5463 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5464 res0 = __msa_aver_u_b(avg0, res0);
5465 ST8x2_UB(res0, dst, dst_stride);
5466 dst += (2 * dst_stride);
5468 LD_UB2(dst, dst_stride, dst0, dst1);
5469 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5470 horiz7, horiz8, horiz8, horiz7,
5471 horiz7, horiz6, horiz5, horiz4,
5472 horiz8, horiz8, horiz7, horiz6,
5473 const20, const6, const3);
5474 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5475 res1 = __msa_aver_u_b(avg1, res1);
    ST8x2_UB(res1, dst, dst_stride);
}
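
/* "hv_src01" case: the horizontal pass averages its result with the
 * unshifted source, and the vertical pass additionally averages each output
 * row with the following intermediate row before the destination average. */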
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    uint8_t buff[272];  /* horizontally filtered rows, assumed 16 x 17 bytes */

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
}
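
/* Register-resident 8x8 version of the hv_src01 case above. */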
static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
5495 v16u8 inp0, inp1, inp2, inp3;
5496 v16u8 res0, res1, avg0, avg1;
5497 v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
5500 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5501 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5502 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5503 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5504 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5505 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5506 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5508 LD_UB2(src, src_stride, inp0, inp1);
5509 src += (2 * src_stride);
5511 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5512 const20, const6, const3);
5513 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5514 horiz0 = __msa_aver_u_b(inp0, res0);
5515 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5516 LD_UB2(src, src_stride, inp2, inp3);
5517 src += (2 * src_stride);
5518 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5519 const20, const6, const3);
5520 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5521 horiz2 = __msa_aver_u_b(inp2, res1);
5522 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5523 LD_UB2(dst, dst_stride, dst0, dst1);
5524 LD_UB2(src, src_stride, inp0, inp1);
5525 src += (2 * src_stride);
5526 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5527 const20, const6, const3);
5528 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5529 horiz4 = __msa_aver_u_b(inp0, res0);
5530 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5531 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5532 horiz1, horiz2, horiz3, horiz4,
5533 horiz1, horiz0, horiz0, horiz1,
5534 horiz2, horiz3, horiz4, horiz5,
5535 const20, const6, const3);
5536 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5537 res0 = __msa_aver_u_b(avg0, res0);
5538 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5539 res0 = __msa_aver_u_b(avg0, res0);
5540 ST8x2_UB(res0, dst, dst_stride);
5541 dst += (2 * dst_stride);
5543 LD_UB2(dst, dst_stride, dst0, dst1);
5544 LD_UB2(src, src_stride, inp2, inp3);
5545 src += (2 * src_stride);
5546 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5547 const20, const6, const3);
5548 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5549 horiz6 = __msa_aver_u_b(inp2, res1);
5550 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5551 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5552 horiz3, horiz4, horiz5, horiz6,
5553 horiz3, horiz2, horiz1, horiz0,
5554 horiz4, horiz5, horiz6, horiz7,
5555 const20, const6, const3);
5556 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5557 res1 = __msa_aver_u_b(avg1, res1);
5558 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5559 res1 = __msa_aver_u_b(avg1, res1);
5560 ST8x2_UB(res1, dst, dst_stride);
5561 dst += (2 * dst_stride);
    inp0 = LD_UB(src);  /* assumed: load the ninth source row for the final row */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5565 const20, const6, const3);
5566 horiz8 = __msa_aver_u_b(inp0, res0);
5567 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5568 horiz5, horiz6, horiz7, horiz8,
5569 horiz5, horiz4, horiz3, horiz2,
5570 horiz6, horiz7, horiz8, horiz8,
5571 const20, const6, const3);
5572 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5573 horiz7, horiz8, horiz8, horiz7,
5574 horiz7, horiz6, horiz5, horiz4,
5575 horiz8, horiz8, horiz7, horiz6,
5576 const20, const6, const3);
5577 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5578 res0 = __msa_aver_u_b(avg0, res0);
5579 LD_UB2(dst, dst_stride, dst0, dst1);
5580 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5581 res0 = __msa_aver_u_b(avg0, res0);
5582 ST8x2_UB(res0, dst, dst_stride);
5583 dst += (2 * dst_stride);
5585 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5586 res1 = __msa_aver_u_b(avg1, res1);
5587 LD_UB2(dst, dst_stride, dst0, dst1);
5588 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5589 res1 = __msa_aver_u_b(avg1, res1);
    ST8x2_UB(res1, dst, dst_stride);
}
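
/* "v_src1" case: plain horizontal lowpass, with the vertical output averaged
 * against the next intermediate row before the destination average. */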
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    uint8_t buff[272];  /* horizontally filtered rows, assumed 16 x 17 bytes */

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
}
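
/* Register-resident 8x8 version of the v_src1 case above. */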
static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
5609 v16u8 inp0, inp1, inp2, inp3;
5610 v16u8 res0, res1, avg0, avg1;
5611 v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
5614 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5615 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5616 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5617 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5618 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5619 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5620 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5622 LD_UB2(src, src_stride, inp0, inp1);
5623 src += (2 * src_stride);
5624 horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5625 mask0, mask1, mask2, mask3,
5626 const20, const6, const3);
5627 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5628 LD_UB2(src, src_stride, inp2, inp3);
5629 src += (2 * src_stride);
5630 horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5631 mask0, mask1, mask2, mask3,
5632 const20, const6, const3);
5633 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5634 LD_UB2(dst, dst_stride, dst0, dst1);
5635 LD_UB2(src, src_stride, inp0, inp1);
5636 src += (2 * src_stride);
5637 horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5638 mask0, mask1, mask2, mask3,
5639 const20, const6, const3);
5640 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5641 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5642 horiz1, horiz2, horiz3, horiz4,
5643 horiz1, horiz0, horiz0, horiz1,
5644 horiz2, horiz3, horiz4, horiz5,
5645 const20, const6, const3);
5646 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5647 res0 = __msa_aver_u_b(avg0, res0);
5648 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5649 res0 = __msa_aver_u_b(avg0, res0);
5650 ST8x2_UB(res0, dst, dst_stride);
5651 dst += (2 * dst_stride);
5653 LD_UB2(dst, dst_stride, dst0, dst1);
5654 LD_UB2(src, src_stride, inp2, inp3);
5655 src += (2 * src_stride);
5656 horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5657 mask0, mask1, mask2, mask3,
5658 const20, const6, const3);
5659 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5660 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5661 horiz3, horiz4, horiz5, horiz6,
5662 horiz3, horiz2, horiz1, horiz0,
5663 horiz4, horiz5, horiz6, horiz7,
5664 const20, const6, const3);
5665 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5666 res1 = __msa_aver_u_b(avg1, res1);
5667 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5668 res1 = __msa_aver_u_b(avg1, res1);
5669 ST8x2_UB(res1, dst, dst_stride);
5670 dst += (2 * dst_stride);
    inp0 = LD_UB(src);  /* assumed: load the ninth source row for the final row */
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5674 mask0, mask1, mask2, mask3,
5675 const20, const6, const3);
5676 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5677 horiz6, horiz7, horiz8, horiz5, horiz4,
5678 horiz3, horiz2, horiz6, horiz7, horiz8,
5679 horiz8, const20, const6, const3);
5680 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5681 horiz8, horiz8, horiz7, horiz7, horiz6,
5682 horiz5, horiz4, horiz8, horiz8, horiz7,
5683 horiz6, const20, const6, const3);
5684 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5685 res0 = __msa_aver_u_b(avg0, res0);
5686 LD_UB2(dst, dst_stride, dst0, dst1);
5687 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5688 res0 = __msa_aver_u_b(avg0, res0);
5689 ST8x2_UB(res0, dst, dst_stride);
5690 dst += (2 * dst_stride);
5692 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5693 res1 = __msa_aver_u_b(avg1, res1);
5694 LD_UB2(dst, dst_stride, dst0, dst1);
5695 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5696 res1 = __msa_aver_u_b(avg1, res1);
    ST8x2_UB(res1, dst, dst_stride);
}
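
/* "hv_src11" case: horizontal averaging with the source pixel at x + 1
 * combined with vertical averaging against the next intermediate row, plus
 * the destination average. */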
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    uint8_t buff[272];  /* horizontally filtered rows, assumed 16 x 17 bytes */

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
}
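
/* Register-resident 8x8 version of the hv_src11 case above. */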
static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
5716 v16u8 inp0, inp1, inp2, inp3;
5717 v16u8 res0, res1, avg0, avg1;
5718 v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
5721 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5722 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5723 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5724 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5725 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5726 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5727 v16u8 const3 = (v16u8) __msa_ldi_b(3);
5729 LD_UB2(src, src_stride, inp0, inp1);
5730 src += (2 * src_stride);
5731 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5732 const20, const6, const3);
5733 LD_UB2(src, src_stride, inp2, inp3);
5734 src += (2 * src_stride);
5735 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5737 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5738 horiz0 = __msa_aver_u_b(inp0, res0);
5739 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5740 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5741 const20, const6, const3);
5742 LD_UB2(src, src_stride, inp0, inp1);
5743 src += (2 * src_stride);
5744 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5746 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5747 horiz2 = __msa_aver_u_b(inp2, res1);
5748 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5749 res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5750 const20, const6, const3);
5751 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5753 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5754 horiz4 = __msa_aver_u_b(inp0, res0);
5755 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5756 LD_UB2(dst, dst_stride, dst0, dst1);
5757 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5758 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
5759 horiz2, horiz3, horiz4, horiz1, horiz0,
5760 horiz0, horiz1, horiz2, horiz3, horiz4,
5761 horiz5, const20, const6, const3);
5762 res0 = __msa_aver_u_b(avg0, res0);
5763 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5764 res0 = __msa_aver_u_b(avg0, res0);
5765 ST8x2_UB(res0, dst, dst_stride);
5766 dst += (2 * dst_stride);
5768 LD_UB2(src, src_stride, inp2, inp3);
5769 src += (2 * src_stride);
5770 res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5771 const20, const6, const3);
5772 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5774 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5775 horiz6 = __msa_aver_u_b(inp2, res1);
5776 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5777 LD_UB2(dst, dst_stride, dst0, dst1);
5778 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5779 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
5780 horiz4, horiz5, horiz6, horiz3, horiz2,
5781 horiz1, horiz0, horiz4, horiz5, horiz6,
5782 horiz7, const20, const6, const3);
5783 res1 = __msa_aver_u_b(avg1, res1);
5784 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5785 res1 = __msa_aver_u_b(avg1, res1);
5786 ST8x2_UB(res1, dst, dst_stride);
5787 dst += (2 * dst_stride);
    inp0 = LD_UB(src);  /* assumed: load the ninth source row for the final row */
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5791 const20, const6, const3);
5792 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5793 horiz8 = __msa_aver_u_b(inp0, res0);
5794 LD_UB2(dst, dst_stride, dst0, dst1);
5795 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5796 res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5797 horiz6, horiz7, horiz8, horiz5, horiz4,
5798 horiz3, horiz2, horiz6, horiz7, horiz8,
5799 horiz8, const20, const6, const3);
5800 res0 = __msa_aver_u_b(avg0, res0);
5801 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5802 res0 = __msa_aver_u_b(avg0, res0);
5803 ST8x2_UB(res0, dst, dst_stride);
5804 dst += (2 * dst_stride);
5806 LD_UB2(dst, dst_stride, dst0, dst1);
5807 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5808 res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5809 horiz8, horiz8, horiz7, horiz7, horiz6,
5810 horiz5, horiz4, horiz8, horiz8, horiz7,
5811 horiz6, const20, const6, const3);
5812 res1 = __msa_aver_u_b(avg1, res1);
5813 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5814 res1 = __msa_aver_u_b(avg1, res1);
    ST8x2_UB(res1, dst, dst_stride);
}
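
/* Plain block copy helpers (no filtering): the 8x8 copy goes through 64-bit
 * scalar loads/stores, the 16x16 copy through full-vector loads/stores. */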
5818 static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
5819 uint8_t *dst, int32_t dst_stride)
5821 uint64_t src0, src1;
5824 for (loop_cnt = 4; loop_cnt--;) {
5837 static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
5838 uint8_t *dst, int32_t dst_stride)
5840 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5841 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
5843 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5844 src += (8 * src_stride);
5845 LD_UB8(src, src_stride,
5846 src8, src9, src10, src11, src12, src13, src14, src15);
5848 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5849 dst += (8 * dst_stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
           dst, dst_stride);
}
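
/* Destination-averaging copies (no filtering): every source row is averaged
 * with the corresponding dst row and written back, processing four (8-wide)
 * or eight (16-wide) rows per iteration. */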
5854 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;  /* loop counter, type assumed */
5859 uint64_t out0, out1, out2, out3;
5860 v16u8 src0, src1, src2, src3;
5861 v16u8 dst0, dst1, dst2, dst3;
5863 for (cnt = (height / 4); cnt--;) {
5864 LD_UB4(src, src_stride, src0, src1, src2, src3);
5865 src += (4 * src_stride);
5866 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
5868 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5869 dst0, dst1, dst2, dst3);
5871 out0 = __msa_copy_u_d((v2i64) dst0, 0);
5872 out1 = __msa_copy_u_d((v2i64) dst1, 0);
5873 out2 = __msa_copy_u_d((v2i64) dst2, 0);
5874 out3 = __msa_copy_u_d((v2i64) dst3, 0);
5875 SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
5880 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;  /* loop counter, type assumed */
5885 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5886 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5888 for (cnt = (height / 8); cnt--;) {
5889 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5890 src += (8 * src_stride);
5891 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5893 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5894 dst0, dst1, dst2, dst3);
5895 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5896 dst4, dst5, dst6, dst7);
5897 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
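
/* The remaining functions are the thin entry points exposed to the rest of
 * libavcodec through qpeldsp_mips.h: they only translate the
 * (dest, src, stride) argument order used by the qpel function tables into
 * the (src, src_stride, dst, dst_stride[, width]) order of the static
 * helpers above. */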
5902 void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5904 copy_16x16_msa(src, stride, dest, stride);
5907 void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5909 copy_8x8_msa(src, stride, dest, stride);
5912 void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
5916 horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
5919 void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
5923 horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
5926 void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
5929 horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
5932 void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
5933 const uint8_t *src, ptrdiff_t stride)
5935 horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
5938 void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
5942 horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
5945 void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
5949 horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
5952 void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
5956 horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
5959 void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
5963 horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
5966 void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
5967 const uint8_t *src, ptrdiff_t stride)
5969 horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
5972 void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
5973 const uint8_t *src, ptrdiff_t stride)
5975 horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
5978 void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
5982 horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
5985 void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
5989 horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
5992 void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5994 avg_width8_msa(src, stride, dest, stride, 8);
5997 void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5999 avg_width16_msa(src, stride, dest, stride, 16);
6002 void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
6006 horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
6009 void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
6013 horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
6016 void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
6017 const uint8_t *src, ptrdiff_t stride)
6019 horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
6022 void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
6023 const uint8_t *src, ptrdiff_t stride)
6025 horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
6028 void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
6032 horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
6035 void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
6039 horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
6043 void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
6044 const uint8_t *src, ptrdiff_t stride)
6046 vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
6049 void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
6050 const uint8_t *src, ptrdiff_t stride)
6052 vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
6055 void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
6058 vert_mc_qpel_8x8_msa(src, stride, dest, stride);
6061 void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
6064 vert_mc_qpel_16x16_msa(src, stride, dest, stride);
6067 void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
6068 const uint8_t *src, ptrdiff_t stride)
6070 vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
6073 void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
6074 const uint8_t *src, ptrdiff_t stride)
6076 vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
6079 void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
6083 vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
6086 void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
6090 vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
6093 void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
6094 const uint8_t *src, ptrdiff_t stride)
6096 vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
6099 void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
6100 const uint8_t *src, ptrdiff_t stride)
6102 vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
6105 void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
6109 vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
6112 void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
6116 vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
6119 void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
6123 vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
6126 void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
6130 vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
6133 void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
6134 const uint8_t *src, ptrdiff_t stride)
6136 vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
6139 void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
6140 const uint8_t *src, ptrdiff_t stride)
6142 vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
6145 void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
6149 vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
6152 void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
6156 vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
6160 void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
6164 hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
6167 void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
6168 const uint8_t *src, ptrdiff_t stride)
6170 hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
6173 void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
6174 const uint8_t *src, ptrdiff_t stride)
6176 hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
6179 void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
6180 const uint8_t *src, ptrdiff_t stride)
6182 hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
6185 void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
6189 hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
6192 void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
6193 const uint8_t *src, ptrdiff_t stride)
6195 hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
6198 void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
6199 const uint8_t *src, ptrdiff_t stride)
6201 hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
6204 void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
6205 const uint8_t *src, ptrdiff_t stride)
6207 hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
6210 void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
6213 hv_mc_qpel_16x16_msa(src, stride, dest, stride);
6216 void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
6219 hv_mc_qpel_8x8_msa(src, stride, dest, stride);
6222 void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
6223 const uint8_t *src, ptrdiff_t stride)
6225 hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
6228 void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
6229 const uint8_t *src, ptrdiff_t stride)
6231 hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
6234 void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
6238 hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
6241 void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
6242 const uint8_t *src, ptrdiff_t stride)
6244 hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
6247 void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
6248 const uint8_t *src, ptrdiff_t stride)
6250 hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
6253 void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
6254 const uint8_t *src, ptrdiff_t stride)
6256 hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
6259 void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
6263 hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
6266 void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
6267 const uint8_t *src, ptrdiff_t stride)
6269 hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
6272 void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
6276 hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
6279 void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
6283 hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
6286 void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
6290 hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
6293 void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
6297 hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
6300 void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
6304 hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
6307 void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
6311 hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
6314 void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
6318 hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
6321 void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
6325 hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
6328 void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
6329 const uint8_t *src, ptrdiff_t stride)
6331 hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
6334 void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
6335 const uint8_t *src, ptrdiff_t stride)
6337 hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
6340 void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
6344 hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
6347 void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
6351 hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
6354 void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
6358 hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
6361 void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
6365 hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
6368 void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
6372 hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
6375 void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
6379 hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
6382 void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
6386 hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
6389 void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
6393 hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
6396 void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
6400 hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
6403 void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
6407 hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
6410 void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
6414 hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
6417 void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
6421 hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
6424 void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
6428 hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
6431 void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
6435 hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
6438 void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
6442 hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
6445 void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
6449 hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
6452 void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
6453 const uint8_t *src, ptrdiff_t stride)
6455 hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
6458 void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
6459 const uint8_t *src, ptrdiff_t stride)
6461 hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
6464 void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
6468 hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
6471 void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
6475 hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
6478 void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
6482 hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
6485 void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
6489 hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
6492 void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
6496 hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
6499 void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
6503 hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
6506 void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
6510 hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
6513 void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
6517 hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);