/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
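/* MSA (MIPS SIMD Architecture) versions of the H.264 weighted-prediction
 * and deblocking routines. As a reading aid, a scalar sketch of what the
 * avc_wgt_*_msa functions below vectorise, assuming 8-bit samples and
 * log2_denom >= 1 ("clip" clamps to [0, 255]; for log2_denom == 0 the
 * rounding term is skipped, as in the if (log2_denom) guard below):
 *
 *     out = clip(((in * src_weight + (1 << (log2_denom - 1)))
 *                 >> log2_denom) + offset);
 *
 * The code folds the offset into the rounding constant up front
 * (offset_in <<= log2_denom; offset_in += 1 << (log2_denom - 1)), so the
 * per-pixel work is one multiply, one saturating add and one shift. */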
static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
{
    uint32_t data0, data1;
    v16u8 zero = { 0 };
    v16u8 src0, src1;
    v4i32 res0, res1;
    v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
    v8u16 out0, out1;

    offset_in <<= (log2_denom);

    if (log2_denom)
        offset_in += (1 << (log2_denom - 1));

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom);

    data0 = LW(data);
    data1 = LW(data + stride);

    src0 = (v16u8) __msa_fill_w(data0);
    src1 = (v16u8) __msa_fill_w(data1);

    ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
    MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
    ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
    MAXI_SH2_SH(temp0, temp1, 0);

    out0 = (v8u16) __msa_srl_h(temp0, denom);
    out1 = (v8u16) __msa_srl_h(temp1, denom);

    SAT_UH2_UH(out0, out1, 7);
    PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);

    data0 = __msa_copy_u_w(res0, 0);
    data1 = __msa_copy_u_w(res1, 0);
    SW(data0, data);
    SW(data1, data + stride);
}
static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
                                    int32_t height, int32_t log2_denom,
                                    int32_t src_weight, int32_t offset_in)
{
    uint8_t cnt;
    uint32_t data0, data1, data2, data3;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 temp0, temp1, temp2, temp3, wgt;
    v8i16 denom, offset;

    offset_in <<= (log2_denom);

    if (log2_denom)
        offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LW4(data, stride, data0, data1, data2, data3);

        src0 = (v16u8) __msa_fill_w(data0);
        src1 = (v16u8) __msa_fill_w(data1);
        src2 = (v16u8) __msa_fill_w(data2);
        src3 = (v16u8) __msa_fill_w(data3);

        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   temp0, temp1, temp2, temp3);
        MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
             temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);
        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride);
        data += (4 * stride);
    }
}
static void avc_wgt_4width_msa(uint8_t *data, int32_t stride,
                               int32_t height, int32_t log2_denom,
                               int32_t src_weight, int32_t offset_in)
{
    if (2 == height) {
        avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in);
    } else {
        avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight,
                                offset_in);
    }
}
static void avc_wgt_8width_msa(uint8_t *data, int32_t stride,
                               int32_t height, int32_t log2_denom,
                               int32_t src_weight, int32_t offset_in)
{
    uint8_t cnt;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 wgt, denom, offset;
    v16i8 out0, out1;

    offset_in <<= (log2_denom);

    if (log2_denom)
        offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = (v8u16) __msa_fill_h(offset_in);
    denom = (v8u16) __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LD_UB4(data, stride, src0, src1, src2, src3);
        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_r, src1_r, src2_r, src3_r);
        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
             temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);
        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
        ST8x4_UB(out0, out1, data, stride);
        data += (4 * stride);
    }
}
static void avc_wgt_16width_msa(uint8_t *data, int32_t stride,
                                int32_t height, int32_t log2_denom,
                                int32_t src_weight, int32_t offset_in)
{
    uint8_t cnt;
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8u16 wgt, denom, offset;

    offset_in <<= (log2_denom);

    if (log2_denom)
        offset_in += (1 << (log2_denom - 1));

    wgt = (v8u16) __msa_fill_h(src_weight);
    offset = (v8u16) __msa_fill_h(offset_in);
    denom = (v8u16) __msa_fill_h(log2_denom);

    for (cnt = height / 4; cnt--;) {
        LD_UB4(data, stride, src0, src1, src2, src3);
        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_r, src1_r, src2_r, src3_r);
        ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                   src0_l, src1_l, src2_l, src3_l);
        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
             temp0, temp1, temp2, temp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
             temp4, temp5, temp6, temp7);
        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                    temp0, temp1, temp2, temp3);
        ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
                    temp4, temp5, temp6, temp7);
        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
        MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0);
        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
        SRL_H4_UH(temp4, temp5, temp6, temp7, denom);
        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
        SAT_UH4_UH(temp4, temp5, temp6, temp7, 7);
        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, data, stride);
        data += (4 * stride);
    }
}
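/* The avc_biwgt_*_msa routines implement bidirectional weighted prediction.
 * Scalar sketch of what each pixel pair goes through, assuming 8-bit
 * samples ("clip" clamps to [0, 255]):
 *
 *     out = clip((src * src_weight + dst * dst_weight +
 *                 (((offset + 1) | 1) << log2_denom)) >> (log2_denom + 1));
 *
 * To use the signed dot-product instruction __msa_dpadd_s_h, both inputs
 * are first biased to the signed range with XORI_B*_128 (x - 128); the
 * term val = 128 * (src_weight + dst_weight) that is added into the offset
 * vector undoes that bias after the multiply-accumulate. */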
static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
                              uint8_t *dst, int32_t dst_stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
{
    uint32_t load0, load1, out0, out1;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, dst0, dst1;
    v8i16 temp0, temp1, denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    load0 = LW(src);
    load1 = LW(src + src_stride);

    src0 = (v16i8) __msa_fill_w(load0);
    src1 = (v16i8) __msa_fill_w(load1);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);

    dst0 = (v16i8) __msa_fill_w(load0);
    dst1 = (v16i8) __msa_fill_w(load1);

    XORI_B4_128_SB(src0, src1, dst0, dst1);
    ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);

    temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
    temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);

    temp0 >>= denom;
    temp1 >>= denom;

    CLIP_SH2_0_255(temp0, temp1);
    PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_w((v4i32) dst1, 0);
    SW(out0, dst);
    SW(out1, dst + dst_stride);
}
static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int32_t height, int32_t log2_denom,
                                      int32_t src_weight, int32_t dst_weight,
                                      int32_t offset_in)
{
    uint8_t cnt;
    uint32_t load0, load1, load2, load3;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LW4(src, src_stride, load0, load1, load2, load3);
        src += (4 * src_stride);

        src0 = (v16i8) __msa_fill_w(load0);
        src1 = (v16i8) __msa_fill_w(load1);
        src2 = (v16i8) __msa_fill_w(load2);
        src3 = (v16i8) __msa_fill_w(load3);

        LW4(dst, dst_stride, load0, load1, load2, load3);

        dst0 = (v16i8) __msa_fill_w(load0);
        dst1 = (v16i8) __msa_fill_w(load1);
        dst2 = (v16i8) __msa_fill_w(load2);
        dst3 = (v16i8) __msa_fill_w(load3);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height, int32_t log2_denom,
                                 int32_t src_weight, int32_t dst_weight,
                                 int32_t offset_in)
{
    if (2 == height) {
        avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
                          src_weight, dst_weight, offset_in);
    } else {
        avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
                                  log2_denom, src_weight, dst_weight,
                                  offset_in);
    }
}
static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height, int32_t log2_denom,
                                 int32_t src_weight, int32_t dst_weight,
                                 int32_t offset_in)
{
    uint8_t cnt;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += 4 * dst_stride;
    }
}
static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t log2_denom,
                                  int32_t src_weight, int32_t dst_weight,
                                  int32_t offset_in)
{
    uint8_t cnt;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec0, vec2, vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec1, vec3, vec5, vec7);

        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        SRA_4V(temp4, temp5, temp6, temp7, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
        PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += 4 * dst_stride;
    }
}
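/* Strong (bS = 4) luma filtering, per the spec equations (p side shown;
 * the q side reuses the same macro with the p/q arguments swapped at the
 * call sites):
 *
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 *
 * "threshold" below is the common subexpression p1 + p0 + q0. */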
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    v8i16 threshold;                                                        \
    v8i16 const3 = __msa_ldi_h(3);                                          \
                                                                            \
    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
    threshold += (p1_or_q1_org_in);                                         \
                                                                            \
    (p0_or_q0_out) = threshold << 1;                                        \
    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
                                                                            \
    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
                                                                            \
    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += threshold;                                            \
    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
}
/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
                         p1_or_q1_org_in, p0_or_q0_out)      \
{                                                            \
    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
}
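/* Normal-filter p1/q1 update (bS < 4), per the spec:
 *
 *     p1' = p1 + clip3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
 *
 * __msa_aver_u_h provides the rounded average (p0 + q0 + 1) >> 1 and
 * __msa_ave_s_h performs the final >> 1 averaging step. */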
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                         p1_or_q1_org_in, p2_or_q2_org_in,   \
                         negate_tc_in, tc_in, p1_or_q1_out)  \
{                                                            \
    v8i16 clip3, temp;                                       \
                                                             \
    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,  \
                                   (v8u16) q0_or_p0_org_in); \
    temp = p1_or_q1_org_in << 1;                             \
    clip3 = clip3 - temp;                                    \
    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);           \
    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);             \
    p1_or_q1_out = p1_or_q1_org_in + clip3;                  \
}
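/* Normal-filter p0/q0 update (bS < 4). Scalar sketch ("clip" clamps to
 * [0, 255]):
 *
 *     delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
 *     p0'   = clip(p0 + delta);
 *     q0'   = clip(q0 - delta);
 */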
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
                     p1_or_q1_org_in, q1_or_p1_org_in,          \
                     negate_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)                \
{                                                               \
    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
                                                                \
    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
    q0_sub_p0 <<= 2;                                            \
    p1_sub_q1 += 4;                                             \
    delta = q0_sub_p0 + p1_sub_q1;                              \
    delta >>= 3;                                                \
                                                                \
    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
                                                                \
    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
                                                                \
    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
}
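/* Horizontal (vertical-edge) chroma filtering for 4:2:2 content. Four
 * (respectively two, in the 2BYTE variant further below) rows of the four
 * columns p1 p0 q0 q1 are gathered with word loads, transposed, run
 * through the normal p0/q0 update, and interleaved back for a columnwise
 * store by the caller. */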
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
{                                                                        \
    uint32_t load0, load1, load2, load3;                                 \
    v16u8 src0 = { 0 };                                                  \
    v16u8 src1 = { 0 };                                                  \
    v16u8 src2 = { 0 };                                                  \
    v16u8 src3 = { 0 };                                                  \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
    v8i16 res0_r, res1_r;                                                \
    v16i8 zeros = { 0 };                                                 \
    v16u8 res0, res1;                                                    \
                                                                         \
    LW4((src - 2), stride, load0, load1, load2, load3);                  \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
                                                                         \
    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
                                                                         \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
                                                                         \
    tc = __msa_fill_h(tc_val);                                           \
                                                                         \
    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
    is_less_than_beta = (p1_asub_p0 < beta);                             \
    is_less_than = is_less_than_alpha & is_less_than_beta;               \
    is_less_than_beta = (q1_asub_q0 < beta);                             \
    is_less_than = is_less_than_beta & is_less_than;                     \
                                                                         \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
                                                                         \
    q0_sub_p0 <<= 2;                                                     \
    delta = q0_sub_p0 + p1_sub_q1;                                       \
    delta = __msa_srari_h(delta, 3);                                     \
                                                                         \
    delta = CLIP_SH(delta, -tc, tc);                                     \
                                                                         \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
                                                                         \
    res0_r += delta;                                                     \
    res1_r -= delta;                                                     \
                                                                         \
    CLIP_SH2_0_255(res0_r, res1_r);                                      \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
                                                                         \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
                                                                         \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
}
#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
{                                                            \
    v16i8 zero_m = { 0 };                                    \
                                                             \
    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                 \
}
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
{                                                                          \
    uint32_t load0, load1;                                                 \
    v16u8 src0 = { 0 };                                                    \
    v16u8 src1 = { 0 };                                                    \
    v16u8 src2 = { 0 };                                                    \
    v16u8 src3 = { 0 };                                                    \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
    v16i8 zeros = { 0 };                                                   \
    v16u8 res0, res1;                                                      \
                                                                           \
    load0 = LW(src - 2);                                                   \
    load1 = LW(src - 2 + stride);                                          \
                                                                           \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
                                                                           \
    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
                                                                           \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
                                                                           \
    tc = __msa_fill_h(tc_val);                                             \
                                                                           \
    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
    is_less_than_beta = (p1_asub_p0 < beta);                               \
    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
    is_less_than_beta = (q1_asub_q0 < beta);                               \
    is_less_than = is_less_than_beta & is_less_than;                       \
                                                                           \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
                                                                           \
    q0_sub_p0 <<= 2;                                                       \
    delta = q0_sub_p0 + p1_sub_q1;                                         \
    delta = __msa_srari_h(delta, 3);                                       \
    delta = CLIP_SH(delta, -tc, tc);                                       \
                                                                           \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
                                                                           \
    res0_r += delta;                                                       \
    res1_r -= delta;                                                       \
                                                                           \
    CLIP_SH2_0_255(res0_r, res1_r);                                        \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
                                                                           \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
                                                                           \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
}
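/* Intra (bS = 4) luma deblocking of a horizontal edge. The filter runs
 * where |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta; the
 * strong 3-tap variants above are additionally gated by the flag
 * |p0 - q0| < (alpha >> 2) + 2 and by |p2 - p0| (resp. |q2 - q0|) < beta,
 * all evaluated per pixel with vector compares and blended with
 * __msa_bmnz_v instead of branches. */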
static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p1_org, p0_org, q0_org, q1_org;

    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha_in);
    is_less_than_beta = (p1_asub_p0 < beta_in);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta_in);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
        v8i16 p0_r = { 0 };
        v8i16 q0_r = { 0 };
        v8i16 p0_l = { 0 };
        v8i16 q0_l = { 0 };
        v16i8 zero = { 0 };
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
        v16u8 q2_org = LD_UB(data + (2 * img_width));
        v16u8 p2_org = LD_UB(data - (3 * img_width));
        v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);

        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta_in);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

        /* combine and store */
        if (!__msa_test_bz_v(is_less_than_beta)) {
            v8i16 p3_org_l, p3_org_r;
            v16u8 p3_org = LD_UB(data - (img_width << 2));
            v16u8 p2, p1;
            v8i16 p2_r = { 0 };
            v8i16 p2_l = { 0 };
            v8i16 p1_r = { 0 };
            v8i16 p1_l = { 0 };

            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);

            ST_UB(p1_org, data - (2 * img_width));
            ST_UB(p2_org, data - (3 * img_width));
        }

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);

        /* combine */
        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        ST_UB(p0_org, data - img_width);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta_in);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (!__msa_test_bz_v(is_less_than_beta)) {
            v8i16 q3_org_r, q3_org_l;
            v16u8 q3_org = LD_UB(data + (3 * img_width));
            v16u8 q2, q1;
            v8i16 q2_r = { 0 };
            v8i16 q2_l = { 0 };
            v8i16 q1_r = { 0 };
            v8i16 q1_l = { 0 };

            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);

            ST_UB(q1_org, data + img_width);
            ST_UB(q2_org, data + 2 * img_width);
        }

        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);

        /* combine */
        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);

        ST_UB(q0_org, data);
    }
}
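/* Vertical-edge variant: 16 rows of 8 pixels around the edge are loaded
 * and transposed so the same row-oriented filter can be reused, then the
 * six modified columns (p2..q2) are transposed back with interleaves and
 * written out as 4-byte + 2-byte stores per row. */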
static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    uint8_t *src = data - 4;
    v16u8 alpha, beta, p0_asub_q0;
    v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v16u8 p1_asub_p0, q1_asub_q0;

    {
        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
        LD_UB8(src + (8 * img_width), img_width,
               row8, row9, row10, row11, row12, row13, row14, row15);

        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
                            row4, row5, row6, row7,
                            row8, row9, row10, row11,
                            row12, row13, row14, row15,
                            p3_org, p2_org, p1_org, p0_org,
                            q0_org, q1_org, q2_org, q3_org);
    }

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        v8i16 p0_r = { 0 };
        v8i16 q0_r = { 0 };
        v8i16 p0_l = { 0 };
        v8i16 q0_l = { 0 };
        v16i8 zero = { 0 };
        v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
        v16u8 negate_is_less_than_beta;
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
        UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);

        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta);
        is_less_than_beta = tmp_flag & is_less_than_beta;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (!__msa_test_bz_v(is_less_than_beta)) {
            v16u8 p2, p1;
            v8i16 p3_org_r, p3_org_l;
            v8i16 p2_r = { 0 };
            v8i16 p2_l = { 0 };
            v8i16 p1_r = { 0 };
            v8i16 p1_l = { 0 };

            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);

        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta);

        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);

        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (!__msa_test_bz_v(is_less_than_beta)) {
            v16u8 q2, q1;
            v8i16 q3_org_r, q3_org_l;
            v8i16 q2_r = { 0 };
            v8i16 q2_l = { 0 };
            v8i16 q1_r = { 0 };
            v8i16 q1_l = { 0 };

            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);

        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);

        {
            v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

            ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
            ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
            ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);

            ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
            ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);

            src = data - 3;
            ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp2, 0, src + 4, img_width);
            src += 4 * img_width;
            ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp2, 4, src + 4, img_width);
            src += 4 * img_width;

            ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp5, 0, src + 4, img_width);
            src += 4 * img_width;
            ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp5, 4, src + 4, img_width);
        }
    }
}
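/* MBAFF intra variant: the edge is only 8 rows tall, so 8-byte loads and
 * an 8x8 byte transpose built from interleaves are enough; the filtering
 * itself mirrors the function above, with src0..src5 holding the p2..q2
 * columns after the transpose. */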
static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
                                                   int32_t alpha_in,
                                                   int32_t beta_in)
{
    uint64_t load0, load1;
    uint32_t out0, out2;
    uint16_t out1, out3;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
    v16u8 is_less_than_beta1, is_less_than_beta2;
    v16i8 src0 = { 0 };
    v16i8 src1 = { 0 };
    v16i8 src2 = { 0 };
    v16i8 src3 = { 0 };
    v16i8 src4 = { 0 };
    v16i8 src5 = { 0 };
    v16i8 src6 = { 0 };
    v16i8 src7 = { 0 };
    v16i8 zeros = { 0 };

    load0 = LD(src - 4);
    load1 = LD(src + stride - 4);
    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);

    load0 = LD(src + (2 * stride) - 4);
    load1 = LD(src + (3 * stride) - 4);
    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);

    load0 = LD(src + (4 * stride) - 4);
    load1 = LD(src + (5 * stride) - 4);
    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);

    load0 = LD(src + (6 * stride) - 4);
    load1 = LD(src + (7 * stride) - 4);
    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
               src0, src1, src2, src3);

    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);

    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);

    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than & is_less_than_beta;

    alpha >>= 2;
    alpha += 2;

    is_less_than_alpha = (p0_asub_q0 < alpha);

    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
    is_less_than_beta1 = (p2_asub_p0 < beta);
    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
    is_less_than_beta2 = (q2_asub_q0 < beta);

    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
               src0_r, src1_r, src2_r, src3_r);
    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
               src4_r, src5_r, src6_r, src7_r);

    dst2_x_r = src1_r + src2_r + src3_r;
    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
    dst1_r = src0_r + src1_r + src2_r + src3_r;
    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);

    dst0_r = (2 * src6_r) + (3 * src0_r);
    dst0_r += src1_r + src2_r + src3_r;
    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);

    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);

    dst3_x_r = src2_r + src3_r + src4_r;
    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
    dst4_r = src2_r + src3_r + src4_r + src5_r;
    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);

    dst5_r = (2 * src7_r) + (3 * src5_r);
    dst5_r += src4_r + src3_r + src2_r;
    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);

    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);

    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);

    is_less_than = is_less_than_alpha & is_less_than;
    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);

    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);

    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);

    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
    SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
    SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 2);
    out2 = __msa_copy_u_w((v4i32) dst1, 0);
    out3 = __msa_copy_u_h((v8i16) dst1, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst4, 0);
    out1 = __msa_copy_u_h((v8i16) dst4, 2);
    out2 = __msa_copy_u_w((v4i32) dst5, 0);
    out3 = __msa_copy_u_h((v8i16) dst5, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
}
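/* Chroma intra edges only update p0/q0 with the 2-tap formula
 * (2*p1 + p0 + q1 + 2) >> 2 (AVC_LPF_P0_OR_Q0); since chroma blocks are
 * 8 pixels wide, only the low half of the comparison mask is kept so the
 * 16-byte blend/store leaves the upper half untouched. */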
static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v16u8 alpha, beta, is_less_than;
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16i8 zero = { 0 };
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

        p0_or_q0_org =
            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
        q0_or_p0_org =
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);

        ST_UB(q0_or_p0_org, data_cb_or_cr);
        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
    }
}
static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v8i16 tmp1;
    v16u8 alpha, beta, is_less_than;
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16i8 zero = { 0 };
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

    {
        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

        LD_UB8((data_cb_or_cr - 2), img_width,
               row0, row1, row2, row3, row4, row5, row6, row7);

        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                           p1_or_q1_org, p0_or_q0_org,
                           q0_or_p0_org, q1_or_p1_org);
    }

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);

        /* convert 16 bit output into 8 bit output */
        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

        p0_or_q0_org =
            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
        q0_or_p0_org =
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);

        data_cb_or_cr -= 1;
        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
        data_cb_or_cr += 4 * img_width;
        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
    }
}
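/* Inter (bS = 1..3) luma filtering. Each of the four 4-pixel groups along
 * the edge carries its own strength bs and clipping limit tc; both are
 * replicated into byte vectors with __msa_fill_b/__msa_insve_w so the
 * whole edge is processed under one mask. Scalar sketch of the core
 * update (the conditional p1/q1 corrections each add 1 to the effective
 * tc, as done below with __msa_andi_b):
 *
 *     delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
 *     p0'   = clip(p0 + delta);
 *     q0'   = clip(q0 - delta);
 */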
static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
                                                   uint8_t bs0, uint8_t bs1,
                                                   uint8_t bs2, uint8_t bs3,
                                                   uint8_t tc0, uint8_t tc1,
                                                   uint8_t tc2, uint8_t tc3,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    v16u8 tmp_vec, bs = { 0 };

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        uint8_t *src = data - 4;
        v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
        v16u8 is_bs_greater_than0;
        v16u8 tc = { 0 };
        v16i8 zero = { 0 };

        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        is_bs_greater_than0 = (zero < bs);

        {
            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

            LD_UB8(src, img_width,
                   row0, row1, row2, row3, row4, row5, row6, row7);
            src += (8 * img_width);
            LD_UB8(src, img_width,
                   row8, row9, row10, row11, row12, row13, row14, row15);

            TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                                row8, row9, row10, row11,
                                row12, row13, row14, row15,
                                p3_org, p2_org, p1_org, p0_org,
                                q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 negate_tc, sign_negate_tc;
            v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
            v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
            v8i16 p0_r, q0_r, p0_l, q0_l;

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            UNPCK_UB_SH(tc, tc_r, tc_l);
            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 p1;
                v8i16 p1_r = { 0 };
                v8i16 p1_l = { 0 };
                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                 negate_tc_r, tc_r, p1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                 i16_negatetc_l, tc_l, p1_l);

                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;
            }

            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (q2_asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 q1;
                v8i16 q1_r = { 0 };
                v8i16 q1_l = { 0 };
                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                 negate_tc_r, tc_r, q1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                 i16_negatetc_l, tc_l, q1_l);

                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;
            }

            {
                v8i16 threshold_r, negate_thresh_r;
                v8i16 threshold_l, negate_thresh_l;
                v16i8 negate_thresh, sign_negate_thresh;

                negate_thresh = zero - (v16i8) tc;
                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                           threshold_r, negate_thresh_r);

                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                             negate_thresh_r, threshold_r, p0_r, q0_r);

                threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                                                       negate_thresh);

                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                             negate_thresh_l, threshold_l, p0_l, q0_l);
            }

            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            {
                v16i8 tp0, tp1, tp2, tp3;
                v8i16 tmp2, tmp5;
                v4i32 tmp3, tmp4, tmp6, tmp7;
                uint32_t out0, out2;
                uint16_t out1, out3;

                src = data - 3;

                ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
                ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
                ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);

                ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
                ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);

                out0 = __msa_copy_u_w(tmp3, 0);
                out1 = __msa_copy_u_h(tmp2, 0);
                out2 = __msa_copy_u_w(tmp3, 1);
                out3 = __msa_copy_u_h(tmp2, 1);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp3, 2);
                out1 = __msa_copy_u_h(tmp2, 2);
                out2 = __msa_copy_u_w(tmp3, 3);
                out3 = __msa_copy_u_h(tmp2, 3);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp4, 0);
                out1 = __msa_copy_u_h(tmp2, 4);
                out2 = __msa_copy_u_w(tmp4, 1);
                out3 = __msa_copy_u_h(tmp2, 5);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp4, 2);
                out1 = __msa_copy_u_h(tmp2, 6);
                out2 = __msa_copy_u_w(tmp4, 3);
                out3 = __msa_copy_u_h(tmp2, 7);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp6, 0);
                out1 = __msa_copy_u_h(tmp5, 0);
                out2 = __msa_copy_u_w(tmp6, 1);
                out3 = __msa_copy_u_h(tmp5, 1);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp6, 2);
                out1 = __msa_copy_u_h(tmp5, 2);
                out2 = __msa_copy_u_w(tmp6, 3);
                out3 = __msa_copy_u_h(tmp5, 3);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp7, 0);
                out1 = __msa_copy_u_h(tmp5, 4);
                out2 = __msa_copy_u_w(tmp7, 1);
                out3 = __msa_copy_u_h(tmp5, 5);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));

                out0 = __msa_copy_u_w(tmp7, 2);
                out1 = __msa_copy_u_h(tmp5, 6);
                out2 = __msa_copy_u_w(tmp7, 3);
                out3 = __msa_copy_u_h(tmp5, 7);

                src += img_width;

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
            }
        }
    }
}
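/* Horizontal-edge variant of the above: no transposes are needed; the
 * five source rows p2..q1 are loaded directly with LD_UB5 (q2 with an
 * extra LD_UB) and the filtered p1/p0/q0/q1 rows are stored back in
 * place. */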
static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
                                                   uint8_t bs0, uint8_t bs1,
                                                   uint8_t bs2, uint8_t bs3,
                                                   uint8_t tc0, uint8_t tc1,
                                                   uint8_t tc2, uint8_t tc3,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t image_width)
{
    v16u8 tmp_vec;
    v16u8 bs = { 0 };

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        v16u8 alpha, beta, is_less_than, is_less_than_beta;
        v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
        v16u8 is_less_than_alpha, is_bs_greater_than0;
        v8i16 p0_r, q0_r, p0_l, q0_l;
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
        v16i8 zero = { 0 };
        v16i8 tc = { 0 };

        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB5(data - (3 * image_width), image_width,
               p2_org, p1_org, p0_org, q0_org, q1_org);

        is_bs_greater_than0 = ((v16u8) zero < bs);
        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 sign_negate_tc, negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
            v16u8 p2_asub_p0, q2_asub_q0;

            q2_org = LD_UB(data + (2 * image_width));
            negate_tc = zero - tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            UNPCK_UB_SH(tc, tc_r, tc_l);
            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 p1;
                v8i16 p1_r = { 0 };
                v8i16 p1_l = { 0 };
                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                 negate_tc_r, tc_r, p1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                 i16_negatetc_l, tc_l, p1_l);

                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
                ST_UB(p1_org, data - (2 * image_width));

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;
            }

            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (q2_asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 q1;
                v8i16 q1_r = { 0 };
                v8i16 q1_l = { 0 };
                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                 negate_tc_r, tc_r, q1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                 i16_negatetc_l, tc_l, q1_l);

                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
                ST_UB(q1_org, data + image_width);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;
            }

            {
                v16i8 negate_thresh, sign_negate_thresh;
                v8i16 threshold_r, threshold_l;
                v8i16 negate_thresh_l, negate_thresh_r;

                negate_thresh = zero - tc;
                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                           threshold_r, negate_thresh_r);
                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                             negate_thresh_r, threshold_r, p0_r, q0_r);

                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                                                       negate_thresh);
                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                             negate_thresh_l, threshold_l, p0_l, q0_l);
            }

            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(p0_org, (data - image_width));
            ST_UB(q0_org, data);
        }
    }
}
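/* MBAFF inter variant. tc0[] holds one clipping value per pair of rows;
 * a negative entry disables filtering for that pair, which is why both
 * the loads and the final stores are guarded by "if (tc0[i] < 0)" blocks
 * that simply skip two rows. */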
1680 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
1681 int32_t alpha_in, int32_t beta_in,
1685 uint32_t out0, out1, out2, out3;
1697 v16i8 src0, src1, src2, src3;
1698 v8i16 src4, src5, src6, src7;
1699 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1700 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1701 v16u8 is_less_than_beta1, is_less_than_beta2;
1702 v8i16 tc, tc_orig_r, tc_plus1;
1703 v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1704 v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1705 v8u16 src2_r, src3_r;
1706 v8i16 p2_r, p1_r, q2_r, q1_r;
1707 v16u8 p2, q2, p0, q0;
1709 v16i8 zeros = { 0 };
1711 alpha = (v16u8) __msa_fill_b(alpha_in);
1712 beta = (v16u8) __msa_fill_b(beta_in);
1715 data += (2 * stride);
1717 load = LD(data - 3);
1718 inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1719 load = LD(data - 3 + stride);
1720 inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1721 data += (2 * stride);
1725 data += (2 * stride);
1727 load = LD(data - 3);
1728 inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1729 load = LD(data - 3 + stride);
1730 inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1731 data += (2 * stride);
1735 data += (2 * stride);
1737 load = LD(data - 3);
1738 inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1739 load = LD(data - 3 + stride);
1740 inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1741 data += (2 * stride);
1745 data += (2 * stride);
1747 load = LD(data - 3);
1748 inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1749 load = LD(data - 3 + stride);
1750 inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1751 data += (2 * stride);
1754 ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1755 src0, src1, src2, src3);
1757 ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1758 ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1760 src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1761 src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1762 src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1763 src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1764 src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1765 src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1767 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1768 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1769 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1770 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1771 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1773 is_less_than_alpha = (p0_asub_q0 < alpha);
1774 is_less_than_beta = (p1_asub_p0 < beta);
1775 is_less_than = is_less_than_alpha & is_less_than_beta;
1776 is_less_than_beta = (q1_asub_q0 < beta);
1777 is_less_than = is_less_than_beta & is_less_than;
1779 is_less_than_beta1 = (p2_asub_p0 < beta);
1780 is_less_than_beta2 = (q2_asub_q0 < beta);
1782 p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1783 p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1784 p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1786 ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1790 ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1796 tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1797 tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1798 is_tc_orig1 = tc_orig;
1799 is_tc_orig2 = tc_orig;
1800 tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1803 p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1804 q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1809 PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1811 is_tc_orig1 = (zeros < is_tc_orig1);
1812 is_tc_orig2 = is_tc_orig1;
1813 is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1814 is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1815 is_tc_orig1 = is_less_than & is_tc_orig1;
1816 is_tc_orig2 = is_less_than & is_tc_orig2;
1818 p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1819 q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1821 q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1823 p1_sub_q1 = p1_r - q1_r;
1824 q0_sub_p0 += p1_sub_q1;
1825 q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1828 is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1829 (v16i8) is_less_than_beta1);
1830 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1832 is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1833 (v16i8) is_less_than_beta2);
1834 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1836 q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);

    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
    src2_r += q0_sub_p0;
    src3_r -= q0_sub_p0;

    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
    src3_r = (v8u16) CLIP_SH_0_255(src3_r);

    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);

    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);

    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);

    ILVRL_H2_SW(q2, p2, dst0, dst1);

    /* rewind to the first of the eight rows before writing back */
    data -= (8 * stride);

    out0 = __msa_copy_u_w(dst0, 0);
    out1 = __msa_copy_u_w(dst0, 1);
    out2 = __msa_copy_u_w(dst0, 2);
    out3 = __msa_copy_u_w(dst0, 3);

    if (tc0[0] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[1] < 0) {
        data += (2 * stride);
    } else {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
        data += stride;
    }

    out0 = __msa_copy_u_w(dst1, 0);
    out1 = __msa_copy_u_w(dst1, 1);
    out2 = __msa_copy_u_w(dst1, 2);
    out3 = __msa_copy_u_w(dst1, 3);

    if (tc0[2] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[3] >= 0) {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
    }
}
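
/* Inter deblocking of a horizontal chroma edge. Chroma filtering touches
 * only p0/q0: the rows p1, p0, q0, q1 around the edge are loaded, the
 * p0/q0 pair is corrected by the clipped delta, and the result is stored
 * back where the alpha/beta/bs tests passed. */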
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v16u8 alpha, beta;
    v8i16 tmp_vec;
    v8i16 bs = { 0 };
    v8i16 tc = { 0 };
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v8i16 p0_r, q0_r;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 tc_r, negate_tc_r;
    v16i8 zero = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB4(data - (img_width << 1), img_width,
               p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        /* chroma edges are 8 pixels wide; clear the unused upper half */
        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);

            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(q0_org, data);
            ST_UB(p0_org, (data - img_width));
        }
    }
}
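
/* Vertical-edge counterpart: eight 4-byte rows straddling the edge are
 * loaded and transposed so that p1, p0, q0 and q1 become vectors, filtered
 * as above, then the updated p0/q0 pairs are stored back 2 bytes per row. */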
static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    uint8_t *src;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p0, q0;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16u8 is_bs_greater_than0;
    v8i16 tc_r, negate_tc_r;
    v16i8 negate_tc, sign_negate_tc;
    v16i8 zero = { 0 };
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 tmp1, tmp_vec, bs = { 0 };
    v8i16 tc = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        LD_UB8((data - 2), img_width,
               row0, row1, row2, row3, row4, row5, row6, row7);

        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
                           row4, row5, row6, row7,
                           p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        /* chroma edges are 8 pixels tall here; clear the unused upper half */
        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
            src = data - 1;
            ST2x4_UB(tmp1, 0, src, img_width);
            src += 4 * img_width;
            ST2x4_UB(tmp1, 4, src, img_width);
        }
    }
}
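
/* Horizontal loop filter (vertical edge) for 4:2:2 chroma: 16 rows are
 * processed in four groups of four; each tc0 entry covers one group and a
 * non-positive value leaves its group untouched. */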
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
                                            int32_t alpha_in, int32_t beta_in,
                                            int8_t *tc0)
{
    int32_t col, tc_val;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = tc0[col];  /* non-positive tc0 disables this group */

        if (tc_val <= 0) {
            src += (4 * stride);
            continue;
        }

        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
        ST2x4_UB(res, 0, (src - 1), stride);
        src += (4 * stride);
    }
}
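
/* MBAFF 4:2:2 variant: per tc0 entry only the first two rows of the
 * 4-row group are filtered, and the p0/q0 pairs are written back with
 * 2-byte halfword stores. */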
static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
                                                  int32_t alpha_in,
                                                  int32_t beta_in,
                                                  int8_t *tc0)
{
    int32_t col, tc_val;
    int16_t out0, out1;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = tc0[col];  /* non-positive tc0 disables this group */

        if (tc_val <= 0) {
            src += (4 * stride);
            continue;
        }

        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);

        out0 = __msa_copy_s_h((v8i16) res, 0);
        out1 = __msa_copy_s_h((v8i16) res, 1);

        SH(out0, (src - 1));
        src += stride;
        SH(out1, (src - 1));
        src += (3 * stride);  /* both loop paths advance four rows in total */
    }
}
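
/* Public entry points registered by ff_h264dsp_init_mips(). FFmpeg encodes
 * "boundary strength 0" as tc[i] < 0, which is translated here into the
 * per-edge bs flags expected by the MSA kernels above. */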
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = tc[0] >= 0, bs1 = tc[1] >= 0;
    uint8_t bs2 = tc[2] >= 0, bs3 = tc[3] >= 0;

    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = tc[0] >= 0, bs1 = tc[1] >= 0;
    uint8_t bs2 = tc[2] >= 0, bs3 = tc[3] >= 0;

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = tc[0] >= 0, bs1 = tc[1] >= 0;
    uint8_t bs2 = tc[2] >= 0, bs3 = tc[3] >= 0;

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = tc[0] >= 0, bs1 = tc[1] >= 0;
    uint8_t bs2 = tc[2] >= 0, bs3 = tc[3] >= 0;

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}

void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}

void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}

void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}

void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
                                         ptrdiff_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                               ptrdiff_t ystride,
                                               int32_t alpha,
                                               int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
                                          ptrdiff_t ystride,
                                          int alpha, int beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                ptrdiff_t ystride,
                                                int alpha, int beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}
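
/* Explicit weighted prediction over one reference; per sample (scalar
 * sketch with illustrative names):
 *
 *     dst[x] = av_clip_uint8(((src[x] * weight + (1 << (log2_denom - 1)))
 *                             >> log2_denom) + offset);
 *
 * (the rounding term is dropped when log2_denom is 0). */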
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset);
}

void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
}

void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset);
}
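
/* Bi-directional weighted prediction: the two references are blended as
 *
 *     dst[x] = av_clip_uint8((src[x] * weight_src + dst[x] * weight_dst
 *                             + rnd) >> (log2_denom + 1));
 *
 * where rnd = ((offset + 1) | 1) << log2_denom folds in the forced-odd
 * offset, matching FFmpeg's scalar biweight reference. */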
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
                          weight_src, weight_dst, offset);
}

void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
                         weight_src, weight_dst, offset);
}

void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
                         weight_src, weight_dst, offset);
}