/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
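
/* MSA (MIPS SIMD Architecture) versions of the H.264 weighted-prediction
 * and deblocking-filter DSP routines.
 *
 * The avc_wgt_WxH functions below implement explicit weighted prediction.
 * A scalar sketch of the per-pixel operation (matching FFmpeg's generic C
 * code for log2_denom > 0; names here are illustrative only):
 *
 *     pix = av_clip_uint8((pix * weight + (offset << log2_denom) +
 *                          (1 << (log2_denom - 1))) >> log2_denom);
 *
 * In the vector code the rounding term is not added explicitly:
 * __msa_srlr_h() is a rounded logical shift, __msa_maxi_s_h(x, 0) clamps
 * negative sums before that shift, and __msa_sat_u_h(x, 7) saturates the
 * result to the unsigned 8-bit range before packing.
 */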
static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
{
    uint32_t tp0, tp1, offset_val;
    v16u8 zero = { 0 };
    v16u8 src0 = { 0 };
    v8i16 src0_r, tmp0, wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LW2(data, stride, tp0, tp1);
    INSERT_W2_UB(tp0, tp1, src0);
    src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
    tmp0 = wgt * src0_r;
    tmp0 = __msa_adds_s_h(tmp0, offset);
    tmp0 = __msa_maxi_s_h(tmp0, 0);
    tmp0 = __msa_srlr_h(tmp0, denom);
    tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
    src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
    ST4x2_UB(src0, data, stride);
}

static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
                            int32_t src_weight, int32_t offset_in)
{
    uint32_t tp0, tp1, tp2, tp3, offset_val;
    v16u8 src0 = { 0 };
    v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LW4(data, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
    UNPCK_UB_SH(src0, src0_r, src1_r);
    MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
    ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
    MAXI_SH2_SH(tmp0, tmp1, 0);
    tmp0 = __msa_srlr_h(tmp0, denom);
    tmp1 = __msa_srlr_h(tmp1, denom);
    SAT_UH2_SH(tmp0, tmp1, 7);
    src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
}

static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
                            int32_t src_weight, int32_t offset_in)
{
    uint32_t tp0, tp1, tp2, tp3, offset_val;
    v16u8 src0 = { 0 }, src1 = { 0 };
    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LW4(data, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
    LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
    UNPCK_UB_SH(src0, src0_r, src1_r);
    UNPCK_UB_SH(src1, src2_r, src3_r);
    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
         tmp3);
    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
                tmp1, tmp2, tmp3);
    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    ST4x8_UB(src0, src1, data, stride);
}

static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
                            int32_t src_weight, int32_t offset_in)
{
    uint32_t offset_val;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 };
    v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LD4(data, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, src0);
    INSERT_D2_UB(tp2, tp3, src1);
    UNPCK_UB_SH(src0, src0_r, src1_r);
    UNPCK_UB_SH(src1, src2_r, src3_r);
    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
         tmp3);
    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
                tmp1, tmp2, tmp3);
    MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
    SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
    SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    ST8x4_UB(src0, src1, data, stride);
}

static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
                            int32_t src_weight, int32_t offset_in)
{
    uint32_t offset_val;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LD4(data, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, src0);
    INSERT_D2_UB(tp2, tp3, src1);
    LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, src2);
    INSERT_D2_UB(tp2, tp3, src3);
    UNPCK_UB_SH(src0, src0_r, src1_r);
    UNPCK_UB_SH(src1, src2_r, src3_r);
    UNPCK_UB_SH(src2, src4_r, src5_r);
    UNPCK_UB_SH(src3, src6_r, src7_r);
    MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
         tmp3);
    MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
         tmp7);
    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
                tmp1, tmp2, tmp3);
    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
                tmp5, tmp6, tmp7);
    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                src2, src3);
    ST8x8_UB(src0, src1, src2, src3, data, stride);
}

static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
                             int32_t src_weight, int32_t offset_in)
{
    uint32_t offset_val, cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(src_weight);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    for (cnt = 2; cnt--;) {
        LD4(data, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, src2);
        INSERT_D2_UB(tp2, tp3, src3);
        UNPCK_UB_SH(src0, src0_r, src1_r);
        UNPCK_UB_SH(src1, src2_r, src3_r);
        UNPCK_UB_SH(src2, src4_r, src5_r);
        UNPCK_UB_SH(src3, src6_r, src7_r);
        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
             tmp2, tmp3);
        MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
             tmp6, tmp7);
        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
                    tmp4, tmp5, tmp6, tmp7);
        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                    src2, src3);
        ST8x8_UB(src0, src1, src2, src3, data, stride);
        data += 8 * stride;
    }
}

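/* Bi-weighted prediction.  The scalar operation (per FFmpeg's generic C
 * template, shown here only for illustration) is:
 *
 *     offset = ((offset_in + 1) | 1) << log2_denom;
 *     dst = av_clip_uint8((src * src_weight + dst * dst_weight + offset)
 *                         >> (log2_denom + 1));
 *
 * The MSA versions XOR both inputs with 128 so the unsigned pixels become
 * signed bytes, then use a single __msa_dpadd_s_h() per 8 pixels with the
 * two weights interleaved in one vector.  The bias of
 * 128 * (src_weight + dst_weight) that the XOR introduces is compensated
 * by pre-adding add_val to the offset vector.
 */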
static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
                              uint8_t *dst, int32_t dst_stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
{
    uint32_t load0, load1, out0, out1;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, dst0, dst1;
    v8i16 temp0, temp1, denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    load0 = LW(src);
    load1 = LW(src + src_stride);
    src0 = (v16i8) __msa_fill_w(load0);
    src1 = (v16i8) __msa_fill_w(load1);

    load0 = LW(dst);
    load1 = LW(dst + dst_stride);
    dst0 = (v16i8) __msa_fill_w(load0);
    dst1 = (v16i8) __msa_fill_w(load1);

    XORI_B4_128_SB(src0, src1, dst0, dst1);
    ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);

    temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
    temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);

    temp0 >>= denom;
    temp1 >>= denom;

    CLIP_SH2_0_255(temp0, temp1);
    PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_w((v4i32) dst1, 0);
    SW(out0, dst);
    dst += dst_stride;
    SW(out1, dst);
}

static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int32_t height, int32_t log2_denom,
                                      int32_t src_weight, int32_t dst_weight,
                                      int32_t offset_in)
{
    uint8_t cnt;
    uint32_t load0, load1, load2, load3;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LW4(src, src_stride, load0, load1, load2, load3);
        src += (4 * src_stride);

        src0 = (v16i8) __msa_fill_w(load0);
        src1 = (v16i8) __msa_fill_w(load1);
        src2 = (v16i8) __msa_fill_w(load2);
        src3 = (v16i8) __msa_fill_w(load3);

        LW4(dst, dst_stride, load0, load1, load2, load3);
        dst0 = (v16i8) __msa_fill_w(load0);
        dst1 = (v16i8) __msa_fill_w(load1);
        dst2 = (v16i8) __msa_fill_w(load2);
        dst3 = (v16i8) __msa_fill_w(load3);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height, int32_t log2_denom,
                                 int32_t src_weight, int32_t dst_weight,
                                 int32_t offset_in)
{
    if (2 == height) {
        avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
                          src_weight, dst_weight, offset_in);
    } else {
        avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
                                  log2_denom, src_weight, dst_weight,
                                  offset_in);
    }
}

static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height, int32_t log2_denom,
                                 int32_t src_weight, int32_t dst_weight,
                                 int32_t offset_in)
{
    uint8_t cnt;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v16i8 out0, out1;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   temp0, temp1, temp2, temp3);

        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += 4 * dst_stride;
    }
}

static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t log2_denom,
                                  int32_t src_weight, int32_t dst_weight,
                                  int32_t offset_in)
{
    uint8_t cnt;
    v16i8 src_wgt, dst_wgt, wgt;
    v16i8 src0, src1, src2, src3;
    v16i8 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 denom, offset, add_val;
    int32_t val = 128 * (src_weight + dst_weight);

    offset_in = ((offset_in + 1) | 1) << log2_denom;

    src_wgt = __msa_fill_b(src_weight);
    dst_wgt = __msa_fill_b(dst_weight);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);
    add_val = __msa_fill_h(val);
    offset += add_val;

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    for (cnt = height / 4; cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec0, vec2, vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
                   vec1, vec3, vec5, vec7);

        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);

        SRA_4V(temp0, temp1, temp2, temp3, denom);
        SRA_4V(temp4, temp5, temp6, temp7, denom);
        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
        PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += 4 * dst_stride;
    }
}

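/* Strong (bS == 4) luma filtering of one side of an edge, on 8 unsigned
 * 16-bit elements.  With the p-side arguments this computes the standard
 * H.264 expressions, where thresh = p0 + q0 + p1:
 *
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 *
 * The same macro is reused for the q side by swapping the arguments.
 */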
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    v8i16 threshold;                                                        \
    v8i16 const3 = __msa_ldi_h(3);                                          \
                                                                            \
    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
    threshold += (p1_or_q1_org_in);                                         \
                                                                            \
    (p0_or_q0_out) = threshold << 1;                                        \
    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
                                                                            \
    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
                                                                            \
    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
    (p2_or_q2_out) += threshold;                                            \
    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
}

/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
                         p1_or_q1_org_in, p0_or_q0_out)      \
{                                                            \
    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
}

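/* Normal-filter update of p1 (or q1), 8 elements at a time:
 *
 *     p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
 *
 * __msa_aver_u_h() produces the rounded average (p0 + q0 + 1) >> 1 and
 * __msa_ave_s_h() the truncating average used for the final >> 1.
 */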
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
                         p1_or_q1_org_in, p2_or_q2_org_in,    \
                         negate_tc_in, tc_in, p1_or_q1_out)   \
{                                                             \
    v8i16 clip3, temp;                                        \
                                                              \
    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
                                   (v8u16) q0_or_p0_org_in);  \
    temp = p1_or_q1_org_in << 1;                              \
    clip3 = clip3 - temp;                                     \
    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
    p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
}

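/* Normal-filter update of the p0/q0 pair:
 *
 *     delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
 *     p0'   = av_clip_uint8(p0 + delta)
 *     q0'   = av_clip_uint8(q0 - delta)
 */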
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
                     p1_or_q1_org_in, q1_or_p1_org_in,          \
                     negate_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)                \
{                                                               \
    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
                                                                \
    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
    q0_sub_p0 <<= 2;                                            \
    p1_sub_q1 += 4;                                             \
    delta = q0_sub_p0 + p1_sub_q1;                              \
    delta >>= 3;                                                \
                                                                \
    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
                                                                \
    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
                                                                \
    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
}

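/* Vertical-edge chroma 4:2:2 loop filter for a group of 4 rows: loads a
 * 4x4 block around the edge at src - 2, transposes it so p1 p0 q0 q1
 * become vectors, applies the normal chroma filter to p0/q0 and returns
 * the filtered pair interleaved in 'res' for a transposed 2-byte-per-row
 * store by the caller.
 */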
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
{                                                                        \
    uint32_t load0, load1, load2, load3;                                 \
    v16u8 src0 = { 0 };                                                  \
    v16u8 src1 = { 0 };                                                  \
    v16u8 src2 = { 0 };                                                  \
    v16u8 src3 = { 0 };                                                  \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
    v8i16 res0_r, res1_r;                                                \
    v16i8 zeros = { 0 };                                                 \
    v16u8 res0, res1;                                                    \
                                                                         \
    LW4((src - 2), stride, load0, load1, load2, load3);                  \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
                                                                         \
    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
                                                                         \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
                                                                         \
    tc = __msa_fill_h(tc_val);                                           \
                                                                         \
    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
    is_less_than_beta = (p1_asub_p0 < beta);                             \
    is_less_than = is_less_than_alpha & is_less_than_beta;               \
    is_less_than_beta = (q1_asub_q0 < beta);                             \
    is_less_than = is_less_than_beta & is_less_than;                     \
                                                                         \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
                                                                         \
    q0_sub_p0 <<= 2;                                                     \
    delta = q0_sub_p0 + p1_sub_q1;                                       \
    delta = __msa_srari_h(delta, 3);                                     \
                                                                         \
    delta = CLIP_SH(delta, -tc, tc);                                     \
                                                                         \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
                                                                         \
    res0_r += delta;                                                     \
    res1_r -= delta;                                                     \
                                                                         \
    CLIP_SH2_0_255(res0_r, res1_r);                                      \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
                                                                         \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
                                                                         \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
}

#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
{                                                            \
    v16i8 zero_m = { 0 };                                    \
                                                             \
    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                 \
}

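/* Same vertical-edge chroma 4:2:2 filter as AVC_LPF_H_CHROMA_422, but for
 * a group of only 2 rows, using the 2x4 byte transpose above.
 */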
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
{                                                                          \
    uint32_t load0, load1;                                                 \
    v16u8 src0 = { 0 };                                                    \
    v16u8 src1 = { 0 };                                                    \
    v16u8 src2 = { 0 };                                                    \
    v16u8 src3 = { 0 };                                                    \
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
    v16i8 zeros = { 0 };                                                   \
    v16u8 res0, res1;                                                      \
                                                                           \
    load0 = LW(src - 2);                                                   \
    load1 = LW(src - 2 + stride);                                          \
                                                                           \
    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
                                                                           \
    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
                                                                           \
    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
                                                                           \
    tc = __msa_fill_h(tc_val);                                             \
                                                                           \
    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
    is_less_than_beta = (p1_asub_p0 < beta);                               \
    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
    is_less_than_beta = (q1_asub_q0 < beta);                               \
    is_less_than = is_less_than_beta & is_less_than;                       \
                                                                           \
    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
                                                                           \
    q0_sub_p0 <<= 2;                                                       \
    delta = q0_sub_p0 + p1_sub_q1;                                         \
    delta = __msa_srari_h(delta, 3);                                       \
    delta = CLIP_SH(delta, -tc, tc);                                       \
                                                                           \
    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
                                                                           \
    res0_r += delta;                                                       \
    res1_r -= delta;                                                       \
                                                                           \
    CLIP_SH2_0_255(res0_r, res1_r);                                        \
    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
                                                                           \
    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
                                                                           \
    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
}

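/* Intra (bS == 4) luma filtering of a horizontal edge.  The edge is
 * filtered only where |p0 - q0| < alpha, |p1 - p0| < beta and
 * |q1 - q0| < beta; within that mask, the strong p0/p1/p2 (q0/q1/q2)
 * filter is used where additionally |p0 - q0| < (alpha >> 2) + 2 and
 * |p2 - p0| < beta (resp. |q2 - q0| < beta), otherwise p0/q0 get the weak
 * 3-tap filter.  All conditions are evaluated on whole vectors and the
 * results are blended with __msa_bmnz_v() instead of branching per pixel.
 */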
static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v16i8 zero = { 0 };

    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha_in);
    is_less_than_beta = (p1_asub_p0 < beta_in);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta_in);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
        v8i16 p0_r, q0_r, p0_l, q0_l;
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
        v16u8 q2_org = LD_UB(data + (2 * img_width));
        v16u8 p2_org = LD_UB(data - (3 * img_width));
        v16u8 tmp_flag = (v16u8) __msa_fill_b((alpha_in >> 2) + 2);

        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta_in);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
        q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

        /* combine and store */
        if (!__msa_test_bz_v(is_less_than_beta)) {
            v8i16 p3_org_l, p3_org_r;
            v16u8 p3_org = LD_UB(data - (img_width << 2));
            v16u8 p2, p1;
            v8i16 p2_r, p2_l, p1_r, p1_l;

            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);

            ST_UB(p1_org, data - (2 * img_width));
            ST_UB(p2_org, data - (3 * img_width));
        }

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);

        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        ST_UB(p0_org, data - img_width);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta_in);
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (!__msa_test_bz_v(is_less_than_beta)) {
            v8i16 q3_org_r, q3_org_l;
            v16u8 q3_org = LD_UB(data + (3 * img_width));
            v16u8 q2, q1;
            v8i16 q2_r, q2_l, q1_r, q1_l;

            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);

            ST_UB(q1_org, data + img_width);
            ST_UB(q2_org, data + 2 * img_width);
        }

        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);

        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);

        ST_UB(q0_org, data);
    }
}

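/* Vertical-edge variant of the intra luma filter: 16 rows are loaded and
 * transposed so p3..q3 become vectors, filtered exactly as in the
 * horizontal case, then the six changed columns are transposed back and
 * written with 4-byte plus 2-byte stores per row.
 */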
static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    uint8_t *src = data - 4;
    v16u8 alpha, beta, p0_asub_q0;
    v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v16u8 p1_asub_p0, q1_asub_q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

    LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src + (8 * img_width), img_width,
           row8, row9, row10, row11, row12, row13, row14, row15);

    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
                        row4, row5, row6, row7,
                        row8, row9, row10, row11,
                        row12, row13, row14, row15,
                        p3_org, p2_org, p1_org, p0_org,
                        q0_org, q1_org, q2_org, q3_org);

    p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (!__msa_test_bz_v(is_less_than)) {
        v16i8 zero = { 0 };
        v8i16 p0_r, q0_r, p0_l, q0_l;
        v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
        v16u8 negate_is_less_than_beta;
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
        UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);

        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = (p0_asub_q0 < tmp_flag);

        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
        is_less_than_beta = (p2_asub_p0 < beta);
        is_less_than_beta = tmp_flag & is_less_than_beta;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (!__msa_test_bz_v(is_less_than_beta)) {
            v16u8 p2, p1;
            v8i16 p3_org_r, p3_org_l;
            v8i16 p2_r, p2_l, p1_r, p1_l;

            ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
                                     p2_r, q1_org_r, p0_r, p1_r, p2_r);

            ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
                                     p2_l, q1_org_l, p0_l, p1_l, p2_l);

            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
        AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);

        p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
        p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);

        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
        is_less_than_beta = (q2_asub_q0 < beta);

        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);

        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (!__msa_test_bz_v(is_less_than_beta)) {
            v16u8 q2, q1;
            v8i16 q3_org_r, q3_org_l;
            v8i16 q2_r, q2_l, q1_r, q1_l;

            ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
                                     q2_r, p1_org_r, q0_r, q1_r, q2_r);

            ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
                                     q2_l, p1_org_l, q0_l, q1_l, q2_l);

            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
        AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);

        q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
        q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);

        {
            v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

            ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
            ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
            ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);

            ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
            ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);

            src = data - 3;
            ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp2, 0, src + 4, img_width);
            src += 4 * img_width;
            ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp2, 4, src + 4, img_width);
            src += 4 * img_width;

            ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp5, 0, src + 4, img_width);
            src += 4 * img_width;
            ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
            ST2x4_UB(tmp5, 4, src + 4, img_width);
        }
    }
}

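/* Intra MBAFF luma filtering of a vertical edge, 8 rows at a time.  After
 * the interleave-based transpose, src6 and src7 hold p3/q3 while
 * src0..src5 hold p2 p1 p0 q0 q1 q2; strong/weak decisions are made per
 * column and the filtered columns are written back with 4-byte plus
 * 2-byte stores per row.
 */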
static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
                                                   int32_t alpha_in,
                                                   int32_t beta_in)
{
    uint64_t load0, load1;
    uint32_t out0, out2;
    uint16_t out1, out3;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
    v16u8 is_less_than_beta1, is_less_than_beta2;
    v16i8 src0 = { 0 };
    v16i8 src1 = { 0 };
    v16i8 src2 = { 0 };
    v16i8 src3 = { 0 };
    v16i8 src4 = { 0 };
    v16i8 src5 = { 0 };
    v16i8 src6 = { 0 };
    v16i8 src7 = { 0 };
    v16u8 zeros = { 0 };

    load0 = LD(src - 4);
    load1 = LD(src + stride - 4);
    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);

    load0 = LD(src + (2 * stride) - 4);
    load1 = LD(src + (3 * stride) - 4);
    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);

    load0 = LD(src + (4 * stride) - 4);
    load1 = LD(src + (5 * stride) - 4);
    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);

    load0 = LD(src + (6 * stride) - 4);
    load1 = LD(src + (7 * stride) - 4);
    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
               src0, src1, src2, src3);

    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);

    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);

    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than & is_less_than_beta;

    alpha >>= 2;
    alpha += 2;

    is_less_than_alpha = (p0_asub_q0 < alpha);

    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
    is_less_than_beta1 = (p2_asub_p0 < beta);
    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
    is_less_than_beta2 = (q2_asub_q0 < beta);

    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
               src0_r, src1_r, src2_r, src3_r);
    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
               src4_r, src5_r, src6_r, src7_r);

    dst2_x_r = src1_r + src2_r + src3_r;
    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
    dst1_r = src0_r + src1_r + src2_r + src3_r;
    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);

    dst0_r = (2 * src6_r) + (3 * src0_r);
    dst0_r += src1_r + src2_r + src3_r;
    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);

    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);

    dst3_x_r = src2_r + src3_r + src4_r;
    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
    dst4_r = src2_r + src3_r + src4_r + src5_r;
    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);

    dst5_r = (2 * src7_r) + (3 * src5_r);
    dst5_r += src4_r + src3_r + src2_r;
    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);

    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);

    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);

    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);

    is_less_than = is_less_than_alpha & is_less_than;
    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);

    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);

    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);

    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
    SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
    SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);

    out0 = __msa_copy_u_w((v4i32) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 2);
    out2 = __msa_copy_u_w((v4i32) dst1, 0);
    out3 = __msa_copy_u_h((v8i16) dst1, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst4, 0);
    out1 = __msa_copy_u_h((v8i16) dst4, 2);
    out2 = __msa_copy_u_w((v4i32) dst5, 0);
    out3 = __msa_copy_u_h((v8i16) dst5, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
    src += stride;

    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);

    SW(out0, (src - 3));
    SH(out1, (src + 1));
    src += stride;
    SW(out2, (src - 3));
    SH(out3, (src + 1));
}

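/* Intra chroma filtering of a horizontal edge.  Both chroma planes use
 * the same kernel, so the p/q naming is symmetric:
 *
 *     p0' = (2*p1 + p0 + q1 + 2) >> 2
 *     q0' = (2*q1 + q0 + p1 + 2) >> 2
 *
 * applied where |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta.
 */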
static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v16u8 alpha, beta;
    v16u8 is_less_than;
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16i8 zero = { 0 };
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;

    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

        p0_or_q0_org =
            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
        q0_or_p0_org =
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);

        ST_UB(q0_or_p0_org, data_cb_or_cr);
        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
    }
}

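/* Vertical-edge variant of the intra chroma filter: 8 rows of 4 bytes
 * around the edge are transposed, filtered as above, and the p0/q0 pair
 * is written back as 2 bytes per row.
 */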
static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v16u8 alpha, beta, is_less_than;
    v8i16 p0_or_q0, q0_or_p0;
    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
    v16i8 zero = { 0 };
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than_alpha, is_less_than_beta;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 tmp1;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8((data_cb_or_cr - 2), img_width,
           row0, row1, row2, row3, row4, row5, row6, row7);

    TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                       p1_or_q1_org, p0_or_q0_org,
                       q0_or_p0_org, q1_or_p1_org);

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);

    is_less_than_alpha = (p0_asub_q0 < alpha);
    is_less_than_beta = (p1_asub_p0 < beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = (q1_asub_q0 < beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

    if (!__msa_test_bz_v(is_less_than)) {
        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);

        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);

        /* convert 16 bit output into 8 bit output */
        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);

        p0_or_q0_org =
            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
        q0_or_p0_org =
            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);

        data_cb_or_cr -= 1;
        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
        data_cb_or_cr += 4 * img_width;
        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
    }
}

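/* Inter (bS < 4) luma filtering of a vertical edge.  bs0..bs3 and
 * tc0..tc3 are the per-4x4-edge boundary strengths and clipping values;
 * they are broadcast into byte vectors so the per-column masks can be
 * computed in parallel.  tc is incremented per column where the extra
 * |p2 - p0| < beta (resp. |q2 - q0| < beta) condition holds, as the
 * standard requires.
 */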
static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
                                                   uint8_t bs0, uint8_t bs1,
                                                   uint8_t bs2, uint8_t bs3,
                                                   uint8_t tc0, uint8_t tc1,
                                                   uint8_t tc2, uint8_t tc3,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t img_width)
{
    v16u8 tmp_vec, bs = { 0 };

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        uint8_t *src = data - 4;
        v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
        v16u8 is_bs_greater_than0;
        v16u8 tc = { 0 };
        v16i8 zero = { 0 };

        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        is_bs_greater_than0 = ((v16u8) zero < bs);

        {
            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

            LD_UB8(src, img_width,
                   row0, row1, row2, row3, row4, row5, row6, row7);
            src += (8 * img_width);
            LD_UB8(src, img_width,
                   row8, row9, row10, row11, row12, row13, row14, row15);

            TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                                row8, row9, row10, row11,
                                row12, row13, row14, row15,
                                p3_org, p2_org, p1_org, p0_org,
                                q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 negate_tc, sign_negate_tc;
            v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
            v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
            v8i16 p0_r, q0_r, p0_l, q0_l;

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            UNPCK_UB_SH(tc, tc_r, tc_l);
            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 p1;
                v8i16 p1_r, p1_l;
                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                 negate_tc_r, tc_r, p1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                 i16_negatetc_l, tc_l, p1_l);

                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;
            }

            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (q2_asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 q1;
                v8i16 q1_r, q1_l;
                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                 negate_tc_r, tc_r, q1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                 i16_negatetc_l, tc_l, q1_l);

                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + is_less_than_beta;
            }

            {
                v8i16 threshold_r, negate_thresh_r;
                v8i16 threshold_l, negate_thresh_l;
                v16i8 negate_thresh, sign_negate_thresh;

                negate_thresh = zero - (v16i8) tc;
                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                           threshold_r, negate_thresh_r);

                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                             negate_thresh_r, threshold_r, p0_r, q0_r);

                threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                                                       negate_thresh);

                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                             negate_thresh_l, threshold_l, p0_l, q0_l);
            }

            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            {
                v16i8 tp0, tp1, tp2, tp3;
                v8i16 tmp2, tmp5;
                v4i32 tmp3, tmp4, tmp6, tmp7;
                uint32_t out0, out2;
                uint16_t out1, out3;

                src = data - 3;

                ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
                ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
                ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);

                ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
                ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);

                out0 = __msa_copy_u_w(tmp3, 0);
                out1 = __msa_copy_u_h(tmp2, 0);
                out2 = __msa_copy_u_w(tmp3, 1);
                out3 = __msa_copy_u_h(tmp2, 1);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp3, 2);
                out1 = __msa_copy_u_h(tmp2, 2);
                out2 = __msa_copy_u_w(tmp3, 3);
                out3 = __msa_copy_u_h(tmp2, 3);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp4, 0);
                out1 = __msa_copy_u_h(tmp2, 4);
                out2 = __msa_copy_u_w(tmp4, 1);
                out3 = __msa_copy_u_h(tmp2, 5);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp4, 2);
                out1 = __msa_copy_u_h(tmp2, 6);
                out2 = __msa_copy_u_w(tmp4, 3);
                out3 = __msa_copy_u_h(tmp2, 7);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp6, 0);
                out1 = __msa_copy_u_h(tmp5, 0);
                out2 = __msa_copy_u_w(tmp6, 1);
                out3 = __msa_copy_u_h(tmp5, 1);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp6, 2);
                out1 = __msa_copy_u_h(tmp5, 2);
                out2 = __msa_copy_u_w(tmp6, 3);
                out3 = __msa_copy_u_h(tmp5, 3);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp7, 0);
                out1 = __msa_copy_u_h(tmp5, 4);
                out2 = __msa_copy_u_w(tmp7, 1);
                out3 = __msa_copy_u_h(tmp5, 5);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
                src += img_width;

                out0 = __msa_copy_u_w(tmp7, 2);
                out1 = __msa_copy_u_h(tmp5, 6);
                out2 = __msa_copy_u_w(tmp7, 3);
                out3 = __msa_copy_u_h(tmp5, 7);

                SW(out0, src);
                SH(out1, (src + 4));
                src += img_width;
                SW(out2, src);
                SH(out3, (src + 4));
            }
        }
    }
}

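/* Horizontal-edge variant of the inter luma filter: same math as the
 * vertical case but without the transposes, so whole rows are loaded
 * and stored directly.
 */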
static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
                                                   uint8_t bs0, uint8_t bs1,
                                                   uint8_t bs2, uint8_t bs3,
                                                   uint8_t tc0, uint8_t tc1,
                                                   uint8_t tc2, uint8_t tc3,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   uint32_t image_width)
{
    v16u8 tmp_vec;
    v16u8 bs = { 0 };

    tmp_vec = (v16u8) __msa_fill_b(bs0);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs1);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs2);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
    tmp_vec = (v16u8) __msa_fill_b(bs3);
    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);

    if (!__msa_test_bz_v(bs)) {
        v16u8 alpha, beta, is_less_than, is_less_than_beta;
        v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
        v16u8 is_less_than_alpha, is_bs_greater_than0;
        v8i16 p0_r, q0_r, p0_l, q0_l;
        v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
        v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
        v16i8 zero = { 0 };
        v16i8 tc = { 0 };

        tmp_vec = (v16u8) __msa_fill_b(tc0);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc1);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc2);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
        tmp_vec = (v16u8) __msa_fill_b(tc3);
        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB5(data - (3 * image_width), image_width,
               p2_org, p1_org, p0_org, q0_org, q1_org);

        is_bs_greater_than0 = ((v16u8) zero < bs);
        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (!__msa_test_bz_v(is_less_than)) {
            v16i8 sign_negate_tc, negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
            v16u8 p2_asub_p0, q2_asub_q0;

            q2_org = LD_UB(data + (2 * image_width));
            negate_tc = zero - tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);

            UNPCK_UB_SH(tc, tc_r, tc_l);
            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);

            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
            is_less_than_beta = (p2_asub_p0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 p1;
                v8i16 p1_r, p1_l;
                v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
                v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                 negate_tc_r, tc_r, p1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                 i16_negatetc_l, tc_l, p1_l);

                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
                ST_UB(p1_org, data - (2 * image_width));

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;
            }

            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
            is_less_than_beta = (q2_asub_q0 < beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);

            if (!__msa_test_bz_v(is_less_than_beta)) {
                v16u8 q1;
                v8i16 q1_r, q1_l;
                v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
                v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);

                AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                 negate_tc_r, tc_r, q1_r);
                AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                 i16_negatetc_l, tc_l, q1_l);

                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
                ST_UB(q1_org, data + image_width);

                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
                tc = tc + (v16i8) is_less_than_beta;
            }

            {
                v16i8 negate_thresh, sign_negate_thresh;
                v8i16 threshold_r, threshold_l;
                v8i16 negate_thresh_l, negate_thresh_r;

                negate_thresh = zero - tc;
                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);

                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
                           threshold_r, negate_thresh_r);
                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                             negate_thresh_r, threshold_r, p0_r, q0_r);

                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
                                                       negate_thresh);
                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                             negate_thresh_l, threshold_l, p0_l, q0_l);
            }

            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(p0_org, (data - image_width));
            ST_UB(q0_org, data);
        }
    }
}

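/* Inter MBAFF luma filtering of a vertical edge, 8 rows at a time, with a
 * per-row-pair tc0[] array: row pairs whose tc0 entry is negative are
 * skipped on both load and store.
 */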
1694 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
1695 int32_t alpha_in, int32_t beta_in,
1699 uint32_t out0, out1, out2, out3;
1711 v16i8 src0, src1, src2, src3;
1712 v8i16 src4, src5, src6, src7;
1713 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1714 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1715 v16u8 is_less_than_beta1, is_less_than_beta2;
1716 v8i16 tc, tc_orig_r, tc_plus1;
1717 v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1718 v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1719 v8u16 src2_r, src3_r;
1720 v8i16 p2_r, p1_r, q2_r, q1_r;
1721 v16u8 p2, q2, p0, q0;
1723 v16i8 zeros = { 0 };
1725 alpha = (v16u8) __msa_fill_b(alpha_in);
1726 beta = (v16u8) __msa_fill_b(beta_in);
1729 data += (2 * stride);
1731 load = LD(data - 3);
1732 inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1733 load = LD(data - 3 + stride);
1734 inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1735 data += (2 * stride);
1739 data += (2 * stride);
1741 load = LD(data - 3);
1742 inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1743 load = LD(data - 3 + stride);
1744 inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1745 data += (2 * stride);
1749 data += (2 * stride);
1751 load = LD(data - 3);
1752 inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1753 load = LD(data - 3 + stride);
1754 inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1755 data += (2 * stride);
1759 data += (2 * stride);
1761 load = LD(data - 3);
1762 inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1763 load = LD(data - 3 + stride);
1764 inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1765 data += (2 * stride);
1768 ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1769 src0, src1, src2, src3);
1771 ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1772 ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1774 src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1775 src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1776 src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1777 src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1778 src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1779 src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1781 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1782 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1783 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1784 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1785 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1787 is_less_than_alpha = (p0_asub_q0 < alpha);
1788 is_less_than_beta = (p1_asub_p0 < beta);
1789 is_less_than = is_less_than_alpha & is_less_than_beta;
1790 is_less_than_beta = (q1_asub_q0 < beta);
1791 is_less_than = is_less_than_beta & is_less_than;
1793 is_less_than_beta1 = (p2_asub_p0 < beta);
1794 is_less_than_beta2 = (q2_asub_q0 < beta);
1796 p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1797 p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1798 p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
    p2_r += p0_add_q0;
    p2_r >>= 1;
    p2_r -= p1_r;
    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
    q2_r += p0_add_q0;
    q2_r >>= 1;
    q2_r -= q1_r;

    tc_val = LW(tc0);
    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
    is_tc_orig1 = tc_orig;
    is_tc_orig2 = tc_orig;
    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
    tc = tc_orig_r;

    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);

    p2_r += p1_r;
    q2_r += q1_r;

    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);

    is_tc_orig1 = (zeros < is_tc_orig1);
    is_tc_orig2 = is_tc_orig1;
    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
    is_tc_orig1 = is_less_than & is_tc_orig1;
    is_tc_orig2 = is_less_than & is_tc_orig2;

    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);

    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
    q0_sub_p0 <<= 2;
    p1_sub_q1 = p1_r - q1_r;
    q0_sub_p0 += p1_sub_q1;
    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
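
    /* Core weak-filter delta:
     * delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3,
     * clipped to +/-tc below, then added to p0 and subtracted from q0. */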
    tc_plus1 = tc + 1;
    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
                                              (v16i8) is_less_than_beta1);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
    tc_plus1 = tc + 1;
    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
                                              (v16i8) is_less_than_beta2);
    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);

    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);

    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
    src2_r += q0_sub_p0;
    src3_r -= q0_sub_p0;

    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
    src3_r = (v8u16) CLIP_SH_0_255(src3_r);

    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);

    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);

    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);

    ILVRL_H2_SW(q2, p2, dst0, dst1);

    data = in;

    out0 = __msa_copy_u_w(dst0, 0);
    out1 = __msa_copy_u_w(dst0, 1);
    out2 = __msa_copy_u_w(dst0, 2);
    out3 = __msa_copy_u_w(dst0, 3);

    if (tc0[0] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[1] < 0) {
        data += (2 * stride);
    } else {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
        data += stride;
    }

    out0 = __msa_copy_u_w(dst1, 0);
    out1 = __msa_copy_u_w(dst1, 1);
    out2 = __msa_copy_u_w(dst1, 2);
    out3 = __msa_copy_u_w(dst1, 3);

    if (tc0[2] < 0) {
        data += (2 * stride);
    } else {
        SW(out0, (data - 2));
        data += stride;
        SW(out1, (data - 2));
        data += stride;
    }

    if (tc0[3] >= 0) {
        SW(out2, (data - 2));
        data += stride;
        SW(out3, (data - 2));
    }
}
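
/* Horizontal chroma edge filtering for inter blocks (bS < 4): loads the
 * p1/p0/q0/q1 rows around the edge, builds the per-pixel alpha/beta/bS
 * masks, and updates only p0 and q0, as chroma filtering never modifies
 * p1/q1. */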
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    v16u8 alpha, beta;
    v8i16 tmp_vec;
    v8i16 bs = { 0 };
    v8i16 tc = { 0 };
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v8i16 p0_r, q0_r;
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 tc_r, negate_tc_r;
    v16i8 zero = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        LD_UB4(data - (img_width << 1), img_width,
               p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);

            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);

            ST_UB(q0_org, data);
            ST_UB(p0_org, (data - img_width));
        }
    }
}
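
/* Vertical counterpart of the function above: eight 4-pixel rows are
 * loaded and transposed into p1/p0/q0/q1 columns, filtered identically,
 * and the filtered p0/q0 pair is re-interleaved and stored back 2 bytes
 * per row. */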
static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
                                                       uint8_t bs0, uint8_t bs1,
                                                       uint8_t bs2, uint8_t bs3,
                                                       uint8_t tc0, uint8_t tc1,
                                                       uint8_t tc2, uint8_t tc3,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       uint32_t img_width)
{
    uint8_t *src;
    v16u8 alpha, beta;
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
    v16u8 p0, q0;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16u8 is_bs_greater_than0;
    v8i16 tc_r, negate_tc_r;
    v16i8 negate_tc, sign_negate_tc;
    v16i8 zero = { 0 };
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 tmp1, tmp_vec, bs = { 0 };
    v8i16 tc = { 0 };

    tmp_vec = (v8i16) __msa_fill_b(bs0);
    bs = __msa_insve_h(bs, 0, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs1);
    bs = __msa_insve_h(bs, 1, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs2);
    bs = __msa_insve_h(bs, 2, tmp_vec);
    tmp_vec = (v8i16) __msa_fill_b(bs3);
    bs = __msa_insve_h(bs, 3, tmp_vec);

    if (!__msa_test_bz_v((v16u8) bs)) {
        tmp_vec = (v8i16) __msa_fill_b(tc0);
        tc = __msa_insve_h(tc, 0, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc1);
        tc = __msa_insve_h(tc, 1, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc2);
        tc = __msa_insve_h(tc, 2, tmp_vec);
        tmp_vec = (v8i16) __msa_fill_b(tc3);
        tc = __msa_insve_h(tc, 3, tmp_vec);

        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);

        LD_UB8((data - 2), img_width,
               row0, row1, row2, row3, row4, row5, row6, row7);

        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
                           row4, row5, row6, row7,
                           p1_org, p0_org, q0_org, q1_org);

        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);

        alpha = (v16u8) __msa_fill_b(alpha_in);
        beta = (v16u8) __msa_fill_b(beta_in);

        is_less_than_alpha = (p0_asub_q0 < alpha);
        is_less_than_beta = (p1_asub_p0 < beta);
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = (q1_asub_q0 < beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);

        if (!__msa_test_bz_v(is_less_than)) {
            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);

            negate_tc = zero - (v16i8) tc;
            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);

            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);

            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
                         tc_r, p0_r, q0_r);

            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);

            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
            src = data - 1;
            ST2x4_UB(tmp1, 0, src, img_width);
            src += 4 * img_width;
            ST2x4_UB(tmp1, 4, src, img_width);
        }
    }
}
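
/* Vertical chroma edge filtering for 4:2:2 content. The tc0[] values
 * arrive biased by +1 (hence the "(tc0[col] - 1) + 1" expression below);
 * each entry covers a group of four rows, and a non-positive value marks
 * a group that is left unfiltered. */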
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
                                            int32_t alpha_in, int32_t beta_in,
                                            int8_t *tc0)
{
    int32_t col, tc_val;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = (tc0[col] - 1) + 1;

        if (0 >= tc_val) {
            src += (4 * stride);
            continue;
        }

        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
        ST2x4_UB(res, 0, (src - 1), stride);
        src += (4 * stride);
    }
}
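
/* MBAFF 4:2:2 variant of the function above. The thresholds are the same,
 * but each tc0[] entry appears to cover only two rows, which are stored
 * with two 2-byte writes. */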
static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
                                                  int32_t alpha_in,
                                                  int32_t beta_in,
                                                  int8_t *tc0)
{
    int32_t col, tc_val;
    int16_t out0, out1;
    v16u8 alpha, beta, res;

    alpha = (v16u8) __msa_fill_b(alpha_in);
    beta = (v16u8) __msa_fill_b(beta_in);

    for (col = 0; col < 4; col++) {
        tc_val = (tc0[col] - 1) + 1;

        if (0 >= tc_val) {
            src += (2 * stride);
            continue;
        }

        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);

        out0 = __msa_copy_s_h((v8i16) res, 0);
        out1 = __msa_copy_s_h((v8i16) res, 1);

        SH(out0, (src - 1));
        src += stride;
        SH(out1, (src - 1));
        src += stride;
    }
}
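
/* The ff_h264_* wrappers below adapt FFmpeg's h264dsp interface to the MSA
 * helpers above. For the inter (bS < 4) filters, h264dsp does not pass the
 * boundary strengths explicitly; a negative tc value marks an edge segment
 * that must not be filtered, which is translated here into bs = 0. */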
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}

void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}

void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}

void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}

void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}

void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}

void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
                                         ptrdiff_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                               ptrdiff_t ystride,
                                               int32_t alpha,
                                               int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
                                          ptrdiff_t ystride,
                                          int alpha,
                                          int beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}

void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                ptrdiff_t ystride,
                                                int alpha,
                                                int beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}
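
/* Explicit weighted prediction, 16 pixels wide. A scalar sketch of what
 * the vector code below computes per pixel (clip_uint8() stands for an
 * av_clip_uint8-style clamp, not a helper defined in this file):
 *
 *     tmp = src * weight_src + (offset_in << log2_denom);
 *     tmp = FFMAX(tmp, 0);
 *     out = clip_uint8((tmp + round) >> log2_denom);
 *
 * where round = (1 << (log2_denom - 1)) for log2_denom > 0 and 0
 * otherwise; the rounding comes from SRLR (shift right with rounding)
 * and the final clamp from the SAT_UH(..., 7) saturation. */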
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset_in)
{
    uint32_t offset_val;
    v16i8 zero = __msa_ldi_b(0);
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(weight_src);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
               src2_r, src3_r);
    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
               src2_l, src3_l);
    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
               src6_r, src7_r);
    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
               src6_l, src7_l);
    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
         tmp3);
    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
         tmp7);
    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
         tmp11);
    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
         tmp14, tmp15);
    ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
                tmp1, tmp2, tmp3);
    ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
                tmp5, tmp6, tmp7);
    ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
                tmp9, tmp10, tmp11);
    ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
                tmp12, tmp13, tmp14, tmp15);
    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
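
    /* For height == 16, a second pass applies the same weighting to the
     * lower eight rows. */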
    if (16 == height) {
        src += 8 * stride;

        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
                   src1_r, src2_r, src3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
                   src1_l, src2_l, src3_l);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
                   src5_r, src6_r, src7_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
                   src5_l, src6_l, src7_l);
        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
             tmp2, tmp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
             tmp6, tmp7);
        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
             tmp10, tmp11);
        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
             tmp14, tmp15);
        ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
                    tmp4, tmp5, tmp6, tmp7);
        ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
                    tmp8, tmp9, tmp10, tmp11);
        ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
                    tmp12, tmp13, tmp14, tmp15);
        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
    }
}

void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
    }
}

void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
    }
}
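
/* Bidirectional weighted prediction wrappers. Note the argument order:
 * h264dsp passes the destination first, while the avc_biwgt_* helpers take
 * the source (reference) block first and the weights as (src, dst). */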
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
                          weight_src, weight_dst, offset);
}

void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
                         weight_src, weight_dst, offset);
}

void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
                         weight_src, weight_dst, offset);
}