/*
 * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp3dsp_mips.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "libavutil/intreadwrite.h"
#include "libavcodec/rnd_avg.h"
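
/*
 * Inverse VP3/Theora DCT of one 8x8 block of coefficients.
 * type == 1 is the "put" path (dst is overwritten; the cnst2048w/cnst128w
 * terms fold in the +128 pixel bias), type == 2 is the "add" path (the
 * result is added to the existing dst pixels); see the
 * ff_vp3_idct_put_msa() and ff_vp3_idct_add_msa() wrappers below.
 */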
static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
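    /* VP3/Theora IDCT constants: cos(k * pi / 16) in 16.16 fixed point,
     * e.g. 64277 = cos(pi/16) * 65536 and 46341 = cos(4*pi/16) * 65536. */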
    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
    v4i32 cnst8w = {8, 8, 8, 8};
    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};

    /* Sign-extend the 16-bit input coefficients to 32 bits */
    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
    sign = __msa_clti_s_h(r0, 0);
    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
    sign = __msa_clti_s_h(r1, 0);
    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
    sign = __msa_clti_s_h(r2, 0);
    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
    sign = __msa_clti_s_h(r3, 0);
    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
    sign = __msa_clti_s_h(r4, 0);
    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
    sign = __msa_clti_s_h(r5, 0);
    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
    sign = __msa_clti_s_h(r6, 0);
    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
    sign = __msa_clti_s_h(r7, 0);
    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
    r7_l = (v4i32) __msa_ilvl_h(sign, r7);
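
    /* First 1-D pass: the butterflies combine corresponding elements of
     * different rows, i.e. the pass runs down the columns, with the _r
     * registers covering columns 0-3 and the _l registers columns 4-7. */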
    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    E = ((r0_r + r4_r) * cnst46341w) >> 16;
    F = ((r0_r - r4_r) * cnst46341w) >> 16;
    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);

    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    E = ((r0_l + r4_l) * cnst46341w) >> 16;
    F = ((r0_l - r4_l) * cnst46341w) >> 16;
    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
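
    /* Second 1-D pass, rows 0-3: transpose the 4x4 sub-blocks so the same
     * butterflies now run along the rows. */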
    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
                       r0_r, r1_r, r2_r, r3_r);
    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
                       r0_l, r1_l, r2_l, r3_l);
    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    E = ((r0_r + r0_l) * cnst46341w) >> 16;
    F = ((r0_r - r0_l) * cnst46341w) >> 16;
    if (type == 1) { // HACK
    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);

    LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
    ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
    ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,

    A = CLIP_SW_0_255(A);
    B = CLIP_SW_0_255(B);
    C = CLIP_SW_0_255(C);
    D = CLIP_SW_0_255(D);
    E = CLIP_SW_0_255(E);
    F = CLIP_SW_0_255(F);
    G = CLIP_SW_0_255(G);
    H = CLIP_SW_0_255(H);
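
    /* Zero-row shortcut: sign_t flags lanes where all of the OR-ed registers
     * are zero, so that the DC-only values computed below (Add, and
     * Bdd = Add + 128 for the "put" bias) can be selected for those lanes
     * instead of the full per-lane result. */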
    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
    Bdd = Add + cnst128w;
    Bdd = CLIP_SW_0_255(Bdd);

    Ad = CLIP_SW_0_255(Ad);
    Bd = CLIP_SW_0_255(Bd);
    Cd = CLIP_SW_0_255(Cd);
    Dd = CLIP_SW_0_255(Dd);
    Ed = CLIP_SW_0_255(Ed);
    Fd = CLIP_SW_0_255(Fd);
    Gd = CLIP_SW_0_255(Gd);
    Hd = CLIP_SW_0_255(Hd);
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
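
    /* Rows 4-7: repeat the second pass and the zero-row selection above on
     * the r4_* ... r7_* registers. */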
    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
                       r4_r, r5_r, r6_r, r7_r);
    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
                       r4_l, r5_l, r6_l, r7_l);
    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    E = ((r4_r + r4_l) * cnst46341w) >> 16;
    F = ((r4_r - r4_l) * cnst46341w) >> 16;
    if (type == 1) { // HACK
    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);

    ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
    ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,

    A = CLIP_SW_0_255(A);
    B = CLIP_SW_0_255(B);
    C = CLIP_SW_0_255(C);
    D = CLIP_SW_0_255(D);
    E = CLIP_SW_0_255(E);
    F = CLIP_SW_0_255(F);
    G = CLIP_SW_0_255(G);
    H = CLIP_SW_0_255(H);
    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
    Bdd = Add + cnst128w;
    Bdd = CLIP_SW_0_255(Bdd);

    Ad = CLIP_SW_0_255(Ad);
    Bd = CLIP_SW_0_255(Bd);
    Cd = CLIP_SW_0_255(Cd);
    Dd = CLIP_SW_0_255(Dd);
    Ed = CLIP_SW_0_255(Ed);
    Fd = CLIP_SW_0_255(Fd);
    Gd = CLIP_SW_0_255(Gd);
    Hd = CLIP_SW_0_255(Hd);
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
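
    /* Pack the clipped 32-bit results back to bytes (the shuffle mask picks
     * byte 0 of every word lane from a pair of vectors) and store the block
     * row by row. */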
    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dst */
    ST8x1_UB(d1, dst + nstride);
    ST8x1_UB(d2, dst + nstride);
    ST8x1_UB(d3, dst + nstride);
    ST8x1_UB(d4, dst + nstride);
    ST8x1_UB(d5, dst + nstride);
    ST8x1_UB(d6, dst + nstride);
    ST8x1_UB(d7, dst + nstride);
}

void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 1);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 2);
    memset(block, 0, sizeof(*block) * 64);
}
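
/*
 * Expand the DC coefficient, (block[0] + 15) >> 5, and add it to every pixel
 * of the 8x8 destination block, clamping the results to [0, 255].
 */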
void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    int i = (block[0] + 15) >> 5;
    v4i32 dc = {i, i, i, i};
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int nstride = line_size;

    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,

    e0 = CLIP_SW_0_255(e0);
    e1 = CLIP_SW_0_255(e1);
    e2 = CLIP_SW_0_255(e2);
    e3 = CLIP_SW_0_255(e3);
    e4 = CLIP_SW_0_255(e4);
    e5 = CLIP_SW_0_255(e5);
    e6 = CLIP_SW_0_255(e6);
    e7 = CLIP_SW_0_255(e7);

    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,

    r0 = CLIP_SW_0_255(r0);
    r1 = CLIP_SW_0_255(r1);
    r2 = CLIP_SW_0_255(r2);
    r3 = CLIP_SW_0_255(r3);
    r4 = CLIP_SW_0_255(r4);
    r5 = CLIP_SW_0_255(r5);
    r6 = CLIP_SW_0_255(r6);
    r7 = CLIP_SW_0_255(r7);
    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dst */
    ST8x1_UB(d1, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d2, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d3, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d4, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d5, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d6, dest + nstride);
    nstride += line_size;
    ST8x1_UB(d7, dest + nstride);
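
/*
 * VP3 loop filter across a horizontal block edge: two rows on either side of
 * first_pixel are loaded, a filter value derived from
 * (p[-2] - p[1]) + 3 * (p[0] - p[-1]) is looked up in bounding_values, and
 * the rows just above and below the edge are adjusted by it.
 */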
void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    int nstride = -stride;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 d0, d1, d2, d3;
    v8i16 c0, c1, c2, c3;
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};

    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;

    /* Look up the filter value in bounding_values, one lane at a time */
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 = CLIP_SW_0_255(f0);
    f1 = CLIP_SW_0_255(f1);
    g0 = CLIP_SW_0_255(g0);
    g1 = CLIP_SW_0_255(g1);
    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);

    /* Final move to first_pixel */
    ST8x1_UB(d1, first_pixel + nstride);
    ST8x1_UB(d2, first_pixel);
}
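
/*
 * Horizontal counterpart for a vertical block edge: 4-pixel-wide strips
 * around the edge are loaded for eight rows, transposed so the same filter
 * as above can be applied, and the two adjusted pixels of each row are
 * stored back.
 */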
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, f0, f1, g0, g1;
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};

    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
                       c0, c1, c2, c3, c4, c5, c6, c7);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;

    /* Look up the filter value in bounding_values, one lane at a time */
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 = CLIP_SW_0_255(f0);
    f1 = CLIP_SW_0_255(f1);
    g0 = CLIP_SW_0_255(g0);
    g1 = CLIP_SW_0_255(g1);
    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);

    /* Final move to first_pixel */
    ST2x4_UB(d1, 0, first_pixel - 1, stride);
    ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
}
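
/*
 * Average two 8 x h blocks into dst without rounding (ties round down).
 * The vector path below handles eight rows at a time using the identity
 * no_rnd_avg(a, b) = (a & b) + (((a ^ b) & 0xfe) >> 1); a scalar
 * no_rnd_avg32() loop serves as the general fallback.
 */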
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v16i8 c0, c1, c2, c3;
    v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
    v4u32 t0, t1, t2, t3;
    v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
    int32_t value = 0xfefefefe;
    v4i32 fmask = {value, value, value, value};
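
    /* Per-byte average without rounding: (a & b) + (((a ^ b) & 0xfe) >> 1).
     * fmask clears the low bit of every byte before the 32-bit shift so no
     * bit leaks across byte boundaries. */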
    LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
    VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
    a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
    a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
    a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
    a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

    LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
    VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
    b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
    b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
    b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
    b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

    e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
    e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
    t0 = ((v4u32)e0) >> 1;
    e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);

    e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
    e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
    t1 = ((v4u32)e1) >> 1;
    e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);

    f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
    f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
    t2 = ((v4u32)f0) >> 1;
    f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);

    f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
    f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
    t3 = ((v4u32)f1) >> 1;
    f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);

    ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
    ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
    ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
    ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
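
    /* Scalar fallback: two no-rounding 4-byte averages per 8-pixel row. */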
    for (i = 0; i < h; i++) {
        a = AV_RN32(&src1[i * stride]);
        b = AV_RN32(&src2[i * stride]);
        AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
        a = AV_RN32(&src1[i * stride + 4]);
        b = AV_RN32(&src2[i * stride + 4]);
        AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));