/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
/*
 * Read one v16u8 / v16i8 vector through psrc.
 * Written as a single parenthesized expression so no helper local is needed
 * and the source is only accessed through a const-qualified pointer.
 */
#define LOAD_UB(psrc) (*((const v16u8 *) (psrc)))
#define LOAD_SB(psrc) (*((const v16i8 *) (psrc)))
/*
 * Read one v8u16 / v8i16 / v4i32 vector through psrc.
 * Each expansion is fully parenthesized so it can be combined with postfix
 * or binary operators at the call site without changing its meaning.
 */
#define LOAD_UH(psrc) (*((const v8u16 *) (psrc)))
#define LOAD_SH(psrc) (*((const v8i16 *) (psrc)))
#define LOAD_SW(psrc) (*((const v4i32 *) (psrc)))
/* Write the byte vector vec through pdest (as v16u8 / v16i8). */
#define STORE_UB(vec, pdest) (*((v16u8 *) (pdest)) = (vec))
#define STORE_SB(vec, pdest) (*((v16i8 *) (pdest)) = (vec))
/* Write the v8i16 vector vec through pdest. Expands to a brace block. */
#define STORE_SH(vec, pdest) \
{ \
    *((v8i16 *) (pdest)) = (vec); \
}

/* Write the v4i32 vector vec through pdest. Expands to a brace block. */
#define STORE_SW(vec, pdest) \
{ \
    *((v4i32 *) (pdest)) = (vec); \
}
/*
 * Scalar memory helpers implemented with inline asm.
 *  - __mips_isa_rev >= 6: lw / ld / sw / sd / sh are emitted.
 *  - otherwise:           ulw / uld / usw / ush are emitted.
 * LOAD_WORD / LOAD_DWORD evaluate to the uint32_t / uint64_t that was read;
 * STORE_WORD / STORE_DWORD / STORE_HWORD are brace-block statements taking
 * (pdst, val). Every argument is evaluated exactly once.
 * When LOAD_DWORD is assembled from two 32-bit loads, the word at psrc
 * supplies bits 0..31 and the word at psrc + 4 supplies bits 32..63.
 *
 * NOTE(review): every "m" operand is a single uint8_t lvalue although the
 * instruction touches 2, 4 or 8 bytes - confirm that is sufficient for the
 * compiler's dependency tracking.
 * NOTE(review): the inner "#if (__mips == 64)" selection between the two
 * LOAD_DWORD variants is assumed - confirm against the build configuration.
 * NOTE(review): the release-6 STORE_DWORD emits "sd" unconditionally -
 * confirm it is only used on 64-bit targets.
 */
#if (__mips_isa_rev >= 6)
    #define LOAD_WORD(psrc) \
    ( { \
        const uint8_t *src_m = (const uint8_t *) (psrc); \
        uint32_t val_m; \
        \
        __asm__ volatile ( \
            "lw  %[val_m],  %[src_m]  \n\t" \
            \
            : [val_m] "=r" (val_m) \
            : [src_m] "m" (*src_m) \
        ); \
        \
        val_m; \
    } )

    #if (__mips == 64)
        #define LOAD_DWORD(psrc) \
        ( { \
            const uint8_t *src_m = (const uint8_t *) (psrc); \
            uint64_t val_m = 0; \
            \
            __asm__ volatile ( \
                "ld  %[val_m],  %[src_m]  \n\t" \
                \
                : [val_m] "=r" (val_m) \
                : [src_m] "m" (*src_m) \
            ); \
            \
            val_m; \
        } )
    #else
        #define LOAD_DWORD(psrc) \
        ( { \
            const uint8_t *src1_m = (const uint8_t *) (psrc); \
            const uint8_t *src2_m = src1_m + 4; \
            uint32_t val0_m, val1_m; \
            uint64_t genval_m; \
            \
            __asm__ volatile ( \
                "lw  %[val0_m],  %[src1_m]  \n\t" \
                \
                : [val0_m] "=r" (val0_m) \
                : [src1_m] "m" (*src1_m) \
            ); \
            \
            __asm__ volatile ( \
                "lw  %[val1_m],  %[src2_m]  \n\t" \
                \
                : [val1_m] "=r" (val1_m) \
                : [src2_m] "m" (*src2_m) \
            ); \
            \
            /* second word -> high half, first word -> low half */ \
            genval_m = ((uint64_t) val1_m << 32) | (uint64_t) val0_m; \
            \
            genval_m; \
        } )
    #endif

    #define STORE_WORD(pdst, val) \
    { \
        uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
        uint32_t val_m = (val); \
        \
        __asm__ volatile ( \
            "sw  %[val_m],  %[dst_ptr_m]  \n\t" \
            \
            : [dst_ptr_m] "=m" (*dst_ptr_m) \
            : [val_m] "r" (val_m) \
        ); \
    }

    #define STORE_DWORD(pdst, val) \
    { \
        uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
        uint64_t val_m = (val); \
        \
        __asm__ volatile ( \
            "sd  %[val_m],  %[dst_ptr_m]  \n\t" \
            \
            : [dst_ptr_m] "=m" (*dst_ptr_m) \
            : [val_m] "r" (val_m) \
        ); \
    }

    #define STORE_HWORD(pdst, val) \
    { \
        uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
        uint16_t val_m = (val); \
        \
        __asm__ volatile ( \
            "sh  %[val_m],  %[dst_ptr_m]  \n\t" \
            \
            : [dst_ptr_m] "=m" (*dst_ptr_m) \
            : [val_m] "r" (val_m) \
        ); \
    }
#else
    #define LOAD_WORD(psrc) \
    ( { \
        const uint8_t *src_m = (const uint8_t *) (psrc); \
        uint32_t val_m; \
        \
        __asm__ volatile ( \
            "ulw  %[val_m],  %[src_m]  \n\t" \
            \
            : [val_m] "=r" (val_m) \
            : [src_m] "m" (*src_m) \
        ); \
        \
        val_m; \
    } )

    #if (__mips == 64)
        #define LOAD_DWORD(psrc) \
        ( { \
            const uint8_t *src_m = (const uint8_t *) (psrc); \
            uint64_t val_m = 0; \
            \
            __asm__ volatile ( \
                "uld  %[val_m],  %[src_m]  \n\t" \
                \
                : [val_m] "=r" (val_m) \
                : [src_m] "m" (*src_m) \
            ); \
            \
            val_m; \
        } )
    #else
        #define LOAD_DWORD(psrc) \
        ( { \
            const uint8_t *src1_m = (const uint8_t *) (psrc); \
            const uint8_t *src2_m = src1_m + 4; \
            uint32_t val0_m, val1_m; \
            uint64_t genval_m; \
            \
            __asm__ volatile ( \
                "ulw  %[val0_m],  %[src1_m]  \n\t" \
                \
                : [val0_m] "=r" (val0_m) \
                : [src1_m] "m" (*src1_m) \
            ); \
            \
            __asm__ volatile ( \
                "ulw  %[val1_m],  %[src2_m]  \n\t" \
                \
                : [val1_m] "=r" (val1_m) \
                : [src2_m] "m" (*src2_m) \
            ); \
            \
            /* second word -> high half, first word -> low half */ \
            genval_m = ((uint64_t) val1_m << 32) | (uint64_t) val0_m; \
            \
            genval_m; \
        } )
    #endif

    #define STORE_WORD(pdst, val) \
    { \
        uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
        uint32_t val_m = (val); \
        \
        __asm__ volatile ( \
            "usw  %[val_m],  %[dst_ptr_m]  \n\t" \
            \
            : [dst_ptr_m] "=m" (*dst_ptr_m) \
            : [val_m] "r" (val_m) \
        ); \
    }

    #define STORE_DWORD(pdst, val) \
    { \
        uint8_t *dst1_m = (uint8_t *) (pdst); \
        uint8_t *dst2_m = dst1_m + 4; \
        /* widen once so the shift below is always on a 64-bit value */ \
        uint64_t val_m = (val); \
        uint32_t val0_m = (uint32_t) (val_m & 0xFFFFFFFF); \
        uint32_t val1_m = (uint32_t) (val_m >> 32); \
        \
        __asm__ volatile ( \
            "usw  %[val0_m],  %[dst1_m]  \n\t" \
            "usw  %[val1_m],  %[dst2_m]  \n\t" \
            \
            : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
            : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
        ); \
    }

    #define STORE_HWORD(pdst, val) \
    { \
        uint8_t *dst_ptr_m = (uint8_t *) (pdst); \
        uint16_t val_m = (val); \
        \
        __asm__ volatile ( \
            "ush  %[val_m],  %[dst_ptr_m]  \n\t" \
            \
            : [dst_ptr_m] "=m" (*dst_ptr_m) \
            : [val_m] "r" (val_m) \
        ); \
    }
#endif
/*
 * Read four words with LOAD_WORD from psrc, psrc + src_stride,
 * psrc + 2 * src_stride and psrc + 3 * src_stride into src0..src3.
 * psrc and src_stride are parenthesized so expression arguments keep
 * their intended precedence.
 */
#define LOAD_4WORDS_WITH_STRIDE(psrc, src_stride, src0, src1, src2, src3) \
{ \
    src0 = LOAD_WORD((psrc) + 0 * (src_stride)); \
    src1 = LOAD_WORD((psrc) + 1 * (src_stride)); \
    src2 = LOAD_WORD((psrc) + 2 * (src_stride)); \
    src3 = LOAD_WORD((psrc) + 3 * (src_stride)); \
}
/* Load two byte vectors: val0 from psrc, val1 from psrc + stride. */
#define LOAD_2VECS_UB(psrc, stride, val0, val1) \
{ \
    val0 = LOAD_UB((psrc) + 0 * (stride)); \
    val1 = LOAD_UB((psrc) + 1 * (stride)); \
}

/* Signed-byte counterpart of LOAD_2VECS_UB (uses LOAD_SB). */
#define LOAD_2VECS_SB(psrc, stride, val0, val1) \
{ \
    val0 = LOAD_SB((psrc) + 0 * (stride)); \
    val1 = LOAD_SB((psrc) + 1 * (stride)); \
}
/* Load three byte vectors from psrc + k * stride, k = 0..2. */
#define LOAD_3VECS_UB(psrc, stride, val0, val1, val2) \
{ \
    val0 = LOAD_UB((psrc) + 0 * (stride)); \
    val1 = LOAD_UB((psrc) + 1 * (stride)); \
    val2 = LOAD_UB((psrc) + 2 * (stride)); \
}

/* Signed-byte counterpart of LOAD_3VECS_UB (uses LOAD_SB). */
#define LOAD_3VECS_SB(psrc, stride, val0, val1, val2) \
{ \
    val0 = LOAD_SB((psrc) + 0 * (stride)); \
    val1 = LOAD_SB((psrc) + 1 * (stride)); \
    val2 = LOAD_SB((psrc) + 2 * (stride)); \
}
/* Load four byte vectors from psrc + k * stride, k = 0..3. */
#define LOAD_4VECS_UB(psrc, stride, val0, val1, val2, val3) \
{ \
    val0 = LOAD_UB((psrc) + 0 * (stride)); \
    val1 = LOAD_UB((psrc) + 1 * (stride)); \
    val2 = LOAD_UB((psrc) + 2 * (stride)); \
    val3 = LOAD_UB((psrc) + 3 * (stride)); \
}

/* Signed-byte counterpart of LOAD_4VECS_UB (uses LOAD_SB). */
#define LOAD_4VECS_SB(psrc, stride, val0, val1, val2, val3) \
{ \
    val0 = LOAD_SB((psrc) + 0 * (stride)); \
    val1 = LOAD_SB((psrc) + 1 * (stride)); \
    val2 = LOAD_SB((psrc) + 2 * (stride)); \
    val3 = LOAD_SB((psrc) + 3 * (stride)); \
}
/* Load five byte vectors: rows 0..3 via LOAD_4VECS_UB, row 4 via LOAD_UB. */
#define LOAD_5VECS_UB(psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LOAD_4VECS_UB((psrc), (stride), \
                  (out0), (out1), (out2), (out3)); \
    out4 = LOAD_UB((psrc) + 4 * (stride)); \
}

/* Signed-byte counterpart of LOAD_5VECS_UB. */
#define LOAD_5VECS_SB(psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LOAD_4VECS_SB((psrc), (stride), \
                  (out0), (out1), (out2), (out3)); \
    out4 = LOAD_SB((psrc) + 4 * (stride)); \
}
/* Load six signed-byte vectors: rows 0..3, then rows 4..5. */
#define LOAD_6VECS_SB(psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
    LOAD_4VECS_SB((psrc), (stride), \
                  (out0), (out1), (out2), (out3)); \
    LOAD_2VECS_SB(((psrc) + 4 * (stride)), (stride), \
                  (out4), (out5)); \
}
/* Load seven byte vectors from psrc + k * stride, k = 0..6. */
#define LOAD_7VECS_UB(psrc, stride, \
                      val0, val1, val2, val3, \
                      val4, val5, val6) \
{ \
    val0 = LOAD_UB((psrc) + 0 * (stride)); \
    val1 = LOAD_UB((psrc) + 1 * (stride)); \
    val2 = LOAD_UB((psrc) + 2 * (stride)); \
    val3 = LOAD_UB((psrc) + 3 * (stride)); \
    val4 = LOAD_UB((psrc) + 4 * (stride)); \
    val5 = LOAD_UB((psrc) + 5 * (stride)); \
    val6 = LOAD_UB((psrc) + 6 * (stride)); \
}

/* Signed-byte counterpart of LOAD_7VECS_UB (uses LOAD_SB). */
#define LOAD_7VECS_SB(psrc, stride, \
                      val0, val1, val2, val3, \
                      val4, val5, val6) \
{ \
    val0 = LOAD_SB((psrc) + 0 * (stride)); \
    val1 = LOAD_SB((psrc) + 1 * (stride)); \
    val2 = LOAD_SB((psrc) + 2 * (stride)); \
    val3 = LOAD_SB((psrc) + 3 * (stride)); \
    val4 = LOAD_SB((psrc) + 4 * (stride)); \
    val5 = LOAD_SB((psrc) + 5 * (stride)); \
    val6 = LOAD_SB((psrc) + 6 * (stride)); \
}
/* Load eight byte vectors as two groups of four (rows 0..3, rows 4..7). */
#define LOAD_8VECS_UB(psrc, stride, \
                      out0, out1, out2, out3, \
                      out4, out5, out6, out7) \
{ \
    LOAD_4VECS_UB((psrc), (stride), \
                  (out0), (out1), (out2), (out3)); \
    LOAD_4VECS_UB(((psrc) + 4 * (stride)), (stride), \
                  (out4), (out5), (out6), (out7)); \
}

/* Signed-byte counterpart of LOAD_8VECS_UB. */
#define LOAD_8VECS_SB(psrc, stride, \
                      out0, out1, out2, out3, \
                      out4, out5, out6, out7) \
{ \
    LOAD_4VECS_SB((psrc), (stride), \
                  (out0), (out1), (out2), (out3)); \
    LOAD_4VECS_SB(((psrc) + 4 * (stride)), (stride), \
                  (out4), (out5), (out6), (out7)); \
}
/* Load two halfword vectors: val0 from psrc, val1 from psrc + stride. */
#define LOAD_2VECS_UH(psrc, stride, val0, val1) \
{ \
    val0 = LOAD_UH((psrc) + 0 * (stride)); \
    val1 = LOAD_UH((psrc) + 1 * (stride)); \
}

/* Signed-halfword counterpart of LOAD_2VECS_UH (uses LOAD_SH). */
#define LOAD_2VECS_SH(psrc, stride, val0, val1) \
{ \
    val0 = LOAD_SH((psrc) + 0 * (stride)); \
    val1 = LOAD_SH((psrc) + 1 * (stride)); \
}
/* Load four halfword vectors as two pairs (rows 0..1, rows 2..3). */
#define LOAD_4VECS_UH(psrc, stride, val0, val1, val2, val3) \
{ \
    LOAD_2VECS_UH((psrc), (stride), val0, val1); \
    LOAD_2VECS_UH(((psrc) + 2 * (stride)), (stride), val2, val3); \
}

/* Signed-halfword counterpart of LOAD_4VECS_UH. */
#define LOAD_4VECS_SH(psrc, stride, val0, val1, val2, val3) \
{ \
    LOAD_2VECS_SH((psrc), (stride), val0, val1); \
    LOAD_2VECS_SH(((psrc) + 2 * (stride)), (stride), val2, val3); \
}
/* Load six signed-halfword vectors as three pairs. */
#define LOAD_6VECS_SH(psrc, stride, val0, val1, val2, val3, val4, val5) \
{ \
    LOAD_2VECS_SH((psrc), (stride), val0, val1); \
    LOAD_2VECS_SH(((psrc) + 2 * (stride)), (stride), val2, val3); \
    LOAD_2VECS_SH(((psrc) + 4 * (stride)), (stride), val4, val5); \
}
/* Load eight halfword vectors as two groups of four. */
#define LOAD_8VECS_UH(psrc, stride, \
                      val0, val1, val2, val3, \
                      val4, val5, val6, val7) \
{ \
    LOAD_4VECS_UH((psrc), (stride), \
                  val0, val1, val2, val3); \
    LOAD_4VECS_UH(((psrc) + 4 * (stride)), (stride), \
                  val4, val5, val6, val7); \
}

/* Signed-halfword counterpart of LOAD_8VECS_UH. */
#define LOAD_8VECS_SH(psrc, stride, \
                      val0, val1, val2, val3, \
                      val4, val5, val6, val7) \
{ \
    LOAD_4VECS_SH((psrc), (stride), \
                  val0, val1, val2, val3); \
    LOAD_4VECS_SH(((psrc) + 4 * (stride)), (stride), \
                  val4, val5, val6, val7); \
}
/* Load sixteen signed-halfword vectors as two groups of eight. */
#define LOAD_16VECS_SH(psrc, stride, \
                       val0, val1, val2, val3, \
                       val4, val5, val6, val7, \
                       val8, val9, val10, val11, \
                       val12, val13, val14, val15) \
{ \
    LOAD_8VECS_SH((psrc), (stride), \
                  val0, val1, val2, val3, \
                  val4, val5, val6, val7); \
    LOAD_8VECS_SH(((psrc) + 8 * (stride)), (stride), \
                  val8, val9, val10, val11, \
                  val12, val13, val14, val15); \
}
/* Store in0..in3 with STORE_UB at dst_out + k * pitch, k = 0..3. */
#define STORE_4VECS_UB(dst_out, pitch, in0, in1, in2, in3) \
{ \
    STORE_UB((in0), (dst_out)); \
    STORE_UB((in1), ((dst_out) + (pitch))); \
    STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
    STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
}

/* Signed-byte counterpart of STORE_4VECS_UB (uses STORE_SB). */
#define STORE_4VECS_SB(dst_out, pitch, in0, in1, in2, in3) \
{ \
    STORE_SB((in0), (dst_out)); \
    STORE_SB((in1), ((dst_out) + (pitch))); \
    STORE_SB((in2), ((dst_out) + 2 * (pitch))); \
    STORE_SB((in3), ((dst_out) + 3 * (pitch))); \
}
/* Store eight byte vectors as two groups of four rows. */
#define STORE_8VECS_UB(dst_out, pitch_in, \
                       in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    STORE_4VECS_UB((dst_out), (pitch_in), \
                   in0, in1, in2, in3); \
    STORE_4VECS_UB(((dst_out) + 4 * (pitch_in)), (pitch_in), \
                   in4, in5, in6, in7); \
}
/*
 * STORE_nVECS_SH: store n halfword vectors with STORE_SH at
 * ptr + k * stride, k = 0..n-1. stride is parenthesized so that an
 * expression argument multiplies as a whole.
 */
#define STORE_2VECS_SH(ptr, stride, in0, in1) \
{ \
    STORE_SH(in0, ((ptr) + 0 * (stride))); \
    STORE_SH(in1, ((ptr) + 1 * (stride))); \
}

#define STORE_4VECS_SH(ptr, stride, in0, in1, in2, in3) \
{ \
    STORE_SH(in0, ((ptr) + 0 * (stride))); \
    STORE_SH(in1, ((ptr) + 1 * (stride))); \
    STORE_SH(in2, ((ptr) + 2 * (stride))); \
    STORE_SH(in3, ((ptr) + 3 * (stride))); \
}

#define STORE_6VECS_SH(ptr, stride, \
                       in0, in1, in2, in3, \
                       in4, in5) \
{ \
    STORE_SH(in0, ((ptr) + 0 * (stride))); \
    STORE_SH(in1, ((ptr) + 1 * (stride))); \
    STORE_SH(in2, ((ptr) + 2 * (stride))); \
    STORE_SH(in3, ((ptr) + 3 * (stride))); \
    STORE_SH(in4, ((ptr) + 4 * (stride))); \
    STORE_SH(in5, ((ptr) + 5 * (stride))); \
}

#define STORE_8VECS_SH(ptr, stride, \
                       in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    STORE_SH(in0, ((ptr) + 0 * (stride))); \
    STORE_SH(in1, ((ptr) + 1 * (stride))); \
    STORE_SH(in2, ((ptr) + 2 * (stride))); \
    STORE_SH(in3, ((ptr) + 3 * (stride))); \
    STORE_SH(in4, ((ptr) + 4 * (stride))); \
    STORE_SH(in5, ((ptr) + 5 * (stride))); \
    STORE_SH(in6, ((ptr) + 6 * (stride))); \
    STORE_SH(in7, ((ptr) + 7 * (stride))); \
}
/*
 * Evaluates to a v8i16: __msa_max_s_h(min, in) followed by
 * __msa_min_s_h(max, result), i.e. signed halfword lanes of in limited
 * by the min and max vectors.
 */
#define CLIP_MIN_TO_MAX_H(in, min, max) \
( { \
    v8i16 out_m; \
    \
    out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in)); \
    out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m); \
    out_m; \
} )
/*
 * Evaluates to a v8i16: signed halfword lanes of in after
 * __msa_maxi_s_h(in, 0) and __msa_min_s_h against a vector of 255.
 */
#define CLIP_UNSIGNED_CHAR_H(in) \
( { \
    v8i16 max_m = __msa_ldi_h(255); \
    v8i16 out_m; \
    \
    out_m = __msa_maxi_s_h((v8i16) (in), 0); \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
    out_m; \
} )

/* Word-lane counterpart of CLIP_UNSIGNED_CHAR_H; evaluates to a v4i32. */
#define CLIP_UNSIGNED_CHAR_W(in) \
( { \
    v4i32 max_m = __msa_ldi_w(255); \
    v4i32 out_m; \
    \
    out_m = __msa_maxi_s_w((v4i32) (in), 0); \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
    out_m; \
} )
/*
 * Byte transpose of four input rows into out0..out3 (v16u8).
 * in0/in1 and in2/in3 are paired with ilvr_d, byte-interleaved twice, and
 * out1..out3 are each derived from the previous output with a 4-byte
 * __msa_sldi_b against a zero vector.
 */
#define TRANSPOSE4x4_B_UB(in0, in1, in2, in3, \
                          out0, out1, out2, out3) \
{ \
    v16i8 zero_m = { 0 }; \
    v16i8 s0_m, s1_m, s2_m, s3_m; \
    \
    s0_m = (v16i8) __msa_ilvr_d((v2i64) (in1), (v2i64) (in0)); \
    s1_m = (v16i8) __msa_ilvr_d((v2i64) (in3), (v2i64) (in2)); \
    s2_m = __msa_ilvr_b(s1_m, s0_m); \
    s3_m = __msa_ilvl_b(s1_m, s0_m); \
    \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
}
/*
 * Byte transpose taking eight input rows and producing four output
 * vectors (v16u8). Rows k and k + 4 are first paired with ilvev_w, then
 * interleaved at byte, halfword and word granularity.
 */
#define TRANSPOSE8x4_B_UB(in0, in1, in2, in3, \
                          in4, in5, in6, in7, \
                          out0, out1, out2, out3) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \
    tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \
    tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \
    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
    tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
    \
    out0 = (v16u8) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
    out2 = (v16u8) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
    out1 = (v16u8) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
    out3 = (v16u8) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}

/* Same operation sequence as TRANSPOSE8x4_B_UB; outputs are cast to v8u16. */
#define TRANSPOSE8x4_B_UH(in0, in1, in2, in3, \
                          in4, in5, in6, in7, \
                          out0, out1, out2, out3) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \
    tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \
    tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \
    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
    tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \
    \
    out0 = (v8u16) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
    out2 = (v8u16) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \
    out1 = (v8u16) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
    out3 = (v8u16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}
/*
 * Byte transpose of eight input rows into eight output vectors (v16u8).
 * Even outputs come from word interleaves of the byte-interleaved rows;
 * each odd output is the preceding even output passed through an 8-byte
 * __msa_sldi_b against a zero vector.
 */
#define TRANSPOSE8x8_B_UB(in0, in1, in2, in3, \
                          in4, in5, in6, in7, \
                          out0, out1, out2, out3, \
                          out4, out5, out6, out7) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    v16i8 zero_m = { 0 }; \
    \
    tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \
    tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \
    tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \
    tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \
    \
    tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
    tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
    tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    \
    out0 = (v16u8) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
    out2 = (v16u8) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
    out4 = (v16u8) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
    out6 = (v16u8) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
    \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 8); \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 8); \
    out5 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out4, 8); \
    out7 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out6, 8); \
}

/* Same operation sequence as TRANSPOSE8x8_B_UB; outputs are cast to v8u16. */
#define TRANSPOSE8x8_B_UH(in0, in1, in2, in3, \
                          in4, in5, in6, in7, \
                          out0, out1, out2, out3, \
                          out4, out5, out6, out7) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    v16i8 zero_m = { 0 }; \
    \
    tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \
    tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \
    tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \
    tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \
    \
    tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
    tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
    tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    \
    out0 = (v8u16) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
    out2 = (v8u16) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \
    out4 = (v8u16) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
    out6 = (v8u16) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \
    out1 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out0, 8); \
    out3 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out2, 8); \
    out5 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out4, 8); \
    out7 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out6, 8); \
}
/*
 * Byte transpose taking sixteen input rows and producing eight output
 * vectors (v16u8).
 * The out* arguments double as scratch storage: they are written with
 * intermediate values (in reverse order) before receiving their final
 * contents, so they must not alias any in* argument.
 * Each odd-halfword interleave of tmp4..tmp7 is computed once.
 */
#define TRANSPOSE16x8_B_UB(in0, in1, in2, in3, \
                           in4, in5, in6, in7, \
                           in8, in9, in10, in11, \
                           in12, in13, in14, in15, \
                           out0, out1, out2, out3, \
                           out4, out5, out6, out7) \
{ \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    (out7) = (v16u8) __msa_ilvev_d((v2i64) (in8), (v2i64) (in0)); \
    (out6) = (v16u8) __msa_ilvev_d((v2i64) (in9), (v2i64) (in1)); \
    (out5) = (v16u8) __msa_ilvev_d((v2i64) (in10), (v2i64) (in2)); \
    (out4) = (v16u8) __msa_ilvev_d((v2i64) (in11), (v2i64) (in3)); \
    (out3) = (v16u8) __msa_ilvev_d((v2i64) (in12), (v2i64) (in4)); \
    (out2) = (v16u8) __msa_ilvev_d((v2i64) (in13), (v2i64) (in5)); \
    (out1) = (v16u8) __msa_ilvev_d((v2i64) (in14), (v2i64) (in6)); \
    (out0) = (v16u8) __msa_ilvev_d((v2i64) (in15), (v2i64) (in7)); \
    \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) (out6), (v16i8) (out7)); \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) (out6), (v16i8) (out7)); \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) (out4), (v16i8) (out5)); \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) (out4), (v16i8) (out5)); \
    (out5) = (v16u8) __msa_ilvev_b((v16i8) (out2), (v16i8) (out3)); \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) (out2), (v16i8) (out3)); \
    (out7) = (v16u8) __msa_ilvev_b((v16i8) (out0), (v16i8) (out1)); \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) (out0), (v16i8) (out1)); \
    \
    tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    tmp3_m = (v16u8) __msa_ilvev_h((v8i16) (out7), (v8i16) (out5)); \
    (out0) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    (out4) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) (out7), (v8i16) (out5)); \
    (out2) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    (out6) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
    tmp3_m = (v16u8) __msa_ilvev_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
    (out1) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    (out5) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
    (out3) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
    (out7) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
}
/*
 * Halfword transpose of eight v8i16 rows into out0..out7.
 * Rows are halfword-interleaved in two stages (in4..in7 into tmp0..tmp3,
 * in0..in3 into tmp4..tmp7), then the outputs are assembled with
 * pckev_d / pckod_d on matching temporaries.
 */
#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
                          in4, in5, in6, in7, \
                          out0, out1, out2, out3, \
                          out4, out5, out6, out7) \
{ \
    v8i16 s0_m, s1_m; \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    s0_m = __msa_ilvr_h((v8i16) (in6), (v8i16) (in4)); \
    s1_m = __msa_ilvr_h((v8i16) (in7), (v8i16) (in5)); \
    tmp0_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
    tmp1_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
    \
    s0_m = __msa_ilvl_h((v8i16) (in6), (v8i16) (in4)); \
    s1_m = __msa_ilvl_h((v8i16) (in7), (v8i16) (in5)); \
    tmp2_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
    tmp3_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
    \
    s0_m = __msa_ilvr_h((v8i16) (in2), (v8i16) (in0)); \
    s1_m = __msa_ilvr_h((v8i16) (in3), (v8i16) (in1)); \
    tmp4_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
    tmp5_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
    \
    s0_m = __msa_ilvl_h((v8i16) (in2), (v8i16) (in0)); \
    s1_m = __msa_ilvl_h((v8i16) (in3), (v8i16) (in1)); \
    tmp6_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \
    tmp7_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \
    \
    out0 = (v8i16) __msa_pckev_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
    out1 = (v8i16) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
    out2 = (v8i16) __msa_pckev_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
    out3 = (v8i16) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
    out4 = (v8i16) __msa_pckev_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
    out5 = (v8i16) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
    out6 = (v8i16) __msa_pckev_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
    out7 = (v8i16) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
}
/*
 * Word transpose of four v4i32 rows into out0..out3: word interleaves of
 * (in0, in1) and (in2, in3), followed by doubleword interleaves.
 */
#define TRANSPOSE4x4_W(in0, in1, in2, in3, \
                       out0, out1, out2, out3) \
{ \
    v4i32 s0_m, s1_m, s2_m, s3_m; \
    \
    s0_m = __msa_ilvr_w((v4i32) (in1), (v4i32) (in0)); \
    s1_m = __msa_ilvl_w((v4i32) (in1), (v4i32) (in0)); \
    s2_m = __msa_ilvr_w((v4i32) (in3), (v4i32) (in2)); \
    s3_m = __msa_ilvl_w((v4i32) (in3), (v4i32) (in2)); \
    \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
}
/*
 * ILV_x_LRLR_*: for the pairs (in0, in1) and (in2, in3) produce the
 * "left" interleave (ilvl) followed by the "right" interleave (ilvr):
 *   out0 = ilvl(in1, in0), out1 = ilvr(in1, in0),
 *   out2 = ilvl(in3, in2), out3 = ilvr(in3, in2).
 * The _B_ variants interleave bytes, the _H_ variant halfwords; the
 * suffix names the type the results are cast to.
 */
#define ILV_B_LRLR_SB(in0, in1, in2, in3, \
                      out0, out1, out2, out3) \
{ \
    out0 = __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
    out1 = __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
    out2 = __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
    out3 = __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
}

#define ILV_B_LRLR_UH(in0, in1, in2, in3, \
                      out0, out1, out2, out3) \
{ \
    out0 = (v8u16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
    out1 = (v8u16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
    out2 = (v8u16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
    out3 = (v8u16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
}

#define ILV_B_LRLR_SH(in0, in1, in2, in3, \
                      out0, out1, out2, out3) \
{ \
    out0 = (v8i16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \
    out1 = (v8i16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \
    out2 = (v8i16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \
    out3 = (v8i16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \
}

#define ILV_H_LRLR_SW(in0, in1, in2, in3, \
                      out0, out1, out2, out3) \
{ \
    out0 = (v4i32) __msa_ilvl_h((v8i16) (in1), (v8i16) (in0)); \
    out1 = (v4i32) __msa_ilvr_h((v8i16) (in1), (v8i16) (in0)); \
    out2 = (v4i32) __msa_ilvl_h((v8i16) (in3), (v8i16) (in2)); \
    out3 = (v4i32) __msa_ilvr_h((v8i16) (in3), (v8i16) (in2)); \
}
/*
 * ILVR_B_nVECS_UB / _SB: for k = 0..n-1,
 *   outk = __msa_ilvr_b(ink_l, ink_r)
 * with the result typed v16u8 (_UB) or v16i8 (_SB). The 4-, 6- and
 * 8-vector forms are built from the 2-vector form.
 */
#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = (v16u8) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = (v16u8) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
}

#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
}

#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}

#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
                        in3_r, in4_r, in5_r, \
                        in0_l, in1_l, in2_l, \
                        in3_l, in4_l, in5_l, \
                        out0, out1, out2, \
                        out3, out4, out5) \
{ \
    ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
}

#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
                        in4_r, in5_r, in6_r, in7_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        in4_l, in5_l, in6_l, in7_l, \
                        out0, out1, out2, out3, \
                        out4, out5, out6, out7) \
{ \
    ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
    ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
                    out6, out7); \
}
/*
 * ILVR_B_nVECS_UH / _SH: for k = 0..n-1,
 *   outk = __msa_ilvr_b(ink_l, ink_r)
 * with the result cast to v8u16 (_UH) or v8i16 (_SH).
 */
#define ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = (v8u16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = (v8u16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
}

#define ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = (v8i16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = (v8i16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \
}

#define ILVR_B_3VECS_SH(in0_r, in1_r, in2_r, in0_l, in1_l, in2_l, \
                        out0, out1, out2) \
{ \
    ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, out0, out1); \
    out2 = (v8i16) __msa_ilvr_b((v16i8) (in2_l), (v16i8) (in2_r)); \
}

#define ILVR_B_4VECS_UH(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_B_2VECS_UH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}

#define ILVR_B_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_B_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}
/*
 * ILVR_H_nVECS_SH: for k = 0..n-1,
 *   outk = __msa_ilvr_h(ink_l, ink_r)   (v8i16 result).
 * The 4-, 6- and 8-vector forms are built from the 2-vector form.
 */
#define ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = __msa_ilvr_h((v8i16) (in0_l), (v8i16) (in0_r)); \
    out1 = __msa_ilvr_h((v8i16) (in1_l), (v8i16) (in1_r)); \
}

#define ILVR_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}

#define ILVR_H_6VECS_SH(in0_r, in1_r, in2_r, \
                        in3_r, in4_r, in5_r, \
                        in0_l, in1_l, in2_l, \
                        in3_l, in4_l, in5_l, \
                        out0, out1, out2, \
                        out3, out4, out5) \
{ \
    ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
}

#define ILVR_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \
                        in4_r, in5_r, in6_r, in7_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        in4_l, in5_l, in6_l, in7_l, \
                        out0, out1, out2, out3, \
                        out4, out5, out6, out7) \
{ \
    ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
    ILVR_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \
                    out6, out7); \
}
/*
 * ILVL_B_nVECS_SB: for k = 0..n-1,
 *   outk = __msa_ilvl_b(ink_l, ink_r)   (v16i8 result).
 * The 4- and 6-vector forms are built from the 2-vector form.
 */
#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = __msa_ilvl_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = __msa_ilvl_b((v16i8) (in1_l), (v16i8) (in1_r)); \
}

#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}

#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
                        in3_r, in4_r, in5_r, \
                        in0_l, in1_l, in2_l, \
                        in3_l, in4_l, in5_l, \
                        out0, out1, out2, \
                        out3, out4, out5) \
{ \
    ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
}
/*
 * ILVL_H_nVECS_SH: for k = 0..n-1,
 *   outk = __msa_ilvl_h(ink_l, ink_r)   (v8i16 result).
 * The 4-, 6- and 8-vector forms are built from the 2-vector form.
 */
#define ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                        out0, out1) \
{ \
    out0 = __msa_ilvl_h((v8i16) (in0_l), (v8i16) (in0_r)); \
    out1 = __msa_ilvl_h((v8i16) (in1_l), (v8i16) (in1_r)); \
}

#define ILVL_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        out0, out1, out2, out3) \
{ \
    ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
}

#define ILVL_H_6VECS_SH(in0_r, in1_r, in2_r, \
                        in3_r, in4_r, in5_r, \
                        in0_l, in1_l, in2_l, \
                        in3_l, in4_l, in5_l, \
                        out0, out1, out2, \
                        out3, out4, out5) \
{ \
    ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
}

#define ILVL_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \
                        in4_r, in5_r, in6_r, in7_r, \
                        in0_l, in1_l, in2_l, in3_l, \
                        in4_l, in5_l, in6_l, in7_l, \
                        out0, out1, out2, out3, \
                        out4, out5, out6, out7) \
{ \
    ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \
                    out0, out1); \
    ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \
                    out2, out3); \
    ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \
                    out4, out5); \
    ILVL_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \
                    out6, out7); \
}
/*
 * ILVR_D_nVECS_SB: arguments come in (outk, ink_l, ink_r) triples and
 *   outk = (v16i8) __msa_ilvr_d(ink_l, ink_r).
 * Note the output-first argument order.
 */
#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
                        out1, in1_l, in1_r) \
{ \
    out0 = (v16i8) __msa_ilvr_d((v2i64) (in0_l), (v2i64) (in0_r)); \
    out1 = (v16i8) __msa_ilvr_d((v2i64) (in1_l), (v2i64) (in1_r)); \
}

#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
                        out1, in1_l, in1_r, \
                        out2, in2_l, in2_r) \
{ \
    ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
                    out1, in1_l, in1_r); \
    out2 = (v16i8) __msa_ilvr_d((v2i64) (in2_l), (v2i64) (in2_r)); \
}

#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
                        out1, in1_l, in1_r, \
                        out2, in2_l, in2_r, \
                        out3, in3_l, in3_r) \
{ \
    ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
                    out1, in1_l, in1_r); \
    ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
                    out3, in3_l, in3_r); \
}
/*
 * In place: veck = (v8u16) __msa_maxi_s_h(veck, max_value) for k = 0..3.
 * vec0..vec3 are both read and overwritten.
 */
#define MAXI_S_H_4VECS_UH(vec0, vec1, vec2, vec3, max_value) \
{ \
    vec0 = (v8u16) __msa_maxi_s_h((v8i16) (vec0), (max_value)); \
    vec1 = (v8u16) __msa_maxi_s_h((v8i16) (vec1), (max_value)); \
    vec2 = (v8u16) __msa_maxi_s_h((v8i16) (vec2), (max_value)); \
    vec3 = (v8u16) __msa_maxi_s_h((v8i16) (vec3), (max_value)); \
}

/*
 * In place: veck = __msa_sat_u_h(veck, sat_value) for k = 0..3.
 * vec0..vec3 are both read and overwritten.
 */
#define SAT_U_H_4VECS_UH(vec0, vec1, vec2, vec3, sat_value) \
{ \
    vec0 = __msa_sat_u_h((v8u16) (vec0), (sat_value)); \
    vec1 = __msa_sat_u_h((v8u16) (vec1), (sat_value)); \
    vec2 = __msa_sat_u_h((v8u16) (vec2), (sat_value)); \
    vec3 = __msa_sat_u_h((v8u16) (vec3), (sat_value)); \
}
/*
 * For k = 0..3: outk = __msa_pckev_b(ink_l, ink_r), result cast to v16u8.
 * Note that the _l inputs are listed before the _r inputs.
 */
#define PCKEV_B_4VECS_UB(in0_l, in1_l, in2_l, in3_l, \
                         in0_r, in1_r, in2_r, in3_r, \
                         out0, out1, out2, out3) \
{ \
    out0 = (v16u8) __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = (v16u8) __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \
    out2 = (v16u8) __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \
    out3 = (v16u8) __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \
}

/* Same as PCKEV_B_4VECS_UB but the results stay v16i8. */
#define PCKEV_B_4VECS_SB(in0_l, in1_l, in2_l, in3_l, \
                         in0_r, in1_r, in2_r, in3_r, \
                         out0, out1, out2, out3) \
{ \
    out0 = __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \
    out1 = __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \
    out2 = __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \
    out3 = __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \
}
1120 #define XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val) \
1122 out0 = __msa_xori_b((v16u8) (val0), (xor_val)); \
1123 out1 = __msa_xori_b((v16u8) (val1), (xor_val)); \
1126 #define XORI_B_2VECS_SB(val0, val1, \
1127 out0, out1, xor_val) \
1129 out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
1130 out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
1133 #define XORI_B_3VECS_SB(val0, val1, val2, \
1137 XORI_B_2VECS_SB(val0, val1, \
1138 out0, out1, xor_val); \
1139 out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
1142 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \
1143 out0, out1, out2, out3, xor_val) \
1145 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
1146 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
1149 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \
1150 out0, out1, out2, out3, \
1153 XORI_B_2VECS_SB(val0, val1, \
1154 out0, out1, xor_val); \
1155 XORI_B_2VECS_SB(val2, val3, \
1156 out2, out3, xor_val); \
/*
 * Five-vector form of the signed byte XOR-with-immediate helper.
 *
 * val0..val4 : input vectors, reinterpreted as v16u8
 * out0..out4 : lvalues receiving (v16i8) __msa_xori_b(valN, xor_val)
 * xor_val    : value handed to __msa_xori_b() as its immediate operand
 */
#define XORI_B_5VECS_SB(val0, val1, val2, val3, val4, \
                        out0, out1, out2, out3, out4, \
                        xor_val) \
{ \
    out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
    out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
    out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
    out3 = (v16i8) __msa_xori_b((v16u8) (val3), (xor_val)); \
    out4 = (v16i8) __msa_xori_b((v16u8) (val4), (xor_val)); \
}
/*
 * Six-vector form of the signed byte XOR-with-immediate helper.
 *
 * val0..val5 : input vectors, reinterpreted as v16u8
 * out0..out5 : lvalues receiving (v16i8) __msa_xori_b(valN, xor_val)
 * xor_val    : value handed to __msa_xori_b() as its immediate operand
 */
#define XORI_B_6VECS_SB(val0, val1, val2, val3, val4, val5, \
                        out0, out1, out2, out3, out4, out5, \
                        xor_val) \
{ \
    out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
    out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
    out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
    out3 = (v16i8) __msa_xori_b((v16u8) (val3), (xor_val)); \
    out4 = (v16i8) __msa_xori_b((v16u8) (val4), (xor_val)); \
    out5 = (v16i8) __msa_xori_b((v16u8) (val5), (xor_val)); \
}
/*
 * Seven-vector form of the signed byte XOR-with-immediate helper.
 *
 * val0..val6 : input vectors, reinterpreted as v16u8
 * out0..out6 : lvalues receiving (v16i8) __msa_xori_b(valN, xor_val)
 * xor_val    : value handed to __msa_xori_b() as its immediate operand
 */
#define XORI_B_7VECS_SB(val0, val1, val2, val3, \
                        val4, val5, val6, \
                        out0, out1, out2, out3, \
                        out4, out5, out6, \
                        xor_val) \
{ \
    out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
    out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
    out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
    out3 = (v16i8) __msa_xori_b((v16u8) (val3), (xor_val)); \
    out4 = (v16i8) __msa_xori_b((v16u8) (val4), (xor_val)); \
    out5 = (v16i8) __msa_xori_b((v16u8) (val5), (xor_val)); \
    out6 = (v16i8) __msa_xori_b((v16u8) (val6), (xor_val)); \
}
/*
 * Eight-vector form of the signed byte XOR-with-immediate helper.
 *
 * val0..val7 : input vectors, reinterpreted as v16u8
 * out0..out7 : lvalues receiving (v16i8) __msa_xori_b(valN, xor_val)
 * xor_val    : value handed to __msa_xori_b() as its immediate operand
 */
#define XORI_B_8VECS_SB(val0, val1, val2, val3, \
                        val4, val5, val6, val7, \
                        out0, out1, out2, out3, \
                        out4, out5, out6, out7, xor_val) \
{ \
    out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \
    out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \
    out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \
    out3 = (v16i8) __msa_xori_b((v16u8) (val3), (xor_val)); \
    out4 = (v16i8) __msa_xori_b((v16u8) (val4), (xor_val)); \
    out5 = (v16i8) __msa_xori_b((v16u8) (val5), (xor_val)); \
    out6 = (v16i8) __msa_xori_b((v16u8) (val6), (xor_val)); \
    out7 = (v16i8) __msa_xori_b((v16u8) (val7), (xor_val)); \
}
/*
 * Add four pairs of halfword vectors with __msa_adds_s_h() and return the
 * sums as v8u16.
 *
 * Inputs are consumed pairwise: out0 = in0 + in1, out1 = in2 + in3,
 * out2 = in4 + in5, out3 = in6 + in7. Every input is reinterpreted as
 * v8i16 before the add.
 *
 * in0..in7   : input vectors
 * out0..out3 : lvalues receiving the sums, cast to v8u16
 */
#define ADDS_S_H_4VECS_UH(in0, in1, in2, in3, in4, in5, in6, in7, \
                          out0, out1, out2, out3) \
{ \
    out0 = (v8u16) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (v8u16) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3)); \
    out2 = (v8u16) __msa_adds_s_h((v8i16) (in4), (v8i16) (in5)); \
    out3 = (v8u16) __msa_adds_s_h((v8i16) (in6), (v8i16) (in7)); \
}
/*
 * Shift four values right by the same amount.
 *
 * The shift is written with the plain C `>>` operator, so whether it is
 * arithmetic or logical follows from the operand type supplied by the
 * caller.
 *
 * in0..in3        : values to shift
 * out0..out3      : lvalues receiving inN >> shift_right_vec
 * shift_right_vec : shift amount applied to all four inputs
 */
#define SRA_4VECS(in0, in1, in2, in3, \
                  out0, out1, out2, out3, \
                  shift_right_vec) \
{ \
    out0 = (in0) >> (shift_right_vec); \
    out1 = (in1) >> (shift_right_vec); \
    out2 = (in2) >> (shift_right_vec); \
    out3 = (in3) >> (shift_right_vec); \
}
/*
 * Shift four halfword vectors right with __msa_srl_h() and return the
 * results as v8u16.
 *
 * in0..in3        : input vectors, reinterpreted as v8i16
 * out0..out3      : lvalues receiving the shifted vectors, cast to v8u16
 * shift_right_vec : vector of shift amounts, reinterpreted as v8i16 and
 *                   shared by all four shifts
 */
#define SRL_H_4VECS_UH(in0, in1, in2, in3, \
                       out0, out1, out2, out3, \
                       shift_right_vec) \
{ \
    out0 = (v8u16) __msa_srl_h((v8i16) (in0), (v8i16) (shift_right_vec)); \
    out1 = (v8u16) __msa_srl_h((v8i16) (in1), (v8i16) (shift_right_vec)); \
    out2 = (v8u16) __msa_srl_h((v8i16) (in2), (v8i16) (shift_right_vec)); \
    out3 = (v8u16) __msa_srl_h((v8i16) (in3), (v8i16) (shift_right_vec)); \
}
/*
 * Apply __msa_srar_h() followed by __msa_sat_s_h() to a halfword vector
 * and yield the result.
 *
 * This is a GNU statement expression: the macro evaluates to the final
 * v8i16 value, so it is used on the right-hand side of an assignment.
 *
 * input           : source vector, reinterpreted as v8i16
 * right_shift_vec : vector of shift amounts, reinterpreted as v8i16
 * sat_val         : value handed to __msa_sat_s_h() as its immediate
 */
#define SRAR_SATURATE_SIGNED_H(input, right_shift_vec, sat_val) \
( { \
    v8i16 out_m; \
    \
    out_m = __msa_srar_h((v8i16) (input), (v8i16) (right_shift_vec)); \
    out_m = __msa_sat_s_h(out_m, (sat_val)); \
    out_m; \
} )
/*
 * Pack two vectors to bytes, XOR every byte with 128 (toggling the top bit
 * of each byte) and store the result as four rows of 4 bytes.
 *
 * in1, in2 : source vectors; packed as __msa_pckev_b(in2, in1)
 * pdst     : address of the first row
 * stride   : byte distance between consecutive rows
 *
 * Word elements 0..3 of the packed vector are written to rows 0..3 with
 * STORE_WORD(); the destination pointer advances by stride between rows.
 */
#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
                                         pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    v16i8 tmp0_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
    \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 1); \
    out2_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
    out3_m = __msa_copy_u_w((v4i32) tmp0_m, 3); \
    \
    STORE_WORD(dst_m, out0_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out1_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out2_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out3_m); \
}
/*
 * Pack two vectors to bytes, XOR every byte with 128 (toggling the top bit
 * of each byte) and store the low 8 bytes of the result.
 *
 * in1, in2 : source vectors; packed as __msa_pckev_b(in1, in2)
 * pdest    : destination address, written through STORE_DWORD()
 *
 * Only doubleword element 0 of the packed vector is stored.
 */
#define PCKEV_B_XORI128_STORE_8_BYTES(in1, in2, pdest) \
{ \
    uint64_t out_m; \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \
    tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \
    out_m = __msa_copy_u_d((v2i64) tmp_m, 0); \
    STORE_DWORD((pdest), out_m); \
}
/*
 * Pack two vectors to bytes, XOR every byte with 128 (toggling the top bit
 * of each byte) and store the result as two rows of 8 bytes.
 *
 * in1, in2 : source vectors; packed as __msa_pckev_b(in2, in1)
 * pdst     : address of the first row
 * stride   : byte distance between the two rows
 *
 * Doubleword element 0 goes to the first row and element 1 to the second,
 * both through STORE_DWORD().
 */
#define PCKEV_B_XORI128_STORE_8_BYTES_2(in1, in2, \
                                        pdst, stride) \
{ \
    uint64_t out0_m, out1_m; \
    v16i8 tmp0_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
    \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
    out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
    \
    STORE_DWORD(dst_m, out0_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out1_m); \
}
/*
 * Pack four vectors to bytes (two packs), XOR every byte with 128
 * (toggling the top bit of each byte) and store four rows of 6 bytes.
 *
 * in1, in2 : sources of the first pack,  __msa_pckev_b(in2, in1)
 * in3, in4 : sources of the second pack, __msa_pckev_b(in4, in3)
 * pdst     : address of the first row
 * stride   : byte distance between consecutive rows
 *
 * Each row is written as a 4-byte word followed by a 2-byte halfword at
 * offset 4. Rows 0 and 1 take bytes 0..5 and 8..13 of the first packed
 * vector (word 0 + halfword 2, word 2 + halfword 6); rows 2 and 3 take
 * the same bytes of the second packed vector.
 */
#define PCKEV_B_XORI128_STORE_6_BYTES_4(in1, in2, in3, in4, \
                                        pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    uint16_t out4_m, out5_m, out6_m, out7_m; \
    v16i8 tmp0_m, tmp1_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
    \
    tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
    tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \
    \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
    \
    out4_m = __msa_copy_u_h((v8i16) tmp0_m, 2); \
    out5_m = __msa_copy_u_h((v8i16) tmp0_m, 6); \
    out6_m = __msa_copy_u_h((v8i16) tmp1_m, 2); \
    out7_m = __msa_copy_u_h((v8i16) tmp1_m, 6); \
    \
    STORE_WORD(dst_m, out0_m); \
    STORE_HWORD((dst_m + 4), out4_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out1_m); \
    STORE_HWORD((dst_m + 4), out5_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out2_m); \
    STORE_HWORD((dst_m + 4), out6_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out3_m); \
    STORE_HWORD((dst_m + 4), out7_m); \
}
/*
 * Pack four vectors to bytes (two packs), XOR every byte with 128
 * (toggling the top bit of each byte) and store four rows of 8 bytes.
 *
 * in1, in2 : sources of the first pack,  __msa_pckev_b(in2, in1)
 * in3, in4 : sources of the second pack, __msa_pckev_b(in4, in3)
 * pdst     : address of the first row
 * stride   : byte distance between consecutive rows
 *
 * Rows 0 and 1 are doubleword elements 0 and 1 of the first packed
 * vector; rows 2 and 3 are elements 0 and 1 of the second. All rows are
 * written with STORE_DWORD().
 */
#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, in3, in4, \
                                          pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    v16i8 tmp0_m, tmp1_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
    \
    tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \
    tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \
    \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
    out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
    out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
    out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \
    \
    STORE_DWORD(dst_m, out0_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out1_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out2_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out3_m); \
}
/*
 * Pack two vectors to bytes, XOR every byte with 128 (toggling the top bit
 * of each byte) and store the whole 16-byte result.
 *
 * in1, in2 : source vectors; packed as __msa_pckev_b(in1, in2)
 * pdest    : destination address, written through STORE_SB()
 */
#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) \
{ \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \
    tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \
    STORE_SB(tmp_m, (pdest)); \
}
/*
 * Pack four vectors to bytes (two packs) and store four rows of 4 bytes.
 *
 * in1, in2 : sources of the first pack,  __msa_pckev_b(in2, in1)
 * in3, in4 : sources of the second pack, __msa_pckev_b(in4, in3)
 * pdst     : address of the first row
 * stride   : byte distance between consecutive rows
 *
 * Rows 0 and 1 are word elements 0 and 2 of the first packed vector (the
 * first 4 bytes of each 8-byte half); rows 2 and 3 are elements 0 and 2
 * of the second. All rows are written with STORE_WORD().
 */
#define PCKEV_B_STORE_4_BYTES_4(in1, in2, in3, in4, \
                                pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    v16i8 tmp0_m, tmp1_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
    \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
    \
    STORE_WORD(dst_m, out0_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out1_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out2_m); \
    dst_m += (stride); \
    STORE_WORD(dst_m, out3_m); \
}
/*
 * Pack four vectors to bytes (two packs) and store four rows of 8 bytes.
 *
 * in1, in2 : sources of the first pack,  __msa_pckev_b(in2, in1)
 * in3, in4 : sources of the second pack, __msa_pckev_b(in4, in3)
 * pdst     : address of the first row
 * stride   : byte distance between consecutive rows
 *
 * Rows 0 and 1 are doubleword elements 0 and 1 of the first packed
 * vector; rows 2 and 3 are elements 0 and 1 of the second. All rows are
 * written with STORE_DWORD().
 */
#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
                                pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    v16i8 tmp0_m, tmp1_m; \
    uint8_t *dst_m = (uint8_t *) (pdst); \
    \
    tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \
    tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \
    \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
    out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \
    out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
    out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \
    \
    STORE_DWORD(dst_m, out0_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out1_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out2_m); \
    dst_m += (stride); \
    STORE_DWORD(dst_m, out3_m); \
}
/*
 * Widen a vector of signed bytes into two vectors of halfwords.
 *
 * in   : source vector, reinterpreted as v16i8
 * out1 : lvalue receiving (v8i16) __msa_ilvr_b(mask, in)
 * out2 : lvalue receiving (v8i16) __msa_ilvl_b(mask, in)
 *
 * mask is __msa_clti_s_b(in, 0), i.e. a per-byte "less than zero" compare
 * of the input. Interleaving that mask with the data bytes supplies the
 * upper byte of every halfword, which is how the sign is extended.
 */
#define UNPCK_SIGNED_B_TO_H(in, out1, out2) \
{ \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0); \
    out1 = (v8i16) __msa_ilvr_b(tmp_m, (v16i8) (in)); \
    out2 = (v8i16) __msa_ilvl_b(tmp_m, (v16i8) (in)); \
}
/*
 * Exchange the contents of two lvalues of the same type.
 *
 * Vec0, Vec1 : lvalues to swap; each is evaluated more than once, so they
 *              must not have side effects.
 *
 * The exchange goes through a temporary rather than the three-step XOR
 * trick: an XOR swap clears the value when both arguments name the same
 * object, and only works for types that support `^`.
 */
#define SWAP_VECS(Vec0, Vec1) \
{ \
    __typeof__(Vec0) swap_tmp_m = (Vec0); \
    \
    (Vec0) = (Vec1); \
    (Vec1) = swap_tmp_m; \
}
1426 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */