/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
27 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
28 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
29 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
31 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
32 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
34 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
35 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
37 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
38 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
40 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
41 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
43 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
44 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
46 #if (__mips_isa_rev >= 6)
49 uint8_t *psrc_m = (uint8_t *) (psrc); \
53 "lw %[val_m], %[psrc_m] \n\t" \
55 : [val_m] "=r" (val_m) \
56 : [psrc_m] "m" (*psrc_m) \
65 uint8_t *psrc_m = (uint8_t *) (psrc); \
69 "ld %[val_m], %[psrc_m] \n\t" \
71 : [val_m] "=r" (val_m) \
72 : [psrc_m] "m" (*psrc_m) \
77 #else // !(__mips == 64)
80 uint8_t *psrc_m = (uint8_t *) (psrc); \
81 uint32_t val0_m, val1_m; \
84 val0_m = LW(psrc_m); \
85 val1_m = LW(psrc_m + 4); \
87 val_m = (uint64_t) (val1_m); \
88 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
89 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
93 #endif // (__mips == 64)
95 #define SH(val, pdst) \
97 uint8_t *pdst_m = (uint8_t *) (pdst); \
98 uint16_t val_m = (val); \
101 "sh %[val_m], %[pdst_m] \n\t" \
103 : [pdst_m] "=m" (*pdst_m) \
104 : [val_m] "r" (val_m) \
108 #define SW(val, pdst) \
110 uint8_t *pdst_m = (uint8_t *) (pdst); \
111 uint32_t val_m = (val); \
114 "sw %[val_m], %[pdst_m] \n\t" \
116 : [pdst_m] "=m" (*pdst_m) \
117 : [val_m] "r" (val_m) \
121 #define SD(val, pdst) \
123 uint8_t *pdst_m = (uint8_t *) (pdst); \
124 uint64_t val_m = (val); \
127 "sd %[val_m], %[pdst_m] \n\t" \
129 : [pdst_m] "=m" (*pdst_m) \
130 : [val_m] "r" (val_m) \
133 #else // !(__mips_isa_rev >= 6)
136 uint8_t *psrc_m = (uint8_t *) (psrc); \
140 "ulw %[val_m], %[psrc_m] \n\t" \
142 : [val_m] "=r" (val_m) \
143 : [psrc_m] "m" (*psrc_m) \
152 uint8_t *psrc_m = (uint8_t *) (psrc); \
153 uint64_t val_m = 0; \
156 "uld %[val_m], %[psrc_m] \n\t" \
158 : [val_m] "=r" (val_m) \
159 : [psrc_m] "m" (*psrc_m) \
164 #else // !(__mips == 64)
167 uint8_t *psrc_m1 = (uint8_t *) (psrc); \
168 uint32_t val0_m, val1_m; \
169 uint64_t val_m = 0; \
171 val0_m = LW(psrc_m1); \
172 val1_m = LW(psrc_m1 + 4); \
174 val_m = (uint64_t) (val1_m); \
175 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
176 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
180 #endif // (__mips == 64)
182 #define SH(val, pdst) \
184 uint8_t *pdst_m = (uint8_t *) (pdst); \
185 uint16_t val_m = (val); \
188 "ush %[val_m], %[pdst_m] \n\t" \
190 : [pdst_m] "=m" (*pdst_m) \
191 : [val_m] "r" (val_m) \
195 #define SW(val, pdst) \
197 uint8_t *pdst_m = (uint8_t *) (pdst); \
198 uint32_t val_m = (val); \
201 "usw %[val_m], %[pdst_m] \n\t" \
203 : [pdst_m] "=m" (*pdst_m) \
204 : [val_m] "r" (val_m) \
208 #define SD(val, pdst) \
210 uint8_t *pdst_m1 = (uint8_t *) (pdst); \
211 uint32_t val0_m, val1_m; \
213 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
214 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
216 SW(val0_m, pdst_m1); \
217 SW(val1_m, pdst_m1 + 4); \
219 #endif // (__mips_isa_rev >= 6)
221 /* Description : Load 4 words with stride
222 Arguments : Inputs - psrc (source pointer to load from)
224 Outputs - out0, out1, out2, out3
225 Details : Loads word in 'out0' from (psrc)
226 Loads word in 'out1' from (psrc + stride)
227 Loads word in 'out2' from (psrc + 2 * stride)
228 Loads word in 'out3' from (psrc + 3 * stride)
230 #define LW4(psrc, stride, out0, out1, out2, out3) \
233 out1 = LW((psrc) + stride); \
234 out2 = LW((psrc) + 2 * stride); \
235 out3 = LW((psrc) + 3 * stride); \
238 /* Description : Store 4 words with stride
239 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
240 Details : Stores word from 'in0' to (pdst)
241 Stores word from 'in1' to (pdst + stride)
242 Stores word from 'in2' to (pdst + 2 * stride)
243 Stores word from 'in3' to (pdst + 3 * stride)
245 #define SW4(in0, in1, in2, in3, pdst, stride) \
248 SW(in1, (pdst) + stride); \
249 SW(in2, (pdst) + 2 * stride); \
250 SW(in3, (pdst) + 3 * stride); \
253 /* Description : Store 4 double words with stride
254 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
255 Details : Stores double word from 'in0' to (pdst)
256 Stores double word from 'in1' to (pdst + stride)
257 Stores double word from 'in2' to (pdst + 2 * stride)
258 Stores double word from 'in3' to (pdst + 3 * stride)
260 #define SD4(in0, in1, in2, in3, pdst, stride) \
263 SD(in1, (pdst) + stride); \
264 SD(in2, (pdst) + 2 * stride); \
265 SD(in3, (pdst) + 3 * stride); \
268 /* Description : Load vectors with 16 byte elements with stride
269 Arguments : Inputs - psrc (source pointer to load from)
272 Return Type - as per RTYPE
273 Details : Loads 16 byte elements in 'out0' from (psrc)
274 Loads 16 byte elements in 'out1' from (psrc + stride)
276 #define LD_B2(RTYPE, psrc, stride, out0, out1) \
278 out0 = LD_B(RTYPE, (psrc)); \
279 out1 = LD_B(RTYPE, (psrc) + stride); \
281 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
282 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
284 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
286 LD_B2(RTYPE, (psrc), stride, out0, out1); \
287 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
289 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
291 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
293 LD_B2(RTYPE, (psrc), stride, out0, out1); \
294 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
296 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
297 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
299 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
301 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
302 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
304 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
305 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
307 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
309 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
310 LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
312 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
314 #define LD_B7(RTYPE, psrc, stride, \
315 out0, out1, out2, out3, out4, out5, out6) \
317 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
318 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
320 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
322 #define LD_B8(RTYPE, psrc, stride, \
323 out0, out1, out2, out3, out4, out5, out6, out7) \
325 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
326 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
328 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
329 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
331 /* Description : Store vectors of 16 byte elements with stride
332 Arguments : Inputs - in0, in1, stride
333 Outputs - pdst (destination pointer to store to)
334 Details : Stores 16 byte elements from 'in0' to (pdst)
335 Stores 16 byte elements from 'in1' to (pdst + stride)
337 #define ST_B2(RTYPE, in0, in1, pdst, stride) \
339 ST_B(RTYPE, in0, (pdst)); \
340 ST_B(RTYPE, in1, (pdst) + stride); \
342 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
343 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
345 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
347 ST_B2(RTYPE, in0, in1, (pdst), stride); \
348 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
350 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
351 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
353 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
356 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
357 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
359 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
361 /* Description : Store vectors of 8 halfword elements with stride
362 Arguments : Inputs - in0, in1, stride
363 Outputs - pdst (destination pointer to store to)
364 Details : Stores 8 halfword elements from 'in0' to (pdst)
365 Stores 8 halfword elements from 'in1' to (pdst + stride)
367 #define ST_H2(RTYPE, in0, in1, pdst, stride) \
369 ST_H(RTYPE, in0, (pdst)); \
370 ST_H(RTYPE, in1, (pdst) + stride); \
372 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
373 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
375 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
377 ST_H2(RTYPE, in0, in1, (pdst), stride); \
378 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
382 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
384 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
385 ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
387 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
389 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
391 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
392 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
394 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
396 /* Description : Store vectors of word elements with stride
397 Arguments : Inputs - in0, in1, stride
398 Outputs - pdst (destination pointer to store to)
399 Return Type - signed word
400 Details : Stores 4 word elements from 'in0' to (pdst)
401 Stores 4 word elements from 'in1' to (pdst + stride)
403 #define ST_SW2(in0, in1, pdst, stride) \
405 ST_SW(in0, (pdst)); \
406 ST_SW(in1, (pdst) + stride); \
409 /* Description : Store as 2x4 byte block to destination memory from input vector
410 Arguments : Inputs - in, stidx, pdst, stride
411 Return Type - unsigned byte
412 Details : Index stidx halfword element from 'in' vector is copied and
414 Index stidx+1 halfword element from 'in' vector is copied and
415 stored on second line
416 Index stidx+2 halfword element from 'in' vector is copied and
418 Index stidx+3 halfword element from 'in' vector is copied and
419 stored on fourth line
421 #define ST2x4_UB(in, stidx, pdst, stride) \
423 uint16_t out0_m, out1_m, out2_m, out3_m; \
424 uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \
426 out0_m = __msa_copy_u_h((v8i16) in, (stidx)); \
427 out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1)); \
428 out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2)); \
429 out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3)); \
431 SH(out0_m, pblk_2x4_m); \
432 SH(out1_m, pblk_2x4_m + stride); \
433 SH(out2_m, pblk_2x4_m + 2 * stride); \
434 SH(out3_m, pblk_2x4_m + 3 * stride); \
437 /* Description : Store as 4x2 byte block to destination memory from input vector
438 Arguments : Inputs - in, pdst, stride
439 Return Type - unsigned byte
440 Details : Index 0 word element from input vector is copied and stored
442 Index 1 word element from input vector is copied and stored
445 #define ST4x2_UB(in, pdst, stride) \
447 uint32_t out0_m, out1_m; \
448 uint8_t *pblk_4x2_m = (uint8_t *) (pdst); \
450 out0_m = __msa_copy_u_w((v4i32) in, 0); \
451 out1_m = __msa_copy_u_w((v4i32) in, 1); \
453 SW(out0_m, pblk_4x2_m); \
454 SW(out1_m, pblk_4x2_m + stride); \
457 /* Description : Store as 4x4 byte block to destination memory from input vector
458 Arguments : Inputs - in0, in1, pdst, stride
459 Return Type - unsigned byte
460 Details : Idx0 word element from input vector 'in0' is copied and stored
462 Idx1 word element from input vector 'in0' is copied and stored
464 Idx2 word element from input vector 'in1' is copied and stored
466 Idx3 word element from input vector 'in1' is copied and stored
469 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
471 uint32_t out0_m, out1_m, out2_m, out3_m; \
472 uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \
474 out0_m = __msa_copy_u_w((v4i32) in0, idx0); \
475 out1_m = __msa_copy_u_w((v4i32) in0, idx1); \
476 out2_m = __msa_copy_u_w((v4i32) in1, idx2); \
477 out3_m = __msa_copy_u_w((v4i32) in1, idx3); \
479 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
482 /* Description : Store as 8x2 byte block to destination memory from input vector
483 Arguments : Inputs - in, pdst, stride
484 Details : Index 0 double word element from input vector 'in' is copied
485 and stored to destination memory at (pdst)
486 Index 1 double word element from input vector 'in' is copied
487 and stored to destination memory at (pdst + stride)
489 #define ST8x2_UB(in, pdst, stride) \
491 uint64_t out0_m, out1_m; \
492 uint8_t *pblk_8x2_m = (uint8_t *) (pdst); \
494 out0_m = __msa_copy_u_d((v2i64) in, 0); \
495 out1_m = __msa_copy_u_d((v2i64) in, 1); \
497 SD(out0_m, pblk_8x2_m); \
498 SD(out1_m, pblk_8x2_m + stride); \
501 /* Description : Store as 8x4 byte block to destination memory from input
503 Arguments : Inputs - in0, in1, pdst, stride
504 Details : Index 0 double word element from input vector 'in0' is copied
505 and stored to destination memory at (pblk_8x4_m)
506 Index 1 double word element from input vector 'in0' is copied
507 and stored to destination memory at (pblk_8x4_m + stride)
508 Index 0 double word element from input vector 'in1' is copied
509 and stored to destination memory at (pblk_8x4_m + 2 * stride)
510 Index 1 double word element from input vector 'in1' is copied
511 and stored to destination memory at (pblk_8x4_m + 3 * stride)
513 #define ST8x4_UB(in0, in1, pdst, stride) \
515 uint64_t out0_m, out1_m, out2_m, out3_m; \
516 uint8_t *pblk_8x4_m = (uint8_t *) (pdst); \
518 out0_m = __msa_copy_u_d((v2i64) in0, 0); \
519 out1_m = __msa_copy_u_d((v2i64) in0, 1); \
520 out2_m = __msa_copy_u_d((v2i64) in1, 0); \
521 out3_m = __msa_copy_u_d((v2i64) in1, 1); \
523 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
525 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \
527 uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \
529 ST8x4_UB(in0, in1, pblk_8x8_m, stride); \
530 ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \
533 /* Description : Store as 12x8 byte block to destination memory from
535 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
536 Details : Index 0 double word element from input vector 'in0' is copied
537 and stored to destination memory at (pblk_12x8_m) followed by
538 index 2 word element from same input vector 'in0' at
540 Similar to remaining lines
542 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
544 uint64_t out0_m, out1_m, out2_m, out3_m; \
545 uint64_t out4_m, out5_m, out6_m, out7_m; \
546 uint32_t out8_m, out9_m, out10_m, out11_m; \
547 uint32_t out12_m, out13_m, out14_m, out15_m; \
548 uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
550 out0_m = __msa_copy_u_d((v2i64) in0, 0); \
551 out1_m = __msa_copy_u_d((v2i64) in1, 0); \
552 out2_m = __msa_copy_u_d((v2i64) in2, 0); \
553 out3_m = __msa_copy_u_d((v2i64) in3, 0); \
554 out4_m = __msa_copy_u_d((v2i64) in4, 0); \
555 out5_m = __msa_copy_u_d((v2i64) in5, 0); \
556 out6_m = __msa_copy_u_d((v2i64) in6, 0); \
557 out7_m = __msa_copy_u_d((v2i64) in7, 0); \
559 out8_m = __msa_copy_u_w((v4i32) in0, 2); \
560 out9_m = __msa_copy_u_w((v4i32) in1, 2); \
561 out10_m = __msa_copy_u_w((v4i32) in2, 2); \
562 out11_m = __msa_copy_u_w((v4i32) in3, 2); \
563 out12_m = __msa_copy_u_w((v4i32) in4, 2); \
564 out13_m = __msa_copy_u_w((v4i32) in5, 2); \
565 out14_m = __msa_copy_u_w((v4i32) in6, 2); \
566 out15_m = __msa_copy_u_w((v4i32) in7, 2); \
568 SD(out0_m, pblk_12x8_m); \
569 SW(out8_m, pblk_12x8_m + 8); \
570 pblk_12x8_m += stride; \
571 SD(out1_m, pblk_12x8_m); \
572 SW(out9_m, pblk_12x8_m + 8); \
573 pblk_12x8_m += stride; \
574 SD(out2_m, pblk_12x8_m); \
575 SW(out10_m, pblk_12x8_m + 8); \
576 pblk_12x8_m += stride; \
577 SD(out3_m, pblk_12x8_m); \
578 SW(out11_m, pblk_12x8_m + 8); \
579 pblk_12x8_m += stride; \
580 SD(out4_m, pblk_12x8_m); \
581 SW(out12_m, pblk_12x8_m + 8); \
582 pblk_12x8_m += stride; \
583 SD(out5_m, pblk_12x8_m); \
584 SW(out13_m, pblk_12x8_m + 8); \
585 pblk_12x8_m += stride; \
586 SD(out6_m, pblk_12x8_m); \
587 SW(out14_m, pblk_12x8_m + 8); \
588 pblk_12x8_m += stride; \
589 SD(out7_m, pblk_12x8_m); \
590 SW(out15_m, pblk_12x8_m + 8); \
593 /* Description : Immediate number of columns to slide with zero
594 Arguments : Inputs - in0, in1, slide_val
596 Return Type - as per RTYPE
597 Details : Byte elements from 'zero_m' vector are slide into 'in0' by
598 number of elements specified by 'slide_val'
600 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
602 v16i8 zero_m = { 0 }; \
603 out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
604 out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
606 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
608 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
609 out0, out1, out2, out3, slide_val) \
611 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
612 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
614 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
616 /* Description : Shuffle byte vector elements as per mask vector
617 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
619 Return Type - as per RTYPE
620 Details : Selective byte elements from in0 & in1 are copied to out0 as
621 per control vector mask0
622 Selective byte elements from in2 & in3 are copied to out1 as
623 per control vector mask1
625 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
627 out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
628 out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
630 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
631 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
632 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
633 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
635 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
638 VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
639 out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
641 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
643 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
644 out0, out1, out2, out3) \
646 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
647 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
649 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
651 /* Description : Shuffle byte vector elements as per mask vector
652 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
654 Return Type - as per RTYPE
655 Details : Selective byte elements from in0 & in1 are copied to out0 as
656 per control vector mask0
657 Selective byte elements from in2 & in3 are copied to out1 as
658 per control vector mask1
660 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
662 out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
663 out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
665 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
667 /* Description : Dot product of byte vector elements
668 Arguments : Inputs - mult0, mult1
671 Return Type - signed halfword
672 Details : Signed byte elements from mult0 are multiplied with
673 signed byte elements from cnst0 producing a result
674 twice the size of input i.e. signed halfword.
675 Then this multiplication results of adjacent odd-even elements
676 are added together and stored to the out vector
677 (2 signed halfword results)
679 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
681 out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
682 out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
684 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
686 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
689 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
690 out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
692 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
694 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
695 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
697 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
698 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
700 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
702 /* Description : Dot product & addition of byte vector elements
703 Arguments : Inputs - mult0, mult1
706 Return Type - signed halfword
707 Details : Signed byte elements from mult0 are multiplied with
708 signed byte elements from cnst0 producing a result
709 twice the size of input i.e. signed halfword.
710 Then this multiplication results of adjacent odd-even elements
711 are added to the out vector
712 (2 signed halfword results)
714 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
716 out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \
717 (v16i8) mult0, (v16i8) cnst0); \
718 out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \
719 (v16i8) mult1, (v16i8) cnst1); \
721 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
723 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
724 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
726 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
727 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
729 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
731 /* Description : Dot product & addition of halfword vector elements
732 Arguments : Inputs - mult0, mult1
735 Return Type - signed word
736 Details : Signed halfword elements from mult0 are multiplied with
737 signed halfword elements from cnst0 producing a result
738 twice the size of input i.e. signed word.
739 Then this multiplication results of adjacent odd-even elements
740 are added to the out vector
741 (2 signed word results)
743 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
745 out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
746 (v8i16) mult0, (v8i16) cnst0); \
747 out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
748 (v8i16) mult1, (v8i16) cnst1); \
750 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
752 /* Description : Clips all halfword elements of input vector between min & max
753 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
754 Arguments : Inputs - in (input vector)
755 - min (min threshold)
756 - max (max threshold)
757 Outputs - out_m (output vector with clipped elements)
758 Return Type - signed halfword
760 #define CLIP_SH(in, min, max) \
764 out_m = __msa_max_s_h((v8i16) min, (v8i16) in); \
765 out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \
769 /* Description : Clips all signed halfword elements of input vector
771 Arguments : Inputs - in (input vector)
772 Outputs - out_m (output vector with clipped elements)
773 Return Type - signed halfword
775 #define CLIP_SH_0_255(in) \
777 v8i16 max_m = __msa_ldi_h(255); \
780 out_m = __msa_maxi_s_h((v8i16) in, 0); \
781 out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
784 #define CLIP_SH2_0_255(in0, in1) \
786 in0 = CLIP_SH_0_255(in0); \
787 in1 = CLIP_SH_0_255(in1); \
789 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
791 CLIP_SH2_0_255(in0, in1); \
792 CLIP_SH2_0_255(in2, in3); \
795 /* Description : Clips all signed word elements of input vector
797 Arguments : Inputs - in (input vector)
798 Outputs - out_m (output vector with clipped elements)
799 Return Type - signed word
801 #define CLIP_SW_0_255(in) \
803 v4i32 max_m = __msa_ldi_w(255); \
806 out_m = __msa_maxi_s_w((v4i32) in, 0); \
807 out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
811 /* Description : Horizontal subtraction of unsigned byte vector elements
812 Arguments : Inputs - in0, in1
814 Return Type - as per RTYPE
815 Details : Each unsigned odd byte element from 'in0' is subtracted from
816 even unsigned byte element from 'in0' (pairwise) and the
817 halfword result is stored in 'out0'
819 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
821 out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
822 out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
824 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
825 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
827 /* Description : Interleave even halfword elements from vectors
828 Arguments : Inputs - in0, in1, in2, in3
830 Return Type - as per RTYPE
831 Details : Even halfword elements of 'in0' and even halfword
832 elements of 'in1' are interleaved and copied to 'out0'
833 Even halfword elements of 'in2' and even halfword
834 elements of 'in3' are interleaved and copied to 'out1'
836 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
838 out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
839 out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
841 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
843 /* Description : Interleave even word elements from vectors
844 Arguments : Inputs - in0, in1, in2, in3
846 Return Type - as per RTYPE
847 Details : Even word elements of 'in0' and even word
848 elements of 'in1' are interleaved and copied to 'out0'
849 Even word elements of 'in2' and even word
850 elements of 'in3' are interleaved and copied to 'out1'
852 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
854 out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
855 out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
857 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
859 /* Description : Interleave even double word elements from vectors
860 Arguments : Inputs - in0, in1, in2, in3
862 Return Type - as per RTYPE
863 Details : Even double word elements of 'in0' and even double word
864 elements of 'in1' are interleaved and copied to 'out0'
865 Even double word elements of 'in2' and even double word
866 elements of 'in3' are interleaved and copied to 'out1'
868 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
870 out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
871 out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
873 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
875 /* Description : Interleave left half of byte elements from vectors
876 Arguments : Inputs - in0, in1, in2, in3
878 Return Type - as per RTYPE
879 Details : Left half of byte elements of in0 and left half of byte
880 elements of in1 are interleaved and copied to out0.
881 Left half of byte elements of in2 and left half of byte
882 elements of in3 are interleaved and copied to out1.
884 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
886 out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
887 out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
889 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
890 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
892 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
893 out0, out1, out2, out3) \
895 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
896 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
898 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
899 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
900 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
902 /* Description : Interleave left half of halfword elements from vectors
903 Arguments : Inputs - in0, in1, in2, in3
905 Return Type - as per RTYPE
906 Details : Left half of halfword elements of in0 and left half of halfword
907 elements of in1 are interleaved and copied to out0.
908 Left half of halfword elements of in2 and left half of halfword
909 elements of in3 are interleaved and copied to out1.
911 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
913 out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
914 out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
916 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
918 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
919 out0, out1, out2, out3) \
921 ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
922 ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
924 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
926 /* Description : Interleave left half of word elements from vectors
927 Arguments : Inputs - in0, in1, in2, in3
929 Return Type - as per RTYPE
930 Details : Left half of word elements of in0 and left half of word
931 elements of in1 are interleaved and copied to out0.
932 Left half of word elements of in2 and left half of word
933 elements of in3 are interleaved and copied to out1.
935 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
937 out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
938 out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
940 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
942 /* Description : Interleave right half of byte elements from vectors
943 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
944 Outputs - out0, out1, out2, out3
945 Return Type - as per RTYPE
946 Details : Right half of byte elements of in0 and right half of byte
947 elements of in1 are interleaved and copied to out0.
948 Right half of byte elements of in2 and right half of byte
949 elements of in3 are interleaved and copied to out1.
950 Similar for other pairs
952 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
954 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
955 out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
957 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
958 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
959 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
960 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
962 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
963 out0, out1, out2, out3) \
965 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
966 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
968 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
969 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
970 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
972 /* Description : Interleave right half of halfword elements from vectors
973 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
974 Outputs - out0, out1, out2, out3
975 Return Type - signed halfword
976 Details : Right half of halfword elements of in0 and right half of
977 halfword elements of in1 are interleaved and copied to out0.
978 Right half of halfword elements of in2 and right half of
979 halfword elements of in3 are interleaved and copied to out1.
980 Similar for other pairs
982 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
984 out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
985 out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
987 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
989 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
991 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
992 out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
994 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
996 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
997 out0, out1, out2, out3) \
999 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1000 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1002 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
/* Interleave right half of word elements: (in0,in1)->out0, (in2,in3)->out1 */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

/* Four-pair variant of ILVR_W2 */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right halves (lower 64 bits) of 'in0' and 'in1' are
                 interleaved and written to 'out0'.  Likewise
                 'in2'/'in3' -> 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
/* Three-pair variant: (in0,in1)->out0, (in2,in3)->out1, (in4,in5)->out2 */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
/* Four-pair variant of ILVR_D2 */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; the left halves are
                 interleaved and written to 'out1'.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
/* Halfword variant of ILVRL_B2: right interleave -> out0, left -> out1 */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
/* Word variant of ILVRL_B2: right interleave -> out0, left -> out1 */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
/* Description : Maximum of signed halfword elements and a 5-bit signed
                 immediate value, written back in place
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'in0' is compared with
                 'max_val' and the maximum is written back to 'in0'.
                 Similar for the other vectors.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                \
{                                                         \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* Description : Saturate unsigned halfword elements to the maximum
                 unsigned value representable in (sat_val + 1) bits.
                 The element data width remains unchanged.
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each unsigned halfword element of 'in0' is clamped to the
                 (sat_val + 1)-bit unsigned range; result written in place.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
{                                                     \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
/* Description : Saturate signed halfword elements to the maximum signed
                 value representable in (sat_val + 1) bits.
                 The element data width remains unchanged.
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - signed halfword
   Details     : Each signed halfword element of 'in0' is clamped to the
                 (sat_val + 1)-bit signed range; result written in place.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
{                                                      \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)         \
{                                                      \
    SAT_SH2(RTYPE, in0, in1, sat_val);                 \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (idx2, idx3 for the 4-output form)
                 Outputs - out0, out1 (out2, out3)
                 Return Type - as per RTYPE
   Details     : The 'idx0'-th halfword of 'in' is replicated across 'out0';
                 the 'idx1'-th across 'out1'.  Valid index range for a
                 halfword operation is 0-7.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The 'stidx'-th word of 'in' is replicated across 'out0';
                 the '(stidx + 1)'-th word across 'out1'.
                 Valid index range for a word operation is 0-3.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' and even byte elements of 'in1' to its right half.
                 Likewise 'in2'/'in3' -> 'out1'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' and even halfword elements of 'in1' to its right
                 half.  Likewise 'in2'/'in3' -> 'out1'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (in2, in3 for larger variants)
                 Outputs - same vectors, modified in place
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of the input vector is xor'ed
                 with 128 (flips the sign bit, converting between signed
                 and unsigned byte ranges) and stored back in place.
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
/* 5-, 7- and 8-vector variants built from XORI_B2/3/4_128 */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Description : Addition of signed halfword elements with signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of 'in0' are added to those of
                 'in1'; the result is saturated to [-32768, 32767] and
                 written to 'out0'.  Likewise 'in2'/'in3' -> 'out1'.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector type
   Details     : Each element of the four vectors is left-shifted by 'shift'
                 and written back in place.
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector type
   Details     : Each element of the four vectors is right-shifted by 'shift'
                 (a GP variable) and written back in place.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of the input vectors is shifted right
                 logically by the count held in the corresponding element of
                 vector 'shift', and written back in place.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of 'in0'/'in1' is shifted right
                 arithmetically by the count held in the corresponding
                 element of vector 'shift'.  The last discarded bit is added
                 to the shifted value for rounding; result written in place.
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of the input vectors is shifted right
                 arithmetically by the immediate 'shift'.  The last discarded
                 bit is added to the shifted value for rounding; the result
                 is written back in place.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Elements of 'in0' are multiplied element-wise with elements
                 of 'in1' and the result is written to 'out0'.  Likewise
                 'in2'/'in3' -> 'out1'.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}

#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in (one unsigned byte vector)
                 Outputs - out0, out1 (two unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero-extended right half of 'in' is returned in 'out0';
                 zero-extended left half in 'out1'.
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    /* Each subsequent row is the previous one shifted down by 4 bytes */ \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 .. in7 (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0 .. in15 (input 16x8 byte block)
                 Outputs - out0 .. out7 (output 8x16 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector, which is then xor'ed with 128 to
                 shift the range from signed to unsigned byte.
                 Implemented as a GNU statement expression so the macro
                 yields the packed vector as its value.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Pack even byte elements, extract words at indices 0 and 2
                 from each packed result and store the 4 words to destination
                 memory with the given stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
1631 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */