2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
/* Whole-vector load: read one MSA vector of type RTYPE from memory at
 * psrc.  The variadic wrappers pin RTYPE to the unsigned/signed element
 * variant (byte / halfword / word).
 * NOTE(review): these are plain pointer dereferences, so psrc is
 * presumably required to be vector-aligned -- confirm with callers. */
27 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
28 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
29 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
31 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
32 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
33 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
35 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
36 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
37 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
/* Whole-vector store: write the MSA vector 'in' (type RTYPE) to memory
 * at pdst.  Same alignment caveat as the loads above. */
39 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
40 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
41 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
43 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
44 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
45 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
47 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
48 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
49 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
/* Scalar memory access helpers (LW = load word, LD = load doubleword,
 * SH/SW/SD = store half/word/doubleword).  MIPS R6 removed the
 * unaligned lwl/lwr family, so the R6 branch uses plain lw/ld/sh/sw/sd
 * (hardware handles misalignment), while the pre-R6 branch below uses
 * the ulw/uld/ush/usw assembler pseudo-ops.  On 32-bit pre-R6 cores,
 * 64-bit access is synthesized from two 32-bit LW/SW operations.
 * NOTE(review): the #define heads of LW/LD and several asm-wrapper
 * lines are not visible in this chunk -- verify against the full file. */
51 #if (__mips_isa_rev >= 6)
54 uint8_t *psrc_m = (uint8_t *) (psrc); \
58 "lw %[val_m], %[psrc_m] \n\t" \
60 : [val_m] "=r" (val_m) \
61 : [psrc_m] "m" (*psrc_m) \
70 uint8_t *psrc_m = (uint8_t *) (psrc); \
74 "ld %[val_m], %[psrc_m] \n\t" \
76 : [val_m] "=r" (val_m) \
77 : [psrc_m] "m" (*psrc_m) \
82 #else // !(__mips == 64)
85 uint8_t *psrc_m = (uint8_t *) (psrc); \
86 uint32_t val0_m, val1_m; \
89 val0_m = LW(psrc_m); \
90 val1_m = LW(psrc_m + 4); \
92 val_m = (uint64_t) (val1_m); \
93 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
94 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
98 #endif // (__mips == 64)
/* R6 stores: plain sh/sw/sd; "=m" output operand lets the compiler
 * know the pointed-to memory is written. */
100 #define SH(val, pdst) \
102 uint8_t *pdst_m = (uint8_t *) (pdst); \
103 uint16_t val_m = (val); \
106 "sh %[val_m], %[pdst_m] \n\t" \
108 : [pdst_m] "=m" (*pdst_m) \
109 : [val_m] "r" (val_m) \
113 #define SW(val, pdst) \
115 uint8_t *pdst_m = (uint8_t *) (pdst); \
116 uint32_t val_m = (val); \
119 "sw %[val_m], %[pdst_m] \n\t" \
121 : [pdst_m] "=m" (*pdst_m) \
122 : [val_m] "r" (val_m) \
126 #define SD(val, pdst) \
128 uint8_t *pdst_m = (uint8_t *) (pdst); \
129 uint64_t val_m = (val); \
132 "sd %[val_m], %[pdst_m] \n\t" \
134 : [pdst_m] "=m" (*pdst_m) \
135 : [val_m] "r" (val_m) \
138 #else // !(__mips_isa_rev >= 6)
141 uint8_t *psrc_m = (uint8_t *) (psrc); \
145 "ulw %[val_m], %[psrc_m] \n\t" \
147 : [val_m] "=r" (val_m) \
148 : [psrc_m] "m" (*psrc_m) \
157 uint8_t *psrc_m = (uint8_t *) (psrc); \
158 uint64_t val_m = 0; \
161 "uld %[val_m], %[psrc_m] \n\t" \
163 : [val_m] "=r" (val_m) \
164 : [psrc_m] "m" (*psrc_m) \
169 #else // !(__mips == 64)
172 uint8_t *psrc_m1 = (uint8_t *) (psrc); \
173 uint32_t val0_m, val1_m; \
174 uint64_t val_m = 0; \
176 val0_m = LW(psrc_m1); \
177 val1_m = LW(psrc_m1 + 4); \
179 val_m = (uint64_t) (val1_m); \
180 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
181 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
185 #endif // (__mips == 64)
/* Pre-R6 stores: ush/usw assembler macros expand to swl/swr pairs for
 * unaligned destinations; 64-bit SD is split into two 32-bit SWs,
 * low word first (little-endian layout). */
187 #define SH(val, pdst) \
189 uint8_t *pdst_m = (uint8_t *) (pdst); \
190 uint16_t val_m = (val); \
193 "ush %[val_m], %[pdst_m] \n\t" \
195 : [pdst_m] "=m" (*pdst_m) \
196 : [val_m] "r" (val_m) \
200 #define SW(val, pdst) \
202 uint8_t *pdst_m = (uint8_t *) (pdst); \
203 uint32_t val_m = (val); \
206 "usw %[val_m], %[pdst_m] \n\t" \
208 : [pdst_m] "=m" (*pdst_m) \
209 : [val_m] "r" (val_m) \
213 #define SD(val, pdst) \
215 uint8_t *pdst_m1 = (uint8_t *) (pdst); \
216 uint32_t val0_m, val1_m; \
218 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
219 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
221 SW(val0_m, pdst_m1); \
222 SW(val1_m, pdst_m1 + 4); \
224 #endif // (__mips_isa_rev >= 6)
226 /* Description : Load 4 words with stride
227 Arguments : Inputs - psrc (source pointer to load from)
229 Outputs - out0, out1, out2, out3
230 Details : Loads word in 'out0' from (psrc)
231 Loads word in 'out1' from (psrc + stride)
232 Loads word in 'out2' from (psrc + 2 * stride)
233 Loads word in 'out3' from (psrc + 3 * stride)
/* Builds on the scalar LW above, so unaligned psrc is handled on all
 * ISA revisions.  stride is in bytes. */
235 #define LW4(psrc, stride, out0, out1, out2, out3) \
238 out1 = LW((psrc) + stride); \
239 out2 = LW((psrc) + 2 * stride); \
240 out3 = LW((psrc) + 3 * stride); \
243 /* Description : Store 4 words with stride
244 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
245 Details : Stores word from 'in0' to (pdst)
246 Stores word from 'in1' to (pdst + stride)
247 Stores word from 'in2' to (pdst + 2 * stride)
248 Stores word from 'in3' to (pdst + 3 * stride)
250 #define SW4(in0, in1, in2, in3, pdst, stride) \
253 SW(in1, (pdst) + stride); \
254 SW(in2, (pdst) + 2 * stride); \
255 SW(in3, (pdst) + 3 * stride); \
258 /* Description : Store 4 double words with stride
259 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
260 Details : Stores double word from 'in0' to (pdst)
261 Stores double word from 'in1' to (pdst + stride)
262 Stores double word from 'in2' to (pdst + 2 * stride)
263 Stores double word from 'in3' to (pdst + 3 * stride)
265 #define SD4(in0, in1, in2, in3, pdst, stride) \
268 SD(in1, (pdst) + stride); \
269 SD(in2, (pdst) + 2 * stride); \
270 SD(in3, (pdst) + 3 * stride); \
273 /* Description : Load vectors with 16 byte elements with stride
274 Arguments : Inputs - psrc (source pointer to load from)
277 Return Type - as per RTYPE
278 Details : Loads 16 byte elements in 'out0' from (psrc)
279 Loads 16 byte elements in 'out1' from (psrc + stride)
/* LD_Bn loads n consecutive byte vectors spaced 'stride' bytes apart.
 * The larger variants compose the smaller ones to keep the address
 * arithmetic in one place. */
281 #define LD_B2(RTYPE, psrc, stride, out0, out1) \
283 out0 = LD_B(RTYPE, (psrc)); \
284 out1 = LD_B(RTYPE, (psrc) + stride); \
286 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
287 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
289 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
291 LD_B2(RTYPE, (psrc), stride, out0, out1); \
292 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
294 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
296 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
298 LD_B2(RTYPE, (psrc), stride, out0, out1); \
299 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
301 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
302 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
304 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
306 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
307 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
309 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
310 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
312 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
314 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
315 LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
317 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
319 #define LD_B7(RTYPE, psrc, stride, \
320 out0, out1, out2, out3, out4, out5, out6) \
322 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
323 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
325 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
327 #define LD_B8(RTYPE, psrc, stride, \
328 out0, out1, out2, out3, out4, out5, out6, out7) \
330 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
331 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
333 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
334 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
336 /* Description : Load vectors with 8 halfword elements with stride
337 Arguments : Inputs - psrc (source pointer to load from)
340 Details : Loads 8 halfword elements in 'out0' from (psrc)
341 Loads 8 halfword elements in 'out1' from (psrc + stride)
/* LD_Hn: same pattern as LD_Bn but for halfword-element vectors. */
343 #define LD_H2(RTYPE, psrc, stride, out0, out1) \
345 out0 = LD_H(RTYPE, (psrc)); \
346 out1 = LD_H(RTYPE, (psrc) + (stride)); \
348 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
349 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
351 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
353 LD_H2(RTYPE, (psrc), stride, out0, out1); \
354 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
356 #define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
357 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
359 #define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
361 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
362 LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
364 #define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
365 #define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
367 #define LD_H8(RTYPE, psrc, stride, \
368 out0, out1, out2, out3, out4, out5, out6, out7) \
370 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
371 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
373 #define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
374 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
376 /* Description : Store vectors of 16 byte elements with stride
377 Arguments : Inputs - in0, in1, stride
378 Outputs - pdst (destination pointer to store to)
379 Details : Stores 16 byte elements from 'in0' to (pdst)
380 Stores 16 byte elements from 'in1' to (pdst + stride)
/* ST_Bn: store n byte vectors 'stride' bytes apart; mirrors LD_Bn. */
382 #define ST_B2(RTYPE, in0, in1, pdst, stride) \
384 ST_B(RTYPE, in0, (pdst)); \
385 ST_B(RTYPE, in1, (pdst) + stride); \
387 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
388 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
390 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
392 ST_B2(RTYPE, in0, in1, (pdst), stride); \
393 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
395 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
396 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
398 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
401 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
402 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
404 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
406 /* Description : Store vectors of 8 halfword elements with stride
407 Arguments : Inputs - in0, in1, stride
408 Outputs - pdst (destination pointer to store to)
409 Details : Stores 8 halfword elements from 'in0' to (pdst)
410 Stores 8 halfword elements from 'in1' to (pdst + stride)
412 #define ST_H2(RTYPE, in0, in1, pdst, stride) \
414 ST_H(RTYPE, in0, (pdst)); \
415 ST_H(RTYPE, in1, (pdst) + stride); \
417 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
418 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
420 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
422 ST_H2(RTYPE, in0, in1, (pdst), stride); \
423 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
425 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
427 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
429 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
430 ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
432 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
434 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
436 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
437 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
439 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
441 /* Description : Store vectors of word elements with stride
442 Arguments : Inputs - in0, in1, stride
443 Outputs - pdst (destination pointer to store to)
444 Return Type - signed word
445 Details : Stores 4 word elements from 'in0' to (pdst)
446 Stores 4 word elements from 'in1' to (pdst + stride)
448 #define ST_SW2(in0, in1, pdst, stride) \
450 ST_SW(in0, (pdst)); \
451 ST_SW(in1, (pdst) + stride); \
454 /* Description : Store as 2x4 byte block to destination memory from input vector
455 Arguments : Inputs - in, stidx, pdst, stride
456 Return Type - unsigned byte
457 Details : Index stidx halfword element from 'in' vector is copied and
459 Index stidx+1 halfword element from 'in' vector is copied and
460 stored on second line
461 Index stidx+2 halfword element from 'in' vector is copied and
463 Index stidx+3 halfword element from 'in' vector is copied and
464 stored on fourth line
/* Extracts 4 consecutive halfword lanes (starting at stidx) and writes
 * them as 2-byte rows via the unaligned-safe SH macro. */
466 #define ST2x4_UB(in, stidx, pdst, stride) \
468 uint16_t out0_m, out1_m, out2_m, out3_m; \
469 uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \
471 out0_m = __msa_copy_u_h((v8i16) in, (stidx)); \
472 out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1)); \
473 out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2)); \
474 out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3)); \
476 SH(out0_m, pblk_2x4_m); \
477 SH(out1_m, pblk_2x4_m + stride); \
478 SH(out2_m, pblk_2x4_m + 2 * stride); \
479 SH(out3_m, pblk_2x4_m + 3 * stride); \
482 /* Description : Store as 4x2 byte block to destination memory from input vector
483 Arguments : Inputs - in, pdst, stride
484 Return Type - unsigned byte
485 Details : Index 0 word element from input vector is copied and stored
487 Index 1 word element from input vector is copied and stored
490 #define ST4x2_UB(in, pdst, stride) \
492 uint32_t out0_m, out1_m; \
493 uint8_t *pblk_4x2_m = (uint8_t *) (pdst); \
495 out0_m = __msa_copy_u_w((v4i32) in, 0); \
496 out1_m = __msa_copy_u_w((v4i32) in, 1); \
498 SW(out0_m, pblk_4x2_m); \
499 SW(out1_m, pblk_4x2_m + stride); \
502 /* Description : Store as 4x4 byte block to destination memory from input vector
503 Arguments : Inputs - in0, in1, pdst, stride
504 Return Type - unsigned byte
505 Details : Idx0 word element from input vector 'in0' is copied and stored
507 Idx1 word element from input vector 'in0' is copied and stored
509 Idx2 word element from input vector 'in1' is copied and stored
511 Idx3 word element from input vector 'in1' is copied and stored
/* Caller chooses which word lane of each source vector goes to each of
 * the four output rows via idx0..idx3. */
514 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
516 uint32_t out0_m, out1_m, out2_m, out3_m; \
517 uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \
519 out0_m = __msa_copy_u_w((v4i32) in0, idx0); \
520 out1_m = __msa_copy_u_w((v4i32) in0, idx1); \
521 out2_m = __msa_copy_u_w((v4i32) in1, idx2); \
522 out3_m = __msa_copy_u_w((v4i32) in1, idx3); \
524 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
526 #define ST4x8_UB(in0, in1, pdst, stride) \
528 uint8_t *pblk_4x8 = (uint8_t *) (pdst); \
530 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
531 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
534 /* Description : Store as 6x4 byte block to destination memory from input
536 Arguments : Inputs - in0, in1, pdst, stride
537 Return Type - unsigned byte
538 Details : Index 0 word element from input vector 'in0' is copied and
539 stored on first line followed by index 2 halfword element
540 Index 2 word element from input vector 'in0' is copied and
541 stored on second line followed by index 2 halfword element
542 Index 0 word element from input vector 'in1' is copied and
543 stored on third line followed by index 2 halfword element
544 Index 2 word element from input vector 'in1' is copied and
545 stored on fourth line followed by index 2 halfword element
/* Each 6-byte row = one 4-byte word (SW) plus the following 2-byte
 * halfword (SH) at offset +4; two rows come from each input vector. */
547 #define ST6x4_UB(in0, in1, pdst, stride) \
549 uint32_t out0_m, out1_m, out2_m, out3_m; \
550 uint16_t out4_m, out5_m, out6_m, out7_m; \
551 uint8_t *pblk_6x4_m = (uint8_t *) (pdst); \
553 out0_m = __msa_copy_u_w((v4i32) in0, 0); \
554 out1_m = __msa_copy_u_w((v4i32) in0, 2); \
555 out2_m = __msa_copy_u_w((v4i32) in1, 0); \
556 out3_m = __msa_copy_u_w((v4i32) in1, 2); \
558 out4_m = __msa_copy_u_h((v8i16) in0, 2); \
559 out5_m = __msa_copy_u_h((v8i16) in0, 6); \
560 out6_m = __msa_copy_u_h((v8i16) in1, 2); \
561 out7_m = __msa_copy_u_h((v8i16) in1, 6); \
563 SW(out0_m, pblk_6x4_m); \
564 SH(out4_m, (pblk_6x4_m + 4)); \
565 pblk_6x4_m += stride; \
566 SW(out1_m, pblk_6x4_m); \
567 SH(out5_m, (pblk_6x4_m + 4)); \
568 pblk_6x4_m += stride; \
569 SW(out2_m, pblk_6x4_m); \
570 SH(out6_m, (pblk_6x4_m + 4)); \
571 pblk_6x4_m += stride; \
572 SW(out3_m, pblk_6x4_m); \
573 SH(out7_m, (pblk_6x4_m + 4)); \
576 /* Description : Store as 8x2 byte block to destination memory from input vector
577 Arguments : Inputs - in, pdst, stride
578 Details : Index 0 double word element from input vector 'in' is copied
579 and stored to destination memory at (pdst)
580 Index 1 double word element from input vector 'in' is copied
581 and stored to destination memory at (pdst + stride)
/* 8-byte rows: each row is one doubleword lane written via the
 * unaligned-safe SD macro. */
583 #define ST8x2_UB(in, pdst, stride) \
585 uint64_t out0_m, out1_m; \
586 uint8_t *pblk_8x2_m = (uint8_t *) (pdst); \
588 out0_m = __msa_copy_u_d((v2i64) in, 0); \
589 out1_m = __msa_copy_u_d((v2i64) in, 1); \
591 SD(out0_m, pblk_8x2_m); \
592 SD(out1_m, pblk_8x2_m + stride); \
595 /* Description : Store as 8x4 byte block to destination memory from input
597 Arguments : Inputs - in0, in1, pdst, stride
598 Details : Index 0 double word element from input vector 'in0' is copied
599 and stored to destination memory at (pblk_8x4_m)
600 Index 1 double word element from input vector 'in0' is copied
601 and stored to destination memory at (pblk_8x4_m + stride)
602 Index 0 double word element from input vector 'in1' is copied
603 and stored to destination memory at (pblk_8x4_m + 2 * stride)
604 Index 1 double word element from input vector 'in1' is copied
605 and stored to destination memory at (pblk_8x4_m + 3 * stride)
607 #define ST8x4_UB(in0, in1, pdst, stride) \
609 uint64_t out0_m, out1_m, out2_m, out3_m; \
610 uint8_t *pblk_8x4_m = (uint8_t *) (pdst); \
612 out0_m = __msa_copy_u_d((v2i64) in0, 0); \
613 out1_m = __msa_copy_u_d((v2i64) in0, 1); \
614 out2_m = __msa_copy_u_d((v2i64) in1, 0); \
615 out3_m = __msa_copy_u_d((v2i64) in1, 1); \
617 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
619 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \
621 uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \
623 ST8x4_UB(in0, in1, pblk_8x8_m, stride); \
624 ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \
/* 12-byte rows: left 8 bytes from in0/in1 (ST8x4_UB), right 4 bytes
 * from successive word lanes of in2 (ST4x4_UB at column offset 8). */
626 #define ST12x4_UB(in0, in1, in2, pdst, stride) \
628 uint8_t *pblk_12x4_m = (uint8_t *) (pdst); \
631 ST8x4_UB(in0, in1, pblk_12x4_m, stride); \
633 ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride); \
636 /* Description : Store as 12x8 byte block to destination memory from
638 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
639 Details : Index 0 double word element from input vector 'in0' is copied
640 and stored to destination memory at (pblk_12x8_m) followed by
641 index 2 word element from same input vector 'in0' at
643 Similar to remaining lines
/* One input vector per output row: doubleword lane 0 supplies the left
 * 8 bytes, word lane 2 supplies the right 4 bytes at offset +8. */
645 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
647 uint64_t out0_m, out1_m, out2_m, out3_m; \
648 uint64_t out4_m, out5_m, out6_m, out7_m; \
649 uint32_t out8_m, out9_m, out10_m, out11_m; \
650 uint32_t out12_m, out13_m, out14_m, out15_m; \
651 uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
653 out0_m = __msa_copy_u_d((v2i64) in0, 0); \
654 out1_m = __msa_copy_u_d((v2i64) in1, 0); \
655 out2_m = __msa_copy_u_d((v2i64) in2, 0); \
656 out3_m = __msa_copy_u_d((v2i64) in3, 0); \
657 out4_m = __msa_copy_u_d((v2i64) in4, 0); \
658 out5_m = __msa_copy_u_d((v2i64) in5, 0); \
659 out6_m = __msa_copy_u_d((v2i64) in6, 0); \
660 out7_m = __msa_copy_u_d((v2i64) in7, 0); \
662 out8_m = __msa_copy_u_w((v4i32) in0, 2); \
663 out9_m = __msa_copy_u_w((v4i32) in1, 2); \
664 out10_m = __msa_copy_u_w((v4i32) in2, 2); \
665 out11_m = __msa_copy_u_w((v4i32) in3, 2); \
666 out12_m = __msa_copy_u_w((v4i32) in4, 2); \
667 out13_m = __msa_copy_u_w((v4i32) in5, 2); \
668 out14_m = __msa_copy_u_w((v4i32) in6, 2); \
669 out15_m = __msa_copy_u_w((v4i32) in7, 2); \
671 SD(out0_m, pblk_12x8_m); \
672 SW(out8_m, pblk_12x8_m + 8); \
673 pblk_12x8_m += stride; \
674 SD(out1_m, pblk_12x8_m); \
675 SW(out9_m, pblk_12x8_m + 8); \
676 pblk_12x8_m += stride; \
677 SD(out2_m, pblk_12x8_m); \
678 SW(out10_m, pblk_12x8_m + 8); \
679 pblk_12x8_m += stride; \
680 SD(out3_m, pblk_12x8_m); \
681 SW(out11_m, pblk_12x8_m + 8); \
682 pblk_12x8_m += stride; \
683 SD(out4_m, pblk_12x8_m); \
684 SW(out12_m, pblk_12x8_m + 8); \
685 pblk_12x8_m += stride; \
686 SD(out5_m, pblk_12x8_m); \
687 SW(out13_m, pblk_12x8_m + 8); \
688 pblk_12x8_m += stride; \
689 SD(out6_m, pblk_12x8_m); \
690 SW(out14_m, pblk_12x8_m + 8); \
691 pblk_12x8_m += stride; \
692 SD(out7_m, pblk_12x8_m); \
693 SW(out15_m, pblk_12x8_m + 8); \
696 /* Description : Immediate number of columns to slide with zero
697 Arguments : Inputs - in0, in1, slide_val
699 Return Type - as per RTYPE
700 Details : Byte elements from 'zero_m' vector are slide into 'in0' by
701 number of elements specified by 'slide_val'
/* slide_val must be an immediate (0..15) per the sldi.b instruction. */
703 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
705 v16i8 zero_m = { 0 }; \
706 out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
707 out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
709 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
711 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
712 out0, out1, out2, out3, slide_val) \
714 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
715 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
717 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
719 /* Description : Shuffle byte vector elements as per mask vector
720 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
722 Return Type - as per RTYPE
723 Details : Selective byte elements from in0 & in1 are copied to out0 as
724 per control vector mask0
725 Selective byte elements from in2 & in3 are copied to out1 as
726 per control vector mask1
/* Note the operand order: __msa_vshf_b takes (mask, hi_src, lo_src),
 * so inN+1 is passed before inN. */
728 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
730 out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
731 out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
733 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
734 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
735 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
736 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
738 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
741 VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
742 out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
744 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
/* Four shuffles over the same (in0, in1) pair with four masks. */
746 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
747 out0, out1, out2, out3) \
749 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
750 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
752 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
754 /* Description : Shuffle byte vector elements as per mask vector
755 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
757 Return Type - as per RTYPE
758 Details : Selective byte elements from in0 & in1 are copied to out0 as
759 per control vector mask0
760 Selective byte elements from in2 & in3 are copied to out1 as
761 per control vector mask1
763 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
765 out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
766 out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
768 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
770 /* Description : Dot product of byte vector elements
771 Arguments : Inputs - mult0, mult1
774 Return Type - signed halfword
775 Details : Signed byte elements from mult0 are multiplied with
776 signed byte elements from cnst0 producing a result
777 twice the size of input i.e. signed halfword.
778 Then this multiplication results of adjacent odd-even elements
779 are added together and stored to the out vector
780 (2 signed halfword results)
782 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
784 out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
785 out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
787 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
789 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
792 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
793 out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
795 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
797 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
798 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
800 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
801 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
803 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
805 /* Description : Dot product of halfword vector elements
806 Arguments : Inputs - mult0, mult1
809 Return Type - signed word
810 Details : Signed halfword elements from mult0 are multiplied with
811 signed halfword elements from cnst0 producing a result
812 twice the size of input i.e. signed word.
813 Then this multiplication results of adjacent odd-even elements
814 are added together and stored to the out vector
815 (2 signed word results)
817 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
819 out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
820 out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
822 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
824 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
825 cnst0, cnst1, cnst2, cnst3, \
826 out0, out1, out2, out3) \
828 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
829 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
831 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
833 /* Description : Dot product & addition of byte vector elements
834 Arguments : Inputs - mult0, mult1
837 Return Type - signed halfword
838 Details : Signed byte elements from mult0 are multiplied with
839 signed byte elements from cnst0 producing a result
840 twice the size of input i.e. signed halfword.
841 Then this multiplication results of adjacent odd-even elements
842 are added to the out vector
843 (2 signed halfword results)
/* DPADD accumulates: out0/out1 are read-modify-write operands. */
845 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
847 out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \
848 (v16i8) mult0, (v16i8) cnst0); \
849 out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \
850 (v16i8) mult1, (v16i8) cnst1); \
852 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
854 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
855 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
857 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
858 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
860 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
862 /* Description : Dot product & addition of halfword vector elements
863 Arguments : Inputs - mult0, mult1
866 Return Type - signed word
867 Details : Signed halfword elements from mult0 are multiplied with
868 signed halfword elements from cnst0 producing a result
869 twice the size of input i.e. signed word.
870 Then this multiplication results of adjacent odd-even elements
871 are added to the out vector
872 (2 signed word results)
874 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
876 out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
877 (v8i16) mult0, (v8i16) cnst0); \
878 out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
879 (v8i16) mult1, (v8i16) cnst1); \
881 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
883 /* Description : Clips all halfword elements of input vector between min & max
884 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
885 Arguments : Inputs - in (input vector)
886 - min (min threshold)
887 - max (max threshold)
888 Outputs - out_m (output vector with clipped elements)
889 Return Type - signed halfword
/* NOTE(review): the usage below (in0 = CLIP_SH_0_255(in0);) implies
 * these clip macros yield out_m as a GNU statement-expression value --
 * the ({ ... }) wrapper lines are not visible in this chunk; confirm
 * against the full file. */
891 #define CLIP_SH(in, min, max) \
895 out_m = __msa_max_s_h((v8i16) min, (v8i16) in); \
896 out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \
900 /* Description : Clips all signed halfword elements of input vector
902 Arguments : Inputs - in (input vector)
903 Outputs - out_m (output vector with clipped elements)
904 Return Type - signed halfword
/* Clip to [0, 255]: max with 0, then min with splatted 255. */
906 #define CLIP_SH_0_255(in) \
908 v8i16 max_m = __msa_ldi_h(255); \
911 out_m = __msa_maxi_s_h((v8i16) in, 0); \
912 out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
915 #define CLIP_SH2_0_255(in0, in1) \
917 in0 = CLIP_SH_0_255(in0); \
918 in1 = CLIP_SH_0_255(in1); \
920 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
922 CLIP_SH2_0_255(in0, in1); \
923 CLIP_SH2_0_255(in2, in3); \
926 /* Description : Clips all signed word elements of input vector
928 Arguments : Inputs - in (input vector)
929 Outputs - out_m (output vector with clipped elements)
930 Return Type - signed word
932 #define CLIP_SW_0_255(in) \
934 v4i32 max_m = __msa_ldi_w(255); \
937 out_m = __msa_maxi_s_w((v4i32) in, 0); \
938 out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
942 /* Description : Horizontal subtraction of unsigned byte vector elements
943 Arguments : Inputs - in0, in1
945 Return Type - as per RTYPE
946 Details : Each unsigned odd byte element from 'in0' is subtracted from
947 even unsigned byte element from 'in0' (pairwise) and the
948 halfword result is stored in 'out0'
/* NOTE(review): both operands of __msa_hsub_u_h are the SAME input
 * vector here (pairwise within one vector) -- intentional, but worth
 * confirming since hsub normally takes two distinct sources. */
950 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
952 out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
953 out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
955 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
956 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
958 /* Description : Interleave even halfword elements from vectors
959 Arguments : Inputs - in0, in1, in2, in3
961 Return Type - as per RTYPE
962 Details : Even halfword elements of 'in0' and even halfword
963 elements of 'in1' are interleaved and copied to 'out0'
964 Even halfword elements of 'in2' and even halfword
965 elements of 'in3' are interleaved and copied to 'out1'
967 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
969 out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
970 out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
972 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
974 /* Description : Interleave even word elements from vectors
975 Arguments : Inputs - in0, in1, in2, in3
977 Return Type - as per RTYPE
978 Details : Even word elements of 'in0' and even word
979 elements of 'in1' are interleaved and copied to 'out0'
980 Even word elements of 'in2' and even word
981 elements of 'in3' are interleaved and copied to 'out1'
983 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
985 out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
986 out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
988 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
990 /* Description : Interleave even double word elements from vectors
991 Arguments : Inputs - in0, in1, in2, in3
993 Return Type - as per RTYPE
994 Details : Even double word elements of 'in0' and even double word
995 elements of 'in1' are interleaved and copied to 'out0'
996 Even double word elements of 'in2' and even double word
997 elements of 'in3' are interleaved and copied to 'out1'
999 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1001 out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
1002 out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
1004 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; left half of byte
                 elements of 'in2' and 'in3' are interleaved and copied
                 to 'out1'
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);       \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);       \
}
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVL_B2. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; left half of halfword
                 elements of 'in2' and 'in3' are interleaved and copied
                 to 'out1'
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);       \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);       \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* 4-pair variant of ILVL_H2. */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; left half of word
                 elements of 'in2' and 'in3' are interleaved and copied
                 to 'out1'
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);       \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);       \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; right half of byte
                 elements of 'in2' and 'in3' are interleaved and copied
                 to 'out1'. Similar for other pairs.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);       \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);       \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVR_B2. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; right half of halfword
                 elements of 'in2' and 'in3' are interleaved and copied
                 to 'out1'. Similar for other pairs.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);       \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);       \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* 3-pair variant of ILVR_H2. */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* 4-pair variant of ILVR_H2. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
/* Interleave right half of word elements: right words of 'in0'/'in1'
   -> 'out0', of 'in2'/'in3' -> 'out1'. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);       \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);       \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

/* 4-pair variant of ILVR_W2. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned double word
   Details     : Right half of double word elements of 'in0' and 'in1'
                 are interleaved and copied to 'out0'; right half of
                 double word elements of 'in2' and 'in3' are interleaved
                 and copied to 'out1'
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)               \
{                                                                    \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));       \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));       \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* 3-pair variant of ILVR_D2. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* 4-pair variant of ILVR_D2. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                out0, out1, out2, out3)                          \
{                                                                \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'; left half of elements
                 from 'in0' and 'in1' are interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                    \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);       \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);       \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

/* Halfword variant of ILVRL_B2. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                    \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);       \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);       \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                    \
{                                                                \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);       \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);       \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output
                 vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Maximum of signed halfword element values from 'in0'
                 and 'max_val' are written back to 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* 4-vector variant of MAXI_SH2. */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)     \
{                                                        \
    MAXI_SH2(RTYPE, in0, in1, max_val);                  \
    MAXI_SH2(RTYPE, in2, in3, max_val);                  \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the maximum
                 unsigned value of (sat_val + 1) bits; the element data
                 width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each unsigned halfword element from 'in0' is saturated
                 to the value representable in (sat_val + 1) bits;
                 results are written back to the original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                \
{                                                        \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);   \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);   \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

/* 4-vector variant of SAT_UH2. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)      \
{                                                        \
    SAT_UH2(RTYPE, in0, in1, sat_val);                   \
    SAT_UH2(RTYPE, in2, in3, sat_val);                   \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
/* Description : Saturate the signed halfword element values to the
                 range representable in (sat_val + 1) bits; the element
                 data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - signed halfword
   Details     : Each signed halfword element from 'in0' is saturated
                 to the value representable in (sat_val + 1) bits;
                 results are written back to the original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                \
{                                                        \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);   \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);   \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* 3-vector variant of SAT_SH2. */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)           \
{                                                        \
    SAT_SH2(RTYPE, in0, in1, sat_val);                   \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);   \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* 4-vector variant of SAT_SH2. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)      \
{                                                        \
    SAT_SH2(RTYPE, in0, in1, sat_val);                   \
    SAT_SH2(RTYPE, in2, in3, sat_val);                   \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to
                 all elements in 'out0' vector; valid index range for
                 halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)     \
{                                                        \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);     \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);     \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* 4-index variant of SPLATI_H2. */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,     \
                  out0, out1, out2, out3)                \
{                                                        \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);        \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);        \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to
                 all elements in 'out0'; 'stidx + 1' element value is
                 replicated to all elements in 'out1'; valid index range
                 for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)              \
{                                                            \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);        \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx + 1));  \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Replicate all four word elements of 'in' into out0..out3. */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)     \
{                                                        \
    SPLATI_W2(RTYPE, in, 0, out0, out1);                 \
    SPLATI_W2(RTYPE, in, 2, out2, out3);                 \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half
                 of 'out0' and even byte elements of 'in1' to the right
                 half; likewise 'in2'/'in3' into 'out1'
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* 3-pair variant of PCKEV_B2. */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* 4-pair variant of PCKEV_B2. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left
                 half of 'out0' and even halfword elements of 'in1' to
                 the right half; likewise 'in2'/'in3' into 'out1'
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* 4-pair variant of PCKEV_H2. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from the input vectors is
                 logically xor'ed with 128 and the result is stored back
                 in place. Similar for other vector counts.
*/
#define XORI_B2_128(RTYPE, in0, in1)                     \
{                                                        \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);        \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);        \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)                \
{                                                        \
    XORI_B2_128(RTYPE, in0, in1);                        \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);        \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)           \
{                                                        \
    XORI_B2_128(RTYPE, in0, in1);                        \
    XORI_B2_128(RTYPE, in2, in3);                        \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)      \
{                                                        \
    XORI_B3_128(RTYPE, in0, in1, in2);                   \
    XORI_B2_128(RTYPE, in3, in4);                        \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
{                                                        \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);              \
    XORI_B2_128(RTYPE, in4, in5);                        \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Description : Addition of signed halfword elements with signed
                 saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'; the result is saturated
                 between -32768 and +32767. Similar for other pairs.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* 4-pair variant of ADDS_SH2. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all
                 data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : Each element of vector 'in0' is left shifted by 'shift'
                 and the result is written back to 'in0'; similar for
                 the other vectors
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
/* Description : Arithmetic shift right all elements of vector (generic
                 for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : Each element of vector 'in0' is right shifted by
                 'shift' (a GP variable) and the result is written back
                 to 'in0'; similar for the other vectors
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each element of vector 'in0' is shifted right logical
                 by the number of bits the respective element of vector
                 'shift' holds, and the result is written back to 'in0'
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right
                 arithmetic by the number of bits the respective element
                 of vector 'shift' holds; the last discarded bit is
                 added to the shifted value for rounding and the result
                 is written back to 'in0'
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* 3-vector variant of SRAR_H2. */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

/* 4-vector variant of SRAR_H2. */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    SRAR_H2(RTYPE, in2, in3, shift);                         \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right
                 arithmetic by the number of bits the respective element
                 of vector 'shift' holds; the last discarded bit is
                 added to the shifted value for rounding and the result
                 is written back to 'in0'
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

/* 4-vector variant of SRAR_W2. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    SRAR_W2(RTYPE, in0, in1, shift);                         \
    SRAR_W2(RTYPE, in2, in3, shift);                         \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right
                 arithmetic by the immediate 'shift'; the last discarded
                 bit is added to the shifted value for rounding and the
                 result is written back to 'in0'
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
{                                                    \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right
                 arithmetic by the immediate 'shift'; the last discarded
                 bit is added to the shifted value for rounding and the
                 result is written back to 'in0'
*/
#define SRARI_W2(RTYPE, in0, in1, shift)             \
{                                                    \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* 4-vector variant of SRARI_W2. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)   \
{                                                    \
    SRARI_W2(RTYPE, in0, in1, shift);                \
    SRARI_W2(RTYPE, in2, in3, shift);                \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with the
                 corresponding element from 'in1' and the result is
                 written to 'out0'; similar for the other pair
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}

/* 4-pair variant of MUL2. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from the 2 pairs of vectors is added and
                 the 2 results are written to 'out0' and 'out1'
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}

/* 4-pair variant of ADD2. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in (1 unsigned byte vector)
                 Outputs - out0, out1 (2 unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in
                 'out0'; zero extended left half is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
/* Description : Sign extend halfword elements from input vector and
                 return the result in a pair of vectors
   Arguments   : Inputs  - in (1 signed halfword vector)
                 Outputs - out0, out1 (2 sign extended word vectors)
                 Return Type - signed word
   Details     : The sign bits of halfword elements from 'in' are
                 extracted (as all-ones/all-zeros masks) and interleaved
                 right with 'in' to generate 4 signed word elements in
                 'out0', then interleaved left to generate 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)           \
{                                             \
    v8i16 tmp_m;                              \
                                              \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);    \
    ILVRL_H2_SW(tmp_m, in, out0, out1);       \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 .. in7 (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
/* Description : Transposes 16x8 block into 8x16 with byte elements in
                 vectors
   Arguments   : Inputs  - in0 .. in15
                 Outputs - out0 .. out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Pack even elements of input vectors and xor with 128
   Arguments   : Inputs - in0, in1
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are
                 packed together into one vector and the result is
                 xor'ed with 128 to shift the range from signed to
                 unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Pack even byte elements, extract words at indices 0 and
                 2 from the pair of results and store 4 words to the
                 destination memory with the given stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
1853 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */