/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
/* Vector load: read one full MSA vector from memory at 'psrc'.
   RTYPE selects the element view of the vector (byte/halfword/word);
   the _U*/_S* wrappers pick the unsigned/signed variant. */
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
/* Vector store: write one full MSA vector 'in' to memory at 'pdst'. */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
51 #if (__mips_isa_rev >= 6)
54 uint8_t *psrc_m = (uint8_t *) (psrc); \
58 "lw %[val_m], %[psrc_m] \n\t" \
60 : [val_m] "=r" (val_m) \
61 : [psrc_m] "m" (*psrc_m) \
70 uint8_t *psrc_m = (uint8_t *) (psrc); \
74 "ld %[val_m], %[psrc_m] \n\t" \
76 : [val_m] "=r" (val_m) \
77 : [psrc_m] "m" (*psrc_m) \
82 #else // !(__mips == 64)
85 uint8_t *psrc_m = (uint8_t *) (psrc); \
86 uint32_t val0_m, val1_m; \
89 val0_m = LW(psrc_m); \
90 val1_m = LW(psrc_m + 4); \
92 val_m = (uint64_t) (val1_m); \
93 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
94 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
98 #endif // (__mips == 64)
100 #define SH(val, pdst) \
102 uint8_t *pdst_m = (uint8_t *) (pdst); \
103 uint16_t val_m = (val); \
106 "sh %[val_m], %[pdst_m] \n\t" \
108 : [pdst_m] "=m" (*pdst_m) \
109 : [val_m] "r" (val_m) \
113 #define SW(val, pdst) \
115 uint8_t *pdst_m = (uint8_t *) (pdst); \
116 uint32_t val_m = (val); \
119 "sw %[val_m], %[pdst_m] \n\t" \
121 : [pdst_m] "=m" (*pdst_m) \
122 : [val_m] "r" (val_m) \
126 #define SD(val, pdst) \
128 uint8_t *pdst_m = (uint8_t *) (pdst); \
129 uint64_t val_m = (val); \
132 "sd %[val_m], %[pdst_m] \n\t" \
134 : [pdst_m] "=m" (*pdst_m) \
135 : [val_m] "r" (val_m) \
138 #else // !(__mips_isa_rev >= 6)
141 uint8_t *psrc_m = (uint8_t *) (psrc); \
145 "ulw %[val_m], %[psrc_m] \n\t" \
147 : [val_m] "=r" (val_m) \
148 : [psrc_m] "m" (*psrc_m) \
157 uint8_t *psrc_m = (uint8_t *) (psrc); \
158 uint64_t val_m = 0; \
161 "uld %[val_m], %[psrc_m] \n\t" \
163 : [val_m] "=r" (val_m) \
164 : [psrc_m] "m" (*psrc_m) \
169 #else // !(__mips == 64)
172 uint8_t *psrc_m1 = (uint8_t *) (psrc); \
173 uint32_t val0_m, val1_m; \
174 uint64_t val_m = 0; \
176 val0_m = LW(psrc_m1); \
177 val1_m = LW(psrc_m1 + 4); \
179 val_m = (uint64_t) (val1_m); \
180 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
181 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
185 #endif // (__mips == 64)
187 #define SH(val, pdst) \
189 uint8_t *pdst_m = (uint8_t *) (pdst); \
190 uint16_t val_m = (val); \
193 "ush %[val_m], %[pdst_m] \n\t" \
195 : [pdst_m] "=m" (*pdst_m) \
196 : [val_m] "r" (val_m) \
200 #define SW(val, pdst) \
202 uint8_t *pdst_m = (uint8_t *) (pdst); \
203 uint32_t val_m = (val); \
206 "usw %[val_m], %[pdst_m] \n\t" \
208 : [pdst_m] "=m" (*pdst_m) \
209 : [val_m] "r" (val_m) \
213 #define SD(val, pdst) \
215 uint8_t *pdst_m1 = (uint8_t *) (pdst); \
216 uint32_t val0_m, val1_m; \
218 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
219 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
221 SW(val0_m, pdst_m1); \
222 SW(val1_m, pdst_m1 + 4); \
224 #endif // (__mips_isa_rev >= 6)
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc), 'out1' from
                 (psrc + stride), 'out2' from (psrc + 2 * stride),
                 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{  \
    out0 = LW((psrc));  \
    out1 = LW((psrc) + stride);  \
    out2 = LW((psrc) + 2 * stride);  \
    out3 = LW((psrc) + 3 * stride);  \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst), 'in1' to
                 (pdst + stride), 'in2' to (pdst + 2 * stride),
                 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{  \
    SW(in0, (pdst));  \
    SW(in1, (pdst) + stride);  \
    SW(in2, (pdst) + 2 * stride);  \
    SW(in3, (pdst) + 3 * stride);  \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst), 'in1' to
                 (pdst + stride), 'in2' to (pdst + 2 * stride),
                 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{  \
    SD(in0, (pdst));  \
    SD(in1, (pdst) + stride);  \
    SD(in2, (pdst) + 2 * stride);  \
    SD(in3, (pdst) + 3 * stride);  \
}
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Loads 16 byte elements in 'out0' from (psrc),
                 in 'out1' from (psrc + stride), and so on
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{  \
    out0 = LD_B(RTYPE, (psrc));  \
    out1 = LD_B(RTYPE, (psrc) + stride);  \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{  \
    LD_B2(RTYPE, (psrc), stride, out0, out1);  \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);  \
}
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{  \
    LD_B2(RTYPE, (psrc), stride, out0, out1);  \
    LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{  \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);  \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);  \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{  \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);  \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);  \
}
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,  \
              out0, out1, out2, out3, out4, out5, out6)  \
{  \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);  \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,  \
              out0, out1, out2, out3, out4, out5, out6, out7)  \
{  \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);  \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Loads 8 halfword elements in 'out0' from (psrc),
                 in 'out1' from (psrc + stride), and so on
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{  \
    out0 = LD_H(RTYPE, (psrc));  \
    out1 = LD_H(RTYPE, (psrc) + (stride));  \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{  \
    LD_H2(RTYPE, (psrc), stride, out0, out1);  \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{  \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);  \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);  \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,  \
              out0, out1, out2, out3, out4, out5, out6, out7)  \
{  \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);  \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores 16 byte elements from 'in0' to (pdst),
                 from 'in1' to (pdst + stride), and so on
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{  \
    ST_B(RTYPE, in0, (pdst));  \
    ST_B(RTYPE, in1, (pdst) + stride);  \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)  \
{  \
    ST_B2(RTYPE, in0, in1, (pdst), stride);  \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
              pdst, stride)  \
{  \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);  \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores 8 halfword elements from 'in0' to (pdst),
                 from 'in1' to (pdst + stride), and so on
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{  \
    ST_H(RTYPE, in0, (pdst));  \
    ST_H(RTYPE, in1, (pdst) + stride);  \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)  \
{  \
    ST_H2(RTYPE, in0, in1, (pdst), stride);  \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{  \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);  \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);  \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{  \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);  \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
                 Return Type - signed word
   Details     : Stores 4 word elements from 'in0' to (pdst) and
                 from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{  \
    ST_SW(in0, (pdst));  \
    ST_SW(in1, (pdst) + stride);  \
}
/* Description : Store as 2x4 byte block to destination memory from
                 input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : Halfword elements stidx, stidx+1, stidx+2, stidx+3 of
                 'in' are copied to the four successive lines at pdst,
                 pdst + stride, pdst + 2 * stride, pdst + 3 * stride
*/
#define ST2x4_UB(in, stidx, pdst, stride)  \
{  \
    uint16_t out0_m, out1_m, out2_m, out3_m;  \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));  \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
    SH(out0_m, pblk_2x4_m);  \
    SH(out1_m, pblk_2x4_m + stride);  \
    SH(out2_m, pblk_2x4_m + 2 * stride);  \
    SH(out3_m, pblk_2x4_m + 3 * stride);  \
}
/* Description : Store as 4x2 byte block to destination memory from
                 input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element 0 of 'in' is stored at (pdst) and word
                 element 1 at (pdst + stride)
*/
#define ST4x2_UB(in, pdst, stride)  \
{  \
    uint32_t out0_m, out1_m;  \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_w((v4i32) in, 0);  \
    out1_m = __msa_copy_u_w((v4i32) in, 1);  \
    SW(out0_m, pblk_4x2_m);  \
    SW(out1_m, pblk_4x2_m + stride);  \
}
/* Description : Store as 4x4 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element idx0 of 'in0' goes to line 0, idx1 of
                 'in0' to line 1, idx2 of 'in1' to line 2, idx3 of
                 'in1' to line 3 (lines are stride bytes apart)
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{  \
    uint32_t out0_m, out1_m, out2_m, out3_m;  \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);  \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);  \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);  \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);  \
}

/* Store 4x8: lower four lines from 'in0' (words 0..3), upper four
   lines from 'in1' (words 0..3). */
#define ST4x8_UB(in0, in1, pdst, stride)  \
{  \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);  \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);  \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Description : Store as 6x4 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Each line stores 6 bytes: a word element followed by a
                 halfword element.  Lines 0/1 come from 'in0' (word
                 elements 0 and 2, halfword elements 2 and 6), lines
                 2/3 come from 'in1' likewise.
*/
#define ST6x4_UB(in0, in1, pdst, stride)  \
{  \
    uint32_t out0_m, out1_m, out2_m, out3_m;  \
    uint16_t out4_m, out5_m, out6_m, out7_m;  \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);  \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);  \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);  \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);  \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);  \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);  \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);  \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);  \
    SW(out0_m, pblk_6x4_m);  \
    SH(out4_m, (pblk_6x4_m + 4));  \
    pblk_6x4_m += stride;  \
    SW(out1_m, pblk_6x4_m);  \
    SH(out5_m, (pblk_6x4_m + 4));  \
    pblk_6x4_m += stride;  \
    SW(out2_m, pblk_6x4_m);  \
    SH(out6_m, (pblk_6x4_m + 4));  \
    pblk_6x4_m += stride;  \
    SW(out3_m, pblk_6x4_m);  \
    SH(out7_m, (pblk_6x4_m + 4));  \
}
/* Description : Store as 8x2 byte block to destination memory from
                 input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word element 0 of 'in' is stored at (pdst) and
                 double word element 1 at (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)  \
{  \
    uint64_t out0_m, out1_m;  \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    out1_m = __msa_copy_u_d((v2i64) in, 1);  \
    SD(out0_m, pblk_8x2_m);  \
    SD(out1_m, pblk_8x2_m + stride);  \
}
/* Description : Store as 8x4 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word elements 0 and 1 of 'in0' go to lines 0
                 and 1, double word elements 0 and 1 of 'in1' go to
                 lines 2 and 3 (lines are stride bytes apart)
*/
#define ST8x4_UB(in0, in1, pdst, stride)  \
{  \
    uint64_t out0_m, out1_m, out2_m, out3_m;  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);  \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}

/* Store 8x8: two stacked 8x4 blocks. */
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)  \
{  \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);  \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);  \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
/* Store 12x4: an 8x4 block from in0/in1 plus a 4x4 block from in2
   at byte offset 8 of each line. */
#define ST12x4_UB(in0, in1, in2, pdst, stride)  \
{  \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);  \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);  \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
                 pdst, stride
   Details     : Each of the 8 lines stores 12 bytes: double word
                 element 0 of the corresponding input vector followed
                 by word element 2 of the same vector at offset 8
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{  \
    uint64_t out0_m, out1_m, out2_m, out3_m;  \
    uint64_t out4_m, out5_m, out6_m, out7_m;  \
    uint32_t out8_m, out9_m, out10_m, out11_m;  \
    uint32_t out12_m, out13_m, out14_m, out15_m;  \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);  \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);  \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);  \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);  \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);  \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);  \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);  \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);  \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);  \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);  \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);  \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);  \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);  \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);  \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);  \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);  \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);  \
    SD(out0_m, pblk_12x8_m);  \
    SW(out8_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out1_m, pblk_12x8_m);  \
    SW(out9_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out2_m, pblk_12x8_m);  \
    SW(out10_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out3_m, pblk_12x8_m);  \
    SW(out11_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out4_m, pblk_12x8_m);  \
    SW(out12_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out5_m, pblk_12x8_m);  \
    SW(out13_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out6_m, pblk_12x8_m);  \
    SW(out14_m, pblk_12x8_m + 8);  \
    pblk_12x8_m += stride;  \
    SD(out7_m, pblk_12x8_m);  \
    SW(out15_m, pblk_12x8_m + 8);  \
}
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from a zero vector are slid into 'in0'
                 (and 'in1') by the number of elements 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)  \
{  \
    v16i8 zero_m = { 0 };  \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,  \
                  out0, out1, out2, out3, slide_val)  \
{  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to
                 out0 as per control vector mask0; from in2 & in3 to
                 out1 as per mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)  \
{  \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);  \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles of the same in0/in1 pair with four masks. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,  \
                out0, out1, out2, out3)  \
{  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to
                 out0 as per control vector mask0; from in2 & in3 to
                 out1 as per mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0; products of adjacent
                 odd-even element pairs are added, giving halfword
                 results in the out vector
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)  \
{  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);  \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,  \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - signed halfword
   Details     : Same products as DOTP_SB2, but the pairwise sums are
                 accumulated into the existing out vectors
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,  \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,  \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{  \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - signed word
   Details     : Signed halfword elements from mult0 are multiplied
                 with signed halfword elements from cnst0; products of
                 adjacent odd-even pairs are accumulated into the out
                 vectors as signed words
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,  \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,  \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
/* Description : Clips all halfword elements of input vector between
                 min & max:
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs - in (input vector)
                        - min (min threshold)
                        - max (max threshold)
                 Return Type - signed halfword (statement expression
                 yielding the clipped vector)
*/
#define CLIP_SH(in, min, max)  \
( {  \
    v8i16 out_m;  \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;  \
} )
/* Description : Clips all signed halfword elements of input vector
                 to the range 0..255
   Arguments   : Inputs - in (input vector)
                 Return Type - signed halfword (statement expression
                 yielding the clipped vector)
*/
#define CLIP_SH_0_255(in)  \
( {  \
    v8i16 max_m = __msa_ldi_h(255);  \
    v8i16 out_m;  \
    out_m = __msa_maxi_s_h((v8i16) in, 0);  \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;  \
} )

/* In-place 0..255 clip of two / four vectors. */
#define CLIP_SH2_0_255(in0, in1)  \
{  \
    in0 = CLIP_SH_0_255(in0);  \
    in1 = CLIP_SH_0_255(in1);  \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{  \
    CLIP_SH2_0_255(in0, in1);  \
    CLIP_SH2_0_255(in2, in3);  \
}
/* Description : Clips all signed word elements of input vector
                 to the range 0..255
   Arguments   : Inputs - in (input vector)
                 Return Type - signed word (statement expression
                 yielding the clipped vector)
*/
#define CLIP_SW_0_255(in)  \
( {  \
    v4i32 max_m = __msa_ldi_w(255);  \
    v4i32 out_m;  \
    out_m = __msa_maxi_s_w((v4i32) in, 0);  \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;  \
} )
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element of 'in0' is subtracted
                 from the adjacent even byte element of 'in0' (pairwise
                 within the same vector) and the halfword result is
                 stored in 'out0'; likewise for 'in1'/'out1'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are
                 interleaved into 'out0'; even halfword elements of
                 'in2' and 'in3' into 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)

/* Same pattern for even word elements. */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

/* Same pattern for even double word elements. */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{  \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
978 /* Description : Interleave left half of byte elements from vectors
979 Arguments : Inputs - in0, in1, in2, in3
981 Return Type - as per RTYPE
982 Details : Left half of byte elements of in0 and left half of byte
983 elements of in1 are interleaved and copied to out0.
984 Left half of byte elements of in2 and left half of byte
985 elements of in3 are interleaved and copied to out1.
987 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
989 out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
990 out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
992 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
993 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
995 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
996 out0, out1, out2, out3) \
998 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
999 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1001 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1002 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1003 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and left half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'. Left half of halfword elements of 'in2' and left
                 half of halfword elements of 'in3' are interleaved and
                 copied to 'out1'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and left half of word
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Left half of word elements of 'in2' and left half of word
                 elements of 'in3' are interleaved and copied to 'out1'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and right half of byte
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Right half of byte elements of 'in2' and right half of byte
                 elements of 'in3' are interleaved and copied to 'out1'.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Right half of halfword elements of 'in0' and right half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'. Right half of halfword elements of 'in2' and right
                 half of halfword elements of 'in3' are interleaved and
                 copied to 'out1'. Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
/* Description : Interleave right half of word elements from vector pairs.
   Details     : Right half of word elements of 'in0'/'in1' go to 'out0',
                 'in2'/'in3' to 'out1'; ILVR_W4 repeats for two more pairs.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and right half
                 of double word elements of 'in1' are interleaved and copied
                 to 'out0'. Right half of double word elements of 'in2' and
                 right half of double word elements of 'in3' are interleaved
                 and copied to 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0';
                 left half of elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each unsigned halfword element from 'in0' is saturated to
                 the value generated with (sat_val+1) bit range.
                 Results are written in place to the original vectors.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
{                                                     \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - signed halfword
   Details     : Each signed halfword element from 'in0' is saturated to
                 the value generated with (sat_val+1) bit range.
                 Results are written in place to the original vectors.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)             \
{                                                     \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)        \
{                                                     \
    SAT_SH2(RTYPE, in0, in1, sat_val);                \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector;
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector.
                 Valid index range for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the
                 right half of 'out0'.
                 Even byte elements of 'in2' are copied to the left half of
                 'out1' & even byte elements of 'in3' are copied to the
                 right half of 'out1'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' & even halfword elements of 'in1' are copied to
                 the right half of 'out0'.
                 Even halfword elements of 'in2' are copied to the left half
                 of 'out1' & even halfword elements of 'in3' are copied to
                 the right half of 'out1'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1, ...
                 Outputs - in0, in1, ... (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from the input vectors is
                 logically xor'ed with 128 and the result is stored back
                 in place (shifts values between signed and unsigned
                 byte ranges). Similar for other counts.
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 and +32767 (as per halfword data
                 type). Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Similar for the other inputs.
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Here, 'shift' is a GP variable passed in.
                 Similar for the other inputs.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each element of vector 'in0' is shifted right logical by the
                 number of bits the respective element holds in vector
                 'shift' and the result is written in place to 'in0'.
                 Here, 'shift' is a vector passed in.
                 Similar for the other inputs.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - unsigned halfword
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted
                 value for rounding and the result is written in place to
                 'in0'. Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate number of bits 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'.
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)            \
{                                                   \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate number of bits 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding and the result is written in place to 'in0'.
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)            \
{                                                   \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from
                 'in1' and the result is written to 'out0'.
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in   (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
/* Description : Sign extend halfword elements from input vector and return
                 the result in a pair of vectors
   Arguments   : Input   - in   (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : The sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 4 signed word elements in 'out0'.
                 Then interleaved left with the same vector 'in' to generate
                 4 signed word elements in 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3   (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3 ... in7 (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3     (output 4x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* The original listing repeated the next two assignments twice;  */     \
    /* the duplicates were redundant and have been removed.           */     \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Return - out_m (unsigned byte vector)
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte.
                 Implemented as a GNU statement expression so the packed
                 vector is the macro's value.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Pack even byte elements, extract 0 & 2 index words from the
                 pair of results and store 4 words in destination memory as
                 per stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
1781 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */