2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
28 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
/* Basic 128-bit vector load: dereference 'psrc' as a pointer to the
 * requested MSA vector type RTYPE (byte / halfword / word element view).
 * The compiler emits the matching ld.b/ld.h/ld.w instruction, which
 * tolerates unaligned addresses on MSA. */
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Basic 128-bit vector store: write vector 'in' through 'pdst' as RTYPE. */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
/* Scalar possibly-unaligned load/store helpers:
 *   LW/LD  load a 32/64-bit value through a byte pointer,
 *   SH/SW/SD store a 16/32/64-bit value through a byte pointer.
 * MIPS release 6 supports unaligned access natively, so the plain
 * lw/ld/sh/sw/sd instructions are used; pre-r6 cores need the
 * ulw/uld/ush/usw unaligned macro-instructions.  64-bit accesses on
 * 32-bit cores are synthesized from two 32-bit halves. */
#if (__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
                                               \
        __asm__ volatile (                     \
            "lw  %[val_m],  %[psrc_m]  \n\t"   \
                                               \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
                                               \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
                                                   \
            __asm__ volatile (                     \
                "ld  %[val_m],  %[psrc_m]  \n\t"   \
                                                   \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
                                                   \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        /* 64-bit load on a 32-bit r6 core: combine two word loads
         * (low word first => little-endian layout assumed). */
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_m = (uint8_t *) (psrc);                     \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_m);                                      \
            val1_m = LW(psrc_m + 4);                                  \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sh  %[val_m],  %[pdst_m]  \n\t"   \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sw  %[val_m],  %[pdst_m]  \n\t"   \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SD(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint64_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "sd  %[val_m],  %[pdst_m]  \n\t"   \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }
#else  // !(__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
                                               \
        __asm__ volatile (                     \
            "ulw  %[val_m],  %[psrc_m]  \n\t"  \
                                               \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
                                               \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
                                                   \
            __asm__ volatile (                     \
                "uld  %[val_m],  %[psrc_m]  \n\t"  \
                                                   \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
                                                   \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        /* 64-bit load on a 32-bit pre-r6 core: combine two word loads. */
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_m1);                                     \
            val1_m = LW(psrc_m1 + 4);                                 \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "ush  %[val_m],  %[pdst_m]  \n\t"  \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
                                               \
        __asm__ volatile (                     \
            "usw  %[val_m],  %[pdst_m]  \n\t"  \
                                               \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    /* 64-bit store on a 32-bit pre-r6 core: split into two word stores. */
    #define SD(val, pdst)                                          \
    {                                                              \
        uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
        uint32_t val0_m, val1_m;                                   \
                                                                   \
        val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                   \
        SW(val0_m, pdst_m1);                                       \
        SW(val1_m, pdst_m1 + 4);                                   \
    }
#endif // (__mips_isa_rev >= 6)
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads 16 byte elements in 'out0' from (psrc)
                 Loads 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
{                                                            \
    LD_B2(RTYPE, (psrc), stride, out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                                   \
              out0, out1, out2, out3, out4, out5, out6)              \
{                                                                    \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);      \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);           \
}
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads 8 halfword elements in 'out0' from (psrc)
                 Loads 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
/* Description : Load as 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}

/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores 16 byte elements from 'in0' to (pdst)
                 Stores 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
{                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores 8 halfword elements from 'in0' to (pdst)
                 Stores 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
                 Return Type - signed word
   Details     : Stores 4 word elements from 'in0' to (pdst)
                 Stores 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,  \
               pdst, stride)                            \
{                                                       \
    ST_SW2(in0, in1, (pdst), stride);                   \
    ST_SW2(in2, in3, (pdst) + 2 * stride, stride);      \
    ST_SW2(in4, in5, (pdst) + 4 * stride, stride);      \
    ST_SW2(in6, in7, (pdst) + 6 * stride, stride);      \
}
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : Index stidx halfword element from 'in' vector is copied and
                 stored on first line
                 Index stidx+1 halfword element from 'in' vector is copied and
                 stored on second line
                 Index stidx+2 halfword element from 'in' vector is copied and
                 stored on third line
                 Index stidx+3 halfword element from 'in' vector is copied and
                 stored on fourth line
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}
/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Index 0 word element from input vector is copied and stored
                 on first line
                 Index 1 word element from input vector is copied and stored
                 on second line
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Idx0 word element from input vector 'in0' is copied and stored
                 on first line
                 Idx1 word element from input vector 'in0' is copied and stored
                 on second line
                 Idx2 word element from input vector 'in1' is copied and stored
                 on third line
                 Idx3 word element from input vector 'in1' is copied and stored
                 on fourth line
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Index 0 word element from input vector 'in0' is copied and
                 stored on first line followed by index 2 halfword element
                 Index 2 word element from input vector 'in0' is copied and
                 stored on second line followed by index 2 halfword element
                 Index 0 word element from input vector 'in1' is copied and
                 stored on third line followed by index 2 halfword element
                 Index 2 word element from input vector 'in1' is copied and
                 stored on fourth line followed by index 2 halfword element
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst)
                 Index 1 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}
/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_8x4_m)
                 Index 1 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_8x4_m + stride)
                 Index 0 double word element from input vector 'in1' is copied
                 and stored to destination memory at (pblk_8x4_m + 2 * stride)
                 Index 1 double word element from input vector 'in1' is copied
                 and stored to destination memory at (pblk_8x4_m + 3 * stride)
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar to remaining lines
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - signed byte
   Details     : Each byte element from 'in0' vector is added with each byte
                 element from 'in1' vector. The addition of the elements plus 1
                 (for rounding) is done unsigned with full precision,
                 i.e. the result has one extra bit. Unsigned division by 2
                 (or logical shift right by one bit) is performed before writing
                 the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
                 number of elements specified by 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
                 number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                         - cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - unsigned halfword
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                         - cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                      \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)     \
{                                                                        \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                         - cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - unsigned halfword
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the out vector
                 (2 signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - unsigned halfword
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
                                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 resulted integer sum is returned
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and resulted integer sum is returned
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m = res0_m + res1_m;                            \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to
                 even signed byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. From the 16
                 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of
                 halfword elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

/* Interleave right half of word elements; same pattern as ILVR_H2. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned double word
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to
                 out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                 \
{                                                          \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

/* Description : Saturate the signed halfword element values to the max
                 value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - signed halfword
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

/* Description : Saturate the signed word element values to the max
                 value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - signed word
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of in0 are copied to the left half of
                 out0 & even double elements of in1 are copied to the right
                 half of out0.
                 Even double elements of in2 are copied to the left half of
                 out1 & even double elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word element of 'in0' and odd double word element
                 of 'in1' are packed and written to 'out0'
                 Odd double word element of 'in2' and odd double word element
                 of 'in3' are packed and written to 'out1'
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector.
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector.
                 Similar for other pairs
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type).
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* Note: SRAR_H2 expands to a braced block, so no trailing ';' is needed
   after it inside the compound macros below. */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)            \
{                                                   \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from
                 'in1' and result is written to 'out0'
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in (input halfword vector)
                 Outputs - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in (1 input byte vector)
                 Outputs - out0, out1 (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)          \
{                                            \
    v16i8 tmp_m;                             \
                                             \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);   \
    ILVRL_B2_SH(tmp_m, in, out0, out1);      \
}
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in (1 input unsigned byte vector)
                 Outputs - out0, out1 (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)         \
{                                           \
    v16i8 zero_m = { 0 };                   \
                                            \
    ILVRL_B2_SH(zero_m, in, out0, out1);    \
}
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in (1 input halfword vector)
                 Outputs - out0, out1 (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swapping of two input variables using xor
*/
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of symmetric input pairs in the
                 first half of the outputs, differences in the second half
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: sums of symmetric input pairs in the
                 first half of the outputs, differences in the second half
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: sums of symmetric input pairs in the
                 first half of the outputs, differences in the second half
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,              \
                     in8, in9, in10, in11, in12, in13, in14, in15,        \
                     out0, out1, out2, out3, out4, out5, out6, out7,      \
                     out8, out9, out10, out11, out12, out13, out14, out15)\
{                                                                         \
    out0 = in0 + in15;                                                    \
    out1 = in1 + in14;                                                    \
    out2 = in2 + in13;                                                    \
    out3 = in3 + in12;                                                    \
    out4 = in4 + in11;                                                    \
    out5 = in5 + in10;                                                    \
    out6 = in6 + in9;                                                     \
    out7 = in7 + in8;                                                     \
                                                                          \
    out8 = in7 - in8;                                                     \
    out9 = in6 - in9;                                                     \
    out10 = in5 - in10;                                                   \
    out11 = in4 - in11;                                                   \
    out12 = in3 - in12;                                                   \
    out13 = in2 - in13;                                                   \
    out14 = in1 - in14;                                                   \
    out15 = in0 - in15;                                                   \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 .. in7 (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0 .. in7 (input 8x8 byte block)
                 Outputs - out0 .. out7 (output 8x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0 .. in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0 .. in15
                 Outputs - out0 .. out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5   = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);              \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7   = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);              \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* The original computed tmp2_m and tmp3_m twice with identical          \
       operands; the redundant duplicate assignments were dropped. */        \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0 .. out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Each byte element pair from (in0,in1) .. (in6,in7) is
                 averaged (a + b) / 2 into tmp0_m .. tmp3_m, and the
                 lower-half (8-byte) results of all 4 vectors are stored to
                 destination memory as an 8x4 byte block
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
/* Description : Average byte elements from pair of vectors and store 16x4
                 byte block in destination memory
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Each byte element pair from (in0,in1) .. (in6,in7) is
                 averaged (a + b) / 2 into tmp0_m .. tmp3_m, and the full
                 results of all 4 vectors are stored to destination memory
                 as a 16x4 byte block
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Each byte element pair from (in0,in1) .. (in6,in7) is
                 average rounded (a + b + 1) / 2 into tp0_m .. tp3_m, and
                 the lower-half (8-byte) results of all 4 vectors are stored
                 to destination memory as an 8x4 byte block
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Each byte element pair from (in0,in1) .. (in6,in7) is
                 average rounded (a + b + 1) / 2 into t0_m .. t3_m, and the
                 full results of all 4 vectors are stored to destination
                 memory as a 16x4 byte block
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The half vector results from all 4 vectors are stored in
                 destination memory as 8x4 byte block
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The vector results from all 4 vectors are stored in
                 destination memory as 16x4 byte block
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
                 Return Type - unsigned bytes
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and then stored.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 tmp1_m;                                                   \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                        \
                                                                    \
    out0_m;                                                         \
} )
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                    \
                                dst0, dst1, dst2, dst3, pdst, stride)  \
{                                                                      \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
    uint8_t *pdst_m = (uint8_t *) (pdst);                              \
                                                                       \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                               \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                               \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);       \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                          \
}
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                      \
{                                                        \
    v16i8 tmp_m;                                         \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);     \
    ST_SB(tmp_m, (pdst));                                \
}
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )
#endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */