/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
28 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
/* LD_V: load one vector of type RTYPE from memory at psrc. */
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
/* Typed wrappers: unsigned/signed byte, halfword and word element vectors. */
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
/* ST_V: store the vector 'in' of type RTYPE to memory at pdst. */
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
/* Typed wrappers: unsigned/signed byte, halfword and word element vectors. */
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
46 #if (__mips_isa_rev >= 6)
49 uint16_t val_lh_m = *(uint16_t *)(psrc); \
55 uint32_t val_lw_m = *(uint32_t *)(psrc); \
62 uint64_t val_ld_m = *(uint64_t *)(psrc); \
65 #else // !(__mips == 64)
68 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
69 uint32_t val0_ld_m, val1_ld_m; \
70 uint64_t val_ld_m = 0; \
72 val0_ld_m = LW(psrc_ld_m); \
73 val1_ld_m = LW(psrc_ld_m + 4); \
75 val_ld_m = (uint64_t) (val1_ld_m); \
76 val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
77 val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
81 #endif // (__mips == 64)
83 #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
84 #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
85 #define SD(val, pdst) *(uint64_t *)(pdst) = (val);
87 #else // !(__mips_isa_rev >= 6)
90 uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
94 "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
96 : [val_lh_m] "=r" (val_lh_m) \
97 : [psrc_lh_m] "m" (*psrc_lh_m) \
105 uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
109 "ulw %[val_lw_m], %[psrc_lw_m] \n\t" \
111 : [val_lw_m] "=r" (val_lw_m) \
112 : [psrc_lw_m] "m" (*psrc_lw_m) \
121 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
122 uint64_t val_ld_m = 0; \
125 "uld %[val_ld_m], %[psrc_ld_m] \n\t" \
127 : [val_ld_m] "=r" (val_ld_m) \
128 : [psrc_ld_m] "m" (*psrc_ld_m) \
133 #else // !(__mips == 64)
136 uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
137 uint32_t val0_ld_m, val1_ld_m; \
138 uint64_t val_ld_m = 0; \
140 val0_ld_m = LW(psrc_ld_m); \
141 val1_ld_m = LW(psrc_ld_m + 4); \
143 val_ld_m = (uint64_t) (val1_ld_m); \
144 val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
145 val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
149 #endif // (__mips == 64)
151 #define SH(val, pdst) \
153 uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
154 uint16_t val_sh_m = (val); \
157 "ush %[val_sh_m], %[pdst_sh_m] \n\t" \
159 : [pdst_sh_m] "=m" (*pdst_sh_m) \
160 : [val_sh_m] "r" (val_sh_m) \
164 #define SW(val, pdst) \
166 uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
167 uint32_t val_sw_m = (val); \
170 "usw %[val_sw_m], %[pdst_sw_m] \n\t" \
172 : [pdst_sw_m] "=m" (*pdst_sw_m) \
173 : [val_sw_m] "r" (val_sw_m) \
177 #define SD(val, pdst) \
179 uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
180 uint32_t val0_sd_m, val1_sd_m; \
182 val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
183 val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
185 SW(val0_sd_m, pdst_sd_m); \
186 SW(val1_sd_m, pdst_sd_m + 4); \
188 #endif // (__mips_isa_rev >= 6)
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Two-word variant of LW4. */
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}

/* Four double words: two LD2 pairs. */
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load as 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
   Details     : out1/out3 are derived by duplicating the upper half of
                 out0/out2 via ilvl.d, so each vector carries one 4x16-bit row.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Halfword elements stidx, stidx+1, stidx+2, stidx+3 of 'in'
                 are copied and stored to four consecutive lines of the
                 destination (pdst, +stride, +2*stride, +3*stride).
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}

/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Word elements 0 and 1 of 'in' are copied and stored to
                 (pdst) and (pdst + stride) respectively.
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : Word elements idx0/idx1 of 'in0' and idx2/idx3 of 'in1'
                 are copied and stored to four consecutive lines of the
                 destination.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}

/* 4x8 block: two 4x4 stores, words 0..3 of each input vector. */
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Each line stores a word followed by a halfword (6 bytes):
                 line 0: word 0 of 'in0' then halfword 2 of 'in0'
                 line 1: word 2 of 'in0' then halfword 6 of 'in0'
                 line 2: word 0 of 'in1' then halfword 2 of 'in1'
                 line 3: word 2 of 'in1' then halfword 6 of 'in1'
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Double word element 0 of 'in' is copied and stored at (pdst).
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word elements 0 and 1 of 'in' are copied and stored
                 at (pdst) and (pdst + stride) respectively.
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}
/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word elements 0 and 1 of 'in0' go to lines 0 and 1,
                 double word elements 0 and 1 of 'in1' go to lines 2 and 3.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}

/* 8x8 block: two 8x4 stores. */
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}

/* 12x4 block: an 8x4 store from in0/in1 plus a 4x4 store from in2
 * at byte offset 8 of each line. */
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each line k stores double word element 0 of 'ink' followed
                 by word element 2 of the same vector at byte offset 8.
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Description : average with rounding (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of 'in0' is averaged with the
                 corresponding element of 'in1' with rounding (full-precision
                 sum plus 1, then logical shift right by one); result goes to
                 'out0'. Similarly for the 'in2'/'in3' pair into 'out1'.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from a zero vector are slid into 'in0' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements selected from 'in0' & 'in1' by control vector
                 'mask0' are copied to 'out0'; likewise 'in2'/'in3' with
                 'mask1' into 'out1'.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles of the same in0/in1 pair with four masks. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Halfword elements selected from 'in0' & 'in1' by 'mask0'
                 go to 'out0'; 'in2'/'in3' with 'mask1' go to 'out1'.
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Word elements selected from 'in0' & 'in1' by 'mask0'
                 go to 'out0'; 'in2'/'in3' with 'mask1' go to 'out1'.
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0', producing results
                 twice the input width (unsigned halfword); products of
                 adjacent odd-even element pairs are added and stored
                 to the output vector.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
                 cnst0, cnst1, cnst2, cnst3,                \
                 out0, out1, out2, out3)                    \
{                                                           \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements (signed)
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0'; adjacent odd-even
                 products are added into signed halfword results.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0'; adjacent odd-even
                 products are added into signed word results.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0'; adjacent odd-even
                 products are added to the existing contents of 'out0'
                 (signed halfword accumulation).
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                  \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                  \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements (unsigned)
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0'; adjacent odd-even
                 products are added to the existing contents of 'out0'
                 (unsigned halfword accumulation).
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                  \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                  \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0'; adjacent odd-even
                 products are added to the existing contents of 'out0'
                 (signed word accumulation).
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                  \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                  \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)              \
{                                                      \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant: applies MIN_UH2 to two pairs of inputs. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )

/* Clip two vectors to [0, 255] in place. */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}

/* Clip four vectors to [0, 255] in place. */
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
/* Clip signed halfword elements to [0, 255] using max + unsigned
   saturation to 8 bits (sat_val 7 => 2^8 - 1). */
#define CLIP_SH_0_255_MAX_SATU(in)                    \
( {                                                   \
    v8i16 out_m;                                      \
    out_m = __msa_maxi_s_h((v8i16) in, 0);            \
    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7);  \
    out_m;                                            \
} )

/* Clip two vectors to [0, 255] in place (saturating variant). */
#define CLIP_SH2_0_255_MAX_SATU(in0, in1)  \
{                                          \
    in0 = CLIP_SH_0_255_MAX_SATU(in0);     \
    in1 = CLIP_SH_0_255_MAX_SATU(in1);     \
}

/* Clip four vectors to [0, 255] in place (saturating variant). */
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)  \
{                                                    \
    CLIP_SH2_0_255_MAX_SATU(in0, in1);               \
    CLIP_SH2_0_255_MAX_SATU(in2, in3);               \
}
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )

/* Clip signed word elements to [0, 255] using max + unsigned
   saturation to 8 bits (sat_val 7 => 2^8 - 1). */
#define CLIP_SW_0_255_MAX_SATU(in)                    \
( {                                                   \
    v4i32 out_m;                                      \
    out_m = __msa_maxi_s_w((v4i32) in, 0);            \
    out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7);  \
    out_m;                                            \
} )

/* Clip two word vectors to [0, 255] in place (saturating variant). */
#define CLIP_SW2_0_255_MAX_SATU(in0, in1)  \
{                                          \
    in0 = CLIP_SW_0_255_MAX_SATU(in0);     \
    in1 = CLIP_SW_0_255_MAX_SATU(in1);     \
}

/* Clip four word vectors to [0, 255] in place (saturating variant). */
#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3)  \
{                                                    \
    CLIP_SW2_0_255_MAX_SATU(in0, in1);               \
    CLIP_SW2_0_255_MAX_SATU(in2, in3);               \
}
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together
                 and the resulting integer sum is returned
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m = res0_m + res1_m;                            \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 even signed byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* Four-vector variant: applies HADD_SB2 to two pairs of inputs. */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Three-vector variant: HADD_UB2 on the first pair, then in2. */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

/* Four-vector variant: applies HADD_UB2 to two pairs of inputs. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted
                 from the even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is stored in 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Four-vector variant: applies HSUB_UB2 to two pairs of inputs. */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0'
                 with 'ref0' is calculated and preserved in 'diff0'. From
                 the 16 unsigned absolute diff values, even-odd pairs are
                 added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
    sad_m;                                                      \
} )
/* Description : Insert specified word elements from input scalars to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (integer inputs)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

/* Insert four word elements into one destination vector. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
/* Description : Insert specified double word elements from input scalars
                 to 1 destination vector
   Arguments   : Inputs  - in0, in1 (integer inputs)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte elements of
                 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte elements of
                 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word elements of
                 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word elements of
                 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and left half of byte
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Left half of byte elements of 'in2' and left half of byte
                 elements of 'in3' are interleaved and copied to 'out1'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-output variant: applies ILVL_B2 to two sets of inputs. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and left half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'. Left half of halfword elements of 'in2' and left
                 half of halfword elements of 'in3' are interleaved and
                 copied to 'out1'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Four-output variant: applies ILVL_H2 to two sets of inputs. */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and left half of word
                 elements of 'in1' are interleaved and copied to 'out0'.
                 Left half of word elements of 'in2' and left half of word
                 elements of 'in3' are interleaved and copied to 'out1'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and right half of
                 byte elements of 'in1' are interleaved and copied to
                 'out0'. Right half of byte elements of 'in2' and right
                 half of byte elements of 'in3' are interleaved and copied
                 to 'out1'. Similar for other pairs.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* Three-output variant: ILVR_B2 on the first two pairs, then in4/in5. */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

/* Four-output variant: applies ILVR_B2 to two sets of inputs. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* Eight-output variant: applies ILVR_B4 to two sets of inputs. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and right half
                 of halfword elements of 'in1' are interleaved and copied
                 to 'out0'. Right half of halfword elements of 'in2' and
                 right half of halfword elements of 'in3' are interleaved
                 and copied to 'out1'. Similar for other pairs.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Three-output variant: ILVR_H2 on the first two pairs, then in4/in5. */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* Four-output variant: applies ILVR_H2 to two sets of inputs. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
/* Interleave right half of word elements of the pairs (in0, in1) and
   (in2, in3) into out0 and out1 respectively. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* Four-output variant: applies ILVR_W2 to two sets of inputs. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and right
                 half of double word elements of 'in1' are interleaved and
                 copied to 'out0'. Right half of double word elements of
                 'in2' and right half of double word elements of 'in3' are
                 interleaved and copied to 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Three-output variant: ILVR_D2 on the first two pairs, then in4/in5. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* Four-output variant: applies ILVR_D2 to two sets of inputs. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of 'in0' and left half
                 of double word elements of 'in1' are interleaved and
                 copied to 'out0'. Left half of double word elements of
                 'in2' and left half of double word elements of 'in3' are
                 interleaved and copied to 'out1'.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

/* Halfword variant of ILVRL_B2. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output
                 vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val);  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* Four-vector variant: applies MAXI_SH2 to two pairs of inputs. */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

/* Eight-vector variant: applies MAXI_SH4 to two sets of inputs. */
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val);                         \
    MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val);                         \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to
                 the value generated with (sat_val + 1) bit range.
                 Results are written in place to the original vectors.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

/* Four-vector variant: applies SAT_UH2 to two pairs of inputs. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

/* Eight-vector variant: applies SAT_UH4 to two sets of inputs. */
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val);                         \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val);                         \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to
                 the value generated with (sat_val + 1) bit range.
                 Results are written in place to the original vectors.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* Three-vector variant: SAT_SH2 on the first pair, then in2. */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* Four-vector variant: applies SAT_SH2 to two pairs of inputs. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Saturate the word element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 Results are written in place to the original vectors.
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

/* Four-vector variant: applies SAT_SW2 to two pairs of inputs. */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to
                 all elements in 'out0' vector.
                 Valid index range for halfword operation is 0-7.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* Three-output variant: SPLATI_H2 for idx0/idx1, then idx2. */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

/* Four-output variant: applies SPLATI_H2 to two index pairs. */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to
                 all elements in 'out0' vector.
                 'stidx + 1' element value from 'in' vector is replicated
                 to all elements in 'out1' vector.
                 Valid index range for word operation is 0-3.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Four-output variant: replicates all four word elements of 'in'. */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half
                 of 'out0' & even byte elements of 'in1' are copied to the
                 right half of 'out0'.
                 Even byte elements of 'in2' are copied to the left half
                 of 'out1' & even byte elements of 'in3' are copied to the
                 right half of 'out1'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* Three-output variant: PCKEV_B2 on the first two pairs, then in4/in5. */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* Four-output variant: applies PCKEV_B2 to two sets of inputs. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' & even halfword elements of 'in1' are copied to
                 the right half of 'out0'.
                 Even halfword elements of 'in2' are copied to the left half
                 of 'out1' & even halfword elements of 'in3' are copied to
                 the right half of 'out1'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' & even double word elements of 'in1' are
                 copied to the right half of 'out0'.
                 Even double word elements of 'in2' are copied to the left
                 half of 'out1' & even double word elements of 'in3' are
                 copied to the right half of 'out1'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word element of 'in0' is copied to the left half
                 of 'out0' & odd double word element of 'in1' is copied to
                 the right half of 'out0'.
                 Odd double word element of 'in2' is copied to the left half
                 of 'out1' & odd double word element of 'in3' is copied to
                 the right half of 'out1'.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place
                 in 'in0'; likewise for 'in1'.
                 Similar for other pairs
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type).
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Similar for other pairs
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = in0 << shift;           \
    in1 = in1 << shift;           \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Here, 'shift' is a GP variable passed in.
                 Similar for other pairs
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logically by
                 the number of bits the respective element holds in vector
                 'shift' and the result is written in place to 'in0'.
                 Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Shift right logical rounded: as SRL_H4 but the last discarded bit is
   added to the shifted value for rounding. */
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted
                 value for rounding and the result is written in place to
                 'in0'. Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Word variant of SRAR_H2; 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords (immediate shift)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the immediate 'shift'. The last discarded bit is added to
                 the shifted value for rounding and the result is written in
                 place to 'in0'.
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)           \
{                                                  \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words (immediate shift)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Word variant of SRARI_H2.
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from
                 'in1' and the result is written to 'out0'.
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from the 2 pairs of vectors is added and the
                 2 results are written to 'out0' and 'out1'.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from the 2 pairs of vectors is subtracted and
                 the 2 results are written to 'out0' and 'out1'.
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in  (byte vector)
                 Output - out (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : The sign bit of each byte element of input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 8 halfword elements, keeping the sign intact.
*/
#define UNPCK_R_SB_SH(in, out)                       \
{                                                    \
    v16i8 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_b((v16i8) in, 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);  \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in  (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : The sign bit of each halfword element of input vector 'in'
                 is extracted and interleaved with the same vector 'in' to
                 generate 4 word elements, keeping the sign intact.
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

/* Description : Sign extend byte elements from input vector and return
                 halfword results in a pair of vectors
   Arguments   : Input   - in (byte vector)
                 Outputs - out0, out1 (sign extended halfword vectors)
                 Return Type - signed halfword
   Details     : The sign bit of each byte element of 'in' is extracted and
                 interleaved right with 'in' to generate 8 signed halfword
                 elements in 'out0', then interleaved left with 'in' to
                 generate 8 signed halfword elements in 'out1'.
*/
#define UNPCK_SB_SH(in, out0, out1)         \
{                                           \
    v16i8 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);  \
    ILVRL_B2_SH(tmp_m, in, out0, out1);     \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : The zero extended right half of the vector is returned in
                 'out0'; the zero extended left half in 'out1'.
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}

/* Description : Sign extend halfword elements from input vector and return
                 results in a pair of word vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : The sign bit of each halfword element of 'in' is extracted
                 and interleaved right with 'in' to generate 4 signed word
                 elements in 'out0', then interleaved left with 'in' to
                 generate 4 signed word elements in 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swaps the two input variables using xor (no temporary).
*/
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of mirrored pairs first, then
                 differences of mirrored pairs.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9, in10, in11, in12, in13, in14, in15,          \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
                 Return Type - unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 ... in7 (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0 ... in7 (input 8x8 byte block)
                 Outputs - out0 ... out7 (output 8x8 byte block)
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out7
                 Return Type - unsigned byte
   Note        : The duplicated tmp2_m/tmp3_m ilvod_h assignments present in
                 an earlier revision were redundant (identical back-to-back
                 statements) and have been removed; results are unchanged.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7)\
{                                                                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                           \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                         \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                         \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                         \
                                                                           \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);            \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);            \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);            \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);            \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);              \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);            \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);              \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);            \
                                                                           \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);               \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);            \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);           \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);        \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
}
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs - in0 ... in7, pdst, stride
   Details     : Each byte element from input vector pairs (in0,in1),
                 (in2,in3), (in4,in5), (in6,in7) is averaged (a + b)/2 and
                 stored in tmp0_m .. tmp3_m respectively.
                 The half-vector (low 8 bytes) results from all 4 vectors
                 are stored in destination memory as an 8x4 byte block.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
/* Description : Average byte elements from pair of vectors and store 16x4
                 byte block in destination memory
   Arguments   : Inputs - in0 ... in7, pdst, stride
   Details     : Each byte element from input vector pairs (in0,in1),
                 (in2,in3), (in4,in5), (in6,in7) is averaged (a + b)/2 and
                 stored in tmp0_m .. tmp3_m respectively.
                 The full-vector results from all 4 vectors are stored in
                 destination memory as a 16x4 byte block.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs - in0 ... in7, pdst, stride
   Details     : Each byte element from input vector pairs (in0,in1),
                 (in2,in3), (in4,in5), (in6,in7) is average rounded
                 (a + b + 1)/2 and stored in tp0_m .. tp3_m respectively.
                 The half-vector (low 8 bytes) results from all 4 vectors
                 are stored in destination memory as an 8x4 byte block.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 't0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 't1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 't2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 't3_m'
                 The vector results from all 4 vectors are stored in
                 destination memory as 16x4 byte block
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The half vector results from all 4 vectors are stored in
                 destination memory as 8x4 byte block
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    /* second rounding average folds in the existing destination */ \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                 The vector results from all 4 vectors are stored in
                 destination memory as 16x4 byte block
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    /* second rounding average folds in the existing destination */ \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and then stored.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    /* pack the 4 halfword residual rows into 2 vectors */        \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
                                                                  \
    /* widen destination bytes, add residual, clip to 0..255 */   \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
( {                                                                     \
    v8i16 tmp1_m;                                                       \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);               \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);      \
    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);               \
    /* final add is saturating to avoid halfword overflow */            \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                            \
                                                                        \
    out0_m;                                                             \
} )
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulted vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                \
                                dst0, dst1, pdst, stride)          \
{                                                                  \
    v16u8 tmp0_m, tmp1_m;                                          \
    uint8_t *pdst_m = (uint8_t *) (pdst);                          \
                                                                   \
    /* pack signed halfword results to unsigned byte range */      \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                           \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                           \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);       \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                      \
}
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    /* words 0 and 2 hold the even bytes of each input */ \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
                                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                \
( {                                                                     \
    v16i8 tmp0_m;                                                       \
    v8u16 tmp1_m;                                                       \
                                                                        \
    /* gather filter input pairs via the shuffle mask, then dot with */ \
    /* the 2-tap coefficients, round-shift and saturate */              \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);      \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);             \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);              \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                              \
                                                                        \
    tmp1_m;                                                             \
} )
2942 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */