 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
/* Alignment attribute for buffers used by MSA vector code.
 * Note: requests twice the stated alignment ((align) << 1). */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
/* Typed vector load: reinterpret psrc as a pointer to RTYPE and load one
 * vector.  The LD_xx aliases fix the element type of the loaded vector. */
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
/* Typed vector store: reinterpret pdst as a pointer to RTYPE and store 'in'.
 * The ST_xx aliases fix the element type of the stored vector. */
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
/* Unaligned scalar load/store helpers (LH/LW/LD load 16/32/64 bits,
 * SH/SW/SD store 16/32/64 bits).
 * MIPS r6 handles unaligned accesses in hardware, so plain dereferences
 * are used; pre-r6 goes through the ulh/ulw/uld/ush/usw instructions via
 * inline asm.  Where no 64-bit access is available, the double word is
 * split into two 32-bit halves, low word at the lower address
 * (NOTE(review): this assumes a little-endian layout — verify on BE). */
#if (__mips_isa_rev >= 6)
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                              \
        ( {                                           \
            uint64_t val_ld_m = *(uint64_t *)(psrc);  \
            val_ld_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t"  \
                                                     \
            : [val_lw_m] "=r" (val_lw_m)             \
            : [psrc_lw_m] "m" (*psrc_lw_m)           \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t"  \
                                                         \
                : [val_ld_m] "=r" (val_ld_m)             \
                : [psrc_ld_m] "m" (*psrc_ld_m)           \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    /* 64-bit store via two 32-bit unaligned stores (low word first). */
    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
        uint32_t val0_sd_m, val1_sd_m;                                \
                                                                      \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                      \
        SW(val0_sd_m, pdst_sd_m);                                     \
        SW(val1_sd_m, pdst_sd_m + 4);                                 \
    }
#endif  // (__mips_isa_rev >= 6)
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads words 'out0'..'out3' from (psrc), (psrc + stride),
                 (psrc + 2 * stride) and (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1 (and out2, out3 for LD4)
   Details     : Loads double words from (psrc), (psrc + stride), ...
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores words 'in0'..'in3' to (pdst), (pdst + stride),
                 (pdst + 2 * stride) and (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double words 'in0'..'in3' to (pdst), (pdst + stride),
                 (pdst + 2 * stride) and (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Loads one vector per output, advancing the source pointer
                 by 'stride' bytes between loads; larger counts are built
                 from the smaller ones.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load a 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
   Details     : out1/out3 replicate the high halves of out0/out2 so each
                 output vector carries one 4-halfword row.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores one vector per input, advancing the destination
                 pointer by 'stride' bytes between stores.
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : Halfword elements stidx .. stidx+3 of 'in' are copied out
                 and stored to four successive lines (pdst, pdst + stride,
                 pdst + 2 * stride, pdst + 3 * stride)
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}
/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Word elements 0 and 1 of 'in' are copied out and stored to
                 (pdst) and (pdst + stride)
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0..idx3, pdst, stride
                 Return Type - unsigned byte
   Details     : Word elements idx0/idx1 of 'in0' and idx2/idx3 of 'in1' are
                 copied out and stored to four successive lines
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Each output line is a word (elements 0/2 of in0/in1)
                 followed by a halfword (elements 2/6 of in0/in1), stored to
                 four successive lines separated by 'stride'
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Double word element 0 of 'in' is copied out and stored to
                 destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}
/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word elements 0 and 1 of 'in' are copied out and
                 stored to (pdst) and (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}
/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word elements 0/1 of 'in0' go to the first two lines,
                 double word elements 0/1 of 'in1' go to the next two lines,
                 each line separated by 'stride'
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
/* Store a 12x4 byte block: an 8-wide column from in0/in1 plus a 4-wide
 * column from in2 at byte offset 8 of each line. */
#define ST12x4_UB(in0, in1, in2, pdst, stride)               \
{                                                            \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);               \
                                                             \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                 \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride); \
}
/* Description : Store as 12x8 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0..in7, pdst, stride
   Details     : Each line stores double word element 0 of one input vector
                 followed by word element 2 of the same vector at byte
                 offset 8; lines are separated by 'stride'
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);                             \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);                             \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1 (and out2, out3 for AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements of each input pair are added with
                 full precision plus 1 (rounding), then logically shifted
                 right by one bit before writing to the output vector.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs - in0, in1, ..., slide_val
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Byte elements from a zero vector are slid into each input
                 by 'slide_val' positions.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,             \
                  out0, out1, out2, out3, slide_val)     \
{                                                        \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);   \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);   \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
/* Description : Immediate number of columns to slide
   Arguments   : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_x' vectors are slid into 'in1_x'
                 vectors by 'slide_val' positions.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Selective byte elements from each input pair are copied to
                 the corresponding output as per its control mask vector.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from each input pair are copied
                 to the corresponding output as per its control mask vector.
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from each input pair are copied to
                 the corresponding output as per its control mask vector.
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from each 'mult' are multiplied with
                 the matching 'cnst' elements, producing halfword-wide
                 products; adjacent odd-even products are summed and stored
                 to the output vector.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
/* Description : Dot product of signed byte vector elements
   Arguments   : Inputs - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Signed byte elements from each 'mult' are multiplied with
                 the matching 'cnst' elements, producing halfword-wide
                 products; adjacent odd-even products are summed and stored
                 to the output vector.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                      \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)     \
{                                                                        \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1, ...
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from each 'mult' are multiplied
                 with the matching 'cnst' elements, producing word-wide
                 products; adjacent odd-even products are summed and stored
                 to the output vector.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of signed byte vector elements
   Arguments   : Inputs - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1, ... (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from each 'mult' are multiplied with
                 the matching 'cnst' elements (halfword-wide products);
                 adjacent odd-even products are added to the existing
                 contents of the output vector.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                  \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                  \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                     \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product & addition of unsigned byte vector elements
   Arguments   : Inputs - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from each 'mult' are multiplied with
                 the matching 'cnst' elements (halfword-wide products);
                 adjacent odd-even products are added to the existing
                 contents of the output vector.
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                  \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                  \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
956 /* Description : Dot product & addition of halfword vector elements
957 Arguments : Inputs - mult0, mult1
960 Return Type - as per RTYPE
961 Details : Signed halfword elements from mult0 are multiplied with
962 signed halfword elements from cnst0 producing a result
963 twice the size of input i.e. signed word.
964 Then this multiplication results of adjacent odd-even elements
965 are added to the out vector
966 (2 signed word results)
/* Signed-halfword variant: out += dot(mult, cnst) widened to signed words
   via __msa_dpadd_s_w; DPADD_SH4 applies it to two pairs. */
968 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
970 out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
971 (v8i16) mult0, (v8i16) cnst0); \
972 out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
973 (v8i16) mult1, (v8i16) cnst1); \
975 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
977 #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
978 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
980 DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
981 DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
983 #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
985 /* Description : Minimum values between unsigned elements of
986 either vector are copied to the output vector
987 Arguments : Inputs - in0, in1, min_vec
988 Outputs - in0, in1, (in place)
989 Return Type - as per RTYPE
990 Details : Minimum of unsigned halfword element values from 'in0' and
991 'min_value' are written to output vector 'in0'
/* In-place unsigned-halfword min against min_vec (__msa_min_u_h);
   MIN_UH4 covers four vectors via two MIN_UH2 calls. */
993 #define MIN_UH2(RTYPE, in0, in1, min_vec) \
995 in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \
996 in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \
998 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
1000 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
1002 MIN_UH2(RTYPE, in0, in1, min_vec); \
1003 MIN_UH2(RTYPE, in2, in3, min_vec); \
1005 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
1007 /* Description : Clips all halfword elements of input vector between min & max
1008 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1009 Arguments : Inputs - in (input vector)
1010 - min (min threshold)
1011 - max (max threshold)
1012 Outputs - out_m (output vector with clipped elements)
1013 Return Type - signed halfword
/* Clamp: out_m = min(max, max(min, in)) on signed halfwords.
   NOTE(review): out_m's declaration and the statement-expression wrapper are
   on macro lines not visible in this extract. */
1015 #define CLIP_SH(in, min, max) \
1019 out_m = __msa_max_s_h((v8i16) min, (v8i16) in); \
1020 out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \
1024 /* Description : Clips all signed halfword elements of input vector
1026 Arguments : Inputs - in (input vector)
1027 Outputs - out_m (output vector with clipped elements)
1028 Return Type - signed halfword
/* Clamp signed halfwords to the pixel range [0, 255]:
   maxi_s_h(in, 0) floors at 0, then min_s_h against ldi_h(255) caps at 255.
   NOTE(review): out_m's declaration and the expression wrapper that lets
   CLIP_SH2_0_255 assign the result are on lines not visible here. */
1030 #define CLIP_SH_0_255(in) \
1032 v8i16 max_m = __msa_ldi_h(255); \
1035 out_m = __msa_maxi_s_h((v8i16) in, 0); \
1036 out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
1039 #define CLIP_SH2_0_255(in0, in1) \
1041 in0 = CLIP_SH_0_255(in0); \
1042 in1 = CLIP_SH_0_255(in1); \
1044 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
1046 CLIP_SH2_0_255(in0, in1); \
1047 CLIP_SH2_0_255(in2, in3); \
1050 /* Description : Clips all signed word elements of input vector
1052 Arguments : Inputs - in (input vector)
1053 Outputs - out_m (output vector with clipped elements)
1054 Return Type - signed word
/* Word-wide version of CLIP_SH_0_255: clamp signed words to [0, 255].
   NOTE(review): out_m's declaration is on lines not visible here. */
1056 #define CLIP_SW_0_255(in) \
1058 v4i32 max_m = __msa_ldi_w(255); \
1061 out_m = __msa_maxi_s_w((v4i32) in, 0); \
1062 out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
1066 /* Description : Addition of 4 signed word elements
1067 4 signed word elements of input vector are added together and
1068 resulted integer sum is returned
1069 Arguments : Inputs - in (signed word vector)
1070 Outputs - sum_m (i32 sum)
1071 Return Type - signed word
/* Horizontal reduction of a v4i32 to a scalar i32: pairwise hadd_s_d to two
   doublewords, splat the upper one, then copy lane 0.
   NOTE(review): sum_m's declaration and the res0_m/res1_m combining statement
   are on macro lines not visible in this extract — verify against upstream. */
1073 #define HADD_SW_S32(in) \
1075 v2i64 res0_m, res1_m; \
1078 res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \
1079 res1_m = __msa_splati_d(res0_m, 1); \
1081 sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
1085 /* Description : Addition of 8 unsigned halfword elements
1086 8 unsigned halfword elements of input vector are added
1087 together and resulted integer sum is returned
1088 Arguments : Inputs - in (unsigned halfword vector)
1089 Outputs - sum_m (u32 sum)
1090 Return Type - unsigned word
/* Horizontal reduction of a v8u16 to a scalar u32: widen pairwise to words,
   then to doublewords, splat the upper half, then copy lane 0.
   NOTE(review): res_m's and sum_m's declarations and the res0_m/res1_m
   combining statement are on macro lines not visible in this extract. */
1092 #define HADD_UH_U32(in) \
1095 v2u64 res0_m, res1_m; \
1098 res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \
1099 res0_m = __msa_hadd_u_d(res_m, res_m); \
1100 res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
1102 sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
1106 /* Description : Horizontal addition of signed byte vector elements
1107 Arguments : Inputs - in0, in1
1108 Outputs - out0, out1
1109 Return Type - as per RTYPE
1110 Details : Each signed odd byte element from 'in0' is added to
1111 even signed byte element from 'in0' (pairwise) and the
1112 halfword result is stored in 'out0'
/* Pairwise add of adjacent signed bytes (odd + even) widened to halfwords,
   one vector per output; HADD_SB4 handles four vectors. */
1114 #define HADD_SB2(RTYPE, in0, in1, out0, out1) \
1116 out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0); \
1117 out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1); \
1119 #define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)
1121 #define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
1123 HADD_SB2(RTYPE, in0, in1, out0, out1); \
1124 HADD_SB2(RTYPE, in2, in3, out2, out3); \
1126 #define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
1127 #define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1129 /* Description : Horizontal addition of unsigned byte vector elements
1130 Arguments : Inputs - in0, in1
1131 Outputs - out0, out1
1132 Return Type - as per RTYPE
1133 Details : Each unsigned odd byte element from 'in0' is added to
1134 even unsigned byte element from 'in0' (pairwise) and the
1135 halfword result is stored in 'out0'
/* Pairwise add of adjacent unsigned bytes widened to halfwords;
   2-, 3- and 4-vector variants. */
1137 #define HADD_UB2(RTYPE, in0, in1, out0, out1) \
1139 out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0); \
1140 out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1); \
1142 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
1144 #define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \
1146 HADD_UB2(RTYPE, in0, in1, out0, out1); \
1147 out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2); \
1149 #define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
1151 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
1153 HADD_UB2(RTYPE, in0, in1, out0, out1); \
1154 HADD_UB2(RTYPE, in2, in3, out2, out3); \
1156 #define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
1157 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
1158 #define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1160 /* Description : Horizontal subtraction of unsigned byte vector elements
1161 Arguments : Inputs - in0, in1
1162 Outputs - out0, out1
1163 Return Type - as per RTYPE
1164 Details : Each unsigned odd byte element from 'in0' is subtracted from
1165 even unsigned byte element from 'in0' (pairwise) and the
1166 halfword result is stored in 'out0'
/* Pairwise subtract of adjacent unsigned bytes (odd - even) widened to
   halfwords; HSUB_UB4 covers four vectors. */
1168 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
1170 out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
1171 out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
1173 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
1174 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1176 #define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
1178 HSUB_UB2(RTYPE, in0, in1, out0, out1); \
1179 HSUB_UB2(RTYPE, in2, in3, out2, out3); \
1181 #define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
1182 #define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1184 /* Description : SAD (Sum of Absolute Difference)
1185 Arguments : Inputs - in0, in1, ref0, ref1 (unsigned byte src & ref)
1186 Outputs - sad_m (halfword vector with sad)
1187 Return Type - unsigned halfword
1188 Details : Absolute difference of all the byte elements from 'in0' with
1189 'ref0' is calculated and preserved in 'diff0'. From the 16
1190 unsigned absolute diff values, even-odd pairs are added
1191 together to generate 8 halfword results.
/* SAD over two src/ref vector pairs: asub_u_b gives per-byte |src - ref|,
   hadd_u_h folds adjacent byte pairs into 8 halfword partial sums,
   accumulated into sad_m (zero-initialized below).
   NOTE(review): the expression wrapper that yields sad_m as the macro's value
   is on lines not visible in this extract. */
1193 #define SAD_UB2_UH(in0, in1, ref0, ref1) \
1195 v16u8 diff0_m, diff1_m; \
1196 v8u16 sad_m = { 0 }; \
1198 diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \
1199 diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \
1201 sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
1202 sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
1207 /* Description : Insert specified word elements from input vectors to 1
1209 Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
1210 Outputs - out (output vector)
1211 Return Type - as per RTYPE
/* Insert 2 (resp. 4) GPR word values into word lanes 0..1 (resp. 0..3)
   of 'out'; remaining lanes of 'out' are left as-is. */
1213 #define INSERT_W2(RTYPE, in0, in1, out) \
1215 out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
1216 out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
1218 #define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
1219 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1221 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
1223 out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
1224 out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
1225 out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \
1226 out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \
1228 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
1229 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
1230 #define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1232 /* Description : Insert specified double word elements from input vectors to 1
1234 Arguments : Inputs - in0, in1 (2 input vectors)
1235 Outputs - out (output vector)
1236 Return Type - as per RTYPE
/* Insert 2 GPR doubleword values into doubleword lanes 0 and 1 of 'out'. */
1238 #define INSERT_D2(RTYPE, in0, in1, out) \
1240 out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
1241 out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
1243 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
1244 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
1245 #define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1247 /* Description : Interleave even byte elements from vectors
1248 Arguments : Inputs - in0, in1, in2, in3
1249 Outputs - out0, out1
1250 Return Type - as per RTYPE
1251 Details : Even byte elements of 'in0' and even byte
1252 elements of 'in1' are interleaved and copied to 'out0'
1253 Even byte elements of 'in2' and even byte
1254 elements of 'in3' are interleaved and copied to 'out1'
/* Interleave even byte lanes of each (in0,in1) / (in2,in3) pair.
   Note the operand order: __msa_ilvev_b(in1, in0) — in1 is the first
   (left) intrinsic operand, matching the other ILVEV_* macros below. */
1256 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1258 out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
1259 out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
1261 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
1262 #define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
1263 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1264 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1266 /* Description : Interleave even halfword elements from vectors
1267 Arguments : Inputs - in0, in1, in2, in3
1268 Outputs - out0, out1
1269 Return Type - as per RTYPE
1270 Details : Even halfword elements of 'in0' and even halfword
1271 elements of 'in1' are interleaved and copied to 'out0'
1272 Even halfword elements of 'in2' and even halfword
1273 elements of 'in3' are interleaved and copied to 'out1'
/* Interleave even halfword lanes of each input pair (same operand order
   convention as ILVEV_B2). */
1275 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1277 out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
1278 out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
1280 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
1281 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
1282 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1284 /* Description : Interleave even word elements from vectors
1285 Arguments : Inputs - in0, in1, in2, in3
1286 Outputs - out0, out1
1287 Return Type - as per RTYPE
1288 Details : Even word elements of 'in0' and even word
1289 elements of 'in1' are interleaved and copied to 'out0'
1290 Even word elements of 'in2' and even word
1291 elements of 'in3' are interleaved and copied to 'out1'
/* Interleave even word lanes of each input pair. */
1293 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1295 out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
1296 out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
1298 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
1299 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1300 #define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
1301 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1303 /* Description : Interleave even double word elements from vectors
1304 Arguments : Inputs - in0, in1, in2, in3
1305 Outputs - out0, out1
1306 Return Type - as per RTYPE
1307 Details : Even double word elements of 'in0' and even double word
1308 elements of 'in1' are interleaved and copied to 'out0'
1309 Even double word elements of 'in2' and even double word
1310 elements of 'in3' are interleaved and copied to 'out1'
/* Interleave even doubleword lanes of each input pair. */
1312 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1314 out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
1315 out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
1317 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1318 #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
1319 #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1321 /* Description : Interleave left half of byte elements from vectors
1322 Arguments : Inputs - in0, in1, in2, in3
1323 Outputs - out0, out1
1324 Return Type - as per RTYPE
1325 Details : Left half of byte elements of in0 and left half of byte
1326 elements of in1 are interleaved and copied to out0.
1327 Left half of byte elements of in2 and left half of byte
1328 elements of in3 are interleaved and copied to out1.
/* Interleave the left (upper) byte halves of each input pair; note the
   ILVL_* family passes (in0, in1) directly, unlike the swapped ILVEV_*
   order above. ILVL_B4 covers four pairs. */
1330 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1332 out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1333 out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
1335 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
1336 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
1337 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
1338 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1340 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1341 out0, out1, out2, out3) \
1343 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1344 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1346 #define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
1347 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1348 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1349 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1351 /* Description : Interleave left half of halfword elements from vectors
1352 Arguments : Inputs - in0, in1, in2, in3
1353 Outputs - out0, out1
1354 Return Type - as per RTYPE
1355 Details : Left half of halfword elements of in0 and left half of halfword
1356 elements of in1 are interleaved and copied to out0.
1357 Left half of halfword elements of in2 and left half of halfword
1358 elements of in3 are interleaved and copied to out1.
/* Interleave the left (upper) halfword halves of each input pair. */
1360 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1362 out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1363 out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
1365 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1366 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1368 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1369 out0, out1, out2, out3) \
1371 ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1372 ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1374 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1375 #define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1377 /* Description : Interleave left half of word elements from vectors
1378 Arguments : Inputs - in0, in1, in2, in3
1379 Outputs - out0, out1
1380 Return Type - as per RTYPE
1381 Details : Left half of word elements of in0 and left half of word
1382 elements of in1 are interleaved and copied to out0.
1383 Left half of word elements of in2 and left half of word
1384 elements of in3 are interleaved and copied to out1.
/* Interleave the left (upper) word halves of each input pair. */
1386 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1388 out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1389 out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
1391 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
1392 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1393 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1395 /* Description : Interleave right half of byte elements from vectors
1396 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1397 Outputs - out0, out1, out2, out3
1398 Return Type - as per RTYPE
1399 Details : Right half of byte elements of in0 and right half of byte
1400 elements of in1 are interleaved and copied to out0.
1401 Right half of byte elements of in2 and right half of byte
1402 elements of in3 are interleaved and copied to out1.
1403 Similar for other pairs
/* Interleave the right (lower) byte halves of each input pair; 2-, 3-, 4-
   and 8-pair variants, each built on ILVR_B2. */
1405 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1407 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1408 out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
1410 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
1411 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
1412 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
1413 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1414 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1416 #define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1418 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1419 out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
1421 #define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
1422 #define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
1423 #define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1425 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1426 out0, out1, out2, out3) \
1428 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1429 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1431 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
1432 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
1433 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
1434 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1435 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1437 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1438 in8, in9, in10, in11, in12, in13, in14, in15, \
1439 out0, out1, out2, out3, out4, out5, out6, out7) \
1441 ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1442 out0, out1, out2, out3); \
1443 ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
1444 out4, out5, out6, out7); \
1446 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1448 /* Description : Interleave right half of halfword elements from vectors
1449 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1450 Outputs - out0, out1, out2, out3
1451 Return Type - as per RTYPE
1452 Details : Right half of halfword elements of in0 and right half of
1453 halfword elements of in1 are interleaved and copied to out0.
1454 Right half of halfword elements of in2 and right half of
1455 halfword elements of in3 are interleaved and copied to out1.
1456 Similar for other pairs
/* Interleave the right (lower) halfword halves of each input pair;
   2-, 3- and 4-pair variants. */
1458 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1460 out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1461 out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
1463 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1464 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1466 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1468 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1469 out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
1471 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1473 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1474 out0, out1, out2, out3) \
1476 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1477 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1479 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1480 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
/* Interleave the right (lower) word halves of each input pair. */
1482 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1484 out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1485 out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
1487 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
1488 #define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
1489 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1491 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1492 out0, out1, out2, out3) \
1494 ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
1495 ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
1497 #define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1498 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1500 /* Description : Interleave right half of double word elements from vectors
1501 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1502 Outputs - out0, out1, out2, out3
1503 Return Type - as per RTYPE
1504 Details : Right half of double word elements of in0 and right half of
1505 double word elements of in1 are interleaved and copied to out0.
1506 Right half of double word elements of in2 and right half of
1507 double word elements of in3 are interleaved and copied to out1.
/* Interleave the right (lower) doubleword of each input pair; 2-, 3- and
   4-pair variants. ilvr_d(a, b) places b's low doubleword in lane 0 and
   a's low doubleword in lane 1. */
1509 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1511 out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
1512 out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
1514 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
1515 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
1516 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1518 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1520 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1521 out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
1523 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1525 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1526 out0, out1, out2, out3) \
1528 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1529 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1531 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1532 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1534 /* Description : Interleave left half of double word elements from vectors
1535 Arguments : Inputs - in0, in1, in2, in3
1536 Outputs - out0, out1
1537 Return Type - as per RTYPE
1538 Details : Left half of double word elements of in0 and left half of
1539 double word elements of in1 are interleaved and copied to out0.
1540 Left half of double word elements of in2 and left half of
1541 double word elements of in3 are interleaved and copied to out1.
/* Interleave the left (upper) doubleword of each input pair. */
1543 #define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1545 out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
1546 out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
1548 #define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
1549 #define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
1550 #define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1552 /* Description : Interleave both left and right half of input vectors
1553 Arguments : Inputs - in0, in1
1554 Outputs - out0, out1
1555 Return Type - as per RTYPE
1556 Details : Right half of byte elements from 'in0' and 'in1' are
1557 interleaved and stored to 'out0'
1558 Left half of byte elements from 'in0' and 'in1' are
1559 interleaved and stored to 'out1'
/* Combined right+left interleave of one input pair: out0 gets the
   interleaved lower halves (ilvr), out1 the interleaved upper halves (ilvl).
   Byte, halfword and word element variants. */
1561 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1563 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1564 out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1566 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1567 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1568 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1569 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1570 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1572 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1574 out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1575 out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1577 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1578 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1579 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1581 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1583 out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1584 out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1586 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1587 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1588 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1590 /* Description : Maximum values between signed elements of vector and
1591 5-bit signed immediate value are copied to the output vector
1592 Arguments : Inputs - in0, in1, in2, in3, max_val
1593 Outputs - in0, in1, in2, in3 (in place)
1594 Return Type - as per RTYPE
1595 Details : Maximum of signed halfword element values from 'in0' and
1596 'max_val' are written to output vector 'in0'
/* In-place max of signed halfword lanes against a 5-bit signed immediate
   (max_val must be an immediate for __msa_maxi_s_h). */
1598 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1600 in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1601 in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1603 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1604 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1606 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1608 MAXI_SH2(RTYPE, in0, in1, max_val); \
1609 MAXI_SH2(RTYPE, in2, in3, max_val); \
1611 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1613 /* Description : Saturate the halfword element values to the max
1614 unsigned value of (sat_val+1 bits)
1615 The element data width remains unchanged
1616 Arguments : Inputs - in0, in1, in2, in3, sat_val
1617 Outputs - in0, in1, in2, in3 (in place)
1618 Return Type - as per RTYPE
1619 Details : Each unsigned halfword element from 'in0' is saturated to the
1620 value generated with (sat_val+1) bit range
1621 Results are placed back into the original vectors
/* In-place unsigned saturation of halfword lanes to (sat_val+1) bits. */
1623 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1625 in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1626 in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1628 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1629 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1631 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1633 SAT_UH2(RTYPE, in0, in1, sat_val); \
1634 SAT_UH2(RTYPE, in2, in3, sat_val); \
1636 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1638 /* Description : Saturate the halfword element values to the max
1639 signed value of (sat_val+1 bits)
1640 The element data width remains unchanged
1641 Arguments : Inputs - in0, in1, in2, in3, sat_val
1642 Outputs - in0, in1, in2, in3 (in place)
1643 Return Type - as per RTYPE
1644 Details : Each signed halfword element from 'in0' is saturated to the
1645 value generated with (sat_val+1) bit range
1646 Results are placed back into the original vectors
/* In-place SIGNED saturation of halfword lanes to (sat_val+1) bits —
   uses __msa_sat_s_h, unlike the unsigned SAT_UH family above. */
1648 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1650 in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1651 in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1653 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1655 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1657 SAT_SH2(RTYPE, in0, in1, sat_val); \
1658 in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1660 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1662 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1664 SAT_SH2(RTYPE, in0, in1, sat_val); \
1665 SAT_SH2(RTYPE, in2, in3, sat_val); \
1667 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1669 /* Description : Saturate the word element values to the max
1670 signed value of (sat_val+1 bits)
1671 The element data width remains unchanged
1672 Arguments : Inputs - in0, in1, in2, in3, sat_val
1673 Outputs - in0, in1, in2, in3 (in place)
1674 Return Type - as per RTYPE
1675 Details : Each signed word element from 'in0' is saturated to the
1676 value generated with (sat_val+1) bit range
1677 Results are placed back into the original vectors
/* In-place SIGNED saturation of word lanes to (sat_val+1) bits
   (__msa_sat_s_w). */
1679 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1681 in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1682 in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1684 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1686 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1688 SAT_SW2(RTYPE, in0, in1, sat_val); \
1689 SAT_SW2(RTYPE, in2, in3, sat_val); \
1691 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1693 /* Description : Indexed halfword element values are replicated to all
1694 elements in output vector
1695 Arguments : Inputs - in, idx0, idx1
1696 Outputs - out0, out1
1697 Return Type - as per RTYPE
1698 Details : 'idx0' element value from 'in' vector is replicated to all
1699 elements in 'out0' vector
1700 Valid index range for halfword operation is 0-7
/* Broadcast the idxN-th halfword lane of 'in' across each output vector
   (idx must be an immediate 0..7 for __msa_splati_h).
   NOTE(review): SPLATI_H3's parameter-list continuation line (out0..out2)
   is not visible in this extract. */
1702 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1704 out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1705 out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1707 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1708 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1710 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1713 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1714 out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1716 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1717 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1719 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1720 out0, out1, out2, out3) \
1722 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1723 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1725 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1726 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1728 /* Description : Indexed word element values are replicated to all
1729 elements in output vector
1730 Arguments : Inputs - in, stidx
1731 Outputs - out0, out1
1732 Return Type - as per RTYPE
1733 Details : 'stidx' element value from 'in' vector is replicated to all
1734 elements in 'out0' vector
1735 'stidx + 1' element value from 'in' vector is replicated to all
1736 elements in 'out1' vector
1737 Valid index range for word operation is 0-3
/* Broadcast word lane stidx into out0 and lane stidx+1 into out1
   (stidx and stidx+1 must resolve to immediates 0..3 for __msa_splati_w);
   SPLATI_W4 broadcasts lanes 0..3 into out0..out3. */
1739 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1741 out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1742 out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1744 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1745 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1747 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1749 SPLATI_W2(RTYPE, in, 0, out0, out1); \
1750 SPLATI_W2(RTYPE, in, 2, out2, out3); \
1752 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1753 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1755 /* Description : Pack even byte elements of vector pairs
1756 Arguments : Inputs - in0, in1, in2, in3
1757 Outputs - out0, out1
1758 Return Type - as per RTYPE
1759 Details : Even byte elements of in0 are copied to the left half of
1760 out0 & even byte elements of in1 are copied to the right
1762 Even byte elements of in2 are copied to the left half of
1763 out1 & even byte elements of in3 are copied to the right
/* Pack even byte lanes of each input pair into one vector; 2-, 3- and
   4-pair variants built on __msa_pckev_b / PCKEV_B2. */
1766 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1768 out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1769 out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1771 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1772 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1773 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1774 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1776 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1778 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1779 out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1781 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1782 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1784 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1785 out0, out1, out2, out3) \
1787 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1788 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1790 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1791 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1792 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1793 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1795 /* Description : Pack even halfword elements of vector pairs
1796 Arguments : Inputs - in0, in1, in2, in3
1797 Outputs - out0, out1
1798 Return Type - as per RTYPE
1799 Details : Even halfword elements of in0 are copied to the left half of
1800 out0 & even halfword elements of in1 are copied to the right
1802 Even halfword elements of in2 are copied to the left half of
1803 out1 & even halfword elements of in3 are copied to the right
/* Pack even halfword lanes of each input pair into one vector. */
1806 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1808 out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1809 out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1811 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1812 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1814 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1815 out0, out1, out2, out3) \
1817 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1818 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1820 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1821 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1823 /* Description : Pack even double word elements of vector pairs
1824 Arguments : Inputs - in0, in1, in2, in3
1825 Outputs - out0, out1
1826 Return Type - as per RTYPE
1827 Details : Even double elements of in0 are copied to the left half of
1828 out0 & even double elements of in1 are copied to the right
1830 Even double elements of in2 are copied to the left half of
1831 out1 & even double elements of in3 are copied to the right
1834 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1836 out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1837 out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1839 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1840 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1841 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1843 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1844 out0, out1, out2, out3) \
1846 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1847 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1849 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1851 /* Description : Pack odd double word elements of vector pairs
1852 Arguments : Inputs - in0, in1
1853 Outputs - out0, out1
1854 Return Type - as per RTYPE
1855 Details : As operation is on same input 'in0' vector, index 1 double word
1856 element is overwritten to index 0 and result is written to out0
1857 As operation is on same input 'in1' vector, index 1 double word
1858 element is overwritten to index 0 and result is written to out1
1860 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1862 out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1863 out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1865 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1866 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1867 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place
                 in 'in0'.
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and the result is stored in-place
                 in 'in1'.
                 Similar for other pairs
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type).
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Similar for other pairs
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in place to 'in0'.
                 Here, 'shift' is a GP variable passed in.
                 Similar for other pairs
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}

/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by the
                 number of bits the respective element holds in vector 'shift'
                 and the result is written in place to 'in0'.
                 Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted value
                 for rounding and the result is written in place to 'in0'.
                 Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted value
                 for rounding and the result is written in place to 'in0'.
                 Here, 'shift' is a vector passed in.
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in place
                 to 'in0'.
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)            \
{                                                   \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the immediate 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in place
                 to 'in0'.
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from
                 'in1' and the result is written to 'out0'.
                 Similar for other pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from the 2 pairs of vectors is added and the 2
                 results are written to 'out0' and 'out1'.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from the 2 pairs of vectors is subtracted and
                 the 2 results are written to 'out0' and 'out1'.
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in (input halfword vector)
                 Outputs - out (sign extended word vector)
   Return Type : signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 4 word elements, keeping the sign intact.
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in (1 input byte vector)
                 Outputs - out0, out1 (sign extended 2 halfword vectors)
   Return Type : signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is extracted
                 and interleaved right with the same vector 'in' to generate
                 8 signed halfword elements in 'out0'.
                 Then interleaved left with the same vector 'in' to generate
                 8 signed halfword elements in 'out1'.
*/
#define UNPCK_SB_SH(in, out0, out1)         \
{                                           \
    v16i8 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);  \
    ILVRL_B2_SH(tmp_m, in, out0, out1);     \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in (1 input unsigned byte vector)
                 Outputs - out0, out1 (unsigned 2 halfword vectors)
   Return Type : signed halfword
   Details     : Zero extended right half of vector is returned in 'out0';
                 zero extended left half of vector is returned in 'out1'.
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}

/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in (1 input halfword vector)
                 Outputs - out0, out1 (sign extended 2 word vectors)
   Return Type : signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 4 signed word elements in 'out0'.
                 Then interleaved left with the same vector 'in' to generate
                 4 signed word elements in 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
{                                           \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);  \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
}
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swapping of two input variables using xor
*/
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of symmetric input pairs in the
                 first half of the outputs, differences in the second half.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,           \
                    out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                     \
    out0 = in0 + in7;                                                 \
    out1 = in1 + in6;                                                 \
    out2 = in2 + in5;                                                 \
    out3 = in3 + in4;                                                 \
                                                                      \
    out4 = in3 - in4;                                                 \
    out5 = in2 - in5;                                                 \
    out6 = in1 - in6;                                                 \
    out7 = in0 - in7;                                                 \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9, in10, in11, in12, in13, in14, in15,          \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x4 byte block)
   Return Type : unsigned byte
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}

/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
   Return Type : as per RTYPE
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
   Return Type : as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
   Return Type : unsigned byte
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   Return Type : unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* the duplicated ilvod_h assignments of tmp2_m/tmp3_m present in the    \
       original were redundant (identical back-to-back stores); removed */   \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Return Type : signed halfword
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   Return Type : as per RTYPE
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Return Type : signed word
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 averaged (a + b)/2 and stored in 'tmp0_m'; likewise for the
                 pairs (in2, in3) -> tmp1_m, (in4, in5) -> tmp2_m and
                 (in6, in7) -> tmp3_m.
                 The half-vector (low 8 bytes) results from all 4 vectors are
                 stored in destination memory as an 8x4 byte block.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}

/* Description : Average byte elements from pair of vectors and store 16x4
                 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 averaged (a + b)/2 and stored in 'tmp0_m'; likewise for the
                 pairs (in2, in3) -> tmp1_m, (in4, in5) -> tmp2_m and
                 (in6, in7) -> tmp3_m.
                 The full-vector results from all 4 vectors are stored in
                 destination memory as a 16x4 byte block.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 average rounded (a + b + 1)/2 and stored in 'tp0_m'; likewise
                 for the pairs (in2, in3), (in4, in5) and (in6, in7).
                 The half-vector (low 8 bytes) results from all 4 vectors are
                 stored in destination memory as an 8x4 byte block.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}

/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 average rounded (a + b + 1)/2 and stored in 't0_m'; likewise
                 for the pairs (in2, in3), (in4, in5) and (in6, in7).
                 The full-vector results from all 4 vectors are stored in
                 destination memory as a 16x4 byte block.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m';
                 likewise for the pairs (in2, in3), (in4, in5) and (in6, in7).
                 These intermediate results are then average rounded with the
                 4 destination rows loaded from memory, and the half-vector
                 results are stored back as an 8x4 byte block.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}

/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' is
                 average rounded (a + b + 1)/2 and stored in 'tmp0_m';
                 likewise for the pairs (in2, in3), (in4, in5) and (in6, in7).
                 These intermediate results are then average rounded with the
                 4 destination rows loaded from memory, and the full-vector
                 results are stored back as a 16x4 byte block.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and then stored.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
( {                                                                     \
    v8i16 tmp1_m;                                                       \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);               \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);      \
    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);               \
    /* saturating add so the 3-term sum cannot wrap around */           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                            \
                                                                        \
    out0_m;                                                             \
} )
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                    \
                                dst0, dst1, dst2, dst3, pdst, stride)  \
{                                                                      \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
    uint8_t *pdst_m = (uint8_t *) (pdst);                              \
                                                                       \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                               \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                               \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);       \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                          \
}
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
                                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    /* round (with rounding-to-nearest) then saturate to 'shift'    \
       bits so the filtered result stays in the output range */     \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )
2838 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */