git.sesse.net Git - ffmpeg/blob - libavutil/mips/generic_macros_msa.h

   1 /*
   2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  23
  24 #include <stdint.h>
  25 #include <msa.h>
  26
  27 #define ALIGNMENT           16
  28 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
  29
  30 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
  31 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  32 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
  33
  34 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
  35 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
  36 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
  37
  38 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
  39 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
  40 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
  41
  42 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  43 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  44 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
  45
  46 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  47 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
  48 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
  49
  50 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  51 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
  52 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  53
  54 #if (__mips_isa_rev >= 6)
  55     #define LW(psrc)                           \
  56     ( {                                        \
  57         uint8_t *psrc_m = (uint8_t *) (psrc);  \
  58         uint32_t val_m;                        \
  59                                                \
  60         __asm__ volatile (                     \
  61             "lw  %[val_m],  %[psrc_m]  \n\t"   \
  62                                                \
  63             : [val_m] "=r" (val_m)             \
  64             : [psrc_m] "m" (*psrc_m)           \
  65         );                                     \
  66                                                \
  67         val_m;                                 \
  68     } )
  69
  70     #if (__mips == 64)
  71         #define LD(psrc)                           \
  72         ( {                                        \
  73             uint8_t *psrc_m = (uint8_t *) (psrc);  \
  74             uint64_t val_m = 0;                    \
  75                                                    \
  76             __asm__ volatile (                     \
  77                 "ld  %[val_m],  %[psrc_m]  \n\t"   \
  78                                                    \
  79                 : [val_m] "=r" (val_m)             \
  80                 : [psrc_m] "m" (*psrc_m)           \
  81             );                                     \
  82                                                    \
  83             val_m;                                 \
  84         } )
  85     #else  // !(__mips == 64)
  86         #define LD(psrc)                                              \
  87         ( {                                                           \
  88             uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
  89             uint32_t val0_m, val1_m;                                  \
  90             uint64_t val_m = 0;                                       \
  91                                                                       \
  92             val0_m = LW(psrc_ld_m);                                   \
  93             val1_m = LW(psrc_ld_m + 4);                               \
  94                                                                       \
  95             val_m = (uint64_t) (val1_m);                              \
  96             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
  97             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
  98                                                                       \
  99             val_m;                                                    \
 100         } )
 101     #endif  // (__mips == 64)
 102
 103     #define SH(val, pdst)                      \
 104     {                                          \
 105         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 106         uint16_t val_m = (val);                \
 107                                                \
 108         __asm__ volatile (                     \
 109             "sh  %[val_m],  %[pdst_m]  \n\t"   \
 110                                                \
 111             : [pdst_m] "=m" (*pdst_m)          \
 112             : [val_m] "r" (val_m)              \
 113         );                                     \
 114     }
 115
 116     #define SW(val, pdst)                      \
 117     {                                          \
 118         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 119         uint32_t val_m = (val);                \
 120                                                \
 121         __asm__ volatile (                     \
 122             "sw  %[val_m],  %[pdst_m]  \n\t"   \
 123                                                \
 124             : [pdst_m] "=m" (*pdst_m)          \
 125             : [val_m] "r" (val_m)              \
 126         );                                     \
 127     }
 128
 129     #define SD(val, pdst)                      \
 130     {                                          \
 131         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 132         uint64_t val_m = (val);                \
 133                                                \
 134         __asm__ volatile (                     \
 135             "sd  %[val_m],  %[pdst_m]  \n\t"   \
 136                                                \
 137             : [pdst_m] "=m" (*pdst_m)          \
 138             : [val_m] "r" (val_m)              \
 139         );                                     \
 140     }
 141 #else  // !(__mips_isa_rev >= 6)
 142     #define LW(psrc)                           \
 143     ( {                                        \
 144         uint8_t *psrc_m = (uint8_t *) (psrc);  \
 145         uint32_t val_m;                        \
 146                                                \
 147         __asm__ volatile (                     \
 148             "ulw  %[val_m],  %[psrc_m]  \n\t"  \
 149                                                \
 150             : [val_m] "=r" (val_m)             \
 151             : [psrc_m] "m" (*psrc_m)           \
 152         );                                     \
 153                                                \
 154         val_m;                                 \
 155     } )
 156
 157     #if (__mips == 64)
 158         #define LD(psrc)                           \
 159         ( {                                        \
 160             uint8_t *psrc_m = (uint8_t *) (psrc);  \
 161             uint64_t val_m = 0;                    \
 162                                                    \
 163             __asm__ volatile (                     \
 164                 "uld  %[val_m],  %[psrc_m]  \n\t"  \
 165                                                    \
 166                 : [val_m] "=r" (val_m)             \
 167                 : [psrc_m] "m" (*psrc_m)           \
 168             );                                     \
 169                                                    \
 170             val_m;                                 \
 171         } )
 172     #else  // !(__mips == 64)
 173         #define LD(psrc)                                              \
 174         ( {                                                           \
 175             uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
 176             uint32_t val0_m, val1_m;                                  \
 177             uint64_t val_m = 0;                                       \
 178                                                                       \
 179             val0_m = LW(psrc_ld_m);                                   \
 180             val1_m = LW(psrc_ld_m + 4);                               \
 181                                                                       \
 182             val_m = (uint64_t) (val1_m);                              \
 183             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
 184             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
 185                                                                       \
 186             val_m;                                                    \
 187         } )
 188     #endif  // (__mips == 64)
 189
 190     #define SH(val, pdst)                      \
 191     {                                          \
 192         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 193         uint16_t val_m = (val);                \
 194                                                \
 195         __asm__ volatile (                     \
 196             "ush  %[val_m],  %[pdst_m]  \n\t"  \
 197                                                \
 198             : [pdst_m] "=m" (*pdst_m)          \
 199             : [val_m] "r" (val_m)              \
 200         );                                     \
 201     }
 202
 203     #define SW(val, pdst)                      \
 204     {                                          \
 205         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 206         uint32_t val_m = (val);                \
 207                                                \
 208         __asm__ volatile (                     \
 209             "usw  %[val_m],  %[pdst_m]  \n\t"  \
 210                                                \
 211             : [pdst_m] "=m" (*pdst_m)          \
 212             : [val_m] "r" (val_m)              \
 213         );                                     \
 214     }
 215
 216     #define SD(val, pdst)                                          \
 217     {                                                              \
 218         uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
 219         uint32_t val0_m, val1_m;                                   \
 220                                                                    \
 221         val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
 222         val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
 223                                                                    \
 224         SW(val0_m, pdst_m1);                                       \
 225         SW(val1_m, pdst_m1 + 4);                                   \
 226     }
 227 #endif // (__mips_isa_rev >= 6)
 228
 229 /* Description : Load 4 words with stride
 230    Arguments   : Inputs  - psrc    (source pointer to load from)
 231                          - stride
 232                  Outputs - out0, out1, out2, out3
 233    Details     : Loads word in 'out0' from (psrc)
 234                  Loads word in 'out1' from (psrc + stride)
 235                  Loads word in 'out2' from (psrc + 2 * stride)
 236                  Loads word in 'out3' from (psrc + 3 * stride)
 237 */
 238 #define LW4(psrc, stride, out0, out1, out2, out3)  \
 239 {                                                  \
 240     out0 = LW((psrc));                             \
 241     out1 = LW((psrc) + stride);                    \
 242     out2 = LW((psrc) + 2 * stride);                \
 243     out3 = LW((psrc) + 3 * stride);                \
 244 }
 245
 246 /* Description : Load double words with stride
 247    Arguments   : Inputs  - psrc    (source pointer to load from)
 248                          - stride
 249                  Outputs - out0, out1
 250    Details     : Loads double word in 'out0' from (psrc)
 251                  Loads double word in 'out1' from (psrc + stride)
 252 */
 253 #define LD2(psrc, stride, out0, out1)  \
 254 {                                      \
 255     out0 = LD((psrc));                 \
 256     out1 = LD((psrc) + stride);        \
 257 }
 258 #define LD4(psrc, stride, out0, out1, out2, out3)  \
 259 {                                                  \
 260     LD2((psrc), stride, out0, out1);               \
 261     LD2((psrc) + 2 * stride, stride, out2, out3);  \
 262 }
 263
 264 /* Description : Store 4 words with stride
 265    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 266    Details     : Stores word from 'in0' to (pdst)
 267                  Stores word from 'in1' to (pdst + stride)
 268                  Stores word from 'in2' to (pdst + 2 * stride)
 269                  Stores word from 'in3' to (pdst + 3 * stride)
 270 */
 271 #define SW4(in0, in1, in2, in3, pdst, stride)  \
 272 {                                              \
 273     SW(in0, (pdst))                            \
 274     SW(in1, (pdst) + stride);                  \
 275     SW(in2, (pdst) + 2 * stride);              \
 276     SW(in3, (pdst) + 3 * stride);              \
 277 }
 278
 279 /* Description : Store 4 double words with stride
 280    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 281    Details     : Stores double word from 'in0' to (pdst)
 282                  Stores double word from 'in1' to (pdst + stride)
 283                  Stores double word from 'in2' to (pdst + 2 * stride)
 284                  Stores double word from 'in3' to (pdst + 3 * stride)
 285 */
 286 #define SD4(in0, in1, in2, in3, pdst, stride)  \
 287 {                                              \
 288     SD(in0, (pdst))                            \
 289     SD(in1, (pdst) + stride);                  \
 290     SD(in2, (pdst) + 2 * stride);              \
 291     SD(in3, (pdst) + 3 * stride);              \
 292 }
 293
 294 /* Description : Load vectors with 16 byte elements with stride
 295    Arguments   : Inputs  - psrc    (source pointer to load from)
 296                          - stride
 297                  Outputs - out0, out1
 298                  Return Type - as per RTYPE
 299    Details     : Loads 16 byte elements in 'out0' from (psrc)
 300                  Loads 16 byte elements in 'out1' from (psrc + stride)
 301 */
 302 #define LD_B2(RTYPE, psrc, stride, out0, out1)  \
 303 {                                               \
 304     out0 = LD_B(RTYPE, (psrc));                 \
 305     out1 = LD_B(RTYPE, (psrc) + stride);        \
 306 }
 307 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 308 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 309
 310 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
 311 {                                                     \
 312     LD_B2(RTYPE, (psrc), stride, out0, out1);         \
 313     out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 314 }
 315 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
 316 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
 317
 318 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 319 {                                                            \
 320     LD_B2(RTYPE, (psrc), stride, out0, out1);                \
 321     LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 322 }
 323 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 324 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 325
 326 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 327 {                                                                 \
 328     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
 329     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
 330 }
 331 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
 332 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
 333
 334 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 335 {                                                                       \
 336     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 337     LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 338 }
 339 #define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
 340 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
 341
 342 #define LD_B7(RTYPE, psrc, stride,                               \
 343               out0, out1, out2, out3, out4, out5, out6)          \
 344 {                                                                \
 345     LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
 346     LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
 347 }
 348 #define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
 349 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
 350
 351 #define LD_B8(RTYPE, psrc, stride,                                      \
 352               out0, out1, out2, out3, out4, out5, out6, out7)           \
 353 {                                                                       \
 354     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 355     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 356 }
 357 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
 358 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 359
 360 /* Description : Load vectors with 8 halfword elements with stride
 361    Arguments   : Inputs  - psrc    (source pointer to load from)
 362                          - stride
 363                  Outputs - out0, out1
 364    Details     : Loads 8 halfword elements in 'out0' from (psrc)
 365                  Loads 8 halfword elements in 'out1' from (psrc + stride)
 366 */
 367 #define LD_H2(RTYPE, psrc, stride, out0, out1)  \
 368 {                                               \
 369     out0 = LD_H(RTYPE, (psrc));                 \
 370     out1 = LD_H(RTYPE, (psrc) + (stride));      \
 371 }
 372 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 373 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 374
 375 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
 376 {                                                           \
 377     LD_H2(RTYPE, (psrc), stride, out0, out1);               \
 378     LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
 379 }
 380 #define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
 381 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 382
 383 #define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 384 {                                                                       \
 385     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 386     LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 387 }
 388 #define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
 389 #define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
 390
 391 #define LD_H8(RTYPE, psrc, stride,                                      \
 392               out0, out1, out2, out3, out4, out5, out6, out7)           \
 393 {                                                                       \
 394     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 395     LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 396 }
 397 #define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
 398 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
 399
 400 #define LD_H16(RTYPE, psrc, stride,                                   \
 401                out0, out1, out2, out3, out4, out5, out6, out7,        \
 402                out8, out9, out10, out11, out12, out13, out14, out15)  \
 403 {                                                                     \
 404     LD_H8(RTYPE, (psrc), stride,                                      \
 405           out0, out1, out2, out3, out4, out5, out6, out7);            \
 406     LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
 407           out8, out9, out10, out11, out12, out13, out14, out15);      \
 408 }
 409 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
 410
 411 /* Description : Load as 4x4 block of signed halfword elements from 1D source
 412                  data into 4 vectors (Each vector with 4 signed halfwords)
 413    Arguments   : Inputs  - psrc
 414                  Outputs - out0, out1, out2, out3
 415 */
 416 #define LD4x4_SH(psrc, out0, out1, out2, out3)                \
 417 {                                                             \
 418     out0 = LD_SH(psrc);                                       \
 419     out2 = LD_SH(psrc + 8);                                   \
 420     out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
 421     out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
 422 }
 423
 424 /* Description : Load 2 vectors of signed word elements with stride
 425    Arguments   : Inputs  - psrc    (source pointer to load from)
 426                          - stride
 427                  Outputs - out0, out1
 428                  Return Type - signed word
 429 */
 430 #define LD_SW2(psrc, stride, out0, out1)  \
 431 {                                         \
 432     out0 = LD_SW((psrc));                 \
 433     out1 = LD_SW((psrc) + stride);        \
 434 }
 435
 436 /* Description : Store vectors of 16 byte elements with stride
 437    Arguments   : Inputs  - in0, in1, stride
 438                  Outputs - pdst    (destination pointer to store to)
 439    Details     : Stores 16 byte elements from 'in0' to (pdst)
 440                  Stores 16 byte elements from 'in1' to (pdst + stride)
 441 */
 442 #define ST_B2(RTYPE, in0, in1, pdst, stride)  \
 443 {                                             \
 444     ST_B(RTYPE, in0, (pdst));                 \
 445     ST_B(RTYPE, in1, (pdst) + stride);        \
 446 }
 447 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 448 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
 449
 450 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 451 {                                                         \
 452     ST_B2(RTYPE, in0, in1, (pdst), stride);               \
 453     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 454 }
 455 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 456 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
 457
 458 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
 459               pdst, stride)                                         \
 460 {                                                                   \
 461     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
 462     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
 463 }
 464 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 465
 466 /* Description : Store vectors of 8 halfword elements with stride
 467    Arguments   : Inputs  - in0, in1, stride
 468                  Outputs - pdst    (destination pointer to store to)
 469    Details     : Stores 8 halfword elements from 'in0' to (pdst)
 470                  Stores 8 halfword elements from 'in1' to (pdst + stride)
 471 */
 472 #define ST_H2(RTYPE, in0, in1, pdst, stride)  \
 473 {                                             \
 474     ST_H(RTYPE, in0, (pdst));                 \
 475     ST_H(RTYPE, in1, (pdst) + stride);        \
 476 }
 477 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
 478 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 479
 480 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 481 {                                                         \
 482     ST_H2(RTYPE, in0, in1, (pdst), stride);               \
 483     ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 484 }
 485 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
 486
 487 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
 488 {                                                                 \
 489     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
 490     ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
 491 }
 492 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
 493
 494 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 495 {                                                                           \
 496     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
 497     ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 498 }
 499 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
 500
 501 /* Description : Store vectors of word elements with stride
 502    Arguments   : Inputs  - in0, in1, stride
 503                  Outputs - pdst    (destination pointer to store to)
 504                  Return Type - signed word
 505    Details     : Stores 4 word elements from 'in0' to (pdst)
 506                  Stores 4 word elements from 'in1' to (pdst + stride)
 507 */
 508 #define ST_SW2(in0, in1, pdst, stride)  \
 509 {                                       \
 510     ST_SW(in0, (pdst));                 \
 511     ST_SW(in1, (pdst) + stride);        \
 512 }
 513 #define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,  \
 514                pdst, stride)                            \
 515 {                                                       \
 516     ST_SW2(in0, in1, (pdst), stride);                   \
 517     ST_SW2(in2, in3, (pdst) + 2 * stride, stride);      \
 518     ST_SW2(in4, in5, (pdst) + 4 * stride, stride);      \
 519     ST_SW2(in6, in7, (pdst) + 6 * stride, stride);      \
 520 }
 521
 522 /* Description : Store as 2x4 byte block to destination memory from input vector
 523    Arguments   : Inputs  - in, stidx, pdst, stride
 524                  Return Type - unsigned byte
 525    Details     : Index stidx halfword element from 'in' vector is copied and
 526                  stored on first line
 527                  Index stidx+1 halfword element from 'in' vector is copied and
 528                  stored on second line
 529                  Index stidx+2 halfword element from 'in' vector is copied and
 530                  stored on third line
 531                  Index stidx+3 halfword element from 'in' vector is copied and
 532                  stored on fourth line
 533 */
 534 #define ST2x4_UB(in, stidx, pdst, stride)              \
 535 {                                                      \
 536     uint16_t out0_m, out1_m, out2_m, out3_m;           \
 537     uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
 538                                                        \
 539     out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
 540     out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
 541     out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
 542     out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
 543                                                        \
 544     SH(out0_m, pblk_2x4_m);                            \
 545     SH(out1_m, pblk_2x4_m + stride);                   \
 546     SH(out2_m, pblk_2x4_m + 2 * stride);               \
 547     SH(out3_m, pblk_2x4_m + 3 * stride);               \
 548 }
 549
 550 /* Description : Store as 4x2 byte block to destination memory from input vector
 551    Arguments   : Inputs  - in, pdst, stride
 552                  Return Type - unsigned byte
 553    Details     : Index 0 word element from input vector is copied and stored
 554                  on first line
 555                  Index 1 word element from input vector is copied and stored
 556                  on second line
 557 */
 558 #define ST4x2_UB(in, pdst, stride)             \
 559 {                                              \
 560     uint32_t out0_m, out1_m;                   \
 561     uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
 562                                                \
 563     out0_m = __msa_copy_u_w((v4i32) in, 0);    \
 564     out1_m = __msa_copy_u_w((v4i32) in, 1);    \
 565                                                \
 566     SW(out0_m, pblk_4x2_m);                    \
 567     SW(out1_m, pblk_4x2_m + stride);           \
 568 }
 569
 570 /* Description : Store as 4x4 byte block to destination memory from input vector
 571    Arguments   : Inputs  - in0, in1, pdst, stride
 572                  Return Type - unsigned byte
 573    Details     : Idx0 word element from input vector 'in0' is copied and stored
 574                  on first line
 575                  Idx1 word element from input vector 'in0' is copied and stored
 576                  on second line
 577                  Idx2 word element from input vector 'in1' is copied and stored
 578                  on third line
 579                  Idx3 word element from input vector 'in1' is copied and stored
 580                  on fourth line
 581 */
 582 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
 583 {                                                                 \
 584     uint32_t out0_m, out1_m, out2_m, out3_m;                      \
 585     uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
 586                                                                   \
 587     out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
 588     out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
 589     out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
 590     out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
 591                                                                   \
 592     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 593 }
 594 #define ST4x8_UB(in0, in1, pdst, stride)                            \
 595 {                                                                   \
 596     uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
 597                                                                     \
 598     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
 599     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
 600 }
 601
 602 /* Description : Store as 6x4 byte block to destination memory from input
 603                  vectors
 604    Arguments   : Inputs  - in0, in1, pdst, stride
 605                  Return Type - unsigned byte
 606    Details     : Index 0 word element from input vector 'in0' is copied and
 607                  stored on first line followed by index 2 halfword element
 608                  Index 2 word element from input vector 'in0' is copied and
 609                  stored on second line followed by index 2 halfword element
 610                  Index 0 word element from input vector 'in1' is copied and
 611                  stored on third line followed by index 2 halfword element
 612                  Index 2 word element from input vector 'in1' is copied and
 613                  stored on fourth line followed by index 2 halfword element
 614 */
 615 #define ST6x4_UB(in0, in1, pdst, stride)       \
 616 {                                              \
 617     uint32_t out0_m, out1_m, out2_m, out3_m;   \
 618     uint16_t out4_m, out5_m, out6_m, out7_m;   \
 619     uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
 620                                                \
 621     out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
 622     out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
 623     out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
 624     out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
 625                                                \
 626     out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
 627     out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
 628     out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
 629     out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
 630                                                \
 631     SW(out0_m, pblk_6x4_m);                    \
 632     SH(out4_m, (pblk_6x4_m + 4));              \
 633     pblk_6x4_m += stride;                      \
 634     SW(out1_m, pblk_6x4_m);                    \
 635     SH(out5_m, (pblk_6x4_m + 4));              \
 636     pblk_6x4_m += stride;                      \
 637     SW(out2_m, pblk_6x4_m);                    \
 638     SH(out6_m, (pblk_6x4_m + 4));              \
 639     pblk_6x4_m += stride;                      \
 640     SW(out3_m, pblk_6x4_m);                    \
 641     SH(out7_m, (pblk_6x4_m + 4));              \
 642 }
 643
 644 /* Description : Store as 8x1 byte block to destination memory from input vector
 645    Arguments   : Inputs  - in, pdst
 646    Details     : Index 0 double word element from input vector 'in' is copied
 647                  and stored to destination memory at (pdst)
 648 */
 649 #define ST8x1_UB(in, pdst)                   \
 650 {                                            \
 651     uint64_t out0_m;                         \
 652     out0_m = __msa_copy_u_d((v2i64) in, 0);  \
 653     SD(out0_m, pdst);                        \
 654 }
 655
 656 /* Description : Store as 8x2 byte block to destination memory from input vector
 657    Arguments   : Inputs  - in, pdst, stride
 658    Details     : Index 0 double word element from input vector 'in' is copied
 659                  and stored to destination memory at (pdst)
 660                  Index 1 double word element from input vector 'in' is copied
 661                  and stored to destination memory at (pdst + stride)
 662 */
 663 #define ST8x2_UB(in, pdst, stride)             \
 664 {                                              \
 665     uint64_t out0_m, out1_m;                   \
 666     uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
 667                                                \
 668     out0_m = __msa_copy_u_d((v2i64) in, 0);    \
 669     out1_m = __msa_copy_u_d((v2i64) in, 1);    \
 670                                                \
 671     SD(out0_m, pblk_8x2_m);                    \
 672     SD(out1_m, pblk_8x2_m + stride);           \
 673 }
 674
 675 /* Description : Store as 8x4 byte block to destination memory from input
 676                  vectors
 677    Arguments   : Inputs  - in0, in1, pdst, stride
 678    Details     : Index 0 double word element from input vector 'in0' is copied
 679                  and stored to destination memory at (pblk_8x4_m)
 680                  Index 1 double word element from input vector 'in0' is copied
 681                  and stored to destination memory at (pblk_8x4_m + stride)
 682                  Index 0 double word element from input vector 'in1' is copied
 683                  and stored to destination memory at (pblk_8x4_m + 2 * stride)
 684                  Index 1 double word element from input vector 'in1' is copied
 685                  and stored to destination memory at (pblk_8x4_m + 3 * stride)
 686 */
 687 #define ST8x4_UB(in0, in1, pdst, stride)                      \
 688 {                                                             \
 689     uint64_t out0_m, out1_m, out2_m, out3_m;                  \
 690     uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
 691                                                               \
 692     out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
 693     out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
 694     out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
 695     out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
 696                                                               \
 697     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 698 }
 699 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
 700 {                                                         \
 701     uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
 702                                                           \
 703     ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
 704     ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 705 }
 706 #define ST12x4_UB(in0, in1, in2, pdst, stride)                \
 707 {                                                             \
 708     uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
 709                                                               \
 710     /* left 8x4 */                                            \
 711     ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
 712     /* right 4x4 */                                           \
 713     ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
 714 }
 715
 716 /* Description : Store as 12x8 byte block to destination memory from
 717                  input vectors
 718    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 719    Details     : Index 0 double word element from input vector 'in0' is copied
 720                  and stored to destination memory at (pblk_12x8_m) followed by
 721                  index 2 word element from same input vector 'in0' at
 722                  (pblk_12x8_m + 8)
 723                  Similar to remaining lines
 724 */
 725 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 726 {                                                                        \
 727     uint64_t out0_m, out1_m, out2_m, out3_m;                             \
 728     uint64_t out4_m, out5_m, out6_m, out7_m;                             \
 729     uint32_t out8_m, out9_m, out10_m, out11_m;                           \
 730     uint32_t out12_m, out13_m, out14_m, out15_m;                         \
 731     uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
 732                                                                          \
 733     out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
 734     out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
 735     out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
 736     out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
 737     out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
 738     out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
 739     out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
 740     out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
 741                                                                          \
 742     out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
 743     out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
 744     out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
 745     out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
 746     out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
 747     out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
 748     out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
 749     out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
 750                                                                          \
 751     SD(out0_m, pblk_12x8_m);                                             \
 752     SW(out8_m, pblk_12x8_m + 8);                                         \
 753     pblk_12x8_m += stride;                                               \
 754     SD(out1_m, pblk_12x8_m);                                             \
 755     SW(out9_m, pblk_12x8_m + 8);                                         \
 756     pblk_12x8_m += stride;                                               \
 757     SD(out2_m, pblk_12x8_m);                                             \
 758     SW(out10_m, pblk_12x8_m + 8);                                        \
 759     pblk_12x8_m += stride;                                               \
 760     SD(out3_m, pblk_12x8_m);                                             \
 761     SW(out11_m, pblk_12x8_m + 8);                                        \
 762     pblk_12x8_m += stride;                                               \
 763     SD(out4_m, pblk_12x8_m);                                             \
 764     SW(out12_m, pblk_12x8_m + 8);                                        \
 765     pblk_12x8_m += stride;                                               \
 766     SD(out5_m, pblk_12x8_m);                                             \
 767     SW(out13_m, pblk_12x8_m + 8);                                        \
 768     pblk_12x8_m += stride;                                               \
 769     SD(out6_m, pblk_12x8_m);                                             \
 770     SW(out14_m, pblk_12x8_m + 8);                                        \
 771     pblk_12x8_m += stride;                                               \
 772     SD(out7_m, pblk_12x8_m);                                             \
 773     SW(out15_m, pblk_12x8_m + 8);                                        \
 774 }
 775
 776 /* Description : average with rounding (in0 + in1 + 1) / 2.
 777    Arguments   : Inputs  - in0, in1, in2, in3,
 778                  Outputs - out0, out1
 779                  Return Type - signed byte
 780    Details     : Each byte element from 'in0' vector is added with each byte
 781                  element from 'in1' vector. The addition of the elements plus 1
 782                 (for rounding) is done unsigned with full precision,
 783                 i.e. the result has one extra bit. Unsigned division by 2
 784                 (or logical shift right by one bit) is performed before writing
 785                 the result to vector 'out0'
 786                 Similar for the pair of 'in2' and 'in3'
 787 */
 788 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
 789 {                                                             \
 790     out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
 791     out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
 792 }
 793 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 794
 795 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
 796                  out0, out1, out2, out3)                        \
 797 {                                                               \
 798     AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)             \
 799     AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)             \
 800 }
 801 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
 802
 803 /* Description : Immediate number of columns to slide with zero
 804    Arguments   : Inputs  - in0, in1, slide_val
 805                  Outputs - out0, out1
 806                  Return Type - as per RTYPE
 807    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
 808                  number of elements specified by 'slide_val'
 809 */
 810 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 811 {                                                                         \
 812     v16i8 zero_m = { 0 };                                                 \
 813     out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
 814     out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 815 }
 816 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 817 #define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
 818 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
 819
 820 #define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)     \
 821 {                                                                         \
 822     v16i8 zero_m = { 0 };                                                 \
 823     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
 824     out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
 825 }
 826 #define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
 827 #define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)
 828
 829 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
 830                   out0, out1, out2, out3, slide_val)    \
 831 {                                                       \
 832     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
 833     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
 834 }
 835 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
 836 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
 837 #define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
 838
 839 /* Description : Immediate number of columns to slide
 840    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
 841                  Outputs - out0, out1
 842                  Return Type - as per RTYPE
 843    Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
 844                  number of elements specified by 'slide_val'
 845 */
 846 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
 847 {                                                                          \
 848     out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
 849     out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
 850 }
 851 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
 852 #define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
 853 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
 854
 855 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
 856                 out0, out1, out2, slide_val)                               \
 857 {                                                                          \
 858     SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
 859     out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
 860 }
 861 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
 862 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
 863
 864 /* Description : Shuffle byte vector elements as per mask vector
 865    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 866                  Outputs - out0, out1
 867                  Return Type - as per RTYPE
 868    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 869                  per control vector mask0
 870                  Selective byte elements from in2 & in3 are copied to out1 as
 871                  per control vector mask1
 872 */
 873 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 874 {                                                                          \
 875     out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
 876     out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 877 }
 878 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 879 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 880 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 881 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 882
 883 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 884                 out0, out1, out2)                                          \
 885 {                                                                          \
 886     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 887     out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 888 }
 889 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
 890
 891 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
 892                 out0, out1, out2, out3)                            \
 893 {                                                                  \
 894     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
 895     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
 896 }
 897 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
 898 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
 899
 900 /* Description : Shuffle halfword vector elements as per mask vector
 901    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 902                  Outputs - out0, out1
 903                  Return Type - as per RTYPE
 904    Details     : Selective halfword elements from in0 & in1 are copied to out0
 905                  as per control vector mask0
 906                  Selective halfword elements from in2 & in3 are copied to out1
 907                  as per control vector mask1
 908 */
 909 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 910 {                                                                          \
 911     out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
 912     out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
 913 }
 914 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
 915
 916 #define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 917                 out0, out1, out2)                                          \
 918 {                                                                          \
 919     VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 920     out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
 921 }
 922 #define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
 923
 924 /* Description : Shuffle byte vector elements as per mask vector
 925    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 926                  Outputs - out0, out1
 927                  Return Type - as per RTYPE
 928    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 929                  per control vector mask0
 930                  Selective byte elements from in2 & in3 are copied to out1 as
 931                  per control vector mask1
 932 */
 933 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 934 {                                                                         \
 935     out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
 936     out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 937 }
 938 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
 939
 940 /* Description : Dot product of byte vector elements
 941    Arguments   : Inputs  - mult0, mult1
 942                            cnst0, cnst1
 943                  Outputs - out0, out1
 944                  Return Type - unsigned halfword
 945    Details     : Unsigned byte elements from mult0 are multiplied with
 946                  unsigned byte elements from cnst0 producing a result
 947                  twice the size of input i.e. unsigned halfword.
 948                  Then this multiplication results of adjacent odd-even elements
 949                  are added together and stored to the out vector
 950                  (2 unsigned halfword results)
 951 */
 952 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 953 {                                                                 \
 954     out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
 955     out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
 956 }
 957 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
 958
 959 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
 960                  cnst0, cnst1, cnst2, cnst3,                  \
 961                  out0, out1, out2, out3)                      \
 962 {                                                             \
 963     DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
 964     DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
 965 }
 966 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
 967
 968 /* Description : Dot product of byte vector elements
 969    Arguments   : Inputs  - mult0, mult1
 970                            cnst0, cnst1
 971                  Outputs - out0, out1
 972                  Return Type - signed halfword
 973    Details     : Signed byte elements from mult0 are multiplied with
 974                  signed byte elements from cnst0 producing a result
 975                  twice the size of input i.e. signed halfword.
 976                  Then this multiplication results of adjacent odd-even elements
 977                  are added together and stored to the out vector
 978                  (2 signed halfword results)
 979 */
 980 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 981 {                                                                 \
 982     out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
 983     out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 984 }
 985 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 986
 987 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
 988                  out0, out1, out2)                                 \
 989 {                                                                  \
 990     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
 991     out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 992 }
 993 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
 994
 995 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 996                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 997 {                                                                     \
 998     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 999     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
1000 }
1001 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
1002
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 The products of adjacent odd-even elements are then added
                 together and stored to the out vector
                 (2 signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) (mult0), (v8i16) (cnst0));  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) (mult1), (v8i16) (cnst1));  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
1021
/* Description : Dot product of signed halfword vector elements, four pairs
   Arguments   : Inputs  - mult0, mult1, mult2, mult3
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : DOTP_SH2 is applied to (mult0, mult1, cnst0, cnst1) and then
                 to (mult2, mult3, cnst2, cnst3)
*/
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,             \
                 cnst0, cnst1, cnst2, cnst3,                    \
                 out0, out1, out2, out3)                        \
{                                                               \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);    \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);    \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
1030
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 The products of adjacent odd-even elements are then added
                 to the out vector
                 (2 signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) (out0),                     \
                                   (v16i8) (mult0), (v16i8) (cnst0));  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) (out1),                     \
                                   (v16i8) (mult1), (v16i8) (cnst1));  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
1051
/* Description : Dot product & addition of signed byte vector elements,
                 four pairs
   Arguments   : Inputs  - mult0, mult1, mult2, mult3
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : DPADD_SB2 is applied to (mult0, mult1, cnst0, cnst1) and then
                 to (mult2, mult3, cnst2, cnst3)
*/
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,             \
                  cnst0, cnst1, cnst2, cnst3,                    \
                  out0, out1, out2, out3)                        \
{                                                                \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);    \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);    \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
1059
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 The products of adjacent odd-even elements are then added
                 to the out vector
                 (2 unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) (out0),                     \
                                   (v16u8) (mult0), (v16u8) (cnst0));  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) (out1),                     \
                                   (v16u8) (mult1), (v16u8) (cnst1));  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
1080
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0 producing a result
                 twice the size of input i.e. signed word.
                 The products of adjacent odd-even elements are then added
                 to the out vector
                 (2 signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) (out0),                     \
                                   (v8i16) (mult0), (v8i16) (cnst0));  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) (out1),                     \
                                   (v8i16) (mult1), (v8i16) (cnst1));  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
1101
/* Description : Dot product & addition of signed halfword vector elements,
                 four pairs
   Arguments   : Inputs  - mult0, mult1, mult2, mult3
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : DPADD_SH2 is applied to (mult0, mult1, cnst0, cnst1) and then
                 to (mult2, mult3, cnst2, cnst3)
*/
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,             \
                  cnst0, cnst1, cnst2, cnst3,                    \
                  out0, out1, out2, out3)                        \
{                                                                \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);    \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);    \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
1109
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
                 Similar for 'in1'
                 'min_vec' is cast to v8u16 like every other vector operand,
                 so any 128-bit vector type can be supplied
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)                             \
{                                                                     \
    in0 = (RTYPE) __msa_min_u_h((v8u16) (in0), (v8u16) (min_vec));    \
    in1 = (RTYPE) __msa_min_u_h((v8u16) (in1), (v8u16) (min_vec));    \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
1124
/* Description : Minimum of unsigned halfword elements against 'min_vec' for
                 four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, min_vec
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : MIN_UH2 is applied to (in0, in1) and then to (in2, in3)
*/
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)    \
{                                                      \
    MIN_UH2(RTYPE, in0, in1, min_vec);                 \
    MIN_UH2(RTYPE, in2, in3, min_vec);                 \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
1131
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in       (input vector)
                         - min      (min threshold)
                         - max      (max threshold)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : The signed maximum of 'min' and 'in' is taken first, then the
                 signed minimum of 'max' and that intermediate result
                 The macro is a statement expression yielding 'out_m'
*/
#define CLIP_SH(in, min, max)                             \
( {                                                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in));   \
    out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m);  \
    out_m;                                                \
} )
1148
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : The signed maximum of 'in' and immediate 0 is taken first,
                 then the signed minimum of that result and a vector loaded
                 with 255
                 The macro is a statement expression yielding 'out_m'
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) (in), 0);              \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
/* Description : Clips all signed halfword elements of two vectors
                 between 0 & 255
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in place)
   Details     : Each vector is replaced by the result of CLIP_SH_0_255
                 applied to it
*/
#define CLIP_SH2_0_255(in0, in1)    \
{                                   \
    in0 = CLIP_SH_0_255(in0);       \
    in1 = CLIP_SH_0_255(in1);       \
}
/* Description : Clips all signed halfword elements of four vectors
                 between 0 & 255
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : CLIP_SH2_0_255 is applied to (in0, in1) and then to
                 (in2, in3)
*/
#define CLIP_SH4_0_255(in0, in1, in2, in3)    \
{                                             \
    CLIP_SH2_0_255(in0, in1);                 \
    CLIP_SH2_0_255(in2, in3);                 \
}
1174
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed word
   Details     : The signed maximum of 'in' and immediate 0 is taken first,
                 then the signed minimum of that result and a vector loaded
                 with 255
                 The macro is a statement expression yielding 'out_m'
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) (in), 0);              \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
1190
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 resulted integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : Adjacent word pairs are added into two double word sums,
                 the two double words are added together and word element 0
                 of that result is returned.
                 'in' is expanded twice, so it must be free of side effects.
*/
#define HADD_SW_S32(in)                                   \
( {                                                       \
    v2i64 res0_m, res1_m;                                 \
    int32_t sum_m;                                        \
                                                          \
    res0_m = __msa_hadd_s_d((v4i32) (in), (v4i32) (in));  \
    res1_m = __msa_splati_d(res0_m, 1);                   \
    res0_m = res0_m + res1_m;                             \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);            \
    sum_m;                                                \
} )
1209
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and resulted integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Adjacent halfword pairs are added into words, adjacent word
                 pairs into double words, the two double words are added
                 together and word element 0 of that result is returned.
                 'in' is expanded twice, so it must be free of side effects.
*/
#define HADD_UH_U32(in)                                   \
( {                                                       \
    v4u32 res_m;                                          \
    v2u64 res0_m, res1_m;                                 \
    uint32_t sum_m;                                       \
                                                          \
    res_m = __msa_hadd_u_w((v8u16) (in), (v8u16) (in));   \
    res0_m = __msa_hadd_u_d(res_m, res_m);                \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);   \
    res0_m = res0_m + res1_m;                             \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);            \
    sum_m;                                                \
} )
1230
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to
                 even signed byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
                 'in1' is processed the same way into 'out1'
                 Every input is expanded twice, so inputs must be free of
                 side effects
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) (in0), (v16i8) (in0));  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) (in1), (v16i8) (in1));  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1253
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is stored in 'out0'
                 The remaining inputs are processed the same way into the
                 output with the matching index
                 Every input is expanded twice, so inputs must be free of
                 side effects
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) (in0), (v16u8) (in0));  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) (in1), (v16u8) (in1));  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)          \
{                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                        \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) (in2), (v16u8) (in2));  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1284
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted
                 from the adjacent unsigned odd byte element from 'in0'
                 (pairwise, odd - even) and the halfword result is stored
                 in 'out0'
                 'in1' is processed the same way into 'out1'
                 Every input is expanded twice, so inputs must be free of
                 side effects
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) (in0), (v16u8) (in0));  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) (in1), (v16u8) (in1));  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1308
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. From the 16
                 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
                 The same is done for 'in1' with 'ref1' and both sets of 8
                 halfword results are accumulated into the returned vector.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                      \
( {                                                           \
    v16u8 diff0_m, diff1_m;                                   \
    v8u16 sad_m = { 0 };                                      \
                                                              \
    diff0_m = __msa_asub_u_b((v16u8) (in0), (v16u8) (ref0));  \
    diff1_m = __msa_asub_u_b((v16u8) (in1), (v16u8) (ref1));  \
                                                              \
    sad_m += __msa_hadd_u_h(diff0_m, diff0_m);                \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m);                \
                                                              \
    sad_m;                                                    \
} )
1331
/* Description : Insert word values into consecutive word elements of 1
                 destination vector
   Arguments   : Inputs  - in0, in1           (word values, INSERT_W2)
                           in0, in1, in2, in3 (word values, INSERT_W4)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to word element 0 of 'out', 'in1' to
                 element 1 and, for INSERT_W4, 'in2' and 'in3' to elements
                 2 and 3. 'out' is read and rewritten for every insert, so
                 elements that are not inserted keep their previous value.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                   \
{                                                         \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)         \
{                                                         \
    INSERT_W2(RTYPE, in0, in1, out);                      \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1356
/* Description : Insert double word values into the 2 double word elements
                 of 1 destination vector
   Arguments   : Inputs  - in0, in1      (double word values)
                 Outputs - out           (output vector)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to double word element 0 of 'out' and
                 'in1' to double word element 1.
*/
#define INSERT_D2(RTYPE, in0, in1, out)                   \
{                                                         \
    out = (RTYPE) __msa_insert_d((v2i64) (out), 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) (out), 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1371
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note that the second vector of each pair is passed as the
                 first operand of the intrinsic
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) (in1), (v16i8) (in0));  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) (in3), (v16i8) (in2));  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1390
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note that the second vector of each pair is passed as the
                 first operand of the intrinsic
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) (in1), (v8i16) (in0));  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) (in3), (v8i16) (in2));  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1408
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note that the second vector of each pair is passed as the
                 first operand of the intrinsic
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) (in1), (v4i32) (in0));  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) (in3), (v4i32) (in2));  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1427
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note that the second vector of each pair is passed as the
                 first operand of the intrinsic
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) (in1), (v2i64) (in0));  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) (in3), (v2i64) (in2));  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1445
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVL_B4: in0 ... in7)
                 Outputs - out0, out1         (ILVL_B4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1475
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVL_H4: in0 ... in7)
                 Outputs - out0, out1         (ILVL_H4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1501
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1519
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 The 2, 3, 4 and 8 variants process that many input pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5));          \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1572
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 The 2, 3 and 4 variants process that many input pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));          \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1606
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_W4: in0 ... in7)
                 Outputs - out0, out1         (ILVR_W4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Right half of word elements of in0 and right half of word
                 elements of in1 are interleaved and copied to out0.
                 Right half of word elements of in2 and right half of word
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1624
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 The 2, 3 and 4 variants process that many input pairs
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));          \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));          \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));  \
    out3 = (RTYPE) __msa_ilvr_d((v2i64) (in6), (v2i64) (in7));  \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1658
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 'out0' is written before the inputs are read again for
                 'out1', so 'out0' must not be the same variable as an input
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1678
/* Description : Interleave both left and right half of input vectors
                 (halfword elements)
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of halfword elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 'out0' is written before the inputs are read again for
                 'out1', so 'out0' must not be the same variable as an input
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1687
/* Description : Interleave both left and right half of input vectors
                 (word elements)
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of word elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 'out0' is written before the inputs are read again for
                 'out1', so 'out0' must not be the same variable as an input
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1696
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
                 Similar for the other vectors
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                   \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)         \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
    in2 = (RTYPE) __msa_maxi_s_h((v8i16) (in2), (max_val));  \
    in3 = (RTYPE) __msa_maxi_s_h((v8i16) (in3), (max_val));  \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1719
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1744
/* Description : Saturate the halfword element values to the signed
                 range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value range representable in (sat_val+1) bits
                 Results are in placed to original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1775
/* Description : Saturate the signed word element values to the signed
                 range representable in (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of 'in0' is passed through
                 __msa_sat_s_w (signed saturate) with immediate 'sat_val'
                 and the result is written back to 'in0'
                 Same for 'in1'
                 SAT_SW4 applies the same operation to 4 vectors
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) (in1), (sat_val));  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1792
/* Four-vector form of SAT_SW2: 'in0' .. 'in3' are signed-saturated in place,
   in argument order, with the same immediate 'sat_val' */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)         \
{                                                           \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) (in1), (sat_val));  \
    in2 = (RTYPE) __msa_sat_s_w((v4i32) (in2), (sat_val));  \
    in3 = (RTYPE) __msa_sat_s_w((v4i32) (in3), (sat_val));  \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1799
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for halfword operation is 0-7
                 'in' is parenthesized before the cast so that an expression
                 argument is cast as a whole
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)      \
{                                                         \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), (idx0));  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), (idx1));  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1816
/* Three-output form of SPLATI_H2: halfword elements 'idx0', 'idx1' and
   'idx2' of 'in' are each replicated into 'out0', 'out1' and 'out2'.
   'in' is parenthesized before the cast so that an expression argument is
   cast as a whole */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,            \
                  out0, out1, out2)                       \
{                                                         \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);         \
    out2 = (RTYPE) __msa_splati_h((v8i16) (in), (idx2));  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1825
/* Four-output form of SPLATI_H2: (idx0, idx1) fill (out0, out1) first,
   then (idx2, idx3) fill (out2, out3), all taken from the same 'in' */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
                  out0, out1, out2, out3)            \
{                                                    \
    SPLATI_H2(RTYPE, in,                             \
              idx0, idx1, out0, out1);               \
    SPLATI_H2(RTYPE, in,                             \
              idx2, idx3, out2, out3);               \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1834
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3, so 'stidx'
                 itself must be in 0-2
                 'in' and 'stidx' are parenthesized so that expression
                 arguments are cast / incremented as a whole
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                \
{                                                              \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));      \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx) + 1);  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1853
/* Replicates each of the four word elements of 'in' into its own vector:
   elements 0 and 1 go to 'out0' and 'out1', elements 2 and 3 to 'out2'
   and 'out3' */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
{                                                    \
    SPLATI_W2(RTYPE, in, 0,                          \
              out0, out1);                           \
    SPLATI_W2(RTYPE, in, 2,                          \
              out2, out3);                           \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1861
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the
                 right half of 'out0'
                 'out1' is built the same way from 'in2' and 'in3'
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1882
/* Three-pair form of PCKEV_B2: (in0, in1) -> out0, (in2, in3) -> out1,
   (in4, in5) -> out2.
   'in4' / 'in5' are parenthesized before the cast so that expression
   arguments are cast as a whole */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));          \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1890
/* Four-pair form of PCKEV_B2: pairs (in0, in1) and (in2, in3) produce
   out0 / out1, then pairs (in4, in5) and (in6, in7) produce out2 / out3 */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_B2(RTYPE, in0, in1, in2, in3,                         \
             out0, out1);                                       \
    PCKEV_B2(RTYPE, in4, in5, in6, in7,                         \
             out2, out3);                                       \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1901
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' & even halfword elements of 'in1' are copied to
                 the right half of 'out0'
                 'out1' is built the same way from 'in2' and 'in3'
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1920
/* Four-pair form of PCKEV_H2: pairs (in0, in1) and (in2, in3) produce
   out0 / out1, then pairs (in4, in5) and (in6, in7) produce out2 / out3 */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_H2(RTYPE, in0, in1, in2, in3,                         \
             out0, out1);                                       \
    PCKEV_H2(RTYPE, in4, in5, in6, in7,                         \
             out2, out3);                                       \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1929
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word element of 'in0' is copied to the left
                 half of 'out0' & even double word element of 'in1' is
                 copied to the right half of 'out0'
                 'out1' is built the same way from 'in2' and 'in3'
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1949
/* Four-pair form of PCKEV_D2: pairs (in0, in1) and (in2, in3) produce
   out0 / out1, then pairs (in4, in5) and (in6, in7) produce out2 / out3 */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_D2(RTYPE, in0, in1, in2, in3,                         \
             out0, out1);                                       \
    PCKEV_D2(RTYPE, in4, in5, in6, in7,                         \
             out2, out3);                                       \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1957
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is __msa_pckod_d('in0', 'in1') and 'out1' is
                 __msa_pckod_d('in2', 'in3')
                 Per the MSA PCKOD.D definition the odd (index 1) double
                 word of the first input lands in the left half and the odd
                 double word of the second input in the right half
                 When both inputs of a pair are the same vector, that
                 vector's index 1 double word therefore also appears at
                 index 0 of the result
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1975
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each byte element of 'in0' is XORed with 128 through
                 __msa_xori_b and the result is stored back in 'in0'
                 The same is done for 'in1'
                 XOR with 128 toggles only the top bit of each byte
                 The XORI_B<n>_128 variants extend this to n vectors
*/
#define XORI_B2_128(RTYPE, in0, in1)                 \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1996
/* XORs every byte of 'in0', 'in1' and 'in2' with 128, in place */
#define XORI_B3_128(RTYPE, in0, in1, in2)            \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
2003
/* XORs every byte of 'in0' .. 'in3' with 128, in place and in argument
   order */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)       \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    in3 = (RTYPE) __msa_xori_b((v16u8) (in3), 128);  \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
2012
/* XORs every byte of 'in0' .. 'in4' with 128, in place and in argument
   order */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);          \
    in4 = (RTYPE) __msa_xori_b((v16u8) (in4), 128);  \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
2019
/* XORs every byte of 'in0' .. 'in5' with 128, in place and in argument
   order */
#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B3_128(RTYPE, in0, in1, in2);                    \
    XORI_B3_128(RTYPE, in3, in4, in5);                    \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
2026
/* XORs every byte of 'in0' .. 'in6' with 128, in place and in argument
   order */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5);          \
    in6 = (RTYPE) __msa_xori_b((v16u8) (in6), 128);            \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
2033
/* XORs every byte of 'in0' .. 'in7' with 128, in place and in argument
   order */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5);               \
    XORI_B2_128(RTYPE, in6, in7);                                   \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
2040
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type) and written to 'out0'
                 'out1' is computed the same way from 'in2' and 'in3'
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
2056
/* Four-pair form of ADDS_SH2: pairs (in0, in1) and (in2, in3) produce
   out0 / out1, then pairs (in4, in5) and (in6, in7) produce out2 / out3 */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    ADDS_SH2(RTYPE, in0, in1, in2, in3,                         \
             out0, out1);                                       \
    ADDS_SH2(RTYPE, in4, in5, in6, in7,                         \
             out2, out3);                                       \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
2065
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for 'in1', 'in2' and 'in3'
                 Operands are parenthesized so that a 'shift' expression
                 using a lower-precedence operator (e.g. 'a | b') is applied
                 as a whole
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
2081
/* Description : Shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for 'in1', 'in2' and 'in3'
                 The plain '>>' operator is used, so the shift is arithmetic
                 only when the element type is signed
                 NOTE(review): confirm no caller relies on sign extension
                 with unsigned vectors
                 Operands are parenthesized so that a 'shift' expression
                 using a lower-precedence operator (e.g. 'a | b') is applied
                 as a whole
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
2099
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift'
                 and result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for 'in1', 'in2' and 'in3'
                 'shift' is parenthesized before the cast so that an
                 expression argument is cast as a whole
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
2118
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for 'in1'
                 'shift' is parenthesized before the cast so that an
                 expression argument is cast as a whole
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2137
/* Three-vector form of SRAR_H2: 'in0', 'in1' and 'in2' are shifted right
   arithmetic with rounding, in place, by the per-element counts in vector
   'shift'.
   'shift' is parenthesized before the cast so that an expression argument
   is cast as a whole */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2144
/* Four-vector form of SRAR_H2: (in0, in1) are processed first, then
   (in2, in3), all with the same 'shift' vector */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
{                                                 \
    SRAR_H2(RTYPE, in0, in1, shift);              \
    SRAR_H2(RTYPE, in2, in3, shift);              \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2152
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for 'in1'
                 'shift' is parenthesized before the cast so that an
                 expression argument is cast as a whole
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2170
/* Four-vector form of SRAR_W2: (in0, in1) are processed first, then
   (in2, in3), all with the same 'shift' vector */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
{                                                 \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2177
/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of 'in0' is passed through
                 __msa_srari_h with the immediate 'shift': it is shifted
                 right arithmetic, the last discarded bit is added for
                 rounding, and the result is written back to 'in0'
                 Same for 'in1'
                 SRARI_H4 applies the same operation to 4 vectors
*/
#define SRARI_H2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2195
/* Four-vector form of SRARI_H2: 'in0' .. 'in3' are shifted right arithmetic
   with rounding by the immediate 'shift', in place and in argument order */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)        \
{                                                         \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift));  \
    in2 = (RTYPE) __msa_srari_h((v8i16) (in2), (shift));  \
    in3 = (RTYPE) __msa_srari_h((v8i16) (in3), (shift));  \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2203
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of 'in0' is passed through
                 __msa_srari_w with the immediate 'shift': it is shifted
                 right arithmetic, the last discarded bit is added for
                 rounding, and the result is written back to 'in0'
                 Same for 'in1'
                 SRARI_W4 applies the same operation to 4 vectors
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2220
/* Four-vector form of SRARI_W2: 'in0' .. 'in3' are shifted right arithmetic
   with rounding by the immediate 'shift', in place and in argument order */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)        \
{                                                         \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift));  \
    in2 = (RTYPE) __msa_srari_w((v4i32) (in2), (shift));  \
    in3 = (RTYPE) __msa_srari_w((v4i32) (in3), (shift));  \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2228
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (MUL4: in0 .. in7)
                 Outputs - out0, out1         (MUL4: out0 .. out3)
   Details     : Each element from 'in0' is multiplied with elements from
                 'in1' and result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized so that expression arguments
                 (e.g. 'a + b') are multiplied as a whole
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
2246
/* Description : Addition of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ADD4: in0 .. in7)
                 Outputs - out0, out1         (ADD4: out0 .. out3)
   Details     : Each element of 'in0' is added to the corresponding element
                 of 'in1' and the result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized so that expression arguments
                 using lower-precedence operators (e.g. 'a & b') are added
                 as a whole
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2263
/* Description : Subtraction of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (SUB4: in0 .. in7)
                 Outputs - out0, out1         (SUB4: out0 .. out3)
   Details     : Each element of 'in1' is subtracted from the corresponding
                 element of 'in0' and the result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized so that expression arguments
                 (e.g. a subtrahend 'a - b') are applied as a whole
                 SUB4 is built from SUB2, matching ADD4 and MUL4
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) - (in1);                     \
    out1 = (in2) - (in3);                     \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2282
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A per-element sign mask of 'in' is computed with
                 __msa_clti_s_h(in, 0) and interleaved (right half) with
                 'in' itself to generate 4 word elements keeping sign intact
                 'in' is parenthesized before the cast so that an expression
                 argument is cast as a whole
*/
#define UNPCK_R_SH_SW(in, out)                         \
{                                                      \
    v8i16 sign_m;                                      \
                                                       \
    sign_m = __msa_clti_s_h((v8i16) (in), 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in));  \
}
2298
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : A per-element sign mask of 'in' is computed with
                 __msa_clti_s_b(in, 0) and interleaved right with 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with 'in' to generate 8 signed
                 halfword elements in 'out1'
                 'in' is parenthesized (also when forwarded to ILVRL_B2_SH)
                 so that an expression argument is cast as a whole
*/
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0);         \
    ILVRL_B2_SH(tmp_m, (in), out0, out1);            \
}
2317
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors)
                 Return Type - signed halfword (the _SH interleave variant
                 is used)
   Details     : A zero-initialized byte vector is interleaved with 'in'
                 through ILVRL_B2_SH (defined elsewhere in this header)
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}
2331
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : A per-element sign mask of 'in' is computed with
                 __msa_clti_s_h(in, 0) and interleaved right with 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with 'in' to generate 4 signed word
                 elements in 'out1'
                 'in' is parenthesized (also when forwarded to ILVRL_H2_SW)
                 so that an expression argument is cast as a whole
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
2350
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The values of 'in0' and 'in1' are exchanged through a
                 temporary of the same type as 'in0'
                 Unlike an xor swap this is safe when both arguments name
                 the same object (an xor swap would zero it)
                 Both arguments must be modifiable lvalues and are
                 evaluated twice
*/
#define SWAP(in0, in1)               \
{                                    \
    __typeof__(in0) swap_tmp_m;      \
                                     \
    swap_tmp_m = (in0);              \
    (in0) = (in1);                   \
    (in1) = swap_tmp_m;              \
}
2362
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 Outputs are written in that order, so an output that
                 aliases an input still needed by a later line changes the
                 result
                 Operands are parenthesized so that expression arguments
                 are added / subtracted as a whole
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
2376
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : out<k>     = in<k> + in<7-k>  for k = 0..3
                 out<7-k>   = in<k> - in<7-k>  for k = 0..3
                 Sums are written first (out0..out3), then differences
                 (out4..out7)
                 Operands are parenthesized so that expression arguments
                 are added / subtracted as a whole
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = (in0) + (in7);                                            \
    out1 = (in1) + (in6);                                            \
    out2 = (in2) + (in5);                                            \
    out3 = (in3) + (in4);                                            \
                                                                     \
    out4 = (in3) - (in4);                                            \
    out5 = (in2) - (in5);                                            \
    out6 = (in1) - (in6);                                            \
    out7 = (in0) - (in7);                                            \
}
2395
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : out<k>      = in<k> + in<15-k>  for k = 0..7
                 out<15-k>   = in<k> - in<15-k>  for k = 0..7
                 Sums are written first (out0..out7), then differences
                 (out8..out15)
                 Operands are parenthesized so that expression arguments
                 are added / subtracted as a whole
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = (in0) + (in15);                                                  \
    out1 = (in1) + (in14);                                                  \
    out2 = (in2) + (in13);                                                  \
    out3 = (in3) + (in12);                                                  \
    out4 = (in4) + (in11);                                                  \
    out5 = (in5) + (in10);                                                  \
    out6 = (in6) + (in9);                                                   \
    out7 = (in7) + (in8);                                                   \
                                                                            \
    out8 = (in7) - (in8);                                                   \
    out9 = (in6) - (in9);                                                   \
    out10 = (in5) - (in10);                                                 \
    out11 = (in4) - (in11);                                                 \
    out12 = (in3) - (in12);                                                 \
    out13 = (in2) - (in13);                                                 \
    out14 = (in1) - (in14);                                                 \
    out15 = (in0) - (in15);                                                 \
}
2424
/* Description : Transpose a 4x4 block of bytes
   Arguments   : Inputs  - in0, in1, in2, in3      (rows of the 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (rows of the transposed block)
                 Return Type - unsigned byte
   Details     : The inputs are merged by ILVR_D2_SB, ILVRL_B2_SB and
                 __msa_ilvr_b into 'out0'.
                 'out1', 'out2' and 'out3' are each obtained from the previous
                 output with __msa_sldi_b, using an all-zero vector as the
                 other operand and a slide amount of 4 bytes.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 s0_m, s1_m;                                                   \
    v16i8 s2_m, s3_m;                                                   \
    v16i8 zero_m = { 0 };                                               \
                                                                        \
    /* gather the four rows and interleave them byte-wise */            \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
                                                                        \
    /* remaining outputs: slide the previous output by 4 bytes */       \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
2444
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block, one row per vector)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are paired (in0/in4, in1/in5, in2/in6, in3/in7) with
                 ILVEV_W2_SB and then interleaved at byte, halfword and word
                 granularity.  ILVRL_W2 produces 'out0' and 'out2';
                 'out1' and 'out3' are built from those two with
                 __msa_ilvl_d.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m;                                               \
    v16i8 tmp2_m, tmp3_m;                                               \
                                                                        \
    /* rows 0, 1, 4, 5 */                                               \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
                                                                        \
    /* rows 2, 3, 6, 7 */                                               \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
                                                                        \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
                                                                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2469
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are interleaved byte-wise (ILVR_B4_SB, ILVRL_B2_SB) and
                 then word-wise (ILVRL_W2), which yields the even numbered
                 outputs.  The odd numbered outputs are derived from the even
                 ones with SLDI_B2_0 and a slide amount of 8 bytes.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m;                                                \
    v16i8 tmp2_m, tmp3_m;                                                \
    v16i8 tmp4_m, tmp5_m;                                                \
    v16i8 tmp6_m, tmp7_m;                                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
                                                                         \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
                                                                         \
    /* even numbered outputs */                                          \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
                                                                         \
    /* odd numbered outputs, derived from the even ones */               \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2495
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Inputs are grouped by row index modulo 4.  Each group of four
                 rows is packed into one vector with ILVEV_W2_SD followed by
                 __msa_ilvev_d.  'out1' and 'out3' hold two of these packed
                 vectors temporarily and are overwritten with their final
                 values at the end.  The packed vectors are then split into
                 even/odd bytes and even/odd halfwords to form the outputs.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m;                                                  \
    v2i64 tmp2_m, tmp3_m;                                                  \
                                                                           \
    /* rows 0, 4, 8, 12 -> out1 (scratch use) */                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    /* rows 1, 5, 9, 13 -> out3 (scratch use) */                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    /* rows 2, 6, 10, 14 -> tmp2_m */                                      \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    /* rows 3, 7, 11, 15 -> tmp3_m */                                      \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    /* even bytes give out0 and out2 */                                    \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    /* odd bytes give the final out1 and out3 */                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
2530
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Rows k and k + 8 are first packed into one vector with
                 ILVEV_D2_UB; the output variables are used as scratch
                 storage for these packed rows and are overwritten with the
                 final results later on.
                 The packed rows are then split into even/odd bytes,
                 even/odd halfwords and even/odd words to form the outputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    /* pack row k with row k + 8; out7..out0 serve as scratch here */        \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    /* split every packed pair into its even and odd bytes */                \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    /* even bytes, even halfwords -> out0, out4 */                           \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* even bytes, odd halfwords -> out2, out6 */                            \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* odd bytes, even halfwords -> out1, out5 */                            \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    /* odd bytes, odd halfwords -> out3, out7 */                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2579
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The inputs are interleaved halfword-wise (ILVR_H2_SH) and
                 then word-wise (ILVRL_W2_SH), which yields 'out0' and 'out2'.
                 'out1' is built from 'out0' alone and 'out3' from 'out0' and
                 'out2', both with __msa_ilvl_d.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m;                                                         \
    v8i16 s1_m;                                                         \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
                                                                        \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2595
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Inputs in4..in7 are interleaved halfword-wise into
                 tmp0_m..tmp3_m and inputs in0..in3 into tmp4_m..tmp7_m.
                 The even numbered outputs are produced by PCKEV_D4 and the
                 odd numbered outputs by __msa_pckod_d, each from one vector
                 of the first group and the matching vector of the second.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m;                                               \
    v8i16 tmp2_m, tmp3_m;                                               \
    v8i16 tmp4_m, tmp5_m;                                               \
    v8i16 tmp6_m, tmp7_m;                                               \
                                                                        \
    /* inputs in4..in7 */                                               \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
                                                                        \
    /* inputs in0..in3 */                                               \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
                                                                        \
    /* even numbered outputs */                                         \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
                                                                        \
    /* odd numbered outputs */                                          \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2626
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : 'in0'/'in1' and 'in2'/'in3' are interleaved word-wise with
                 ILVRL_W2_SW.  The outputs are then assembled from the
                 resulting vectors with __msa_ilvr_d ('out0', 'out2') and
                 __msa_ilvl_d ('out1', 'out3').
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m;                                                   \
    v4i32 s2_m, s3_m;                                                   \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2645
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : The input pairs (in0, in1), (in2, in3), (in4, in5) and
                 (in6, in7) are averaged element-wise as unsigned bytes with
                 __msa_ave_u_b, i.e. (a + b)/2 without rounding, and the
                 results are kept in 'tmp0_m' .. 'tmp3_m'.
                 Doubleword element 0 (the first 8 bytes) of each result is
                 extracted and the four values are passed to SD4 together
                 with 'pdst' and 'stride', giving an 8x4 byte block.
                 Vector inputs are parenthesised before being cast, so
                 expression arguments are cast as a whole.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                   \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                   \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                   \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                   \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
2678
/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : The input pairs (in0, in1), (in2, in3), (in4, in5) and
                 (in6, in7) are averaged element-wise as unsigned bytes with
                 __msa_ave_u_b, i.e. (a + b)/2 without rounding, and the
                 results are kept in 'tmp0_m' .. 'tmp3_m'.
                 The four full result vectors are passed to ST_UB4 together
                 with 'pdst' and 'stride', giving a 16x4 byte block.
                 Vector inputs are parenthesised before being cast, so
                 expression arguments are cast as a whole.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                    \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                    \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                    \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                    \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
2706
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : AVER_UB4_UB combines the input pairs (in0, in1), (in2, in3),
                 (in4, in5) and (in6, in7) into 'tp0_m' .. 'tp3_m'; each
                 result is the rounded average (a + b + 1)/2 of its pair.
                 Doubleword element 0 (the first 8 bytes) of each result is
                 extracted and the four values are passed to SD4 together
                 with 'pdst' and 'stride', giving an 8x4 byte block.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
    uint64_t out0_m, out1_m;                                                 \
    uint64_t out2_m, out3_m;                                                 \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    /* keep only the first 8 bytes of every averaged row */                  \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
                                                                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
2737
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : AVER_UB4_UB combines the input pairs (in0, in1), (in2, in3),
                 (in4, in5) and (in6, in7) into 't0_m' .. 't3_m'; each
                 result is the rounded average (a + b + 1)/2 of its pair.
                 The four full result vectors are passed to ST_UB4 together
                 with 'pdst' and 'stride', giving a 16x4 byte block.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m;                                                         \
    v16u8 t2_m, t3_m;                                                         \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
                                                                              \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
2762
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : Four rows are loaded from 'pdst' with LD_UB4 into
                 'dst0_m' .. 'dst3_m'.
                 AVER_UB4_UB combines the input pairs (in0, in1), (in2, in3),
                 (in4, in5) and (in6, in7) into 'tmp0_m' .. 'tmp3_m'; each
                 result is the rounded average (a + b + 1)/2 of its pair.
                 AVER_ST8x4_UB then averages every loaded row with the
                 matching result and stores the 8x4 byte block back to 'pdst'.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
                                                                   \
    /* current destination rows */                                 \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
                                                                   \
    /* rounded average of the input pairs */                       \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
                                                                   \
    /* average with the destination rows and store */              \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2792
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                 Outputs -
                 Return Type -
   Details     : Four rows are loaded from 'pdst' with LD_UB4 into
                 'dst0_m' .. 'dst3_m'.
                 AVER_UB4_UB combines the input pairs (in0, in1), (in2, in3),
                 (in4, in5) and (in6, in7) into 'tmp0_m' .. 'tmp3_m'; each
                 result is the rounded average (a + b + 1)/2 of its pair.
                 AVER_ST16x4_UB then averages every loaded row with the
                 matching result and stores the 16x4 byte block back to
                 'pdst'.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
                                                                    \
    /* current destination rows */                                  \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
                                                                    \
    /* rounded average of the input pairs */                        \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
                                                                    \
    /* average with the destination rows and store */               \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2822
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                 Outputs -
                 Return Type - unsigned bytes
   Details     : Four 32-bit words are loaded from the destination with LW4
                 and inserted pairwise into 'dst0_m' and 'dst1_m'.  The
                 destination bytes are interleaved with zero bytes
                 (ILVR_B2_SH) and added to the inputs, which ILVR_D2_SH has
                 paired into 'inp0_m' (in0, in1) and 'inp1_m' (in2, in3).
                 The sums are clipped with CLIP_SH2_0_255, packed back to
                 bytes with PCKEV_B2_SB, and word elements 0 and 1 of each
                 packed vector are written to the destination with SW4.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
2854
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'            (__msa_dotp_s_h)
                 Dot product of 'in1' with 'coeff1', accumulated onto the
                 first result                                  (__msa_dpadd_s_h)
                 Dot product of 'in2' with 'coeff2'            (__msa_dotp_s_h)
                 The third product is added to the running sum with
                 __msa_adds_s_h (signed saturating add).

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)

                 All arguments are parenthesised before being cast to v16i8,
                 so expression arguments are cast as a whole.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
( {                                                                     \
    v8i16 tmp1_m;                                                       \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) (in0), (v16i8) (coeff0));           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in1), (v16i8) (coeff1));  \
    tmp1_m = __msa_dotp_s_h((v16i8) (in2), (v16i8) (coeff2));           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                            \
                                                                        \
    out0_m;                                                             \
} )
2878
/* Description : Pack even byte elements of two input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m
                 Return Type - unsigned byte (v16u8)
   Details     : 'in0' and 'in1' are cast to signed byte vectors and handed
                 to __msa_pckev_b ('in1' as first operand, 'in0' as second),
                 which packs their even byte elements into one vector.
                 Every byte of the packed vector is then xor'ed with 128 to
                 shift the range from signed to unsigned byte.

                 Each macro argument is parenthesized before it is cast, so
                 an expression argument is cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = __msa_xori_b(out_m, 128);                             \
    out_m;                                                        \
} )
2894
/* Description : Convert two input pairs to unsigned bytes, average them with
                 the destination vectors & store the result as an 8x4
                 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                           pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are each reduced to a single
                 unsigned byte vector by PCKEV_XORI128_UB.
                 'dst0'/'dst1' and 'dst2'/'dst3' are each combined into a
                 single vector by ILVR_D2_UB.
                 The converted vectors are averaged with the combined
                 destination vectors through AVER_UB2_UB and the two results
                 are written with ST8x4_UB at 'pdst' using 'stride'.
                 'pdst' is evaluated once and converted to 'uint8_t *'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                    \
                                dst0, dst1, dst2, dst3, pdst, stride)  \
{                                                                      \
    uint8_t *pdst_m = (uint8_t *) (pdst);                              \
    v16u8 pix0_m, pix1_m;                                              \
    v16u8 ref0_m, ref1_m;                                              \
                                                                       \
    pix0_m = PCKEV_XORI128_UB(in0, in1);                               \
    pix1_m = PCKEV_XORI128_UB(in2, in3);                               \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, ref0_m, ref1_m);                \
    AVER_UB2_UB(pix0_m, ref0_m, pix1_m, ref1_m, pix0_m, pix1_m);       \
    ST8x4_UB(pix0_m, pix1_m, pdst_m, stride);                          \
}
2912
/* Description : Pack even byte elements of two input pairs and store four
                 32-bit words taken from the packed results
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs 'in0'/'in1' into one byte vector and
                 'in2'/'in3' into another.
                 Word elements 0 and 2 of each packed vector are copied out
                 with __msa_copy_u_w (first vector, then second vector).
                 The four words are stored through SW4 at 'pdst' using
                 'stride'.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    v16i8 pck0_m, pck1_m;                                 \
    uint32_t row0_m, row1_m, row2_m, row3_m;              \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, pck0_m, pck1_m);      \
                                                          \
    row0_m = __msa_copy_u_w((v4i32) pck0_m, 0);           \
    row1_m = __msa_copy_u_w((v4i32) pck0_m, 2);           \
    row2_m = __msa_copy_u_w((v4i32) pck1_m, 0);           \
    row3_m = __msa_copy_u_w((v4i32) pck1_m, 2);           \
                                                          \
    SW4(row0_m, row1_m, row2_m, row3_m, pdst, stride);    \
}
2932
/* Description : Pack even byte elements of two input vectors and store the
                 resulting byte vector in destination memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : 'in0' and 'in1' are cast to signed byte vectors and handed
                 to __msa_pckev_b ('in1' as first operand, 'in0' as second).
                 The packed vector is written to 'pdst' with ST_SB, i.e. as
                 one whole v16i8 store.

                 Each macro argument is parenthesized before it is cast or
                 forwarded, so an expression argument is used as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                       \
{                                                         \
    v16i8 tmp_m;                                          \
    tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    ST_SB(tmp_m, (pdst));                                 \
}
2943
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
                 Outputs - tmp1_m
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes of 'in0' and 'in1' are shuffled into one vector as
                 selected by 'mask' (__msa_vshf_b).
                 Unsigned dot product of the shuffled bytes with 'coeff'
                 gives halfword sums (__msa_dotp_u_h).
                 The sums are shifted right by 'shift' with rounding
                 (__msa_srari_h) and then saturated (__msa_sat_u_h).

                 'shift' is passed to two immediate-operand intrinsics, so
                 it presumably has to be a compile-time constant.
                 NOTE(review): 'shift' is also reused as the saturation
                 operand of __msa_sat_u_h, so the clamp range depends on the
                 shift amount - confirm this is intended for every caller.

                 'in0', 'in1', 'mask' and 'coeff' are parenthesized before
                 they are cast, so an expression argument is cast as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                  \
( {                                                                       \
    v16i8 tmp0_m;                                                         \
    v8u16 tmp1_m;                                                         \
                                                                          \
    tmp0_m = __msa_vshf_b((v16i8) (mask), (v16i8) (in1), (v16i8) (in0));  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) (coeff));             \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);                \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                                \
                                                                          \
    tmp1_m;                                                               \
} )
2959 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */