git.sesse.net Git - ffmpeg/blob - libavutil/mips/generic_macros_msa.h

   1 /*
   2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  23
  24 #include <stdint.h>
  25 #include <msa.h>
  26
  27 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
  28 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  29 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
  30
  31 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
  32 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
  33 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
  34
  35 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
  36 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
  37 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
  38
  39 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  40 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  41 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
  42
  43 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  44 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
  45 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
  46
  47 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  48 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
  49 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  50
  51 #if (__mips_isa_rev >= 6)
  52     #define LW(psrc)                           \
  53     ( {                                        \
  54         uint8_t *psrc_m = (uint8_t *) (psrc);  \
  55         uint32_t val_m;                        \
  56                                                \
  57         __asm__ volatile (                     \
  58             "lw  %[val_m],  %[psrc_m]  \n\t"   \
  59                                                \
  60             : [val_m] "=r" (val_m)             \
  61             : [psrc_m] "m" (*psrc_m)           \
  62         );                                     \
  63                                                \
  64         val_m;                                 \
  65     } )
  66
  67     #if (__mips == 64)
  68         #define LD(psrc)                           \
  69         ( {                                        \
  70             uint8_t *psrc_m = (uint8_t *) (psrc);  \
  71             uint64_t val_m = 0;                    \
  72                                                    \
  73             __asm__ volatile (                     \
  74                 "ld  %[val_m],  %[psrc_m]  \n\t"   \
  75                                                    \
  76                 : [val_m] "=r" (val_m)             \
  77                 : [psrc_m] "m" (*psrc_m)           \
  78             );                                     \
  79                                                    \
  80             val_m;                                 \
  81         } )
  82     #else  // !(__mips == 64)
  83         #define LD(psrc)                                              \
  84         ( {                                                           \
  85             uint8_t *psrc_m = (uint8_t *) (psrc);                     \
  86             uint32_t val0_m, val1_m;                                  \
  87             uint64_t val_m = 0;                                       \
  88                                                                       \
  89             val0_m = LW(psrc_m);                                      \
  90             val1_m = LW(psrc_m + 4);                                  \
  91                                                                       \
  92             val_m = (uint64_t) (val1_m);                              \
  93             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
  94             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
  95                                                                       \
  96             val_m;                                                    \
  97         } )
  98     #endif  // (__mips == 64)
  99
 100     #define SH(val, pdst)                      \
 101     {                                          \
 102         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 103         uint16_t val_m = (val);                \
 104                                                \
 105         __asm__ volatile (                     \
 106             "sh  %[val_m],  %[pdst_m]  \n\t"   \
 107                                                \
 108             : [pdst_m] "=m" (*pdst_m)          \
 109             : [val_m] "r" (val_m)              \
 110         );                                     \
 111     }
 112
 113     #define SW(val, pdst)                      \
 114     {                                          \
 115         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 116         uint32_t val_m = (val);                \
 117                                                \
 118         __asm__ volatile (                     \
 119             "sw  %[val_m],  %[pdst_m]  \n\t"   \
 120                                                \
 121             : [pdst_m] "=m" (*pdst_m)          \
 122             : [val_m] "r" (val_m)              \
 123         );                                     \
 124     }
 125
 126     #define SD(val, pdst)                      \
 127     {                                          \
 128         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 129         uint64_t val_m = (val);                \
 130                                                \
 131         __asm__ volatile (                     \
 132             "sd  %[val_m],  %[pdst_m]  \n\t"   \
 133                                                \
 134             : [pdst_m] "=m" (*pdst_m)          \
 135             : [val_m] "r" (val_m)              \
 136         );                                     \
 137     }
 138 #else  // !(__mips_isa_rev >= 6)
 139     #define LW(psrc)                           \
 140     ( {                                        \
 141         uint8_t *psrc_m = (uint8_t *) (psrc);  \
 142         uint32_t val_m;                        \
 143                                                \
 144         __asm__ volatile (                     \
 145             "ulw  %[val_m],  %[psrc_m]  \n\t"  \
 146                                                \
 147             : [val_m] "=r" (val_m)             \
 148             : [psrc_m] "m" (*psrc_m)           \
 149         );                                     \
 150                                                \
 151         val_m;                                 \
 152     } )
 153
 154     #if (__mips == 64)
 155         #define LD(psrc)                           \
 156         ( {                                        \
 157             uint8_t *psrc_m = (uint8_t *) (psrc);  \
 158             uint64_t val_m = 0;                    \
 159                                                    \
 160             __asm__ volatile (                     \
 161                 "uld  %[val_m],  %[psrc_m]  \n\t"  \
 162                                                    \
 163                 : [val_m] "=r" (val_m)             \
 164                 : [psrc_m] "m" (*psrc_m)           \
 165             );                                     \
 166                                                    \
 167             val_m;                                 \
 168         } )
 169     #else  // !(__mips == 64)
 170         #define LD(psrc)                                              \
 171         ( {                                                           \
 172             uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
 173             uint32_t val0_m, val1_m;                                  \
 174             uint64_t val_m = 0;                                       \
 175                                                                       \
 176             val0_m = LW(psrc_m1);                                     \
 177             val1_m = LW(psrc_m1 + 4);                                 \
 178                                                                       \
 179             val_m = (uint64_t) (val1_m);                              \
 180             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
 181             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
 182                                                                       \
 183             val_m;                                                    \
 184         } )
 185     #endif  // (__mips == 64)
 186
 187     #define SH(val, pdst)                      \
 188     {                                          \
 189         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 190         uint16_t val_m = (val);                \
 191                                                \
 192         __asm__ volatile (                     \
 193             "ush  %[val_m],  %[pdst_m]  \n\t"  \
 194                                                \
 195             : [pdst_m] "=m" (*pdst_m)          \
 196             : [val_m] "r" (val_m)              \
 197         );                                     \
 198     }
 199
 200     #define SW(val, pdst)                      \
 201     {                                          \
 202         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 203         uint32_t val_m = (val);                \
 204                                                \
 205         __asm__ volatile (                     \
 206             "usw  %[val_m],  %[pdst_m]  \n\t"  \
 207                                                \
 208             : [pdst_m] "=m" (*pdst_m)          \
 209             : [val_m] "r" (val_m)              \
 210         );                                     \
 211     }
 212
 213     #define SD(val, pdst)                                          \
 214     {                                                              \
 215         uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
 216         uint32_t val0_m, val1_m;                                   \
 217                                                                    \
 218         val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
 219         val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
 220                                                                    \
 221         SW(val0_m, pdst_m1);                                       \
 222         SW(val1_m, pdst_m1 + 4);                                   \
 223     }
 224 #endif // (__mips_isa_rev >= 6)
 225
 226 /* Description : Load 4 words with stride
 227    Arguments   : Inputs  - psrc    (source pointer to load from)
 228                          - stride
 229                  Outputs - out0, out1, out2, out3
 230    Details     : Loads word in 'out0' from (psrc)
 231                  Loads word in 'out1' from (psrc + stride)
 232                  Loads word in 'out2' from (psrc + 2 * stride)
 233                  Loads word in 'out3' from (psrc + 3 * stride)
 234 */
 235 #define LW4(psrc, stride, out0, out1, out2, out3)  \
 236 {                                                  \
 237     out0 = LW((psrc));                             \
 238     out1 = LW((psrc) + stride);                    \
 239     out2 = LW((psrc) + 2 * stride);                \
 240     out3 = LW((psrc) + 3 * stride);                \
 241 }
 242
 243 /* Description : Store 4 words with stride
 244    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 245    Details     : Stores word from 'in0' to (pdst)
 246                  Stores word from 'in1' to (pdst + stride)
 247                  Stores word from 'in2' to (pdst + 2 * stride)
 248                  Stores word from 'in3' to (pdst + 3 * stride)
 249 */
 250 #define SW4(in0, in1, in2, in3, pdst, stride)  \
 251 {                                              \
 252     SW(in0, (pdst))                            \
 253     SW(in1, (pdst) + stride);                  \
 254     SW(in2, (pdst) + 2 * stride);              \
 255     SW(in3, (pdst) + 3 * stride);              \
 256 }
 257
 258 /* Description : Store 4 double words with stride
 259    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 260    Details     : Stores double word from 'in0' to (pdst)
 261                  Stores double word from 'in1' to (pdst + stride)
 262                  Stores double word from 'in2' to (pdst + 2 * stride)
 263                  Stores double word from 'in3' to (pdst + 3 * stride)
 264 */
 265 #define SD4(in0, in1, in2, in3, pdst, stride)  \
 266 {                                              \
 267     SD(in0, (pdst))                            \
 268     SD(in1, (pdst) + stride);                  \
 269     SD(in2, (pdst) + 2 * stride);              \
 270     SD(in3, (pdst) + 3 * stride);              \
 271 }
 272
 273 /* Description : Load vectors with 16 byte elements with stride
 274    Arguments   : Inputs  - psrc    (source pointer to load from)
 275                          - stride
 276                  Outputs - out0, out1
 277                  Return Type - as per RTYPE
 278    Details     : Loads 16 byte elements in 'out0' from (psrc)
 279                  Loads 16 byte elements in 'out1' from (psrc + stride)
 280 */
 281 #define LD_B2(RTYPE, psrc, stride, out0, out1)  \
 282 {                                               \
 283     out0 = LD_B(RTYPE, (psrc));                 \
 284     out1 = LD_B(RTYPE, (psrc) + stride);        \
 285 }
 286 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 287 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 288
 289 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
 290 {                                                     \
 291     LD_B2(RTYPE, (psrc), stride, out0, out1);         \
 292     out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 293 }
 294 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
 295
 296 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 297 {                                                            \
 298     LD_B2(RTYPE, (psrc), stride, out0, out1);                \
 299     LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 300 }
 301 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 302 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 303
 304 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 305 {                                                                 \
 306     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
 307     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
 308 }
 309 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
 310 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
 311
 312 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 313 {                                                                       \
 314     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 315     LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 316 }
 317 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
 318
 319 #define LD_B7(RTYPE, psrc, stride,                               \
 320               out0, out1, out2, out3, out4, out5, out6)          \
 321 {                                                                \
 322     LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
 323     LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
 324 }
 325 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
 326
 327 #define LD_B8(RTYPE, psrc, stride,                                      \
 328               out0, out1, out2, out3, out4, out5, out6, out7)           \
 329 {                                                                       \
 330     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 331     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 332 }
 333 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
 334 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 335
 336 /* Description : Load vectors with 8 halfword elements with stride
 337    Arguments   : Inputs  - psrc    (source pointer to load from)
 338                          - stride
 339                  Outputs - out0, out1
 340    Details     : Loads 8 halfword elements in 'out0' from (psrc)
 341                  Loads 8 halfword elements in 'out1' from (psrc + stride)
 342 */
 343 #define LD_H2(RTYPE, psrc, stride, out0, out1)  \
 344 {                                               \
 345     out0 = LD_H(RTYPE, (psrc));                 \
 346     out1 = LD_H(RTYPE, (psrc) + (stride));      \
 347 }
 348 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 349 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 350
 351 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
 352 {                                                           \
 353     LD_H2(RTYPE, (psrc), stride, out0, out1);               \
 354     LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
 355 }
 356 #define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
 357 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 358
 359 #define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 360 {                                                                       \
 361     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 362     LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 363 }
 364 #define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
 365 #define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
 366
 367 #define LD_H8(RTYPE, psrc, stride,                                      \
 368               out0, out1, out2, out3, out4, out5, out6, out7)           \
 369 {                                                                       \
 370     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 371     LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 372 }
 373 #define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
 374 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
 375
 376 /* Description : Store vectors of 16 byte elements with stride
 377    Arguments   : Inputs  - in0, in1, stride
 378                  Outputs - pdst    (destination pointer to store to)
 379    Details     : Stores 16 byte elements from 'in0' to (pdst)
 380                  Stores 16 byte elements from 'in1' to (pdst + stride)
 381 */
 382 #define ST_B2(RTYPE, in0, in1, pdst, stride)  \
 383 {                                             \
 384     ST_B(RTYPE, in0, (pdst));                 \
 385     ST_B(RTYPE, in1, (pdst) + stride);        \
 386 }
 387 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 388 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
 389
 390 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 391 {                                                         \
 392     ST_B2(RTYPE, in0, in1, (pdst), stride);               \
 393     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 394 }
 395 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 396 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
 397
 398 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
 399               pdst, stride)                                         \
 400 {                                                                   \
 401     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
 402     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
 403 }
 404 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 405
 406 /* Description : Store vectors of 8 halfword elements with stride
 407    Arguments   : Inputs  - in0, in1, stride
 408                  Outputs - pdst    (destination pointer to store to)
 409    Details     : Stores 8 halfword elements from 'in0' to (pdst)
 410                  Stores 8 halfword elements from 'in1' to (pdst + stride)
 411 */
 412 #define ST_H2(RTYPE, in0, in1, pdst, stride)  \
 413 {                                             \
 414     ST_H(RTYPE, in0, (pdst));                 \
 415     ST_H(RTYPE, in1, (pdst) + stride);        \
 416 }
 417 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
 418 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 419
 420 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 421 {                                                         \
 422     ST_H2(RTYPE, in0, in1, (pdst), stride);               \
 423     ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 424 }
 425 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
 426
 427 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
 428 {                                                                 \
 429     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
 430     ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
 431 }
 432 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
 433
 434 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 435 {                                                                           \
 436     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
 437     ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 438 }
 439 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
 440
 441 /* Description : Store vectors of word elements with stride
 442    Arguments   : Inputs  - in0, in1, stride
 443                  Outputs - pdst    (destination pointer to store to)
 444                  Return Type - signed word
 445    Details     : Stores 4 word elements from 'in0' to (pdst)
 446                  Stores 4 word elements from 'in1' to (pdst + stride)
 447 */
 448 #define ST_SW2(in0, in1, pdst, stride)  \
 449 {                                       \
 450     ST_SW(in0, (pdst));                 \
 451     ST_SW(in1, (pdst) + stride);        \
 452 }
 453
 454 /* Description : Store as 2x4 byte block to destination memory from input vector
 455    Arguments   : Inputs  - in, stidx, pdst, stride
 456                  Return Type - unsigned byte
 457    Details     : Index stidx halfword element from 'in' vector is copied and
 458                  stored on first line
 459                  Index stidx+1 halfword element from 'in' vector is copied and
 460                  stored on second line
 461                  Index stidx+2 halfword element from 'in' vector is copied and
 462                  stored on third line
 463                  Index stidx+3 halfword element from 'in' vector is copied and
 464                  stored on fourth line
 465 */
 466 #define ST2x4_UB(in, stidx, pdst, stride)              \
 467 {                                                      \
 468     uint16_t out0_m, out1_m, out2_m, out3_m;           \
 469     uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
 470                                                        \
 471     out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
 472     out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
 473     out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
 474     out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
 475                                                        \
 476     SH(out0_m, pblk_2x4_m);                            \
 477     SH(out1_m, pblk_2x4_m + stride);                   \
 478     SH(out2_m, pblk_2x4_m + 2 * stride);               \
 479     SH(out3_m, pblk_2x4_m + 3 * stride);               \
 480 }
 481
 482 /* Description : Store as 4x2 byte block to destination memory from input vector
 483    Arguments   : Inputs  - in, pdst, stride
 484                  Return Type - unsigned byte
 485    Details     : Index 0 word element from input vector is copied and stored
 486                  on first line
 487                  Index 1 word element from input vector is copied and stored
 488                  on second line
 489 */
 490 #define ST4x2_UB(in, pdst, stride)             \
 491 {                                              \
 492     uint32_t out0_m, out1_m;                   \
 493     uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
 494                                                \
 495     out0_m = __msa_copy_u_w((v4i32) in, 0);    \
 496     out1_m = __msa_copy_u_w((v4i32) in, 1);    \
 497                                                \
 498     SW(out0_m, pblk_4x2_m);                    \
 499     SW(out1_m, pblk_4x2_m + stride);           \
 500 }
 501
 502 /* Description : Store as 4x4 byte block to destination memory from input vector
 503    Arguments   : Inputs  - in0, in1, pdst, stride
 504                  Return Type - unsigned byte
 505    Details     : Idx0 word element from input vector 'in0' is copied and stored
 506                  on first line
 507                  Idx1 word element from input vector 'in0' is copied and stored
 508                  on second line
 509                  Idx2 word element from input vector 'in1' is copied and stored
 510                  on third line
 511                  Idx3 word element from input vector 'in1' is copied and stored
 512                  on fourth line
 513 */
 514 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
 515 {                                                                 \
 516     uint32_t out0_m, out1_m, out2_m, out3_m;                      \
 517     uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
 518                                                                   \
 519     out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
 520     out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
 521     out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
 522     out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
 523                                                                   \
 524     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 525 }
 526 #define ST4x8_UB(in0, in1, pdst, stride)                            \
 527 {                                                                   \
 528     uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
 529                                                                     \
 530     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
 531     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
 532 }
 533
 534 /* Description : Store as 6x4 byte block to destination memory from input
 535                  vectors
 536    Arguments   : Inputs  - in0, in1, pdst, stride
 537                  Return Type - unsigned byte
 538    Details     : Index 0 word element from input vector 'in0' is copied and
 539                  stored on first line followed by index 2 halfword element
 540                  Index 2 word element from input vector 'in0' is copied and
 541                  stored on second line followed by index 2 halfword element
 542                  Index 0 word element from input vector 'in1' is copied and
 543                  stored on third line followed by index 2 halfword element
 544                  Index 2 word element from input vector 'in1' is copied and
 545                  stored on fourth line followed by index 2 halfword element
 546 */
 547 #define ST6x4_UB(in0, in1, pdst, stride)       \
 548 {                                              \
 549     uint32_t out0_m, out1_m, out2_m, out3_m;   \
 550     uint16_t out4_m, out5_m, out6_m, out7_m;   \
 551     uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
 552                                                \
 553     out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
 554     out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
 555     out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
 556     out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
 557                                                \
 558     out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
 559     out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
 560     out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
 561     out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
 562                                                \
 563     SW(out0_m, pblk_6x4_m);                    \
 564     SH(out4_m, (pblk_6x4_m + 4));              \
 565     pblk_6x4_m += stride;                      \
 566     SW(out1_m, pblk_6x4_m);                    \
 567     SH(out5_m, (pblk_6x4_m + 4));              \
 568     pblk_6x4_m += stride;                      \
 569     SW(out2_m, pblk_6x4_m);                    \
 570     SH(out6_m, (pblk_6x4_m + 4));              \
 571     pblk_6x4_m += stride;                      \
 572     SW(out3_m, pblk_6x4_m);                    \
 573     SH(out7_m, (pblk_6x4_m + 4));              \
 574 }
 575
 576 /* Description : Store as 8x2 byte block to destination memory from input vector
 577    Arguments   : Inputs  - in, pdst, stride
 578    Details     : Index 0 double word element from input vector 'in' is copied
 579                  and stored to destination memory at (pdst)
 580                  Index 1 double word element from input vector 'in' is copied
 581                  and stored to destination memory at (pdst + stride)
 582 */
 583 #define ST8x2_UB(in, pdst, stride)             \
 584 {                                              \
 585     uint64_t out0_m, out1_m;                   \
 586     uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
 587                                                \
 588     out0_m = __msa_copy_u_d((v2i64) in, 0);    \
 589     out1_m = __msa_copy_u_d((v2i64) in, 1);    \
 590                                                \
 591     SD(out0_m, pblk_8x2_m);                    \
 592     SD(out1_m, pblk_8x2_m + stride);           \
 593 }
 594
 595 /* Description : Store as 8x4 byte block to destination memory from input
 596                  vectors
 597    Arguments   : Inputs  - in0, in1, pdst, stride
 598    Details     : Index 0 double word element from input vector 'in0' is copied
 599                  and stored to destination memory at (pblk_8x4_m)
 600                  Index 1 double word element from input vector 'in0' is copied
 601                  and stored to destination memory at (pblk_8x4_m + stride)
 602                  Index 0 double word element from input vector 'in1' is copied
 603                  and stored to destination memory at (pblk_8x4_m + 2 * stride)
 604                  Index 1 double word element from input vector 'in1' is copied
 605                  and stored to destination memory at (pblk_8x4_m + 3 * stride)
 606 */
 607 #define ST8x4_UB(in0, in1, pdst, stride)                      \
 608 {                                                             \
 609     uint64_t out0_m, out1_m, out2_m, out3_m;                  \
 610     uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
 611                                                               \
 612     out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
 613     out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
 614     out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
 615     out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
 616                                                               \
 617     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 618 }
 619 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
 620 {                                                         \
 621     uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
 622                                                           \
 623     ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
 624     ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 625 }
 626 #define ST12x4_UB(in0, in1, in2, pdst, stride)                \
 627 {                                                             \
 628     uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
 629                                                               \
 630     /* left 8x4 */                                            \
 631     ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
 632     /* right 4x4 */                                           \
 633     ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
 634 }
 635
 636 /* Description : Store as 12x8 byte block to destination memory from
 637                  input vectors
 638    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 639    Details     : Index 0 double word element from input vector 'in0' is copied
 640                  and stored to destination memory at (pblk_12x8_m) followed by
 641                  index 2 word element from same input vector 'in0' at
 642                  (pblk_12x8_m + 8)
 643                  Similar to remaining lines
 644 */
 645 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 646 {                                                                        \
 647     uint64_t out0_m, out1_m, out2_m, out3_m;                             \
 648     uint64_t out4_m, out5_m, out6_m, out7_m;                             \
 649     uint32_t out8_m, out9_m, out10_m, out11_m;                           \
 650     uint32_t out12_m, out13_m, out14_m, out15_m;                         \
 651     uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
 652                                                                          \
 653     out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
 654     out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
 655     out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
 656     out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
 657     out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
 658     out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
 659     out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
 660     out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
 661                                                                          \
 662     out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
 663     out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
 664     out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
 665     out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
 666     out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
 667     out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
 668     out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
 669     out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
 670                                                                          \
 671     SD(out0_m, pblk_12x8_m);                                             \
 672     SW(out8_m, pblk_12x8_m + 8);                                         \
 673     pblk_12x8_m += stride;                                               \
 674     SD(out1_m, pblk_12x8_m);                                             \
 675     SW(out9_m, pblk_12x8_m + 8);                                         \
 676     pblk_12x8_m += stride;                                               \
 677     SD(out2_m, pblk_12x8_m);                                             \
 678     SW(out10_m, pblk_12x8_m + 8);                                        \
 679     pblk_12x8_m += stride;                                               \
 680     SD(out3_m, pblk_12x8_m);                                             \
 681     SW(out11_m, pblk_12x8_m + 8);                                        \
 682     pblk_12x8_m += stride;                                               \
 683     SD(out4_m, pblk_12x8_m);                                             \
 684     SW(out12_m, pblk_12x8_m + 8);                                        \
 685     pblk_12x8_m += stride;                                               \
 686     SD(out5_m, pblk_12x8_m);                                             \
 687     SW(out13_m, pblk_12x8_m + 8);                                        \
 688     pblk_12x8_m += stride;                                               \
 689     SD(out6_m, pblk_12x8_m);                                             \
 690     SW(out14_m, pblk_12x8_m + 8);                                        \
 691     pblk_12x8_m += stride;                                               \
 692     SD(out7_m, pblk_12x8_m);                                             \
 693     SW(out15_m, pblk_12x8_m + 8);                                        \
 694 }
 695
 696 /* Description : Immediate number of columns to slide with zero
 697    Arguments   : Inputs  - in0, in1, slide_val
 698                  Outputs - out0, out1
 699                  Return Type - as per RTYPE
 700    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
 701                  number of elements specified by 'slide_val'
 702 */
 703 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 704 {                                                                         \
 705     v16i8 zero_m = { 0 };                                                 \
 706     out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
 707     out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 708 }
 709 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 710
 711 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
 712                   out0, out1, out2, out3, slide_val)    \
 713 {                                                       \
 714     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
 715     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
 716 }
 717 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
 718
 719 /* Description : Shuffle byte vector elements as per mask vector
 720    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 721                  Outputs - out0, out1
 722                  Return Type - as per RTYPE
 723    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 724                  per control vector mask0
 725                  Selective byte elements from in2 & in3 are copied to out1 as
 726                  per control vector mask1
 727 */
 728 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 729 {                                                                          \
 730     out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
 731     out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 732 }
 733 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 734 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 735 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 736 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 737
 738 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 739                 out0, out1, out2)                                          \
 740 {                                                                          \
 741     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 742     out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 743 }
 744 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
 745
 746 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
 747                 out0, out1, out2, out3)                            \
 748 {                                                                  \
 749     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
 750     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
 751 }
 752 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
 753
 754 /* Description : Shuffle byte vector elements as per mask vector
 755    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 756                  Outputs - out0, out1
 757                  Return Type - as per RTYPE
 758    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 759                  per control vector mask0
 760                  Selective byte elements from in2 & in3 are copied to out1 as
 761                  per control vector mask1
 762 */
 763 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 764 {                                                                         \
 765     out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
 766     out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 767 }
 768 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
 769
 770 /* Description : Dot product of byte vector elements
 771    Arguments   : Inputs  - mult0, mult1
 772                            cnst0, cnst1
 773                  Outputs - out0, out1
 774                  Return Type - signed halfword
 775    Details     : Signed byte elements from mult0 are multiplied with
 776                  signed byte elements from cnst0 producing a result
 777                  twice the size of input i.e. signed halfword.
 778                  Then this multiplication results of adjacent odd-even elements
 779                  are added together and stored to the out vector
 780                  (2 signed halfword results)
 781 */
 782 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 783 {                                                                 \
 784     out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
 785     out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 786 }
 787 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 788
 789 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
 790                  out0, out1, out2)                                 \
 791 {                                                                  \
 792     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
 793     out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 794 }
 795 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
 796
 797 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 798                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 799 {                                                                     \
 800     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 801     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 802 }
 803 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
 804
 805 /* Description : Dot product & addition of byte vector elements
 806    Arguments   : Inputs  - mult0, mult1
 807                            cnst0, cnst1
 808                  Outputs - out0, out1
 809                  Return Type - signed halfword
 810    Details     : Signed byte elements from mult0 are multiplied with
 811                  signed byte elements from cnst0 producing a result
 812                  twice the size of input i.e. signed halfword.
 813                  Then this multiplication results of adjacent odd-even elements
 814                  are added to the out vector
 815                  (2 signed halfword results)
 816 */
 817 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 818 {                                                                  \
 819     out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
 820                                    (v16i8) mult0, (v16i8) cnst0);  \
 821     out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
 822                                    (v16i8) mult1, (v16i8) cnst1);  \
 823 }
 824 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
 825
 826 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 827                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 828 {                                                                      \
 829     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 830     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 831 }
 832 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 833
 834 /* Description : Dot product & addition of halfword vector elements
 835    Arguments   : Inputs  - mult0, mult1
 836                            cnst0, cnst1
 837                  Outputs - out0, out1
 838                  Return Type - signed word
 839    Details     : Signed halfword elements from mult0 are multiplied with
 840                  signed halfword elements from cnst0 producing a result
 841                  twice the size of input i.e. signed word.
 842                  Then this multiplication results of adjacent odd-even elements
 843                  are added to the out vector
 844                  (2 signed word results)
 845 */
 846 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 847 {                                                                  \
 848     out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
 849                                    (v8i16) mult0, (v8i16) cnst0);  \
 850     out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
 851                                    (v8i16) mult1, (v8i16) cnst1);  \
 852 }
 853 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
 854
 855 /* Description : Clips all halfword elements of input vector between min & max
 856                  out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
 857    Arguments   : Inputs  - in       (input vector)
 858                          - min      (min threshold)
 859                          - max      (max threshold)
 860                  Outputs - out_m    (output vector with clipped elements)
 861                  Return Type - signed halfword
 862 */
 863 #define CLIP_SH(in, min, max)                           \
 864 ( {                                                     \
 865     v8i16 out_m;                                        \
 866                                                         \
 867     out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
 868     out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
 869     out_m;                                              \
 870 } )
 871
 872 /* Description : Clips all signed halfword elements of input vector
 873                  between 0 & 255
 874    Arguments   : Inputs  - in       (input vector)
 875                  Outputs - out_m    (output vector with clipped elements)
 876                  Return Type - signed halfword
 877 */
 878 #define CLIP_SH_0_255(in)                                 \
 879 ( {                                                       \
 880     v8i16 max_m = __msa_ldi_h(255);                       \
 881     v8i16 out_m;                                          \
 882                                                           \
 883     out_m = __msa_maxi_s_h((v8i16) in, 0);                \
 884     out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
 885     out_m;                                                \
 886 } )
 887 #define CLIP_SH2_0_255(in0, in1)  \
 888 {                                 \
 889     in0 = CLIP_SH_0_255(in0);     \
 890     in1 = CLIP_SH_0_255(in1);     \
 891 }
 892 #define CLIP_SH4_0_255(in0, in1, in2, in3)  \
 893 {                                           \
 894     CLIP_SH2_0_255(in0, in1);               \
 895     CLIP_SH2_0_255(in2, in3);               \
 896 }
 897
 898 /* Description : Clips all signed word elements of input vector
 899                  between 0 & 255
 900    Arguments   : Inputs  - in       (input vector)
 901                  Outputs - out_m    (output vector with clipped elements)
 902                  Return Type - signed word
 903 */
 904 #define CLIP_SW_0_255(in)                                 \
 905 ( {                                                       \
 906     v4i32 max_m = __msa_ldi_w(255);                       \
 907     v4i32 out_m;                                          \
 908                                                           \
 909     out_m = __msa_maxi_s_w((v4i32) in, 0);                \
 910     out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
 911     out_m;                                                \
 912 } )
 913
 914 /* Description : Horizontal subtraction of unsigned byte vector elements
 915    Arguments   : Inputs  - in0, in1
 916                  Outputs - out0, out1
 917                  Return Type - as per RTYPE
 918    Details     : Each unsigned odd byte element from 'in0' is subtracted from
 919                  even unsigned byte element from 'in0' (pairwise) and the
 920                  halfword result is stored in 'out0'
 921 */
 922 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
 923 {                                                             \
 924     out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
 925     out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
 926 }
 927 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
 928 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
 929
 930 /* Description : Interleave even halfword elements from vectors
 931    Arguments   : Inputs  - in0, in1, in2, in3
 932                  Outputs - out0, out1
 933                  Return Type - as per RTYPE
 934    Details     : Even halfword elements of 'in0' and even halfword
 935                  elements of 'in1' are interleaved and copied to 'out0'
 936                  Even halfword elements of 'in2' and even halfword
 937                  elements of 'in3' are interleaved and copied to 'out1'
 938 */
 939 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 940 {                                                            \
 941     out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
 942     out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
 943 }
 944 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
 945
 946 /* Description : Interleave even word elements from vectors
 947    Arguments   : Inputs  - in0, in1, in2, in3
 948                  Outputs - out0, out1
 949                  Return Type - as per RTYPE
 950    Details     : Even word elements of 'in0' and even word
 951                  elements of 'in1' are interleaved and copied to 'out0'
 952                  Even word elements of 'in2' and even word
 953                  elements of 'in3' are interleaved and copied to 'out1'
 954 */
 955 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 956 {                                                            \
 957     out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
 958     out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
 959 }
 960 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
 961
 962 /* Description : Interleave even double word elements from vectors
 963    Arguments   : Inputs  - in0, in1, in2, in3
 964                  Outputs - out0, out1
 965                  Return Type - as per RTYPE
 966    Details     : Even double word elements of 'in0' and even double word
 967                  elements of 'in1' are interleaved and copied to 'out0'
 968                  Even double word elements of 'in2' and even double word
 969                  elements of 'in3' are interleaved and copied to 'out1'
 970 */
 971 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 972 {                                                            \
 973     out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
 974     out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
 975 }
 976 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
 977
 978 /* Description : Interleave left half of byte elements from vectors
 979    Arguments   : Inputs  - in0, in1, in2, in3
 980                  Outputs - out0, out1
 981                  Return Type - as per RTYPE
 982    Details     : Left half of byte elements of in0 and left half of byte
 983                  elements of in1 are interleaved and copied to out0.
 984                  Left half of byte elements of in2 and left half of byte
 985                  elements of in3 are interleaved and copied to out1.
 986 */
 987 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 988 {                                                           \
 989     out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
 990     out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
 991 }
 992 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
 993 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
 994
 995 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
 996                 out0, out1, out2, out3)                         \
 997 {                                                               \
 998     ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
 999     ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
1000 }
1001 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1002 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1003 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1004
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                           (ILVL_H4 additionally takes in4, in5, in6, in7)
                 Outputs - out0, out1
                           (ILVL_H4 additionally writes out2, out3)
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)

/* Four-pair variant: two ILVL_H2 invocations */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1028
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1044
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (ILVR_B2 takes in0 ... in3 only)
                 Outputs - out0, out1, out2, out3
                           (ILVR_B2 writes out0, out1 only)
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* Four-pair variant: two ILVR_B2 invocations */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1074
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (ILVR_H2 takes in0 ... in3, ILVR_H3 in0 ... in5)
                 Outputs - out0, out1, out2, out3
                           (ILVR_H2 writes out0, out1, ILVR_H3 out0 ... out2)
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)

/* Three-pair variant: (in4, in5) additionally produces out2 */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));          \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* Four-pair variant: two ILVR_H2 invocations */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1106
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                           (ILVR_W4 additionally takes in4, in5, in6, in7)
                 Outputs - out0, out1
                           (ILVR_W4 additionally writes out2, out3)
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w of 'in0' and 'in1', 'out1'
                 receives __msa_ilvr_w of 'in2' and 'in3'
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

/* Four-pair variant: two ILVR_W2 invocations */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1122
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (ILVR_D2 takes in0 ... in3, ILVR_D3 in0 ... in5)
                 Outputs - out0, out1, out2, out3
                           (ILVR_D2 writes out0, out1, ILVR_D3 out0 ... out2)
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Three-pair variant: (in4, in5) additionally produces out2 */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* Four-pair variant: two ILVR_D2 invocations */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1154
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 ILVRL_H2 and ILVRL_W2 do the same with the halfword and
                 word interleave intrinsics
                 Input arguments are parenthesized before the cast, so
                 compound expressions may be passed for them
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

/* Halfword variant of ILVRL_B2 */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2 */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1187
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
                 Similar for the other vectors
                 MAXI_SH2 handles in0, in1; MAXI_SH4 handles in0 .. in3
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                   \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1209
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
                 SAT_UH2 handles in0, in1; SAT_UH4 handles in0 .. in3
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1233
/* Description : Saturate the halfword element values to the signed
                 range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value range representable in (sat_val+1) bits
                 Results are written in place to the original vectors
                 SAT_SH2 handles in0, in1; SAT_SH3 handles in0 .. in2;
                 SAT_SH4 handles in0 .. in3
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)              \
{                                                           \
    SAT_SH2(RTYPE, in0, in1, sat_val);                      \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val));  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1264
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (idx2, idx3 for SPLATI_H4)
                 Outputs - out0, out1 (out2, out3 for SPLATI_H4)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for halfword operation is 0-7
                 'in' is evaluated once per output and is parenthesized
                 before the cast
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)      \
{                                                         \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), (idx0));  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), (idx1));  \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1289
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3
                 SPLATI_W4 takes no index: it replicates word elements
                 0, 1, 2 and 3 of 'in' to 'out0', 'out1', 'out2' and 'out3'
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1314
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for PCKEV_B4)
                 Outputs - out0, out1 (up to out3 for PCKEV_B4)
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 Similar for the further pairs of PCKEV_B3 and PCKEV_B4.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));          \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1354
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for PCKEV_H4)
                 Outputs - out0, out1 (up to out3 for PCKEV_H4)
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 Similar for the further pairs of PCKEV_H4.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1382
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (up to in7 for XORI_B8_128)
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
                 The XORI_B3 .. XORI_B8 variants are composed from the
                 2-, 3- and 4-vector forms
*/
#define XORI_B2_128(RTYPE, in0, in1)                 \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)            \
{                                                    \
    XORI_B2_128(RTYPE, in0, in1);                    \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1445
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for ADDS_SH4)
                 Outputs - out0, out1 (up to out3 for ADDS_SH4)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1470
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 'shift' is evaluated once per vector and is parenthesized,
                 so an expression such as 'a & b' is applied as a whole
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
1486
/* Description : Shift right all elements of vector using the C '>>' operator
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
                 NOTE: the shift is arithmetic only when the element type is
                 signed; with unsigned element types '>>' shifts in zeros
                 'shift' is evaluated once per vector and is parenthesized,
                 so an expression such as 'a & b' is applied as a whole
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
1504
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 'shift' is evaluated once per vector and is parenthesized
                 before the cast
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1523
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift (in2, in3 for the wider forms)
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_H2 handles in0, in1; SRAR_H3 handles in0 .. in2;
                 SRAR_H4 handles in0 .. in3
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
1557
/* Description : Shift right arithmetic rounded (immediate), halfword elements
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for 'in1'
*/
#define SRARI_H2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1575
/* Description : Shift right arithmetic rounded (immediate), word elements
   Arguments   : Inputs  - in0, in1, shift (in2, in3 for SRARI_W4)
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1600
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for MUL4)
                 Outputs - out0, out1 (up to out3 for MUL4)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized so that expression arguments
                 (e.g. 'a + b') are multiplied as a whole
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
1618
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors)
                 Return Type - signed halfword (v8i16, via ILVRL_B2_SH)
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 The extension is done by interleaving the bytes of 'in' with
                 an all-zero vector
                 'in' is forwarded parenthesized so that an expression
                 argument is cast as a whole
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, (in), out0, out1);            \
}
1632
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : A per-element "less than zero" mask of 'in' is computed
                 and interleaved right with 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is evaluated more than once and is parenthesized
                 before the cast
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
1651
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The input pairs (in1, in0) and (in3, in2) are combined with
                 ILVR_D2_SB and the two results are byte interleaved twice;
                 the final interleave is written to 'out0'
                 'out1', 'out2' and 'out3' are each derived from the previous
                 output by a 4 byte __msa_sldi_b slide against a zero vector
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    s2_m = (v16i8) __msa_ilvr_b((v16i8) s1_m, (v16i8) s0_m);            \
    s3_m = (v16i8) __msa_ilvl_b((v16i8) s1_m, (v16i8) s0_m);            \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
1671
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are paired as (in0, in4), (in1, in5), (in2, in6) and
                 (in3, in7) with ILVEV_W2_SB, then interleaved at byte,
                 halfword and word granularity
                 'out0' and 'out2' receive the right and left word
                 interleaves; 'out1' and 'out3' are built from the left
                 doubleword halves of 'out0' and 'out2'
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
                                                                        \
    tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m);      \
    tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m);      \
                                                                        \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m);        \
    out2 = (RTYPE) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m);        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
1696
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Rows 'inN' and 'inN+8' are first paired with ILVEV_D2_UB,
                 then even/odd interleaves at byte, halfword and word
                 granularity build the eight output vectors
                 The output vectors are used as scratch storage from the very
                 first step, so they must not alias any input vector
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
1745
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - none; the macro is a statement expression whose
                           value is the resulting vector
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulted vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
                 'in1' is passed as the first and 'in0' as the second operand
                 of the pack; both are parenthesized before the cast
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);             \
    out_m;                                                        \
} )
1761
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : (in1, in0) and (in3, in2) are packed with PCKEV_B2_SB
                 Word elements 0 and 2 of each packed vector are copied out
                 with __msa_copy_u_w and handed to SW4 together with 'pdst'
                 and 'stride'
                 Vector and address arguments are forwarded parenthesized so
                 that expression arguments stay intact inside the helpers
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)       \
{                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                   \
    v16i8 tmp0_m, tmp1_m;                                      \
                                                               \
    PCKEV_B2_SB((in1), (in0), (in3), (in2), tmp0_m, tmp1_m);   \
                                                               \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);                \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);                \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);                \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);                \
                                                               \
    SW4(out0_m, out1_m, out2_m, out3_m, (pdst), (stride));     \
}
1781 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */