git.sesse.net Git - ffmpeg/blob - libavutil/mips/generic_macros_msa.h

   1 /*
   2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  23
  24 #include <stdint.h>
  25 #include <msa.h>
  26
  27 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
  28 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  29 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
  30
  31 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
  32 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
  33 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
  34
  35 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
  36 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
  37 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
  38
  39 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  40 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  41 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
  42
  43 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  44 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
  45 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
  46
  47 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  48 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
  49 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  50
  51 #if (__mips_isa_rev >= 6)
  52     #define LW(psrc)                           \
  53     ( {                                        \
  54         uint8_t *psrc_m = (uint8_t *) (psrc);  \
  55         uint32_t val_m;                        \
  56                                                \
  57         __asm__ volatile (                     \
  58             "lw  %[val_m],  %[psrc_m]  \n\t"   \
  59                                                \
  60             : [val_m] "=r" (val_m)             \
  61             : [psrc_m] "m" (*psrc_m)           \
  62         );                                     \
  63                                                \
  64         val_m;                                 \
  65     } )
  66
  67     #if (__mips == 64)
  68         #define LD(psrc)                           \
  69         ( {                                        \
  70             uint8_t *psrc_m = (uint8_t *) (psrc);  \
  71             uint64_t val_m = 0;                    \
  72                                                    \
  73             __asm__ volatile (                     \
  74                 "ld  %[val_m],  %[psrc_m]  \n\t"   \
  75                                                    \
  76                 : [val_m] "=r" (val_m)             \
  77                 : [psrc_m] "m" (*psrc_m)           \
  78             );                                     \
  79                                                    \
  80             val_m;                                 \
  81         } )
  82     #else  // !(__mips == 64)
  83         #define LD(psrc)                                              \
  84         ( {                                                           \
  85             uint8_t *psrc_m = (uint8_t *) (psrc);                     \
  86             uint32_t val0_m, val1_m;                                  \
  87             uint64_t val_m = 0;                                       \
  88                                                                       \
  89             val0_m = LW(psrc_m);                                      \
  90             val1_m = LW(psrc_m + 4);                                  \
  91                                                                       \
  92             val_m = (uint64_t) (val1_m);                              \
  93             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
  94             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
  95                                                                       \
  96             val_m;                                                    \
  97         } )
  98     #endif  // (__mips == 64)
  99
 100     #define SH(val, pdst)                      \
 101     {                                          \
 102         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 103         uint16_t val_m = (val);                \
 104                                                \
 105         __asm__ volatile (                     \
 106             "sh  %[val_m],  %[pdst_m]  \n\t"   \
 107                                                \
 108             : [pdst_m] "=m" (*pdst_m)          \
 109             : [val_m] "r" (val_m)              \
 110         );                                     \
 111     }
 112
 113     #define SW(val, pdst)                      \
 114     {                                          \
 115         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 116         uint32_t val_m = (val);                \
 117                                                \
 118         __asm__ volatile (                     \
 119             "sw  %[val_m],  %[pdst_m]  \n\t"   \
 120                                                \
 121             : [pdst_m] "=m" (*pdst_m)          \
 122             : [val_m] "r" (val_m)              \
 123         );                                     \
 124     }
 125
 126     #define SD(val, pdst)                      \
 127     {                                          \
 128         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 129         uint64_t val_m = (val);                \
 130                                                \
 131         __asm__ volatile (                     \
 132             "sd  %[val_m],  %[pdst_m]  \n\t"   \
 133                                                \
 134             : [pdst_m] "=m" (*pdst_m)          \
 135             : [val_m] "r" (val_m)              \
 136         );                                     \
 137     }
 138 #else  // !(__mips_isa_rev >= 6)
 139     #define LW(psrc)                           \
 140     ( {                                        \
 141         uint8_t *psrc_m = (uint8_t *) (psrc);  \
 142         uint32_t val_m;                        \
 143                                                \
 144         __asm__ volatile (                     \
 145             "ulw  %[val_m],  %[psrc_m]  \n\t"  \
 146                                                \
 147             : [val_m] "=r" (val_m)             \
 148             : [psrc_m] "m" (*psrc_m)           \
 149         );                                     \
 150                                                \
 151         val_m;                                 \
 152     } )
 153
 154     #if (__mips == 64)
 155         #define LD(psrc)                           \
 156         ( {                                        \
 157             uint8_t *psrc_m = (uint8_t *) (psrc);  \
 158             uint64_t val_m = 0;                    \
 159                                                    \
 160             __asm__ volatile (                     \
 161                 "uld  %[val_m],  %[psrc_m]  \n\t"  \
 162                                                    \
 163                 : [val_m] "=r" (val_m)             \
 164                 : [psrc_m] "m" (*psrc_m)           \
 165             );                                     \
 166                                                    \
 167             val_m;                                 \
 168         } )
 169     #else  // !(__mips == 64)
 170         #define LD(psrc)                                              \
 171         ( {                                                           \
 172             uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
 173             uint32_t val0_m, val1_m;                                  \
 174             uint64_t val_m = 0;                                       \
 175                                                                       \
 176             val0_m = LW(psrc_m1);                                     \
 177             val1_m = LW(psrc_m1 + 4);                                 \
 178                                                                       \
 179             val_m = (uint64_t) (val1_m);                              \
 180             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
 181             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
 182                                                                       \
 183             val_m;                                                    \
 184         } )
 185     #endif  // (__mips == 64)
 186
 187     #define SH(val, pdst)                      \
 188     {                                          \
 189         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 190         uint16_t val_m = (val);                \
 191                                                \
 192         __asm__ volatile (                     \
 193             "ush  %[val_m],  %[pdst_m]  \n\t"  \
 194                                                \
 195             : [pdst_m] "=m" (*pdst_m)          \
 196             : [val_m] "r" (val_m)              \
 197         );                                     \
 198     }
 199
 200     #define SW(val, pdst)                      \
 201     {                                          \
 202         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 203         uint32_t val_m = (val);                \
 204                                                \
 205         __asm__ volatile (                     \
 206             "usw  %[val_m],  %[pdst_m]  \n\t"  \
 207                                                \
 208             : [pdst_m] "=m" (*pdst_m)          \
 209             : [val_m] "r" (val_m)              \
 210         );                                     \
 211     }
 212
 213     #define SD(val, pdst)                                          \
 214     {                                                              \
 215         uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
 216         uint32_t val0_m, val1_m;                                   \
 217                                                                    \
 218         val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
 219         val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
 220                                                                    \
 221         SW(val0_m, pdst_m1);                                       \
 222         SW(val1_m, pdst_m1 + 4);                                   \
 223     }
 224 #endif // (__mips_isa_rev >= 6)
 225
 226 /* Description : Load 4 words with stride
 227    Arguments   : Inputs  - psrc    (source pointer to load from)
 228                          - stride
 229                  Outputs - out0, out1, out2, out3
 230    Details     : Loads word in 'out0' from (psrc)
 231                  Loads word in 'out1' from (psrc + stride)
 232                  Loads word in 'out2' from (psrc + 2 * stride)
 233                  Loads word in 'out3' from (psrc + 3 * stride)
 234 */
 235 #define LW4(psrc, stride, out0, out1, out2, out3)  \
 236 {                                                  \
 237     out0 = LW((psrc));                             \
 238     out1 = LW((psrc) + stride);                    \
 239     out2 = LW((psrc) + 2 * stride);                \
 240     out3 = LW((psrc) + 3 * stride);                \
 241 }
 242
 243 /* Description : Store 4 words with stride
 244    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 245    Details     : Stores word from 'in0' to (pdst)
 246                  Stores word from 'in1' to (pdst + stride)
 247                  Stores word from 'in2' to (pdst + 2 * stride)
 248                  Stores word from 'in3' to (pdst + 3 * stride)
 249 */
 250 #define SW4(in0, in1, in2, in3, pdst, stride)  \
 251 {                                              \
 252     SW(in0, (pdst))                            \
 253     SW(in1, (pdst) + stride);                  \
 254     SW(in2, (pdst) + 2 * stride);              \
 255     SW(in3, (pdst) + 3 * stride);              \
 256 }
 257
 258 /* Description : Store 4 double words with stride
 259    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 260    Details     : Stores double word from 'in0' to (pdst)
 261                  Stores double word from 'in1' to (pdst + stride)
 262                  Stores double word from 'in2' to (pdst + 2 * stride)
 263                  Stores double word from 'in3' to (pdst + 3 * stride)
 264 */
 265 #define SD4(in0, in1, in2, in3, pdst, stride)  \
 266 {                                              \
 267     SD(in0, (pdst))                            \
 268     SD(in1, (pdst) + stride);                  \
 269     SD(in2, (pdst) + 2 * stride);              \
 270     SD(in3, (pdst) + 3 * stride);              \
 271 }
 272
 273 /* Description : Load vectors with 16 byte elements with stride
 274    Arguments   : Inputs  - psrc    (source pointer to load from)
 275                          - stride
 276                  Outputs - out0, out1
 277                  Return Type - as per RTYPE
 278    Details     : Loads 16 byte elements in 'out0' from (psrc)
 279                  Loads 16 byte elements in 'out1' from (psrc + stride)
 280 */
 281 #define LD_B2(RTYPE, psrc, stride, out0, out1)  \
 282 {                                               \
 283     out0 = LD_B(RTYPE, (psrc));                 \
 284     out1 = LD_B(RTYPE, (psrc) + stride);        \
 285 }
 286 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 287 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 288
 289 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
 290 {                                                     \
 291     LD_B2(RTYPE, (psrc), stride, out0, out1);         \
 292     out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 293 }
 294 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
 295
 296 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 297 {                                                            \
 298     LD_B2(RTYPE, (psrc), stride, out0, out1);                \
 299     LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 300 }
 301 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 302 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 303
 304 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 305 {                                                                 \
 306     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
 307     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
 308 }
 309 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
 310 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
 311
 312 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 313 {                                                                       \
 314     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 315     LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 316 }
 317 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
 318
 319 #define LD_B7(RTYPE, psrc, stride,                               \
 320               out0, out1, out2, out3, out4, out5, out6)          \
 321 {                                                                \
 322     LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
 323     LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
 324 }
 325 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
 326
 327 #define LD_B8(RTYPE, psrc, stride,                                      \
 328               out0, out1, out2, out3, out4, out5, out6, out7)           \
 329 {                                                                       \
 330     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 331     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 332 }
 333 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
 334 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 335
 336 /* Description : Load vectors with 8 halfword elements with stride
 337    Arguments   : Inputs  - psrc    (source pointer to load from)
 338                          - stride
 339                  Outputs - out0, out1
 340    Details     : Loads 8 halfword elements in 'out0' from (psrc)
 341                  Loads 8 halfword elements in 'out1' from (psrc + stride)
 342 */
 343 #define LD_H2(RTYPE, psrc, stride, out0, out1)  \
 344 {                                               \
 345     out0 = LD_H(RTYPE, (psrc));                 \
 346     out1 = LD_H(RTYPE, (psrc) + (stride));      \
 347 }
 348 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 349 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 350
 351 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
 352 {                                                           \
 353     LD_H2(RTYPE, (psrc), stride, out0, out1);               \
 354     LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
 355 }
 356 #define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
 357 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 358
 359 #define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 360 {                                                                       \
 361     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 362     LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 363 }
 364 #define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
 365 #define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
 366
 367 #define LD_H8(RTYPE, psrc, stride,                                      \
 368               out0, out1, out2, out3, out4, out5, out6, out7)           \
 369 {                                                                       \
 370     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 371     LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 372 }
 373 #define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
 374 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
 375
 376 /* Description : Store vectors of 16 byte elements with stride
 377    Arguments   : Inputs  - in0, in1, stride
 378                  Outputs - pdst    (destination pointer to store to)
 379    Details     : Stores 16 byte elements from 'in0' to (pdst)
 380                  Stores 16 byte elements from 'in1' to (pdst + stride)
 381 */
 382 #define ST_B2(RTYPE, in0, in1, pdst, stride)  \
 383 {                                             \
 384     ST_B(RTYPE, in0, (pdst));                 \
 385     ST_B(RTYPE, in1, (pdst) + stride);        \
 386 }
 387 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 388 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
 389
 390 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 391 {                                                         \
 392     ST_B2(RTYPE, in0, in1, (pdst), stride);               \
 393     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 394 }
 395 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 396 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
 397
 398 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
 399               pdst, stride)                                         \
 400 {                                                                   \
 401     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
 402     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
 403 }
 404 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 405
 406 /* Description : Store vectors of 8 halfword elements with stride
 407    Arguments   : Inputs  - in0, in1, stride
 408                  Outputs - pdst    (destination pointer to store to)
 409    Details     : Stores 8 halfword elements from 'in0' to (pdst)
 410                  Stores 8 halfword elements from 'in1' to (pdst + stride)
 411 */
 412 #define ST_H2(RTYPE, in0, in1, pdst, stride)  \
 413 {                                             \
 414     ST_H(RTYPE, in0, (pdst));                 \
 415     ST_H(RTYPE, in1, (pdst) + stride);        \
 416 }
 417 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
 418 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 419
 420 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 421 {                                                         \
 422     ST_H2(RTYPE, in0, in1, (pdst), stride);               \
 423     ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 424 }
 425 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
 426
 427 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
 428 {                                                                 \
 429     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
 430     ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
 431 }
 432 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
 433
 434 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 435 {                                                                           \
 436     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
 437     ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 438 }
 439 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
 440
 441 /* Description : Store vectors of word elements with stride
 442    Arguments   : Inputs  - in0, in1, stride
 443                  Outputs - pdst    (destination pointer to store to)
 444                  Return Type - signed word
 445    Details     : Stores 4 word elements from 'in0' to (pdst)
 446                  Stores 4 word elements from 'in1' to (pdst + stride)
 447 */
 448 #define ST_SW2(in0, in1, pdst, stride)  \
 449 {                                       \
 450     ST_SW(in0, (pdst));                 \
 451     ST_SW(in1, (pdst) + stride);        \
 452 }
 453
 454 /* Description : Store as 2x4 byte block to destination memory from input vector
 455    Arguments   : Inputs  - in, stidx, pdst, stride
 456                  Return Type - unsigned byte
 457    Details     : Index stidx halfword element from 'in' vector is copied and
 458                  stored on first line
 459                  Index stidx+1 halfword element from 'in' vector is copied and
 460                  stored on second line
 461                  Index stidx+2 halfword element from 'in' vector is copied and
 462                  stored on third line
 463                  Index stidx+3 halfword element from 'in' vector is copied and
 464                  stored on fourth line
 465 */
 466 #define ST2x4_UB(in, stidx, pdst, stride)              \
 467 {                                                      \
 468     uint16_t out0_m, out1_m, out2_m, out3_m;           \
 469     uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
 470                                                        \
 471     out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
 472     out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
 473     out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
 474     out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
 475                                                        \
 476     SH(out0_m, pblk_2x4_m);                            \
 477     SH(out1_m, pblk_2x4_m + stride);                   \
 478     SH(out2_m, pblk_2x4_m + 2 * stride);               \
 479     SH(out3_m, pblk_2x4_m + 3 * stride);               \
 480 }
 481
 482 /* Description : Store as 4x2 byte block to destination memory from input vector
 483    Arguments   : Inputs  - in, pdst, stride
 484                  Return Type - unsigned byte
 485    Details     : Index 0 word element from input vector is copied and stored
 486                  on first line
 487                  Index 1 word element from input vector is copied and stored
 488                  on second line
 489 */
 490 #define ST4x2_UB(in, pdst, stride)             \
 491 {                                              \
 492     uint32_t out0_m, out1_m;                   \
 493     uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
 494                                                \
 495     out0_m = __msa_copy_u_w((v4i32) in, 0);    \
 496     out1_m = __msa_copy_u_w((v4i32) in, 1);    \
 497                                                \
 498     SW(out0_m, pblk_4x2_m);                    \
 499     SW(out1_m, pblk_4x2_m + stride);           \
 500 }
 501
 502 /* Description : Store as 4x4 byte block to destination memory from input vector
 503    Arguments   : Inputs  - in0, in1, pdst, stride
 504                  Return Type - unsigned byte
 505    Details     : Idx0 word element from input vector 'in0' is copied and stored
 506                  on first line
 507                  Idx1 word element from input vector 'in0' is copied and stored
 508                  on second line
 509                  Idx2 word element from input vector 'in1' is copied and stored
 510                  on third line
 511                  Idx3 word element from input vector 'in1' is copied and stored
 512                  on fourth line
 513 */
 514 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
 515 {                                                                 \
 516     uint32_t out0_m, out1_m, out2_m, out3_m;                      \
 517     uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
 518                                                                   \
 519     out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
 520     out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
 521     out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
 522     out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
 523                                                                   \
 524     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 525 }
 526 #define ST4x8_UB(in0, in1, pdst, stride)                            \
 527 {                                                                   \
 528     uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
 529                                                                     \
 530     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
 531     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
 532 }
 533
 534 /* Description : Store as 6x4 byte block to destination memory from input
 535                  vectors
 536    Arguments   : Inputs  - in0, in1, pdst, stride
 537                  Return Type - unsigned byte
 538    Details     : Index 0 word element from input vector 'in0' is copied and
 539                  stored on first line followed by index 2 halfword element
 540                  Index 2 word element from input vector 'in0' is copied and
 541                  stored on second line followed by index 2 halfword element
 542                  Index 0 word element from input vector 'in1' is copied and
 543                  stored on third line followed by index 2 halfword element
 544                  Index 2 word element from input vector 'in1' is copied and
 545                  stored on fourth line followed by index 2 halfword element
 546 */
 547 #define ST6x4_UB(in0, in1, pdst, stride)       \
 548 {                                              \
 549     uint32_t out0_m, out1_m, out2_m, out3_m;   \
 550     uint16_t out4_m, out5_m, out6_m, out7_m;   \
 551     uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
 552                                                \
 553     out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
 554     out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
 555     out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
 556     out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
 557                                                \
 558     out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
 559     out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
 560     out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
 561     out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
 562                                                \
 563     SW(out0_m, pblk_6x4_m);                    \
 564     SH(out4_m, (pblk_6x4_m + 4));              \
 565     pblk_6x4_m += stride;                      \
 566     SW(out1_m, pblk_6x4_m);                    \
 567     SH(out5_m, (pblk_6x4_m + 4));              \
 568     pblk_6x4_m += stride;                      \
 569     SW(out2_m, pblk_6x4_m);                    \
 570     SH(out6_m, (pblk_6x4_m + 4));              \
 571     pblk_6x4_m += stride;                      \
 572     SW(out3_m, pblk_6x4_m);                    \
 573     SH(out7_m, (pblk_6x4_m + 4));              \
 574 }
 575
 576 /* Description : Store as 8x2 byte block to destination memory from input vector
 577    Arguments   : Inputs  - in, pdst, stride
 578    Details     : Index 0 double word element from input vector 'in' is copied
 579                  and stored to destination memory at (pdst)
 580                  Index 1 double word element from input vector 'in' is copied
 581                  and stored to destination memory at (pdst + stride)
 582 */
 583 #define ST8x2_UB(in, pdst, stride)             \
 584 {                                              \
 585     uint64_t out0_m, out1_m;                   \
 586     uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
 587                                                \
 588     out0_m = __msa_copy_u_d((v2i64) in, 0);    \
 589     out1_m = __msa_copy_u_d((v2i64) in, 1);    \
 590                                                \
 591     SD(out0_m, pblk_8x2_m);                    \
 592     SD(out1_m, pblk_8x2_m + stride);           \
 593 }
 594
 595 /* Description : Store as 8x4 byte block to destination memory from input
 596                  vectors
 597    Arguments   : Inputs  - in0, in1, pdst, stride
 598    Details     : Index 0 double word element from input vector 'in0' is copied
 599                  and stored to destination memory at (pblk_8x4_m)
 600                  Index 1 double word element from input vector 'in0' is copied
 601                  and stored to destination memory at (pblk_8x4_m + stride)
 602                  Index 0 double word element from input vector 'in1' is copied
 603                  and stored to destination memory at (pblk_8x4_m + 2 * stride)
 604                  Index 1 double word element from input vector 'in1' is copied
 605                  and stored to destination memory at (pblk_8x4_m + 3 * stride)
 606 */
 607 #define ST8x4_UB(in0, in1, pdst, stride)                      \
 608 {                                                             \
 609     uint64_t out0_m, out1_m, out2_m, out3_m;                  \
 610     uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
 611                                                               \
 612     out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
 613     out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
 614     out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
 615     out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
 616                                                               \
 617     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 618 }
 619 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
 620 {                                                         \
 621     uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
 622                                                           \
 623     ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
 624     ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 625 }
 626 #define ST12x4_UB(in0, in1, in2, pdst, stride)                \
 627 {                                                             \
 628     uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
 629                                                               \
 630     /* left 8x4 */                                            \
 631     ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
 632     /* right 4x4 */                                           \
 633     ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
 634 }
 635
 636 /* Description : Store as 12x8 byte block to destination memory from
 637                  input vectors
 638    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 639    Details     : Index 0 double word element from input vector 'in0' is copied
 640                  and stored to destination memory at (pblk_12x8_m) followed by
 641                  index 2 word element from same input vector 'in0' at
 642                  (pblk_12x8_m + 8)
 643                  Similar to remaining lines
 644 */
 645 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 646 {                                                                        \
 647     uint64_t out0_m, out1_m, out2_m, out3_m;                             \
 648     uint64_t out4_m, out5_m, out6_m, out7_m;                             \
 649     uint32_t out8_m, out9_m, out10_m, out11_m;                           \
 650     uint32_t out12_m, out13_m, out14_m, out15_m;                         \
 651     uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
 652                                                                          \
 653     out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
 654     out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
 655     out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
 656     out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
 657     out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
 658     out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
 659     out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
 660     out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
 661                                                                          \
 662     out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
 663     out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
 664     out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
 665     out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
 666     out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
 667     out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
 668     out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
 669     out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
 670                                                                          \
 671     SD(out0_m, pblk_12x8_m);                                             \
 672     SW(out8_m, pblk_12x8_m + 8);                                         \
 673     pblk_12x8_m += stride;                                               \
 674     SD(out1_m, pblk_12x8_m);                                             \
 675     SW(out9_m, pblk_12x8_m + 8);                                         \
 676     pblk_12x8_m += stride;                                               \
 677     SD(out2_m, pblk_12x8_m);                                             \
 678     SW(out10_m, pblk_12x8_m + 8);                                        \
 679     pblk_12x8_m += stride;                                               \
 680     SD(out3_m, pblk_12x8_m);                                             \
 681     SW(out11_m, pblk_12x8_m + 8);                                        \
 682     pblk_12x8_m += stride;                                               \
 683     SD(out4_m, pblk_12x8_m);                                             \
 684     SW(out12_m, pblk_12x8_m + 8);                                        \
 685     pblk_12x8_m += stride;                                               \
 686     SD(out5_m, pblk_12x8_m);                                             \
 687     SW(out13_m, pblk_12x8_m + 8);                                        \
 688     pblk_12x8_m += stride;                                               \
 689     SD(out6_m, pblk_12x8_m);                                             \
 690     SW(out14_m, pblk_12x8_m + 8);                                        \
 691     pblk_12x8_m += stride;                                               \
 692     SD(out7_m, pblk_12x8_m);                                             \
 693     SW(out15_m, pblk_12x8_m + 8);                                        \
 694 }
 695
 696 /* Description : Immediate number of columns to slide with zero
 697    Arguments   : Inputs  - in0, in1, slide_val
 698                  Outputs - out0, out1
 699                  Return Type - as per RTYPE
 700    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
 701                  number of elements specified by 'slide_val'
 702 */
 703 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 704 {                                                                         \
 705     v16i8 zero_m = { 0 };                                                 \
 706     out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
 707     out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 708 }
 709 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 710
 711 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
 712                   out0, out1, out2, out3, slide_val)    \
 713 {                                                       \
 714     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
 715     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
 716 }
 717 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
 718
 719 /* Description : Shuffle byte vector elements as per mask vector
 720    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 721                  Outputs - out0, out1
 722                  Return Type - as per RTYPE
 723    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 724                  per control vector mask0
 725                  Selective byte elements from in2 & in3 are copied to out1 as
 726                  per control vector mask1
 727 */
 728 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 729 {                                                                          \
 730     out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
 731     out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 732 }
 733 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 734 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 735 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 736 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 737
 738 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 739                 out0, out1, out2)                                          \
 740 {                                                                          \
 741     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 742     out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 743 }
 744 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
 745
 746 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
 747                 out0, out1, out2, out3)                            \
 748 {                                                                  \
 749     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
 750     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
 751 }
 752 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
 753
 754 /* Description : Shuffle byte vector elements as per mask vector
 755    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 756                  Outputs - out0, out1
 757                  Return Type - as per RTYPE
 758    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 759                  per control vector mask0
 760                  Selective byte elements from in2 & in3 are copied to out1 as
 761                  per control vector mask1
 762 */
 763 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 764 {                                                                         \
 765     out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
 766     out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 767 }
 768 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
 769
 770 /* Description : Dot product of byte vector elements
 771    Arguments   : Inputs  - mult0, mult1
 772                            cnst0, cnst1
 773                  Outputs - out0, out1
 774                  Return Type - signed halfword
 775    Details     : Signed byte elements from mult0 are multiplied with
 776                  signed byte elements from cnst0 producing a result
 777                  twice the size of input i.e. signed halfword.
 778                  Then this multiplication results of adjacent odd-even elements
 779                  are added together and stored to the out vector
 780                  (2 signed halfword results)
 781 */
 782 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 783 {                                                                 \
 784     out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
 785     out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 786 }
 787 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 788
 789 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
 790                  out0, out1, out2)                                 \
 791 {                                                                  \
 792     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
 793     out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 794 }
 795 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
 796
 797 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 798                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 799 {                                                                     \
 800     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 801     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 802 }
 803 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
 804
 805 /* Description : Dot product of halfword vector elements
 806    Arguments   : Inputs  - mult0, mult1
 807                            cnst0, cnst1
 808                  Outputs - out0, out1
 809                  Return Type - signed word
 810    Details     : Signed halfword elements from mult0 are multiplied with
 811                  signed halfword elements from cnst0 producing a result
 812                  twice the size of input i.e. signed word.
 813                  Then this multiplication results of adjacent odd-even elements
 814                  are added together and stored to the out vector
 815                  (2 signed word results)
 816 */
 817 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 818 {                                                                 \
 819     out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
 820     out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
 821 }
 822 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
 823
 824 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
 825                  cnst0, cnst1, cnst2, cnst3,                  \
 826                  out0, out1, out2, out3)                      \
 827 {                                                             \
 828     DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
 829     DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
 830 }
 831 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
 832
 833 /* Description : Dot product & addition of byte vector elements
 834    Arguments   : Inputs  - mult0, mult1
 835                            cnst0, cnst1
 836                  Outputs - out0, out1
 837                  Return Type - signed halfword
 838    Details     : Signed byte elements from mult0 are multiplied with
 839                  signed byte elements from cnst0 producing a result
 840                  twice the size of input i.e. signed halfword.
 841                  Then this multiplication results of adjacent odd-even elements
 842                  are added to the out vector
 843                  (2 signed halfword results)
 844 */
 845 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 846 {                                                                  \
 847     out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
 848                                    (v16i8) mult0, (v16i8) cnst0);  \
 849     out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
 850                                    (v16i8) mult1, (v16i8) cnst1);  \
 851 }
 852 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
 853
 854 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 855                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 856 {                                                                      \
 857     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 858     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 859 }
 860 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 861
 862 /* Description : Dot product & addition of halfword vector elements
 863    Arguments   : Inputs  - mult0, mult1
 864                            cnst0, cnst1
 865                  Outputs - out0, out1
 866                  Return Type - signed word
 867    Details     : Signed halfword elements from mult0 are multiplied with
 868                  signed halfword elements from cnst0 producing a result
 869                  twice the size of input i.e. signed word.
 870                  Then this multiplication results of adjacent odd-even elements
 871                  are added to the out vector
 872                  (2 signed word results)
 873 */
 874 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 875 {                                                                  \
 876     out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
 877                                    (v8i16) mult0, (v8i16) cnst0);  \
 878     out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
 879                                    (v8i16) mult1, (v8i16) cnst1);  \
 880 }
 881 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
 882
 883 /* Description : Clips all halfword elements of input vector between min & max
 884                  out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
 885    Arguments   : Inputs  - in       (input vector)
 886                          - min      (min threshold)
 887                          - max      (max threshold)
 888                  Outputs - out_m    (output vector with clipped elements)
 889                  Return Type - signed halfword
 890 */
 891 #define CLIP_SH(in, min, max)                           \
 892 ( {                                                     \
 893     v8i16 out_m;                                        \
 894                                                         \
 895     out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
 896     out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
 897     out_m;                                              \
 898 } )
 899
 900 /* Description : Clips all signed halfword elements of input vector
 901                  between 0 & 255
 902    Arguments   : Inputs  - in       (input vector)
 903                  Outputs - out_m    (output vector with clipped elements)
 904                  Return Type - signed halfword
 905 */
 906 #define CLIP_SH_0_255(in)                                 \
 907 ( {                                                       \
 908     v8i16 max_m = __msa_ldi_h(255);                       \
 909     v8i16 out_m;                                          \
 910                                                           \
 911     out_m = __msa_maxi_s_h((v8i16) in, 0);                \
 912     out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
 913     out_m;                                                \
 914 } )
 915 #define CLIP_SH2_0_255(in0, in1)  \
 916 {                                 \
 917     in0 = CLIP_SH_0_255(in0);     \
 918     in1 = CLIP_SH_0_255(in1);     \
 919 }
 920 #define CLIP_SH4_0_255(in0, in1, in2, in3)  \
 921 {                                           \
 922     CLIP_SH2_0_255(in0, in1);               \
 923     CLIP_SH2_0_255(in2, in3);               \
 924 }
 925
 926 /* Description : Clips all signed word elements of input vector
 927                  between 0 & 255
 928    Arguments   : Inputs  - in       (input vector)
 929                  Outputs - out_m    (output vector with clipped elements)
 930                  Return Type - signed word
 931 */
 932 #define CLIP_SW_0_255(in)                                 \
 933 ( {                                                       \
 934     v4i32 max_m = __msa_ldi_w(255);                       \
 935     v4i32 out_m;                                          \
 936                                                           \
 937     out_m = __msa_maxi_s_w((v4i32) in, 0);                \
 938     out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
 939     out_m;                                                \
 940 } )
 941
 942 /* Description : Horizontal subtraction of unsigned byte vector elements
 943    Arguments   : Inputs  - in0, in1
 944                  Outputs - out0, out1
 945                  Return Type - as per RTYPE
 946    Details     : Each unsigned odd byte element from 'in0' is subtracted from
 947                  even unsigned byte element from 'in0' (pairwise) and the
 948                  halfword result is stored in 'out0'
 949 */
 950 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
 951 {                                                             \
 952     out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
 953     out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
 954 }
 955 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
 956 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
 957
 958 /* Description : Interleave even halfword elements from vectors
 959    Arguments   : Inputs  - in0, in1, in2, in3
 960                  Outputs - out0, out1
 961                  Return Type - as per RTYPE
 962    Details     : Even halfword elements of 'in0' and even halfword
 963                  elements of 'in1' are interleaved and copied to 'out0'
 964                  Even halfword elements of 'in2' and even halfword
 965                  elements of 'in3' are interleaved and copied to 'out1'
 966 */
 967 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 968 {                                                            \
 969     out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
 970     out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
 971 }
 972 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
 973
 974 /* Description : Interleave even word elements from vectors
 975    Arguments   : Inputs  - in0, in1, in2, in3
 976                  Outputs - out0, out1
 977                  Return Type - as per RTYPE
 978    Details     : Even word elements of 'in0' and even word
 979                  elements of 'in1' are interleaved and copied to 'out0'
 980                  Even word elements of 'in2' and even word
 981                  elements of 'in3' are interleaved and copied to 'out1'
 982 */
 983 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 984 {                                                            \
 985     out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
 986     out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
 987 }
 988 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
 989
 990 /* Description : Interleave even double word elements from vectors
 991    Arguments   : Inputs  - in0, in1, in2, in3
 992                  Outputs - out0, out1
 993                  Return Type - as per RTYPE
 994    Details     : Even double word elements of 'in0' and even double word
 995                  elements of 'in1' are interleaved and copied to 'out0'
 996                  Even double word elements of 'in2' and even double word
 997                  elements of 'in3' are interleaved and copied to 'out1'
 998 */
 999 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1000 {                                                            \
1001     out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
1002     out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
1003 }
1004 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1005
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
                 ILVL_B4 does the same for four input pairs (in0 ... in7,
                 out0 ... out3)
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1032
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
                 ILVL_H4 does the same for four input pairs (in0 ... in7,
                 out0 ... out3)
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1057
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1073
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 ILVR_B2 takes only in0 ... in3 and out0, out1
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1103
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 ILVR_H2 and ILVR_H3 take only the first two and three
                 input pairs / outputs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));          \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1136
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of word elements of in0 and right half of word
                 elements of in1 are interleaved and copied to out0.
                 Right half of word elements of in2 and right half of word
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
                 ILVR_W2 takes only in0 ... in3 and out0, out1
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1152
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
                 ILVR_D3 and ILVR_D4 continue with the pairs (in4, in5) -> out2
                 and (in6, in7) -> out3
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));          \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));          \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));  \
    out3 = (RTYPE) __msa_ilvr_d((v2i64) (in6), (v2i64) (in7));  \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1184
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 ILVRL_H2 and ILVRL_W2 do the same on halfword and word
                 elements respectively
   Note        : 'in0' and 'in1' are expanded twice and 'out0' is written
                 before 'out1' is computed, so 'out0' must not be the same
                 object as 'in0' or 'in1'.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1217
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'
                 Similar for the other vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                   \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1239
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1263
/* Description : Saturate the halfword element values to the signed
                 range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value range generated with (sat_val+1) bits
                 Results are written in place to the original vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)              \
{                                                           \
    SAT_SH2(RTYPE, in0, in1, sat_val);                      \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val));  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1294
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (idx2, idx3 for SPLATI_H4)
                 Outputs - out0, out1 (out2, out3 for SPLATI_H4)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for halfword operation is 0-7
   Note        : 'in' is expanded once per output and the outputs are
                 written in order, so no output may be the same object
                 as 'in'.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)      \
{                                                         \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), (idx0));  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), (idx1));  \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1319
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3, so 'stidx'
                  itself must not exceed 2
                 SPLATI_W4 replicates elements 0, 1, 2 and 3 of 'in' to
                  'out0', 'out1', 'out2' and 'out3' respectively
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                 \
{                                                               \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));       \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1)); \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1344
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for PCKEV_B4)
                 Outputs - out0, out1 (up to out3 for PCKEV_B4)
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 Similar for the remaining pairs in PCKEV_B3 / PCKEV_B4.
   Note        : Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));          \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1384
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for PCKEV_H4)
                 Outputs - out0, out1 (up to out3 for PCKEV_H4)
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 Similar for the remaining pairs in PCKEV_H4.
   Note        : Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1412
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (up to in7 for XORI_B8_128)
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for the other vectors; the B3..B8 variants are
                 built from the smaller ones
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define XORI_B2_128(RTYPE, in0, in1)                 \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)            \
{                                                    \
    XORI_B2_128(RTYPE, in0, in1);                    \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1475
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for ADDS_SH4)
                 Outputs - out0, out1 (up to out3 for ADDS_SH4)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
   Note        : Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1500
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for the other vectors
   Note        : 'shift' is parenthesized, so an expression such as 'a | b'
                 is used as a whole as the shift amount.
                 Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
1516
/* Description : Shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 The shift is done with the C '>>' operator on the operand's
                 own type
                 Here, 'shift' is GP variable passed in
                 Similar for the other vectors
   Note        : 'shift' is parenthesized, so an expression such as 'a | b'
                 is used as a whole as the shift amount.
                 Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
1534
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for the other vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1553
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift (in2, in3 for SRAR_H3 / SRAR_H4)
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for the other vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
1587
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift (in2, in3 for SRAR_W4)
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for the other vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1612
/* Description : Shift right arithmetic rounded (immediate), halfword elements
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for 'in1'
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1630
/* Description : Shift right arithmetic rounded (immediate), word elements
   Arguments   : Inputs  - in0, in1, shift (in2, in3 for SRARI_W4)
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for the other vectors
   Note        : Each vector argument is both read and assigned, so it must
                 be a modifiable lvalue.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                  \
{                                                         \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift));  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1655
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for MUL4)
                 Outputs - out0, out1 (up to out3 for MUL4)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
   Note        : Operands are parenthesized, so expression arguments such as
                 'a + b' are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
1673
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for ADD4)
                 Outputs - out0, out1 (up to out3 for ADD4)
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced ('out0' = 'in0' + 'in1', 'out1' = 'in2' + 'in3')
                 ADD4 does the same for 4 pairs
   Note        : Operands are parenthesized, so expression arguments using
                 lower-precedence operators (e.g. 'a & b', 'a << b') are
                 added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
1690
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 (bytes of 'in' are interleaved with a zero vector)
   Note        : 'in' is forwarded parenthesized so that an expression
                 argument is treated as a single operand.
                 The macro declares a local named 'zero_m'; the caller's
                 arguments must not use that name.
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, (in), out0, out1);            \
}
1704
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (compare "less than 0") and interleaved right
                 with the same vector 'in' to generate 4 signed word
                 elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'
   Note        : 'in' is parenthesized so that an expression argument is
                 treated as a single operand.
                 The macro declares a local named 'tmp_m'; the caller's
                 arguments must not use that name.
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
1723
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The right-half double words of in0/in1 and of in2/in3 are
                 interleaved, the two results are byte-interleaved (right
                 and left halves), and those are byte-interleaved once more
                 to form 'out0'.
                 'out1', 'out2' and 'out3' are each obtained from the
                 previous output by a 4-byte __msa_sldi_b slide against a
                 zero vector.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)       \
{                                                                            \
    v16i8 zero_m = { 0 };                                                    \
    v16i8 dbl01_m, dbl23_m, byte_r_m, byte_l_m;                              \
                                                                             \
    dbl01_m  = (v16i8) __msa_ilvr_d((v2i64) (in1), (v2i64) (in0));           \
    dbl23_m  = (v16i8) __msa_ilvr_d((v2i64) (in3), (v2i64) (in2));           \
    byte_r_m = (v16i8) __msa_ilvr_b((v16i8) dbl23_m, (v16i8) dbl01_m);       \
    byte_l_m = (v16i8) __msa_ilvl_b((v16i8) dbl23_m, (v16i8) dbl01_m);       \
                                                                             \
    out0 = (v16u8) __msa_ilvr_b(byte_l_m, byte_r_m);                         \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);                    \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);                    \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);                    \
}
1743
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Even words of the input pairs (in0,in4), (in1,in5),
                 (in2,in6), (in3,in7) are interleaved, the results are
                 byte-interleaved, then halfword- and word-interleaved.
                 'out0' and 'out2' come from the word interleave; 'out1'
                 and 'out3' are the left double word interleaves of those.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,       \
                        out0, out1, out2, out3)                              \
{                                                                            \
    v16i8 ev_a_m, ev_b_m;                                                    \
    v16i8 bytes_a_m, bytes_b_m;                                              \
    v16i8 half_r_m, half_l_m;                                                \
                                                                             \
    ILVEV_W2_SB(in0, in4, in1, in5, ev_a_m, ev_b_m);                         \
    bytes_a_m = __msa_ilvr_b(ev_b_m, ev_a_m);                                \
    ILVEV_W2_SB(in2, in6, in3, in7, ev_a_m, ev_b_m);                         \
    bytes_b_m = __msa_ilvr_b(ev_b_m, ev_a_m);                                \
                                                                             \
    half_r_m = (v16i8) __msa_ilvr_h((v8i16) bytes_b_m, (v8i16) bytes_a_m);   \
    half_l_m = (v16i8) __msa_ilvl_h((v8i16) bytes_b_m, (v8i16) bytes_a_m);   \
                                                                             \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) half_l_m, (v4i32) half_r_m);         \
    out2 = (RTYPE) __msa_ilvl_w((v4i32) half_l_m, (v4i32) half_r_m);         \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);                 \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);                 \
}

#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
1768
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : in(n) and in(n + 8) are handed pairwise to ILVEV_D2_UB; the
                 combined vectors then go through even/odd byte, halfword and
                 word interleaves (__msa_ilvev_* / __msa_ilvod_*) to form
                 out0..out7.
                 out0..out7 are used as scratch storage from the first step
                 on, before all inputs have been read, so no output may alias
                 an input.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    /* combine in(n) with in(n + 8); outN are scratch from here on */        \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    /* even/odd byte interleave of the combined vectors */                   \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    /* halfword then word interleaves produce the final outputs */           \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
1817
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Return  - the packed and xor'ed vector (the macro is a
                           statement expression that evaluates to it)
                 Return Type - unsigned byte (v16u8)
   Details     : Both inputs are reinterpreted as v16i8 and their even byte
                 elements are packed together in one vector by
                 __msa_pckev_b. The resulting vector is then xor'ed with 128
                 to shift the range from signed to unsigned byte.
                 'in0' and 'in1' are parenthesised before being cast, so the
                 cast applies to the whole argument expression.
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);             \
    out_m;                                                        \
} )
1833
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB is invoked on the pairs (in1, in0) and
                 (in3, in2), giving two packed vectors. Word elements 0 and 2
                 of each packed vector are copied into uint32_t temporaries,
                 which are passed to SW4 together with 'pdst' and 'stride'.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)      \
{                                                             \
    v16i8 pck01_m, pck23_m;                                   \
    uint32_t word0_m, word1_m, word2_m, word3_m;              \
                                                              \
    PCKEV_B2_SB(in1, in0, in3, in2, pck01_m, pck23_m);        \
                                                              \
    word0_m = __msa_copy_u_w((v4i32) pck01_m, 0);             \
    word1_m = __msa_copy_u_w((v4i32) pck01_m, 2);             \
    word2_m = __msa_copy_u_w((v4i32) pck23_m, 0);             \
    word3_m = __msa_copy_u_w((v4i32) pck23_m, 2);             \
                                                              \
    SW4(word0_m, word1_m, word2_m, word3_m, pdst, stride);    \
}
1853 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */