git.sesse.net Git - ffmpeg/blob - libavutil/mips/generic_macros_msa.h

   1 /*
   2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  23
  24 #include <stdint.h>
  25 #include <msa.h>
  26
  27 #define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
  28 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  29 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
  30
  31 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
  32 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
  33
  34 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
  35 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
  36
  37 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  38 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  39
  40 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  41 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
  42
  43 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  44 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  45
  46 #if (__mips_isa_rev >= 6)
  47     #define LW(psrc)                           \
  48     ( {                                        \
  49         uint8_t *psrc_m = (uint8_t *) (psrc);  \
  50         uint32_t val_m;                        \
  51                                                \
  52         __asm__ volatile (                     \
  53             "lw  %[val_m],  %[psrc_m]  \n\t"   \
  54                                                \
  55             : [val_m] "=r" (val_m)             \
  56             : [psrc_m] "m" (*psrc_m)           \
  57         );                                     \
  58                                                \
  59         val_m;                                 \
  60     } )
  61
  62     #if (__mips == 64)
  63         #define LD(psrc)                           \
  64         ( {                                        \
  65             uint8_t *psrc_m = (uint8_t *) (psrc);  \
  66             uint64_t val_m = 0;                    \
  67                                                    \
  68             __asm__ volatile (                     \
  69                 "ld  %[val_m],  %[psrc_m]  \n\t"   \
  70                                                    \
  71                 : [val_m] "=r" (val_m)             \
  72                 : [psrc_m] "m" (*psrc_m)           \
  73             );                                     \
  74                                                    \
  75             val_m;                                 \
  76         } )
  77     #else  // !(__mips == 64)
  78         #define LD(psrc)                                              \
  79         ( {                                                           \
  80             uint8_t *psrc_m = (uint8_t *) (psrc);                     \
  81             uint32_t val0_m, val1_m;                                  \
  82             uint64_t val_m = 0;                                       \
  83                                                                       \
  84             val0_m = LW(psrc_m);                                      \
  85             val1_m = LW(psrc_m + 4);                                  \
  86                                                                       \
  87             val_m = (uint64_t) (val1_m);                              \
  88             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
  89             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
  90                                                                       \
  91             val_m;                                                    \
  92         } )
  93     #endif  // (__mips == 64)
  94
  95     #define SH(val, pdst)                      \
  96     {                                          \
  97         uint8_t *pdst_m = (uint8_t *) (pdst);  \
  98         uint16_t val_m = (val);                \
  99                                                \
 100         __asm__ volatile (                     \
 101             "sh  %[val_m],  %[pdst_m]  \n\t"   \
 102                                                \
 103             : [pdst_m] "=m" (*pdst_m)          \
 104             : [val_m] "r" (val_m)              \
 105         );                                     \
 106     }
 107
 108     #define SW(val, pdst)                      \
 109     {                                          \
 110         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 111         uint32_t val_m = (val);                \
 112                                                \
 113         __asm__ volatile (                     \
 114             "sw  %[val_m],  %[pdst_m]  \n\t"   \
 115                                                \
 116             : [pdst_m] "=m" (*pdst_m)          \
 117             : [val_m] "r" (val_m)              \
 118         );                                     \
 119     }
 120
 121     #define SD(val, pdst)                      \
 122     {                                          \
 123         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 124         uint64_t val_m = (val);                \
 125                                                \
 126         __asm__ volatile (                     \
 127             "sd  %[val_m],  %[pdst_m]  \n\t"   \
 128                                                \
 129             : [pdst_m] "=m" (*pdst_m)          \
 130             : [val_m] "r" (val_m)              \
 131         );                                     \
 132     }
 133 #else  // !(__mips_isa_rev >= 6)
 134     #define LW(psrc)                           \
 135     ( {                                        \
 136         uint8_t *psrc_m = (uint8_t *) (psrc);  \
 137         uint32_t val_m;                        \
 138                                                \
 139         __asm__ volatile (                     \
 140             "ulw  %[val_m],  %[psrc_m]  \n\t"  \
 141                                                \
 142             : [val_m] "=r" (val_m)             \
 143             : [psrc_m] "m" (*psrc_m)           \
 144         );                                     \
 145                                                \
 146         val_m;                                 \
 147     } )
 148
 149     #if (__mips == 64)
 150         #define LD(psrc)                           \
 151         ( {                                        \
 152             uint8_t *psrc_m = (uint8_t *) (psrc);  \
 153             uint64_t val_m = 0;                    \
 154                                                    \
 155             __asm__ volatile (                     \
 156                 "uld  %[val_m],  %[psrc_m]  \n\t"  \
 157                                                    \
 158                 : [val_m] "=r" (val_m)             \
 159                 : [psrc_m] "m" (*psrc_m)           \
 160             );                                     \
 161                                                    \
 162             val_m;                                 \
 163         } )
 164     #else  // !(__mips == 64)
 165         #define LD(psrc)                                              \
 166         ( {                                                           \
 167             uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
 168             uint32_t val0_m, val1_m;                                  \
 169             uint64_t val_m = 0;                                       \
 170                                                                       \
 171             val0_m = LW(psrc_m1);                                     \
 172             val1_m = LW(psrc_m1 + 4);                                 \
 173                                                                       \
 174             val_m = (uint64_t) (val1_m);                              \
 175             val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
 176             val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
 177                                                                       \
 178             val_m;                                                    \
 179         } )
 180     #endif  // (__mips == 64)
 181
 182     #define SH(val, pdst)                      \
 183     {                                          \
 184         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 185         uint16_t val_m = (val);                \
 186                                                \
 187         __asm__ volatile (                     \
 188             "ush  %[val_m],  %[pdst_m]  \n\t"  \
 189                                                \
 190             : [pdst_m] "=m" (*pdst_m)          \
 191             : [val_m] "r" (val_m)              \
 192         );                                     \
 193     }
 194
 195     #define SW(val, pdst)                      \
 196     {                                          \
 197         uint8_t *pdst_m = (uint8_t *) (pdst);  \
 198         uint32_t val_m = (val);                \
 199                                                \
 200         __asm__ volatile (                     \
 201             "usw  %[val_m],  %[pdst_m]  \n\t"  \
 202                                                \
 203             : [pdst_m] "=m" (*pdst_m)          \
 204             : [val_m] "r" (val_m)              \
 205         );                                     \
 206     }
 207
 208     #define SD(val, pdst)                                          \
 209     {                                                              \
 210         uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
 211         uint32_t val0_m, val1_m;                                   \
 212                                                                    \
 213         val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
 214         val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
 215                                                                    \
 216         SW(val0_m, pdst_m1);                                       \
 217         SW(val1_m, pdst_m1 + 4);                                   \
 218     }
 219 #endif // (__mips_isa_rev >= 6)
 220
 221 /* Description : Load 4 words with stride
 222    Arguments   : Inputs  - psrc    (source pointer to load from)
 223                          - stride
 224                  Outputs - out0, out1, out2, out3
 225    Details     : Loads word in 'out0' from (psrc)
 226                  Loads word in 'out1' from (psrc + stride)
 227                  Loads word in 'out2' from (psrc + 2 * stride)
 228                  Loads word in 'out3' from (psrc + 3 * stride)
 229 */
 230 #define LW4(psrc, stride, out0, out1, out2, out3)  \
 231 {                                                  \
 232     out0 = LW((psrc));                             \
 233     out1 = LW((psrc) + stride);                    \
 234     out2 = LW((psrc) + 2 * stride);                \
 235     out3 = LW((psrc) + 3 * stride);                \
 236 }
 237
 238 /* Description : Store 4 words with stride
 239    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 240    Details     : Stores word from 'in0' to (pdst)
 241                  Stores word from 'in1' to (pdst + stride)
 242                  Stores word from 'in2' to (pdst + 2 * stride)
 243                  Stores word from 'in3' to (pdst + 3 * stride)
 244 */
 245 #define SW4(in0, in1, in2, in3, pdst, stride)  \
 246 {                                              \
 247     SW(in0, (pdst))                            \
 248     SW(in1, (pdst) + stride);                  \
 249     SW(in2, (pdst) + 2 * stride);              \
 250     SW(in3, (pdst) + 3 * stride);              \
 251 }
 252
 253 /* Description : Store 4 double words with stride
 254    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 255    Details     : Stores double word from 'in0' to (pdst)
 256                  Stores double word from 'in1' to (pdst + stride)
 257                  Stores double word from 'in2' to (pdst + 2 * stride)
 258                  Stores double word from 'in3' to (pdst + 3 * stride)
 259 */
 260 #define SD4(in0, in1, in2, in3, pdst, stride)  \
 261 {                                              \
 262     SD(in0, (pdst))                            \
 263     SD(in1, (pdst) + stride);                  \
 264     SD(in2, (pdst) + 2 * stride);              \
 265     SD(in3, (pdst) + 3 * stride);              \
 266 }
 267
 268 /* Description : Load vectors with 16 byte elements with stride
 269    Arguments   : Inputs  - psrc    (source pointer to load from)
 270                          - stride
 271                  Outputs - out0, out1
 272                  Return Type - as per RTYPE
 273    Details     : Loads 16 byte elements in 'out0' from (psrc)
 274                  Loads 16 byte elements in 'out1' from (psrc + stride)
 275 */
 276 #define LD_B2(RTYPE, psrc, stride, out0, out1)  \
 277 {                                               \
 278     out0 = LD_B(RTYPE, (psrc));                 \
 279     out1 = LD_B(RTYPE, (psrc) + stride);        \
 280 }
 281 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 282 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 283
 284 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
 285 {                                                     \
 286     LD_B2(RTYPE, (psrc), stride, out0, out1);         \
 287     out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 288 }
 289 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
 290
 291 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 292 {                                                            \
 293     LD_B2(RTYPE, (psrc), stride, out0, out1);                \
 294     LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 295 }
 296 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 297 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 298
 299 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 300 {                                                                 \
 301     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
 302     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
 303 }
 304 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
 305 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
 306
 307 #define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 308 {                                                                       \
 309     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 310     LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 311 }
 312 #define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
 313
 314 #define LD_B7(RTYPE, psrc, stride,                               \
 315               out0, out1, out2, out3, out4, out5, out6)          \
 316 {                                                                \
 317     LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
 318     LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
 319 }
 320 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
 321
 322 #define LD_B8(RTYPE, psrc, stride,                                      \
 323               out0, out1, out2, out3, out4, out5, out6, out7)           \
 324 {                                                                       \
 325     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 326     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 327 }
 328 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
 329 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 330
 331 /* Description : Store vectors of 16 byte elements with stride
 332    Arguments   : Inputs  - in0, in1, stride
 333                  Outputs - pdst    (destination pointer to store to)
 334    Details     : Stores 16 byte elements from 'in0' to (pdst)
 335                  Stores 16 byte elements from 'in1' to (pdst + stride)
 336 */
 337 #define ST_B2(RTYPE, in0, in1, pdst, stride)  \
 338 {                                             \
 339     ST_B(RTYPE, in0, (pdst));                 \
 340     ST_B(RTYPE, in1, (pdst) + stride);        \
 341 }
 342 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 343 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
 344
 345 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 346 {                                                         \
 347     ST_B2(RTYPE, in0, in1, (pdst), stride);               \
 348     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 349 }
 350 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 351 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
 352
 353 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
 354               pdst, stride)                                         \
 355 {                                                                   \
 356     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
 357     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
 358 }
 359 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 360
 361 /* Description : Store vectors of 8 halfword elements with stride
 362    Arguments   : Inputs  - in0, in1, stride
 363                  Outputs - pdst    (destination pointer to store to)
 364    Details     : Stores 8 halfword elements from 'in0' to (pdst)
 365                  Stores 8 halfword elements from 'in1' to (pdst + stride)
 366 */
 367 #define ST_H2(RTYPE, in0, in1, pdst, stride)  \
 368 {                                             \
 369     ST_H(RTYPE, in0, (pdst));                 \
 370     ST_H(RTYPE, in1, (pdst) + stride);        \
 371 }
 372 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
 373 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 374
 375 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 376 {                                                         \
 377     ST_H2(RTYPE, in0, in1, (pdst), stride);               \
 378     ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 379 }
 380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
 381
 382 #define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
 383 {                                                                 \
 384     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
 385     ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
 386 }
 387 #define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
 388
 389 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 390 {                                                                           \
 391     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
 392     ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 393 }
 394 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
 395
 396 /* Description : Store vectors of word elements with stride
 397    Arguments   : Inputs  - in0, in1, stride
 398                  Outputs - pdst    (destination pointer to store to)
 399                  Return Type - signed word
 400    Details     : Stores 4 word elements from 'in0' to (pdst)
 401                  Stores 4 word elements from 'in1' to (pdst + stride)
 402 */
 403 #define ST_SW2(in0, in1, pdst, stride)  \
 404 {                                       \
 405     ST_SW(in0, (pdst));                 \
 406     ST_SW(in1, (pdst) + stride);        \
 407 }
 408
 409 /* Description : Store as 2x4 byte block to destination memory from input vector
 410    Arguments   : Inputs  - in, stidx, pdst, stride
 411                  Return Type - unsigned byte
 412    Details     : Index stidx halfword element from 'in' vector is copied and
 413                  stored on first line
 414                  Index stidx+1 halfword element from 'in' vector is copied and
 415                  stored on second line
 416                  Index stidx+2 halfword element from 'in' vector is copied and
 417                  stored on third line
 418                  Index stidx+3 halfword element from 'in' vector is copied and
 419                  stored on fourth line
 420 */
 421 #define ST2x4_UB(in, stidx, pdst, stride)              \
 422 {                                                      \
 423     uint16_t out0_m, out1_m, out2_m, out3_m;           \
 424     uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
 425                                                        \
 426     out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
 427     out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
 428     out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
 429     out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
 430                                                        \
 431     SH(out0_m, pblk_2x4_m);                            \
 432     SH(out1_m, pblk_2x4_m + stride);                   \
 433     SH(out2_m, pblk_2x4_m + 2 * stride);               \
 434     SH(out3_m, pblk_2x4_m + 3 * stride);               \
 435 }
 436
 437 /* Description : Store as 4x2 byte block to destination memory from input vector
 438    Arguments   : Inputs  - in, pdst, stride
 439                  Return Type - unsigned byte
 440    Details     : Index 0 word element from input vector is copied and stored
 441                  on first line
 442                  Index 1 word element from input vector is copied and stored
 443                  on second line
 444 */
 445 #define ST4x2_UB(in, pdst, stride)             \
 446 {                                              \
 447     uint32_t out0_m, out1_m;                   \
 448     uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
 449                                                \
 450     out0_m = __msa_copy_u_w((v4i32) in, 0);    \
 451     out1_m = __msa_copy_u_w((v4i32) in, 1);    \
 452                                                \
 453     SW(out0_m, pblk_4x2_m);                    \
 454     SW(out1_m, pblk_4x2_m + stride);           \
 455 }
 456
 457 /* Description : Store as 4x4 byte block to destination memory from input vector
 458    Arguments   : Inputs  - in0, in1, pdst, stride
 459                  Return Type - unsigned byte
 460    Details     : Idx0 word element from input vector 'in0' is copied and stored
 461                  on first line
 462                  Idx1 word element from input vector 'in0' is copied and stored
 463                  on second line
 464                  Idx2 word element from input vector 'in1' is copied and stored
 465                  on third line
 466                  Idx3 word element from input vector 'in1' is copied and stored
 467                  on fourth line
 468 */
 469 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
 470 {                                                                 \
 471     uint32_t out0_m, out1_m, out2_m, out3_m;                      \
 472     uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
 473                                                                   \
 474     out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
 475     out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
 476     out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
 477     out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
 478                                                                   \
 479     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 480 }
 481
 482 /* Description : Store as 8x2 byte block to destination memory from input vector
 483    Arguments   : Inputs  - in, pdst, stride
 484    Details     : Index 0 double word element from input vector 'in' is copied
 485                  and stored to destination memory at (pdst)
 486                  Index 1 double word element from input vector 'in' is copied
 487                  and stored to destination memory at (pdst + stride)
 488 */
 489 #define ST8x2_UB(in, pdst, stride)             \
 490 {                                              \
 491     uint64_t out0_m, out1_m;                   \
 492     uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
 493                                                \
 494     out0_m = __msa_copy_u_d((v2i64) in, 0);    \
 495     out1_m = __msa_copy_u_d((v2i64) in, 1);    \
 496                                                \
 497     SD(out0_m, pblk_8x2_m);                    \
 498     SD(out1_m, pblk_8x2_m + stride);           \
 499 }
 500
 501 /* Description : Store as 8x4 byte block to destination memory from input
 502                  vectors
 503    Arguments   : Inputs  - in0, in1, pdst, stride
 504    Details     : Index 0 double word element from input vector 'in0' is copied
 505                  and stored to destination memory at (pblk_8x4_m)
 506                  Index 1 double word element from input vector 'in0' is copied
 507                  and stored to destination memory at (pblk_8x4_m + stride)
 508                  Index 0 double word element from input vector 'in1' is copied
 509                  and stored to destination memory at (pblk_8x4_m + 2 * stride)
 510                  Index 1 double word element from input vector 'in1' is copied
 511                  and stored to destination memory at (pblk_8x4_m + 3 * stride)
 512 */
 513 #define ST8x4_UB(in0, in1, pdst, stride)                      \
 514 {                                                             \
 515     uint64_t out0_m, out1_m, out2_m, out3_m;                  \
 516     uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
 517                                                               \
 518     out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
 519     out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
 520     out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
 521     out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
 522                                                               \
 523     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 524 }
 525 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
 526 {                                                         \
 527     uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
 528                                                           \
 529     ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
 530     ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 531 }
 532
 533 /* Description : Store as 12x8 byte block to destination memory from
 534                  input vectors
 535    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 536    Details     : Index 0 double word element from input vector 'in0' is copied
 537                  and stored to destination memory at (pblk_12x8_m) followed by
 538                  index 2 word element from same input vector 'in0' at
 539                  (pblk_12x8_m + 8)
 540                  Similar to remaining lines
 541 */
 542 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 543 {                                                                        \
 544     uint64_t out0_m, out1_m, out2_m, out3_m;                             \
 545     uint64_t out4_m, out5_m, out6_m, out7_m;                             \
 546     uint32_t out8_m, out9_m, out10_m, out11_m;                           \
 547     uint32_t out12_m, out13_m, out14_m, out15_m;                         \
 548     uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
 549                                                                          \
 550     out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
 551     out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
 552     out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
 553     out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
 554     out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
 555     out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
 556     out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
 557     out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
 558                                                                          \
 559     out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
 560     out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
 561     out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
 562     out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
 563     out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
 564     out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
 565     out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
 566     out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
 567                                                                          \
 568     SD(out0_m, pblk_12x8_m);                                             \
 569     SW(out8_m, pblk_12x8_m + 8);                                         \
 570     pblk_12x8_m += stride;                                               \
 571     SD(out1_m, pblk_12x8_m);                                             \
 572     SW(out9_m, pblk_12x8_m + 8);                                         \
 573     pblk_12x8_m += stride;                                               \
 574     SD(out2_m, pblk_12x8_m);                                             \
 575     SW(out10_m, pblk_12x8_m + 8);                                        \
 576     pblk_12x8_m += stride;                                               \
 577     SD(out3_m, pblk_12x8_m);                                             \
 578     SW(out11_m, pblk_12x8_m + 8);                                        \
 579     pblk_12x8_m += stride;                                               \
 580     SD(out4_m, pblk_12x8_m);                                             \
 581     SW(out12_m, pblk_12x8_m + 8);                                        \
 582     pblk_12x8_m += stride;                                               \
 583     SD(out5_m, pblk_12x8_m);                                             \
 584     SW(out13_m, pblk_12x8_m + 8);                                        \
 585     pblk_12x8_m += stride;                                               \
 586     SD(out6_m, pblk_12x8_m);                                             \
 587     SW(out14_m, pblk_12x8_m + 8);                                        \
 588     pblk_12x8_m += stride;                                               \
 589     SD(out7_m, pblk_12x8_m);                                             \
 590     SW(out15_m, pblk_12x8_m + 8);                                        \
 591 }
 592
 593 /* Description : Immediate number of columns to slide with zero
 594    Arguments   : Inputs  - in0, in1, slide_val
 595                  Outputs - out0, out1
 596                  Return Type - as per RTYPE
 597    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
 598                  number of elements specified by 'slide_val'
 599 */
 600 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 601 {                                                                         \
 602     v16i8 zero_m = { 0 };                                                 \
 603     out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
 604     out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 605 }
 606 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 607
 608 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
 609                   out0, out1, out2, out3, slide_val)    \
 610 {                                                       \
 611     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
 612     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
 613 }
 614 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
 615
 616 /* Description : Shuffle byte vector elements as per mask vector
 617    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 618                  Outputs - out0, out1
 619                  Return Type - as per RTYPE
 620    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 621                  per control vector mask0
 622                  Selective byte elements from in2 & in3 are copied to out1 as
 623                  per control vector mask1
 624 */
 625 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 626 {                                                                          \
 627     out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
 628     out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 629 }
 630 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 631 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 632 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 633 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 634
 635 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 636                 out0, out1, out2)                                          \
 637 {                                                                          \
 638     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 639     out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 640 }
 641 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
 642
 643 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
 644                 out0, out1, out2, out3)                            \
 645 {                                                                  \
 646     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
 647     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
 648 }
 649 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
 650
 651 /* Description : Shuffle byte vector elements as per mask vector
 652    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 653                  Outputs - out0, out1
 654                  Return Type - as per RTYPE
 655    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 656                  per control vector mask0
 657                  Selective byte elements from in2 & in3 are copied to out1 as
 658                  per control vector mask1
 659 */
 660 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 661 {                                                                         \
 662     out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
 663     out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 664 }
 665 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
 666
 667 /* Description : Dot product of byte vector elements
 668    Arguments   : Inputs  - mult0, mult1
 669                            cnst0, cnst1
 670                  Outputs - out0, out1
 671                  Return Type - signed halfword
 672    Details     : Signed byte elements from mult0 are multiplied with
 673                  signed byte elements from cnst0 producing a result
 674                  twice the size of input i.e. signed halfword.
 675                  Then this multiplication results of adjacent odd-even elements
 676                  are added together and stored to the out vector
 677                  (2 signed halfword results)
 678 */
 679 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 680 {                                                                 \
 681     out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
 682     out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 683 }
 684 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 685
 686 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
 687                  out0, out1, out2)                                 \
 688 {                                                                  \
 689     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
 690     out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 691 }
 692 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
 693
 694 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 695                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 696 {                                                                     \
 697     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 698     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 699 }
 700 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
 701
 702 /* Description : Dot product & addition of byte vector elements
 703    Arguments   : Inputs  - mult0, mult1
 704                            cnst0, cnst1
 705                  Outputs - out0, out1
 706                  Return Type - signed halfword
 707    Details     : Signed byte elements from mult0 are multiplied with
 708                  signed byte elements from cnst0 producing a result
 709                  twice the size of input i.e. signed halfword.
 710                  Then this multiplication results of adjacent odd-even elements
 711                  are added to the out vector
 712                  (2 signed halfword results)
 713 */
 714 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 715 {                                                                  \
 716     out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
 717                                    (v16i8) mult0, (v16i8) cnst0);  \
 718     out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
 719                                    (v16i8) mult1, (v16i8) cnst1);  \
 720 }
 721 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
 722
 723 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 724                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 725 {                                                                      \
 726     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 727     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 728 }
 729 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 730
 731 /* Description : Dot product & addition of halfword vector elements
 732    Arguments   : Inputs  - mult0, mult1
 733                            cnst0, cnst1
 734                  Outputs - out0, out1
 735                  Return Type - signed word
 736    Details     : Signed halfword elements from mult0 are multiplied with
 737                  signed halfword elements from cnst0 producing a result
 738                  twice the size of input i.e. signed word.
 739                  Then this multiplication results of adjacent odd-even elements
 740                  are added to the out vector
 741                  (2 signed word results)
 742 */
 743 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 744 {                                                                  \
 745     out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
 746                                    (v8i16) mult0, (v8i16) cnst0);  \
 747     out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
 748                                    (v8i16) mult1, (v8i16) cnst1);  \
 749 }
 750 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
 751
 752 /* Description : Clips all halfword elements of input vector between min & max
 753                  out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
 754    Arguments   : Inputs  - in       (input vector)
 755                          - min      (min threshold)
 756                          - max      (max threshold)
 757                  Outputs - out_m    (output vector with clipped elements)
 758                  Return Type - signed halfword
 759 */
 760 #define CLIP_SH(in, min, max)                           \
 761 ( {                                                     \
 762     v8i16 out_m;                                        \
 763                                                         \
 764     out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
 765     out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
 766     out_m;                                              \
 767 } )
 768
 769 /* Description : Clips all signed halfword elements of input vector
 770                  between 0 & 255
 771    Arguments   : Inputs  - in       (input vector)
 772                  Outputs - out_m    (output vector with clipped elements)
 773                  Return Type - signed halfword
 774 */
 775 #define CLIP_SH_0_255(in)                                 \
 776 ( {                                                       \
 777     v8i16 max_m = __msa_ldi_h(255);                       \
 778     v8i16 out_m;                                          \
 779                                                           \
 780     out_m = __msa_maxi_s_h((v8i16) in, 0);                \
 781     out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
 782     out_m;                                                \
 783 } )
 784 #define CLIP_SH2_0_255(in0, in1)  \
 785 {                                 \
 786     in0 = CLIP_SH_0_255(in0);     \
 787     in1 = CLIP_SH_0_255(in1);     \
 788 }
 789 #define CLIP_SH4_0_255(in0, in1, in2, in3)  \
 790 {                                           \
 791     CLIP_SH2_0_255(in0, in1);               \
 792     CLIP_SH2_0_255(in2, in3);               \
 793 }
 794
 795 /* Description : Clips all signed word elements of input vector
 796                  between 0 & 255
 797    Arguments   : Inputs  - in       (input vector)
 798                  Outputs - out_m    (output vector with clipped elements)
 799                  Return Type - signed word
 800 */
 801 #define CLIP_SW_0_255(in)                                 \
 802 ( {                                                       \
 803     v4i32 max_m = __msa_ldi_w(255);                       \
 804     v4i32 out_m;                                          \
 805                                                           \
 806     out_m = __msa_maxi_s_w((v4i32) in, 0);                \
 807     out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
 808     out_m;                                                \
 809 } )
 810
 811 /* Description : Horizontal subtraction of unsigned byte vector elements
 812    Arguments   : Inputs  - in0, in1
 813                  Outputs - out0, out1
 814                  Return Type - as per RTYPE
 815    Details     : Each unsigned odd byte element from 'in0' is subtracted from
 816                  even unsigned byte element from 'in0' (pairwise) and the
 817                  halfword result is stored in 'out0'
 818 */
 819 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
 820 {                                                             \
 821     out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
 822     out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
 823 }
 824 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
 825 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
 826
 827 /* Description : Interleave even halfword elements from vectors
 828    Arguments   : Inputs  - in0, in1, in2, in3
 829                  Outputs - out0, out1
 830                  Return Type - as per RTYPE
 831    Details     : Even halfword elements of 'in0' and even halfword
 832                  elements of 'in1' are interleaved and copied to 'out0'
 833                  Even halfword elements of 'in2' and even halfword
 834                  elements of 'in3' are interleaved and copied to 'out1'
 835 */
 836 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 837 {                                                            \
 838     out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
 839     out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
 840 }
 841 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
 842
 843 /* Description : Interleave even word elements from vectors
 844    Arguments   : Inputs  - in0, in1, in2, in3
 845                  Outputs - out0, out1
 846                  Return Type - as per RTYPE
 847    Details     : Even word elements of 'in0' and even word
 848                  elements of 'in1' are interleaved and copied to 'out0'
 849                  Even word elements of 'in2' and even word
 850                  elements of 'in3' are interleaved and copied to 'out1'
 851 */
 852 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 853 {                                                            \
 854     out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
 855     out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
 856 }
 857 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
 858
 859 /* Description : Interleave even double word elements from vectors
 860    Arguments   : Inputs  - in0, in1, in2, in3
 861                  Outputs - out0, out1
 862                  Return Type - as per RTYPE
 863    Details     : Even double word elements of 'in0' and even double word
 864                  elements of 'in1' are interleaved and copied to 'out0'
 865                  Even double word elements of 'in2' and even double word
 866                  elements of 'in3' are interleaved and copied to 'out1'
 867 */
 868 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 869 {                                                            \
 870     out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
 871     out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
 872 }
 873 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
 874
 875 /* Description : Interleave left half of byte elements from vectors
 876    Arguments   : Inputs  - in0, in1, in2, in3
 877                  Outputs - out0, out1
 878                  Return Type - as per RTYPE
 879    Details     : Left half of byte elements of in0 and left half of byte
 880                  elements of in1 are interleaved and copied to out0.
 881                  Left half of byte elements of in2 and left half of byte
 882                  elements of in3 are interleaved and copied to out1.
 883 */
 884 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 885 {                                                           \
 886     out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
 887     out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
 888 }
 889 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
 890 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
 891
 892 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
 893                 out0, out1, out2, out3)                         \
 894 {                                                               \
 895     ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
 896     ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
 897 }
 898 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
 899 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
 900 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
 901
 902 /* Description : Interleave left half of halfword elements from vectors
 903    Arguments   : Inputs  - in0, in1, in2, in3
 904                  Outputs - out0, out1
 905                  Return Type - as per RTYPE
 906    Details     : Left half of halfword elements of in0 and left half of halfword
 907                  elements of in1 are interleaved and copied to out0.
 908                  Left half of halfword elements of in2 and left half of halfword
 909                  elements of in3 are interleaved and copied to out1.
 910 */
 911 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 912 {                                                           \
 913     out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
 914     out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
 915 }
 916 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
 917
 918 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
 919                 out0, out1, out2, out3)                         \
 920 {                                                               \
 921     ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
 922     ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
 923 }
 924 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
 925
 926 /* Description : Interleave left half of word elements from vectors
 927    Arguments   : Inputs  - in0, in1, in2, in3
 928                  Outputs - out0, out1
 929                  Return Type - as per RTYPE
 930    Details     : Left half of word elements of in0 and left half of word
 931                  elements of in1 are interleaved and copied to out0.
 932                  Left half of word elements of in2 and left half of word
 933                  elements of in3 are interleaved and copied to out1.
 934 */
 935 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 936 {                                                           \
 937     out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
 938     out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
 939 }
 940 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
 941
 942 /* Description : Interleave right half of byte elements from vectors
 943    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 944                  Outputs - out0, out1, out2, out3
 945                  Return Type - as per RTYPE
 946    Details     : Right half of byte elements of in0 and right half of byte
 947                  elements of in1 are interleaved and copied to out0.
 948                  Right half of byte elements of in2 and right half of byte
 949                  elements of in3 are interleaved and copied to out1.
 950                  Similar for other pairs
 951 */
 952 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 953 {                                                           \
 954     out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
 955     out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
 956 }
 957 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
 958 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
 959 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
 960 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
 961
 962 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
 963                 out0, out1, out2, out3)                         \
 964 {                                                               \
 965     ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
 966     ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
 967 }
 968 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
 969 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
 970 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
 971
 972 /* Description : Interleave right half of halfword elements from vectors
 973    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 974                  Outputs - out0, out1, out2, out3
 975                  Return Type - signed halfword
 976    Details     : Right half of halfword elements of in0 and right half of
 977                  halfword elements of in1 are interleaved and copied to out0.
 978                  Right half of halfword elements of in2 and right half of
 979                  halfword elements of in3 are interleaved and copied to out1.
 980                  Similar for other pairs
 981 */
 982 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 983 {                                                           \
 984     out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
 985     out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
 986 }
 987 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
 988
 989 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
 990 {                                                                       \
 991     ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
 992     out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
 993 }
 994 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
 995
 996 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
 997                 out0, out1, out2, out3)                         \
 998 {                                                               \
 999     ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
1000     ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
1001 }
1002 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1003
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (in4 .. in7 for the 4-way form)
                 Outputs - out0, out1 (out2, out3 for the 4-way form)
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w applied to (in0, in1) and
                 'out1' receives it applied to (in2, in3).
                 Similar for other pairs.
                 Input arguments are parenthesized before the cast so that
                 an expression argument is cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1019
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : out0 <- right double word of in0 interleaved with the right
                         double word of in1
                 out1 <- same for the pair (in2, in3)
                 out2 <- same for the pair (in4, in5)   (ILVR_D3, ILVR_D4)
                 out3 <- same for the pair (in6, in7)   (ILVR_D4 only)
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)        \
{                                                             \
    out0 = (RTYPE) __msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5,          \
                out0, out1, out2)                             \
{                                                             \
    out0 = (RTYPE) __msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
    out2 = (RTYPE) __msa_ilvr_d((v2i64)(in4), (v2i64)(in5));  \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,          \
                out0, out1, out2, out3)                                 \
{                                                                       \
    ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2);     \
    out3 = (RTYPE) __msa_ilvr_d((v2i64)(in6), (v2i64)(in7));            \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1051
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 ILVRL_H2 and ILVRL_W2 do the same on halfword and word
                 elements respectively.
                 'in0' and 'in1' are each expanded twice; avoid arguments
                 with side effects.
                 Input arguments are parenthesized before the cast so that
                 an expression argument is cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1084
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written to output vector 'in0'.
                 Similar for the other vectors; MAXI_SH2 handles in0, in1 only.
                 The vectors are updated in place, so they must be lvalues.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                   \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1106
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range.
                 Results are written back in place to the original vectors,
                 so they must be lvalues. SAT_UH2 handles in0, in1 only.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1130
/* Description : Saturate the signed halfword element values to the signed
                 range representable in (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value range of (sat_val+1) bits.
                 Results are written back in place to the original vectors,
                 so they must be lvalues. SAT_SH2 / SAT_SH3 handle the first
                 two / three vectors only.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val));  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)              \
{                                                           \
    SAT_SH2(RTYPE, in0, in1, sat_val);                      \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val));  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1161
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (idx2, idx3 for the 4-way form)
                 Outputs - out0, out1 (out2, out3 for the 4-way form)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for halfword operation is 0-7
                 'in' is expanded once per output; avoid arguments with
                 side effects.
                 'in' and the indices are parenthesized so that expression
                 arguments are used as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)      \
{                                                         \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), (idx0));  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), (idx1));  \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1186
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1 (out0 .. out3 for SPLATI_W4)
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                  all elements in 'out1' vector
                  Valid index range for word operation is 0-3, so 'stidx'
                  itself must be in 0-2
                 SPLATI_W4 splats word elements 0, 1, 2, 3 of 'in' into
                  out0 .. out3 respectively
                 'stidx' is parenthesized before adding 1 so that an
                 expression argument (e.g. 'x & 1') is incremented as a whole.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                   \
{                                                                 \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));         \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));   \
}
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1211
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 Every input is parenthesized before the cast to v16i8.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* Three-pair form: (in0, in1) -> out0, (in2, in3) -> out1, (in4, in5) -> out2 */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));          \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* Four-pair form: two PCKEV_B2 invocations, (in6, in7) -> out3 being the last */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1249
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 Every input is parenthesized before the cast to v8i16.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-pair form: PCKEV_H2 on (in0..in3) -> (out0, out1), then on
   (in4..in7) -> (out2, out3) */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1277
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
                 Every argument is both read and assigned, so each must be a
                 modifiable lvalue
*/
#define XORI_B2_128(RTYPE, in0, in1)                 \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
}
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

/* Three vectors: XORI_B2_128 on (in0, in1) plus the same operation on in2 */
#define XORI_B3_128(RTYPE, in0, in1, in2)            \
{                                                    \
    XORI_B2_128(RTYPE, in0, in1);                    \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* Four vectors: two XORI_B2_128 invocations */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

/* Five vectors: XORI_B3_128 on in0..in2, XORI_B2_128 on in3..in4 */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

/* Seven vectors: XORI_B4_128 on in0..in3, XORI_B3_128 on in4..in6 */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

/* Eight vectors: two XORI_B4_128 invocations */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1333
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 and written to 'out0'; 'in2' + 'in3' is written to 'out1'
                 Every input is parenthesized before the cast to v8i16
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-pair form: ADDS_SH2 on (in0..in3) -> (out0, out1), then on
   (in4..in7) -> (out2, out3) */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1358
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 The C '<<' operator is applied directly to the operands;
                 'shift' is parenthesized so that an expression using a
                 lower-precedence operator (e.g. 's & 3') shifts by its
                 full value
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = (in0) << (shift);                 \
    in1 = (in1) << (shift);                 \
    in2 = (in2) << (shift);                 \
    in3 = (in3) << (shift);                 \
}
1374
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
                 The C '>>' operator is applied directly to the operands, so
                 the shift is arithmetic only for signed element types
                 (presumably the intended use - confirm against callers)
                 'shift' is parenthesized so that an expression using a
                 lower-precedence operator shifts by its full value
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
1392
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE (v8u16 for SRL_H4_UH)
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 'shift' is expanded once per vector (four times in total)
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1411
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* Three vectors: SRAR_H2 on (in0, in1) plus the same operation on in2 */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

/* Four vectors: two SRAR_H2 invocations sharing the same 'shift' */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of vector 'in0' is shifted right arithmetic
                 by value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
                 'shift' is forwarded unchanged as the second argument of
                 __msa_srari_w
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                \
{                                                       \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* Four vectors: two SRARI_W2 invocations sharing the same 'shift' */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1469
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 The C '*' operator is applied directly; every input is
                 parenthesized so that expressions such as 'a + b' are
                 multiplied as a whole
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
/* Four-pair form: MUL2 on (in0..in3) -> (out0, out1), then on
   (in4..in7) -> (out2, out3) */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
1487
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors holding the
                                         zero extended values)
                 Return Type - signed halfword
   Details     : 'in' is byte-interleaved with an all-zero vector through
                 ILVRL_B2_SH
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 The block declares a local named 'zero_m'; callers should not
                 pass an expression that refers to a variable of that name
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}
1501
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The inputs are combined pairwise with ILVR_D2_SB, the two
                 results are byte-interleaved with ILVRL_B2_SB, and 'out0' is
                 the right byte-interleave of those halves.
                 'out1', 'out2' and 'out3' are each derived from the previous
                 output via __msa_sldi_b with a zero vector and a byte count
                 of 4
                 The block declares locals 'zero_m' and 's0_m'..'s3_m'
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
1521
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block, one row per vector)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are paired as (in0, in4), (in1, in5), (in2, in6) and
                 (in3, in7) with ILVEV_W2_SB, then interleaved at byte,
                 halfword and word granularity.
                 ILVRL_W2 produces 'out0' and 'out2'; 'out1' and 'out3' are
                 then built from them with __msa_ilvl_d
                 The block declares locals 'tmp0_m'..'tmp3_m'
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
1546
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Row 'inN' is first paired with row 'inN+8' by ILVEV_D2_UB,
                 then even/odd interleaves are applied at byte, halfword and
                 word granularity.
                 The output vectors are used as scratch storage while the
                 result is being built, so every output is written more than
                 once.
                 NOTE(review): outputs presumably must not alias inputs -
                 confirm against ILVEV_D2_UB and the callers
                 The block declares locals 'tmp0_m'..'tmp7_m'
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
1595
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - value of the expression (local 'out_m')
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulted vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
                 'in1' is passed as the first operand of __msa_pckev_b and
                 'in0' as the second
                 Implemented as a statement expression, so it can be used
                 wherever a v16u8 value is expected
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);             \
    out_m;                                                        \
} )
1611
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs (in1, in0) into one temporary and
                 (in3, in2) into a second one.
                 Word elements 0 and 2 of each temporary are copied out with
                 __msa_copy_u_w, and the four resulting 32-bit values are
                 handed to SW4 together with 'pdst' and 'stride'
                 The block declares locals 'out0_m'..'out3_m', 'tmp0_m' and
                 'tmp1_m'
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
1631 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */