git.sesse.net Git - ffmpeg/blob - libavutil/mips/generic_macros_msa.h

   1 /*
   2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
  23
  24 #include <stdint.h>
  25 #include <msa.h>
  26
  27 #define ALIGNMENT           16
  28 #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
  29
  30 #define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
  31 #define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
  32 #define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
  33 #define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
  34 #define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
  35 #define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
  36 #define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
  37
  38 #define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
  39 #define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
  40 #define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
  41 #define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
  42 #define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
  43 #define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
  44 #define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
  45
  46 #if (__mips_isa_rev >= 6)
  47     #define LH(psrc)                              \
  48     ( {                                           \
  49         uint16_t val_lh_m = *(uint16_t *)(psrc);  \
  50         val_lh_m;                                 \
  51     } )
  52
  53     #define LW(psrc)                              \
  54     ( {                                           \
  55         uint32_t val_lw_m = *(uint32_t *)(psrc);  \
  56         val_lw_m;                                 \
  57     } )
  58
  59     #if (__mips == 64)
  60         #define LD(psrc)                               \
  61         ( {                                            \
  62             uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
  63             val_ld_m;                                  \
  64         } )
  65     #else  // !(__mips == 64)
  66         #define LD(psrc)                                                    \
  67         ( {                                                                 \
  68             uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
  69             uint32_t val0_ld_m, val1_ld_m;                                  \
  70             uint64_t val_ld_m = 0;                                          \
  71                                                                             \
  72             val0_ld_m = LW(psrc_ld_m);                                      \
  73             val1_ld_m = LW(psrc_ld_m + 4);                                  \
  74                                                                             \
  75             val_ld_m = (uint64_t) (val1_ld_m);                              \
  76             val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
  77             val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
  78                                                                             \
  79             val_ld_m;                                                       \
  80         } )
  81     #endif  // (__mips == 64)
  82
  83     #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
  84     #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
  85     #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);
  86
  87 #else  // !(__mips_isa_rev >= 6)
    /* LH: load a 16-bit value from 'psrc'.
       'psrc' is taken as a byte pointer and the byte it points at is passed
       as the memory operand of the assembler's 'ulh' instruction; the
       register result is the value of the whole expression.
       NOTE(review): the "m" operand names a single uint8_t while 'ulh'
       presumably reads two bytes - confirm the compiler cannot move stores
       to the second byte across this asm. */
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    /* LW: load a 32-bit value from 'psrc'.
       Same pattern as LH, using the assembler's 'ulw' instruction.
       NOTE(review): the "m" operand again covers one byte only although
       'ulw' presumably reads four - confirm this is safe. */
    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t"  \
                                                     \
            : [val_lw_m] "=r" (val_lw_m)             \
            : [psrc_lw_m] "m" (*psrc_lw_m)           \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )
 117
 118     #if (__mips == 64)
 119         #define LD(psrc)                                 \
 120         ( {                                              \
 121             uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
 122             uint64_t val_ld_m = 0;                       \
 123                                                          \
 124             __asm__ volatile (                           \
 125                 "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t"  \
 126                                                          \
 127                 : [val_ld_m] "=r" (val_ld_m)             \
 128                 : [psrc_ld_m] "m" (*psrc_ld_m)           \
 129             );                                           \
 130                                                          \
 131             val_ld_m;                                    \
 132         } )
 133     #else  // !(__mips == 64)
 134         #define LD(psrc)                                                    \
 135         ( {                                                                 \
 136             uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
 137             uint32_t val0_ld_m, val1_ld_m;                                  \
 138             uint64_t val_ld_m = 0;                                          \
 139                                                                             \
 140             val0_ld_m = LW(psrc_ld_m);                                      \
 141             val1_ld_m = LW(psrc_ld_m + 4);                                  \
 142                                                                             \
 143             val_ld_m = (uint64_t) (val1_ld_m);                              \
 144             val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
 145             val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
 146                                                                             \
 147             val_ld_m;                                                       \
 148         } )
 149     #endif  // (__mips == 64)
 150
    /* SH: store the 16-bit value 'val' at 'pdst'.
       'val' is first copied into a uint16_t local, then written with the
       assembler's 'ush' instruction; the byte at 'pdst' is the memory
       output operand.  The expansion is a brace block, so it is a complete
       statement even without a trailing ';'.
       NOTE(review): the "=m" operand names a single uint8_t while 'ush'
       presumably writes two bytes - confirm the compiler cannot reorder
       accesses to the second byte around this asm. */
    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    /* SW: store the 32-bit value 'val' at 'pdst'.
       Same pattern as SH, using the assembler's 'usw' instruction.
       NOTE(review): the "=m" operand again covers one byte only although
       'usw' presumably writes four - confirm this is safe. */
    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }
 176
 177     #define SD(val, pdst)                                             \
 178     {                                                                 \
 179         uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
 180         uint32_t val0_sd_m, val1_sd_m;                                \
 181                                                                       \
 182         val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
 183         val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
 184                                                                       \
 185         SW(val0_sd_m, pdst_sd_m);                                     \
 186         SW(val1_sd_m, pdst_sd_m + 4);                                 \
 187     }
 188 #endif // (__mips_isa_rev >= 6)
 189
 190 /* Description : Load 4 words with stride
 191    Arguments   : Inputs  - psrc    (source pointer to load from)
 192                          - stride
 193                  Outputs - out0, out1, out2, out3
 194    Details     : Loads word in 'out0' from (psrc)
 195                  Loads word in 'out1' from (psrc + stride)
 196                  Loads word in 'out2' from (psrc + 2 * stride)
 197                  Loads word in 'out3' from (psrc + 3 * stride)
 198 */
 199 #define LW4(psrc, stride, out0, out1, out2, out3)  \
 200 {                                                  \
 201     out0 = LW((psrc));                             \
 202     out1 = LW((psrc) + stride);                    \
 203     out2 = LW((psrc) + 2 * stride);                \
 204     out3 = LW((psrc) + 3 * stride);                \
 205 }
 206
 207 #define LW2(psrc, stride, out0, out1)  \
 208 {                                      \
 209     out0 = LW((psrc));                 \
 210     out1 = LW((psrc) + stride);        \
 211 }
 212
 213 /* Description : Load double words with stride
 214    Arguments   : Inputs  - psrc    (source pointer to load from)
 215                          - stride
 216                  Outputs - out0, out1
 217    Details     : Loads double word in 'out0' from (psrc)
 218                  Loads double word in 'out1' from (psrc + stride)
 219 */
 220 #define LD2(psrc, stride, out0, out1)  \
 221 {                                      \
 222     out0 = LD((psrc));                 \
 223     out1 = LD((psrc) + stride);        \
 224 }
 225 #define LD4(psrc, stride, out0, out1, out2, out3)  \
 226 {                                                  \
 227     LD2((psrc), stride, out0, out1);               \
 228     LD2((psrc) + 2 * stride, stride, out2, out3);  \
 229 }
 230
 231 /* Description : Store 4 words with stride
 232    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 233    Details     : Stores word from 'in0' to (pdst)
 234                  Stores word from 'in1' to (pdst + stride)
 235                  Stores word from 'in2' to (pdst + 2 * stride)
 236                  Stores word from 'in3' to (pdst + 3 * stride)
 237 */
 238 #define SW4(in0, in1, in2, in3, pdst, stride)  \
 239 {                                              \
 240     SW(in0, (pdst))                            \
 241     SW(in1, (pdst) + stride);                  \
 242     SW(in2, (pdst) + 2 * stride);              \
 243     SW(in3, (pdst) + 3 * stride);              \
 244 }
 245
 246 /* Description : Store 4 double words with stride
 247    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
 248    Details     : Stores double word from 'in0' to (pdst)
 249                  Stores double word from 'in1' to (pdst + stride)
 250                  Stores double word from 'in2' to (pdst + 2 * stride)
 251                  Stores double word from 'in3' to (pdst + 3 * stride)
 252 */
 253 #define SD4(in0, in1, in2, in3, pdst, stride)  \
 254 {                                              \
 255     SD(in0, (pdst))                            \
 256     SD(in1, (pdst) + stride);                  \
 257     SD(in2, (pdst) + 2 * stride);              \
 258     SD(in3, (pdst) + 3 * stride);              \
 259 }
 260
 261 /* Description : Load vector elements with stride
 262    Arguments   : Inputs  - psrc    (source pointer to load from)
 263                          - stride
 264                  Outputs - out0, out1
 265                  Return Type - as per RTYPE
 266    Details     : Loads elements in 'out0' from (psrc)
 267                  Loads elements in 'out1' from (psrc + stride)
 268 */
 269 #define LD_V2(RTYPE, psrc, stride, out0, out1)  \
 270 {                                               \
 271     out0 = LD_V(RTYPE, (psrc));                 \
 272     out1 = LD_V(RTYPE, (psrc) + stride);        \
 273 }
 274 #define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
 275 #define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
 276 #define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
 277 #define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
 278 #define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
 279
 280 #define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
 281 {                                                     \
 282     LD_V2(RTYPE, (psrc), stride, out0, out1);         \
 283     out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
 284 }
 285 #define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
 286 #define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)
 287
 288 #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 289 {                                                            \
 290     LD_V2(RTYPE, (psrc), stride, out0, out1);                \
 291     LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 292 }
 293 #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
 294 #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
 295 #define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
 296 #define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
 297
 298 #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 299 {                                                                 \
 300     LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
 301     out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
 302 }
 303 #define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
 304 #define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
 305
 306 #define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 307 {                                                                       \
 308     LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 309     LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
 310 }
 311 #define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
 312 #define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
 313 #define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
 314 #define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)
 315
 316 #define LD_V7(RTYPE, psrc, stride,                               \
 317               out0, out1, out2, out3, out4, out5, out6)          \
 318 {                                                                \
 319     LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
 320     LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
 321 }
 322 #define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
 323 #define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
 324
 325 #define LD_V8(RTYPE, psrc, stride,                                      \
 326               out0, out1, out2, out3, out4, out5, out6, out7)           \
 327 {                                                                       \
 328     LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
 329     LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
 330 }
 331 #define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
 332 #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
 333 #define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
 334 #define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
 335
 336 #define LD_V16(RTYPE, psrc, stride,                                   \
 337                out0, out1, out2, out3, out4, out5, out6, out7,        \
 338                out8, out9, out10, out11, out12, out13, out14, out15)  \
 339 {                                                                     \
 340     LD_V8(RTYPE, (psrc), stride,                                      \
 341           out0, out1, out2, out3, out4, out5, out6, out7);            \
 342     LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
 343           out8, out9, out10, out11, out12, out13, out14, out15);      \
 344 }
 345 #define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
 346
 347 /* Description : Load as 4x4 block of signed halfword elements from 1D source
 348                  data into 4 vectors (Each vector with 4 signed halfwords)
 349    Arguments   : Inputs  - psrc
 350                  Outputs - out0, out1, out2, out3
 351 */
 352 #define LD4x4_SH(psrc, out0, out1, out2, out3)                \
 353 {                                                             \
 354     out0 = LD_SH(psrc);                                       \
 355     out2 = LD_SH(psrc + 8);                                   \
 356     out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
 357     out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
 358 }
 359
 360 /* Description : Store vectors with stride
 361    Arguments   : Inputs  - in0, in1, stride
 362                  Outputs - pdst    (destination pointer to store to)
 363    Details     : Stores elements from 'in0' to (pdst)
 364                  Stores elements from 'in1' to (pdst + stride)
 365 */
 366 #define ST_V2(RTYPE, in0, in1, pdst, stride)  \
 367 {                                             \
 368     ST_V(RTYPE, in0, (pdst));                 \
 369     ST_V(RTYPE, in1, (pdst) + stride);        \
 370 }
 371 #define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
 372 #define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
 373 #define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
 374 #define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
 375 #define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
 376
 377 #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 378 {                                                         \
 379     ST_V2(RTYPE, in0, in1, (pdst), stride);               \
 380     ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 381 }
 382 #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
 383 #define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
 384 #define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
 385 #define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)
 386
 387 #define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
 388 {                                                                 \
 389     ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
 390     ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
 391 }
 392 #define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)
 393
 394 #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 395 {                                                                           \
 396     ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
 397     ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 398 }
 399 #define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
 400 #define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
 401 #define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
 402
 403 /* Description : Store as 2x4 byte block to destination memory from input vector
 404    Arguments   : Inputs  - in, stidx, pdst, stride
 405                  Return Type - unsigned byte
 406    Details     : Index stidx halfword element from 'in' vector is copied and
 407                  stored on first line
 408                  Index stidx+1 halfword element from 'in' vector is copied and
 409                  stored on second line
 410                  Index stidx+2 halfword element from 'in' vector is copied and
 411                  stored on third line
 412                  Index stidx+3 halfword element from 'in' vector is copied and
 413                  stored on fourth line
 414 */
 415 #define ST2x4_UB(in, stidx, pdst, stride)              \
 416 {                                                      \
 417     uint16_t out0_m, out1_m, out2_m, out3_m;           \
 418     uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
 419                                                        \
 420     out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
 421     out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
 422     out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
 423     out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
 424                                                        \
 425     SH(out0_m, pblk_2x4_m);                            \
 426     SH(out1_m, pblk_2x4_m + stride);                   \
 427     SH(out2_m, pblk_2x4_m + 2 * stride);               \
 428     SH(out3_m, pblk_2x4_m + 3 * stride);               \
 429 }
 430
 431 /* Description : Store as 4x2 byte block to destination memory from input vector
 432    Arguments   : Inputs  - in, pdst, stride
 433                  Return Type - unsigned byte
 434    Details     : Index 0 word element from input vector is copied and stored
 435                  on first line
 436                  Index 1 word element from input vector is copied and stored
 437                  on second line
 438 */
 439 #define ST4x2_UB(in, pdst, stride)             \
 440 {                                              \
 441     uint32_t out0_m, out1_m;                   \
 442     uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
 443                                                \
 444     out0_m = __msa_copy_u_w((v4i32) in, 0);    \
 445     out1_m = __msa_copy_u_w((v4i32) in, 1);    \
 446                                                \
 447     SW(out0_m, pblk_4x2_m);                    \
 448     SW(out1_m, pblk_4x2_m + stride);           \
 449 }
 450
 451 /* Description : Store as 4x4 byte block to destination memory from input vector
 452    Arguments   : Inputs  - in0, in1, pdst, stride
 453                  Return Type - unsigned byte
 454    Details     : Idx0 word element from input vector 'in0' is copied and stored
 455                  on first line
 456                  Idx1 word element from input vector 'in0' is copied and stored
 457                  on second line
 458                  Idx2 word element from input vector 'in1' is copied and stored
 459                  on third line
 460                  Idx3 word element from input vector 'in1' is copied and stored
 461                  on fourth line
 462 */
 463 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
 464 {                                                                 \
 465     uint32_t out0_m, out1_m, out2_m, out3_m;                      \
 466     uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
 467                                                                   \
 468     out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
 469     out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
 470     out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
 471     out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
 472                                                                   \
 473     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 474 }
 475 #define ST4x8_UB(in0, in1, pdst, stride)                            \
 476 {                                                                   \
 477     uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
 478                                                                     \
 479     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
 480     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
 481 }
 482
 483 /* Description : Store as 6x4 byte block to destination memory from input
 484                  vectors
 485    Arguments   : Inputs  - in0, in1, pdst, stride
 486                  Return Type - unsigned byte
 487    Details     : Index 0 word element from input vector 'in0' is copied and
 488                  stored on first line followed by index 2 halfword element
 489                  Index 2 word element from input vector 'in0' is copied and
 490                  stored on second line followed by index 2 halfword element
 491                  Index 0 word element from input vector 'in1' is copied and
 492                  stored on third line followed by index 2 halfword element
 493                  Index 2 word element from input vector 'in1' is copied and
 494                  stored on fourth line followed by index 2 halfword element
 495 */
 496 #define ST6x4_UB(in0, in1, pdst, stride)       \
 497 {                                              \
 498     uint32_t out0_m, out1_m, out2_m, out3_m;   \
 499     uint16_t out4_m, out5_m, out6_m, out7_m;   \
 500     uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
 501                                                \
 502     out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
 503     out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
 504     out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
 505     out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
 506                                                \
 507     out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
 508     out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
 509     out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
 510     out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
 511                                                \
 512     SW(out0_m, pblk_6x4_m);                    \
 513     SH(out4_m, (pblk_6x4_m + 4));              \
 514     pblk_6x4_m += stride;                      \
 515     SW(out1_m, pblk_6x4_m);                    \
 516     SH(out5_m, (pblk_6x4_m + 4));              \
 517     pblk_6x4_m += stride;                      \
 518     SW(out2_m, pblk_6x4_m);                    \
 519     SH(out6_m, (pblk_6x4_m + 4));              \
 520     pblk_6x4_m += stride;                      \
 521     SW(out3_m, pblk_6x4_m);                    \
 522     SH(out7_m, (pblk_6x4_m + 4));              \
 523 }
 524
 525 /* Description : Store as 8x1 byte block to destination memory from input vector
 526    Arguments   : Inputs  - in, pdst
 527    Details     : Index 0 double word element from input vector 'in' is copied
 528                  and stored to destination memory at (pdst)
 529 */
 530 #define ST8x1_UB(in, pdst)                   \
 531 {                                            \
 532     uint64_t out0_m;                         \
 533     out0_m = __msa_copy_u_d((v2i64) in, 0);  \
 534     SD(out0_m, pdst);                        \
 535 }
 536
 537 /* Description : Store as 8x2 byte block to destination memory from input vector
 538    Arguments   : Inputs  - in, pdst, stride
 539    Details     : Index 0 double word element from input vector 'in' is copied
 540                  and stored to destination memory at (pdst)
 541                  Index 1 double word element from input vector 'in' is copied
 542                  and stored to destination memory at (pdst + stride)
 543 */
 544 #define ST8x2_UB(in, pdst, stride)             \
 545 {                                              \
 546     uint64_t out0_m, out1_m;                   \
 547     uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
 548                                                \
 549     out0_m = __msa_copy_u_d((v2i64) in, 0);    \
 550     out1_m = __msa_copy_u_d((v2i64) in, 1);    \
 551                                                \
 552     SD(out0_m, pblk_8x2_m);                    \
 553     SD(out1_m, pblk_8x2_m + stride);           \
 554 }
 555
 556 /* Description : Store as 8x4 byte block to destination memory from input
 557                  vectors
 558    Arguments   : Inputs  - in0, in1, pdst, stride
 559    Details     : Index 0 double word element from input vector 'in0' is copied
 560                  and stored to destination memory at (pblk_8x4_m)
 561                  Index 1 double word element from input vector 'in0' is copied
 562                  and stored to destination memory at (pblk_8x4_m + stride)
 563                  Index 0 double word element from input vector 'in1' is copied
 564                  and stored to destination memory at (pblk_8x4_m + 2 * stride)
 565                  Index 1 double word element from input vector 'in1' is copied
 566                  and stored to destination memory at (pblk_8x4_m + 3 * stride)
 567 */
 568 #define ST8x4_UB(in0, in1, pdst, stride)                      \
 569 {                                                             \
 570     uint64_t out0_m, out1_m, out2_m, out3_m;                  \
 571     uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
 572                                                               \
 573     out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
 574     out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
 575     out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
 576     out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
 577                                                               \
 578     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 579 }
 580 #define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
 581 {                                                         \
 582     uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
 583                                                           \
 584     ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
 585     ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 586 }
 587 #define ST12x4_UB(in0, in1, in2, pdst, stride)                \
 588 {                                                             \
 589     uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
 590                                                               \
 591     /* left 8x4 */                                            \
 592     ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
 593     /* right 4x4 */                                           \
 594     ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
 595 }
 596
 597 /* Description : Store as 12x8 byte block to destination memory from
 598                  input vectors
 599    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 600    Details     : Index 0 double word element from input vector 'in0' is copied
 601                  and stored to destination memory at (pblk_12x8_m) followed by
 602                  index 2 word element from same input vector 'in0' at
 603                  (pblk_12x8_m + 8)
 604                  Similar to remaining lines
 605 */
 606 #define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 607 {                                                                        \
 608     uint64_t out0_m, out1_m, out2_m, out3_m;                             \
 609     uint64_t out4_m, out5_m, out6_m, out7_m;                             \
 610     uint32_t out8_m, out9_m, out10_m, out11_m;                           \
 611     uint32_t out12_m, out13_m, out14_m, out15_m;                         \
 612     uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
 613                                                                          \
 614     out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
 615     out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
 616     out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
 617     out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
 618     out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
 619     out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
 620     out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
 621     out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
 622                                                                          \
 623     out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
 624     out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
 625     out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
 626     out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
 627     out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
 628     out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
 629     out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
 630     out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
 631                                                                          \
 632     SD(out0_m, pblk_12x8_m);                                             \
 633     SW(out8_m, pblk_12x8_m + 8);                                         \
 634     pblk_12x8_m += stride;                                               \
 635     SD(out1_m, pblk_12x8_m);                                             \
 636     SW(out9_m, pblk_12x8_m + 8);                                         \
 637     pblk_12x8_m += stride;                                               \
 638     SD(out2_m, pblk_12x8_m);                                             \
 639     SW(out10_m, pblk_12x8_m + 8);                                        \
 640     pblk_12x8_m += stride;                                               \
 641     SD(out3_m, pblk_12x8_m);                                             \
 642     SW(out11_m, pblk_12x8_m + 8);                                        \
 643     pblk_12x8_m += stride;                                               \
 644     SD(out4_m, pblk_12x8_m);                                             \
 645     SW(out12_m, pblk_12x8_m + 8);                                        \
 646     pblk_12x8_m += stride;                                               \
 647     SD(out5_m, pblk_12x8_m);                                             \
 648     SW(out13_m, pblk_12x8_m + 8);                                        \
 649     pblk_12x8_m += stride;                                               \
 650     SD(out6_m, pblk_12x8_m);                                             \
 651     SW(out14_m, pblk_12x8_m + 8);                                        \
 652     pblk_12x8_m += stride;                                               \
 653     SD(out7_m, pblk_12x8_m);                                             \
 654     SW(out15_m, pblk_12x8_m + 8);                                        \
 655 }
 656
 657 /* Description : average with rounding (in0 + in1 + 1) / 2.
 658    Arguments   : Inputs  - in0, in1, in2, in3,
 659                  Outputs - out0, out1
 660                  Return Type - as per RTYPE
 661    Details     : Each byte element from 'in0' vector is added with each byte
 662                  element from 'in1' vector. The addition of the elements plus 1
 663                 (for rounding) is done unsigned with full precision,
 664                 i.e. the result has one extra bit. Unsigned division by 2
 665                 (or logical shift right by one bit) is performed before writing
 666                 the result to vector 'out0'
 667                 Similar for the pair of 'in2' and 'in3'
 668 */
 669 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
 670 {                                                             \
 671     out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
 672     out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
 673 }
 674 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 675
 676 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
 677                  out0, out1, out2, out3)                        \
 678 {                                                               \
 679     AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)             \
 680     AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)             \
 681 }
 682 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
 683
 684 /* Description : Immediate number of columns to slide with zero
 685    Arguments   : Inputs  - in0, in1, slide_val
 686                  Outputs - out0, out1
 687                  Return Type - as per RTYPE
 688    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
 689                  number of elements specified by 'slide_val'
 690 */
 691 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 692 {                                                                         \
 693     v16i8 zero_m = { 0 };                                                 \
 694     out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
 695     out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 696 }
 697 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 698 #define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
 699 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
 700
 701 #define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)     \
 702 {                                                                         \
 703     v16i8 zero_m = { 0 };                                                 \
 704     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
 705     out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
 706 }
 707 #define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
 708 #define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)
 709
 710 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
 711                   out0, out1, out2, out3, slide_val)    \
 712 {                                                       \
 713     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
 714     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
 715 }
 716 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
 717 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
 718 #define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
 719
 720 /* Description : Immediate number of columns to slide
 721    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
 722                  Outputs - out0, out1
 723                  Return Type - as per RTYPE
 724    Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
 725                  number of elements specified by 'slide_val'
 726 */
 727 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
 728 {                                                                          \
 729     out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
 730     out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
 731 }
 732 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
 733 #define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
 734 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
 735
 736 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
 737                 out0, out1, out2, slide_val)                               \
 738 {                                                                          \
 739     SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
 740     out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
 741 }
 742 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
 743 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
 744
 745 /* Description : Shuffle byte vector elements as per mask vector
 746    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 747                  Outputs - out0, out1
 748                  Return Type - as per RTYPE
 749    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 750                  per control vector mask0
 751                  Selective byte elements from in2 & in3 are copied to out1 as
 752                  per control vector mask1
 753 */
 754 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 755 {                                                                          \
 756     out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
 757     out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 758 }
 759 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 760 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 761 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 762 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 763
 764 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 765                 out0, out1, out2)                                          \
 766 {                                                                          \
 767     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 768     out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 769 }
 770 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
 771
 772 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
 773                 out0, out1, out2, out3)                            \
 774 {                                                                  \
 775     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
 776     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
 777 }
 778 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
 779 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
 780
 781 /* Description : Shuffle halfword vector elements as per mask vector
 782    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 783                  Outputs - out0, out1
 784                  Return Type - as per RTYPE
 785    Details     : Selective halfword elements from in0 & in1 are copied to out0
 786                  as per control vector mask0
 787                  Selective halfword elements from in2 & in3 are copied to out1
 788                  as per control vector mask1
 789 */
 790 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 791 {                                                                          \
 792     out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
 793     out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
 794 }
 795 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
 796
 797 #define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
 798                 out0, out1, out2)                                          \
 799 {                                                                          \
 800     VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
 801     out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
 802 }
 803 #define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
 804
 805 /* Description : Shuffle byte vector elements as per mask vector
 806    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 807                  Outputs - out0, out1
 808                  Return Type - as per RTYPE
 809    Details     : Selective byte elements from in0 & in1 are copied to out0 as
 810                  per control vector mask0
 811                  Selective byte elements from in2 & in3 are copied to out1 as
 812                  per control vector mask1
 813 */
 814 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 815 {                                                                         \
 816     out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
 817     out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 818 }
 819 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
 820
 821 /* Description : Dot product of byte vector elements
 822    Arguments   : Inputs  - mult0, mult1
 823                            cnst0, cnst1
 824                  Outputs - out0, out1
 825                  Return Type - as per RTYPE
 826    Details     : Unsigned byte elements from mult0 are multiplied with
 827                  unsigned byte elements from cnst0 producing a result
 828                  twice the size of input i.e. unsigned halfword.
 829                  Then this multiplication results of adjacent odd-even elements
 830                  are added together and stored to the out vector
 831                  (2 unsigned halfword results)
 832 */
 833 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 834 {                                                                 \
 835     out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
 836     out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
 837 }
 838 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
 839
 840 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
 841                  cnst0, cnst1, cnst2, cnst3,                  \
 842                  out0, out1, out2, out3)                      \
 843 {                                                             \
 844     DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
 845     DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
 846 }
 847 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
 848
 849 /* Description : Dot product of byte vector elements
 850    Arguments   : Inputs  - mult0, mult1
 851                            cnst0, cnst1
 852                  Outputs - out0, out1
 853                  Return Type - as per RTYPE
 854    Details     : Signed byte elements from mult0 are multiplied with
 855                  signed byte elements from cnst0 producing a result
 856                  twice the size of input i.e. signed halfword.
 857                  Then this multiplication results of adjacent odd-even elements
 858                  are added together and stored to the out vector
 859                  (2 signed halfword results)
 860 */
 861 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 862 {                                                                 \
 863     out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
 864     out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 865 }
 866 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 867
 868 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
 869                  out0, out1, out2)                                 \
 870 {                                                                  \
 871     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
 872     out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 873 }
 874 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
 875
 876 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 877                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 878 {                                                                     \
 879     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 880     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 881 }
 882 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
 883
 884 /* Description : Dot product of halfword vector elements
 885    Arguments   : Inputs  - mult0, mult1
 886                            cnst0, cnst1
 887                  Outputs - out0, out1
 888                  Return Type - as per RTYPE
 889    Details     : Signed halfword elements from mult0 are multiplied with
 890                  signed halfword elements from cnst0 producing a result
 891                  twice the size of input i.e. signed word.
 892                  Then this multiplication results of adjacent odd-even elements
 893                  are added together and stored to the out vector
 894                  (2 signed word results)
 895 */
 896 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 897 {                                                                 \
 898     out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
 899     out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
 900 }
 901 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
 902
 903 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
 904                  cnst0, cnst1, cnst2, cnst3,                  \
 905                  out0, out1, out2, out3)                      \
 906 {                                                             \
 907     DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
 908     DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
 909 }
 910 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
 911
 912 /* Description : Dot product & addition of byte vector elements
 913    Arguments   : Inputs  - mult0, mult1
 914                            cnst0, cnst1
 915                  Outputs - out0, out1
 916                  Return Type - as per RTYPE
 917    Details     : Signed byte elements from mult0 are multiplied with
 918                  signed byte elements from cnst0 producing a result
 919                  twice the size of input i.e. signed halfword.
 920                  Then this multiplication results of adjacent odd-even elements
 921                  are added to the out vector
 922                  (2 signed halfword results)
 923 */
 924 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 925 {                                                                  \
 926     out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
 927                                    (v16i8) mult0, (v16i8) cnst0);  \
 928     out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
 929                                    (v16i8) mult1, (v16i8) cnst1);  \
 930 }
 931 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
 932
 933 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
 934                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 935 {                                                                      \
 936     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 937     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 938 }
 939 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 940
 941 /* Description : Dot product & addition of byte vector elements
 942    Arguments   : Inputs  - mult0, mult1
 943                            cnst0, cnst1
 944                  Outputs - out0, out1
 945                  Return Type - as per RTYPE
 946    Details     : Unsigned byte elements from mult0 are multiplied with
 947                  unsigned byte elements from cnst0 producing a result
 948                  twice the size of input i.e. unsigned halfword.
 949                  Then this multiplication results of adjacent odd-even elements
 950                  are added to the out vector
 951                  (2 unsigned halfword results)
 952 */
 953 #define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 954 {                                                                  \
 955     out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
 956                                    (v16u8) mult0, (v16u8) cnst0);  \
 957     out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
 958                                    (v16u8) mult1, (v16u8) cnst1);  \
 959 }
 960 #define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
 961
 962 /* Description : Dot product & addition of halfword vector elements
 963    Arguments   : Inputs  - mult0, mult1
 964                            cnst0, cnst1
 965                  Outputs - out0, out1
 966                  Return Type - as per RTYPE
 967    Details     : Signed halfword elements from mult0 are multiplied with
 968                  signed halfword elements from cnst0 producing a result
 969                  twice the size of input i.e. signed word.
 970                  Then this multiplication results of adjacent odd-even elements
 971                  are added to the out vector
 972                  (2 signed word results)
 973 */
 974 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 975 {                                                                  \
 976     out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
 977                                    (v8i16) mult0, (v8i16) cnst0);  \
 978     out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
 979                                    (v8i16) mult1, (v8i16) cnst1);  \
 980 }
 981 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
 982
 983 #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
 984                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 985 {                                                                      \
 986     DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
 987     DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 988 }
 989 #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
 990
 991 /* Description : Minimum values between unsigned elements of
 992                  either vector are copied to the output vector
 993    Arguments   : Inputs  - in0, in1, min_vec
 994                  Outputs - in0, in1, (in place)
 995                  Return Type - as per RTYPE
 996    Details     : Minimum of unsigned halfword element values from 'in0' and
 997                  'min_value' are written to output vector 'in0'
 998 */
 999 #define MIN_UH2(RTYPE, in0, in1, min_vec)               \
1000 {                                                       \
1001     in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
1002     in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
1003 }
1004 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
1005
1006 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
1007 {                                                    \
1008     MIN_UH2(RTYPE, in0, in1, min_vec);               \
1009     MIN_UH2(RTYPE, in2, in3, min_vec);               \
1010 }
1011 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
1012
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in       (input vector)
                         - min      (min threshold)
                         - max      (max threshold)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : Implemented as a signed maximum against 'min' followed by a
                 signed minimum against 'max'. The macro is a statement
                 expression whose value is the clipped vector
*/
#define CLIP_SH(in, min, max)                             \
( {                                                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in));   \
    out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m);  \
    out_m;                                                \
} )
1029
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : Elements are first raised to at least 0 with an immediate
                 signed maximum, then limited to 255 with a signed minimum
                 against a vector loaded with 255. The macro is a statement
                 expression whose value is the clipped vector
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) (in), 0);              \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
/* In-place variants: each argument is replaced by its clipped value */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
1055
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255 using unsigned saturation for the upper bound
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : Elements are first raised to at least 0 with an immediate
                 signed maximum. The non-negative result is then passed
                 through __msa_sat_u_h with immediate 7, i.e. unsigned
                 saturation to 8 bits, which caps the values at 255.
                 The macro is a statement expression whose value is the
                 clipped vector
*/
#define CLIP_SH_0_255_MAX_SATU(in)                    \
( {                                                   \
    v8i16 out_m;                                      \
                                                      \
    out_m = __msa_maxi_s_h((v8i16) (in), 0);          \
    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7);  \
    out_m;                                            \
} )
/* In-place variants: each argument is replaced by its clipped value */
#define CLIP_SH2_0_255_MAX_SATU(in0, in1)  \
{                                          \
    in0 = CLIP_SH_0_255_MAX_SATU(in0);     \
    in1 = CLIP_SH_0_255_MAX_SATU(in1);     \
}
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)  \
{                                                    \
    CLIP_SH2_0_255_MAX_SATU(in0, in1);               \
    CLIP_SH2_0_255_MAX_SATU(in2, in3);               \
}
1074
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed word
   Details     : Elements are first raised to at least 0 with an immediate
                 signed maximum, then limited to 255 with a signed minimum
                 against a vector loaded with 255. The macro is a statement
                 expression whose value is the clipped vector
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) (in), 0);              \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
1090
/* Description : Clips all signed word elements of input vector
                 between 0 & 255 using unsigned saturation for the upper bound
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed word
   Details     : Elements are first raised to at least 0 with an immediate
                 signed maximum. The non-negative result is then passed
                 through __msa_sat_u_w with immediate 7, i.e. unsigned
                 saturation to 8 bits, which caps the values at 255.
                 The macro is a statement expression whose value is the
                 clipped vector
*/
#define CLIP_SW_0_255_MAX_SATU(in)                    \
( {                                                   \
    v4i32 out_m;                                      \
                                                      \
    out_m = __msa_maxi_s_w((v4i32) (in), 0);          \
    out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7);  \
    out_m;                                            \
} )
/* In-place variants: each argument is replaced by its clipped value */
#define CLIP_SW2_0_255_MAX_SATU(in0, in1)  \
{                                          \
    in0 = CLIP_SW_0_255_MAX_SATU(in0);     \
    in1 = CLIP_SW_0_255_MAX_SATU(in1);     \
}
#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3)  \
{                                                    \
    CLIP_SW2_0_255_MAX_SATU(in0, in1);               \
    CLIP_SW2_0_255_MAX_SATU(in2, in3);               \
}
1109
/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : A horizontal add of 'in' with itself gives two doubleword
                 pair sums. Doubleword element 1 is replicated and added to
                 the pair sums, so doubleword element 0 holds the total.
                 Word element 0 of that result is copied out as the return
                 value; the macro is a statement expression
*/
#define HADD_SW_S32(in)                                   \
( {                                                       \
    v2i64 res0_m, res1_m;                                 \
    int32_t sum_m;                                        \
                                                          \
    res0_m = __msa_hadd_s_d((v4i32) (in), (v4i32) (in));  \
    res1_m = __msa_splati_d(res0_m, 1);                   \
    res0_m += res1_m;                                     \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);            \
    sum_m;                                                \
} )
1128
/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : A horizontal add of 'in' with itself gives four word pair
                 sums; a second horizontal add reduces them to two
                 doubleword sums. Doubleword element 1 is replicated and
                 added, so doubleword element 0 holds the total.
                 Word element 0 of that result is copied out as the return
                 value; the macro is a statement expression
*/
#define HADD_UH_U32(in)                                   \
( {                                                       \
    v4u32 res_m;                                          \
    v2u64 res0_m, res1_m;                                 \
    uint32_t sum_m;                                       \
                                                          \
    res_m = __msa_hadd_u_w((v8u16) (in), (v8u16) (in));   \
    res0_m = __msa_hadd_u_d(res_m, res_m);                \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);   \
    res0_m += res1_m;                                     \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);            \
    sum_m;                                                \
} )
1149
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element of 'in0' is added to the
                 neighbouring signed even byte element of 'in0' (pairwise)
                 and the halfword result is stored in 'out0'.
                 'out1' is produced the same way from 'in1'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) (in0), (v16i8) (in0));  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) (in1), (v16i8) (in1));  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* Four-vector variant: inN -> outN for N = 0..3 */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1172
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword results are stored in 'out0'
                 'in1' is processed the same way into 'out1'
                 Each input argument is expanded twice
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) (in0), (v16u8) (in0));  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) (in1), (v16u8) (in1));  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
1187
/* Description : Horizontal addition of unsigned byte elements of 3 vectors
   Arguments   : Inputs  - in0, in1, in2
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are handled by HADD_UB2, 'in2' is reduced
                 the same way into 'out2'
*/
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)          \
{                                                                 \
    HADD_UB2(RTYPE, (in0), (in1), out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) (in2), (v16u8) (in2));  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
1194
/* Description : Horizontal addition of unsigned byte elements of 4 vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies HADD_UB2 to the pairs (in0, in1) and (in2, in3)
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in HADD_UB2
*/
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, (in0), (in1), out0, out1);                       \
    HADD_UB2(RTYPE, (in2), (in3), out2, out3);                       \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1203
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted
                 from the unsigned odd byte element of the same pair in
                 'in0' and the halfword result is stored in 'out0'
                 'in1' is processed the same way into 'out1'
                 Each input argument is expanded twice
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) (in0), (v16u8) (in0));  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) (in1), (v16u8) (in1));  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1219
/* Description : Horizontal subtraction of unsigned byte elements of
                 4 vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies HSUB_UB2 to the pairs (in0, in1) and (in2, in3)
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in HSUB_UB2
*/
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, (in0), (in1), out0, out1);                       \
    HSUB_UB2(RTYPE, (in2), (in3), out2, out3);                       \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1227
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and kept in 'diff0_m', likewise 'in1'
                 with 'ref1' in 'diff1_m'. For each of them the 16 unsigned
                 absolute diff values are added together in even-odd pairs
                 to give 8 halfword sums, and the two sets of halfword sums
                 are added element by element to form the returned vector.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m;                                                \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) (in0), (v16u8) (ref0));    \
    diff1_m = __msa_asub_u_b((v16u8) (in1), (v16u8) (ref1));    \
                                                                \
    sad_m  = __msa_hadd_u_h(diff0_m, diff0_m);                  \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m);                  \
                                                                \
    sad_m;                                                      \
} )
1250
/* Description : Insert 2 word values into word elements 0 and 1 of the
                 destination vector
   Arguments   : Inputs  - in0, in1 (word values)
                 Outputs - out      (vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is inserted at word index 0 and 'in1' at word index 1
                 of 'out'
                 'out' is also read as the source vector of each insert, so
                 it must hold a valid value before the macro is used
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1264
/* Description : Insert 4 word values into word elements 0..3 of the
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (word values)
                 Outputs - out                (vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' go to word indices 0 and 1 through INSERT_W2,
                 'in2' and 'in3' are then inserted at word indices 2 and 3
*/
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    INSERT_W2(RTYPE, in0, in1, out);                    \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1276
/* Description : Insert 2 double word values into double word elements
                 0 and 1 of the destination vector
   Arguments   : Inputs  - in0, in1 (double word values)
                 Outputs - out      (vector, updated in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is inserted at double word index 0 and 'in1' at
                 double word index 1 of 'out'
                 'out' is also read as the source vector of each insert, so
                 it must hold a valid value before the macro is used
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1292
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note the swapped operand order: 'in1' / 'in3' are passed
                 as the first intrinsic operand
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) (in1), (v16i8) (in0));  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) (in3), (v16i8) (in2));  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1311
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note the swapped operand order: 'in1' / 'in3' are passed
                 as the first intrinsic operand
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) (in1), (v8i16) (in0));  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) (in3), (v8i16) (in2));  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1329
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note the swapped operand order: 'in1' / 'in3' are passed
                 as the first intrinsic operand
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) (in1), (v4i32) (in0));  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) (in3), (v4i32) (in2));  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1348
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note the swapped operand order: 'in1' / 'in3' are passed
                 as the first intrinsic operand
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) (in1), (v2i64) (in0));  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) (in3), (v2i64) (in2));  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1366
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1385
/* Description : Interleave left half of byte elements from 4 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVL_B2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVL_B2.
*/
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVL_B2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1396
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of
                 halfword elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1413
/* Description : Interleave left half of halfword elements from 4 vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVL_H2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVL_H2.
*/
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVL_H2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1422
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1440
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1461
/* Description : Interleave right half of byte elements from 3 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0..in3) are handled by ILVR_B2; the right half byte
                 elements of in4 and in5 are interleaved into out2.
*/
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);             \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5));          \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1471
/* Description : Interleave right half of byte elements from 4 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_B2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVR_B2.
*/
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVR_B2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1483
/* Description : Interleave right half of byte elements from 8 vector pairs
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 ... out7
                 Return Type - as per RTYPE
   Details     : Applies ILVR_B4 to (in0..in7) and to (in8..in15).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast further down.
*/
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, (in0), (in1), (in2), (in3),                    \
            (in4), (in5), (in6), (in7),                           \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, (in8), (in9), (in10), (in11),                  \
            (in12), (in13), (in14), (in15),                       \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1494
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1512
/* Description : Interleave right half of halfword elements from 3 vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0..in3) are handled by ILVR_H2; the right half halfword
                 elements of in4 and in5 are interleaved into out2.
*/
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);             \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5));          \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1519
/* Description : Interleave right half of halfword elements from 4 vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_H2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVR_H2.
*/
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVR_H2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1528
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of in0 and right half of word
                 elements of in1 are interleaved and copied to out0.
                 Right half of word elements of in2 and right half of word
                 elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1537
/* Description : Interleave right half of word elements from 4 vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_W2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVR_W2.
*/
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVR_W2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1546
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1564
/* Description : Interleave right half of double word elements from 3 vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : (in0..in3) are handled by ILVR_D2; the right half double
                 word elements of in4 and in5 are interleaved into out2.
*/
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);             \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1571
/* Description : Interleave right half of double word elements from 4 vector
                 pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Applies ILVR_D2 to (in0..in3) and to (in4..in7).
                 Inputs are forwarded in parentheses so that expression
                 arguments stay grouped under the cast in ILVR_D2.
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    ILVR_D2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1580
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to out1.
                 in0 / in2 are passed as the first intrinsic operand,
                 in1 / in3 as the second.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1598
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 Each input argument is expanded twice
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1618
/* Description : Interleave both left and right half of input vectors
                 (halfword elements)
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of halfword elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 Each input argument is expanded twice
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1));  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1628
/* Description : Interleave both left and right half of input vectors
                 (word elements)
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of word elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 Each input argument is expanded twice
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1637
/* Description : Element-wise maximum of signed halfword vectors and a
                 5-bit signed immediate value
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'in0' is replaced by the
                 larger of itself and 'max_val'; 'in1' is updated likewise
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                   \
{                                                            \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1653
/* Description : Element-wise maximum of 4 signed halfword vectors and a
                 5-bit signed immediate value
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
*/
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)     \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val);  \
    in2 = (RTYPE) __msa_maxi_s_h((v8i16) in2, max_val);  \
    in3 = (RTYPE) __msa_maxi_s_h((v8i16) in3, max_val);  \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1661
/* Description : Element-wise maximum of 8 signed halfword vectors and a
                 5-bit signed immediate value
   Arguments   : Inputs  - in0 ... in7, max_val
                 Outputs - in0 ... in7 (in place)
                 Return Type - as per RTYPE
   Details     : The vectors are processed pair by pair with MAXI_SH2
*/
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH2(RTYPE, in0, in1, max_val);                                   \
    MAXI_SH2(RTYPE, in2, in3, max_val);                                   \
    MAXI_SH2(RTYPE, in4, in5, max_val);                                   \
    MAXI_SH2(RTYPE, in6, in7, max_val);                                   \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1669
/* Description : Saturate the unsigned halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to
                 the value range of (sat_val+1) bits; 'in1' likewise
                 Results are written back to the original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                   \
{                                                           \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val));  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val));  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1687
/* Description : Unsigned saturation of the halfword elements of 4 vectors
                 to (sat_val+1) bits
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
*/
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)     \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
    in2 = (RTYPE) __msa_sat_u_h((v8u16) in2, sat_val);  \
    in3 = (RTYPE) __msa_sat_u_h((v8u16) in3, sat_val);  \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1695
/* Description : Unsigned saturation of the halfword elements of 8 vectors
                 to (sat_val+1) bits
   Arguments   : Inputs  - in0 ... in7, sat_val
                 Outputs - in0 ... in7 (in place)
                 Return Type - as per RTYPE
   Details     : The vectors are processed pair by pair with SAT_UH2
*/
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH2(RTYPE, in0, in1, sat_val);                                   \
    SAT_UH2(RTYPE, in2, in3, sat_val);                                   \
    SAT_UH2(RTYPE, in4, in5, sat_val);                                   \
    SAT_UH2(RTYPE, in6, in7, sat_val);                                   \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1703
/* Description : Saturate the signed halfword element values to the signed
                 value range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed range representable in (sat_val+1) bits; 'in1' is
                 handled the same way
                 Results are written back to the original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1720
/* Description : Signed saturation of the halfword elements of 3 vectors
                 to (sat_val+1) bits
   Arguments   : Inputs  - in0, in1, in2, sat_val
                 Outputs - in0, in1, in2 (in place)
                 Return Type - as per RTYPE
*/
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1727
/* Description : Signed saturation of the halfword elements of 4 vectors
                 to (sat_val+1) bits
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
*/
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)     \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
    in3 = (RTYPE) __msa_sat_s_h((v8i16) in3, sat_val);  \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1734
/* Description : Saturate the signed word element values to the signed
                 value range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 signed range representable in (sat_val+1) bits; 'in1' is
                 handled the same way
                 Results are written back to the original vectors
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1751
/* Description : Signed saturation of the word elements of 4 vectors
                 to (sat_val+1) bits
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
*/
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)     \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
    in2 = (RTYPE) __msa_sat_s_w((v4i32) in2, sat_val);  \
    in3 = (RTYPE) __msa_sat_s_w((v4i32) in3, sat_val);  \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1758
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'idx1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for halfword operation is 0-7
                 'in' is expanded twice
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)    \
{                                                       \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1775
1776 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
1777                   out0, out1, out2)                   \
1778 {                                                     \
1779     SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
1780     out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
1781 }
1782 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1783 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1784
1785 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
1786                   out0, out1, out2, out3)             \
1787 {                                                     \
1788     SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
1789     SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
1790 }
1791 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1792 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1793
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3, so 'stidx'
                  itself must be in the range 0-2
                 SPLATI_W4 replicates word elements 0, 1, 2 and 3 of 'in' to
                  'out0', 'out1', 'out2' and 'out3' respectively
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), stidx);          \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1820
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 PCKEV_B3 / PCKEV_B4 handle one / two further input pairs
                 the same way.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5));          \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1860
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 PCKEV_H4 handles two further input pairs the same way.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1888
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of in0 are copied to the left half
                 of out0 & even double word elements of in1 are copied to the
                 right half of out0.
                 Even double word elements of in2 are copied to the left half
                 of out1 & even double word elements of in3 are copied to the
                 right half of out1.
                 PCKEV_D4 handles two further input pairs the same way.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1916
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd (index 1) double word element of in0 is copied to the
                 left half of out0 & odd double word element of in1 is copied
                 to the right half of out0.
                 Odd (index 1) double word element of in2 is copied to the
                 left half of out1 & odd double word element of in3 is copied
                 to the right half of out1.
                 When both inputs of a pair are the same vector, its index 1
                 double word element ends up in both halves of the output.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1934
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (up to in7 for the wider variants)
                 Outputs - same vectors (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 XORI_B3_128 .. XORI_B8_128 apply the same operation to
                 3 .. 8 vectors by combining the narrower variants
*/
#define XORI_B2_128(RTYPE, in0, in1)                 \
{                                                    \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)            \
{                                                    \
    XORI_B2_128(RTYPE, in0, in1);                    \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
2000
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1));  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
2025
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 'shift' is parenthesized in the expansion, so an expression
                 such as 'n & 3' is applied as a whole
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 = (in0) << (shift);       \
    in1 = (in1) << (shift);       \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    SLLI_2V(in0, in1, shift);               \
    SLLI_2V(in2, in3, shift);               \
}
2046
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
                 'shift' is parenthesized in the expansion, so an expression
                 such as 'n & 3' is applied as a whole
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = (in0) >> (shift);                \
    in1 = (in1) >> (shift);                \
    in2 = (in2) >> (shift);                \
    in3 = (in3) >> (shift);                \
}
2064
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRLR_H4 / SRLR_H8 perform the same in-place update on
                 4 / 8 vectors using the rounding variant __msa_srlr_h
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                               \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)                \
{                                                                \
    in0 = (RTYPE) __msa_srlr_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) (in1), (v8i16) (shift));  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) (in2), (v8i16) (shift));  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) (in3), (v8i16) (shift));  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
2101
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_H3 / SRAR_H4 apply the same operation to 3 / 4 vectors
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift));  \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift));  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                     \
{                                                                \
    SRAR_H2(RTYPE, in0, in1, shift);                             \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift));  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2135
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_W4 applies the same operation to 4 vectors
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2160
/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of vector 'in0' is shifted right
                 arithmetic by the immediate value 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)                \
{                                                       \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    SRARI_H2(RTYPE, in0, in1, shift);                 \
    SRARI_H2(RTYPE, in2, in3, shift);                 \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2186
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of vector 'in0' is shifted right
                 arithmetic by the immediate value 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
                 SRARI_W4 applies the same operation to 4 vectors
*/
#define SRARI_W2(RTYPE, in0, in1, shift)                \
{                                                       \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2211
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 Operands are parenthesized in the expansion, so expression
                 arguments (e.g. 'a + b') are multiplied as a whole
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
2229
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
                 Operands are parenthesized in the expansion, so expression
                 arguments using lower-precedence operators (e.g. 'a & b')
                 are added as a whole
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2246
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
                 Operands are parenthesized in the expansion, so expression
                 arguments (e.g. a subtrahend 'b - c') are used as a whole
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) - (in1);                     \
    out1 = (in2) - (in3);                     \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2265
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted (compare 'less than 0') and interleaved right with
                 same vector 'in' to generate 8 halfword elements keeping
                 sign intact
*/
#define UNPCK_R_SB_SH(in, out)                         \
{                                                      \
    v16i8 sign_m;                                      \
                                                       \
    sign_m = __msa_clti_s_b((v16i8) (in), 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) (in));  \
}
2281
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (compare 'less than 0') and interleaved right with
                 same vector 'in' to generate 4 word elements keeping sign
                 intact
*/
#define UNPCK_R_SH_SW(in, out)                         \
{                                                      \
    v8i16 sign_m;                                      \
                                                       \
    sign_m = __msa_clti_s_h((v8i16) (in), 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in));  \
}
2297
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0);         \
    ILVRL_B2_SH(tmp_m, (in), out0, out1);            \
}
2316
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : An all-zero byte vector is interleaved with 'in' through
                 ILVRL_B2_SH
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, (in), out0, out1);            \
}
2330
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0);         \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);            \
}
2349
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The value of 'in0' is saved in a temporary of the same type,
                 'in1' is copied to 'in0' and the saved value is stored to
                 'in1'
                 Unlike an xor based swap this stays correct when both
                 arguments name the same object, and it does not require the
                 operand type to support '^'
*/
#define SWAP(in0, in1)                   \
{                                        \
    __typeof__(in0) swap_tmp_m = (in0);  \
                                         \
    in0 = (in1);                         \
    in1 = swap_tmp_m;                    \
}
2361
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 The sums are stored before the differences are computed, so
                 passing an input as 'out0' or 'out1' changes the value used
                 for the differences
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
2375
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out0..out3 hold the sums in0 + in7, in1 + in6, in2 + in5,
                 in3 + in4; out4..out7 hold the differences in3 - in4,
                 in2 - in5, in1 - in6, in0 - in7
                 The sums are stored before the differences are computed, so
                 passing an input as one of 'out0'..'out3' changes the value
                 used for the differences
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = (in0) + (in7);                                            \
    out1 = (in1) + (in6);                                            \
    out2 = (in2) + (in5);                                            \
    out3 = (in3) + (in4);                                            \
                                                                     \
    out4 = (in3) - (in4);                                            \
    out5 = (in2) - (in5);                                            \
    out6 = (in1) - (in6);                                            \
    out7 = (in0) - (in7);                                            \
}
2394
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : out<k>      = in<k> + in<15 - k>   for k = 0 .. 7
                 out<15 - k> = in<k> - in<15 - k>   for k = 0 .. 7
                 Every input is parenthesized so that an expression argument
                 keeps its own grouping. The outputs are assigned in the
                 order out0 .. out15 and every input is expanded twice, so an
                 output must not name an input that a later line still reads.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = (in0) + (in15);                                                  \
    out1 = (in1) + (in14);                                                  \
    out2 = (in2) + (in13);                                                  \
    out3 = (in3) + (in12);                                                  \
    out4 = (in4) + (in11);                                                  \
    out5 = (in5) + (in10);                                                  \
    out6 = (in6) + (in9);                                                   \
    out7 = (in7) + (in8);                                                   \
                                                                            \
    out8 = (in7) - (in8);                                                   \
    out9 = (in6) - (in9);                                                   \
    out10 = (in5) - (in10);                                                 \
    out11 = (in4) - (in11);                                                 \
    out12 = (in3) - (in12);                                                 \
    out13 = (in2) - (in13);                                                 \
    out14 = (in1) - (in14);                                                 \
    out15 = (in0) - (in15);                                                 \
}
2423
2424 /* Description : Transposes input 4x4 byte block
2425    Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
2426                  Outputs - out0, out1, out2, out3  (output 4x4 byte block)
2427                  Return Type - unsigned byte
2428    Details     : 'in0' .. 'in3' are merged through ILVR_D2_SB, ILVRL_B2_SB
                      and __msa_ilvr_b into 'out0'.
                      'out1' is then produced from 'out0', 'out2' from 'out1'
                      and 'out3' from 'out2', each time by __msa_sldi_b with a
                      zero vector and a slide value of 4, so the last three
                      assignments must stay in this order.
2429 */
2430 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
2431 {                                                                       \
2432     v16i8 zero_m = { 0 };                                               \
2433     v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
2434                                                                         \
2435     ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
2436     ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
2437                                                                         \
2438     out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
2439     out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
2440     out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
2441     out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
2442 }
2443
2444 /* Description : Transposes input 8x4 byte block into 4x8
2445    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                                (input 8x4 byte block)
2446                  Outputs - out0, out1, out2, out3  (output 4x8 byte block)
2447                  Return Type - as per RTYPE
2448    Details     : The input pairs (in0, in4), (in1, in5), (in2, in6) and
                      (in3, in7) are combined with ILVEV_W2_SB and the results
                      interleaved with __msa_ilvr_b and ILVRL_H2_SB.
                      ILVRL_W2 writes 'out0' and 'out2'; 'out1' and 'out3' are
                      derived afterwards from those two with __msa_ilvl_d.
                      TRANSPOSE8x4_UB_UB / TRANSPOSE8x4_UB_UH select v16u8 /
                      v8u16 as RTYPE.
2449 */
2450 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
2451                         out0, out1, out2, out3)                         \
2452 {                                                                       \
2453     v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
2454                                                                         \
2455     ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
2456     tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
2457     ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
2458                                                                         \
2459     tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
2460     ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
2461                                                                         \
2462     ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
2463     out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
2464     out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
2465 }
2466 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2467 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2468
2469 /* Description : Transposes input 8x8 byte block
2470    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
2471                            (input 8x8 byte block)
2472                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2473                            (output 8x8 byte block)
2474                  Return Type - as per RTYPE
2475    Details     : The inputs are interleaved with ILVR_B4_SB and ILVRL_B2_SB
                      into tmp4_m .. tmp7_m. ILVRL_W2 then writes 'out0', 'out2',
                      'out4' and 'out6'.
                      SLDI_B2_0 (defined elsewhere) is finally invoked with
                      (out0, out2, out1, out3, 8) and (out4, out6, out5, out7, 8);
                      presumably this fills each odd output from the preceding
                      even output - confirm against SLDI_B2_0.
                      TRANSPOSE8x8_UB_UB / TRANSPOSE8x8_UB_UH select v16u8 /
                      v8u16 as RTYPE.
2476 */
2477 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
2478                         out0, out1, out2, out3, out4, out5, out6, out7)  \
2479 {                                                                        \
2480     v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
2481     v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
2482                                                                          \
2483     ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
2484                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
2485     ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
2486     ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
2487     ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
2488     ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
2489     SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
2490     SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
2491 }
2492 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2493 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2494
2495 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2496    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
2497                            in8, in9, in10, in11, in12, in13, in14, in15
2498                  Outputs - out0, out1, out2, out3
2499                  Return Type - unsigned byte
2500    Details     : Four groups of inputs, (in0, in4, in8, in12),
                      (in1, in5, in9, in13), (in2, in6, in10, in14) and
                      (in3, in7, in11, in15), are each gathered into one vector
                      with ILVEV_W2_SD followed by __msa_ilvev_d.
                      'out1' and 'out3' hold the first two gathered vectors as
                      scratch values while later inputs are still being read,
                      and receive their final contents only in the last two
                      statements; they must therefore not name any input.
                      The gathered vectors are then interleaved at byte level
                      (even bytes for out0/out2, odd bytes for out1/out3) and
                      split with __msa_ilvev_h / __msa_ilvod_h.
2501 */
2502 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
2503                             in8, in9, in10, in11, in12, in13, in14, in15,  \
2504                             out0, out1, out2, out3)                        \
2505 {                                                                          \
2506     v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
2507                                                                            \
2508     ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
2509     out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
2510                                                                            \
2511     ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
2512     out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
2513                                                                            \
2514     ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
2515                                                                            \
2516     tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
2517     ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
2518                                                                            \
2519     tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
2520     ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
2521     out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
2522     out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
2523                                                                            \
2524     tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
2525     tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
2526     out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
2527     out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
2528 }
2529
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : ILVEV_D2_UB first pairs in<k> with in<k + 8> (k = 0 .. 7);
                 the eight results are parked in out7 .. out0, which serve
                 as scratch registers from the very first statement. No
                 output may therefore name an input.
                 The paired vectors are split into even / odd bytes
                 (__msa_ilvev_b / __msa_ilvod_b), then even / odd halfwords
                 and finally even / odd words:
                   even bytes, even halfwords -> out0 (even w), out4 (odd w)
                   even bytes, odd  halfwords -> out2 (even w), out6 (odd w)
                   odd  bytes, even halfwords -> out1 (even w), out5 (odd w)
                   odd  bytes, odd  halfwords -> out3 (even w), out7 (odd w)
                 'out5' and 'out7' are reused as temporaries in the middle of
                 the sequence, so the statement order must be kept.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2578
2579 /* Description : Transposes 4x4 block with half word elements in vectors
2580    Arguments   : Inputs  - in0, in1, in2, in3
2581                  Outputs - out0, out1, out2, out3
2582                  Return Type - signed halfword
2583    Details     : The inputs are interleaved with ILVR_H2_SH; ILVRL_W2_SH then
                      writes 'out0' and 'out2'.
                      'out1' is derived from 'out0' alone (both __msa_ilvl_d
                      operands are 'out0') and 'out3' from 'out0' and 'out2'.
                      NOTE(review): presumably only the low 64 bits of every
                      output carry a transposed row - confirm against the
                      __msa_ilvl_d operand order before relying on the upper
                      half of 'out1' / 'out3'.
2584 */
2585 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
2586 {                                                                       \
2587     v8i16 s0_m, s1_m;                                                   \
2588                                                                         \
2589     ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
2590     ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
2591     out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
2592     out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
2593 }
2594
2595 /* Description : Transposes 8x8 block with half word elements in vectors
2596    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
2597                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2598                  Return Type - as per RTYPE
2599    Details     : in4 .. in7 are interleaved (ILVR_H2_SH / ILVL_H2_SH, each
                      followed by ILVRL_H2_SH) into tmp0_m .. tmp3_m, and
                      in0 .. in3 the same way into tmp4_m .. tmp7_m.
                      PCKEV_D4 builds out0, out2, out4, out6 from the pairs
                      (tmp0_m, tmp4_m) .. (tmp3_m, tmp7_m); __msa_pckod_d
                      builds out1, out3, out5, out7 from the same pairs.
                      The inputs appear only in statements that precede the
                      first write to an output.
                      TRANSPOSE8x8_UH_UH / TRANSPOSE8x8_SH_SH select v8u16 /
                      v8i16 as RTYPE.
2600 */
2601 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
2602                        out0, out1, out2, out3, out4, out5, out6, out7)  \
2603 {                                                                       \
2604     v8i16 s0_m, s1_m;                                                   \
2605     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
2606     v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
2607                                                                         \
2608     ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
2609     ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
2610     ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
2611     ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
2612     ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
2613     ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
2614     ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
2615     ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
2616     PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
2617              tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
2618     out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
2619     out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
2620     out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
2621     out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
2622 }
2623 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2624 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2625
2626 /* Description : Transposes 4x4 block with word elements in vectors
2627    Arguments   : Inputs  - in0, in1, in2, in3
2628                  Outputs - out0, out1, out2, out3
2629                  Return Type - signed word
2630    Details     : ILVRL_W2_SW interleaves (in1, in0) into s0_m / s1_m and
                      (in3, in2) into s2_m / s3_m.
                      out0 / out1 are then formed from s2_m and s0_m with
                      __msa_ilvr_d / __msa_ilvl_d, and out2 / out3 from s3_m and
                      s1_m in the same way.
                      The inputs appear only in the two ILVRL_W2_SW statements,
                      which precede the first write to an output.
2631 */
2632 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
2633 {                                                                       \
2634     v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
2635                                                                         \
2636     ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
2637     ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
2638                                                                         \
2639     out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
2640     out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
2641     out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
2642     out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
2643 }
2644
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The vector pairs (in0, in1), (in2, in3), (in4, in5) and
                 (in6, in7) are averaged per unsigned byte with
                 __msa_ave_u_b, (a + b)/2, into 'tmp0_m' .. 'tmp3_m'.
                 Doubleword element 0 of each result (8 bytes) is copied out
                 with __msa_copy_u_d and the four values are handed to SD4
                 together with 'pdst' and 'stride', giving an 8x4 byte block.
                 The vector arguments are parenthesized before being cast so
                 that expression arguments are cast as a whole.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                   \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                   \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                   \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                   \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
2675
/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The vector pairs (in0, in1), (in2, in3), (in4, in5) and
                 (in6, in7) are averaged per unsigned byte with
                 __msa_ave_u_b, (a + b)/2, into 'tmp0_m' .. 'tmp3_m'.
                 The four full result vectors are handed to ST_UB4 together
                 with 'pdst' and 'stride', giving a 16x4 byte block.
                 The vector arguments are parenthesized before being cast so
                 that expression arguments are cast as a whole.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) (in0), (v16u8) (in1));                    \
    tmp1_m = __msa_ave_u_b((v16u8) (in2), (v16u8) (in3));                    \
    tmp2_m = __msa_ave_u_b((v16u8) (in4), (v16u8) (in5));                    \
    tmp3_m = __msa_ave_u_b((v16u8) (in6), (v16u8) (in7));                    \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
2701
2702 /* Description : Average rounded byte elements from pair of vectors and store
2703                  8x4 byte block in destination memory
2704    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2705    Details     : Each byte element from input vector pair 'in0' and 'in1' are
2706                  average rounded (a + b + 1)/2 and stored in 'tp0_m'
2707                  Each byte element from input vector pair 'in2' and 'in3' are
2708                  average rounded (a + b + 1)/2 and stored in 'tp1_m'
2709                  Each byte element from input vector pair 'in4' and 'in5' are
2710                  average rounded (a + b + 1)/2 and stored in 'tp2_m'
2711                  Each byte element from input vector pair 'in6' and 'in7' are
2712                  average rounded (a + b + 1)/2 and stored in 'tp3_m'
2713                  The half vector results from all 4 vectors are stored in
2714                  destination memory as 8x4 byte block
                      (the averaging itself is done by AVER_UB4_UB; doubleword
                      element 0 of each result is copied out with
                      __msa_copy_u_d and passed to SD4).
                      The temporaries are named 'tp*_m', not 'tmp*_m', because
                      AVER_DST_ST8x4_UB passes its own 'tmp*_m' variables in as
                      arguments.
2715 */
2716 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
2717 {                                                                            \
2718     uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
2719     v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
2720                                                                              \
2721     AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
2722                 tp0_m, tp1_m, tp2_m, tp3_m);                                 \
2723                                                                              \
2724     out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
2725     out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
2726     out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
2727     out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
2728     SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
2729 }
2730
2731 /* Description : Average rounded byte elements from pair of vectors and store
2732                  16x4 byte block in destination memory
2733    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2734    Details     : Each byte element from input vector pair 'in0' and 'in1' are
2735                  average rounded (a + b + 1)/2 and stored in 't0_m'
2736                  Each byte element from input vector pair 'in2' and 'in3' are
2737                  average rounded (a + b + 1)/2 and stored in 't1_m'
2738                  Each byte element from input vector pair 'in4' and 'in5' are
2739                  average rounded (a + b + 1)/2 and stored in 't2_m'
2740                  Each byte element from input vector pair 'in6' and 'in7' are
2741                  average rounded (a + b + 1)/2 and stored in 't3_m'
2742                  The vector results from all 4 vectors are stored in
2743                  destination memory as 16x4 byte block
                      (the averaging itself is done by AVER_UB4_UB and the
                      store by ST_UB4).
                      The temporaries are named 't*_m', not 'tmp*_m', because
                      AVER_DST_ST16x4_UB passes its own 'tmp*_m' variables in
                      as arguments.
2744 */
2745 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
2746 {                                                                             \
2747     v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
2748                                                                               \
2749     AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
2750                 t0_m, t1_m, t2_m, t3_m);                                      \
2751     ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
2752 }
2753
2754 /* Description : Average rounded byte elements from pair of vectors,
2755                  average rounded with destination and store 8x4 byte block
2756                  in destination memory
2757    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2758    Details     : Four vectors 'dst0_m' .. 'dst3_m' are first loaded from
                      'pdst' with LD_UB4 using 'stride'
2759                  Each byte element from input vector pair 'in0' and 'in1' are
2760                  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2761                  Each byte element from input vector pair 'in2' and 'in3' are
2762                  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2763                  Each byte element from input vector pair 'in4' and 'in5' are
2764                  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2765                  Each byte element from input vector pair 'in6' and 'in7' are
2766                  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                      AVER_ST8x4_UB then average-rounds each 'dst<k>_m' with
                      the matching 'tmp<k>_m' and performs the store
2767                  The half vector results from all 4 vectors are stored in
2768                  destination memory as 8x4 byte block
                      'pdst' and 'stride' are expanded both in LD_UB4 and in
                      AVER_ST8x4_UB, i.e. more than once.
2769 */
2770 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
2771                           pdst, stride)                            \
2772 {                                                                  \
2773     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
2774     v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
2775                                                                    \
2776     LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
2777     AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
2778                 tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
2779     AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
2780                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
2781 }
2781
2782 /* Description : Average rounded byte elements from pair of vectors,
2783                  average rounded with destination and store 16x4 byte block
2784                  in destination memory
2785    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2786    Details     : Four vectors 'dst0_m' .. 'dst3_m' are first loaded from
                      'pdst' with LD_UB4 using 'stride'
2787                  Each byte element from input vector pair 'in0' and 'in1' are
2788                  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2789                  Each byte element from input vector pair 'in2' and 'in3' are
2790                  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2791                  Each byte element from input vector pair 'in4' and 'in5' are
2792                  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2793                  Each byte element from input vector pair 'in6' and 'in7' are
2794                  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                      AVER_ST16x4_UB then average-rounds each 'dst<k>_m' with
                      the matching 'tmp<k>_m' and performs the store
2795                  The vector results from all 4 vectors are stored in
2796                  destination memory as 16x4 byte block
                      'pdst' and 'stride' are expanded both in LD_UB4 and in
                      AVER_ST16x4_UB, i.e. more than once.
2797 */
2798 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
2799                            pdst, stride)                            \
2800 {                                                                   \
2801     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
2802     v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
2803                                                                     \
2804     LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
2805     AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
2806                 tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
2807     AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
2808                    dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
2809 }
2809
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are paired into the halfword
                 vectors 'inp0_m' and 'inp1_m' with ILVR_D2_SH.
                 Four 32-bit words are loaded from 'pdst' with LW4 using
                 'stride', inserted into 'dst0_m' / 'dst1_m' and interleaved
                 with a zero vector (ILVR_B2_SH) to form halfword vectors.
                 Inputs and destination values are added, clipped with
                 CLIP_SH2_0_255 and packed back to bytes with PCKEV_B2_SB.
                 Word elements 0 and 1 of each packed vector are written
                 back through SW4.
                 'pdst' and 'stride' are expanded in both LW4 and SW4, i.e.
                 more than once.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
2839
/* Description : Dot product and addition of 3 input vectors, giving a signed
                 halfword result
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m
                 Return Type - signed halfword
   Details     : All six arguments are reinterpreted as v16i8.
                 Dot product of 'in0' with 'coeff0'       (__msa_dotp_s_h)
                 Dot product of 'in1' with 'coeff1', accumulated onto the
                 first product                            (__msa_dpadd_s_h)
                 Dot product of 'in2' with 'coeff2'       (__msa_dotp_s_h)
                 The third product is combined with the accumulated value by
                 __msa_adds_s_h

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)

                 The macro is a statement expression whose value is 'out0_m'.
                 Arguments are parenthesized before being cast so that
                 expression arguments are cast as a whole.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
( {                                                                     \
    v8i16 tmp1_m;                                                       \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) (in0), (v16i8) (coeff0));           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in1), (v16i8) (coeff1));  \
    tmp1_m = __msa_dotp_s_h((v16i8) (in2), (v16i8) (coeff2));           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                            \
                                                                        \
    out0_m;                                                             \
} )
2863
/* Description : Pack even byte elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m
                 Return Type - unsigned byte
   Details     : Both inputs are reinterpreted as signed byte vectors.
                 Even byte elements of 'in1' are copied to the left half and
                 even byte elements of 'in0' to the right half of one vector
                 ('in1' is passed as the first operand of __msa_pckev_b).
                 Every byte of the packed vector is then xor'ed with 128 to
                 shift the range from signed to unsigned byte.

                 Both arguments are evaluated exactly once and are
                 parenthesized before being cast.
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 out_m;                                                  \
    out_m = (v16u8) __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);             \
    out_m;                                                        \
} )
2879
/* Description : Converts inputs to unsigned bytes, averages them with the
                 given destination vectors & stores the result as 8x4
                 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are each reduced to one unsigned
                 byte vector with PCKEV_XORI128_UB (pack even bytes, xor with
                 128).
                 The two vectors are combined with 'dst0' and 'dst1' through
                 AVER_UB2_UB and the results are handed to ST8x4_UB together
                 with 'pdst' (converted once to uint8_t *) and 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    uint8_t *pout_m = (uint8_t *) (pdst);                     \
    v16u8 avg0_m = PCKEV_XORI128_UB(in0, in1);                \
    v16u8 avg1_m = PCKEV_XORI128_UB(in2, in3);                \
                                                              \
    AVER_UB2_UB(avg0_m, dst0, avg1_m, dst1, avg0_m, avg1_m);  \
    ST8x4_UB(avg0_m, avg1_m, pout_m, stride);                 \
}
2895
/* Description : Pack even byte elements, then store words 0 & 2 of each
                 packed vector as four 32-bit rows in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs ('in1', 'in0') into the first and
                 ('in3', 'in2') into the second temporary vector.
                 Word elements 0 and 2 of the first vector followed by word
                 elements 0 and 2 of the second vector are copied to scalar
                 registers and passed, in that order, to SW4 along with
                 'pdst' and 'stride'.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                               \
    v16i8 pck0_m, pck1_m;                                       \
    uint32_t word0_m, word1_m, word2_m, word3_m;                \
                                                                \
    PCKEV_B2_SB(in1, in0, in3, in2, pck0_m, pck1_m);            \
                                                                \
    /* first packed vector -> rows 0 and 1 */                   \
    word0_m = __msa_copy_u_w((v4i32) pck0_m, 0);                \
    word1_m = __msa_copy_u_w((v4i32) pck0_m, 2);                \
    /* second packed vector -> rows 2 and 3 */                  \
    word2_m = __msa_copy_u_w((v4i32) pck1_m, 0);                \
    word3_m = __msa_copy_u_w((v4i32) pck1_m, 2);                \
                                                                \
    SW4(word0_m, word1_m, word2_m, word3_m, pdst, stride);      \
}
2915
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : Both inputs are reinterpreted as signed byte vectors.
                 Even byte elements of 'in1' are copied to the left half and
                 even byte elements of 'in0' to the right half of a temporary
                 vector ('in1' is the first operand of __msa_pckev_b), which
                 is then written as one whole vector to 'pdst' with ST_SB.

                 All arguments are parenthesized before use and evaluated
                 exactly once.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                       \
{                                                         \
    v16i8 tmp_m;                                          \
    tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in0));  \
    ST_SB(tmp_m, (pdst));                                 \
}
2926
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
                 Outputs - tmp1_m
                 Return Type - unsigned halfword
   Details     : Bytes are selected from 'in1' and 'in0' according to the
                 shuffle control vector 'mask' (__msa_vshf_b).
                 Unsigned byte elements of the shuffled vector are multiplied
                 with those of 'coeff' and adjacent products are added into
                 unsigned halfwords (__msa_dotp_u_h).
                 Each halfword is shifted right by 'shift' bits with rounding
                 (__msa_srari_h) and then saturated to 'shift' + 1 unsigned
                 bits (__msa_sat_u_h), e.g. to 0..255 for a shift of 7.

                 'shift' is used as the immediate operand of both intrinsics
                 and therefore has to be a compile-time constant.
                 NOTE(review): the rounding shift is the arithmetic (signed)
                 variant; presumably the dot product is expected to stay
                 below 32768 - confirm against the callers' coefficients.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)                  \
( {                                                                       \
    v16i8 tmp0_m;                                                         \
    v8u16 tmp1_m;                                                         \
                                                                          \
    tmp0_m = __msa_vshf_b((v16i8) (mask), (v16i8) (in1), (v16i8) (in0));  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) (coeff));             \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, (shift));              \
    tmp1_m = __msa_sat_u_h(tmp1_m, (shift));                              \
                                                                          \
    tmp1_m;                                                               \
} )
2942 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */