/*****************************************************************************
 * mc-c.c: msa motion compensation
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Neha Rana <neha.rana@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "macros.h"
#include "mc.h"
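
/* Byte-shuffle tables for the VSHF-based filters below: the three luma masks
 * pair the 6-tap inputs as (x, x+5), (x+1, x+4) and (x+2, x+3) so the H.264
 * kernel (1, -5, 20, 20, -5, 1) reduces to three dot products per output;
 * the chroma masks split interleaved UV source bytes into per-plane pairs
 * for the bilinear filter. */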
static const uint8_t pu_luma_mask_arr[16 * 8] =
{
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};
static const uint8_t pu_chroma_mask_arr[16 * 5] =
{
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height );
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src, intptr_t i_src_stride,
                          int32_t i_height );
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height );
void x264_memzero_aligned_msa( void *p_dst, size_t n );
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                               uint8_t *p_pix2, intptr_t i_pix2_stride,
                               uint8_t *p_pix3, intptr_t i_pix3_stride,
                               int32_t i_weight );
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );

weight_fn_t x264_mc_weight_wtab_msa[6] =
{
    x264_mc_weight_w4_msa,
    x264_mc_weight_w4_msa,
    x264_mc_weight_w8_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w20_msa,
};
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight );
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight );
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height );
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf );

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width, int32_t i_width,
                                           int32_t i_height );
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height );
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height );
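
/* Horizontal luma half-pel filter for 16-pixel-wide blocks.  Each row is
 * shuffled into the three tap pairings, accumulated with HADD/DPADD dot
 * products against +1/-5/+20, then rounded ( >> 5 ), saturated and packed
 * back to bytes.  Four rows are produced per iteration; the second loop
 * mops up the i_height % 4 remainder one row at a time. */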
static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    v16u8 dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b( -5 );
    v16i8 plus20b = __msa_ldi_b( 20 );

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src2, src3 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src0, src1, src2, src3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res0, res1, res2, res3 );

        LD_SB2( p_src, 8, src4, src5 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src6, src7 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src4, src5, src6, src7 );
        VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res4, res5, res6, res7 );
        SRARI_H4_SH( res0, res1, res2, res3, 5 );
        SRARI_H4_SH( res4, res5, res6, res7, 5 );
        SAT_SH4_SH( res0, res1, res2, res3, 7 );
        SAT_SH4_SH( res4, res5, res6, res7, 7 );
        PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                     vec0, vec1, vec2, vec3 );
        XORI_B4_128_SB( vec0, vec1, vec2, vec3 );

        ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;

        XORI_B2_128_SB( src0, src1 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        res0 = __msa_hadd_s_h( vec0, vec0 );
        DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
        res1 = __msa_hadd_s_h( vec3, vec3 );
        DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
        SRARI_H2_SH( res0, res1, 5 );
        SAT_SH2_SH( res0, res1, 7 );
        dst0 = PCKEV_XORI128_UB( res0, res1 );
        ST_UB( dst0, p_dst );
        p_dst += i_dst_stride;
    }
}
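
/* Vertical luma half-pel filter for 16-wide blocks.  The tap pairs are
 * packed into halfword constants (0xfb01 = {1, -5}, 0x1414 = {20, 20},
 * 0x01fb = {-5, 1}) so each output row is three DPADD_SH3_SH dot products
 * over byte-interleaved row pairs; a five-row history is kept in the
 * src*_r/src*_l registers and rotated at the bottom of each loop. */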
static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    const int16_t i_filt_const0 = 0xfb01;
    const int16_t i_filt_const1 = 0x1414;
    const int16_t i_filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16u8 res0, res1, res2, res3;
    v16i8 filt0, filt1, filt2;

    u_h4w = i_height % 4;
    filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
    filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
    filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );

    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
    p_src += ( 5 * i_src_stride );

    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
    ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
                src10_r, src21_r, src32_r, src43_r );
    ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
                src10_l, src21_l, src32_l, src43_l );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
        p_src += ( 4 * i_src_stride );

        XORI_B4_128_SB( src5, src6, src7, src8 );
        ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
                    src54_r, src65_r, src76_r, src87_r );
        ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
                    src54_l, src65_l, src76_l, src87_l );
        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
                               filt0, filt1, filt2 );
        out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
                               filt0, filt1, filt2 );
        out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
                               filt0, filt1, filt2 );
        out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
                               filt0, filt1, filt2 );
        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
                               filt0, filt1, filt2 );
        out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
                               filt0, filt1, filt2 );
        out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
                               filt0, filt1, filt2 );
        out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
                               filt0, filt1, filt2 );
        SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
        SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
        SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
        SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
        PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                     out3_r, res0, res1, res2, res3 );
        XORI_B4_128_UB( res0, res1, res2, res3 );

        ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        src5 = LD_SB( p_src );
        p_src += ( i_src_stride );
        src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
        ILVRL_B2_SB( src5, src4, src54_r, src54_l );
        out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
                               filt0, filt1, filt2 );
        out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
                               filt0, filt1, filt2 );
        SRARI_H2_SH( out0_r, out0_l, 5 );
        SAT_SH2_SH( out0_r, out0_l, 7 );
        out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
        res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
        ST_UB( res0, p_dst );
        p_dst += i_dst_stride;

        src10_r = src21_r;
        src21_r = src32_r;
        src32_r = src43_r;
        src43_r = src54_r;
        src10_l = src21_l;
        src21_l = src32_l;
        src32_l = src43_l;
        src43_l = src54_l;
        src4 = src5;
    }
}
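
/* Center (hz+vt) luma half-pel filter for 8-wide blocks: rows are first
 * filtered horizontally into 16-bit intermediates (AVC_HORZ_FILTER_SH),
 * then the same 6-tap kernel is applied vertically across a sliding
 * window of six intermediate rows (AVC_CALC_DPADD_H_6PIX_2COEFF_SH). */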
static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    uint64_t u_out0;
    v16i8 tmp0;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;
    v16u8 out0, out1;

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
    XORI_B5_128_SB( src0, src1, src2, src3, src4 );
    p_src += ( 5 * i_src_stride );

    hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
    hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
    hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
    hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
    hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        XORI_B4_128_SB( src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );

        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
        hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
        hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
        hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
                                                hz_out3, hz_out4, hz_out5 );
        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
                                                hz_out4, hz_out5, hz_out6 );
        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
                                                hz_out5, hz_out6, hz_out7 );
        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
                                                hz_out6, hz_out7, hz_out8 );
        out0 = PCKEV_XORI128_UB( dst0, dst1 );
        out1 = PCKEV_XORI128_UB( dst2, dst3 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );

        p_dst += ( 4 * i_dst_stride );

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        src0 = LD_SB( p_src );
        p_src += i_src_stride;

        src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
        hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );

        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
                                                hz_out2, hz_out3,
                                                hz_out4, hz_out5 );

        tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
        tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
        u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
        SD( u_out0, p_dst );
        p_dst += i_dst_stride;

        hz_out0 = hz_out1;
        hz_out1 = hz_out2;
        hz_out2 = hz_out3;
        hz_out3 = hz_out4;
        hz_out4 = hz_out5;
    }
}
static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                  uint8_t *p_dst, int32_t i_dst_stride,
                                  int32_t i_height )
{
    uint32_t u_multiple8_cnt;

    for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
    {
        avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                             i_height );

        p_src += 8;
        p_dst += 8;
    }
}
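
/* Interleaved (NV12-style) chroma MC.  The hv kernels compute the H.264
 * bilinear filter: a horizontal byte dot product against the paired
 * (8-dx, dx) coefficients followed by vertical halfword multiplies with
 * the (8-dy, dy) pair, rounded with >> 6.  U and V samples are separated
 * from the interleaved source by the shuffle masks and written out to
 * distinct planes. */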
static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint16_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8i16 res0, res1;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
    v16i8 mask;

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src3, src4 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
    SRARI_H2_UH( res_vt0, res_vt2, 6 );
    SAT_UH2_UH( res_vt0, res_vt2, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out1, p_dst_u );

    SH( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint16_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8i16 res0, res1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );

    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src5, src6 );
    VSHF_B2_UB( src2, src3, src3, src4,
                ( mask + 1 ), ( mask + 1 ), src7, src8 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
    DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0,
                 res_hz1, res_hz2, res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
    SRARI_H2_UH( res_vt0, res_vt1, 6 );
    SAT_UH2_UH( res_vt0, res_vt1, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out1, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out2, p_dst_u );
    p_dst_u += i_dst_stride;
    SH( u_out3, p_dst_u );

    DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
    SRARI_H2_UH( res_vt0, res_vt1, 6 );
    SAT_UH2_UH( res_vt0, res_vt1, 7 );
    PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

    u_out0 = __msa_copy_u_h( res0, 0 );
    u_out1 = __msa_copy_u_h( res0, 2 );
    u_out2 = __msa_copy_u_h( res1, 0 );
    u_out3 = __msa_copy_u_h( res1, 2 );

    SH( u_out0, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out1, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SH( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    if( 2 == i_height )
    {
        avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
    else if( 4 == i_height )
    {
        avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
}
static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
                                               int32_t i_src_stride,
                                               uint8_t *p_dst_u,
                                               uint8_t *p_dst_v,
                                               int32_t i_dst_stride,
                                               uint32_t u_coef_hor0,
                                               uint32_t u_coef_hor1,
                                               uint32_t u_coef_ver0,
                                               uint32_t u_coef_ver1 )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v4i32 res0, res1;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
    v16i8 mask;

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    LD_UB3( p_src, i_src_stride, src0, src1, src2 );
    VSHF_B2_UB( src0, src1, src1, src2,
                ( mask + 1 ), ( mask + 1 ), src3, src4 );
    VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
    DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                 res_hz3 );
    MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
          coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
          res_vt3 );
    ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
    SRARI_H2_UH( res_vt0, res_vt2, 6 );
    SAT_UH2_UH( res_vt0, res_vt2, 7 );
    PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );

    u_out0 = __msa_copy_u_w( res0, 0 );
    u_out1 = __msa_copy_u_w( res0, 1 );
    u_out2 = __msa_copy_u_w( res1, 0 );
    u_out3 = __msa_copy_u_w( res1, 1 );
    SW( u_out0, p_dst_u );
    p_dst_u += i_dst_stride;
    SW( u_out1, p_dst_u );
    SW( u_out2, p_dst_v );
    p_dst_v += i_dst_stride;
    SW( u_out3, p_dst_v );
}
static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
                                                  int32_t i_src_stride,
                                                  uint8_t *p_dst_u,
                                                  uint8_t *p_dst_v,
                                                  int32_t i_dst_stride,
                                                  uint32_t u_coef_hor0,
                                                  uint32_t u_coef_hor1,
                                                  uint32_t u_coef_ver0,
                                                  uint32_t u_coef_ver1,
                                                  int32_t i_height )
{
    uint32_t u_row;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v4i32 res0, res1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    mask = LD_SB( &pu_chroma_mask_arr[16] );

    src0 = LD_UB( p_src );
    p_src += i_src_stride;

    for( u_row = ( i_height >> 2 ); u_row--; )
    {
        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
        p_src += ( 4 * i_src_stride );

        VSHF_B2_UB( src0, src1, src1, src2,
                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
        VSHF_B2_UB( src2, src3, src3, src4,
                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
        VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
        VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
        DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                     res_hz3 );
        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );
        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
        SRARI_H2_UH( res_vt0, res_vt1, 6 );
        SAT_UH2_UH( res_vt0, res_vt1, 7 );
        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
        p_dst_u += ( 4 * i_dst_stride );

        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                     res_hz3 );
        MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );
        ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
        SRARI_H2_UH( res_vt0, res_vt1, 6 );
        SAT_UH2_UH( res_vt0, res_vt1, 7 );
        PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );

        ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
        p_dst_v += ( 4 * i_dst_stride );

        src0 = src4;
    }
}
static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    if( 2 == i_height )
    {
        avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
                                           p_dst_u, p_dst_v, i_dst_stride,
                                           u_coef_hor0, u_coef_hor1,
                                           u_coef_ver0, u_coef_ver1 );
    }
    else
    {
        avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
                                              p_dst_u, p_dst_v, i_dst_stride,
                                              u_coef_hor0, u_coef_hor1,
                                              u_coef_ver0, u_coef_ver1,
                                              i_height );
    }
}
static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst_u,
                                              uint8_t *p_dst_v,
                                              int32_t i_dst_stride,
                                              uint32_t u_coef_hor0,
                                              uint32_t u_coef_hor1,
                                              uint32_t u_coef_ver0,
                                              uint32_t u_coef_ver1,
                                              int32_t i_height )
{
    uint32_t u_row;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
    v16i8 coeff_hz_vec0, coeff_hz_vec1;
    v16i8 tmp0, tmp1;
    v16u8 coeff_hz_vec;
    v8u16 coeff_vt_vec0, coeff_vt_vec1;

    coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
    coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
    coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
    coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
    coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );

    LD_UB2( p_src, 16, src0, src13 );
    p_src += i_src_stride;

    VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
    DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );

    for( u_row = ( i_height >> 2 ); u_row--; )
    {
        LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
        LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
        p_src += ( 4 * i_src_stride );

        VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
        VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
        DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                     res_hz4 );
        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );

        res_vt0 += ( res_hz0 * coeff_vt_vec1 );
        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
        res_vt3 += ( res_hz3 * coeff_vt_vec1 );

        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
        ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
        p_dst_u += ( 4 * i_dst_stride );

        res_hz0 = res_hz4;

        VSHF_B2_UB( src1, src5, src2, src6,
                    ( mask + 1 ), ( mask + 1 ), src5, src6 );
        VSHF_B2_UB( src3, src7, src4, src8,
                    ( mask + 1 ), ( mask + 1 ), src7, src8 );
        DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                     coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                     res_hz4 );
        MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
              coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
              res_vt3 );

        res_vt0 += ( res_hz5 * coeff_vt_vec1 );
        res_vt1 += ( res_hz1 * coeff_vt_vec1 );
        res_vt2 += ( res_hz2 * coeff_vt_vec1 );
        res_vt3 += ( res_hz3 * coeff_vt_vec1 );

        SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
        SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
        PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
        ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
        p_dst_v += ( 4 * i_dst_stride );

        res_hz5 = res_hz4;
    }
}
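
/* Explicit weighted prediction over one reference:
 * out = clip( ( pix * i_weight + offset' ) >> i_log2_denom, 0, 255 ),
 * where offset' is i_offset_in << i_log2_denom plus the usual rounding
 * bias when i_log2_denom is non-zero.  The variants differ only in the
 * block width they handle. */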
static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *p_dst, int32_t i_dst_stride,
                                     int32_t i_log2_denom, int32_t i_weight,
                                     int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v16u8 zero = { 0 };
    v16u8 src0, src1;
    v4i32 dst0, dst1;
    v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
    v8i16 vec0, vec1;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    u_load0 = LW( p_src );
    p_src += i_src_stride;
    u_load1 = LW( p_src );

    src0 = ( v16u8 ) __msa_fill_w( u_load0 );
    src1 = ( v16u8 ) __msa_fill_w( u_load1 );

    ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
    MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
    ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
    MAXI_SH2_SH( vec0, vec1, 0 );

    tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
    tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );

    SAT_UH2_UH( tp0, tp1, 7 );
    PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );

    u_out0 = __msa_copy_u_w( dst0, 0 );
    u_out1 = __msa_copy_u_w( dst1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
                                             int32_t i_src_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_weight,
                                             int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 wgt, denom, offset;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src += 4 * i_src_stride;

        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        int32_t i_height, int32_t i_log2_denom,
                                        int32_t i_weight, int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                 i_log2_denom, i_weight, i_offset_in );
    }
    else
    {
        avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
                                         p_dst, i_dst_stride,
                                         i_height, i_log2_denom,
                                         i_weight, i_offset_in );
    }
}
static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        int32_t i_height, int32_t i_log2_denom,
                                        int32_t i_weight, int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 wgt, denom, offset;
    v16i8 out0, out1;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride,
                                         int32_t i_height, int32_t i_log2_denom,
                                         int32_t i_weight, int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 zero = { 0 };
    v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8u16 wgt, denom, offset;

    i_offset_in <<= ( i_log2_denom );

    if( i_log2_denom )
    {
        i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
    }

    wgt = ( v8u16 ) __msa_fill_h( i_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;

        ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp2, temp4, temp6 );
        ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp1, temp3, temp5, temp7 );
        MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
              temp0, temp1, temp2, temp3 );
        MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
              temp4, temp5, temp6, temp7 );
        ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
                     temp0, temp1, temp2, temp3 );
        ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
                     temp4, temp5, temp6, temp7 );
        MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
        MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
        SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
        SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
        SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
        SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
        PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                     dst0, dst1, dst2, dst3 );

        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
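
/* Bi-directional weighted prediction, "nw" (no offset applied) flavour:
 * out = clip( ( p1 * w1 + p2 * w2 ) >> ( i_log2_denom + 1 ), 0, 255 )
 * via signed 16-bit multiplies and a rounding arithmetic shift; the
 * i_offset_in argument is accepted for interface symmetry but unused
 * on these paths. */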
static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v8i16 src1_wgt, src2_wgt;
    v16u8 in0, in1, in2, in3;
    v8i16 temp0, temp1, temp2, temp3;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );
    u_load0 = LW( p_src1_in );
    u_load1 = LW( p_src1_in + i_src1_stride );
    in0 = ( v16u8 ) __msa_fill_w( u_load0 );
    in1 = ( v16u8 ) __msa_fill_w( u_load1 );
    u_load0 = LW( p_src2_in );
    u_load1 = LW( p_src2_in + i_src2_stride );
    in2 = ( v16u8 ) __msa_fill_w( u_load0 );
    in3 = ( v16u8 ) __msa_fill_w( u_load1 );
    ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
                temp0, temp1, temp2, temp3 );
    temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
    temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
    SRAR_H2_SH( temp0, temp1, denom );
    CLIP_SH2_0_255( temp0, temp1 );
    PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
                                                  int32_t i_src1_stride,
                                                  uint8_t *p_src2_in,
                                                  int32_t i_src2_stride,
                                                  uint8_t *p_dst,
                                                  int32_t i_dst_stride,
                                                  int32_t i_height,
                                                  int32_t i_log2_denom,
                                                  int32_t i_src1_weight,
                                                  int32_t i_src2_weight,
                                                  int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );
    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src1_in += ( 4 * i_src1_stride );
        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );
        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src2_in += ( 4 * i_src2_stride );
        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
        src7 = ( v16u8 ) __msa_fill_w( u_load3 );
        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
                    temp4, temp5, temp6, temp7 );
        temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
        temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
        temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
        temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
        SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
        CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
        PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
                                             int32_t i_src1_stride,
                                             uint8_t *p_src2_in,
                                             int32_t i_src2_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_src1_weight,
                                             int32_t i_src2_weight,
                                             int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
                                      p_src2_in, i_src2_stride,
                                      p_dst, i_dst_stride,
                                      i_log2_denom, i_src1_weight,
                                      i_src2_weight, i_offset_in );
    }
    else
    {
        avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
                                              p_src2_in, i_src2_stride,
                                              p_dst, i_dst_stride,
                                              i_height, i_log2_denom,
                                              i_src1_weight, i_src2_weight,
                                              i_offset_in );
    }
}
static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
                                             int32_t i_src1_stride,
                                             uint8_t *p_src2_in,
                                             int32_t i_src2_stride,
                                             uint8_t *p_dst,
                                             int32_t i_dst_stride,
                                             int32_t i_height,
                                             int32_t i_log2_denom,
                                             int32_t i_src1_weight,
                                             int32_t i_src2_weight,
                                             int32_t i_offset_in )
{
    uint32_t u_cnt;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3;
    v8i16 res0, res1, res2, res3;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );
        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
        p_src2_in += ( 4 * i_src2_stride );
        ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
                    temp0, temp1, temp2, temp3 );
        ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
                    res0, res1, res2, res3 );
        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
        SRAR_H4_SH( res0, res1, res2, res3, denom );
        CLIP_SH4_0_255( res0, res1, res2, res3 );
        PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
                     dst0, dst1, dst2, dst3 );
        ST8x1_UB( dst0, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst1, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst2, p_dst );
        p_dst += i_dst_stride;
        ST8x1_UB( dst3, p_dst );
        p_dst += i_dst_stride;
    }
}
static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
                                              int32_t i_src1_stride,
                                              uint8_t *p_src2_in,
                                              int32_t i_src2_stride,
                                              uint8_t *p_dst,
                                              int32_t i_dst_stride,
                                              int32_t i_height,
                                              int32_t i_log2_denom,
                                              int32_t i_src1_weight,
                                              int32_t i_src2_weight,
                                              int32_t i_offset_in )
{
    uint32_t u_cnt;
    v8i16 src1_wgt, src2_wgt;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 zero = { 0 };
    v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );

    src1_wgt = __msa_fill_h( i_src1_weight );
    src2_wgt = __msa_fill_h( i_src2_weight );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );
        LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
        p_src2_in += ( 4 * i_src2_stride );
        ILVRL_B2_SH( zero, src0, temp1, temp0 );
        ILVRL_B2_SH( zero, src1, temp3, temp2 );
        ILVRL_B2_SH( zero, src2, temp5, temp4 );
        ILVRL_B2_SH( zero, src3, temp7, temp6 );
        ILVRL_B2_SH( zero, dst0, res1, res0 );
        ILVRL_B2_SH( zero, dst1, res3, res2 );
        ILVRL_B2_SH( zero, dst2, res5, res4 );
        ILVRL_B2_SH( zero, dst3, res7, res6 );
        res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
        res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
        res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
        res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
        res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
        res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
        res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
        res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
        SRAR_H4_SH( res0, res1, res2, res3, denom );
        SRAR_H4_SH( res4, res5, res6, res7, denom );
        CLIP_SH4_0_255( res0, res1, res2, res3 );
        CLIP_SH4_0_255( res4, res5, res6, res7 );
        PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
                     dst0, dst1, dst2, dst3 );
        ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
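
/* Bi-directional weighted prediction with offset.  The two weights are
 * interleaved byte-wise so a single DPADD_U_H computes p1*w1 + p2*w2,
 * seeded with the rounding offset ( ( i_offset_in + 1 ) | 1 ) <<
 * i_log2_denom; the sum is then shifted by i_log2_denom + 1, clamped at
 * zero and saturated to 8 bits. */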
static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
                                       int32_t i_src1_stride,
                                       uint8_t *p_src2_in,
                                       int32_t i_src2_stride,
                                       uint8_t *p_dst, int32_t i_dst_stride,
                                       int32_t i_log2_denom,
                                       int32_t i_src1_weight,
                                       int32_t i_src2_weight,
                                       int32_t i_offset_in )
{
    uint32_t u_load0, u_load1, u_out0, u_out1;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16i8 in0, in1, in2, in3;
    v8u16 temp0, temp1, denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    u_load0 = LW( p_src1_in );
    u_load1 = LW( p_src1_in + i_src1_stride );
    in0 = ( v16i8 ) __msa_fill_w( u_load0 );
    in1 = ( v16i8 ) __msa_fill_w( u_load1 );

    u_load0 = LW( p_src2_in );
    u_load1 = LW( p_src2_in + i_src2_stride );
    in2 = ( v16i8 ) __msa_fill_w( u_load0 );
    in3 = ( v16i8 ) __msa_fill_w( u_load1 );

    ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );

    temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
    temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );

    temp0 >>= denom;
    temp1 >>= denom;

    MAXI_SH2_UH( temp0, temp1, 0 );
    SAT_UH2_UH( temp0, temp1, 7 );
    PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );

    u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
    u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
    SW( u_out0, p_dst );
    p_dst += i_dst_stride;
    SW( u_out1, p_dst );
}
static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
                                               int32_t i_src1_stride,
                                               uint8_t *p_src2_in,
                                               int32_t i_src2_stride,
                                               uint8_t *p_dst,
                                               int32_t i_dst_stride,
                                               int32_t i_height,
                                               int32_t i_log2_denom,
                                               int32_t i_src1_weight,
                                               int32_t i_src2_weight,
                                               int32_t i_offset_in )
{
    uint32_t u_cnt;
    uint32_t u_load0, u_load1, u_load2, u_load3;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3;
    v8u16 res0, res1, res2, res3;
    v8u16 denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src1_in += ( 4 * i_src1_stride );

        src0 = ( v16u8 ) __msa_fill_w( u_load0 );
        src1 = ( v16u8 ) __msa_fill_w( u_load1 );
        src2 = ( v16u8 ) __msa_fill_w( u_load2 );
        src3 = ( v16u8 ) __msa_fill_w( u_load3 );

        LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
        p_src2_in += ( 4 * i_src2_stride );

        src4 = ( v16u8 ) __msa_fill_w( u_load0 );
        src5 = ( v16u8 ) __msa_fill_w( u_load1 );
        src6 = ( v16u8 ) __msa_fill_w( u_load2 );
        src7 = ( v16u8 ) __msa_fill_w( u_load3 );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp1, temp2, temp3 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        SRA_4V( res0, res1, res2, res3, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_height,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    if( 2 == i_height )
    {
        avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
                                   p_src2_in, i_src2_stride,
                                   p_dst, i_dst_stride,
                                   i_log2_denom, i_src1_weight,
                                   i_src2_weight, i_offset_in );
    }
    else
    {
        avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
                                           p_src2_in, i_src2_stride,
                                           p_dst, i_dst_stride,
                                           i_height, i_log2_denom,
                                           i_src1_weight,
                                           i_src2_weight, i_offset_in );
    }
}
static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
                                          int32_t i_src1_stride,
                                          uint8_t *p_src2_in,
                                          int32_t i_src2_stride,
                                          uint8_t *p_dst,
                                          int32_t i_dst_stride,
                                          int32_t i_height,
                                          int32_t i_log2_denom,
                                          int32_t i_src1_weight,
                                          int32_t i_src2_weight,
                                          int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3;
    v8u16 res0, res1, res2, res3;
    v8u16 denom, offset;
    v16i8 out0, out1;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );

        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
        p_src2_in += ( 4 * i_src2_stride );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp1, temp2, temp3 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        SRA_4V( res0, res1, res2, res3, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
        ST8x4_UB( out0, out1, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
                                           int32_t i_src1_stride,
                                           uint8_t *p_src2_in,
                                           int32_t i_src2_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride,
                                           int32_t i_height,
                                           int32_t i_log2_denom,
                                           int32_t i_src1_weight,
                                           int32_t i_src2_weight,
                                           int32_t i_offset_in )
{
    uint32_t u_cnt;
    v16u8 src1_wgt, src2_wgt, wgt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8u16 denom, offset;

    i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;

    src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
    src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
    offset = ( v8u16 ) __msa_fill_h( i_offset_in );
    denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );

    wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );

    for( u_cnt = i_height / 4; u_cnt--; )
    {
        LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
        p_src1_in += ( 4 * i_src1_stride );

        LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
        p_src2_in += ( 4 * i_src2_stride );

        ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp0, temp2, temp4, temp6 );
        ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                    temp1, temp3, temp5, temp7 );
        DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
                     res0, res1, res2, res3 );
        ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
              res0, res1, res2, res3 );
        DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
                     res4, res5, res6, res7 );
        ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
              res4, res5, res6, res7 );
        SRA_4V( res0, res1, res2, res3, denom );
        SRA_4V( res4, res5, res6, res7, denom );
        MAXI_SH4_UH( res0, res1, res2, res3, 0 );
        MAXI_SH4_UH( res4, res5, res6, res7, 0 );
        SAT_UH4_UH( res0, res1, res2, res3, 7 );
        SAT_UH4_UH( res4, res5, res6, res7, 7 );
        PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
                     temp0, temp1, temp2, temp3 );
        ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
        p_dst += 4 * i_dst_stride;
    }
}
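
/* Straight block copies backing the mc_copy/get_ref fast paths: width 4
 * moves words through GP registers, width 8 extracts doublewords from
 * vector loads, and 16-and-wider widths stream whole vectors, with the
 * row loops unrolled by 12, 8, 4 or 2 depending on i_height. */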
static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_dst, int32_t i_dst_stride,
                             int32_t i_height )
{
    int32_t i_cnt;
    uint32_t u_src0, u_src1;

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        u_src0 = LW( p_src );
        p_src += i_src_stride;
        u_src1 = LW( p_src );
        p_src += i_src_stride;

        SW( u_src0, p_dst );
        p_dst += i_dst_stride;
        SW( u_src1, p_dst );
        p_dst += i_dst_stride;
    }
}
static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_dst, int32_t i_dst_stride,
                             int32_t i_height )
{
    int32_t i_cnt;
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if( 0 == i_height % 12 )
    {
        for( i_cnt = ( i_height / 12 ); i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );

            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 8 )
    {
        for( i_cnt = i_height >> 3; i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );

            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
            u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
            u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
            u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
            u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
            SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 4 )
    {
        for( i_cnt = ( i_height / 4 ); i_cnt--; )
        {
            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );
            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
            u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
            u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );

            SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 2 )
    {
        for( i_cnt = ( i_height / 2 ); i_cnt--; )
        {
            LD_UB2( p_src, i_src_stride, src0, src1 );
            p_src += ( 2 * i_src_stride );
            u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
            u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );

            SD( u_out0, p_dst );
            p_dst += i_dst_stride;
            SD( u_out1, p_dst );
            p_dst += i_dst_stride;
        }
    }
}
static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
                                   uint8_t *p_dst, int32_t i_dst_stride,
                                   int32_t i_height, int32_t i_width )
{
    int32_t i_cnt, i_loop_cnt;
    uint8_t *p_src_tmp, *p_dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for( i_cnt = ( i_width >> 4 ); i_cnt--; )
    {
        p_src_tmp = p_src;
        p_dst_tmp = p_dst;

        for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
        {
            LD_UB8( p_src_tmp, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src_tmp += ( 8 * i_src_stride );

            ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
                    p_dst_tmp, i_dst_stride );
            p_dst_tmp += ( 8 * i_dst_stride );
        }

        p_src += 16;
        p_dst += 16;
    }
}
static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
                              uint8_t *p_dst, int32_t i_dst_stride,
                              int32_t i_height )
{
    int32_t i_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if( 0 == i_height % 12 )
    {
        for( i_cnt = ( i_height / 12 ); i_cnt--; )
        {
            LD_UB8( p_src, i_src_stride,
                    src0, src1, src2, src3, src4, src5, src6, src7 );
            p_src += ( 8 * i_src_stride );
            ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
                    p_dst, i_dst_stride );
            p_dst += ( 8 * i_dst_stride );

            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );
            ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
    else if( 0 == i_height % 8 )
    {
        copy_16multx8mult_msa( p_src, i_src_stride,
                               p_dst, i_dst_stride, i_height, 16 );
    }
    else if( 0 == i_height % 4 )
    {
        for( i_cnt = ( i_height >> 2 ); i_cnt--; )
        {
            LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
            p_src += ( 4 * i_src_stride );

            ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
            p_dst += ( 4 * i_dst_stride );
        }
    }
}
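
/* Rounding average of two reference blocks, ( a + b + 1 ) >> 1 per byte
 * via AVER_UB, i.e. the unweighted bi-prediction case. */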
static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                uint8_t *p_src2, int32_t i_src2_stride,
                                uint8_t *p_dst, int32_t i_dst_stride,
                                int32_t i_height )
{
    int32_t i_cnt;
    uint32_t u_out0, u_out1;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1;

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        LD_UB2( p_src1, i_src1_stride, src0, src1 );
        p_src1 += ( 2 * i_src1_stride );
        LD_UB2( p_src2, i_src2_stride, src2, src3 );
        p_src2 += ( 2 * i_src2_stride );

        AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );

        u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
        u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
        SW( u_out0, p_dst );
        p_dst += i_dst_stride;
        SW( u_out1, p_dst );
        p_dst += i_dst_stride;
    }
}
static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                uint8_t *p_src2, int32_t i_src2_stride,
                                uint8_t *p_dst, int32_t i_dst_stride,
                                int32_t i_height )
{
    int32_t i_cnt;
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3;

    for( i_cnt = ( i_height / 4 ); i_cnt--; )
    {
        LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
        p_src1 += ( 4 * i_src1_stride );
        LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
        p_src2 += ( 4 * i_src2_stride );

        AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
                     dst0, dst1, dst2, dst3 );

        u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
        u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
        u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
        u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
        SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}

static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
                                 uint8_t *p_src2, int32_t i_src2_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    int32_t i_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for( i_cnt = ( i_height / 8 ); i_cnt--; )
    {
        LD_UB8( p_src1, i_src1_stride,
                src0, src1, src2, src3, src4, src5, src6, src7 );
        p_src1 += ( 8 * i_src1_stride );
        LD_UB8( p_src2, i_src2_stride,
                dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
        p_src2 += ( 8 * i_src2_stride );

        AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                     dst0, dst1, dst2, dst3 );
        AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                     dst4, dst5, dst6, dst7 );

        ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
                p_dst, i_dst_stride );
        p_dst += ( 8 * i_dst_stride );
    }
}
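
/* Zero i_height rows of 16 bytes each starting at p_src; two rows are
 * cleared per iteration, so i_height is expected to be even. */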
static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
                                     int32_t i_height )
{
    int32_t i_cnt;
    v16u8 zero = { 0 };

    for( i_cnt = ( i_height / 2 ); i_cnt--; )
    {
        ST_UB( zero, p_src );
        p_src += i_stride;
        ST_UB( zero, p_src );
        p_src += i_stride;
    }
}
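
/* Interleave two planes into one (e.g. U and V into NV12 chroma): the
 * vector paths handle 16- and 8-pixel groups per row, scalar tails cover
 * the remaining columns and the last i_height % 4 rows. */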
static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
                                       uint8_t *p_src1, int32_t i_src1_stride,
                                       uint8_t *p_dst, int32_t i_dst_stride,
                                       int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
    v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;

    i_w_mul8 = i_width - i_width % 8;
    i_h4w = i_height - i_height % 4;

    for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
    {
        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
            ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
                    p_dst, i_dst_stride );
            ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
                    ( p_dst + 16 ), i_dst_stride );
            p_src0 += 16;
            p_src1 += 16;
            p_dst += 32;
        }

        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
        {
            LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
            LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
            ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
                        vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
            ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
                    p_dst, i_dst_stride );
            p_src0 += 8;
            p_src1 += 8;
            p_dst += 16;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst[0] = p_src0[0];
            p_dst[1] = p_src1[0];
            p_dst[i_dst_stride] = p_src0[i_src0_stride];
            p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
            p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
            p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
            p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
            p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
            p_src0 += 1;
            p_src1 += 1;
            p_dst += 2;
        }

        p_src0 += ( ( 4 * i_src0_stride ) - i_width );
        p_src1 += ( ( 4 * i_src1_stride ) - i_width );
        p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
    }

    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
    {
        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            src0 = LD_UB( p_src0 );
            src4 = LD_UB( p_src1 );
            ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
            ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 );
            p_src0 += 16;
            p_src1 += 16;
            p_dst += 32;
        }

        for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
        {
            src0 = LD_UB( p_src0 );
            src4 = LD_UB( p_src1 );
            vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
                                                 ( v16i8 ) src0 );
            ST_UB( vec_ilv_r0, p_dst );
            p_src0 += 8;
            p_src1 += 8;
            p_dst += 16;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst[0] = p_src0[0];
            p_dst[1] = p_src1[0];
            p_src0 += 1;
            p_src1 += 1;
            p_dst += 2;
        }

        p_src0 += ( i_src0_stride - i_width );
        p_src1 += ( i_src1_stride - i_width );
        p_dst += ( i_dst_stride - ( i_width * 2 ) );
    }
}
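
/* Inverse of the above: split interleaved UVUV... data into two planes
 * with pack-even/pack-odd byte operations. Despite the i_h4w name, rows
 * are grouped eight at a time here, with 4-pixel and scalar tails. */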
static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst0, int32_t dst0_stride,
                                         uint8_t *p_dst1, int32_t dst1_stride,
                                         int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
    uint32_t u_res_w0, u_res_w1;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
    v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
    uint8_t *p_dst;

    i_w_mul8 = i_width - i_width % 8;
    i_w_mul4 = i_width - i_width % 4;
    i_h4w = i_height - i_height % 8;

    for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
    {
        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
        {
            LD_UB8( p_src, i_src_stride,
                    in0, in1, in2, in3, in4, in5, in6, in7 );

            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
            ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
            p_dst = p_dst0 + 4 * dst0_stride;
            ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
            ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
            p_dst = p_dst1 + 4 * dst1_stride;
            ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
            p_src += 16;
            p_dst0 += 8;
            p_dst1 += 8;
        }

        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
        {
            LD_UB8( p_src, i_src_stride,
                    in0, in1, in2, in3, in4, in5, in6, in7 );

            PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
            PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
                         vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
            ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
            p_dst = p_dst0 + 4 * dst0_stride;
            ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
            ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
            p_dst = p_dst1 + 4 * dst1_stride;
            ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
            p_src += 8;
            p_dst0 += 4;
            p_dst1 += 4;
        }

        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0[0] = p_src[0];
            p_dst1[0] = p_src[1];
            p_dst0[dst0_stride] = p_src[i_src_stride];
            p_dst1[dst1_stride] = p_src[i_src_stride + 1];
            p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
            p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
            p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
            p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
            p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
            p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
            p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
            p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
            p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
            p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
            p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
            p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
            p_src += 2;
            p_dst0 += 1;
            p_dst1 += 1;
        }

        p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
        p_dst0 += ( ( 8 * dst0_stride ) - i_width );
        p_dst1 += ( ( 8 * dst1_stride ) - i_width );
    }

    for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
    {
        for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
        {
            in0 = LD_UB( p_src );

            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            ST8x1_UB( vec_pckev0, p_dst0 );
            ST8x1_UB( vec_pckod0, p_dst1 );
            p_src += 16;
            p_dst0 += 8;
            p_dst1 += 8;
        }

        for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
        {
            in0 = LD_UB( p_src );

            vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
                                                  ( v16i8 ) in0 );
            u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
            SW( u_res_w0, p_dst0 );
            u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
            SW( u_res_w1, p_dst1 );
            p_src += 8;
            p_dst0 += 4;
            p_dst1 += 4;
        }

        for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0[0] = p_src[0];
            p_dst1[0] = p_src[1];
            p_src += 2;
            p_dst0 += 1;
            p_dst1 += 1;
        }

        p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
        p_dst0 += ( ( dst0_stride ) - i_width );
        p_dst1 += ( ( dst1_stride ) - i_width );
    }
}
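
/* Split packed 24-bit RGB into three planes; each shuffle mask picks every
 * third byte out of a 24-byte group (8 pixels) for one destination plane. */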
static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
                                             int32_t i_src_stride,
                                             uint8_t *p_dst0,
                                             int32_t i_dst0_stride,
                                             uint8_t *p_dst1,
                                             int32_t i_dst1_stride,
                                             uint8_t *p_dst2,
                                             int32_t i_dst2_stride,
                                             int32_t i_width,
                                             int32_t i_height )
{
    uint8_t *p_src_orig = p_src;
    uint8_t *p_dst0_orig = p_dst0;
    uint8_t *p_dst1_orig = p_dst1;
    uint8_t *p_dst2_orig = p_dst2;
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 temp0, temp1, temp2, temp3;
    v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 };

    i_w_mul8 = i_width - i_width % 8;
    i_h_mul4 = i_height - i_height % 4;

    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );

            VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
            ST8x1_UB( temp0, p_dst0 );
            ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
            ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
            ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );

            VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
            ST8x1_UB( temp0, p_dst1 );
            ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
            ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
            ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );

            VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
            VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
            ST8x1_UB( temp0, p_dst2 );
            ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
            ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
            ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );

            p_src += 24;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
            p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + i_dst0_stride] =
                p_src_orig[0 + i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + i_dst1_stride] =
                p_src_orig[1 + i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + i_dst2_stride] =
                p_src_orig[2 + i_src_stride + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
                p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
                p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
                p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];

            p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
                p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
            p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
                p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
            p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
                p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
        }

        p_src_orig += ( 4 * i_src_stride );
        p_dst0_orig += ( 4 * i_dst0_stride );
        p_dst1_orig += ( 4 * i_dst1_stride );
        p_dst2_orig += ( 4 * i_dst2_stride );
    }

    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
        {
            in0 = LD_SB( p_src );
            in4 = LD_SB( p_src + 16 );
            temp0 = __msa_vshf_b( mask0, in4, in0 );
            ST8x1_UB( temp0, p_dst0 );
            temp0 = __msa_vshf_b( mask1, in4, in0 );
            ST8x1_UB( temp0, p_dst1 );
            temp0 = __msa_vshf_b( mask2, in4, in0 );
            ST8x1_UB( temp0, p_dst2 );

            p_src += 24;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
        }

        p_src_orig += ( i_src_stride );
        p_dst0_orig += ( i_dst0_stride );
        p_dst1_orig += ( i_dst1_stride );
        p_dst2_orig += ( i_dst2_stride );
    }
}
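
/* RGBA variant: with a fixed four-byte pixel the planes separate with
 * pack-even/pack-odd halfword and byte operations instead of byte
 * shuffles; the fourth (alpha) channel is discarded. */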
static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
                                              int32_t i_src_stride,
                                              uint8_t *p_dst0,
                                              int32_t i_dst0_stride,
                                              uint8_t *p_dst1,
                                              int32_t i_dst1_stride,
                                              uint8_t *p_dst2,
                                              int32_t i_dst2_stride,
                                              int32_t i_width,
                                              int32_t i_height )
{
    uint8_t *p_src_orig = p_src;
    uint8_t *p_dst0_orig = p_dst0;
    uint8_t *p_dst1_orig = p_dst1;
    uint8_t *p_dst2_orig = p_dst2;
    int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
    v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;

    i_w_mul8 = i_width - i_width % 8;
    i_h_mul4 = i_height - i_height % 4;

    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
            LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
            LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );

            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
            PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
            temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
            PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
            temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
            PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
            temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
            PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
            in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
            PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
            in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
            PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
            in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
            ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
            ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
            ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );

            p_src += 64;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
        }

        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
        {
            LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
            LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );

            PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );

            PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
            temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
            temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );

            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
            PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
            in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
            PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
            in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
            PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
            in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );

            ST8x1_UB( in0, p_dst0 );
            ST8x1_UB( in4, p_dst0 + i_dst0_stride );
            ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
            ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );

            ST8x1_UB( in1, p_dst2 );
            ST8x1_UB( in5, p_dst2 + i_dst2_stride );
            ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
            ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );

            ST8x1_UB( in2, p_dst1 );
            ST8x1_UB( in6, p_dst1 + i_dst1_stride );
            ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
            ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );

            p_src += 32;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];

            p_dst0_orig[i_dst0_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width];
            p_dst1_orig[i_dst1_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[i_dst2_stride + i_loop_width] =
                p_src_orig[i_src_stride + 4 * i_loop_width + 2];

            p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width];
            p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
                p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];

            p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width];
            p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
            p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
                p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
        }

        p_src_orig += ( 4 * i_src_stride );
        p_dst0_orig += ( 4 * i_dst0_stride );
        p_dst1_orig += ( 4 * i_dst1_stride );
        p_dst2_orig += ( 4 * i_dst2_stride );
    }

    for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
    {
        p_src = p_src_orig;
        p_dst0 = p_dst0_orig;
        p_dst1 = p_dst1_orig;
        p_dst2 = p_dst2_orig;

        for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
        {
            LD_SB4( p_src, 16, in0, in4, in8, in12 );

            PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
            PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
            /* one 16-byte store per plane covers the 16 pixels loaded */
            ST_SB( in0, p_dst0 );
            ST_SB( in1, p_dst2 );
            ST_SB( in2, p_dst1 );

            p_src += 64;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
        }

        for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
        {
            in0 = LD_SB( p_src );
            in4 = LD_SB( p_src + 16 );

            temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
            PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
            in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
            ST8x1_UB( in0, p_dst0 );
            ST8x1_UB( in1, p_dst2 );
            ST8x1_UB( in2, p_dst1 );

            p_src += 32;
            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
        }

        for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
        {
            p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
            p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
            p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
        }

        p_src_orig += ( i_src_stride );
        p_dst0_orig += ( i_dst0_stride );
        p_dst1_orig += ( i_dst1_stride );
        p_dst2_orig += ( i_dst2_stride );
    }
}
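
/* Interleave the first 8 bytes of each U and V source row into 16 output
 * bytes (NV12 order), four rows per iteration plus a single-row tail. */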
static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
                                         uint8_t *p_src1, int32_t i_src1_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride,
                                         int32_t i_height )
{
    int32_t i_loop_height, i_h4w;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3;

    i_h4w = i_height % 4;
    for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
    {
        LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 );
        p_src0 += ( 4 * i_src0_stride );
        LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 );
        p_src1 += ( 4 * i_src1_stride );
        ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3,
                    ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 );
        ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3,
                p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( i_loop_height = i_h4w; i_loop_height--; )
    {
        in0 = LD_UB( p_src0 );
        p_src0 += ( i_src0_stride );
        in1 = LD_UB( p_src1 );
        p_src1 += ( i_src1_stride );
        ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 );
        ST_UB( ilvr_vec0, p_dst );
        p_dst += ( i_dst_stride );
    }
}
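
/* Build the four half-resolution planes used by lookahead: each output
 * pixel averages a 2x2 (or half-pel shifted 2x2) source neighbourhood,
 * computed with pack-even/pack-odd plus vector averaging. */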
static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
                                        uint8_t *p_dst0, int32_t dst0_stride,
                                        uint8_t *p_dst1, int32_t dst1_stride,
                                        uint8_t *p_dst2, int32_t dst2_stride,
                                        uint8_t *p_dst3, int32_t dst3_stride,
                                        int32_t i_width, int32_t i_height )
{
    int32_t i_loop_width, i_loop_height, i_w16_mul;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
    v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
    v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 res0, res1;

    i_w16_mul = i_width - i_width % 16;
    for( i_loop_height = i_height; i_loop_height--; )
    {
        LD_UB3( p_src, i_src_stride, src0, src1, src2 );
        p_src += 16;
        for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
        {
            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
            p_src += 16;
            LD_UB3( p_src, i_src_stride, src6, src7, src8 );
            p_src += 16;
            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST_UB( res0, p_dst0 );
            ST_UB( res1, p_dst2 );

            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
            SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
            SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
                         pckev_vec0, pckev_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
                                                  ( v16i8 ) sld1_vec2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST_UB( res0, p_dst1 );
            ST_UB( res1, p_dst3 );

            src0 = src6;
            src1 = src7;
            src2 = src8;
            p_dst0 += 16;
            p_dst1 += 16;
            p_dst2 += 16;
            p_dst3 += 16;
        }

        for( i_loop_width = i_w16_mul; i_loop_width < i_width;
             i_loop_width += 8 )
        {
            LD_UB3( p_src, i_src_stride, src3, src4, src5 );
            p_src += 16;
            PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
            PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
                                                  ( v16i8 ) src2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST8x1_UB( res0, p_dst0 );
            ST8x1_UB( res1, p_dst2 );

            SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
            SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
            SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
            PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
                         pckev_vec0, pckev_vec1 );
            pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
                                                  ( v16i8 ) sld1_vec2 );
            AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
                         pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
                         tmp0, tmp1, tmp2, tmp3 );
            AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
            ST8x1_UB( res0, p_dst1 );
            ST8x1_UB( res1, p_dst3 );

            p_dst0 += 8;
            p_dst1 += 8;
            p_dst2 += 8;
            p_dst3 += 8;
        }

        p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
        p_dst0 += ( dst0_stride - i_width );
        p_dst1 += ( dst1_stride - i_width );
        p_dst2 += ( dst2_stride - i_width );
        p_dst3 += ( dst3_stride - i_width );
    }
}
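
/* Exported wrappers: x264's mc interface passes (dst, src) in that order,
 * while the local helpers take (src, dst), hence the argument swap. */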
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height )
{
    copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}

void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height )
{
    copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}

void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height )
{
    copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
}
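
/* pixel_avg wrappers: i_weight == 32 selects the plain rounding average;
 * other values use the weighted bi-prediction kernels with weights
 * i_weight and 64 - i_weight, the out-of-range case falling back to the
 * _nw kernels (presumably the variants that handle the full explicit
 * weight range). */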
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                               uint8_t *p_pix2, intptr_t pix2_stride,
                               uint8_t *p_pix3, intptr_t pix3_stride,
                               int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                             p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
                                          p_pix3, pix3_stride,
                                          p_pix1, pix1_stride,
                                          16, 5, i_weight,
                                          ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
                                       p_pix3, pix3_stride,
                                       p_pix1, pix1_stride,
                                       16, 5, i_weight,
                                       ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                             p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
                                          p_pix3, pix3_stride,
                                          p_pix1, pix1_stride,
                                          8, 5, i_weight,
                                          ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
                                       p_pix3, pix3_stride,
                                       p_pix1, pix1_stride,
                                       8, 5, i_weight,
                                       ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 16, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 16, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 8, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 8, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 4 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 4, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 4, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 16 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 16, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 16, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 8 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 8, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 8, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 4 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
                                         p_pix3, pix3_stride,
                                         p_pix1, pix1_stride, 4, 5, i_weight,
                                         ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 4, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
}

void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                             uint8_t *p_pix2, intptr_t pix2_stride,
                             uint8_t *p_pix3, intptr_t pix3_stride,
                             int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 2 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
                                   p_pix3, pix3_stride,
                                   p_pix1, pix1_stride, 5, i_weight,
                                   ( 64 - i_weight ), 0 );
    }
}
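
/* Zero an aligned buffer: the bulk is cleared 32 bytes per iteration via
 * the 16-byte store helper, any sub-32-byte tail with plain memset(). */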
void x264_memzero_aligned_msa( void *p_dst, size_t n )
{
    uint32_t u_tot32_mul_lines = n >> 5;
    uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );

    memset_zero_16width_msa( p_dst, 16, ( n / 16 ) );

    if( u_remaining )
    {
        memset( ( uint8_t * ) p_dst + ( u_tot32_mul_lines << 5 ), 0,
                u_remaining );
    }
}
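
/* Weighted-prediction wrappers: unpack the x264_weight_t scale/offset/denom
 * fields and hand off to the opscale kernels; w20 is composed of a 16-wide
 * pass followed by a 4-wide pass. */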
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;

    avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                 i_height, i_log2_denom, i_weight, i_offset );
}

void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height )
{
    x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
                            pWeight, i_height );
    x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
                           pWeight, i_height );
}
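
/* Quarter-pel luma MC: i_qpel_idx encodes the 1/4-pel phase; the two
 * half-pel planes given by x264_hpel_ref0/ref1 are averaged when an odd
 * phase component is present (i_qpel_idx & 5), then weighting is applied
 * in place if the caller supplied a weight function. */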
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx;
    int32_t i_offset;
    uint8_t *p_src1;

    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;

    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );

        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, i_dst_stride, i_height );
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }

        if( pWeight->weightfn )
        {
            if( 16 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, i_dst_stride,
                                        p_dst, i_dst_stride,
                                        pWeight, i_height );
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                       pWeight, i_height );
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                       pWeight, i_height );
            }
        }
    }
    else if( pWeight->weightfn )
    {
        if( 16 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_height );
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_height );
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_height );
        }
    }
    else
    {
        if( 16 == i_width )
        {
            copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                              i_height );
        }
        else if( 8 == i_width )
        {
            copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
        else if( 4 == i_width )
        {
            copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
    }
}
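
/* Chroma MC on interleaved (NV12) data with 1/8-pel bilinear filtering:
 * i_d8x/i_d8y are the fractional MV parts, and the complementary
 * coefficient pairs (d, 8 - d) drive the hv kernels, dispatched on block
 * width (2/4/8). */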
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height )
{
    int32_t i_d8x = m_vx & 0x07;
    int32_t i_d8y = m_vy & 0x07;
    int32_t i_coeff_horiz1 = ( 8 - i_d8x );
    int32_t i_coeff_vert1 = ( 8 - i_d8y );
    int32_t i_coeff_horiz0 = i_d8x;
    int32_t i_coeff_vert0 = i_d8y;

    p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;

    if( 2 == i_width )
    {
        avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 4 == i_width )
    {
        avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 8 == i_width )
    {
        avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
}
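
/* Half-pel plane generation in 16-column stripes: the vertical, centre
 * (hv) and horizontal 6-tap filters share the same source window; the
 * p_buf scratch argument appears to be unused by this implementation. */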
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf )
{
    for( int32_t i = 0; i < ( i_width / 16 ); i++ )
    {
        avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                             p_dst_v - 2, i_stride, i_height );
        avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                              p_dstc, i_stride, i_height );
        avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );

        p_src += 16;
        p_dst_v += 16;
        p_dsth += 16;
        p_dstc += 16;
    }
}

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height )
{
    plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
                               p_dst, i_dst_stride, i_width, i_height );
}

void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
                                 p_dst1, i_dst_stride1, i_width, i_height );
}

void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width,
                                           int32_t i_width,
                                           int32_t i_height )
{
    if( 3 == i_src_width )
    {
        plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
                                         p_dst0, i_dst_stride0,
                                         p_dst1, i_dst_stride1,
                                         p_dst2, i_dst_stride2,
                                         i_width, i_height );
    }
    else if( 4 == i_src_width )
    {
        plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
                                          p_dst0, i_dst_stride0,
                                          p_dst1, i_dst_stride1,
                                          p_dst2, i_dst_stride2,
                                          i_width, i_height );
    }
}

void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height )
{
    store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
                                 p_dst, i_dst_stride, i_height );
}

void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
                                 ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
                                 8, i_height );
}

void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height )
{
    plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
                                 ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
                                 8, i_height );
}

void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height )
{
    frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
                                p_dst1, i_dst_stride, p_dst2, i_dst_stride,
                                p_dst3, i_dst_stride, i_width, i_height );
}
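
/* get_ref: like mc_luma, but when no averaging or weighting is needed it
 * returns a pointer straight into the reference plane and reports the
 * reference stride through *p_dst_stride instead of copying. The scalar
 * i_h4w tails process the rows the 4-row vector helpers leave over. */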
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx, i_cnt, i_h4w;
    int32_t i_offset;
    uint8_t *p_src1, *src1_org;

    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;

    i_h4w = i_height - i_height % 4;

    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );

        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride,
                                 p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 16, i_src_stride,
                                p_src2 + 16, i_src_stride,
                                p_dst + 16, *p_dst_stride, i_h4w );

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
                v16u8 dst_vec0, dst_vec1;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
                src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );

                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );

                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 12 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 8, i_src_stride,
                                p_src2 + 8, i_src_stride,
                                p_dst + 8, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );

                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
            }
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );

                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );

                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;

                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );

                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );

                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }

        if( pWeight->weightfn )
        {
            int32_t i_log2_denom;
            int32_t i_offset_val;
            int32_t i_weight;

            i_log2_denom = pWeight->i_denom;
            i_offset_val = pWeight->i_offset;
            i_weight = pWeight->i_scale;

            if( 16 == i_width || 12 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt, offset_val0;
                    v8i16 denom;
                    v16i8 zero = { 0 };
                    /* widen the rounding offset into a per-row local so
                     * i_offset_val itself stays unmodified across rows */
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                  ( v8i16 ) offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 20 == i_width )
            {
                x264_mc_weight_w20_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                          offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                }
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint64_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    v16i8 zero = { 0 };
                    int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_tmp );
                    denom = __msa_fill_h( i_log2_denom );

                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
        }
    }
    else if( pWeight->weightfn )
    {
        int32_t i_offset_val, i_log2_denom, i_weight;

        i_log2_denom = pWeight->i_denom;
        i_offset_val = pWeight->i_offset;
        i_weight = pWeight->i_scale;

        i_h4w = i_height - i_height % 4;

        src1_org = p_src1;

        if( 16 == i_width || 12 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                temp0 = LW( p_src1 + 16 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t u_temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t u_temp0;
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                v16i8 zero = { 0 };
                int32_t i_offset_tmp = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_tmp += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_tmp );
                denom = __msa_fill_h( i_log2_denom );

                u_temp0 = LW( p_src1 );
                p_src1 += i_src_stride;

                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
    }
    else
    {
        *p_dst_stride = i_src_stride;
        return p_src1;
    }

    return p_dst;
}
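
/* Entry point: plug the MSA implementations into the motion-compensation
 * function table when the CPU reports MSA support. */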
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_MSA )
    {
        pf->mc_luma = x264_mc_luma_msa;
        pf->mc_chroma = x264_mc_chroma_msa;
        pf->get_ref = x264_get_ref_msa;

        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_msa;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
        pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
        pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
        pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;

        pf->weight = x264_mc_weight_wtab_msa;
        pf->offsetadd = x264_mc_weight_wtab_msa;
        pf->offsetsub = x264_mc_weight_wtab_msa;

        pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
        pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;

        pf->store_interleave_chroma = x264_store_interleave_chroma_msa;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa;

        pf->plane_copy_interleave = x264_plane_copy_interleave_msa;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa;

        pf->hpel_filter = x264_hpel_filter_msa;

        pf->memcpy_aligned = memcpy;
        pf->memzero_aligned = x264_memzero_aligned_msa;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
    }
}