/*****************************************************************************
 * quant-c.c: msa quantization and level-run
 *****************************************************************************
 * Copyright (C) 2015 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"

#if !HIGH_BIT_DEPTH
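/* Dequantize a 4x4 block of coefficients in place.  pi_dequant_mf[i_qp%6]
 * supplies the scale factors and i_qp/6 - 4 the shift: a non-negative
 * shift scales the products up, a negative one rounds and shifts down. */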
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

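/* Dequantize an 8x8 block of coefficients in place.  Same scheme as the
 * 4x4 version, with a 64-entry scale table and a shift of i_qp/6 - 6. */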
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}

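/* Dequantize a 4x4 DC block in place: all 16 coefficients share the single
 * scale factor pi_dequant_mf[i_qp%6][0] and a shift of i_qp/6 - 6. */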
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

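/* Quantize a 4x4 block in place:
 * dct[i] = sign(dct[i]) * ( ( abs(dct[i]) + bias[i] ) * mf[i] >> 16 ).
 * Returns 1 if any quantized coefficient is non-zero, else 0. */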
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

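    /* Negate the quantized magnitudes, then select the negated lanes where
     * the input coefficient was non-positive to restore the signs. */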
    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

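/* Quantize an 8x8 block in place, handled as two 32-coefficient halves
 * with the same bias/multiply/shift scheme as the 4x4 version.
 * Returns 1 if any quantized coefficient is non-zero, else 0. */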
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;

    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
                 dct_h0, dct_h1, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );

    return !!non_zero;
}

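/* Quantize a 4x4 DC block in place using a single scale factor and bias.
 * Returns 1 if any quantized coefficient is non-zero, else 0. */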
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );

    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

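/* Return the index of the last non-zero coefficient of a 64-element block.
 * Compares against zero are reduced to a 64-bit bitmask whose leading-ones
 * count equals the number of trailing zero coefficients. */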
static int32_t avc_coeff_last64_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 vec0, vec1, vec2, vec3;
    v4i32 out0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    tmp_h0 = __msa_ceqi_h( src0, 0 );
    tmp_h1 = __msa_ceqi_h( src1, 0 );
    tmp_h2 = __msa_ceqi_h( src2, 0 );
    tmp_h3 = __msa_ceqi_h( src3, 0 );
    tmp_h4 = __msa_ceqi_h( src4, 0 );
    tmp_h5 = __msa_ceqi_h( src5, 0 );
    tmp_h6 = __msa_ceqi_h( src6, 0 );
    tmp_h7 = __msa_ceqi_h( src7, 0 );

    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
                 tmp0, tmp1, tmp2, tmp3 );

    tmp0 = tmp0 & mask;
    tmp1 = tmp1 & mask;
    tmp2 = tmp2 & mask;
    tmp3 = tmp3 & mask;

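    /* Pack and horizontally add until each coefficient contributes one bit
     * of a 64-bit mask that flags the zero coefficients. */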
    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
    u_res = __msa_copy_u_w( out0, 0 );

    return ( 63 - u_res );
}

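/* Return the index of the last non-zero coefficient of a 16-element block,
 * using the same bitmask reduction as the 64-element version. */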
static int32_t avc_coeff_last16_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1;
    v8u16 tmp_h0;
    v16u8 tmp0;
    v8i16 out0, out1;
    v16i8 res0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH2( p_src, 8, src0, src1 );

    out0 = __msa_ceqi_h( src0, 0 );
    out1 = __msa_ceqi_h( src1, 0 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
    tmp0 = tmp0 & mask;
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    out0 = __msa_nloc_h( ( v8i16 ) res0 );
    u_res = __msa_copy_u_h( out0, 0 );

    return ( 15 - u_res );
}

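/* Exported wrappers: these match the prototypes expected by x264's
 * quantization function table and simply call the MSA kernels above. */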
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}

int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_non_zero, i_non_zero_acc = 0;

    for( int32_t j = 0; j < 4; j++ )
    {
        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );

        i_non_zero_acc |= ( !!i_non_zero ) << j;
    }

    return i_non_zero_acc;
}

int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
}

int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    return avc_coeff_last64_msa( p_src );
}

int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    return avc_coeff_last16_msa( p_src );
}
#endif