/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"


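/*
 * Odd half of the 8-point VC-1 inverse transform, on four 16-bit lanes at
 * a time.  With the coefficients the call sites pass (ff_p1..ff_p4 = 16,
 * 15, 9, 4) this matches the scalar reference in libavcodec/vc1dsp.c:
 *     o1 = 16*fp1 + 15*fp2 +  9*fp3 +  4*fp4
 *     o2 = 15*fp1 -  4*fp2 - 16*fp3 -  9*fp4
 *     o3 =  9*fp1 - 16*fp2 +  4*fp3 + 15*fp4
 *     o4 =  4*fp1 -  9*fp2 + 15*fp3 - 16*fp4
 * t1..t4 are scratch registers.
 */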
#define VC1_INV_TRANCS_8_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   o1,    o2,    o3,    o4,                 \
                                   t1,    t2,    t3,    t4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_p4)              \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p4"                \n\t"   \
        "paddh      "#o1"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t3"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "psubh      "#o2"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t3"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "psubh      "#o3"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t3"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p1"                \n\t"   \
        "psubh      "#o4"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o4"   ,   "#o4"   ,   "#t3"                   \n\t"   \
        "psubh      "#o4"   ,   "#o4"   ,   "#t4"                   \n\t"


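/*
 * Even half of the 8-point transform plus the final butterfly.  fp1..fp4
 * hold the even-indexed inputs, o1..o4 the odd-half sums from STEP1.
 * With ff_p1..ff_p3 = 12, 16, 6 and ff_pw the rounder r, the even part is
 *     t5/t8 = 12*(fp1 + fp2) + r  +/-  (16*fp3 + 6*fp4)
 *     t6/t7 = 12*(fp1 - fp2) + r  +/-  ( 6*fp3 - 16*fp4)
 * and the butterfly leaves t + o in fp5..fp8 and t - o in fp1..fp4.
 */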
#define VC1_INV_TRANCS_8_STEP2_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   o1,    o2,    o3,    o4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
                                                                            \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#o1"                   \n\t"   \
        "paddh      "#fp6"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "paddh      "#fp7"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "paddh      "#fp8"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
                                                                            \
        "psubh      "#fp4"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
        "psubh      "#fp3"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "psubh      "#fp2"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "psubh      "#fp1"  ,   "#fp1"  ,   "#o1"                   \n\t"


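/*
 * 4-point VC-1 inverse transform on four 16-bit lanes.  With ff_p1..ff_p3
 * = 17, 10, 22 and the inputs pre-permuted at the call sites, this is the
 * scalar
 *     t1/t2 = 17*(fp1 +/- fp2) + r
 *     t3    = 10*fp3 + 22*fp4
 *     t4    = 22*fp3 - 10*fp4
 * returning t1+t3, t2-t4, t2+t4, t1-t3 in fp1..fp4 (fp5..fp8 are scratch).
 */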
#define VC1_INV_TRANCS_4_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "psubh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"


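/*
 * Add the transformed residual in fp1..fp4 to destination pixels held in
 * the low halves of fp5..fp8: widen the bytes to 16 bits, add, and pack
 * back with unsigned saturation.
 */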
#define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4,                      \
                                   fp5, fp6, fp7, fp8, zero)                \
        "punpcklbh  "#fp5"  ,   "#fp5"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp6"  ,   "#fp6"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp7"  ,   "#fp7"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp8"  ,   "#fp8"  ,   "#zero"                 \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp1"  ,   "#fp5"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp2"  ,   "#fp6"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp3"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp4"  ,   "#fp4"  ,   "#fp8"                  \n\t"   \
                                                                            \
        "packushb   "#fp1"  ,   "#fp1"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp2"  ,   "#fp2"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp3"  ,   "#fp3"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp4"  ,   "#fp4"  ,   "#zero"                 \n\t"


/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

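    /* Fold the DC gain of both 8-point passes into the DC coefficient:
     * (3 * dc +  1) >> 1 == (12 * dc +  4) >> 3 (first pass),
     * (3 * dc + 16) >> 5 == (12 * dc + 64) >> 7 (second pass). */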
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],       -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),          [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    int16_t *src = block;
    int16_t *dst = temp;
    double ftmp[16];
    uint32_t count, tmp[1];

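    /* Pass 1: 8-point transform down the columns, four columns per
     * iteration, with the two result halves transposed into temp so the
     * second pass can run the same column code on the other axis. */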
    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDC1(%[ftmp3], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = temp;
    dst = block;

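    /* Pass 2: the same 8-point transform on the transposed data, now with
     * rounder 64 and shift 7; ff_pw_1 is added to the difference outputs
     * first, matching the "+ 1" of the scalar reference's second pass. */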
    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        MMI_SDC1(%[ftmp4], %[dst], 0x40)
        MMI_SDC1(%[ftmp3], %[dst], 0x50)
        MMI_SDC1(%[ftmp2], %[dst], 0x60)
        MMI_SDC1(%[ftmp1], %[dst], 0x70)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x08                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

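    /* DC gain of the 8-point row pass, then of the 4-point column pass:
     * ( 3 * dc +  1) >> 1 == (12 * dc +  4) >> 3,
     * (17 * dc + 64) >> 7. */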
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

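    /* Pass 1: 8-point transform along the rows; the 4x4 tiles are
     * transposed first so the eight coefficients of a row sit in eight
     * registers, four rows being processed in parallel. */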
    // 1st loop
    __asm__ volatile (
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)
        MMI_LDC1(%[ftmp3], %[src], 0x10)
        MMI_LDC1(%[ftmp4], %[src], 0x18)
        MMI_LDC1(%[ftmp5], %[src], 0x20)
        MMI_LDC1(%[ftmp6], %[src], 0x28)
        MMI_LDC1(%[ftmp7], %[src], 0x30)
        MMI_LDC1(%[ftmp8], %[src], 0x38)

        //             a1        b1        a3        b2
        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        //             a2        b3        a4        b4
        TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        // input b1 b2 b3 b4
        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])
        // input a1 a2 a3 a4
        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
                                   %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                    %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp3], %[dst], 0x00)
        MMI_SDC1(%[ftmp7], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp6], %[dst], 0x08)
        MMI_SDC1(%[ftmp5], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [src]"r"(src),                [dst]"r"(dst),
          [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)

        // dest high 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x38)
        MMI_LDC1(%[ftmp4], %[src], 0x18)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x04)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 part of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

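    /* DC gain of the 4-point row pass ((17 * dc + 4) >> 3), then of the
     * 8-point column pass ((12 * dc + 64) >> 7). */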
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count, tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

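    /* Pass 1: 4-point transform along the rows (transposed before and
     * after so the four row coefficients sit in four registers), two
     * groups of four rows. */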
    // 1st loop
    __asm__ volatile (
        "li         %[count],   0x02                                    \n\t"
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x40                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"

        // dest low
        MMI_LWC1(%[ftmp9], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest high
        MMI_LWC1(%[ftmp9], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest low
        MMI_SWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp8], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        // dest high
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp1], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),
          [dest]"+&r"(dest)
        : [src]"r"(src),                [linesize]"r"((mips_reg)linesize),
          [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

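    /* DC gain of the 4-point row and column passes. */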
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[12];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

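    /* Pass 1: 4-point transform along the rows, via a transpose before
     * and after; pass 2 below runs down the columns and adds to dest. */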
    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        "xor        %[ftmp9],   %[ftmp9],  %[ftmp9]                     \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2];
        b  = src[-1];
        c  = src[0];
        d  = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src    += stride;
        rnd     = !rnd;
    }
}

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += 8;
        left  += 8;
        rnd2   = 7 - rnd2;
        rnd1   = 7 - rnd1;
    }
}

/* Apply overlap transform to vertical edge */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}

/**
 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the pixels to filter
1101  * @param stride block stride
1102  * @param pq block quantizer
1103  * @return whether other 3 pairs should be filtered or not
1104  * @see 8.6
1105  */
1106 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1107 {
1108     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1109               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1110     int a0_sign = a0 >> 31;        /* Store sign */
1111
1112     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1113     if (a0 < pq) {
1114         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1115                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1116         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1117                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1118         if (a1 < a0 || a2 < a0) {
1119             int clip      = src[-1 * stride] - src[0 * stride];
1120             int clip_sign = clip >> 31;
1121
1122             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1123             if (clip) {
1124                 int a3     = FFMIN(a1, a2);
1125                 int d      = 5 * (a3 - a0);
1126                 int d_sign = (d >> 31);
1127
1128                 d       = ((d ^ d_sign) - d_sign) >> 3;
1129                 d_sign ^= a0_sign;
1130
1131                 if (d_sign ^ clip_sign)
1132                     d = 0;
1133                 else {
1134                     d = FFMIN(d, clip);
1135                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1136                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1137                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1138                 }
1139                 return 1;
1140             }
1141         }
1142     }
1143     return 0;
1144 }
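/* Sign-mask arithmetic used above: for two's-complement x, x >> 31 is 0
 * when x >= 0 and -1 (all ones) otherwise, so (x ^ sign) - sign equals
 * FFABS(x). Example: a0 = -5 gives a0_sign = -1 and
 * (-5 ^ -1) - (-1) = 4 + 1 = 5. */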
1145
1146 /**
1147  * VC-1 in-loop deblocking filter
1148  * @param src source block
1149  * @param step distance between horizontally adjacent elements
1150  * @param stride distance between vertically adjacent elements
1151  * @param len edge length to filter (4, 8 or 16 pixels)
1152  * @param pq block quantizer
1153  * @see 8.6
1154  */
1155 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1156                                    int len, int pq)
1157 {
1158     int i;
1159     int filt3;
1160
1161     for (i = 0; i < len; i += 4) {
1162         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1163         if (filt3) {
1164             vc1_filter_line(src + 0 * step, stride, pq);
1165             vc1_filter_line(src + 1 * step, stride, pq);
1166             vc1_filter_line(src + 3 * step, stride, pq);
1167         }
1168         src += step * 4;
1169     }
1170 }
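/* The edge is processed in groups of 4 lines; the third line of each
 * group (src + 2 * step) is filtered first and, per the @return note on
 * vc1_filter_line(), decides whether the other three are filtered. */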
1171
1172 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1173 {
1174     vc1_loop_filter(src, 1, stride, 4, pq);
1175 }
1176
1177 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1178 {
1179     vc1_loop_filter(src, stride, 1, 4, pq);
1180 }
1181
1182 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1183 {
1184     vc1_loop_filter(src, 1, stride, 8, pq);
1185 }
1186
1187 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1188 {
1189     vc1_loop_filter(src, stride, 1, 8, pq);
1190 }
1191
1192 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1193 {
1194     vc1_loop_filter(src, 1, stride, 16, pq);
1195 }
1196
1197 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1198 {
1199     vc1_loop_filter(src, stride, 1, 16, pq);
1200 }
1201
1202 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1203                                ptrdiff_t stride, int rnd)
1204 {
1205     ff_put_pixels8_8_mmi(dst, src, stride, 8);
1206 }
1207 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1208                                   ptrdiff_t stride, int rnd)
1209 {
1210     ff_put_pixels16_8_mmi(dst, src, stride, 16);
1211 }
1212 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1213                                ptrdiff_t stride, int rnd)
1214 {
1215     ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1216 }
1217 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1218                                   ptrdiff_t stride, int rnd)
1219 {
1220     ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1221 }
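/* mc00 is the integer-pel position: no filtering is needed, so these
 * wrappers reduce to the plain hpeldsp copy/average primitives. */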
1222
1223 #define OP_PUT(S, D)
1224 #define OP_AVG(S, D)                                                        \
1225     "ldc1       $f16,   "#S"                        \n\t"                   \
1226     "pavgb      "#D",   "#D",   $f16                \n\t"
1227
1228 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1229 #define NORMALIZE_MMI(SHIFT)                                                \
1230     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1231     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1232     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1233     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1234
1235 #define TRANSFER_DO_PACK(OP)                                                \
1236     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1237     OP((%[dst]), $f6)                                                       \
1238     "sdc1       $f6,    0x00(%[dst])                \n\t"
1239
1240 #define TRANSFER_DONT_PACK(OP)                                              \
1241      OP(0(%[dst]), $f6)                                                     \
1242      OP(8(%[dst]), $f8)                                                     \
1243      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1244      "sdc1      $f8,    0x08(%[dst])                \n\t"
1245
1246 /** Unpack helpers passed to MSPEL_FILTER13_CORE as its UNPACK macro */
1247 #define DO_UNPACK(reg)                                                      \
1248     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1249 #define DONT_UNPACK(reg)
1250
1251 /** Load the rounder 32-r or 8-r and unpack it to all four halfwords of $f14 */
1252 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1253     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1254     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1255     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1256
1257
1258 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1259     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1260     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1261     MMI_ULWC1(R0, $9, 0x00)                                                 \
1262     "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1263     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1264     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1265     MMI_ULWC1(R3, $9, 0x00)                                                 \
1266     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1267     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1268     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1269     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1270     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1271     MMI_SDC1(R1, %[dst], OFF)                                               \
1272     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1273
1274 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1275 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1276                                        const uint8_t *src, mips_reg stride,
1277                                        int rnd, int64_t shift)
1278 {
1279     DECLARE_VAR_LOW32;
1280     DECLARE_VAR_ADDRT;
1281
1282     __asm__ volatile(
1283         "xor        $f0,    $f0,    $f0             \n\t"
1284         "li         $8,     0x03                    \n\t"
1285         LOAD_ROUNDER_MMI("%[rnd]")
1286         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1287         "1:                                         \n\t"
1288         MMI_ULWC1($f4, %[src], 0x00)
1289         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1290         MMI_ULWC1($f6, %[src], 0x00)
1291         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1292         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1293         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1294         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1295         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1296         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1297         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1298         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1299         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1300         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1301         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1302         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1303         "addiu      $8,     $8,    -0x01            \n\t"
1304         "bnez       $8,     1b                      \n\t"
1305         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1306           [src]"+r"(src),               [dst]"+r"(dst)
1307         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1308           [shift]"f"(shift),            [rnd]"m"(rnd),
1309           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1310         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1311           "$f14", "$f16", "memory"
1312     );
1313 }
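/* Loop shape: $8 counts 3 groups of 4 columns (each 24-byte tmp row
 * holds 12 int16 values); every group runs 8 SHIFT2_LINEs down the
 * rows, after which subtracting [stride2] = 9*stride-4 moves src back
 * up 9 rows and right by the 4 columns just completed. */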
1314
1315 /**
1316  * Data is already unpacked, so some operations can be performed directly
1317  * from memory.
1318  */
1319 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1320 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1321                                              const int16_t *src, int rnd)   \
1322 {                                                                           \
1323     int h = 8;                                                              \
1324     DECLARE_VAR_ALL64;                                                      \
1325     DECLARE_VAR_ADDRT;                                                      \
1326                                                                             \
1327     src -= 1;                                                               \
1328     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1329                                                                             \
1330     __asm__ volatile(                                                       \
1331         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1332         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1333         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1334         "1:                                         \n\t"                   \
1335         MMI_ULDC1($f2, %[src], 0x00)                                        \
1336         MMI_ULDC1($f4, %[src], 0x08)                                        \
1337         MMI_ULDC1($f6, %[src], 0x02)                                        \
1338         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1339         MMI_ULDC1($f0, %[src], 0x06)                                        \
1340         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1341         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1342         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1343         MMI_ULDC1($f0, %[src], 0x04)                                        \
1344         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1345         MMI_ULDC1($f0, %[src], 0x0c)                                        \
1346         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1347         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1348         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1349         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1350         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1351         "li         $8,     0x07                    \n\t"                   \
1352         "mtc1       $8,     $f16                    \n\t"                   \
1353         NORMALIZE_MMI("$f16")                                               \
1354         /* Remove bias */                                                   \
1355         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1356         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1357         TRANSFER_DO_PACK(OP)                                                \
1358         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1359         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1360         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1361         "bnez       %[h],   1b                      \n\t"                   \
1362         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1363           [h]"+r"(h),                                                       \
1364           [src]"+r"(src),               [dst]"+r"(dst)                      \
1365         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1366           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1367         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1368           "$f16", "memory"                                                  \
1369     );                                                                      \
1370 }
1371
1372 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1373 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
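/* Bias bookkeeping in the macro above: the rounder is pre-biased by
 * -(-1+9+9-1)*1024 = -16384, which after the >>7 shift comes out as
 * -128 per lane; the two paddh with ff_pw_128 cancel it again. The
 * detour keeps the intermediate 16-bit sums inside the signed range. */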
1374
1375 /**
1376  * Purely vertical or horizontal 1/2 shift interpolation.
1377  * Sacrifice $f12 for the *9 factor.
1378  */
1379 #define VC1_SHIFT2(OP, OPNAME)\
1380 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1381                                      mips_reg stride, int rnd,              \
1382                                      mips_reg offset)                       \
1383 {                                                                           \
1384     DECLARE_VAR_LOW32;                                                      \
1385     DECLARE_VAR_ADDRT;                                                      \
1386                                                                             \
1387     rnd = 8 - rnd;                                                          \
1388                                                                             \
1389     __asm__ volatile(                                                       \
1390         "xor        $f0,    $f0,    $f0             \n\t"                   \
1391         "li         $10,    0x08                    \n\t"                   \
1392         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1393         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1394         "1:                                         \n\t"                   \
1395         MMI_ULWC1($f6, %[src], 0x00)                                        \
1396         MMI_ULWC1($f8, %[src], 0x04)                                        \
1397         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1398         MMI_ULWC1($f2, $9, 0x00)                                            \
1399         MMI_ULWC1($f4, $9, 0x04)                                            \
1400         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1401         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1402         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1403         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1404         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1405         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1406         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1407         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1408         MMI_ULWC1($f2, $9, 0x00)                                            \
1409         MMI_ULWC1($f4, $9, 0x04)                                            \
1410         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1411         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1412         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1413         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1414         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1415         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1416         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1417         MMI_ULWC1($f2, $9, 0x00)                                            \
1418         MMI_ULWC1($f4, $9, 0x04)                                            \
1419         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1420         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1421         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1422         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1423         "li         $8,     0x04                    \n\t"                   \
1424         "mtc1       $8,     $f16                    \n\t"                   \
1425         NORMALIZE_MMI("$f16")                                               \
1426         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1427         OP((%[dst]), $f6)                                                   \
1428         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1429         "addiu      $10,    $10,   -0x01            \n\t"                   \
1430         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1431         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1432         "bnez       $10,    1b                      \n\t"                   \
1433         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1434           [src]"+r"(src),               [dst]"+r"(dst)                      \
1435         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1436           [stride]"g"(stride),          [rnd]"m"(rnd),                      \
1437           [stride1]"g"(stride-offset),                                      \
1438           [ff_pw_9]"m"(ff_pw_9)                                             \
1439         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1440           "$f12", "$f14", "$f16", "memory"                                  \
1441     );                                                                      \
1442 }
1443
1444 VC1_SHIFT2(OP_PUT, put_)
1445 VC1_SHIFT2(OP_AVG, avg_)
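/* These implement the half-pel (-1,9,9,-1)/16 filter (mode 2 of the
 * scalar vc1_mspel_filter() in vc1dsp.c): offset = stride selects the
 * vertical variant and offset = 1 the horizontal one, with rounder
 * 8 - rnd and the final >>4 loaded into $f16 above. */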
1446
1447 /**
1448  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1449  *
1450  * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
1451  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1452  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1453  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1454  * @param A2      Stride address of 2nd tap
1455  * @param A3      Stride address of 3rd tap
1456  * @param A4      Stride address of 4th tap
1457  */
1458 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1459     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1460     LOAD($f2, $9, M*0)                                                      \
1461     LOAD($f4, $9, M*4)                                                      \
1462     UNPACK("$f2")                                                           \
1463     UNPACK("$f4")                                                           \
1464     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1465     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1466     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1467     LOAD($f6, $9, M*0)                                                      \
1468     LOAD($f8, $9, M*4)                                                      \
1469     UNPACK("$f6")                                                           \
1470     UNPACK("$f8")                                                           \
1471     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1472     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1473     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1474     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1475     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1476     LOAD($f2, $9, M*0)                                                      \
1477     LOAD($f4, $9, M*4)                                                      \
1478     UNPACK("$f2")                                                           \
1479     UNPACK("$f4")                                                           \
1480     "li         $8,     0x02                    \n\t"                       \
1481     "mtc1       $8,     $f16                    \n\t"                       \
1482     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1483     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1484     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1485     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1486     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1487     LOAD($f2, $9, M*0)                                                      \
1488     LOAD($f4, $9, M*4)                                                      \
1489     UNPACK("$f2")                                                           \
1490     UNPACK("$f4")                                                           \
1491     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1492     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1493     "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1494     "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1495
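/* Net effect on $f6/$f8 (the low/high halves of the row):
 * -3*p(A1) + 18*p(A2) + 53*p(A3) - 4*p(A4). The taps sum to 64, hence
 * the >>6 (or the bias-adjusted >>7) applied by the callers below. */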
1496 /**
1497  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1498  * Here, offset=src_stride. Parameters passed A1 to A4 must use %[stride_x1]
1499  * (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1500  *
1501  * @param  NAME   Either 1 or 3
1502  * @see MSPEL_FILTER13_CORE for information on A1->A4
1503  */
1504 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1505 static void                                                                 \
1506 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1507                                  mips_reg src_stride,                       \
1508                                  int rnd, int64_t shift)                    \
1509 {                                                                           \
1510     int h = 8;                                                              \
1511     DECLARE_VAR_LOW32;                                                      \
1512     DECLARE_VAR_ADDRT;                                                      \
1513                                                                             \
1514     src -= src_stride;                                                      \
1515                                                                             \
1516     __asm__ volatile(                                                       \
1517         "xor        $f0,    $f0,    $f0             \n\t"                   \
1518         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1519         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1520         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1521         ".p2align 3                                 \n\t"                   \
1522         "1:                                         \n\t"                   \
1523         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1524         NORMALIZE_MMI("%[shift]")                                           \
1525         TRANSFER_DONT_PACK(OP_PUT)                                          \
1526         /* Last 3 (in fact 4) bytes on the line */                          \
1527         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1528         MMI_ULWC1($f2, $9, 0x08)                                            \
1529         DO_UNPACK("$f2")                                                    \
1530         "mov.d      $f6,    $f2                     \n\t"                   \
1531         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1532         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1533         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1534         MMI_ULWC1($f6, $9, 0x08)                                            \
1535         DO_UNPACK("$f6")                                                    \
1536         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1537         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1538         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1539         MMI_ULWC1($f2, $9, 0x08)                                            \
1540         DO_UNPACK("$f2")                                                    \
1541         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1542         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1543         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1544         MMI_ULWC1($f2, $9, 0x08)                                            \
1545         DO_UNPACK("$f2")                                                    \
1546         "li         $8,     0x02                    \n\t"                   \
1547         "mtc1       $8,     $f16                    \n\t"                   \
1548         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1549         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1550         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1551         "li         $8,     0x06                    \n\t"                   \
1552         "mtc1       $8,     $f16                    \n\t"                   \
1553         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1554         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1555         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1556         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1557         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1558         "bnez       %[h],   1b                      \n\t"                   \
1559         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1560           [h]"+r"(h),                                                       \
1561           [src]"+r"(src),               [dst]"+r"(dst)                      \
1562         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1563           [stride_x3]"r"(3*src_stride),                                     \
1564           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1565           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1566           [ff_pw_3]"f"(ff_pw_3)                                             \
1567         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1568           "$f14", "$f16", "memory"                                          \
1569     );                                                                      \
1570 }
1571
1572 /**
1573  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1574  * Here the data is 16 bits wide, so parameters A1 to A4 are plain byte offsets.
1575  *
1576  * @param  NAME   Either 1 or 3
1577  * @see MSPEL_FILTER13_CORE for information on A1->A4
1578  */
1579 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1580 static void                                                                 \
1581 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1582                                        const int16_t *src, int rnd)         \
1583 {                                                                           \
1584     int h = 8;                                                              \
1585     DECLARE_VAR_ALL64;                                                      \
1586     DECLARE_VAR_ADDRT;                                                      \
1587                                                                             \
1588     src -= 1;                                                               \
1589     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1590                                                                             \
1591     __asm__ volatile(                                                       \
1592         "xor        $f0,    $f0,    $f0             \n\t"                   \
1593         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1594         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1595         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1596         ".p2align 3                                 \n\t"                   \
1597         "1:                                         \n\t"                   \
1598         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1599         "li         $8,     0x07                    \n\t"                   \
1600         "mtc1       $8,     $f16                    \n\t"                   \
1601         NORMALIZE_MMI("$f16")                                               \
1602         /* Remove bias */                                                   \
1603         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1604         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1605         TRANSFER_DO_PACK(OP)                                                \
1606         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1607         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1608         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1609         "bnez       %[h],   1b                      \n\t"                   \
1610         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1611           [h]"+r"(h),                                                       \
1612           [src]"+r"(src),               [dst]"+r"(dst)                      \
1613         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1614           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1615           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
1616         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1617           "$f14", "$f16", "memory"                                          \
1618     );                                                                      \
1619 }
1620
1621 /**
1622  * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
1623  * Here, offset is the tap spacing. Parameters passed A1 to A4 must use
1624  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
1625  *
1626  * @param  NAME   Either 1 or 3
1627  * @see MSPEL_FILTER13_CORE for information on A1->A4
1628  */
1629 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
1630 static void                                                                 \
1631 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
1632                               mips_reg stride, int rnd, mips_reg offset)    \
1633 {                                                                           \
1634     int h = 8;                                                              \
1635     DECLARE_VAR_LOW32;                                                      \
1636     DECLARE_VAR_ADDRT;                                                      \
1637                                                                             \
1638     src -= offset;                                                          \
1639     rnd = 32-rnd;                                                           \
1640                                                                             \
1641     __asm__ volatile (                                                      \
1642         "xor        $f0,    $f0,    $f0             \n\t"                   \
1643         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1644         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1645         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1646         ".p2align 3                                 \n\t"                   \
1647         "1:                                         \n\t"                   \
1648         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1649         "li         $8,     0x06                    \n\t"                   \
1650         "mtc1       $8,     $f16                    \n\t"                   \
1651         NORMALIZE_MMI("$f16")                                               \
1652         TRANSFER_DO_PACK(OP)                                                \
1653         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
1654         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
1655         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
1656         "bnez       %[h],   1b                      \n\t"                   \
1657         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1658           [h]"+r"(h),                                                       \
1659           [src]"+r"(src),               [dst]"+r"(dst)                      \
1660         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
1661           [offset_x3]"r"(3*offset),     [stride]"g"(stride),                \
1662           [rnd]"m"(rnd),                                                    \
1663           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1664           [ff_pw_3]"f"(ff_pw_3)                                             \
1665         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1666           "$f14", "$f16", "memory"                                          \
1667     );                                                                      \
1668 }
1669
1670
1671 /** 1/4 shift bicubic interpolation */
1672 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1673 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1674 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1675 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1676 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1677
1678 /** 3/4 shift bicubic interpolation */
1679 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1680 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1681 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1682 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1683 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
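/* For reference, a scalar sketch of the filters the shift1/shift3
 * kernels above implement, mirroring the C fallback vc1_mspel_filter()
 * in libavcodec/vc1dsp.c (illustration only, excluded from the build;
 * the helper name is ours): */
#if 0
static inline int mspel_filter_sketch(const uint8_t *p, int step,
                                      int mode, int rnd)
{
    switch (mode) {
    case 1: /* 1/4 shift */
        return (-4 * p[-step] + 53 * p[0] +
                18 * p[step] - 3 * p[2 * step] + 32 - rnd) >> 6;
    case 3: /* 3/4 shift: the same taps, mirrored */
        return (-3 * p[-step] + 18 * p[0] +
                53 * p[step] - 4 * p[2 * step] + 32 - rnd) >> 6;
    }
    return p[0]; /* integer position */
}
#endif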
1684
1685 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1686              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
1687               int64_t shift);
1688 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1689              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
1690 typedef void (*vc1_mspel_mc_filter_8bits)
1691              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1692               mips_reg offset);
1693
1694 /**
1695  * Interpolate fractional pel values by applying proper vertical then
1696  * horizontal filter.
1697  *
1698  * @param  dst     Destination buffer for interpolated pels.
1699  * @param  src     Source buffer.
1700  * @param  stride  Stride for both src and dst buffers.
1701  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
1702  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
1703  * @param  rnd     Rounding bias.
1704  */
1705 #define VC1_MSPEL_MC(OP)                                                    \
1706 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1707                                int hmode, int vmode, int rnd)               \
1708 {                                                                           \
1709     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1710          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
1711                  vc1_put_ver_16b_shift2_mmi,                                \
1712                  vc1_put_ver_16b_shift3_mmi };                              \
1713     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1714          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
1715                  OP ## vc1_hor_16b_shift2_mmi,                              \
1716                  OP ## vc1_hor_16b_shift3_mmi };                            \
1717     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
1718          { NULL, OP ## vc1_shift1_mmi,                                      \
1719                  OP ## vc1_shift2_mmi,                                      \
1720                  OP ## vc1_shift3_mmi };                                    \
1721                                                                             \
1722     if (vmode) { /* Vertical filter to apply */                             \
1723         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
1724             static const int shift_value[] = { 0, 5, 1, 5 };                \
1725             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
1726             int    r;                                                       \
1727             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
1728                                                                             \
1729             r = (1<<(shift-1)) + rnd-1;                                     \
1730             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
1731                                                                             \
1732             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
1733             return;                                                         \
1734         }                                                                   \
1735         else { /* No horizontal filter, output 8 lines to dst */            \
1736             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
1737             return;                                                         \
1738         }                                                                   \
1739     }                                                                       \
1740                                                                             \
1741     /* Horizontal mode with no vertical mode */                             \
1742     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
1743 }                                                                           \
1744 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
1745                                   int stride, int hmode, int vmode, int rnd)\
1746 {                                                                           \
1747     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1748     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1749     dst += 8*stride; src += 8*stride;                                       \
1750     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1751     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1752 }
1753
1754 VC1_MSPEL_MC(put_)
1755 VC1_MSPEL_MC(avg_)
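/* Dispatch example: hmode=1, vmode=2 first runs vc1_put_ver_16b_shift2_mmi
 * into the 12x8 int16 tmp buffer, then put_/avg_vc1_hor_16b_shift1_mmi
 * from tmp+1 into dst; when only one of hmode/vmode is set, a single
 * 8-bit vc1_shift[123] kernel writes dst directly. */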
1756
1757 /** Macro to ease the declaration of the bicubic filter interpolation functions */
1758 #define DECLARE_FUNCTION(a, b)                                              \
1759 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1760                                            const uint8_t *src,              \
1761                                            ptrdiff_t stride,                \
1762                                            int rnd)                         \
1763 {                                                                           \
1764      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1765 }                                                                           \
1766 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1767                                            const uint8_t *src,              \
1768                                            ptrdiff_t stride,                \
1769                                            int rnd)                         \
1770 {                                                                           \
1771      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1772 }                                                                           \
1773 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1774                                               const uint8_t *src,           \
1775                                               ptrdiff_t stride,             \
1776                                               int rnd)                      \
1777 {                                                                           \
1778      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1779 }                                                                           \
1780 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1781                                               const uint8_t *src,           \
1782                                               ptrdiff_t stride,             \
1783                                               int rnd)                      \
1784 {                                                                           \
1785      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1786 }
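/* Instantiate all 15 fractional (hmode, vmode) positions; (0, 0) is the
 * integer-pel case served by the mc00 wrappers above. */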
1787
1788 DECLARE_FUNCTION(0, 1)
1789 DECLARE_FUNCTION(0, 2)
1790 DECLARE_FUNCTION(0, 3)
1791
1792 DECLARE_FUNCTION(1, 0)
1793 DECLARE_FUNCTION(1, 1)
1794 DECLARE_FUNCTION(1, 2)
1795 DECLARE_FUNCTION(1, 3)
1796
1797 DECLARE_FUNCTION(2, 0)
1798 DECLARE_FUNCTION(2, 1)
1799 DECLARE_FUNCTION(2, 2)
1800 DECLARE_FUNCTION(2, 3)
1801
1802 DECLARE_FUNCTION(3, 0)
1803 DECLARE_FUNCTION(3, 1)
1804 DECLARE_FUNCTION(3, 2)
1805 DECLARE_FUNCTION(3, 3)
1806
1807 #define CHROMA_MC_8_MMI                                                     \
1808         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
1809         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1810         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
1811         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1812         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
1813         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1814         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
1815         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1816                                                                             \
1817         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1818         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
1819         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1820         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
1821         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1822         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
1823         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1824         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
1825                                                                             \
1826         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1827         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1828         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1829         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1830                                                                             \
1831         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
1832         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
1833         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
1834         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
1835                                                                             \
1836         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
1837         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
1838         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
1839
1840
1841 #define CHROMA_MC_4_MMI                                                     \
1842         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1843         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1844         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1845         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1846                                                                             \
1847         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1848         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1849         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1850         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1851                                                                             \
1852         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1853         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1854         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1855         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1856                                                                             \
1857         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
1858         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
1859
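/* Per output pixel, the CHROMA_MC_* kernels compute the no_rnd bilinear
 * blend (A*p00 + B*p01 + C*p10 + D*p11 + 28) >> 6, with ff_pw_28 as the
 * 32-4 rounder of the C reference. A scalar sketch (illustration only,
 * excluded from the build; the helper name is ours):
 */
#if 0
static void chroma_mc_sketch(uint8_t *dst, const uint8_t *src,
                             int stride, int h, int w, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            dst[j] = (A * src[j] + B * src[j + 1] + C * src[j + stride] +
                      D * src[j + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif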
1860
1861 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1862                                       uint8_t *src /* align 1 */,
1863                                       int stride, int h, int x, int y)
1864 {
1865     const int A = (8 - x) * (8 - y);
1866     const int B =     (x) * (8 - y);
1867     const int C = (8 - x) *     (y);
1868     const int D =     (x) *     (y);
1869     double ftmp[10];
1870     uint32_t tmp[1];
1871     DECLARE_VAR_ALL64;
1872     DECLARE_VAR_ADDRT;
1873
1874     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1875
1876     __asm__ volatile(
1877         "li         %[tmp0],    0x06                                    \n\t"
1878         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1879         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1880         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1881         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1882         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1883         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1884
1885         "1:                                                             \n\t"
1886         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1887         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1888         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1889         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1890         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1891
1892         CHROMA_MC_8_MMI
1893
1894         MMI_SDC1(%[ftmp1], %[dst], 0x00)
1895         "addiu      %[h],       %[h],      -0x01                        \n\t"
1896         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1897         "bnez       %[h],       1b                                      \n\t"
1898         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1899           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1900           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1901           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1902           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1903           RESTRICT_ASM_ALL64
1904           RESTRICT_ASM_ADDRT
1905           [tmp0]"=&r"(tmp[0]),
1906           [src]"+&r"(src),              [dst]"+&r"(dst),
1907           [h]"+&r"(h)
1908         : [stride]"r"((mips_reg)stride),
1909           [A]"f"(A),                    [B]"f"(B),
1910           [C]"f"(C),                    [D]"f"(D),
1911           [ff_pw_28]"f"(ff_pw_28)
1912         : "memory"
1913     );
1914 }
1915
1916 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1917                                       uint8_t *src /* align 1 */,
1918                                       int stride, int h, int x, int y)
1919 {
1920     const int A = (8 - x) * (8 - y);
1921     const int B =     (x) * (8 - y);
1922     const int C = (8 - x) *     (y);
1923     const int D =     (x) *     (y);
1924     double ftmp[6];
1925     uint32_t tmp[1];
1926     DECLARE_VAR_LOW32;
1927     DECLARE_VAR_ADDRT;
1928
1929     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1930
1931     __asm__ volatile(
1932         "li         %[tmp0],    0x06                                    \n\t"
1933         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1934         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
1935         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1936         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1937         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1938         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1939
1940         "1:                                                             \n\t"
1941         MMI_ULWC1(%[ftmp1], %[src], 0x00)
1942         MMI_ULWC1(%[ftmp2], %[src], 0x01)
1943         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1944         MMI_ULWC1(%[ftmp3], %[src], 0x00)
1945         MMI_ULWC1(%[ftmp4], %[src], 0x01)
1946
1947         CHROMA_MC_4_MMI
1948
1949         MMI_SWC1(%[ftmp1], %[dst], 0x00)
1950         "addiu      %[h],       %[h],      -0x01                        \n\t"
1951         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1952         "bnez       %[h],       1b                                      \n\t"
1953         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1954           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1955           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1956           [tmp0]"=&r"(tmp[0]),
1957           RESTRICT_ASM_LOW32
1958           RESTRICT_ASM_ADDRT
1959           [src]"+&r"(src),              [dst]"+&r"(dst),
1960           [h]"+&r"(h)
1961         : [stride]"r"((mips_reg)stride),
1962           [A]"f"(A),                    [B]"f"(B),
1963           [C]"f"(C),                    [D]"f"(D),
1964           [ff_pw_28]"f"(ff_pw_28)
1965         : "memory"
1966     );
1967 }
1968
1969 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1970                                       uint8_t *src /* align 1 */,
1971                                       int stride, int h, int x, int y)
1972 {
1973     const int A = (8 - x) * (8 - y);
1974     const int B =     (x) * (8 - y);
1975     const int C = (8 - x) *     (y);
1976     const int D =     (x) *     (y);
1977     double ftmp[10];
1978     uint32_t tmp[1];
1979     DECLARE_VAR_ALL64;
1980     DECLARE_VAR_ADDRT;
1981
1982     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1983
1984     __asm__ volatile(
1985         "li         %[tmp0],    0x06                                    \n\t"
1986         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1987         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1988         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1989         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1990         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1991         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1992
1993         "1:                                                             \n\t"
1994         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1995         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1996         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1997         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1998         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1999
2000         CHROMA_MC_8_MMI
2001
2002         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2003         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2004
2005         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2006         "addiu      %[h],       %[h],      -0x01                        \n\t"
2007         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2008         "bnez       %[h],       1b                                      \n\t"
2009         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2010           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2011           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2012           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2013           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2014           [tmp0]"=&r"(tmp[0]),
2015           RESTRICT_ASM_ALL64
2016           RESTRICT_ASM_ADDRT
2017           [src]"+&r"(src),              [dst]"+&r"(dst),
2018           [h]"+&r"(h)
2019         : [stride]"r"((mips_reg)stride),
2020           [A]"f"(A),                    [B]"f"(B),
2021           [C]"f"(C),                    [D]"f"(D),
2022           [ff_pw_28]"f"(ff_pw_28)
2023         : "memory"
2024     );
2025 }
2026
2027 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2028                                       uint8_t *src /* align 1 */,
2029                                       int stride, int h, int x, int y)
2030 {
2031     const int A = (8 - x) * (8 - y);
2032     const int B = (    x) * (8 - y);
2033     const int C = (8 - x) * (    y);
2034     const int D = (    x) * (    y);
2035     double ftmp[6];
2036     uint32_t tmp[1];
2037     DECLARE_VAR_LOW32;
2038     DECLARE_VAR_ADDRT;
2039
2040     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2041
2042     __asm__ volatile(
2043         "li         %[tmp0],    0x06                                    \n\t"
2044         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2045         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2046         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2047         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2048         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2049         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2050
2051         "1:                                                             \n\t"
2052         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2053         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2054         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2055         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2056         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2057
2058         CHROMA_MC_4_MMI
2059
2060         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2061         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2062
2063         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2064         "addiu      %[h],       %[h],      -0x01                        \n\t"
2065         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2066         "bnez       %[h],       1b                                      \n\t"
2067         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2068           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2069           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2070           [tmp0]"=&r"(tmp[0]),
2071           RESTRICT_ASM_LOW32
2072           RESTRICT_ASM_ADDRT
2073           [src]"+&r"(src),              [dst]"+&r"(dst),
2074           [h]"+&r"(h)
2075         : [stride]"r"((mips_reg)stride),
2076           [A]"f"(A),                    [B]"f"(B),
2077           [C]"f"(C),                    [D]"f"(D),
2078           [ff_pw_28]"f"(ff_pw_28)
2079         : "memory"
2080     );
2081 }