/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"


#define VC1_INV_TRANCS_8_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   o1,    o2,    o3,    o4,                 \
                                   t1,    t2,    t3,    t4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_p4)              \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p4"                \n\t"   \
        "paddh      "#o1"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t3"                   \n\t"   \
        "paddh      "#o1"   ,   "#o1"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "psubh      "#o2"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t3"                   \n\t"   \
        "psubh      "#o2"   ,   "#o2"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "psubh      "#o3"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t3"                   \n\t"   \
        "paddh      "#o3"   ,   "#o3"   ,   "#t4"                   \n\t"   \
                                                                            \
        "pmullh     "#t1"   ,   "#fp1"  ,   "#ff_p4"                \n\t"   \
        "pmullh     "#t2"   ,   "#fp2"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#t3"   ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#t4"   ,   "#fp4"  ,   "#ff_p1"                \n\t"   \
        "psubh      "#o4"   ,   "#t1"   ,   "#t2"                   \n\t"   \
        "paddh      "#o4"   ,   "#o4"   ,   "#t3"                   \n\t"   \
        "psubh      "#o4"   ,   "#o4"   ,   "#t4"                   \n\t"
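
/*
 * For reference, a scalar sketch of what STEP1 computes in each 16-bit
 * lane, assuming the coefficient order the callers below pass
 * (ff_p1..ff_p4 = 16, 15, 9, 4) and fp1..fp4 holding rows 1, 3, 5, 7:
 *
 *     o1 = p1*fp1 + p2*fp2 + p3*fp3 + p4*fp4;
 *     o2 = p2*fp1 - p4*fp2 - p1*fp3 - p3*fp4;
 *     o3 = p3*fp1 - p1*fp2 + p4*fp3 + p2*fp4;
 *     o4 = p4*fp1 - p3*fp2 + p2*fp3 - p1*fp4;
 *
 * i.e. the odd half (t1..t4) of the 8-point inverse transform as written
 * in the generic vc1_inv_trans_8x8_c().
 */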


#define VC1_INV_TRANCS_8_STEP2_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   o1,    o2,    o3,    o4,                 \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
                                                                            \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#o1"                   \n\t"   \
        "paddh      "#fp6"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "paddh      "#fp7"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "paddh      "#fp8"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
                                                                            \
        "psubh      "#fp4"  ,   "#fp4"  ,   "#o4"                   \n\t"   \
        "psubh      "#fp3"  ,   "#fp3"  ,   "#o3"                   \n\t"   \
        "psubh      "#fp2"  ,   "#fp2"  ,   "#o2"                   \n\t"   \
        "psubh      "#fp1"  ,   "#fp1"  ,   "#o1"                   \n\t"
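
/*
 * Scalar sketch of STEP2, under the same assumptions: fp1..fp4 hold the
 * even rows (0, 4, 2, 6), o1..o4 the odd-half sums from STEP1, and
 * (ff_p1, ff_p2, ff_p3, ff_pw) = (12, 16, 6, rounder):
 *
 *     t1 = p1*(fp1 + fp2) + pw;    t2 = p1*(fp1 - fp2) + pw;
 *     t3 = p2*fp3 + p3*fp4;        t4 = p3*fp3 - p2*fp4;
 *
 *     fp5..fp8 = (t1+t3)+o1, (t2+t4)+o2, (t2-t4)+o3, (t1-t3)+o4;
 *     fp1..fp4 = (t1+t3)-o1, (t2+t4)-o2, (t2-t4)-o3, (t1-t3)-o4;
 *
 * which, after the arithmetic shift the callers apply with PSRAH_8_MMI,
 * gives the eight outputs of one 8-point inverse transform pass.
 */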


#define VC1_INV_TRANCS_4_STEP1_MMI(fp1,   fp2,   fp3,   fp4,                \
                                   fp5,   fp6,   fp7,   fp8,                \
                                   ff_p1, ff_p2, ff_p3, ff_pw)              \
        "paddh      "#fp5"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp6"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "pmullh     "#fp5"  ,   "#fp5"  ,   "#ff_p1"                \n\t"   \
        "pmullh     "#fp6"  ,   "#fp6"  ,   "#ff_p1"                \n\t"   \
        "paddh      "#fp5"  ,   "#fp5"  ,   "#ff_pw"                \n\t"   \
        "paddh      "#fp6"  ,   "#fp6"  ,   "#ff_pw"                \n\t"   \
                                                                            \
        "pmullh     "#fp1"  ,   "#fp3"  ,   "#ff_p2"                \n\t"   \
        "pmullh     "#fp2"  ,   "#fp4"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp3"  ,   "#fp3"  ,   "#ff_p3"                \n\t"   \
        "pmullh     "#fp4"  ,   "#fp4"  ,   "#ff_p2"                \n\t"   \
        "paddh      "#fp7"  ,   "#fp1"  ,   "#fp2"                  \n\t"   \
        "psubh      "#fp8"  ,   "#fp3"  ,   "#fp4"                  \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp5"  ,   "#fp7"                  \n\t"   \
        "psubh      "#fp2"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp6"  ,   "#fp8"                  \n\t"   \
        "psubh      "#fp4"  ,   "#fp5"  ,   "#fp7"                  \n\t"
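
/*
 * Scalar sketch of the 4-point pass, assuming (ff_p1, ff_p2, ff_p3,
 * ff_pw) = (17, 10, 22, rounder) as passed by the callers and fp1..fp4
 * holding the inputs in the order s0, s2, s3, s1:
 *
 *     t1 = 17*(fp1 + fp2) + pw;    t2 = 17*(fp1 - fp2) + pw;
 *     t3 = 10*fp3 + 22*fp4;        t4 = 22*fp3 - 10*fp4;
 *
 *     fp1..fp4 = t1+t3, t2-t4, t2+t4, t1-t3;
 *
 * i.e. t1..t4 of the generic vc1_inv_trans_4x4_c() (22*s1 + 10*s3 etc.),
 * with the final shift left to PSRAH_4_MMI.
 */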


#define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4,                      \
                                   fp5, fp6, fp7, fp8, zero)                \
        "punpcklbh  "#fp5"  ,   "#fp5"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp6"  ,   "#fp6"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp7"  ,   "#fp7"  ,   "#zero"                 \n\t"   \
        "punpcklbh  "#fp8"  ,   "#fp8"  ,   "#zero"                 \n\t"   \
                                                                            \
        "paddh      "#fp1"  ,   "#fp1"  ,   "#fp5"                  \n\t"   \
        "paddh      "#fp2"  ,   "#fp2"  ,   "#fp6"                  \n\t"   \
        "paddh      "#fp3"  ,   "#fp3"  ,   "#fp7"                  \n\t"   \
        "paddh      "#fp4"  ,   "#fp4"  ,   "#fp8"                  \n\t"   \
                                                                            \
        "packushb   "#fp1"  ,   "#fp1"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp2"  ,   "#fp2"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp3"  ,   "#fp3"  ,   "#zero"                 \n\t"   \
        "packushb   "#fp4"  ,   "#fp4"  ,   "#zero"                 \n\t"
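
/*
 * STEP2 of the 4-point transform is the store-side half: fp5..fp8 arrive
 * as four destination rows of packed bytes, are widened to 16 bits
 * against the zero register, the transformed residual in fp1..fp4 is
 * added, and the sums are packed back to bytes with unsigned saturation.
 */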


/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

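    /* Fold the row and column DC scaling of the 8-point transform into dc:
     * (3 * dc + 1) >> 1 == (12 * dc + 4) >> 3 (row pass) and
     * (3 * dc + 16) >> 5 == (12 * dc + 64) >> 7 (column pass + rounding). */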
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],       -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),          [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    int16_t *src = block;
    int16_t *dst = temp;
    double ftmp[16];
    uint32_t count, tmp[1];

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])


        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDC1(%[ftmp3], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = temp;
    dst = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        MMI_SDC1(%[ftmp5], %[dst], 0x00)
        MMI_SDC1(%[ftmp6], %[dst], 0x10)
        MMI_SDC1(%[ftmp7], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        MMI_SDC1(%[ftmp4], %[dst], 0x40)
        MMI_SDC1(%[ftmp3], %[dst], 0x50)
        MMI_SDC1(%[ftmp2], %[dst], 0x60)
        MMI_SDC1(%[ftmp1], %[dst], 0x70)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x08                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x08                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

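    /* 8-point row scale (12, reduced to 3 with a matching shift), then the
     * 17x column scale of the 4-point transform. */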
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)
        MMI_LDC1(%[ftmp3], %[src], 0x10)
        MMI_LDC1(%[ftmp4], %[src], 0x18)
        MMI_LDC1(%[ftmp5], %[src], 0x20)
        MMI_LDC1(%[ftmp6], %[src], 0x28)
        MMI_LDC1(%[ftmp7], %[src], 0x30)
        MMI_LDC1(%[ftmp8], %[src], 0x38)

        //             a1        b1        a3        b2
        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        //             a2        b3        a4        b4
        TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        // input b1 b2 b3 b4
        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])
        // input a1 a2 a3 a4
        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
                                   %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_4])

        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                    %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])

        TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp3], %[dst], 0x00)
        MMI_SDC1(%[ftmp7], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp8], %[dst], 0x30)

        TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
                     %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                     %[ftmp13], %[tmp0],  %[ftmp14], %[ftmp15])

        MMI_SDC1(%[ftmp6], %[dst], 0x08)
        MMI_SDC1(%[ftmp5], %[dst], 0x18)
        MMI_SDC1(%[ftmp2], %[dst], 0x28)
        MMI_SDC1(%[ftmp1], %[dst], 0x38)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [src]"r"(src),                [dst]"r"(dst),
          [ff_pw_4]"f"(ff_pw_4),        [ff_pw_6]"f"(ff_pw_6),
          [ff_pw_9]"f"(ff_pw_9),        [ff_pw_12]"f"(ff_pw_12),
          [ff_pw_15]"f"(ff_pw_15),      [ff_pw_16]"f"(ff_pw_16)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)

        // dest high 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x28)
        MMI_LDC1(%[ftmp3], %[src], 0x38)
        MMI_LDC1(%[ftmp4], %[src], 0x18)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])

        MMI_LWC1(%[ftmp5], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x04)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp0])

        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x04)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x8 part of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

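    /* 17x row scale of the 4-point transform, then the 8-point column
     * scale (12 * dc + 64) >> 7. */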
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
        "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count, tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        "li         %[count],   0x02                                    \n\t"
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)

        "addiu      %[count],   %[count],  -0x01                        \n\t"
        PTR_ADDIU  "%[src],     %[src],     0x40                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x40                        \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [count]"=&r"(count),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp5], %[src], 0x10)
        MMI_LDC1(%[ftmp6], %[src], 0x30)
        MMI_LDC1(%[ftmp7], %[src], 0x50)
        MMI_LDC1(%[ftmp8], %[src], 0x70)

        VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
                                   %[ff_pw_4])

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x40)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x60)

        VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
                                   %[ff_pw_64])

        "paddh      %[ftmp4],   %[ftmp4],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ff_pw_1]                  \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_1]                  \n\t"

        PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                    %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])

        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"

        // dest low
        MMI_LWC1(%[ftmp9], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest high
        MMI_LWC1(%[ftmp9], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp10], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp11], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp12], %[addr0], 0x00)

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
                                   %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
                                   %[ftmp0])

        // dest low
        MMI_SWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp8], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"

        // dest high
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp1], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),
          [dest]"+&r"(dest)
        : [src]"r"(src),                [linesize]"r"(linesize),
          [ff_pw_1]"f"(ff_pw_1),        [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_6]"f"(ff_pw_6),        [ff_pw_9]"f"(ff_pw_9),
          [ff_pw_12]"f"(ff_pw_12),      [ff_pw_15]"f"(ff_pw_15),
          [ff_pw_16]"f"(ff_pw_16),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

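    /* Both passes are 4-point here, so the 17x scale is applied twice. */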
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[12];
    uint32_t tmp[1];
    mips_reg addr[1];
    DECLARE_VAR_LOW32;

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        //                              t1        t2        t3        t4
        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_4])

        PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])

        TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                     %[ftmp9], %[tmp0],  %[ftmp10], %[ftmp11])

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        MMI_SDC1(%[ftmp3], %[dst], 0x10)
        MMI_SDC1(%[ftmp4], %[dst], 0x20)
        MMI_SDC1(%[ftmp2], %[dst], 0x30)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src),              [dst]"+&r"(dst)
        : [ff_pw_17]"f"(ff_pw_17),      [ff_pw_10]"f"(ff_pw_10),
          [ff_pw_22]"f"(ff_pw_22),      [ff_pw_4]"f"(ff_pw_4)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // dest low 32bit
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x30)
        MMI_LDC1(%[ftmp4], %[src], 0x10)

        VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
                                   %[ff_pw_64])

        PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])

        MMI_LWC1(%[ftmp5], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp6], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp7], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp8], %[addr0], 0x00)

        "xor        %[ftmp9],   %[ftmp9],  %[ftmp9]                     \n\t"

        VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                                   %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
                                   %[ftmp9])

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[addr0], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0])
        : [src]"r"(src),                [dest]"r"(dest),
          [linesize]"r"((mips_reg)linesize),
          [ff_pw_17]"f"(ff_pw_17),      [ff_pw_22]"f"(ff_pw_22),
          [ff_pw_10]"f"(ff_pw_10),      [ff_pw_64]"f"(ff_pw_64)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2];
        b  = src[-1];
        c  = src[0];
        d  = src[1];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2] = a - d1;
        src[-1] = av_clip_uint8(b - d2);
        src[0]  = av_clip_uint8(c + d2);
        src[1]  = d + d1;
        src    += stride;
        rnd     = !rnd;
    }
}

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}

/* Apply overlap transform to vertical edge */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}

/**
 * VC-1 in-loop deblocking filter for one line
 * @param src pointer to the line to filter
 * @param stride block stride
 * @param pq block quantizer
 * @return whether the other 3 pairs should be filtered or not
 * @see 8.6
 */
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
{
    int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
              5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
    int a0_sign = a0 >> 31;        /* Store sign */

    a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
    if (a0 < pq) {
        int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
                        5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
        int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
                        5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
        if (a1 < a0 || a2 < a0) {
            int clip      = src[-1 * stride] - src[0 * stride];
            int clip_sign = clip >> 31;

            clip = ((clip ^ clip_sign) - clip_sign) >> 1;
            if (clip) {
                int a3     = FFMIN(a1, a2);
                int d      = 5 * (a3 - a0);
                int d_sign = (d >> 31);

                d       = ((d ^ d_sign) - d_sign) >> 3;
                d_sign ^= a0_sign;

                if (d_sign ^ clip_sign)
                    d = 0;
                else {
                    d = FFMIN(d, clip);
                    d = (d ^ d_sign) - d_sign; /* Restore sign */
                    src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
                    src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
                }
                return 1;
            }
        }
    }
    return 0;
}

1149 /**
1150  * VC-1 in-loop deblocking filter
1151  * @param src pointer to the pixels at the edge being filtered
1152  * @param step distance between horizontally adjacent elements
1153  * @param stride distance between vertically adjacent elements
1154  * @param len edge length to filter (4, 8 or 16 pixels)
1155  * @param pq block quantizer
1156  * @see 8.6
1157  */
1157  */
1158 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1159                                    int len, int pq)
1160 {
1161     int i;
1162     int filt3;
1163
1164     for (i = 0; i < len; i += 4) {
1165         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1166         if (filt3) {
1167             vc1_filter_line(src + 0 * step, stride, pq);
1168             vc1_filter_line(src + 1 * step, stride, pq);
1169             vc1_filter_line(src + 3 * step, stride, pq);
1170         }
1171         src += step * 4;
1172     }
1173 }
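
/*
 * Per 8.6, the filter decision is taken on the 3rd pixel pair of each
 * 4-pixel segment; the remaining three pairs are filtered only when that
 * pair was.
 */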
1174
1175 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1176 {
1177     vc1_loop_filter(src, 1, stride, 4, pq);
1178 }
1179
1180 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1181 {
1182     vc1_loop_filter(src, stride, 1, 4, pq);
1183 }
1184
1185 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1186 {
1187     vc1_loop_filter(src, 1, stride, 8, pq);
1188 }
1189
1190 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1191 {
1192     vc1_loop_filter(src, stride, 1, 8, pq);
1193 }
1194
1195 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1196 {
1197     vc1_loop_filter(src, 1, stride, 16, pq);
1198 }
1199
1200 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1201 {
1202     vc1_loop_filter(src, stride, 1, 16, pq);
1203 }
1204
1205 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1206                                ptrdiff_t stride, int rnd)
1207 {
1208     ff_put_pixels8_8_mmi(dst, src, stride, 8);
1209 }
1210 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1211                                   ptrdiff_t stride, int rnd)
1212 {
1213     ff_put_pixels16_8_mmi(dst, src, stride, 16);
1214 }
1215 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1216                                ptrdiff_t stride, int rnd)
1217 {
1218     ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1219 }
1220 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1221                                   ptrdiff_t stride, int rnd)
1222 {
1223     ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1224 }
1225
1226 #define OP_PUT(S, D)
1227 #define OP_AVG(S, D)                                                        \
1228     "ldc1       $f16,   "#S"                        \n\t"                   \
1229     "pavgb      "#D",   "#D",   $f16                \n\t"
1230
1231 /** Add the rounder in $f14 to $f6 and $f8, then shift both right by SHIFT */
1232 #define NORMALIZE_MMI(SHIFT)                                                \
1233     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1234     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1235     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1236     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1237
1238 #define TRANSFER_DO_PACK(OP)                                                \
1239     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1240     OP((%[dst]), $f6)                                                       \
1241     "sdc1       $f6,    0x00(%[dst])                \n\t"
1242
1243 #define TRANSFER_DONT_PACK(OP)                                              \
1244      OP(0(%[dst]), $f6)                                                     \
1245      OP(8(%[dst]), $f8)                                                     \
1246      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1247      "sdc1      $f8,    0x08(%[dst])                \n\t"
1248
1249 /** Unpack helpers passed as the UNPACK argument of MSPEL_FILTER13_CORE */
1250 #define DO_UNPACK(reg)                                                      \
1251     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1252 #define DONT_UNPACK(reg)
1253
1254 /** Compute the rounder 32-r or 8-r and unpack it to $f14 */
1255 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1256     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1257     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1258     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1259
1260
1261 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1262     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1263     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1264     MMI_ULWC1(R0, $9, 0x00)                                                 \
1265     "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1266     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1267     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1268     MMI_ULWC1(R3, $9, 0x00)                                                 \
1269     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1270     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1271     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1272     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1273     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1274     MMI_SDC1(R1, %[dst], OFF)                                               \
1275     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1276
1277 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1278 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1279                                        const uint8_t *src, mips_reg stride,
1280                                        int rnd, int64_t shift)
1281 {
1282     DECLARE_VAR_LOW32;
1283     DECLARE_VAR_ADDRT;
1284
1285     __asm__ volatile(
1286         "xor        $f0,    $f0,    $f0             \n\t"
1287         "li         $8,     0x03                    \n\t"
1288         LOAD_ROUNDER_MMI("%[rnd]")
1289         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1290         "1:                                         \n\t"
1291         MMI_ULWC1($f4, %[src], 0x00)
1292         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1293         MMI_ULWC1($f6, %[src], 0x00)
1294         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1295         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1296         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1297         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1298         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1299         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1300         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1301         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1302         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1303         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1304         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1305         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1306         "addiu      $8,     $8,    -0x01            \n\t"
1307         "bnez       $8,     1b                      \n\t"
1308         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1309           [src]"+r"(src),               [dst]"+r"(dst)
1310         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1311           [shift]"f"(shift),            [rnd]"m"(rnd),
1312           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1313         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1314           "$f14", "$f16", "memory"
1315     );
1316 }
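
/*
 * A scalar sketch (hypothetical helper, not compiled) of one 16-bit
 * intermediate produced by the vertical shift2 pass above:
 */
#if 0
static int16_t ver_16b_shift2_sketch(const uint8_t *src, int stride,
                                     int rnd, int shift)
{
    /* -1 9 9 -1 half-pel kernel along the vertical axis */
    int v = -src[-stride] + 9 * (src[0] + src[stride]) - src[2 * stride];
    return (v + rnd) >> shift;
}
#endif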
1317
1318 /**
1319  * The data is already unpacked to 16 bits, so some operations can be
1320  * performed directly from memory.
1321  */
1322 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1323 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1324                                              const int16_t *src, int rnd)   \
1325 {                                                                           \
1326     int h = 8;                                                              \
1327     DECLARE_VAR_ALL64;                                                      \
1328     DECLARE_VAR_ADDRT;                                                      \
1329                                                                             \
1330     src -= 1;                                                               \
1331     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1332                                                                             \
1333     __asm__ volatile(                                                       \
1334         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1335         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1336         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1337         "1:                                         \n\t"                   \
1338         MMI_ULDC1($f2, %[src], 0x00)                                        \
1339         MMI_ULDC1($f4, %[src], 0x08)                                        \
1340         MMI_ULDC1($f6, %[src], 0x02)                                        \
1341         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1342         MMI_ULDC1($f0, %[src], 0x06)                                        \
1343         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1344         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1345         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1346         MMI_ULDC1($f0, %[src], 0x04)                                        \
1347         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1348         MMI_ULDC1($f0, %[src], 0x0c)                                        \
1349         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1350         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1351         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1352         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1353         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1354         "li         $8,     0x07                    \n\t"                   \
1355         "mtc1       $8,     $f16                    \n\t"                   \
1356         NORMALIZE_MMI("$f16")                                               \
1357         /* Remove bias */                                                   \
1358         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1359         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1360         TRANSFER_DO_PACK(OP)                                                \
1361         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1362         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1363         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1364         "bnez       %[h],   1b                      \n\t"                   \
1365         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1366           [h]"+r"(h),                                                       \
1367           [src]"+r"(src),               [dst]"+r"(dst)                      \
1368         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1369           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1370         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1371           "$f16", "memory"                                                  \
1372     );                                                                      \
1373 }
1374
1375 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1376 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
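
/*
 * For reference, one output pixel of the horizontal shift2 stage written as
 * scalar C (hypothetical helper, not compiled); rnd is the value after the
 * bias subtraction performed at the top of the macro:
 */
#if 0
static uint8_t hor_16b_shift2_pixel_sketch(const int16_t *src, int rnd)
{
    /* -1 9 9 -1 kernel, 7-bit normalization, then the 128 bias removal */
    int v = -src[0] + 9 * (src[1] + src[2]) - src[3];
    return av_clip_uint8(((v + rnd) >> 7) + 128);
}
#endif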
1377
1378 /**
1379  * Purely vertical or horizontal 1/2 shift interpolation.
1380  * Sacrifice $f12 for the *9 factor.
1381  */
1382 #define VC1_SHIFT2(OP, OPNAME)\
1383 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1384                                      mips_reg stride, int rnd,              \
1385                                      mips_reg offset)                       \
1386 {                                                                           \
1387     DECLARE_VAR_LOW32;                                                      \
1388     DECLARE_VAR_ADDRT;                                                      \
1389                                                                             \
1390     rnd = 8 - rnd;                                                          \
1391                                                                             \
1392     __asm__ volatile(                                                       \
1393         "xor        $f0,    $f0,    $f0             \n\t"                   \
1394         "li         $10,    0x08                    \n\t"                   \
1395         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1396         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1397         "1:                                         \n\t"                   \
1398         MMI_ULWC1($f6, %[src], 0x00)                                        \
1399         MMI_ULWC1($f8, %[src], 0x04)                                        \
1400         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1401         MMI_ULWC1($f2, $9, 0x00)                                            \
1402         MMI_ULWC1($f4, $9, 0x04)                                            \
1403         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1404         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1405         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1406         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1407         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1408         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1409         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1410         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1411         MMI_ULWC1($f2, $9, 0x00)                                            \
1412         MMI_ULWC1($f4, $9, 0x04)                                            \
1413         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1414         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1415         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1416         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1417         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1418         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1419         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1420         MMI_ULWC1($f2, $9, 0x00)                                            \
1421         MMI_ULWC1($f4, $9, 0x04)                                            \
1422         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1423         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1424         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1425         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1426         "li         $8,     0x04                    \n\t"                   \
1427         "mtc1       $8,     $f16                    \n\t"                   \
1428         NORMALIZE_MMI("$f16")                                               \
1429         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1430         OP((%[dst]), $f6)                                                   \
1431         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1432         "addiu      $10,    $10,   -0x01            \n\t"                   \
1433         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1434         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1435         "bnez       $10,    1b                      \n\t"                   \
1436         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1437           [src]"+r"(src),               [dst]"+r"(dst)                      \
1438         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1439           [stride]"g"(stride),          [rnd]"m"(rnd),                      \
1440           [stride1]"g"(stride-offset),                                      \
1441           [ff_pw_9]"m"(ff_pw_9)                                             \
1442         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1443           "$f12", "$f14", "$f16", "memory"                                  \
1444     );                                                                      \
1445 }
1446
1447 VC1_SHIFT2(OP_PUT, put_)
1448 VC1_SHIFT2(OP_AVG, avg_)
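
/*
 * Scalar view of vc1_shift2 for one output pixel (hypothetical helper, not
 * compiled); offset selects the filter axis, and rnd is the caller's value
 * before the rnd = 8 - rnd flip above:
 */
#if 0
static uint8_t shift2_pixel_sketch(const uint8_t *src, int offset, int rnd)
{
    int v = -src[-offset] + 9 * (src[0] + src[offset]) - src[2 * offset];
    return av_clip_uint8((v + 8 - rnd) >> 4);
}
#endif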
1449
1450 /**
1451  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1452  *
1453  * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
1454  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" when the data read is already unpacked.
1455  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1456  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1457  * @param A2      Stride address of 2nd tap
1458  * @param A3      Stride address of 3rd tap
1459  * @param A4      Stride address of 4th tap
1460  */
1461 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1462     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1463     LOAD($f2, $9, M*0)                                                      \
1464     LOAD($f4, $9, M*4)                                                      \
1465     UNPACK("$f2")                                                           \
1466     UNPACK("$f4")                                                           \
1467     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1468     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1469     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1470     LOAD($f6, $9, M*0)                                                      \
1471     LOAD($f8, $9, M*4)                                                      \
1472     UNPACK("$f6")                                                           \
1473     UNPACK("$f8")                                                           \
1474     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1475     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1476     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1477     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1478     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1479     LOAD($f2, $9, M*0)                                                      \
1480     LOAD($f4, $9, M*4)                                                      \
1481     UNPACK("$f2")                                                           \
1482     UNPACK("$f4")                                                           \
1483     "li         $8,     0x02                    \n\t"                       \
1484     "mtc1       $8,     $f16                    \n\t"                       \
1485     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1486     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1487     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1488     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1489     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1490     LOAD($f2, $9, M*0)                                                      \
1491     LOAD($f4, $9, M*4)                                                      \
1492     UNPACK("$f2")                                                           \
1493     UNPACK("$f4")                                                           \
1494     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1495     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1496     "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1497     "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1498
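/*
 * Per 16-bit lane, the core above accumulates the VC-1 bicubic taps on the
 * four loads bound to A1..A4 (a scalar restatement, not compiled):
 */
#if 0
static int mspel_filter13_taps_sketch(int a1, int a2, int a3, int a4)
{
    return -3 * a1 + 18 * a2 + 53 * a3 - 4 * a4;
}
#endif
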
1499 /**
1500  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1501  * Here, offset=src_stride. Parameters passed as A1 to A4 must use
1502  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride), %[stride_x3] (3*src_stride) or $0.
1503  *
1504  * @param  NAME   Either shift1 or shift3
1505  * @see MSPEL_FILTER13_CORE for information on A1->A4
1506  */
1507 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1508 static void                                                                 \
1509 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1510                                  mips_reg src_stride,                       \
1511                                  int rnd, int64_t shift)                    \
1512 {                                                                           \
1513     int h = 8;                                                              \
1514     DECLARE_VAR_LOW32;                                                      \
1515     DECLARE_VAR_ADDRT;                                                      \
1516                                                                             \
1517     src -= src_stride;                                                      \
1518                                                                             \
1519     __asm__ volatile(                                                       \
1520         "xor        $f0,    $f0,    $f0             \n\t"                   \
1521         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1522         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1523         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1524         ".p2align 3                                 \n\t"                   \
1525         "1:                                         \n\t"                   \
1526         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1527         NORMALIZE_MMI("%[shift]")                                           \
1528         TRANSFER_DONT_PACK(OP_PUT)                                          \
1529         /* Last 3 (in fact 4) bytes on the line */                          \
1530         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1531         MMI_ULWC1($f2, $9, 0x08)                                            \
1532         DO_UNPACK("$f2")                                                    \
1533         "mov.d      $f6,    $f2                     \n\t"                   \
1534         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1535         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1536         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1537         MMI_ULWC1($f6, $9, 0x08)                                            \
1538         DO_UNPACK("$f6")                                                    \
1539         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1540         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1541         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1542         MMI_ULWC1($f2, $9, 0x08)                                            \
1543         DO_UNPACK("$f2")                                                    \
1544         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1545         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1546         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1547         MMI_ULWC1($f2, $9, 0x08)                                            \
1548         DO_UNPACK("$f2")                                                    \
1549         "li         $8,     0x02                    \n\t"                   \
1550         "mtc1       $8,     $f16                    \n\t"                   \
1551         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1552         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1553         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1554         "li         $8,     0x06                    \n\t"                   \
1555         "mtc1       $8,     $f16                    \n\t"                   \
1556         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1557         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1558         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1559         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1560         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1561         "bnez       %[h],   1b                      \n\t"                   \
1562         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1563           [h]"+r"(h),                                                       \
1564           [src]"+r"(src),               [dst]"+r"(dst)                      \
1565         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1566           [stride_x3]"r"(3*src_stride),                                     \
1567           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1568           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1569           [ff_pw_3]"f"(ff_pw_3)                                             \
1570         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1571           "$f14", "$f16", "memory"                                          \
1572     );                                                                      \
1573 }
1574
1575 /**
1576  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1577  * Here the data is 16 bits wide, so A1 to A4 are plain byte offsets.
1578  *
1579  * @param  NAME   Either shift1 or shift3
1580  * @see MSPEL_FILTER13_CORE for information on A1->A4
1581  */
1582 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1583 static void                                                                 \
1584 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1585                                        const int16_t *src, int rnd)         \
1586 {                                                                           \
1587     int h = 8;                                                              \
1588     DECLARE_VAR_ALL64;                                                      \
1589     DECLARE_VAR_ADDRT;                                                      \
1590                                                                             \
1591     src -= 1;                                                               \
1592     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1593                                                                             \
1594     __asm__ volatile(                                                       \
1595         "xor        $f0,    $f0,    $f0             \n\t"                   \
1596         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1597         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1598         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1599         ".p2align 3                                 \n\t"                   \
1600         "1:                                         \n\t"                   \
1601         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1602         "li         $8,     0x07                    \n\t"                   \
1603         "mtc1       $8,     $f16                    \n\t"                   \
1604         NORMALIZE_MMI("$f16")                                               \
1605         /* Remove bias */                                                   \
1606         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1607         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1608         TRANSFER_DO_PACK(OP)                                                \
1609         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1610         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1611         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1612         "bnez       %[h],   1b                      \n\t"                   \
1613         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1614           [h]"+r"(h),                                                       \
1615           [src]"+r"(src),               [dst]"+r"(dst)                      \
1616         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1617           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1618           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
1619         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1620           "$f14", "$f16", "memory"                                          \
1621     );                                                                      \
1622 }
1623
1624 /**
1625  * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
1626  * Here, offset=src_stride. Parameters passed as A1 to A4 must use
1627  * %[offset_x1] (offset), %[offset_x2] (2*offset), %[offset_x3] (3*offset) or $0.
1628  *
1629  * @param  NAME   Either shift1 or shift3
1630  * @see MSPEL_FILTER13_CORE for information on A1->A4
1631  */
1632 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
1633 static void                                                                 \
1634 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
1635                               mips_reg stride, int rnd, mips_reg offset)    \
1636 {                                                                           \
1637     int h = 8;                                                              \
1638     DECLARE_VAR_LOW32;                                                      \
1639     DECLARE_VAR_ADDRT;                                                      \
1640                                                                             \
1641     src -= offset;                                                          \
1642     rnd = 32-rnd;                                                           \
1643                                                                             \
1644     __asm__ volatile (                                                      \
1645         "xor        $f0,    $f0,    $f0             \n\t"                   \
1646         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1647         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1648         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1649         ".p2align 3                                 \n\t"                   \
1650         "1:                                         \n\t"                   \
1651         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1652         "li         $8,     0x06                    \n\t"                   \
1653         "mtc1       $8,     $f16                    \n\t"                   \
1654         NORMALIZE_MMI("$f16")                                               \
1655         TRANSFER_DO_PACK(OP)                                                \
1656         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
1657         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
1658         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
1659         "bnez       %[h],   1b                      \n\t"                   \
1660         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1661           [h]"+r"(h),                                                       \
1662           [src]"+r"(src),               [dst]"+r"(dst)                      \
1663         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
1664           [offset_x3]"r"(3*offset),     [stride]"g"(stride),                \
1665           [rnd]"m"(rnd),                                                    \
1666           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1667           [ff_pw_3]"f"(ff_pw_3)                                             \
1668         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1669           "$f14", "$f16", "memory"                                          \
1670     );                                                                      \
1671 }
1672
1673
1674 /** 1/4 shift bicubic interpolation */
1675 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1676 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1677 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1678 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1679 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1680
1681 /** 3/4 shift bicubic interpolation */
1682 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1683 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1684 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1685 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1686 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
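
/*
 * With the bindings above, the 8-bit shift1 variant computes, per pixel,
 * clip((-4*s[-1] + 53*s[0] + 18*s[1] - 3*s[2] + 32 - r) >> 6), and shift3
 * the mirrored kernel, i.e. the VC-1 1/4- and 3/4-pel bicubic
 * interpolators.
 */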
1687
1688 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1689              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
1690               int64_t shift);
1691 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1692              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
1693 typedef void (*vc1_mspel_mc_filter_8bits)
1694              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1695               mips_reg offset);
1696
1697 /**
1698  * Interpolate fractional pel values by applying the appropriate vertical
1699  * then horizontal filter.
1700  *
1701  * @param  dst     Destination buffer for interpolated pels.
1702  * @param  src     Source buffer.
1703  * @param  stride  Stride for both src and dst buffers.
1704  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
1705  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
1706  * @param  rnd     Rounding bias.
1707  */
1708 #define VC1_MSPEL_MC(OP)                                                    \
1709 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1710                                int hmode, int vmode, int rnd)               \
1711 {                                                                           \
1712     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1713          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
1714                  vc1_put_ver_16b_shift2_mmi,                                \
1715                  vc1_put_ver_16b_shift3_mmi };                              \
1716     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1717          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
1718                  OP ## vc1_hor_16b_shift2_mmi,                              \
1719                  OP ## vc1_hor_16b_shift3_mmi };                            \
1720     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
1721          { NULL, OP ## vc1_shift1_mmi,                                      \
1722                  OP ## vc1_shift2_mmi,                                      \
1723                  OP ## vc1_shift3_mmi };                                    \
1724                                                                             \
1725     if (vmode) { /* Vertical filter to apply */                             \
1726         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
1727             static const int shift_value[] = { 0, 5, 1, 5 };                \
1728             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
1729             int    r;                                                       \
1730             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
1731                                                                             \
1732             r = (1<<(shift-1)) + rnd-1;                                     \
1733             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
1734                                                                             \
1735             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
1736             return;                                                         \
1737         }                                                                   \
1738         else { /* No horizontal filter, output 8 lines to dst */            \
1739             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
1740             return;                                                         \
1741         }                                                                   \
1742     }                                                                       \
1743                                                                             \
1744     /* Horizontal mode with no vertical mode */                             \
1745     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
1746 }                                                                           \
1747 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
1748                                   int stride, int hmode, int vmode, int rnd)\
1749 {                                                                           \
1750     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1751     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1752     dst += 8*stride; src += 8*stride;                                       \
1753     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
1754     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
1755 }
1756
1757 VC1_MSPEL_MC(put_)
1758 VC1_MSPEL_MC(avg_)
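
/*
 * Usage sketch: a motion vector with a 3/4-pel horizontal and 1/2-pel
 * vertical fraction dispatches as put_vc1_mspel_mc(dst, src, stride, 3, 2,
 * rnd), running the vertical shift2 filter into the 16-bit tmp buffer and
 * then the horizontal shift3 filter from tmp to dst.
 */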
1759
1760 /** Macro to ease declaring the bicubic filter interpolation functions */
1761 #define DECLARE_FUNCTION(a, b)                                              \
1762 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1763                                            const uint8_t *src,              \
1764                                            ptrdiff_t stride,                \
1765                                            int rnd)                         \
1766 {                                                                           \
1767      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1768 }                                                                           \
1769 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
1770                                            const uint8_t *src,              \
1771                                            ptrdiff_t stride,                \
1772                                            int rnd)                         \
1773 {                                                                           \
1774      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
1775 }                                                                           \
1776 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1777                                               const uint8_t *src,           \
1778                                               ptrdiff_t stride,             \
1779                                               int rnd)                      \
1780 {                                                                           \
1781      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1782 }                                                                           \
1783 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
1784                                               const uint8_t *src,           \
1785                                               ptrdiff_t stride,             \
1786                                               int rnd)                      \
1787 {                                                                           \
1788      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
1789 }
1790
1791 DECLARE_FUNCTION(0, 1)
1792 DECLARE_FUNCTION(0, 2)
1793 DECLARE_FUNCTION(0, 3)
1794
1795 DECLARE_FUNCTION(1, 0)
1796 DECLARE_FUNCTION(1, 1)
1797 DECLARE_FUNCTION(1, 2)
1798 DECLARE_FUNCTION(1, 3)
1799
1800 DECLARE_FUNCTION(2, 0)
1801 DECLARE_FUNCTION(2, 1)
1802 DECLARE_FUNCTION(2, 2)
1803 DECLARE_FUNCTION(2, 3)
1804
1805 DECLARE_FUNCTION(3, 0)
1806 DECLARE_FUNCTION(3, 1)
1807 DECLARE_FUNCTION(3, 2)
1808 DECLARE_FUNCTION(3, 3)
1809
1810 #define CHROMA_MC_8_MMI                                                     \
1811         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
1812         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1813         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
1814         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1815         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
1816         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1817         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
1818         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1819                                                                             \
1820         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1821         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
1822         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1823         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
1824         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1825         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
1826         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1827         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
1828                                                                             \
1829         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1830         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1831         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1832         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1833                                                                             \
1834         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
1835         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
1836         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
1837         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
1838                                                                             \
1839         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
1840         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
1841         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
1842
1843
1844 #define CHROMA_MC_4_MMI                                                     \
1845         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
1846         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
1847         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
1848         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
1849                                                                             \
1850         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
1851         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
1852         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
1853         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
1854                                                                             \
1855         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
1856         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
1857         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
1858         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
1859                                                                             \
1860         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
1861         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
1862
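/*
 * Both chroma macros above evaluate, per pixel, the bilinear expression
 * below (a scalar sketch with a hypothetical helper, not compiled); 28
 * comes from the ff_pw_28 rounder used by the no_rnd variants:
 */
#if 0
static uint8_t chroma_mc_pixel_sketch(const uint8_t *src, int stride,
                                      int A, int B, int C, int D)
{
    return (A * src[0]      + B * src[1] +
            C * src[stride] + D * src[stride + 1] + 28) >> 6;
}
#endif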
1863
1864 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1865                                       uint8_t *src /* align 1 */,
1866                                       int stride, int h, int x, int y)
1867 {
1868     const int A = (8 - x) * (8 - y);
1869     const int B =     (x) * (8 - y);
1870     const int C = (8 - x) *     (y);
1871     const int D =     (x) *     (y);
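    /* bilinear weights: A + B + C + D == 64, matching the >> 6 normalization */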
1872     double ftmp[10];
1873     uint32_t tmp[1];
1874     DECLARE_VAR_ALL64;
1875     DECLARE_VAR_ADDRT;
1876
1877     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1878
1879     __asm__ volatile(
1880         "li         %[tmp0],    0x06                                    \n\t"
1881         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1882         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1883         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1884         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1885         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1886         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1887
1888         "1:                                                             \n\t"
1889         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1890         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1891         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1892         MMI_ULDC1(%[ftmp3], %[src], 0x00)
1893         MMI_ULDC1(%[ftmp4], %[src], 0x01)
1894
1895         CHROMA_MC_8_MMI
1896
1897         MMI_SDC1(%[ftmp1], %[dst], 0x00)
1898         "addiu      %[h],       %[h],      -0x01                        \n\t"
1899         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1900         "bnez       %[h],       1b                                      \n\t"
1901         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1902           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1903           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1904           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1905           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1906           RESTRICT_ASM_ALL64
1907           RESTRICT_ASM_ADDRT
1908           [tmp0]"=&r"(tmp[0]),
1909           [src]"+&r"(src),              [dst]"+&r"(dst),
1910           [h]"+&r"(h)
1911         : [stride]"r"((mips_reg)stride),
1912           [A]"f"(A),                    [B]"f"(B),
1913           [C]"f"(C),                    [D]"f"(D),
1914           [ff_pw_28]"f"(ff_pw_28)
1915         : "memory"
1916     );
1917 }
1918
1919 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1920                                       uint8_t *src /* align 1 */,
1921                                       int stride, int h, int x, int y)
1922 {
1923     const int A = (8 - x) * (8 - y);
1924     const int B =     (x) * (8 - y);
1925     const int C = (8 - x) *     (y);
1926     const int D =     (x) *     (y);
1927     double ftmp[6];
1928     uint32_t tmp[1];
1929     DECLARE_VAR_LOW32;
1930     DECLARE_VAR_ADDRT;
1931
1932     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1933
1934     __asm__ volatile(
1935         "li         %[tmp0],    0x06                                    \n\t"
1936         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1937         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
1938         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1939         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1940         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1941         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1942
1943         "1:                                                             \n\t"
1944         MMI_ULWC1(%[ftmp1], %[src], 0x00)
1945         MMI_ULWC1(%[ftmp2], %[src], 0x01)
1946         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
1947         MMI_ULWC1(%[ftmp3], %[src], 0x00)
1948         MMI_ULWC1(%[ftmp4], %[src], 0x01)
1949
1950         CHROMA_MC_4_MMI
1951
1952         MMI_SWC1(%[ftmp1], %[dst], 0x00)
1953         "addiu      %[h],       %[h],      -0x01                        \n\t"
1954         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
1955         "bnez       %[h],       1b                                      \n\t"
1956         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1957           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1958           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1959           [tmp0]"=&r"(tmp[0]),
1960           RESTRICT_ASM_LOW32
1961           RESTRICT_ASM_ADDRT
1962           [src]"+&r"(src),              [dst]"+&r"(dst),
1963           [h]"+&r"(h)
1964         : [stride]"r"((mips_reg)stride),
1965           [A]"f"(A),                    [B]"f"(B),
1966           [C]"f"(C),                    [D]"f"(D),
1967           [ff_pw_28]"f"(ff_pw_28)
1968         : "memory"
1969     );
1970 }
1971
1972 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1973                                       uint8_t *src /* align 1 */,
1974                                       int stride, int h, int x, int y)
1975 {
1976     const int A = (8 - x) * (8 - y);
1977     const int B =     (x) * (8 - y);
1978     const int C = (8 - x) *     (y);
1979     const int D =     (x) *     (y);
1980     double ftmp[10];
1981     uint32_t tmp[1];
1982     DECLARE_VAR_ALL64;
1983     DECLARE_VAR_ADDRT;
1984
1985     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1986
1987     __asm__ volatile(
1988         "li         %[tmp0],    0x06                                    \n\t"
1989         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1990         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
1991         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
1992         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
1993         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
1994         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
1995
1996         "1:                                                             \n\t"
1997         MMI_ULDC1(%[ftmp1], %[src], 0x00)
1998         MMI_ULDC1(%[ftmp2], %[src], 0x01)
1999         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2000         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2001         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2002
2003         CHROMA_MC_8_MMI
2004
2005         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2006         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2007
2008         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2009         "addiu      %[h],       %[h],      -0x01                        \n\t"
2010         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2011         "bnez       %[h],       1b                                      \n\t"
2012         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2013           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2014           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2015           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2016           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2017           [tmp0]"=&r"(tmp[0]),
2018           RESTRICT_ASM_ALL64
2019           RESTRICT_ASM_ADDRT
2020           [src]"+&r"(src),              [dst]"+&r"(dst),
2021           [h]"+&r"(h)
2022         : [stride]"r"((mips_reg)stride),
2023           [A]"f"(A),                    [B]"f"(B),
2024           [C]"f"(C),                    [D]"f"(D),
2025           [ff_pw_28]"f"(ff_pw_28)
2026         : "memory"
2027     );
2028 }
2029
2030 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2031                                       uint8_t *src /* align 1 */,
2032                                       int stride, int h, int x, int y)
2033 {
2034     const int A = (8 - x) * (8 - y);
2035     const int B =     (x) * (8 - y);
2036     const int C = (8 - x) *     (y);
2037     const int D =     (x) *     (y);
2038     double ftmp[6];
2039     uint32_t tmp[1];
2040     DECLARE_VAR_LOW32;
2041     DECLARE_VAR_ADDRT;
2042
2043     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2044
2045     __asm__ volatile(
2046         "li         %[tmp0],    0x06                                    \n\t"
2047         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2048         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2049         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2050         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2051         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2052         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2053
2054         "1:                                                             \n\t"
2055         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2056         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2057         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2058         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2059         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2060
2061         CHROMA_MC_4_MMI
2062
2063         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2064         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2065
2066         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2067         "addiu      %[h],       %[h],      -0x01                        \n\t"
2068         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2069         "bnez       %[h],       1b                                      \n\t"
2070         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2071           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2072           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2073           [tmp0]"=&r"(tmp[0]),
2074           RESTRICT_ASM_LOW32
2075           RESTRICT_ASM_ADDRT
2076           [src]"+&r"(src),              [dst]"+&r"(dst),
2077           [h]"+&r"(h)
2078         : [stride]"r"((mips_reg)stride),
2079           [A]"f"(A),                    [B]"f"(B),
2080           [C]"f"(C),                    [D]"f"(D),
2081           [ff_pw_28]"f"(ff_pw_28)
2082         : "memory"
2083     );
2084 }