git.sesse.net Git - ffmpeg/blob - libavcodec/mips/vc1dsp_mmi.c

   1 /*
   2  * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
   3  *
   4  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "libavutil/avassert.h"
  24 #include "libavcodec/vc1dsp.h"
  25 #include "constants.h"
  26 #include "vc1dsp_mips.h"
  27 #include "hpeldsp_mips.h"
  28 #include "libavutil/mips/mmiutils.h"
  29
     /*
      * VC1_INV_TRANCS_8_TYPE1 - asm fragment producing one pair of output
      * registers of the 8-point inverse transform, with the rounding constant
      * applied before the final add/subtract stage.
      *
      * Arguments:
      *   o1, o2  destination registers; each is assembled from four results.
      *   r1..r4  32-bit immediates. Each is loaded with li, moved to an FP
      *           register and replicated into both 32-bit halves (punpcklwd)
      *           before being used as the pmaddhw multiplier.
      *   c0      register added to the first two sums (ftmp1, ftmp2).
      *
      * Implicit operands that the surrounding asm statement must provide:
      *   ftmp5, ftmp6   multiplied by r1        ftmp7,  ftmp8   multiplied by r2
      *   ftmp9, ftmp10  multiplied by r3        ftmp11, ftmp12  multiplied by r4
      *   ftmp0          shift count for psraw
      *   tmp0, ftmp1..ftmp4, ftmp13 and ftmp14 are overwritten as scratch.
      *
      * Data flow:
      *   A = pmaddhw(ftmp5|ftmp6, r1) + pmaddhw(ftmp7|ftmp8, r2) + c0
      *   B = pmaddhw(ftmp9|ftmp10, r3) + pmaddhw(ftmp11|ftmp12, r4)
      *   o1 is packed from (A + B) >> ftmp0, o2 from (A - B) >> ftmp0.
      * The closing punpcklhw/punpckhhw/punpcklhw triple presumably narrows the
      * four 32-bit words to four 16-bit values -- confirm against the Loongson
      * MMI instruction reference.
      */
  30 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
  31         "li         %[tmp0],    "#r1"                                 \n\t" \
  32         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  33         "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
  34         "li         %[tmp0],    "#r2"                                 \n\t" \
  35         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  36         "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
  37         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
  38         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
  39         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
  40         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
  41         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
  42         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
  43                                                                             \
  44         "li         %[tmp0],    "#r3"                                 \n\t" \
  45         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  46         "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
  47         "li         %[tmp0],    "#r4"                                 \n\t" \
  48         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  49         "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
  50         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
  51         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
  52         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
  53         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
  54         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
  55         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
  56                                                                             \
  57         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
  58         "paddw      %[ftmp2],   %[ftmp2],   "#c0"                     \n\t" \
  59         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
  60         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
  61         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
  62         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
  63         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
  64         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
  65         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
  66         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
  67         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
  68         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
  69         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
  70         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
  71         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
  72         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
  73
     /*
      * VC1_INV_TRANCS_8_TYPE2 - asm fragment producing one pair of output
      * registers of the 8-point inverse transform, with the rounding constants
      * applied after the add/subtract stage.
      *
      * Arguments:
      *   o1, o2  destination registers; each is assembled from four results.
      *   r1..r4  32-bit immediates. Each is loaded with li, moved to an FP
      *           register and replicated into both 32-bit halves (punpcklwd)
      *           before being used as the pmaddhw multiplier.
      *   c0      register added to all four sum/difference results.
      *   c1      register added additionally to the two difference results
      *           (ftmp14 and ftmp3), i.e. the ones that end up in o2.
      *
      * Implicit operands that the surrounding asm statement must provide:
      *   ftmp5, ftmp6   multiplied by r1        ftmp7,  ftmp8   multiplied by r2
      *   ftmp9, ftmp10  multiplied by r3        ftmp11, ftmp12  multiplied by r4
      *   ftmp0          shift count for psraw
      *   tmp0, ftmp1..ftmp4, ftmp13 and ftmp14 are overwritten as scratch.
      *
      * Data flow:
      *   A = pmaddhw(ftmp5|ftmp6, r1) + pmaddhw(ftmp7|ftmp8, r2)
      *   B = pmaddhw(ftmp9|ftmp10, r3) + pmaddhw(ftmp11|ftmp12, r4)
      *   o1 is packed from (A + B + c0) >> ftmp0,
      *   o2 is packed from (A - B + c1 + c0) >> ftmp0.
      * The closing punpcklhw/punpckhhw/punpcklhw triple presumably narrows the
      * four 32-bit words to four 16-bit values -- confirm against the Loongson
      * MMI instruction reference.
      */
  74 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
  75         "li         %[tmp0],    "#r1"                                 \n\t" \
  76         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  77         "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
  78         "li         %[tmp0],    "#r2"                                 \n\t" \
  79         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  80         "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
  81         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
  82         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
  83         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
  84         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
  85         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
  86         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
  87                                                                             \
  88         "li         %[tmp0],    "#r3"                                 \n\t" \
  89         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  90         "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
  91         "li         %[tmp0],    "#r4"                                 \n\t" \
  92         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  93         "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
  94         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
  95         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
  96         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
  97         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
  98         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
  99         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
 100                                                                             \
 101         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
 102         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
 103         "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
 104         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
 105         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
 106         "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
 107         "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
 108         "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
 109         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
 110         "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
 111         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
 112         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
 113         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
 114         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
 115         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
 116         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
 117         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
 118         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
 119         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
 120         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
 121
 122 /*
      * DC-only inverse transform for an 8x8 block.
      *
      * block[0] is scaled in two steps ((3 * dc + 1) >> 1, then
      * (3 * dc + 16) >> 5) and the result is added to every pixel of the 8x8
      * area at dest. Pixels are widened from bytes to halfwords, added with
      * paddsh and packed back to bytes with packushb.
      *
      * dest:     top-left pixel of the area to update in place
      * linesize: byte distance between consecutive rows of dest
      * block:    coefficient block; only block[0] is read
      */
 123 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 124 {
 125     int dc = block[0];
 126     double ftmp[9];
 127     mips_reg addr[1];
 128     int count;
 129
 130     dc = (3 * dc +  1) >> 1;
 131     dc = (3 * dc + 16) >> 5;
 132
 133     __asm__ volatile(
             /* ftmp0 = 0: zero source for the byte unpacks and pshufh selector. */
 134         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
             /* Presumably replicates the low halfword of dc into all four lanes.
              * NOTE(review): %[dc] is written here but is declared only as an
              * input operand in the constraint list below -- confirm. */
 135         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
             /* Two loop iterations of four rows each cover the eight rows. */
 136         "li         %[count],   0x02                                    \n\t"
 137
 138         "1:                                                             \n\t"
             /* Load four consecutive rows of eight pixels. */
 139         MMI_LDC1(%[ftmp1], %[dest], 0x00)
 140         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 141         MMI_LDC1(%[ftmp2], %[addr0], 0x00)
 142         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 143         MMI_LDC1(%[ftmp3], %[addr0], 0x00)
 144         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 145         MMI_LDC1(%[ftmp4], %[addr0], 0x00)
 146
             /* Widen each row to halfwords: high half in ftmp5..8, low in ftmp1..4. */
 147         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 148         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 149         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 150         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 151         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 152         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 153         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 154         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 155
             /* Add the DC value to every halfword. */
 156         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 157         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 158         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 159         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 160         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 161         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 162         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 163         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 164
             /* Re-pack low/high halves of each row into eight bytes. */
 165         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 166         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 167         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 168         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 169
             /* Store the four rows back to where they were loaded from. */
 170         MMI_SDC1(%[ftmp1], %[dest], 0x00)
 171         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 172         MMI_SDC1(%[ftmp2], %[addr0], 0x00)
 173         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 174         MMI_SDC1(%[ftmp3], %[addr0], 0x00)
 175         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 176         MMI_SDC1(%[ftmp4], %[addr0], 0x00)
 177
             /* addr0 points at the fourth row, so dest advances by four rows. */
 178         "addiu      %[count],   %[count],       -0x01                   \n\t"
 179         PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
 180         "bnez       %[count],   1b                                      \n\t"
 181         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 182           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 183           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 184           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 185           [ftmp8]"=&f"(ftmp[8]),
 186           [addr0]"=&r"(addr[0]),
 187           [count]"=&r"(count),          [dest]"+&r"(dest)
 188         : [linesize]"r"((mips_reg)linesize),
 189           [dc]"f"(dc)
 190         : "memory"
 191     );
 192 }
 193
 194 #if _MIPS_SIM != _ABIO32
     /*
      * Full 8x8 inverse transform, performed in place on block.
      * Only compiled when _MIPS_SIM is not _ABIO32.
      *
      * The work is done in a single asm statement with two passes:
      *   1st loop: reads block, uses VC1_INV_TRANCS_8_TYPE1 with the constant 4
      *             and a shift count of 3, transposes the results with
      *             TRANSPOSE_4H (defined outside this file) and keeps them in
      *             temp / in registers.
      *   2nd loop: reads the intermediate data, uses VC1_INV_TRANCS_8_TYPE2 with
      *             the constants 64 and 1 and a shift count of 7, and writes the
      *             final values back to block.
      * Each pass is split into a "1st part" and a "2nd part" that handle the
      * byte offsets 0x00.. and 0x08.. of every 16-byte row respectively.
      *
      * block: 64 coefficients, read and overwritten.
      */
 195 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
 196 {
 197     DECLARE_ALIGNED(16, int16_t, temp[64]);
         /* Each constant holds the same value in both 32-bit halves; they are
          * added with paddw inside the transform macros. */
 198     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
 199     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 200     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 201     double ftmp[23];
 202     uint64_t tmp[1];
 203
 204     __asm__ volatile (
 205         /* 1st loop: start */
             /* ftmp0 = 3 is the psraw shift count used by the TYPE1 macro. */
 206         "li         %[tmp0],    0x03                                    \n\t"
 207         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 208
 209        // 1st part
 210         MMI_LDC1(%[ftmp1], %[block], 0x00)
 211         MMI_LDC1(%[ftmp11], %[block], 0x10)
 212         MMI_LDC1(%[ftmp2], %[block], 0x20)
 213         MMI_LDC1(%[ftmp12], %[block], 0x30)
 214         MMI_LDC1(%[ftmp3], %[block], 0x40)
 215         MMI_LDC1(%[ftmp13], %[block], 0x50)
 216         MMI_LDC1(%[ftmp4], %[block], 0x60)
 217         MMI_LDC1(%[ftmp14], %[block], 0x70)
             /* Interleave rows 0/2 and 4/6 into ftmp5..ftmp8 ... */
 218         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 219         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 220         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 221         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 222
             /* ... and rows 1/3 and 5/7 into ftmp9..ftmp12, as the macros expect. */
 223         "punpcklhw  %[ftmp9],  %[ftmp11],  %[ftmp12]                    \n\t"
 224         "punpckhhw  %[ftmp10], %[ftmp11],  %[ftmp12]                    \n\t"
 225         "punpcklhw  %[ftmp11], %[ftmp13],  %[ftmp14]                    \n\t"
 226         "punpckhhw  %[ftmp12], %[ftmp13],  %[ftmp14]                    \n\t"
 227
 228         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 229         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 230                                0x000f0010, 0x00040009, %[ff_pw_4])
 231
 232         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 233         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 234                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
 235
 236         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 237         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 238                                0xfff00009, 0x000f0004, %[ff_pw_4])
 239
 240         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 241         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 242                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 243
 244         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
 245                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 246
 247         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
 248                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 249
             /* Transposed results fill the first 0x40 bytes of temp. */
 250         MMI_SDC1(%[ftmp15], %[temp], 0x00)
 251         MMI_SDC1(%[ftmp19], %[temp], 0x08)
 252         MMI_SDC1(%[ftmp16], %[temp], 0x10)
 253         MMI_SDC1(%[ftmp20], %[temp], 0x18)
 254         MMI_SDC1(%[ftmp17], %[temp], 0x20)
 255         MMI_SDC1(%[ftmp21], %[temp], 0x28)
 256         MMI_SDC1(%[ftmp18], %[temp], 0x30)
 257         MMI_SDC1(%[ftmp22], %[temp], 0x38)
 258
 259        // 2nd part
 260         MMI_LDC1(%[ftmp1], %[block], 0x08)
 261         MMI_LDC1(%[ftmp11], %[block], 0x18)
 262         MMI_LDC1(%[ftmp2], %[block], 0x28)
 263         MMI_LDC1(%[ftmp12], %[block], 0x38)
 264         MMI_LDC1(%[ftmp3], %[block], 0x48)
 265         MMI_LDC1(%[ftmp13], %[block], 0x58)
 266         MMI_LDC1(%[ftmp4], %[block], 0x68)
 267         MMI_LDC1(%[ftmp14], %[block], 0x78)
 268         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 269         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 270         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 271         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 272
 273         "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
 274         "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
 275         "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
 276         "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
 277
 278         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 279         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 280                                0x000f0010, 0x00040009, %[ff_pw_4])
 281
 282         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 283         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 284                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
 285
 286         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 287         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 288                                0xfff00009, 0x000f0004, %[ff_pw_4])
 289
 290         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 291         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 292                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 293
 294         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
 295                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 296
 297         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
 298                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 299
             /* Only ftmp19..ftmp22 go to temp here. ftmp15..ftmp18 are kept in
              * registers and consumed directly by the 1st part of the 2nd loop,
              * so temp offsets 0x40, 0x50, 0x60 and 0x70 are never written or
              * read. */
 300         MMI_SDC1(%[ftmp19], %[temp], 0x48)
 301         MMI_SDC1(%[ftmp20], %[temp], 0x58)
 302         MMI_SDC1(%[ftmp21], %[temp], 0x68)
 303         MMI_SDC1(%[ftmp22], %[temp], 0x78)
 304         /* 1st loop: end */
 305
 306         /* 2nd loop: start */
             /* ftmp0 = 7 is the psraw shift count used by the TYPE2 macro. */
 307         "li         %[tmp0],    0x07                                    \n\t"
 308         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 309
 310         // 1st part
 311         MMI_LDC1(%[ftmp1], %[temp], 0x00)
 312         MMI_LDC1(%[ftmp11], %[temp], 0x10)
 313         MMI_LDC1(%[ftmp2], %[temp], 0x20)
 314         MMI_LDC1(%[ftmp12], %[temp], 0x30)
 315         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 316         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
             /* ftmp15..ftmp18 still hold the values left by the 1st loop; they
              * must be read here before the macros below overwrite them. */
 317         "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]                   \n\t"
 318         "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]                   \n\t"
 319
 320         "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
 321         "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
 322         "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]                   \n\t"
 323         "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]                   \n\t"
 324
 325         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 326         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 327                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
 328
 329         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 330         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 331                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
 332
 333         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 334         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 335                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
 336
 337         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 338         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 339                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
 340
             /* No transpose in this pass: each result register is one output row
              * (first 8 bytes of the row). */
 341         MMI_SDC1(%[ftmp15], %[block], 0x00)
 342         MMI_SDC1(%[ftmp16], %[block], 0x10)
 343         MMI_SDC1(%[ftmp17], %[block], 0x20)
 344         MMI_SDC1(%[ftmp18], %[block], 0x30)
 345         MMI_SDC1(%[ftmp19], %[block], 0x40)
 346         MMI_SDC1(%[ftmp20], %[block], 0x50)
 347         MMI_SDC1(%[ftmp21], %[block], 0x60)
 348         MMI_SDC1(%[ftmp22], %[block], 0x70)
 349
 350        // 2nd part
 351         MMI_LDC1(%[ftmp1], %[temp], 0x08)
 352         MMI_LDC1(%[ftmp11], %[temp], 0x18)
 353         MMI_LDC1(%[ftmp2], %[temp], 0x28)
 354         MMI_LDC1(%[ftmp12], %[temp], 0x38)
 355         MMI_LDC1(%[ftmp3], %[temp], 0x48)
 356         MMI_LDC1(%[ftmp13], %[temp], 0x58)
 357         MMI_LDC1(%[ftmp4], %[temp], 0x68)
 358         MMI_LDC1(%[ftmp14], %[temp], 0x78)
 359         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 360         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 361         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 362         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 363
 364         "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
 365         "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
 366         "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
 367         "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"
 368
 369         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 370         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 371                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
 372
 373         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 374         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 375                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
 376
 377         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 378         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 379                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
 380
 381         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 382         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 383                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
 384
             /* Second 8 bytes of each output row. */
 385         MMI_SDC1(%[ftmp15], %[block], 0x08)
 386         MMI_SDC1(%[ftmp16], %[block], 0x18)
 387         MMI_SDC1(%[ftmp17], %[block], 0x28)
 388         MMI_SDC1(%[ftmp18], %[block], 0x38)
 389         MMI_SDC1(%[ftmp19], %[block], 0x48)
 390         MMI_SDC1(%[ftmp20], %[block], 0x58)
 391         MMI_SDC1(%[ftmp21], %[block], 0x68)
 392         MMI_SDC1(%[ftmp22], %[block], 0x78)
 393         /* 2nd loop: end */
 394         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 395           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 396           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 397           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 398           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 399           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 400           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 401           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 402           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
 403           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
 404           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
 405           [ftmp22]"=&f"(ftmp[22]),
 406           [tmp0]"=&r"(tmp[0])
 407         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
 408           [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
 409           [temp]"r"(temp)
 410         : "memory"
 411     );
 412 }
 413 #endif
 414
 415 /*
      * DC-only inverse transform for the 8x4 part of a block.
      *
      * block[0] is scaled in two steps ((3 * dc + 1) >> 1, then
      * (17 * dc + 64) >> 7) and the result is added to every pixel of the
      * 8-wide, 4-row area at dest. Pixels are widened from bytes to halfwords,
      * added with paddsh and packed back to bytes with packushb.
      *
      * dest:     top-left pixel of the area to update in place
      * linesize: byte distance between consecutive rows of dest
      * block:    coefficient block; only block[0] is read
      */
 416 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 417 {
 418     int dc = block[0];
 419     double ftmp[9];
 420
 421     dc = ( 3 * dc +  1) >> 1;
 422     dc = (17 * dc + 64) >> 7;
 423
 424     __asm__ volatile(
             /* ftmp0 = 0: zero source for the byte unpacks and pshufh selector. */
 425         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
             /* Presumably replicates the low halfword of dc into all four lanes.
              * NOTE(review): %[dc] is written here but is declared only as an
              * input operand in the constraint list below -- confirm. */
 426         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 427
             /* The four row addresses are precomputed in C (dest0..dest3). */
 428         MMI_LDC1(%[ftmp1], %[dest0], 0x00)
 429         MMI_LDC1(%[ftmp2], %[dest1], 0x00)
 430         MMI_LDC1(%[ftmp3], %[dest2], 0x00)
 431         MMI_LDC1(%[ftmp4], %[dest3], 0x00)
 432
             /* Widen each row to halfwords: high half in ftmp5..8, low in ftmp1..4. */
 433         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 434         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 435         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 436         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 437         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 438         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 439         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 440         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 441
             /* Add the DC value to every halfword. */
 442         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 443         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 444         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 445         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 446         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 447         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 448         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 449         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 450
             /* Re-pack low/high halves of each row into eight bytes. */
 451         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 452         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 453         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 454         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 455
 456         MMI_SDC1(%[ftmp1], %[dest0], 0x00)
 457         MMI_SDC1(%[ftmp2], %[dest1], 0x00)
 458         MMI_SDC1(%[ftmp3], %[dest2], 0x00)
 459         MMI_SDC1(%[ftmp4], %[dest3], 0x00)
 460         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 461           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 462           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 463           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 464           [ftmp8]"=&f"(ftmp[8])
 465         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 466           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 467           [dc]"f"(dc)
 468         : "memory"
 469     );
 470 }
 471
 472 #if _MIPS_SIM != _ABIO32
 473 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 474 {
 475     int16_t *src = block;
 476     int16_t *dst = block;
 477     double ftmp[16];
 478     uint32_t tmp[1];
 479     int16_t count = 4;
 480     DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 481     DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 482     int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
 483                          12, 15,   6,  -4, -12, -16, -16,  -9,
 484                          12,  9,  -6, -16, -12,   4,  16,  15,
 485                          12,  4, -16,  -9,  12,  15,  -6, -16,
 486                          12, -4, -16,   9,  12, -15,  -6,  16,
 487                          12, -9,  -6,  16, -12,  -4,  16, -15,
 488                          12, -15,  6,   4, -12,  16, -16,   9,
 489                          12, -16, 16, -15,  12,  -9,   6,  -4};
 490
 491     // 1st loop
 492     __asm__ volatile (
 493         "li         %[tmp0],    0x03                                    \n\t"
 494         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 495
 496         "1:                                                             \n\t"
 497         MMI_LDC1(%[ftmp1], %[src], 0x00)
 498         MMI_LDC1(%[ftmp2], %[src], 0x08)
 499
 500         /* ftmp11: dst1,dst0 */
 501         MMI_LDC1(%[ftmp3], %[coeff], 0x00)
 502         MMI_LDC1(%[ftmp4], %[coeff], 0x08)
 503         MMI_LDC1(%[ftmp5], %[coeff], 0x10)
 504         MMI_LDC1(%[ftmp6], %[coeff], 0x18)
 505         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 506         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 507         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 508         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 509         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 510         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 511         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 512         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 513         "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
 514         "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
 515
 516         /* ftmp12: dst3,dst2 */
 517         MMI_LDC1(%[ftmp3], %[coeff], 0x20)
 518         MMI_LDC1(%[ftmp4], %[coeff], 0x28)
 519         MMI_LDC1(%[ftmp5], %[coeff], 0x30)
 520         MMI_LDC1(%[ftmp6], %[coeff], 0x38)
 521         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 522         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 523         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 524         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 525         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 526         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 527         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 528         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 529         "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
 530         "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
 531
 532         /* ftmp13: dst5,dst4 */
 533         MMI_LDC1(%[ftmp3], %[coeff], 0x40)
 534         MMI_LDC1(%[ftmp4], %[coeff], 0x48)
 535         MMI_LDC1(%[ftmp5], %[coeff], 0x50)
 536         MMI_LDC1(%[ftmp6], %[coeff], 0x58)
 537         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 538         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 539         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 540         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 541         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 542         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 543         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 544         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 545         "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
 546         "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
 547
 548         /* ftmp14: dst7,dst6 */
 549         MMI_LDC1(%[ftmp3], %[coeff], 0x60)
 550         MMI_LDC1(%[ftmp4], %[coeff], 0x68)
 551         MMI_LDC1(%[ftmp5], %[coeff], 0x70)
 552         MMI_LDC1(%[ftmp6], %[coeff], 0x78)
 553         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 554         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 555         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 556         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 557         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 558         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 559         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 560         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 561         "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
 562         "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
 563
 564         /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
 565         "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
 566         "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
 567         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
 568         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
 569         "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
 570         "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
 571         "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 572         "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
 573         "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
 574         "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 575         MMI_SDC1(%[ftmp9], %[dst], 0x00)
 576         MMI_SDC1(%[ftmp10], %[dst], 0x08)
 577
 578         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
 579         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
 580         "addiu      %[count],   %[count],   -0x01                       \n\t"
 581         "bnez       %[count],   1b                                      \n\t"
 582         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 583           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 584           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 585           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 586           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 587           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 588           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 589           [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
 590           [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
 591         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
 592         : "memory"
 593     );
 594
 595     src = block;
 596
 597     // 2nd loop
 598     __asm__ volatile (
 599         "li         %[tmp0],    0x44                                    \n\t"
 600         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
 601
 602         // 1st part
 603         "li         %[tmp0],    0x07                                    \n\t"
 604         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 605         MMI_LDC1(%[ftmp1], %[src], 0x00)
 606         MMI_LDC1(%[ftmp2], %[src], 0x10)
 607         MMI_LDC1(%[ftmp3], %[src], 0x20)
 608         MMI_LDC1(%[ftmp4], %[src], 0x30)
 609         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 610         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 611         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 612         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 613
 614         /* ftmp11: dst03,dst02,dst01,dst00 */
 615         "li         %[tmp0],    0x00160011                              \n\t"
 616         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 617         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 618         "li         %[tmp0],    0x000a0011                              \n\t"
 619         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 620         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 621         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 622         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 623         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 624         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 625         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 626         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 627         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 628         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 629         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 630         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 631         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 632         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 633         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
 634
 635         /* ftmp12: dst13,dst12,dst11,dst10 */
 636         "li         %[tmp0],    0x000a0011                              \n\t"
 637         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 638         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 639         "li         %[tmp0],    0xffeaffef                              \n\t"
 640         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 641         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 642         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 643         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 644         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 645         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 646         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 647         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 648         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 649         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 650         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 651         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 652         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 653         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 654         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
 655
 656         /* ftmp13: dst23,dst22,dst21,dst20 */
 657         "li         %[tmp0],    0xfff60011                              \n\t"
 658         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 659         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 660         "li         %[tmp0],    0x0016ffef                              \n\t"
 661         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 662         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 663         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 664         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 665         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 666         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 667         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 668         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 669         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 670         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 671         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 672         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 673         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 674         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 675         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
 676
 677         /* ftmp14: dst33,dst32,dst31,dst30 */
 678         "li         %[tmp0],    0xffea0011                              \n\t"
 679         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 680         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 681         "li         %[tmp0],    0xfff60011                              \n\t"
 682         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 683         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 684         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 685         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 686         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 687         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 688         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 689         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 690         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 691         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 692         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 693         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 694         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 695         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 696         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
 697
 698         MMI_LWC1(%[ftmp1], %[dest], 0x00)
 699         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
 700         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
 701         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 702         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
 703         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 704         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
 705         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 706         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 707         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 708         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 709         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 710         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
 711         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
 712         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
 713         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
 714         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 715         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 716         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 717         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 718         MMI_SWC1(%[ftmp1], %[dest], 0x00)
 719         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
 720         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
 721         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 722         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
 723         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 724         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
 725
 726         // 2nd part
 727         "li         %[tmp0],    0x07                                    \n\t"
 728         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 729         MMI_LDC1(%[ftmp1], %[src], 0x08)
 730         MMI_LDC1(%[ftmp2], %[src], 0x18)
 731         MMI_LDC1(%[ftmp3], %[src], 0x28)
 732         MMI_LDC1(%[ftmp4], %[src], 0x38)
 733         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 734         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 735         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 736         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 737
 738         /* ftmp11: dst03,dst02,dst01,dst00 */
 739         "li         %[tmp0],    0x00160011                              \n\t"
 740         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 741         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 742         "li         %[tmp0],    0x000a0011                              \n\t"
 743         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 744         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 745         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 746         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 747         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 748         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 749         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 750         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 751         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 752         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 753         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 754         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 755         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 756         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 757         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
 758
 759         /* ftmp12: dst13,dst12,dst11,dst10 */
 760         "li         %[tmp0],    0x000a0011                              \n\t"
 761         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 762         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 763         "li         %[tmp0],    0xffeaffef                              \n\t"
 764         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 765         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 766         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 767         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 768         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 769         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 770         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 771         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 772         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 773         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 774         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 775         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 776         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 777         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 778         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
 779
 780         /* ftmp13: dst23,dst22,dst21,dst20 */
 781         "li         %[tmp0],    0xfff60011                              \n\t"
 782         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 783         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 784         "li         %[tmp0],    0x0016ffef                              \n\t"
 785         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 786         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 787         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 788         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 789         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 790         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 791         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 792         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 793         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 794         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 795         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 796         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 797         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 798         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 799         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
 800
 801         /* ftmp14: dst33,dst32,dst31,dst30 */
 802         "li         %[tmp0],    0xffea0011                              \n\t"
 803         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 804         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 805         "li         %[tmp0],    0xfff60011                              \n\t"
 806         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 807         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 808         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 809         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 810         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 811         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 812         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 813         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 814         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 815         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 816         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 817         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 818         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 819         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 820         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
 821
 822         MMI_LWC1(%[ftmp1], %[dest], 0x04)
 823         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
 824         MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
 825         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 826         MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
 827         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 828         MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
 829         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 830         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 831         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 832         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 833         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 834         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
 835         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
 836         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
 837         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
 838         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 839         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 840         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 841         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 842         MMI_SWC1(%[ftmp1], %[dest], 0x04)
 843         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
 844         MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
 845         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 846         MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
 847         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 848         MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
 849
 850         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 851           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 852           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 853           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 854           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 855           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 856           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 857           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 858           [tmp0]"=&r"(tmp[0])
 859         : [ff_pw_64]"f"(ff_pw_64_local),
 860           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
 861         :"memory"
 862     );
 863 }
 864 #endif
 865
 866 /* Do inverse transform on 4x8 parts of block.
      *
      * DC-only variant: block[0] is the only coefficient read. It is scaled in
      * two integer steps and the result is added, with saturation, to the
      * pixels of eight consecutive rows of dest (dest + k * linesize, k = 0..7).
      * Each row is accessed through MMI_LWC1/MMI_SWC1 at offset 0 -- presumably
      * one 32-bit word, i.e. 4 pixels per row; confirm against mmiutils.h.
      */
 867 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 868 {
 869     int dc = block[0];     /* the only coefficient this function uses */
 870     double ftmp[9];        /* storage bound to the ftmp0..ftmp8 asm operands */
 871     DECLARE_VAR_LOW32;     /* project macro; presumably pairs with RESTRICT_ASM_LOW32 below -- confirm */
 872     /* Two-stage DC scaling: x17 rounded >> 3, then x12 rounded >> 7. */
 873     dc = (17 * dc +  4) >> 3;
 874     dc = (12 * dc + 64) >> 7;
 875     /* NOTE(review): %[dc] is an input-only operand ("f") but pshufh below overwrites it -- consider "+f". */
 876     __asm__ volatile(
 877         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
 878         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t" /* zero selector: copy low halfword of dc to all 4 lanes */
 879         /* Load the current pixels from each of the eight destination rows. */
 880         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
 881         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
 882         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
 883         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
 884         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
 885         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
 886         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
 887         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
 888         /* Interleave the low bytes with zero (ftmp0): widen pixels to 16-bit halfwords. */
 889         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 890         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 891         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 892         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 893         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 894         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 895         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 896         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 897         /* Add the replicated DC to every lane (signed saturating halfword add). */
 898         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 899         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 900         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 901         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 902         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 903         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 904         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 905         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 906         /* Pack halfwords back to bytes with unsigned saturation (clamps to 0..255). */
 907         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 908         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 909         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 910         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 911         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 912         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 913         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 914         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 915         /* Write the updated pixels back to the same eight rows. */
 916         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
 917         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
 918         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
 919         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
 920         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
 921         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
 922         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
 923         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
 924         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 925           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 926           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 927           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 928           RESTRICT_ASM_LOW32            /* project macro; presumably an optional scratch operand -- confirm in mmiutils.h */
 929           [ftmp8]"=&f"(ftmp[8])
 930         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 931           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 932           [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
 933           [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
 934           [dc]"f"(dc)                   /* scaled DC, passed in a floating-point register */
 935         : "memory"                      /* dest rows are read and written through the pointer operands */
 936     );
 937 }
 938
 939 #if _MIPS_SIM != _ABIO32
 940 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 941 {
 942     int16_t *src = block;
 943     int16_t *dst = block;
 944     double ftmp[23];
 945     uint32_t count = 8, tmp[1];
 946     int16_t coeff[16] = {17, 22, 17, 10,
 947                          17, 10,-17,-22,
 948                          17,-10,-17, 22,
 949                          17,-22, 17,-10};
 950     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
 951     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 952     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 953
 954     // 1st loop
 955     __asm__ volatile (
 956
 957         "li         %[tmp0],    0x03                                    \n\t"
 958         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 959
 960         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
 961         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
 962         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
 963         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
 964         "1:                                                             \n\t"
 965         /* ftmp8: dst3,dst2,dst1,dst0 */
 966         MMI_LDC1(%[ftmp1], %[src], 0x00)
 967         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
 968         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
 969         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
 970         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
 971         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
 972         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
 973         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
 974         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
 975         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
 976         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
 977         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
 978         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
 979         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
 980         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 981         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
 982         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
 983         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
 984         MMI_SDC1(%[ftmp8], %[dst], 0x00)
 985
 986         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
 987         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
 988         "addiu      %[count],   %[count],   -0x01                       \n\t"
 989         "bnez       %[count],   1b                                      \n\t"
 990         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 991           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 992           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 993           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 994           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 995           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 996           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
 997           [src]"+&r"(src),              [dst]"+&r"(dst)
 998         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
 999         : "memory"
1000     );
1001
1002     src = block;
1003
1004     // 2nd loop
1005     __asm__ volatile (
1006         "li         %[tmp0],    0x07                                    \n\t"
1007         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1008
1009         MMI_LDC1(%[ftmp1], %[src], 0x00)
1010         MMI_LDC1(%[ftmp2], %[src], 0x20)
1011         MMI_LDC1(%[ftmp3], %[src], 0x40)
1012         MMI_LDC1(%[ftmp4], %[src], 0x60)
1013         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1014         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1015         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1016         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1017
1018         MMI_LDC1(%[ftmp1], %[src], 0x10)
1019         MMI_LDC1(%[ftmp2], %[src], 0x30)
1020         MMI_LDC1(%[ftmp3], %[src], 0x50)
1021         MMI_LDC1(%[ftmp4], %[src], 0x70)
1022         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1023         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1024         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1025         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1026
1027         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1028         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1029                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1030
1031         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1032         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1033                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1034
1035         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1036         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1037                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1038
1039         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1040         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1041                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1042
1043         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1044         PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1045         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1046         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1047         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1048         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1049         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1050         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1051         MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1052         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1053         MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1054         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1055         MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1056         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1057         MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1058         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1059         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1060         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1061         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1062         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1063         "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1064         "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1065         "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1066         "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1067
1068         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1069         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1070         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1071         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1072         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1073         "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1074         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1075         "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1076
1077         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1078         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1079         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1080         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1081         "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1082         "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1083         "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1084         "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1085
1086         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1087         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1088         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1089         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1090         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1091         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1092         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1093         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1094         MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1095         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1096         MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1097         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1098         MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1099         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1100         MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1101
1102         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1103           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1104           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1105           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1106           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1107           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1108           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1109           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1110           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1111           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1112           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1113           [ftmp22]"=&f"(ftmp[22]),
1114           [tmp0]"=&r"(tmp[0])
1115         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
1116           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1117         : "memory"
1118     );
1119 }
1120 #endif
1121
1122 /* Do inverse transform on 4x4 part of block */
1123 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1124 {
1125     int dc = block[0];
1126     double ftmp[5];
1127     DECLARE_VAR_LOW32;
1128
1129     dc = (17 * dc +  4) >> 3;
1130     dc = (17 * dc + 64) >> 7;
1131
1132     __asm__ volatile(
1133         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1134         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1135
1136         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1137         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1138         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1139         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1140
1141         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1142         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1143         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1144         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1145
1146         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1147         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1148         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1149         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1150
1151         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1152         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1153         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1154         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1155
1156         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1157         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1158         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1159         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1160         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1161           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1162           RESTRICT_ASM_LOW32
1163           [ftmp4]"=&f"(ftmp[4])
1164         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1165           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1166           [dc]"f"(dc)
1167         : "memory"
1168     );
1169 }
1170
1171 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1172 {
1173     int16_t *src = block;
1174     int16_t *dst = block;
1175     double ftmp[16];
1176     uint32_t count = 4, tmp[1];
1177     int16_t coeff[16] = {17, 22, 17, 10,
1178                          17, 10,-17,-22,
1179                          17,-10,-17, 22,
1180                          17,-22, 17,-10};
1181     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1182     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1183     // 1st loop
1184     __asm__ volatile (
1185
1186         "li         %[tmp0],    0x03                                    \n\t"
1187         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1188         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1189         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1190         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1191         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1192         "1:                                                             \n\t"
1193         /* ftmp8: dst3,dst2,dst1,dst0 */
1194         MMI_LDC1(%[ftmp1], %[src], 0x00)
1195         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1196         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1197         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1198         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1199         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1200         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1201         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1202         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1203         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1204         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1205         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1206         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1207         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1208         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1209         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1210         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1211         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1212         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1213
1214         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1215         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1216         "addiu      %[count],   %[count],   -0x01                       \n\t"
1217         "bnez       %[count],   1b                                      \n\t"
1218         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1219           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1220           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1221           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1222           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1223           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1224           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1225           [src]"+&r"(src),              [dst]"+&r"(dst)
1226         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1227         : "memory"
1228     );
1229
1230     src = block;
1231
1232     // 2nd loop
1233     __asm__ volatile (
1234         "li         %[tmp0],    0x07                                    \n\t"
1235         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1236         "li         %[tmp0],    0x44                                    \n\t"
1237         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1238
1239         MMI_LDC1(%[ftmp1], %[src], 0x00)
1240         MMI_LDC1(%[ftmp2], %[src], 0x10)
1241         MMI_LDC1(%[ftmp3], %[src], 0x20)
1242         MMI_LDC1(%[ftmp4], %[src], 0x30)
1243         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1244         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1245         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1246         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1247
1248         /* ftmp11: dst03,dst02,dst01,dst00 */
1249         "li         %[tmp0],    0x00160011                              \n\t"
1250         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1251         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1252         "li         %[tmp0],    0x000a0011                              \n\t"
1253         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1254         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1255         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1256         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1257         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1258         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1259         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1260         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1261         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1262         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1263         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1264         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1265         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1266         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1267         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1268
1269         /* ftmp12: dst13,dst12,dst11,dst10 */
1270         "li         %[tmp0],    0x000a0011                              \n\t"
1271         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1272         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1273         "li         %[tmp0],    0xffeaffef                              \n\t"
1274         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1275         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1276         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1277         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1278         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1279         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1280         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1281         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1282         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1283         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1284         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1285         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1286         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1287         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1288         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1289
1290         /* ftmp13: dst23,dst22,dst21,dst20 */
1291         "li         %[tmp0],    0xfff60011                              \n\t"
1292         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1293         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1294         "li         %[tmp0],    0x0016ffef                              \n\t"
1295         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1296         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1297         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1298         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1299         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1300         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1301         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1302         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1303         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1304         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1305         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1306         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1307         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1308         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1309         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1310
1311         /* ftmp14: dst33,dst32,dst31,dst30 */
1312         "li         %[tmp0],    0xffea0011                              \n\t"
1313         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1314         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1315         "li         %[tmp0],    0xfff60011                              \n\t"
1316         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1317         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1318         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1319         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1320         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1321         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1322         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1323         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1324         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1325         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1326         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1327         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1328         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1329         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1330         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1331
1332         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1333         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1334         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1335         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1336         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1337         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1338         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1339         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1340         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1341         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1342         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1343         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1344         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1345         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1346         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1347         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1348         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1349         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1350         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1351         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1352
1353         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1354         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1355         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1356         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1357         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1358         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1359         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1360
1361         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1362           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1363           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1364           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1365           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1366           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1367           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1368           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1369           [tmp0]"=&r"(tmp[0])
1370         : [ff_pw_64]"f"(ff_pw_64_local),
1371           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1372         :"memory"
1373     );
1374 }
1375
/**
 * Apply overlap transform to horizontal edge.
 *
 * Processes 8 lines. On every line the two pixels left of src and the two
 * pixels starting at src are smoothed in place. The rounding control starts
 * at 1 and alternates between 1 and 0 from line to line.
 *
 * @param src    first pixel right of the edge on the first line
 * @param stride distance between two consecutive lines
 */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    uint8_t *row = src;
    int line;

    for (line = 0; line < 8; line++, row += stride) {
        const int rnd   = !(line & 1);
        const int p0    = row[-2];
        const int p1    = row[-1];
        const int p2    = row[0];
        const int p3    = row[1];
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        row[-2] = p0 - outer;
        row[-1] = av_clip_uint8(p1 - inner);
        row[0]  = av_clip_uint8(p2 + inner);
        row[1]  = p3 + outer;
    }
}
1399
/**
 * Apply the overlap transform across the edge between two 16-bit blocks
 * that sit side by side.
 *
 * On each of 8 lines, elements 6 and 7 of the left line and elements 0 and 1
 * of the right line are filtered in place.
 *
 * @param left         first line of the left block
 * @param right        first line of the right block
 * @param left_stride  distance in elements between lines of left
 * @param right_stride distance in elements between lines of right
 * @param flags        bit 1 selects the initial rounding pair (3/4 when set,
 *                     4/3 otherwise); bit 0 makes the pair swap after every
 *                     line
 */
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        /* "* 8" instead of "<< 3": the inputs may be negative and
         * left-shifting a negative value is undefined behaviour */
        left[6]  = ((a * 8) - d1 + rnd1) >> 3;
        left[7]  = ((b * 8) - d2 + rnd2) >> 3;
        right[0] = ((c * 8) + d2 + rnd1) >> 3;
        right[1] = ((d * 8) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}
1428
/**
 * Apply overlap transform to vertical edge.
 *
 * Processes 8 columns. In every column the two pixels above src and the two
 * pixels starting at src are smoothed in place. The rounding control starts
 * at 1 and alternates between 1 and 0 from column to column.
 *
 * @param src    first pixel below the edge in the first column
 * @param stride distance between two vertically adjacent pixels
 */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int col;

    for (col = 0; col < 8; col++) {
        uint8_t *px     = src + col;
        const int rnd   = !(col & 1);
        const int p0    = px[-2 * stride];
        const int p1    = px[-stride];
        const int p2    = px[0];
        const int p3    = px[stride];
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        px[-2 * stride] = p0 - outer;
        px[-stride]     = av_clip_uint8(p1 - inner);
        px[0]           = av_clip_uint8(p2 + inner);
        px[stride]      = p3 + outer;
    }
}
1452
/**
 * Apply the overlap transform across the edge between two 16-bit blocks
 * that sit one above the other.
 *
 * For each of 8 columns, top[48], top[56], bottom[0] and bottom[8] (offsets
 * relative to the column) are filtered in place. The rounding pair starts as
 * 4/3 and is swapped after every column.
 *
 * @param top    upper block; elements 48..63 are modified
 * @param bottom lower block; elements 0..15 are modified
 */
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        /* "* 8" instead of "<< 3": the inputs may be negative and
         * left-shifting a negative value is undefined behaviour */
        top[48]   = ((a * 8) - d1 + rnd1) >> 3;
        top[56]   = ((b * 8) - d2 + rnd2) >> 3;
        bottom[0] = ((c * 8) + d2 + rnd1) >> 3;
        bottom[8] = ((d * 8) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}
1478
1479 /**
1480  * VC-1 in-loop deblocking filter for one line
1481  * @param src source block type
1482  * @param stride block stride
1483  * @param pq block quantizer
1484  * @return whether other 3 pairs should be filtered or not
1485  * @see 8.6
1486  */
1487 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1488 {
1489     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1490               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1491     int a0_sign = a0 >> 31;        /* Store sign */
1492
1493     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1494     if (a0 < pq) {
1495         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1496                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1497         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1498                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1499         if (a1 < a0 || a2 < a0) {
1500             int clip      = src[-1 * stride] - src[0 * stride];
1501             int clip_sign = clip >> 31;
1502
1503             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1504             if (clip) {
1505                 int a3     = FFMIN(a1, a2);
1506                 int d      = 5 * (a3 - a0);
1507                 int d_sign = (d >> 31);
1508
1509                 d       = ((d ^ d_sign) - d_sign) >> 3;
1510                 d_sign ^= a0_sign;
1511
1512                 if (d_sign ^ clip_sign)
1513                     d = 0;
1514                 else {
1515                     d = FFMIN(d, clip);
1516                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1517                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1518                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1519                 }
1520                 return 1;
1521             }
1522         }
1523     }
1524     return 0;
1525 }
1526
/**
 * VC-1 in-loop deblocking filter
 *
 * The edge is handled in groups of four lines. The third line of a group is
 * filtered first; the other three are filtered only if that call returns
 * non-zero.
 *
 * @param src source block type
 * @param step distance between horizontally adjacent elements
 * @param stride distance between vertically adjacent elements
 * @param len edge length to filter (4 or 8 pixels)
 * @param pq block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
                                   int len, int pq)
{
    uint8_t *group = src;
    int done;

    for (done = 0; done < len; done += 4, group += step * 4) {
        if (!vc1_filter_line(group + 2 * step, stride, pq))
            continue;

        vc1_filter_line(group,            stride, pq);
        vc1_filter_line(group + step,     stride, pq);
        vc1_filter_line(group + 3 * step, stride, pq);
    }
}
1552
/* Loop filter over 4 positions spaced 1 apart, filtering along stride. */
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 4;

    vc1_loop_filter(src, step, stride, len, pq);
}
1557
/* Loop filter over 4 positions spaced stride apart, filtering along
 * adjacent bytes. */
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int step = stride;
    const int len  = 4;

    vc1_loop_filter(src, step, 1, len, pq);
}
1562
/* Loop filter over 8 positions spaced 1 apart, filtering along stride. */
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 8;

    vc1_loop_filter(src, step, stride, len, pq);
}
1567
/* Loop filter over 8 positions spaced stride apart, filtering along
 * adjacent bytes. */
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int step = stride;
    const int len  = 8;

    vc1_loop_filter(src, step, 1, len, pq);
}
1572
/* Loop filter over 16 positions spaced 1 apart, filtering along stride. */
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 16;

    vc1_loop_filter(src, step, stride, len, pq);
}
1577
/* Loop filter over 16 positions spaced stride apart, filtering along
 * adjacent bytes. */
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int step = stride;
    const int len  = 16;

    vc1_loop_filter(src, step, 1, len, pq);
}
1582
/* mc00 put: forwards to ff_put_pixels8_8_mmi with 8 as its last argument;
 * rnd is unused. */
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int h = 8;

    ff_put_pixels8_8_mmi(dst, src, stride, h);
}
/* mc00 put: forwards to ff_put_pixels16_8_mmi with 16 as its last argument;
 * rnd is unused. */
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int h = 16;

    ff_put_pixels16_8_mmi(dst, src, stride, h);
}
/* mc00 avg: forwards to ff_avg_pixels8_8_mmi with 8 as its last argument;
 * rnd is unused. */
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    const int h = 8;

    ff_avg_pixels8_8_mmi(dst, src, stride, h);
}
/* mc00 avg: forwards to ff_avg_pixels16_8_mmi with 16 as its last argument;
 * rnd is unused. */
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    const int h = 16;

    ff_avg_pixels16_8_mmi(dst, src, stride, h);
}
1603
/**
 * Store operators used by the TRANSFER_* macros below.
 * OP_PUT expands to nothing, so the result is stored unchanged.
 * OP_AVG loads 8 bytes from S into $f16 and replaces D with the pavgb
 * (byte-wise average) of D and $f16.
 */
#define OP_PUT(S, D)
#define OP_AVG(S, D)                                                        \
    "ldc1       $f16,   "#S"                        \n\t"                   \
    "pavgb      "#D",   "#D",   $f16                \n\t"

/** Add rounder from $f14 to $f6 and $f8, then shift both right by SHIFT
 *  (psrah). Packing and storing is left to the TRANSFER_* macros. */
#define NORMALIZE_MMI(SHIFT)                                                \
    "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
    "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
    "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
    "psrah      $f8,    $f8,    "SHIFT"             \n\t"

/** Pack the halfwords of $f6/$f8 to bytes with unsigned saturation, apply
 *  OP against the 8 bytes at %[dst] and store the 8 result bytes there. */
#define TRANSFER_DO_PACK(OP)                                                \
    "packushb   $f6,    $f6,    $f8                 \n\t"                   \
    OP((%[dst]), $f6)                                                       \
    "sdc1       $f6,    0x00(%[dst])                \n\t"

/** Apply OP to $f6 and $f8 against %[dst]+0 and %[dst]+8 and store both
 *  registers unpacked, 16 bytes in total. */
#define TRANSFER_DONT_PACK(OP)                                              \
     OP(0(%[dst]), $f6)                                                     \
     OP(8(%[dst]), $f8)                                                     \
     "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
     "sdc1      $f8,    0x08(%[dst])                \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
/* DO_UNPACK interleaves the low bytes of reg with $f0; this widens bytes to
 * halfwords provided $f0 is zero, which the macro itself does not ensure. */
#define DO_UNPACK(reg)                                                      \
    "punpcklbh  "reg",  "reg",  $f0                 \n\t"
#define DONT_UNPACK(reg)

/** Load the 32-bit rounder from ROUND and replicate its low halfword into
 *  all four halfwords of $f14. The rounder value itself (32-r or 8-r) is
 *  computed by the caller. */
#define LOAD_ROUNDER_MMI(ROUND)                                             \
    "lwc1       $f14,   "ROUND"                     \n\t"                   \
    "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
    "punpcklwd  $f14,   $f14,   $f14                \n\t"


/**
 * One output line of the vertical shift2 filter:
 *   R1 = ((R1 + R2) * $f6 - R0 - R3 + $f14) >> %[shift]
 * R0 is loaded from %[src] + %[stride1] and R3 from %[src] + %[stride];
 * both are interleaved with $f0 (punpcklbh). R1 is stored at %[dst] + OFF
 * and %[src] is then advanced by %[stride]. $9 is used as scratch address
 * register.
 * NOTE(review): the multiplier is taken from $f6, while the caller visible
 * below loads ff_pw_9 into $f12 and uses $f6 for pixel data - confirm which
 * register is meant to hold the multiplier.
 */
#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
    "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
    PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
    MMI_ULWC1(R0, $9, 0x00)                                                 \
    "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
    "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
    PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
    MMI_ULWC1(R3, $9, 0x00)                                                 \
    "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
    "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
    "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
    "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
    "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
    MMI_SDC1(R1, %[dst], OFF)                                               \
    PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
1654
1655 /**
      * Vertical 1/2 shift pass writing 16-bit intermediate rows.
      *
      * The outer loop runs 3 times (counter in $8). Each pass handles one group
      * of source columns and emits 8 rows through SHIFT2_LINE, 24 bytes apart in
      * dst. A pass advances src by 9 strides in total, so it is then rewound by
      * 9*stride-4 (back to the first row, 4 bytes to the right) while dst moves
      * forward by 8 bytes.
      *
      * Sacrificing $f12 makes it possible to pipeline loads from src
      *
      * @param dst     Output buffer of 16-bit values.
      * @param src     Source pixels; rows starting at src - stride are read.
      * @param stride  Distance in bytes between source rows.
      * @param rnd     Rounder, broadcast to $f14 by LOAD_ROUNDER_MMI.
      * @param shift   Right shift applied to every result; handed to the asm
      *                through an "f" (floating-point register) operand.
      */
1656 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1657                                        const uint8_t *src, mips_reg stride,
1658                                        int rnd, int64_t shift)
1659 {
1660     DECLARE_VAR_LOW32;
1661     DECLARE_VAR_ADDRT;
1662
1663     __asm__ volatile(
1664         "xor        $f0,    $f0,    $f0             \n\t" /* $f0 = 0 */
1665         "li         $8,     0x03                    \n\t" /* 3 column groups */
1666         LOAD_ROUNDER_MMI("%[rnd]")
1667         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1668         "1:                                         \n\t"
1669         MMI_ULWC1($f4, %[src], 0x00)
1670         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1671         MMI_ULWC1($f6, %[src], 0x00)
1672         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1673         "punpcklbh  $f6,    $f6,    $f0             \n\t"
             /* Eight rows; the four data registers rotate by one each row */
1674         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1675         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1676         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1677         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1678         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1679         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1680         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1681         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1682         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t" /* rewind rows, +4 bytes */
1683         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1684         "addiu      $8,     $8,    -0x01            \n\t"
1685         "bnez       $8,     1b                      \n\t"
1686         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1687           [src]"+r"(src),               [dst]"+r"(dst)
1688         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1689           [shift]"f"(shift),            [rnd]"m"(rnd),
1690           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1691         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1692           "$f14", "$f16", "memory"
1693     );
1694 }
1695
1696 /**
1697  * Data is already unpacked, so some operations can directly be made from
1698  * memory.
1699  */
1700 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1701 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1702                                              const int16_t *src, int rnd)   \
1703 {                                                                           \
1704     int h = 8;                                                              \
1705     DECLARE_VAR_ALL64;                                                      \
1706     DECLARE_VAR_ADDRT;                                                      \
1707                                                                             \
1708     src -= 1;                                                               \
1709     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1710                                                                             \
1711     __asm__ volatile(                                                       \
1712         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1713         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1714         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1715         "1:                                         \n\t"                   \
1716         MMI_ULDC1($f2, %[src], 0x00)                                        \
1717         MMI_ULDC1($f4, %[src], 0x08)                                        \
1718         MMI_ULDC1($f6, %[src], 0x02)                                        \
1719         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1720         MMI_ULDC1($f0, %[src], 0x06)                                        \
1721         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1722         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1723         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1724         MMI_ULDC1($f0, %[src], 0x04)                                        \
1725         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1726         MMI_ULDC1($f0, %[src], 0x0b)                                        \
1727         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1728         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1729         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1730         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1731         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1732         "li         $8,     0x07                    \n\t"                   \
1733         "mtc1       $8,     $f16                    \n\t"                   \
1734         NORMALIZE_MMI("$f16")                                               \
1735         /* Remove bias */                                                   \
1736         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1737         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1738         TRANSFER_DO_PACK(OP)                                                \
1739         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1740         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1741         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1742         "bnez       %[h],   1b                      \n\t"                   \
1743         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1744           [h]"+r"(h),                                                       \
1745           [src]"+r"(src),               [dst]"+r"(dst)                      \
1746         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1747           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1748         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1749           "$f16", "memory"                                                  \
1750     );                                                                      \
1751 }
1752
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755
1756 /**
1757  * Purely vertical or horizontal 1/2 shift interpolation.
1758  * Sacrifice $f12 for *9 factor.
      *
      * Produces 8 rows of 8 bytes. With p(k) the 8 source bytes at
      * src + k*offset, each row computes
      *     (9 * (p(0) + p(1)) - p(-1) - p(2) + rnd) >> 4
      * packs the result to bytes, applies OP and stores it at dst. rnd is
      * replaced by 8 - rnd before the loop. Inside the loop src is first moved
      * forward by offset and finally by stride - offset, i.e. by one stride per
      * row in total.
      *
      * @param OP      OP_PUT or OP_AVG (defined outside this chunk).
      * @param OPNAME  Function name prefix, put_ or avg_.
1759  */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1762                                      mips_reg stride, int rnd,              \
1763                                      mips_reg offset)                       \
1764 {                                                                           \
1765     DECLARE_VAR_LOW32;                                                      \
1766     DECLARE_VAR_ADDRT;                                                      \
1767                                                                             \
1768     rnd = 8 - rnd;                                                          \
1769                                                                             \
1770     __asm__ volatile(                                                       \
1771         "xor        $f0,    $f0,    $f0             \n\t"                   \
1772         "li         $10,    0x08                    \n\t"                   \
1773         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1774         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1775         "1:                                         \n\t"                   \
1776         MMI_ULWC1($f6, %[src], 0x00)                                        \
1777         MMI_ULWC1($f8, %[src], 0x04)                                        \
1778         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1779         MMI_ULWC1($f2, $9, 0x00)                                            \
1780         MMI_ULWC1($f4, $9, 0x04)                                            \
1781         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1782         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1783         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1784         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1785         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1786         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1787         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1788         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1789         MMI_ULWC1($f2, $9, 0x00)                                            \
1790         MMI_ULWC1($f4, $9, 0x04)                                            \
1791         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1792         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1793         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1794         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1795         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1796         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1797         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1798         MMI_ULWC1($f2, $9, 0x00)                                            \
1799         MMI_ULWC1($f4, $9, 0x04)                                            \
1800         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1801         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1802         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1803         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1804         "li         $8,     0x04                    \n\t"                   \
1805         "mtc1       $8,     $f16                    \n\t"                   \
1806         NORMALIZE_MMI("$f16")                                               \
1807         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1808         OP((%[dst]), $f6)                                                   \
1809         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1810         "addiu      $10,    $10,   -0x01            \n\t"                   \
1811         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1812         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1813         "bnez       $10,    1b                      \n\t"                   \
1814         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1815           [src]"+r"(src),               [dst]"+r"(dst)                      \
1816         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1817           [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1818           [stride1]"r"(stride-offset),                                      \
1819           [ff_pw_9]"m"(ff_pw_9)                                             \
1820         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1821           "$f12", "$f14", "$f16", "memory"                                  \
1822     );                                                                      \
1823 }
1824
1825 VC1_SHIFT2(OP_PUT, put_)
1826 VC1_SHIFT2(OP_AVG, avg_)
1827
1828 /**
1829  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1830  *
      * Leaves in $f6 (first group) and $f8 (second group) the sum
      *     -3 * tap(A1) + 18 * tap(A2) + 53 * tap(A3) - 4 * tap(A4)
      * where tap(Ax) is the data loaded from src + Ax. Rounding, shifting and
      * storing are left to the caller.
      *
      * Expects $f10 = ff_pw_53, $f12 = ff_pw_18 and the operand %[ff_pw_3] in a
      * register; $f0 must be zero when UNPACK is DO_UNPACK. Uses $8, $9, $f2,
      * $f4 and $f16 as temporaries.
      *
1831  * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
1832  * @param LOAD    "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
1833  * @param M       "1" for MMI_ULWC1, "2" for MMI_ULDC1.
      *                Scales the offset of the second load (M*4 bytes).
1834  * @param A1      Stride address of 1st tap (beware of unpacked/packed).
1835  * @param A2      Stride address of 2nd tap
1836  * @param A3      Stride address of 3rd tap
1837  * @param A4      Stride address of 4th tap
1838  */
1839 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1840     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1841     LOAD($f2, $9, M*0)                                                      \
1842     LOAD($f4, $9, M*4)                                                      \
1843     UNPACK("$f2")                                                           \
1844     UNPACK("$f4")                                                           \
1845     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1846     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1847     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1848     LOAD($f6, $9, M*0)                                                      \
1849     LOAD($f8, $9, M*4)                                                      \
1850     UNPACK("$f6")                                                           \
1851     UNPACK("$f8")                                                           \
1852     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1853     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1854     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1855     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1856     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1857     LOAD($f2, $9, M*0)                                                      \
1858     LOAD($f4, $9, M*4)                                                      \
1859     UNPACK("$f2")                                                           \
1860     UNPACK("$f4")                                                           \
1861     "li         $8,     0x02                    \n\t"                       \
1862     "mtc1       $8,     $f16                    \n\t"                       \
1863     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1864     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1865     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1866     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1867     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1868     LOAD($f2, $9, M*0)                                                      \
1869     LOAD($f4, $9, M*4)                                                      \
1870     UNPACK("$f2")                                                           \
1871     UNPACK("$f4")                                                           \
1872     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1873     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1874     "paddh      $f6,    $f6,    $f2             \n\t" /* -4,53,18,-3 */     \
1875     "paddh      $f8,    $f8,    $f4             \n\t" /* -4,53,18,-3 */
1876
1877 /**
1878  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1879  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1880  * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride).
1881  *
1882  * @param  NAME   Either 1 or 3
1883  * @see MSPEL_FILTER13_CORE for information on A1->A4
1884  */
1885 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1886 static void                                                                 \
1887 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1888                                  mips_reg src_stride,                       \
1889                                  int rnd, int64_t shift)                    \
1890 {                                                                           \
1891     int h = 8;                                                              \
1892     DECLARE_VAR_LOW32;                                                      \
1893     DECLARE_VAR_ADDRT;                                                      \
1894                                                                             \
1895     src -= src_stride;                                                      \
1896                                                                             \
1897     __asm__ volatile(                                                       \
1898         "xor        $f0,    $f0,    $f0             \n\t"                   \
1899         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1900         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1901         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1902         ".p2align 3                                 \n\t"                   \
1903         "1:                                         \n\t"                   \
1904         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1905         NORMALIZE_MMI("%[shift]")                                           \
1906         TRANSFER_DONT_PACK(OP_PUT)                                          \
1907         /* Last 3 (in fact 4) bytes on the line */                          \
1908         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1909         MMI_ULWC1($f2, $9, 0x08)                                            \
1910         DO_UNPACK("$f2")                                                    \
1911         "mov.d      $f6,    $f2                     \n\t"                   \
1912         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1913         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1914         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1915         MMI_ULWC1($f6, $9, 0x08)                                            \
1916         DO_UNPACK("$f6")                                                    \
1917         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1918         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1919         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1920         MMI_ULWC1($f2, $9, 0x08)                                            \
1921         DO_UNPACK("$f2")                                                    \
1922         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1923         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1924         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1925         MMI_ULWC1($f2, $9, 0x08)                                            \
1926         DO_UNPACK("$f2")                                                    \
1927         "li         $8,     0x02                    \n\t"                   \
1928         "mtc1       $8,     $f16                    \n\t"                   \
1929         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1930         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1931         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1932         "li         $8,     0x06                    \n\t"                   \
1933         "mtc1       $8,     $f16                    \n\t"                   \
1934         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1935         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1936         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1937         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1938         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1939         "bnez       %[h],   1b                      \n\t"                   \
1940         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1941           [h]"+r"(h),                                                       \
1942           [src]"+r"(src),               [dst]"+r"(dst)                      \
1943         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1944           [stride_x3]"r"(3*src_stride),                                     \
1945           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1946           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1947           [ff_pw_3]"f"(ff_pw_3)                                             \
1948         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1949           "$f14", "$f16", "memory"                                          \
1950     );                                                                      \
1951 }
1952
1953 /**
1954  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1955  * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
      * The instantiations in this file pass the literal byte offsets 0, 2, 4
      * and 6.
      *
      * The generated function reads 16-bit input (rows 0x18 bytes apart) and
      * writes 8 rows of 8 bytes. src is moved back by one element and rnd is
      * pre-biased by -(64*256) before the loop. After the >>7 normalisation that
      * bias equals -128, which is why %[ff_pw_128] is added back before packing.
1956  *
1957  * @param  NAME   Either 1 or 3
      * @param  OP     OP_PUT or OP_AVG (defined outside this chunk).
      * @param  OPNAME Function name prefix, put_ or avg_.
1958  * @see MSPEL_FILTER13_CORE for information on A1->A4
1959  */
1960 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1961 static void                                                                 \
1962 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1963                                        const int16_t *src, int rnd)         \
1964 {                                                                           \
1965     int h = 8;                                                              \
1966     DECLARE_VAR_ALL64;                                                      \
1967     DECLARE_VAR_ADDRT;                                                      \
1968                                                                             \
1969     src -= 1;                                                               \
1970     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
1971                                                                             \
1972     __asm__ volatile(                                                       \
1973         "xor        $f0,    $f0,    $f0             \n\t"                   \
1974         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1975         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1976         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1977         ".p2align 3                                 \n\t"                   \
1978         "1:                                         \n\t"                   \
1979         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1980         "li         $8,     0x07                    \n\t"                   \
1981         "mtc1       $8,     $f16                    \n\t"                   \
1982         NORMALIZE_MMI("$f16")                                               \
1983         /* Remove bias */                                                   \
1984         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1985         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1986         TRANSFER_DO_PACK(OP)                                                \
1987         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1988         PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
1989         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1990         "bnez       %[h],   1b                      \n\t"                   \
1991         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1992           [h]"+r"(h),                                                       \
1993           [src]"+r"(src),               [dst]"+r"(dst)                      \
1994         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1995           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1996           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
1997         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1998           "$f14", "$f16", "memory"                                          \
1999     );                                                                      \
2000 }
2001
2002 /**
2003  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2004  * Here, offset=src_stride. Parameters passed A1 to A4 must use
2005  * %[offset_x1] (offset), %[offset_x2] (2*offset), %[offset_x3] (3*offset)
      * or $0.
      *
      * The generated function writes 8 rows of 8 bytes. src is moved back by one
      * offset and rnd is replaced by 32-rnd before the loop; each row is
      * normalised with a shift of 6, packed to bytes and passed through OP. src
      * and dst both advance by stride per row.
2006  *
2007  * @param  NAME   Either 1 or 3
      * @param  OP     OP_PUT or OP_AVG (defined outside this chunk).
      * @param  OPNAME Function name prefix, put_ or avg_.
2008  * @see MSPEL_FILTER13_CORE for information on A1->A4
2009  */
2010 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2011 static void                                                                 \
2012 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2013                               mips_reg stride, int rnd, mips_reg offset)    \
2014 {                                                                           \
2015     int h = 8;                                                              \
2016     DECLARE_VAR_LOW32;                                                      \
2017     DECLARE_VAR_ADDRT;                                                      \
2018                                                                             \
2019     src -= offset;                                                          \
2020     rnd = 32-rnd;                                                           \
2021                                                                             \
2022     __asm__ volatile (                                                      \
2023         "xor        $f0,    $f0,    $f0             \n\t"                   \
2024         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2025         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2026         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2027         ".p2align 3                                 \n\t"                   \
2028         "1:                                         \n\t"                   \
2029         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2030         "li         $8,     0x06                    \n\t"                   \
2031         "mtc1       $8,     $f16                    \n\t"                   \
2032         NORMALIZE_MMI("$f16")                                               \
2033         TRANSFER_DO_PACK(OP)                                                \
2034         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2035         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2036         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2037         "bnez       %[h],   1b                      \n\t"                   \
2038         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2039           [h]"+r"(h),                                                       \
2040           [src]"+r"(src),               [dst]"+r"(dst)                      \
2041         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2042           [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2043           [rnd]"m"(rnd),                                                    \
2044           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2045           [ff_pw_3]"f"(ff_pw_3)                                             \
2046         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2047           "$f14", "$f16", "memory"                                          \
2048     );                                                                      \
2049 }
2050
2051
2052 /**
      * 1/4 shift bicubic interpolation
      *
      * A1..A4 are given in decreasing position order, so the weights applied to
      * positions 0, 1, 2, 3 (in units of offset, stride or 16-bit element) are
      * -4, 53, 18, -3.
      */
2053 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2054 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2055 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2056 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2057 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2058
2059 /**
      * 3/4 shift bicubic interpolation
      *
      * A1..A4 are given in increasing position order, so the weights applied to
      * positions 0, 1, 2, 3 are -3, 18, 53, -4 (the mirror of the 1/4 case).
      */
2060 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2061 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2062 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2063 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2064 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2065
     /**
      * Function pointer types used by the dispatch tables in VC1_MSPEL_MC.
      *
      * - vc1_mspel_mc_filter_ver_16bits: vertical pass writing 16-bit
      *   intermediates (the vc1_put_ver_16b_*_mmi functions).
      * - vc1_mspel_mc_filter_hor_16bits: horizontal pass reading 16-bit
      *   intermediates and writing bytes (the *vc1_hor_16b_*_mmi functions).
      * - vc1_mspel_mc_filter_8bits: single-direction pass working on bytes
      *   (the *vc1_shift*_mmi functions); offset is the distance between taps.
      */
2066 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2067              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2068               int64_t shift);
2069 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2070              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2071 typedef void (*vc1_mspel_mc_filter_8bits)
2072              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2073               mips_reg offset);
2074
2075 /**
2076  * Interpolate fractional pel values by applying proper vertical then
2077  * horizontal filter.
2078  *
2079  * @param  dst     Destination buffer for interpolated pels.
2080  * @param  src     Source buffer.
2081  * @param  stride  Stride for both src and dst buffers.
2082  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
2083  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
2084  * @param  rnd     Rounding bias.
2085  */
2086 #define VC1_MSPEL_MC(OP)                                                    \
2087 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2088                                int hmode, int vmode, int rnd)               \
2089 {                                                                           \
2090     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2091          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
2092                  vc1_put_ver_16b_shift2_mmi,                                \
2093                  vc1_put_ver_16b_shift3_mmi };                              \
2094     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2095          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
2096                  OP ## vc1_hor_16b_shift2_mmi,                              \
2097                  OP ## vc1_hor_16b_shift3_mmi };                            \
2098     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
2099          { NULL, OP ## vc1_shift1_mmi,                                      \
2100                  OP ## vc1_shift2_mmi,                                      \
2101                  OP ## vc1_shift3_mmi };                                    \
2102                                                                             \
2103     if (vmode) { /* Vertical filter to apply */                             \
2104         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
2105             static const int shift_value[] = { 0, 5, 1, 5 };                \
2106             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
2107             int    r;                                                       \
2108             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
2109                                                                             \
2110             r = (1<<(shift-1)) + rnd-1;                                     \
2111             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
2112                                                                             \
2113             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
2114             return;                                                         \
2115         }                                                                   \
2116         else { /* No horizontal filter, output 8 lines to dst */            \
2117             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
2118             return;                                                         \
2119         }                                                                   \
2120     }                                                                       \
2121                                                                             \
2122     /* Horizontal mode with no vertical mode */                             \
2123     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
2124 }                                                                           \
2125 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
2126                                   int stride, int hmode, int vmode, int rnd)\
2127 {                                                                           \
2128     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2129     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2130     dst += 8*stride; src += 8*stride;                                       \
2131     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2132     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2133 }
2134
/*
 * Instantiate the static helpers put_vc1_mspel_mc / put_vc1_mspel_mc_16 and
 * avg_vc1_mspel_mc / avg_vc1_mspel_mc_16 that DECLARE_FUNCTION wraps.
 */
2135 VC1_MSPEL_MC(put_)
2136 VC1_MSPEL_MC(avg_)
2137
2138 /**
 * Macro to ease bicubic filter interpolation functions declarations.
 *
 * For a given (a, b) it defines four public entry points:
 *   ff_put_vc1_mspel_mc<a><b>_mmi,    ff_avg_vc1_mspel_mc<a><b>_mmi,
 *   ff_put_vc1_mspel_mc<a><b>_16_mmi, ff_avg_vc1_mspel_mc<a><b>_16_mmi
 * Each one forwards (dst, src, stride, rnd) to the matching static
 * put_/avg_ vc1_mspel_mc[_16] helper, passing a as hmode and b as vmode.
 *
 * NOTE(review): the wrappers take stride as ptrdiff_t while the helpers
 * generated by VC1_MSPEL_MC declare it as int, so the value is narrowed at
 * the call -- confirm that strides always fit in an int.
 */
2139 #define DECLARE_FUNCTION(a, b)                                              \
2140 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2141                                            const uint8_t *src,              \
2142                                            ptrdiff_t stride,                \
2143                                            int rnd)                         \
2144 {                                                                           \
2145      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2146 }                                                                           \
2147 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2148                                            const uint8_t *src,              \
2149                                            ptrdiff_t stride,                \
2150                                            int rnd)                         \
2151 {                                                                           \
2152      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2153 }                                                                           \
2154 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2155                                               const uint8_t *src,           \
2156                                               ptrdiff_t stride,             \
2157                                               int rnd)                      \
2158 {                                                                           \
2159      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2160 }                                                                           \
2161 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2162                                               const uint8_t *src,           \
2163                                               ptrdiff_t stride,             \
2164                                               int rnd)                      \
2165 {                                                                           \
2166      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2167 }
2168
/*
 * Public entry points for every (hmode, vmode) pair in 0..3 x 0..3 except
 * (0, 0). That pair is deliberately absent here: the dispatch tables in
 * VC1_MSPEL_MC hold NULL in slot 0, so it could not be served by them.
 */
2169 DECLARE_FUNCTION(0, 1)
2170 DECLARE_FUNCTION(0, 2)
2171 DECLARE_FUNCTION(0, 3)
2172
2173 DECLARE_FUNCTION(1, 0)
2174 DECLARE_FUNCTION(1, 1)
2175 DECLARE_FUNCTION(1, 2)
2176 DECLARE_FUNCTION(1, 3)
2177
2178 DECLARE_FUNCTION(2, 0)
2179 DECLARE_FUNCTION(2, 1)
2180 DECLARE_FUNCTION(2, 2)
2181 DECLARE_FUNCTION(2, 3)
2182
2183 DECLARE_FUNCTION(3, 0)
2184 DECLARE_FUNCTION(3, 1)
2185 DECLARE_FUNCTION(3, 2)
2186 DECLARE_FUNCTION(3, 3)
2187
/*
 * Arithmetic core shared by the two *_chroma_mc8_mmi loops in this file.
 * It is a sequence of asm string literals meant to be pasted inside an
 * __asm__ statement that provides these named operands:
 *
 *   ftmp1..ftmp4  the four source vectors, loaded by the caller beforehand
 *   ftmp0         second operand of every punpck*bh; the callers zero it
 *   A, B, C, D    multipliers applied to ftmp1, ftmp2, ftmp3, ftmp4
 *   ff_pw_28      constant added to both sums before the shift
 *   ftmp9         shift count for psrlh; the callers load 6 into it
 *   ftmp5..ftmp8  scratch, receive the punpckhbh (high) halves
 *
 * Steps, in order: unpack each source vector into a low and a high half,
 * multiply the halves by A/B/C/D, sum the four products per half, add
 * ff_pw_28, shift right by ftmp9 and pack both halves back into ftmp1.
 *
 * NOTE(review): this reading assumes the MMI mnemonics behave like their
 * MMX namesakes (punpck*bh interleaving bytes with ftmp0, i.e. widening to
 * 16-bit lanes; packushb packing with unsigned saturation) -- confirm
 * against the Loongson MMI documentation.
 */
2188 #define CHROMA_MC_8_MMI                                                     \
2189         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
2190         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2191         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
2192         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2193         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
2194         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2195         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
2196         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2197                                                                             \
2198         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2199         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
2200         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2201         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
2202         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2203         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
2204         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2205         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
2206                                                                             \
2207         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2208         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2209         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2210         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2211                                                                             \
2212         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
2213         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
2214         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
2215         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
2216                                                                             \
2217         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
2218         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
2219         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2220
2221
/*
 * Arithmetic core shared by the two *_chroma_mc4_mmi loops in this file;
 * the narrow counterpart of CHROMA_MC_8_MMI. It is a sequence of asm string
 * literals meant to be pasted inside an __asm__ statement that provides:
 *
 *   ftmp1..ftmp4  the four source vectors, loaded by the caller beforehand
 *   ftmp0         second operand of every punpcklbh and of the final
 *                 packushb; the callers zero it
 *   A, B, C, D    multipliers applied to ftmp1, ftmp2, ftmp3, ftmp4
 *   ff_pw_28      constant added to the sum before the shift
 *   ftmp5         shift count for psrlh; the callers load 6 into it
 *
 * Only the low halves are unpacked (no punpckhbh), so no extra scratch
 * registers are needed. The result is left in ftmp1.
 *
 * NOTE(review): as with CHROMA_MC_8_MMI, this reading assumes MMX-like
 * semantics for the MMI mnemonics -- confirm against the Loongson manual.
 */
2222 #define CHROMA_MC_4_MMI                                                     \
2223         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2224         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2225         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2226         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2227                                                                             \
2228         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2229         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2230         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2231         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2232                                                                             \
2233         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2234         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2235         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2236         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2237                                                                             \
2238         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
2239         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2240
2241
/**
 * Bilinear chroma interpolation, "put" flavour, for the 8-wide case
 * (width per the function name; the loads/stores go through the
 * MMI_ULDC1 / MMI_SDC1 helpers from mmiutils.h).
 *
 * Each loop iteration combines four source vectors -- src, src + 1 and the
 * same two positions one stride further down -- using the weights
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy   (A+B+C+D == 64),
 * adds ff_pw_28, shifts right by 6 and stores the result at dst.
 *
 * @param dst     destination, advanced by stride after every row
 * @param src     source, advanced by stride once per row
 * @param stride  byte stride shared by src and dst
 * @param h       number of rows; must be >= 1 because the counter is only
 *                tested after the first row has been processed
 * @param x       horizontal fraction, asserted to be in 0..7
 * @param y       vertical fraction, asserted to be in 0..7
 *
 * NOTE(review): the pshufh instructions below write to %[A]..%[D], yet
 * those operands are listed as input-only. GCC's extended-asm rules forbid
 * modifying input-only operands; they should probably be read-write ("+f")
 * or copied into scratch registers first.
 */
2242 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2243                                       uint8_t *src /* align 1 */,
2244                                       ptrdiff_t stride, int h, int x, int y)
2245 {
2246     const int A = (8 - x) * (8 - y);
2247     const int B =     (x) * (8 - y);
2248     const int C = (8 - x) *     (y);
2249     const int D =     (x) *     (y);
2250     double ftmp[10];
2251     uint32_t tmp[1];
2252     DECLARE_VAR_ALL64;
2253     DECLARE_VAR_ADDRT;
2254
2255     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2256
2257     __asm__ volatile(
        /* ftmp0 = 0; ftmp9 = 6, the shift count consumed by CHROMA_MC_8_MMI */
2258         "li         %[tmp0],    0x06                                    \n\t"
2259         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2260         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
        /* presumably replicates each weight across all 16-bit lanes
         * (selector ftmp0 is zero) -- see NOTE(review) above about writing
         * to input operands */
2261         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2262         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2263         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2264         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2265
2266         "1:                                                             \n\t"
        /* current row at src and src + 1 ... */
2267         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2268         MMI_ULDC1(%[ftmp2], %[src], 0x01)
        /* ... then the next row; src stays advanced for the next iteration */
2269         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2270         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2271         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2272
2273         CHROMA_MC_8_MMI
2274
        /* store the row, then count down and step dst to the next row */
2275         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2276         "addiu      %[h],       %[h],      -0x01                        \n\t"
2277         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2278         "bnez       %[h],       1b                                      \n\t"
2279         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2280           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2281           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2282           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2283           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2284           RESTRICT_ASM_ALL64
2285           RESTRICT_ASM_ADDRT
2286           [tmp0]"=&r"(tmp[0]),
2287           [src]"+&r"(src),              [dst]"+&r"(dst),
2288           [h]"+&r"(h)
2289         : [stride]"r"((mips_reg)stride),
2290           [A]"f"(A),                    [B]"f"(B),
2291           [C]"f"(C),                    [D]"f"(D),
2292           [ff_pw_28]"f"(ff_pw_28)
2293         : "memory"
2294     );
2295 }
2296
/**
 * Bilinear chroma interpolation, "put" flavour, for the 4-wide case
 * (width per the function name; the loads/stores go through the
 * MMI_ULWC1 / MMI_SWC1 helpers from mmiutils.h).
 *
 * Each loop iteration combines four source vectors -- src, src + 1 and the
 * same two positions one stride further down -- using the weights
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy   (A+B+C+D == 64),
 * adds ff_pw_28, shifts right by 6 and stores the result at dst.
 *
 * @param dst     destination, advanced by stride after every row
 * @param src     source, advanced by stride once per row
 * @param stride  byte stride shared by src and dst
 * @param h       number of rows; must be >= 1 because the counter is only
 *                tested after the first row has been processed
 * @param x       horizontal fraction, asserted to be in 0..7
 * @param y       vertical fraction, asserted to be in 0..7
 *
 * NOTE(review): the pshufh instructions below write to %[A]..%[D], yet
 * those operands are listed as input-only. GCC's extended-asm rules forbid
 * modifying input-only operands; they should probably be read-write ("+f")
 * or copied into scratch registers first.
 */
2297 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2298                                       uint8_t *src /* align 1 */,
2299                                       ptrdiff_t stride, int h, int x, int y)
2300 {
2301     const int A = (8 - x) * (8 - y);
2302     const int B =     (x) * (8 - y);
2303     const int C = (8 - x) *     (y);
2304     const int D =     (x) *     (y);
2305     double ftmp[6];
2306     uint32_t tmp[1];
2307     DECLARE_VAR_LOW32;
2308     DECLARE_VAR_ADDRT;
2309
2310     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2311
2312     __asm__ volatile(
        /* ftmp0 = 0; ftmp5 = 6, the shift count consumed by CHROMA_MC_4_MMI */
2313         "li         %[tmp0],    0x06                                    \n\t"
2314         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2315         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
        /* presumably replicates each weight across all 16-bit lanes
         * (selector ftmp0 is zero) -- see NOTE(review) above about writing
         * to input operands */
2316         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2317         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2318         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2319         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2320
2321         "1:                                                             \n\t"
        /* current row at src and src + 1 ... */
2322         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2323         MMI_ULWC1(%[ftmp2], %[src], 0x01)
        /* ... then the next row; src stays advanced for the next iteration */
2324         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2325         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2326         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2327
2328         CHROMA_MC_4_MMI
2329
        /* store the row, then count down and step dst to the next row */
2330         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2331         "addiu      %[h],       %[h],      -0x01                        \n\t"
2332         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2333         "bnez       %[h],       1b                                      \n\t"
2334         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2335           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2336           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2337           [tmp0]"=&r"(tmp[0]),
2338           RESTRICT_ASM_LOW32
2339           RESTRICT_ASM_ADDRT
2340           [src]"+&r"(src),              [dst]"+&r"(dst),
2341           [h]"+&r"(h)
2342         : [stride]"r"((mips_reg)stride),
2343           [A]"f"(A),                    [B]"f"(B),
2344           [C]"f"(C),                    [D]"f"(D),
2345           [ff_pw_28]"f"(ff_pw_28)
2346         : "memory"
2347     );
2348 }
2349
/**
 * Bilinear chroma interpolation, "avg" flavour, for the 8-wide case
 * (width per the function name; the loads/stores go through the
 * MMI_ULDC1 / MMI_LDC1 / MMI_SDC1 helpers from mmiutils.h).
 *
 * Same computation as ff_put_no_rnd_vc1_chroma_mc8_mmi -- four source
 * vectors weighted with
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy   (A+B+C+D == 64),
 * plus ff_pw_28, shifted right by 6 -- but the result is combined with the
 * bytes already at dst through pavgb before being stored back.
 *
 * @param dst     destination, read and rewritten; advanced by stride per row
 * @param src     source, advanced by stride once per row
 * @param stride  byte stride shared by src and dst
 * @param h       number of rows; must be >= 1 because the counter is only
 *                tested after the first row has been processed
 * @param x       horizontal fraction, asserted to be in 0..7
 * @param y       vertical fraction, asserted to be in 0..7
 *
 * NOTE(review): the pshufh instructions below write to %[A]..%[D], yet
 * those operands are listed as input-only. GCC's extended-asm rules forbid
 * modifying input-only operands; they should probably be read-write ("+f")
 * or copied into scratch registers first.
 */
2350 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2351                                       uint8_t *src /* align 1 */,
2352                                       ptrdiff_t stride, int h, int x, int y)
2353 {
2354     const int A = (8 - x) * (8 - y);
2355     const int B =     (x) * (8 - y);
2356     const int C = (8 - x) *     (y);
2357     const int D =     (x) *     (y);
2358     double ftmp[10];
2359     uint32_t tmp[1];
2360     DECLARE_VAR_ALL64;
2361     DECLARE_VAR_ADDRT;
2362
2363     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2364
2365     __asm__ volatile(
        /* ftmp0 = 0; ftmp9 = 6, the shift count consumed by CHROMA_MC_8_MMI */
2366         "li         %[tmp0],    0x06                                    \n\t"
2367         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2368         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
        /* presumably replicates each weight across all 16-bit lanes
         * (selector ftmp0 is zero) -- see NOTE(review) above about writing
         * to input operands */
2369         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2370         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2371         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2372         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2373
2374         "1:                                                             \n\t"
        /* current row at src and src + 1 ... */
2375         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2376         MMI_ULDC1(%[ftmp2], %[src], 0x01)
        /* ... then the next row; src stays advanced for the next iteration */
2377         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2378         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2379         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2380
2381         CHROMA_MC_8_MMI
2382
        /* blend with the existing destination row (ftmp2 is free again) */
2383         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2384         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2385
        /* store the row, then count down and step dst to the next row */
2386         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2387         "addiu      %[h],       %[h],      -0x01                        \n\t"
2388         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2389         "bnez       %[h],       1b                                      \n\t"
2390         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2391           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2392           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2393           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2394           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2395           [tmp0]"=&r"(tmp[0]),
2396           RESTRICT_ASM_ALL64
2397           RESTRICT_ASM_ADDRT
2398           [src]"+&r"(src),              [dst]"+&r"(dst),
2399           [h]"+&r"(h)
2400         : [stride]"r"((mips_reg)stride),
2401           [A]"f"(A),                    [B]"f"(B),
2402           [C]"f"(C),                    [D]"f"(D),
2403           [ff_pw_28]"f"(ff_pw_28)
2404         : "memory"
2405     );
2406 }
2407
/**
 * Bilinear chroma interpolation, "avg" flavour, for the 4-wide case
 * (width per the function name; the loads/stores go through the
 * MMI_ULWC1 / MMI_LWC1 / MMI_SWC1 helpers from mmiutils.h).
 *
 * Same computation as ff_put_no_rnd_vc1_chroma_mc4_mmi -- four source
 * vectors weighted with
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy   (A+B+C+D == 64),
 * plus ff_pw_28, shifted right by 6 -- but the result is combined with the
 * bytes already at dst through pavgb before being stored back.
 *
 * @param dst     destination, read and rewritten; advanced by stride per row
 * @param src     source, advanced by stride once per row
 * @param stride  byte stride shared by src and dst
 * @param h       number of rows; must be >= 1 because the counter is only
 *                tested after the first row has been processed
 * @param x       horizontal fraction, asserted to be in 0..7
 * @param y       vertical fraction, asserted to be in 0..7
 *
 * NOTE(review): the pshufh instructions below write to %[A]..%[D], yet
 * those operands are listed as input-only. GCC's extended-asm rules forbid
 * modifying input-only operands; they should probably be read-write ("+f")
 * or copied into scratch registers first.
 */
2408 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2409                                       uint8_t *src /* align 1 */,
2410                                       ptrdiff_t stride, int h, int x, int y)
2411 {
2412     const int A = (8 - x) * (8 - y);
2413     const int B = (    x) * (8 - y);
2414     const int C = (8 - x) * (    y);
2415     const int D = (    x) * (    y);
2416     double ftmp[6];
2417     uint32_t tmp[1];
2418     DECLARE_VAR_LOW32;
2419     DECLARE_VAR_ADDRT;
2420
2421     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2422
2423     __asm__ volatile(
        /* ftmp0 = 0; ftmp5 = 6, the shift count consumed by CHROMA_MC_4_MMI */
2424         "li         %[tmp0],    0x06                                    \n\t"
2425         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2426         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
        /* presumably replicates each weight across all 16-bit lanes
         * (selector ftmp0 is zero) -- see NOTE(review) above about writing
         * to input operands */
2427         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2428         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2429         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2430         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2431
2432         "1:                                                             \n\t"
        /* current row at src and src + 1 ... */
2433         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2434         MMI_ULWC1(%[ftmp2], %[src], 0x01)
        /* ... then the next row; src stays advanced for the next iteration */
2435         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2436         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2437         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2438
2439         CHROMA_MC_4_MMI
2440
        /* blend with the existing destination row (ftmp2 is free again) */
2441         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2442         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2443
        /* store the row, then count down and step dst to the next row */
2444         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2445         "addiu      %[h],       %[h],      -0x01                        \n\t"
2446         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2447         "bnez       %[h],       1b                                      \n\t"
2448         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2449           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2450           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2451           [tmp0]"=&r"(tmp[0]),
2452           RESTRICT_ASM_LOW32
2453           RESTRICT_ASM_ADDRT
2454           [src]"+&r"(src),              [dst]"+&r"(dst),
2455           [h]"+&r"(h)
2456         : [stride]"r"((mips_reg)stride),
2457           [A]"f"(A),                    [B]"f"(B),
2458           [C]"f"(C),                    [D]"f"(D),
2459           [ff_pw_28]"f"(ff_pw_28)
2460         : "memory"
2461     );
2462 }