/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

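/*
 * One pass of the 8-point VC-1 inverse transform over four lanes.
 * r1..r4 are 32-bit immediates that each pack two 16-bit transform
 * coefficients (splatted to both words with punpcklwd), c0 is the rounding
 * bias and ftmp0 holds the right-shift amount.  The even-row (ftmp5..ftmp8)
 * and odd-row (ftmp9..ftmp12) partial sums prepared by the caller are
 * combined with a butterfly, shifted, and the two resulting output rows are
 * repacked into o1 and o2.
 */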
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
        "li         %[tmp0],    "#r1"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r2"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
                                                                            \
        "li         %[tmp0],    "#r3"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r4"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
                                                                            \
        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   "#c0"                     \n\t" \
        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"

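/*
 * Same as TYPE1, but for the second (column) pass: c1 adds the extra +1
 * that the VC-1 column transform applies to the difference (bottom-half)
 * outputs of the butterfly before the final rounding with c0.
 */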
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
        "li         %[tmp0],    "#r1"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r2"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
        "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
        "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
                                                                            \
        "li         %[tmp0],    "#r3"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
        "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]                 \n\t" \
        "li         %[tmp0],    "#r4"                                 \n\t" \
        "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
        "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]                 \n\t" \
        "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
        "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
        "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
                                                                            \
        "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
        "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
        "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
        "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
        "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
        "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
        "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
        "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
        "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
        "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"

/* Do inverse transform on 8x8 block: DC-only case */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
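    /* The two shifts above are the exact DC gains of the full transform:
     * (3*dc +  1) >> 1 == (12*dc +  4) >> 3  (row pass)
     * (3*dc + 16) >> 5 == (12*dc + 64) >> 7  (column pass) */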

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
        "li         %[count],   0x02                                    \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],       -0x01                   \n\t"
        PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count),          [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];
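    /* Two-pass inverse transform, each pass split into a left and a right
     * 4-column half: the 1st loop does the row transform (round by ff_pw_4,
     * shift by 3) into temp[], the 2nd loop does the column transform
     * (round by ff_pw_64, plus ff_pw_1 on the bottom-half outputs, shift
     * by 7) back into block[]. */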
    __asm__ volatile (
        /* 1st loop: start */
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],  %[ftmp11],  %[ftmp12]                    \n\t"
        "punpckhhw  %[ftmp10], %[ftmp11],  %[ftmp12]                    \n\t"
        "punpcklhw  %[ftmp11], %[ftmp13],  %[ftmp14]                    \n\t"
        "punpckhhw  %[ftmp12], %[ftmp13],  %[ftmp14]                    \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
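        /* Unlike the 1st part, ftmp15..ftmp18 are not spilled to
         * temp[0x40..0x70]; they stay live in registers and are consumed
         * directly by the 1st part of the 2nd loop below instead of being
         * reloaded from memory. */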
        /* 1st loop: end */

        /* 2nd loop: start */
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]                   \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]                   \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]                   \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block: DC-only case */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
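    /* DC gains of the 8x4 transform: the 8-point row pass contributes
     * (12*dc + 4) >> 3 == (3*dc + 1) >> 1 and the 4-point column pass
     * contributes (17*dc + 64) >> 7. */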

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
                         12, 15,   6,  -4, -12, -16, -16,  -9,
                         12,  9,  -6, -16, -12,   4,  16,  15,
                         12,  4, -16,  -9,  12,  15,  -6, -16,
                         12, -4, -16,   9,  12, -15,  -6,  16,
                         12, -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,  6,   4, -12,  16, -16,   9,
                         12, -16, 16, -15,  12,  -9,   6,  -4};
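    /* coeff[] holds the eight rows of the 8-point VC-1 inverse transform
     * matrix.  The 1st loop forms the eight dot products of each source row
     * with these matrix rows, two outputs at a time (pmaddhw plus a
     * horizontal add), rounds by ff_pw_4 and shifts right by 3. */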

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"

        "1:                                                             \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
        "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
        "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
        "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
        "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
        "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
        "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
        "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
        "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
        "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
        "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
        "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
        "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
        "addiu      %[count],   %[count],   -0x01                       \n\t"
        "bnez       %[count],   1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
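    /* Column pass: 4-point transform on each 4-column half.  The
     * coefficients 17, 22 and 10 (and their negations) are packed pairwise
     * into 32-bit immediates (e.g. 0x00160011 = 22,17) and splatted across
     * the register with pshufh; rounding is ff_pw_64 with a final shift by
     * 7, and the result is added to dest and clamped with packushb. */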
    __asm__ volatile (
        "li         %[tmp0],    0x44                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp15]                               \n\t"

        // 1st part
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0x000a0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0xffeaffef                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0x0016ffef                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0xfff60011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li         %[tmp0],    0x07                                    \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0x000a0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0xffeaffef                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0x0016ffef                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
        "li         %[tmp0],    0xfff60011                              \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        :"memory"
    );
}
#endif

/* Do inverse transform on 4x8 part of block: DC-only case */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
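    /* DC gains of the 4x8 transform: the 4-point row pass contributes
     * (17*dc + 4) >> 3 and the 8-point column pass (12*dc + 64) >> 7. */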
878
879     __asm__ volatile(
880         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
881         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
882
883         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
884         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
885         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
886         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
887         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
888         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
889         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
890         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
891
892         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
893         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
894         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
895         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
896         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
897         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
898         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
899         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
900
901         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
902         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
903         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
904         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
905         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
906         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
907         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
908         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
909
910         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
911         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
912         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
913         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
914         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
915         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
916         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
917         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
918
919         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
920         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
921         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
922         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
923         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
924         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
925         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
926         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
927         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
928           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
929           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
930           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
931           RESTRICT_ASM_LOW32
932           [ftmp8]"=&f"(ftmp[8])
933         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
934           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
935           [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
936           [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
937           [dc]"f"(dc)
938         : "memory"
939     );
940 }
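/*
 * For reference only: a rough scalar equivalent of what the MMI block above
 * computes (the helper name below is ours, not part of FFmpeg).  After the
 * two rounded multiplies, dc is simply added to every pixel of the 4x8 area
 * with unsigned saturation.
 */
static inline void vc1_inv_trans_4x8_dc_scalar_sketch(uint8_t *dest,
                                                      ptrdiff_t linesize,
                                                      int16_t *block)
{
    int i, j;
    int dc = block[0];

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    for (i = 0; i < 8; i++) {       /* 8 rows */
        for (j = 0; j < 4; j++)     /* 4 columns */
            dest[j] = av_clip_uint8(dest[j] + dc);
        dest += linesize;
    }
}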
941
942 #if _MIPS_SIM != _ABIO32
943 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
944 {
945     int16_t *src = block;
946     int16_t *dst = block;
947     double ftmp[23];
948     uint32_t count = 8, tmp[1];
949     int16_t coeff[16] = {17, 22, 17, 10,
950                          17, 10,-17,-22,
951                          17,-10,-17, 22,
952                          17,-22, 17,-10};
953     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
954     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
955     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
956
957     // 1st loop
958     __asm__ volatile (
959
960         "li         %[tmp0],    0x03                                    \n\t"
961         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
962
963         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
964         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
965         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
966         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
967         "1:                                                             \n\t"
968         /* ftmp8: dst3,dst2,dst1,dst0 */
969         MMI_LDC1(%[ftmp1], %[src], 0x00)
970         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
971         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
972         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
973         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
974         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
975         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
976         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
977         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
978         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
979         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
980         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
981         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
982         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
983         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
984         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
985         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
986         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
987         MMI_SDC1(%[ftmp8], %[dst], 0x00)
988
989         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
990         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
991         "addiu      %[count],   %[count],   -0x01                       \n\t"
992         "bnez       %[count],   1b                                      \n\t"
993         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
994           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
995           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
996           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
997           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
998           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
999           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1000           [src]"+&r"(src),              [dst]"+&r"(dst)
1001         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1002         : "memory"
1003     );
1004
1005     src = block;
1006
1007     // 2nd loop
1008     __asm__ volatile (
1009         "li         %[tmp0],    0x07                                    \n\t"
1010         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1011
1012         MMI_LDC1(%[ftmp1], %[src], 0x00)
1013         MMI_LDC1(%[ftmp2], %[src], 0x20)
1014         MMI_LDC1(%[ftmp3], %[src], 0x40)
1015         MMI_LDC1(%[ftmp4], %[src], 0x60)
1016         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1017         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1018         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1019         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1020
1021         MMI_LDC1(%[ftmp1], %[src], 0x10)
1022         MMI_LDC1(%[ftmp2], %[src], 0x30)
1023         MMI_LDC1(%[ftmp3], %[src], 0x50)
1024         MMI_LDC1(%[ftmp4], %[src], 0x70)
1025         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1026         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1027         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1028         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1029
1030         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1031         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1032                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1033
1034         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1035         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1036                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1037
1038         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1039         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1040                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1041
1042         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1043         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1044                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1045
1046         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1047         PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1048         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1049         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1050         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1051         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1052         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1053         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1054         MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1055         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1056         MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1057         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1058         MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1059         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1060         MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1061         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1062         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1063         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1064         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1065         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1066         "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1067         "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1068         "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1069         "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1070
1071         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1072         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1073         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1074         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1075         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1076         "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1077         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1078         "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1079
1080         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1081         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1082         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1083         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1084         "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1085         "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1086         "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1087         "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1088
1089         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1090         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1091         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1092         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1093         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1094         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1095         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1096         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1097         MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1098         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1099         MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1100         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1101         MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1102         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1103         MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1104
1105         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1106           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1107           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1108           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1109           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1110           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1111           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1112           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1113           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1114           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1115           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1116           [ftmp22]"=&f"(ftmp[22]),
1117           [tmp0]"=&r"(tmp[0])
1118         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
1119           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1120         : "memory"
1121     );
1122 }
1123 #endif
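/*
 * Reference-only scalar form of the row pass performed by the "1st loop" of
 * the 4x8 function above (and of the 4x4 function below); the helper name is
 * ours.  Each 4-sample row is multiplied by the coefficient matrix
 * {17,22,17,10; 17,10,-17,-22; 17,-10,-17,22; 17,-22,17,-10}, rounded by 4
 * and shifted right by 3.
 */
static inline void vc1_inv_trans_4_row_sketch(int16_t *row)
{
    int s0 = row[0], s1 = row[1], s2 = row[2], s3 = row[3];

    row[0] = (17 * (s0 + s2) + 22 * s1 + 10 * s3 + 4) >> 3;
    row[1] = (17 * (s0 - s2) + 10 * s1 - 22 * s3 + 4) >> 3;
    row[2] = (17 * (s0 - s2) - 10 * s1 + 22 * s3 + 4) >> 3;
    row[3] = (17 * (s0 + s2) - 22 * s1 - 10 * s3 + 4) >> 3;
}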
1124
1125 /* Do inverse transform on 4x4 part of block */
1126 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1127 {
1128     int dc = block[0];
1129     double ftmp[5];
1130     DECLARE_VAR_LOW32;
1131
1132     dc = (17 * dc +  4) >> 3;
1133     dc = (17 * dc + 64) >> 7;
1134
1135     __asm__ volatile(
1136         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1137         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1138
1139         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1140         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1141         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1142         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1143
1144         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1145         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1146         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1147         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1148
1149         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1150         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1151         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1152         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1153
1154         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1155         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1156         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1157         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1158
1159         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1160         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1161         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1162         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1163         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1164           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1165           RESTRICT_ASM_LOW32
1166           [ftmp4]"=&f"(ftmp[4])
1167         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1168           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1169           [dc]"f"(dc)
1170         : "memory"
1171     );
1172 }
1173
1174 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1175 {
1176     int16_t *src = block;
1177     int16_t *dst = block;
1178     double ftmp[16];
1179     uint32_t count = 4, tmp[1];
1180     int16_t coeff[16] = {17, 22, 17, 10,
1181                          17, 10,-17,-22,
1182                          17,-10,-17, 22,
1183                          17,-22, 17,-10};
1184     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1185     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1186     // 1st loop
1187     __asm__ volatile (
1188
1189         "li         %[tmp0],    0x03                                    \n\t"
1190         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1191         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1192         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1193         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1194         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1195         "1:                                                             \n\t"
1196         /* ftmp8: dst3,dst2,dst1,dst0 */
1197         MMI_LDC1(%[ftmp1], %[src], 0x00)
1198         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1199         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1200         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1201         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1202         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1203         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1204         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1205         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1206         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1207         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1208         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1209         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1210         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1211         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1212         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1213         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1214         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1215         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1216
1217         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1218         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1219         "addiu      %[count],   %[count],   -0x01                       \n\t"
1220         "bnez       %[count],   1b                                      \n\t"
1221         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1222           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1223           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1224           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1225           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1226           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1227           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1228           [src]"+&r"(src),              [dst]"+&r"(dst)
1229         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1230         : "memory"
1231     );
1232
1233     src = block;
1234
1235     // 2nd loop
1236     __asm__ volatile (
1237         "li         %[tmp0],    0x07                                    \n\t"
1238         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1239         "li         %[tmp0],    0x44                                    \n\t"
1240         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1241
1242         MMI_LDC1(%[ftmp1], %[src], 0x00)
1243         MMI_LDC1(%[ftmp2], %[src], 0x10)
1244         MMI_LDC1(%[ftmp3], %[src], 0x20)
1245         MMI_LDC1(%[ftmp4], %[src], 0x30)
1246         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1247         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1248         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1249         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1250
1251         /* ftmp11: dst03,dst02,dst01,dst00 */
1252         "li         %[tmp0],    0x00160011                              \n\t"
1253         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1254         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1255         "li         %[tmp0],    0x000a0011                              \n\t"
1256         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1257         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1258         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1259         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1260         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1261         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1262         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1263         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1264         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1265         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1266         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1267         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1268         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1269         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1270         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1271
1272         /* ftmp12: dst13,dst12,dst11,dst10 */
1273         "li         %[tmp0],    0x000a0011                              \n\t"
1274         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1275         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1276         "li         %[tmp0],    0xffeaffef                              \n\t"
1277         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1278         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1279         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1280         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1281         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1282         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1283         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1284         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1285         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1286         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1287         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1288         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1289         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1290         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1291         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1292
1293         /* ftmp13: dst23,dst22,dst21,dst20 */
1294         "li         %[tmp0],    0xfff60011                              \n\t"
1295         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1296         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1297         "li         %[tmp0],    0x0016ffef                              \n\t"
1298         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1299         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1300         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1301         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1302         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1303         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1304         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1305         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1306         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1307         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1308         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1309         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1310         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1311         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1312         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1313
1314         /* ftmp14: dst33,dst32,dst31,dst30 */
1315         "li         %[tmp0],    0xffea0011                              \n\t"
1316         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1317         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1318         "li         %[tmp0],    0xfff60011                              \n\t"
1319         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1320         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1321         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1322         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1323         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1324         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1325         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1326         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1327         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1328         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1329         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1330         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1331         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1332         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1333         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1334
1335         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1336         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1337         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1338         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1339         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1340         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1341         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1342         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1343         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1344         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1345         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1346         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1347         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1348         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1349         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1350         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1351         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1352         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1353         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1354         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1355
1356         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1357         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1358         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1359         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1360         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1361         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1362         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1363
1364         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1365           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1366           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1367           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1368           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1369           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1370           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1371           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1372           [tmp0]"=&r"(tmp[0])
1373         : [ff_pw_64]"f"(ff_pw_64_local),
1374           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1375         :"memory"
1376     );
1377 }
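/*
 * For clarity (derived from the asm above and the scalar reference in
 * vc1dsp.c): the "2nd loop" applies the same 4-point matrix down each column,
 * but rounds with +64, shifts right by 7, and adds the result to the
 * destination pixels with unsigned saturation.  For one column b0..b3:
 *
 *   dest[0*linesize] = av_clip_uint8(dest[0*linesize] + ((17*(b0+b2) + 22*b1 + 10*b3 + 64) >> 7));
 *   dest[1*linesize] = av_clip_uint8(dest[1*linesize] + ((17*(b0-b2) + 10*b1 - 22*b3 + 64) >> 7));
 *   dest[2*linesize] = av_clip_uint8(dest[2*linesize] + ((17*(b0-b2) - 10*b1 + 22*b3 + 64) >> 7));
 *   dest[3*linesize] = av_clip_uint8(dest[3*linesize] + ((17*(b0+b2) - 22*b1 - 10*b3 + 64) >> 7));
 */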
1378
1379 /* Apply overlap transform to horizontal edge */
1380 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
1381 {
1382     int i;
1383     int a, b, c, d;
1384     int d1, d2;
1385     int rnd = 1;
1386     for (i = 0; i < 8; i++) {
1387         a  = src[-2];
1388         b  = src[-1];
1389         c  = src[0];
1390         d  = src[1];
1391         d1 = (a - d + 3 + rnd) >> 3;
1392         d2 = (a - d + b - c + 4 - rnd) >> 3;
1393
1394         src[-2] = a - d1;
1395         src[-1] = av_clip_uint8(b - d2);
1396         src[0]  = av_clip_uint8(c + d2);
1397         src[1]  = d + d1;
1398         src    += stride;
1399         rnd     = !rnd;
1400     }
1401 }
1402
1403 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1404 {
1405     int i;
1406     int a, b, c, d;
1407     int d1, d2;
1408     int rnd1 = flags & 2 ? 3 : 4;
1409     int rnd2 = 7 - rnd1;
1410     for (i = 0; i < 8; i++) {
1411         a  = left[6];
1412         b  = left[7];
1413         c  = right[0];
1414         d  = right[1];
1415         d1 = a - d;
1416         d2 = a - d + b - c;
1417
1418         left[6]  = ((a << 3) - d1 + rnd1) >> 3;
1419         left[7]  = ((b << 3) - d2 + rnd2) >> 3;
1420         right[0] = ((c << 3) + d2 + rnd1) >> 3;
1421         right[1] = ((d << 3) + d1 + rnd2) >> 3;
1422
1423         right += right_stride;
1424         left  += left_stride;
1425         if (flags & 1) {
1426             rnd2   = 7 - rnd2;
1427             rnd1   = 7 - rnd1;
1428         }
1429     }
1430 }
1431
1432 /* Apply overlap transform to vertical edge */
1433 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1434 {
1435     int i;
1436     int a, b, c, d;
1437     int d1, d2;
1438     int rnd = 1;
1439     for (i = 0; i < 8; i++) {
1440         a  = src[-2 * stride];
1441         b  = src[-stride];
1442         c  = src[0];
1443         d  = src[stride];
1444         d1 = (a - d + 3 + rnd) >> 3;
1445         d2 = (a - d + b - c + 4 - rnd) >> 3;
1446
1447         src[-2 * stride] = a - d1;
1448         src[-stride]     = av_clip_uint8(b - d2);
1449         src[0]           = av_clip_uint8(c + d2);
1450         src[stride]      = d + d1;
1451         src++;
1452         rnd = !rnd;
1453     }
1454 }
1455
1456 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1457 {
1458     int i;
1459     int a, b, c, d;
1460     int d1, d2;
1461     int rnd1 = 4, rnd2 = 3;
1462     for (i = 0; i < 8; i++) {
1463         a  = top[48];
1464         b  = top[56];
1465         c  = bottom[0];
1466         d  = bottom[8];
1467         d1 = a - d;
1468         d2 = a - d + b - c;
1469
1470         top[48]   = ((a << 3) - d1 + rnd1) >> 3;
1471         top[56]   = ((b << 3) - d2 + rnd2) >> 3;
1472         bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1473         bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1474
1475         bottom++;
1476         top++;
1477         rnd2 = 7 - rnd2;
1478         rnd1 = 7 - rnd1;
1479     }
1480 }
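/*
 * Written out per output sample (derived algebraically from the two
 * *_s_overlap functions above, shown only for clarity):
 *
 *   a' = (7*a               + d + rnd1) >> 3
 *   b' = (-a + 7*b + c + d      + rnd2) >> 3
 *   c' = ( a +   b + 7*c - d    + rnd1) >> 3
 *   d' = ( a             + 7*d  + rnd2) >> 3
 *
 * where a,b are the last two samples before the edge and c,d the first two
 * after it.
 */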
1481
1482 /**
1483  * VC-1 in-loop deblocking filter for one line
1484  * @param src pointer to the pixel just past the edge (src[-stride] and src[0] straddle the edge to be filtered)
1485  * @param stride block stride
1486  * @param pq block quantizer
1487  * @return whether other 3 pairs should be filtered or not
1488  * @see 8.6
1489  */
1490 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1491 {
1492     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1493               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1494     int a0_sign = a0 >> 31;        /* Store sign */
1495
1496     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1497     if (a0 < pq) {
1498         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1499                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1500         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1501                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1502         if (a1 < a0 || a2 < a0) {
1503             int clip      = src[-1 * stride] - src[0 * stride];
1504             int clip_sign = clip >> 31;
1505
1506             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1507             if (clip) {
1508                 int a3     = FFMIN(a1, a2);
1509                 int d      = 5 * (a3 - a0);
1510                 int d_sign = (d >> 31);
1511
1512                 d       = ((d ^ d_sign) - d_sign) >> 3;
1513                 d_sign ^= a0_sign;
1514
1515                 if (d_sign ^ clip_sign)
1516                     d = 0;
1517                 else {
1518                     d = FFMIN(d, clip);
1519                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1520                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1521                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1522                 }
1523                 return 1;
1524             }
1525         }
1526     }
1527     return 0;
1528 }
1529
1530 /**
1531  * VC-1 in-loop deblocking filter
1532  * @param src pointer to the pixels just past the edge being filtered
1533  * @param step distance between horizontally adjacent elements
1534  * @param stride distance between vertically adjacent elements
1535  * @param len edge length to filter (4, 8 or 16 pixels)
1536  * @param pq block quantizer
1537  * @see 8.6
1538  */
1539 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1540                                    int len, int pq)
1541 {
1542     int i;
1543     int filt3;
1544
1545     for (i = 0; i < len; i += 4) {
1546         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1547         if (filt3) {
1548             vc1_filter_line(src + 0 * step, stride, pq);
1549             vc1_filter_line(src + 1 * step, stride, pq);
1550             vc1_filter_line(src + 3 * step, stride, pq);
1551         }
1552         src += step * 4;
1553     }
1554 }
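/*
 * The wrappers below choose (step, stride) for vc1_loop_filter: the v_
 * variants walk along a horizontal edge one column at a time (step 1) and
 * filter vertically across it (stride = picture stride); the h_ variants do
 * the opposite (step = picture stride, stride 1) for a vertical edge.
 */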
1555
1556 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1557 {
1558     vc1_loop_filter(src, 1, stride, 4, pq);
1559 }
1560
1561 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1562 {
1563     vc1_loop_filter(src, stride, 1, 4, pq);
1564 }
1565
1566 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1567 {
1568     vc1_loop_filter(src, 1, stride, 8, pq);
1569 }
1570
1571 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1572 {
1573     vc1_loop_filter(src, stride, 1, 8, pq);
1574 }
1575
1576 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1577 {
1578     vc1_loop_filter(src, 1, stride, 16, pq);
1579 }
1580
1581 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1582 {
1583     vc1_loop_filter(src, stride, 1, 16, pq);
1584 }
1585
1586 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1587                                ptrdiff_t stride, int rnd)
1588 {
1589     ff_put_pixels8_8_mmi(dst, src, stride, 8);
1590 }
1591 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1592                                   ptrdiff_t stride, int rnd)
1593 {
1594     ff_put_pixels16_8_mmi(dst, src, stride, 16);
1595 }
1596 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1597                                ptrdiff_t stride, int rnd)
1598 {
1599     ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1600 }
1601 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1602                                   ptrdiff_t stride, int rnd)
1603 {
1604     ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1605 }
1606
1607 #define OP_PUT(S, D)
1608 #define OP_AVG(S, D)                                                        \
1609     "ldc1       $f16,   "#S"                        \n\t"                   \
1610     "pavgb      "#D",   "#D",   $f16                \n\t"
1611
1612 /** Add the rounder in $f14 to $f6/$f8 and shift both right to normalize */
1613 #define NORMALIZE_MMI(SHIFT)                                                \
1614     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1615     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1616     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1617     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1618
1619 #define TRANSFER_DO_PACK(OP)                                                \
1620     "packushb   $f6,    $f6,    $f8                 \n\t"                   \
1621     OP((%[dst]), $f6)                                                       \
1622     "sdc1       $f6,    0x00(%[dst])                \n\t"
1623
1624 #define TRANSFER_DONT_PACK(OP)                                              \
1625      OP(0(%[dst]), $f6)                                                     \
1626      OP(8(%[dst]), $f8)                                                     \
1627      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1628      "sdc1      $f8,    0x08(%[dst])                \n\t"
1629
1630 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1631 #define DO_UNPACK(reg)                                                      \
1632     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1633 #define DONT_UNPACK(reg)
1634
1635 /** Load the rounder 32-r or 8-r and broadcast it across $f14 */
1636 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1637     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1638     "punpcklhw  $f14,   $f14,   $f14                \n\t"                   \
1639     "punpcklwd  $f14,   $f14,   $f14                \n\t"
1640
1641
1642 #define SHIFT2_LINE(OFF, R0, R1, R2, R3)                                    \
1643     "paddh      "#R1",      "#R1",  "#R2"           \n\t"                   \
1644     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1645     MMI_ULWC1(R0, $9, 0x00)                                                 \
1646     "pmullh     "#R1",      "#R1",  $f6             \n\t"                   \
1647     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1648     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1649     MMI_ULWC1(R3, $9, 0x00)                                                 \
1650     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1651     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1652     "paddh      "#R1",      "#R1",  $f14            \n\t"                   \
1653     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1654     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1655     MMI_SDC1(R1, %[dst], OFF)                                               \
1656     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t"
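/*
 * One SHIFT2_LINE invocation computes, four pixels at a time (with %[src]
 * pointing at the lower of the two centre rows on entry):
 *
 *   R1 = (9*(centre_row0 + centre_row1) - row_above - row_below + rounder) >> shift
 *
 * i.e. the (-1,9,9,-1) vertical half-pel filter, stored as 16-bit
 * intermediates at dst+OFF; it then advances %[src] by one row.
 */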
1657
1658 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1659 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1660                                        const uint8_t *src, mips_reg stride,
1661                                        int rnd, int64_t shift)
1662 {
1663     DECLARE_VAR_LOW32;
1664     DECLARE_VAR_ADDRT;
1665
1666     __asm__ volatile(
1667         "xor        $f0,    $f0,    $f0             \n\t"
1668         "li         $8,     0x03                    \n\t"
1669         LOAD_ROUNDER_MMI("%[rnd]")
1670         "ldc1       $f12,   %[ff_pw_9]              \n\t"
1671         "1:                                         \n\t"
1672         MMI_ULWC1($f4, %[src], 0x00)
1673         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1674         MMI_ULWC1($f6, %[src], 0x00)
1675         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1676         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1677         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
1678         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1679         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1680         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1681         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1682         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1683         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1684         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1685         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
1686         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
1687         "addiu      $8,     $8,    -0x01            \n\t"
1688         "bnez       $8,     1b                      \n\t"
1689         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1690           [src]"+r"(src),               [dst]"+r"(dst)
1691         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1692           [shift]"f"(shift),            [rnd]"m"(rnd),
1693           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1694         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1695           "$f14", "$f16", "memory"
1696     );
1697 }
1698
1699 /**
1700  * The data is already unpacked to 16 bits, so some operands can be used
1701  * directly from memory.
1702  */
1703 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1704 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1705                                              const int16_t *src, int rnd)   \
1706 {                                                                           \
1707     int h = 8;                                                              \
1708     DECLARE_VAR_ALL64;                                                      \
1709     DECLARE_VAR_ADDRT;                                                      \
1710                                                                             \
1711     src -= 1;                                                               \
1712     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1713                                                                             \
1714     __asm__ volatile(                                                       \
1715         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1716         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1717         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1718         "1:                                         \n\t"                   \
1719         MMI_ULDC1($f2, %[src], 0x00)                                        \
1720         MMI_ULDC1($f4, %[src], 0x08)                                        \
1721         MMI_ULDC1($f6, %[src], 0x02)                                        \
1722         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1723         MMI_ULDC1($f0, %[src], 0x06)                                        \
1724         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1725         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1726         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1727         MMI_ULDC1($f0, %[src], 0x04)                                        \
1728         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1729         MMI_ULDC1($f0, %[src], 0x0c)                                        \
1730         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1731         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1732         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1733         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1734         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1735         "li         $8,     0x07                    \n\t"                   \
1736         "mtc1       $8,     $f16                    \n\t"                   \
1737         NORMALIZE_MMI("$f16")                                               \
1738         /* Remove bias */                                                   \
1739         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1740         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1741         TRANSFER_DO_PACK(OP)                                                \
1742         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1743         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1744         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1745         "bnez       %[h],   1b                      \n\t"                   \
1746         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1747           [h]"+r"(h),                                                       \
1748           [src]"+r"(src),               [dst]"+r"(dst)                      \
1749         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1750           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1751         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1752           "$f16", "memory"                                                  \
1753     );                                                                      \
1754 }
1755
1756 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1757 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
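/*
 * In the VC1_HOR_16B_SHIFT2 body above, $f2/$f4 gather the two outer (-1)
 * taps (element offsets 0 and 3 of each group of four) and $f6/$f8 the two
 * middle (+9) taps (offsets 1 and 2); the middle sums are multiplied by 9,
 * the outer sums subtracted, the pre-adjusted rounder added, the result
 * shifted right by 7, re-biased with ff_pw_128 and packed to bytes.
 */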
1758
1759 /**
1760  * Purely vertical or horizontal 1/2 shift interpolation.
1761  * Sacrifice $f12 for the *9 factor.
1762  */
1763 #define VC1_SHIFT2(OP, OPNAME)\
1764 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1765                                      mips_reg stride, int rnd,              \
1766                                      mips_reg offset)                       \
1767 {                                                                           \
1768     DECLARE_VAR_LOW32;                                                      \
1769     DECLARE_VAR_ADDRT;                                                      \
1770                                                                             \
1771     rnd = 8 - rnd;                                                          \
1772                                                                             \
1773     __asm__ volatile(                                                       \
1774         "xor        $f0,    $f0,    $f0             \n\t"                   \
1775         "li         $10,    0x08                    \n\t"                   \
1776         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1777         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1778         "1:                                         \n\t"                   \
1779         MMI_ULWC1($f6, %[src], 0x00)                                        \
1780         MMI_ULWC1($f8, %[src], 0x04)                                        \
1781         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1782         MMI_ULWC1($f2, $9, 0x00)                                            \
1783         MMI_ULWC1($f4, $9, 0x04)                                            \
1784         PTR_ADDU   "%[src], %[src], %[offset]       \n\t"                   \
1785         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1786         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1787         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1788         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1789         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1790         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1791         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"                   \
1792         MMI_ULWC1($f2, $9, 0x00)                                            \
1793         MMI_ULWC1($f4, $9, 0x04)                                            \
1794         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1795         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1796         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1797         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1798         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1799         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1800         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1801         MMI_ULWC1($f2, $9, 0x00)                                            \
1802         MMI_ULWC1($f4, $9, 0x04)                                            \
1803         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1804         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1805         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1806         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1807         "li         $8,     0x04                    \n\t"                   \
1808         "mtc1       $8,     $f16                    \n\t"                   \
1809         NORMALIZE_MMI("$f16")                                               \
1810         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1811         OP((%[dst]), $f6)                                                   \
1812         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1813         "addiu      $10,    $10,   -0x01            \n\t"                   \
1814         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"                   \
1815         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1816         "bnez       $10,    1b                      \n\t"                   \
1817         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1818           [src]"+r"(src),               [dst]"+r"(dst)                      \
1819         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1820           [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1821           [stride1]"r"(stride-offset),                                      \
1822           [ff_pw_9]"m"(ff_pw_9)                                             \
1823         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1824           "$f12", "$f14", "$f16", "memory"                                  \
1825     );                                                                      \
1826 }
1827
1828 VC1_SHIFT2(OP_PUT, put_)
1829 VC1_SHIFT2(OP_AVG, avg_)
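/*
 * What one output pixel of the shift2 filters above works out to, in scalar
 * form.  Reference only; the helper name is ours.  'off' is 1 for the
 * horizontal case and the picture stride for the vertical case, 'r' is the
 * rounding flag passed in as rnd.
 */
static inline uint8_t vc1_shift2_pixel_sketch(const uint8_t *src, int off, int r)
{
    /* -1,9,9,-1 half-pel tap set, normalized by 16 */
    return av_clip_uint8((-src[-off] + 9 * src[0] + 9 * src[off] -
                          src[2 * off] + 8 - r) >> 4);
}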
1830
1831 /**
1832  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1833  *
1834  * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
1835  * @param LOAD    "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1836  * @param M       Element size multiplier: "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1837  * @param A1      Stride offset of the 1st tap (beware of unpacked/packed).
1838  * @param A2      Stride offset of the 2nd tap
1839  * @param A3      Stride offset of the 3rd tap
1840  * @param A4      Stride offset of the 4th tap
1841  */
1842 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1843     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1844     LOAD($f2, $9, M*0)                                                      \
1845     LOAD($f4, $9, M*4)                                                      \
1846     UNPACK("$f2")                                                           \
1847     UNPACK("$f4")                                                           \
1848     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                       \
1849     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                       \
1850     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1851     LOAD($f6, $9, M*0)                                                      \
1852     LOAD($f8, $9, M*4)                                                      \
1853     UNPACK("$f6")                                                           \
1854     UNPACK("$f8")                                                           \
1855     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1856     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1857     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1858     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1859     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1860     LOAD($f2, $9, M*0)                                                      \
1861     LOAD($f4, $9, M*4)                                                      \
1862     UNPACK("$f2")                                                           \
1863     UNPACK("$f4")                                                           \
1864     "li         $8,     0x02                    \n\t"                       \
1865     "mtc1       $8,     $f16                    \n\t"                       \
1866     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1867     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1868     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1869     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1870     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1871     LOAD($f2, $9, M*0)                                                      \
1872     LOAD($f4, $9, M*4)                                                      \
1873     UNPACK("$f2")                                                           \
1874     UNPACK("$f4")                                                           \
1875     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1876     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1877     "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */      \
1878     "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */
1879
1880 /**
1881  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1882  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1883  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1884  *
1885  * @param  NAME   Either 1 or 3
1886  * @see MSPEL_FILTER13_CORE for information on A1->A4
1887  */
1888 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1889 static void                                                                 \
1890 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1891                                  mips_reg src_stride,                       \
1892                                  int rnd, int64_t shift)                    \
1893 {                                                                           \
1894     int h = 8;                                                              \
1895     DECLARE_VAR_LOW32;                                                      \
1896     DECLARE_VAR_ADDRT;                                                      \
1897                                                                             \
1898     src -= src_stride;                                                      \
1899                                                                             \
1900     __asm__ volatile(                                                       \
1901         "xor        $f0,    $f0,    $f0             \n\t"                   \
1902         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1903         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1904         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1905         ".p2align 3                                 \n\t"                   \
1906         "1:                                         \n\t"                   \
1907         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1908         NORMALIZE_MMI("%[shift]")                                           \
1909         TRANSFER_DONT_PACK(OP_PUT)                                          \
1910         /* Last 3 (in fact 4) bytes on the line */                          \
1911         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1912         MMI_ULWC1($f2, $9, 0x08)                                            \
1913         DO_UNPACK("$f2")                                                    \
1914         "mov.d      $f6,    $f2                     \n\t"                   \
1915         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1916         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1917         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1918         MMI_ULWC1($f6, $9, 0x08)                                            \
1919         DO_UNPACK("$f6")                                                    \
1920         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1921         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1922         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1923         MMI_ULWC1($f2, $9, 0x08)                                            \
1924         DO_UNPACK("$f2")                                                    \
1925         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1926         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1927         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1928         MMI_ULWC1($f2, $9, 0x08)                                            \
1929         DO_UNPACK("$f2")                                                    \
1930         "li         $8,     0x02                    \n\t"                   \
1931         "mtc1       $8,     $f16                    \n\t"                   \
1932         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1933         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1934         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1935         "li         $8,     0x06                    \n\t"                   \
1936         "mtc1       $8,     $f16                    \n\t"                   \
1937         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1938         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1939         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1940         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1941         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1942         "bnez       %[h],   1b                      \n\t"                   \
1943         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1944           [h]"+r"(h),                                                       \
1945           [src]"+r"(src),               [dst]"+r"(dst)                      \
1946         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1947           [stride_x3]"r"(3*src_stride),                                     \
1948           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1949           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1950           [ff_pw_3]"f"(ff_pw_3)                                             \
1951         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1952           "$f14", "$f16", "memory"                                          \
1953     );                                                                      \
1954 }
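/*
 * Editor's illustrative sketch (not used by the MMI path): what one 16-bit
 * intermediate sample of the shift1 vertical pass above computes, using the
 * (-4, 53, 18, -3) bicubic taps implied by the asm comments and the rounder
 * and shift handed in by vc1_mspel_mc; shift3 simply mirrors the taps. The
 * helper name is hypothetical.
 */
static inline int16_t vc1_ver_shift1_sample_sketch(const uint8_t *src,
                                                   int stride, int rnd, int shift)
{
    /* Four vertically adjacent pixels around the output row. */
    int sum = -4 * src[-stride] + 53 * src[0] +
              18 * src[stride]  -  3 * src[2 * stride];
    /* Rounded, shifted, and stored unpacked as int16 (TRANSFER_DONT_PACK). */
    return (int16_t)((sum + rnd) >> shift);
}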
1955
1956 /**
1957  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1958  * Here the source is 16-bit data, so parameters A1 to A4 are plain constants.
1959  *
1960  * @param  NAME   Either shift1 or shift3
1961  * @see MSPEL_FILTER13_CORE for information on A1->A4
1962  */
1963 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
1964 static void                                                                 \
1965 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
1966                                        const int16_t *src, int rnd)         \
1967 {                                                                           \
1968     int h = 8;                                                              \
1969     DECLARE_VAR_ALL64;                                                      \
1970     DECLARE_VAR_ADDRT;                                                      \
1971                                                                             \
1972     src -= 1;                                                               \
1973     rnd -= (-4+53+18-3)*256; /* filter taps sum to 64: add -256 bias */      \
1974                                                                             \
1975     __asm__ volatile(                                                       \
1976         "xor        $f0,    $f0,    $f0             \n\t"                   \
1977         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1978         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1979         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1980         ".p2align 3                                 \n\t"                   \
1981         "1:                                         \n\t"                   \
1982         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
1983         "li         $8,     0x07                    \n\t"                   \
1984         "mtc1       $8,     $f16                    \n\t"                   \
1985         NORMALIZE_MMI("$f16")                                               \
1986         /* Remove bias */                                                   \
1987         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
1988         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
1989         TRANSFER_DO_PACK(OP)                                                \
1990         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1991         PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
1992         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1993         "bnez       %[h],   1b                      \n\t"                   \
1994         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1995           [h]"+r"(h),                                                       \
1996           [src]"+r"(src),               [dst]"+r"(dst)                      \
1997         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1998           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1999           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
2000         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2001           "$f14", "$f16", "memory"                                          \
2002     );                                                                      \
2003 }
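/*
 * Editor's illustrative sketch: the final pixel produced by the shift1
 * horizontal pass over the 16-bit intermediates, assuming the arithmetic of
 * the generic C implementation; vc1_mspel_mc below passes 64 - rnd and the
 * pass normalizes with a shift of 7 before packing with unsigned saturation
 * (packushb). The helper name and explicit clamp are illustrative only.
 */
static inline uint8_t vc1_hor_shift1_sample_sketch(const int16_t *tmp, int rnd)
{
    int sum = -4 * tmp[-1] + 53 * tmp[0] + 18 * tmp[1] - 3 * tmp[2];
    sum = (sum + 64 - rnd) >> 7;
    return sum < 0 ? 0 : sum > 255 ? 255 : sum;
}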
2004
2005 /**
2006  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2007  * Here the tap spacing is the offset argument: src_stride (vertical use)
2008  * or 1 (horizontal use). A1 to A4 must use %[offset_x1] to %[offset_x3].
2009  *
2010  * @param  NAME   Either shift1 or shift3
2011  * @see MSPEL_FILTER13_CORE for information on A1->A4
2012  */
2013 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2014 static void                                                                 \
2015 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2016                               mips_reg stride, int rnd, mips_reg offset)    \
2017 {                                                                           \
2018     int h = 8;                                                              \
2019     DECLARE_VAR_LOW32;                                                      \
2020     DECLARE_VAR_ADDRT;                                                      \
2021                                                                             \
2022     src -= offset;                                                          \
2023     rnd = 32-rnd;                                                           \
2024                                                                             \
2025     __asm__ volatile (                                                      \
2026         "xor        $f0,    $f0,    $f0             \n\t"                   \
2027         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2028         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2029         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2030         ".p2align 3                                 \n\t"                   \
2031         "1:                                         \n\t"                   \
2032         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2033         "li         $8,     0x06                    \n\t"                   \
2034         "mtc1       $8,     $f16                    \n\t"                   \
2035         NORMALIZE_MMI("$f16")                                               \
2036         TRANSFER_DO_PACK(OP)                                                \
2037         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2038         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2039         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2040         "bnez       %[h],   1b                      \n\t"                   \
2041         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2042           [h]"+r"(h),                                                       \
2043           [src]"+r"(src),               [dst]"+r"(dst)                      \
2044         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2045           [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2046           [rnd]"m"(rnd),                                                    \
2047           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2048           [ff_pw_3]"f"(ff_pw_3)                                             \
2049         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2050           "$f14", "$f16", "memory"                                          \
2051     );                                                                      \
2052 }
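/*
 * Editor's illustrative sketch: one output pixel of the single-pass 8-bit
 * shift1 filter, using the 32 - rnd rounder and shift-by-6 normalization of
 * the macro above; 'offset' is the tap spacing (src_stride or 1). For OP_AVG
 * the result is further averaged with the existing dst byte. The helper name
 * is hypothetical.
 */
static inline uint8_t vc1_shift1_sample_sketch(const uint8_t *src,
                                               int offset, int rnd)
{
    int sum = -4 * src[-offset] + 53 * src[0] +
              18 * src[offset]  -  3 * src[2 * offset];
    sum = (sum + 32 - rnd) >> 6;
    return sum < 0 ? 0 : sum > 255 ? 255 : sum;   /* packushb saturation */
}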
2053
2054
2055 /** 1/4 shift bicubic interpolation */
2056 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2057 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2058 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2059 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2060 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2061
2062 /** 3/4 shift bicubic interpolation */
2063 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2064 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2065 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2066 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2067 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2068
2069 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2070              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2071               int64_t shift);
2072 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2073              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2074 typedef void (*vc1_mspel_mc_filter_8bits)
2075              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2076               mips_reg offset);
2077
2078 /**
2079  * Interpolate fractional pel values by applying the proper vertical filter
2080  * and then the proper horizontal filter.
2081  *
2082  * @param  dst     Destination buffer for interpolated pels.
2083  * @param  src     Source buffer.
2084  * @param  stride  Stride for both src and dst buffers.
2085  * @param  hmode   Horizontal filter (expressed in quarter-pel shift).
2086  * @param  vmode   Vertical filter (expressed in quarter-pel shift).
2087  * @param  rnd     Rounding bias.
2088  */
2089 #define VC1_MSPEL_MC(OP)                                                    \
2090 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2091                                int hmode, int vmode, int rnd)               \
2092 {                                                                           \
2093     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2094          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
2095                  vc1_put_ver_16b_shift2_mmi,                                \
2096                  vc1_put_ver_16b_shift3_mmi };                              \
2097     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2098          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
2099                  OP ## vc1_hor_16b_shift2_mmi,                              \
2100                  OP ## vc1_hor_16b_shift3_mmi };                            \
2101     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
2102          { NULL, OP ## vc1_shift1_mmi,                                      \
2103                  OP ## vc1_shift2_mmi,                                      \
2104                  OP ## vc1_shift3_mmi };                                    \
2105                                                                             \
2106     if (vmode) { /* Vertical filter to apply */                             \
2107         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
2108             static const int shift_value[] = { 0, 5, 1, 5 };                \
2109             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
2110             int    r;                                                       \
2111             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
2112                                                                             \
2113             r = (1<<(shift-1)) + rnd-1;                                     \
2114             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
2115                                                                             \
2116             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
2117             return;                                                         \
2118         }                                                                   \
2119         else { /* No horizontal filter, output 8 lines to dst */            \
2120             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
2121             return;                                                         \
2122         }                                                                   \
2123     }                                                                       \
2124                                                                             \
2125     /* Horizontal mode with no vertical mode */                             \
2126     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
2127 }                                                                           \
2128 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
2129                                   int stride, int hmode, int vmode, int rnd)\
2130 {                                                                           \
2131     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2132     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2133     dst += 8*stride; src += 8*stride;                                       \
2134     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2135     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2136 }
2137
2138 VC1_MSPEL_MC(put_)
2139 VC1_MSPEL_MC(avg_)
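/*
 * Editor's worked example of the dispatch arithmetic above, for hmode = 1
 * (1/4 pel) and vmode = 2 (1/2 pel), put_ variant shown:
 *   shift = (shift_value[1] + shift_value[2]) >> 1 = (5 + 1) >> 1 = 3
 *   r     = (1 << (shift - 1)) + rnd - 1           = 4 + rnd - 1
 *   vertical pass:   vc1_put_ver_16b_shift2_mmi(tmp, src - 1, stride, r, 3)
 *   horizontal pass: put_vc1_hor_16b_shift1_mmi(dst, stride, tmp + 1, 64 - rnd)
 * With hmode == 0 (vertical only) the single 8-bit pass runs with
 * offset = stride and rounder argument 1 - rnd; with vmode == 0 it runs with
 * offset = 1 and rnd unchanged.
 */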
2140
2141 /** Macro to ease declaration of the bicubic interpolation wrapper functions */
2142 #define DECLARE_FUNCTION(a, b)                                              \
2143 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2144                                            const uint8_t *src,              \
2145                                            ptrdiff_t stride,                \
2146                                            int rnd)                         \
2147 {                                                                           \
2148      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2149 }                                                                           \
2150 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2151                                            const uint8_t *src,              \
2152                                            ptrdiff_t stride,                \
2153                                            int rnd)                         \
2154 {                                                                           \
2155      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2156 }                                                                           \
2157 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2158                                               const uint8_t *src,           \
2159                                               ptrdiff_t stride,             \
2160                                               int rnd)                      \
2161 {                                                                           \
2162      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2163 }                                                                           \
2164 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2165                                               const uint8_t *src,           \
2166                                               ptrdiff_t stride,             \
2167                                               int rnd)                      \
2168 {                                                                           \
2169      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2170 }
2171
2172 DECLARE_FUNCTION(0, 1)
2173 DECLARE_FUNCTION(0, 2)
2174 DECLARE_FUNCTION(0, 3)
2175
2176 DECLARE_FUNCTION(1, 0)
2177 DECLARE_FUNCTION(1, 1)
2178 DECLARE_FUNCTION(1, 2)
2179 DECLARE_FUNCTION(1, 3)
2180
2181 DECLARE_FUNCTION(2, 0)
2182 DECLARE_FUNCTION(2, 1)
2183 DECLARE_FUNCTION(2, 2)
2184 DECLARE_FUNCTION(2, 3)
2185
2186 DECLARE_FUNCTION(3, 0)
2187 DECLARE_FUNCTION(3, 1)
2188 DECLARE_FUNCTION(3, 2)
2189 DECLARE_FUNCTION(3, 3)
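/*
 * Editor's usage sketch (not part of this file): the wrappers generated above
 * are what the VC-1 motion compensation code ends up calling. For a block at
 * horizontal quarter-pel phase 1 and vertical phase 2 that is the
 * DECLARE_FUNCTION(1, 2) instance; the caller below is hypothetical.
 */
static inline void mspel_mc12_usage_sketch(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride, int rnd)
{
    /* Equivalent to put_vc1_mspel_mc(dst, src, stride, 1, 2, rnd). */
    ff_put_vc1_mspel_mc12_mmi(dst, src, stride, rnd);
}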
2190
2191 #define CHROMA_MC_8_MMI                                                     \
2192         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
2193         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2194         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
2195         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2196         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
2197         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2198         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
2199         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2200                                                                             \
2201         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2202         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
2203         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2204         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
2205         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2206         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
2207         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2208         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
2209                                                                             \
2210         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2211         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2212         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2213         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2214                                                                             \
2215         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
2216         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
2217         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
2218         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
2219                                                                             \
2220         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
2221         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
2222         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2223
2224
2225 #define CHROMA_MC_4_MMI                                                     \
2226         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2227         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2228         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2229         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2230                                                                             \
2231         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2232         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2233         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2234         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2235                                                                             \
2236         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2237         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2238         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2239         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2240                                                                             \
2241         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
2242         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2243
2244
2245 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2246                                       uint8_t *src /* align 1 */,
2247                                       ptrdiff_t stride, int h, int x, int y)
2248 {
2249     const int A = (8 - x) * (8 - y);
2250     const int B =     (x) * (8 - y);
2251     const int C = (8 - x) *     (y);
2252     const int D =     (x) *     (y);
2253     double ftmp[10];
2254     uint32_t tmp[1];
2255     DECLARE_VAR_ALL64;
2256     DECLARE_VAR_ADDRT;
2257
2258     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2259
2260     __asm__ volatile(
2261         "li         %[tmp0],    0x06                                    \n\t"
2262         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2263         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2264         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2265         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2266         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2267         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2268
2269         "1:                                                             \n\t"
2270         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2271         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2272         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2273         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2274         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2275
2276         CHROMA_MC_8_MMI
2277
2278         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2279         "addiu      %[h],       %[h],      -0x01                        \n\t"
2280         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2281         "bnez       %[h],       1b                                      \n\t"
2282         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2283           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2284           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2285           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2286           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2287           RESTRICT_ASM_ALL64
2288           RESTRICT_ASM_ADDRT
2289           [tmp0]"=&r"(tmp[0]),
2290           [src]"+&r"(src),              [dst]"+&r"(dst),
2291           [h]"+&r"(h)
2292         : [stride]"r"((mips_reg)stride),
2293           [A]"f"(A),                    [B]"f"(B),
2294           [C]"f"(C),                    [D]"f"(D),
2295           [ff_pw_28]"f"(ff_pw_28)
2296         : "memory"
2297     );
2298 }
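/*
 * Editor's scalar equivalent of the loop above: a bilinear blend of the four
 * neighbouring chroma samples with the A/B/C/D weights, the no-rounding bias
 * of 28 (ff_pw_28) and a shift by 6. The function name is hypothetical and
 * nothing in this file calls it.
 */
static inline void put_no_rnd_vc1_chroma_mc8_sketch(uint8_t *dst,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride,
                                                    int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =     (x) * (8 - y);
    const int C = (8 - x) *     (y);
    const int D =     (x) *     (y);
    int i, j;

    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}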
2299
2300 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2301                                       uint8_t *src /* align 1 */,
2302                                       ptrdiff_t stride, int h, int x, int y)
2303 {
2304     const int A = (8 - x) * (8 - y);
2305     const int B =     (x) * (8 - y);
2306     const int C = (8 - x) *     (y);
2307     const int D =     (x) *     (y);
2308     double ftmp[6];
2309     uint32_t tmp[1];
2310     DECLARE_VAR_LOW32;
2311     DECLARE_VAR_ADDRT;
2312
2313     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2314
2315     __asm__ volatile(
2316         "li         %[tmp0],    0x06                                    \n\t"
2317         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2318         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2319         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2320         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2321         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2322         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2323
2324         "1:                                                             \n\t"
2325         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2326         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2327         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2328         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2329         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2330
2331         CHROMA_MC_4_MMI
2332
2333         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2334         "addiu      %[h],       %[h],      -0x01                        \n\t"
2335         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2336         "bnez       %[h],       1b                                      \n\t"
2337         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2338           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2339           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2340           [tmp0]"=&r"(tmp[0]),
2341           RESTRICT_ASM_LOW32
2342           RESTRICT_ASM_ADDRT
2343           [src]"+&r"(src),              [dst]"+&r"(dst),
2344           [h]"+&r"(h)
2345         : [stride]"r"((mips_reg)stride),
2346           [A]"f"(A),                    [B]"f"(B),
2347           [C]"f"(C),                    [D]"f"(D),
2348           [ff_pw_28]"f"(ff_pw_28)
2349         : "memory"
2350     );
2351 }
2352
2353 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2354                                       uint8_t *src /* align 1 */,
2355                                       ptrdiff_t stride, int h, int x, int y)
2356 {
2357     const int A = (8 - x) * (8 - y);
2358     const int B =     (x) * (8 - y);
2359     const int C = (8 - x) *     (y);
2360     const int D =     (x) *     (y);
2361     double ftmp[10];
2362     uint32_t tmp[1];
2363     DECLARE_VAR_ALL64;
2364     DECLARE_VAR_ADDRT;
2365
2366     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2367
2368     __asm__ volatile(
2369         "li         %[tmp0],    0x06                                    \n\t"
2370         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2371         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
2372         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2373         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2374         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2375         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2376
2377         "1:                                                             \n\t"
2378         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2379         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2380         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2381         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2382         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2383
2384         CHROMA_MC_8_MMI
2385
2386         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2387         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2388
2389         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2390         "addiu      %[h],       %[h],      -0x01                        \n\t"
2391         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2392         "bnez       %[h],       1b                                      \n\t"
2393         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2394           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2395           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2396           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2397           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2398           [tmp0]"=&r"(tmp[0]),
2399           RESTRICT_ASM_ALL64
2400           RESTRICT_ASM_ADDRT
2401           [src]"+&r"(src),              [dst]"+&r"(dst),
2402           [h]"+&r"(h)
2403         : [stride]"r"((mips_reg)stride),
2404           [A]"f"(A),                    [B]"f"(B),
2405           [C]"f"(C),                    [D]"f"(D),
2406           [ff_pw_28]"f"(ff_pw_28)
2407         : "memory"
2408     );
2409 }
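/*
 * Editor's note: the avg variants differ from the put variants only in the
 * final store. pavgb is a rounding average, so in scalar terms each output
 * byte becomes
 *     dst[i] = (dst[i] + interpolated[i] + 1) >> 1;
 * for the 8 (mc8) or 4 (mc4) bytes of every row.
 */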
2410
2411 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2412                                       uint8_t *src /* align 1 */,
2413                                       ptrdiff_t stride, int h, int x, int y)
2414 {
2415     const int A = (8 - x) * (8 - y);
2416     const int B = (    x) * (8 - y);
2417     const int C = (8 - x) * (    y);
2418     const int D = (    x) * (    y);
2419     double ftmp[6];
2420     uint32_t tmp[1];
2421     DECLARE_VAR_LOW32;
2422     DECLARE_VAR_ADDRT;
2423
2424     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2425
2426     __asm__ volatile(
2427         "li         %[tmp0],    0x06                                    \n\t"
2428         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2429         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
2430         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2431         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2432         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2433         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2434
2435         "1:                                                             \n\t"
2436         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2437         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2438         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2439         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2440         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2441
2442         CHROMA_MC_4_MMI
2443
2444         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2445         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2446
2447         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2448         "addiu      %[h],       %[h],      -0x01                        \n\t"
2449         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2450         "bnez       %[h],       1b                                      \n\t"
2451         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2452           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2453           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2454           [tmp0]"=&r"(tmp[0]),
2455           RESTRICT_ASM_LOW32
2456           RESTRICT_ASM_ADDRT
2457           [src]"+&r"(src),              [dst]"+&r"(dst),
2458           [h]"+&r"(h)
2459         : [stride]"r"((mips_reg)stride),
2460           [A]"f"(A),                    [B]"f"(B),
2461           [C]"f"(C),                    [D]"f"(D),
2462           [ff_pw_28]"f"(ff_pw_28)
2463         : "memory"
2464     );
2465 }