git.sesse.net Git - ffmpeg/blob - libavcodec/mips/vc1dsp_mmi.c

   1 /*
   2  * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
   3  *
   4  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "libavutil/avassert.h"
  24 #include "libavcodec/vc1dsp.h"
  25 #include "constants.h"
  26 #include "vc1dsp_mips.h"
  27 #include "hpeldsp_mips.h"
  28 #include "libavutil/mips/mmiutils.h"
  29
  /*
   * VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
   *
   * Inline-asm text fragment that produces two packed output rows (o1, o2)
   * of an 8-point inverse transform for four columns at once.
   *
   * Operands that must exist in the enclosing asm statement:
   *   inputs   ftmp5..ftmp12  interleaved source halfwords
   *            ftmp0          shift count used by psraw
   *            ftmp23         pshufh selector (callers in this file load 0x44)
   *   scratch  tmp0, ftmp1..ftmp4, ftmp13, ftmp14 (all overwritten)
   *
   * Parameters:
   *   o1, o2  destination FP registers
   *   r1, r2  32-bit immediates applied with pmaddhw to ftmp5/ftmp6 and
   *           ftmp7/ftmp8 respectively
   *   r3, r4  32-bit immediates applied with pmaddhw to ftmp9/ftmp10 and
   *           ftmp11/ftmp12 respectively
   *   c0      register holding the rounding constant added before the shift
   *
   * Steps:
   *   1. Each immediate is loaded through tmp0, moved to an FP register and
   *      run through pshufh with ftmp23 (presumably replicating the low two
   *      halfwords across the register -- confirm against the MMI manual).
   *   2. ftmp1/ftmp2 receive the r1/r2 products, ftmp3/ftmp4 the r3/r4 ones.
   *   3. The sums (ftmp1+ftmp3, ftmp2+ftmp4) and the differences
   *      (ftmp1-ftmp3, ftmp2-ftmp4) each get c0 added and are shifted right
   *      arithmetically by ftmp0.
   *   4. The punpck*hw sequence interleaves halfwords so that o1 is built
   *      from the two sum registers and o2 from the two difference registers.
   */
  30 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)                  \
  31         "li         %[tmp0],    "#r1"                                 \n\t" \
  32         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  33         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
  34         "li         %[tmp0],    "#r2"                                 \n\t" \
  35         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  36         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
  37         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
  38         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
  39         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
  40         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
  41         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
  42         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
  43                                                                             \
  44         "li         %[tmp0],    "#r3"                                 \n\t" \
  45         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  46         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
  47         "li         %[tmp0],    "#r4"                                 \n\t" \
  48         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  49         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
  50         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
  51         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
  52         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
  53         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
  54         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
  55         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
  56                                                                             \
  57         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
  58         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
  59         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
  60         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
  61         "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
  62         "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
  63         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
  64         "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
  65         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
  66         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
  67         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
  68         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
  69         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
  70         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
  71         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
  72         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
  73         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
  74         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
  75
  /*
   * VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
   *
   * Inline-asm text fragment with the same multiply/accumulate, shift and
   * packing sequence as VC1_INV_TRANCS_8_TYPE1, plus one extra bias: the
   * register passed as c1 is added to the two difference terms (ftmp14 and
   * ftmp3, which end up in o2) before the common rounding constant c0 is
   * added to all four terms.
   *
   * Operands that must exist in the enclosing asm statement:
   *   inputs   ftmp5..ftmp12  interleaved source halfwords
   *            ftmp0          shift count used by psraw
   *            ftmp23         pshufh selector (callers in this file load 0x44)
   *   scratch  tmp0, ftmp1..ftmp4, ftmp13, ftmp14 (all overwritten)
   *
   * Parameters:
   *   o1, o2  destination FP registers (o1 from the sums, o2 from the
   *           differences)
   *   r1..r4  32-bit immediates used as pmaddhw multiplicands
   *   c0      register holding the rounding constant for every term
   *   c1      register holding the extra bias for the difference terms only
   */
  76 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)              \
  77         "li         %[tmp0],    "#r1"                                 \n\t" \
  78         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  79         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
  80         "li         %[tmp0],    "#r2"                                 \n\t" \
  81         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  82         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
  83         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]                 \n\t" \
  84         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]                 \n\t" \
  85         "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]                  \n\t" \
  86         "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]                 \n\t" \
  87         "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]                 \n\t" \
  88         "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]                  \n\t" \
  89                                                                             \
  90         "li         %[tmp0],    "#r3"                                 \n\t" \
  91         "mtc1       %[tmp0],    %[ftmp13]                             \n\t" \
  92         "pshufh     %[ftmp13],  %[ftmp13],  %[ftmp23]                 \n\t" \
  93         "li         %[tmp0],    "#r4"                                 \n\t" \
  94         "mtc1       %[tmp0],    %[ftmp14]                             \n\t" \
  95         "pshufh     %[ftmp14],  %[ftmp14],  %[ftmp23]                 \n\t" \
  96         "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]                 \n\t" \
  97         "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]                 \n\t" \
  98         "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                  \n\t" \
  99         "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]                 \n\t" \
 100         "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]                 \n\t" \
 101         "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]                 \n\t" \
 102                                                                             \
 103         "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]                  \n\t" \
 104         "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]                  \n\t" \
 105         "paddw      %[ftmp14],  %[ftmp14],  "#c1"                     \n\t" \
 106         "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]                  \n\t" \
 107         "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]                  \n\t" \
 108         "paddw      %[ftmp3],   %[ftmp3],   "#c1"                     \n\t" \
 109         "paddw      %[ftmp13],  %[ftmp13],  "#c0"                     \n\t" \
 110         "paddw      %[ftmp14],  %[ftmp14],  "#c0"                     \n\t" \
 111         "paddw      %[ftmp1],   %[ftmp1],   "#c0"                     \n\t" \
 112         "paddw      %[ftmp3],   %[ftmp3],   "#c0"                     \n\t" \
 113         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                  \n\t" \
 114         "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]                  \n\t" \
 115         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                  \n\t" \
 116         "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                  \n\t" \
 117         "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]                  \n\t" \
 118         "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]                  \n\t" \
 119         "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]                  \n\t" \
 120         "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]                  \n\t" \
 121         "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]                  \n\t" \
 122         "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]                  \n\t"
 123
 124 /* Do inverse transform on 8x8 block */
     /*
      * DC-only variant: only block[0] is read. The scaled DC value is added
      * to an 8-byte-wide, 8-row area of dest, with the result saturated back
      * to unsigned bytes.
      *
      * dest      top-left byte of the destination area, updated in place
      * linesize  byte distance between consecutive destination rows
      * block     coefficient block; only element 0 is used
      *
      * NOTE(review): %[dc] is listed as an input-only operand but is
      * overwritten by pshufh, and it binds an int to an "f" constraint.
      * GCC's extended-asm rules say input operands must not be modified --
      * confirm whether this should be a read-write operand.
      */
 125 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 126 {
 127     int dc = block[0];
 128     double ftmp[9];
 129     mips_reg addr[1];
 130     int count;
 131
         /* Two-stage scaling of the DC coefficient, each with its own rounding. */
 132     dc = (3 * dc +  1) >> 1;
 133     dc = (3 * dc + 16) >> 5;
 134
 135     __asm__ volatile(
             /* ftmp0 = 0: used as the pshufh selector and as the zero source
              * for the byte -> halfword unpacks below. */
 136         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 137         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
             /* Two loop iterations, four rows each. */
 138         "li         %[count],   0x02                                    \n\t"
 139
 140         "1:                                                             \n\t"
             /* Load four consecutive 8-byte rows; addr0 walks down by linesize. */
 141         MMI_LDC1(%[ftmp1], %[dest], 0x00)
 142         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 143         MMI_LDC1(%[ftmp2], %[addr0], 0x00)
 144         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 145         MMI_LDC1(%[ftmp3], %[addr0], 0x00)
 146         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 147         MMI_LDC1(%[ftmp4], %[addr0], 0x00)
 148
             /* Unpack each row against zero: high half -> ftmp5..8, low half
              * stays in ftmp1..4. */
 149         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 150         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 151         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 152         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 153         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 154         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 155         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 156         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 157
             /* Add the DC term to every halfword (paddsh). */
 158         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 159         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 160         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 161         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 162         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 163         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 164         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 165         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 166
             /* Repack low/high halves into one 8-byte row (packushb). */
 167         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 168         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 169         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 170         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 171
             /* Store the four rows back to the addresses they were read from. */
 172         MMI_SDC1(%[ftmp1], %[dest], 0x00)
 173         PTR_ADDU   "%[addr0],   %[dest],        %[linesize]             \n\t"
 174         MMI_SDC1(%[ftmp2], %[addr0], 0x00)
 175         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 176         MMI_SDC1(%[ftmp3], %[addr0], 0x00)
 177         PTR_ADDU   "%[addr0],   %[addr0],       %[linesize]             \n\t"
 178         MMI_SDC1(%[ftmp4], %[addr0], 0x00)
 179
             /* addr0 points at the 4th row here, so dest advances by four
              * rows in total before the next iteration. */
 180         "addiu      %[count],   %[count],       -0x01                   \n\t"
 181         PTR_ADDU   "%[dest],    %[addr0],       %[linesize]             \n\t"
 182         "bnez       %[count],   1b                                      \n\t"
 183         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 184           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 185           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 186           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 187           [ftmp8]"=&f"(ftmp[8]),
 188           [addr0]"=&r"(addr[0]),
 189           [count]"=&r"(count),          [dest]"+&r"(dest)
 190         : [linesize]"r"((mips_reg)linesize),
 191           [dc]"f"(dc)
 192         : "memory"
 193     );
 194 }
 195
 196 #if _MIPS_SIM != _ABIO32
 /*
  * In-place 8x8 inverse transform of a 64-coefficient block, done as two
  * inline-asm passes:
  *
  *   pass 1: reads block, writes the local temp buffer. Uses shift count 3
  *           and the rounding constant ff_pw_4_local; results go through
  *           TRANSPOSE_4H before being stored.
  *   pass 2: reads temp, writes block. Uses shift count 7, the rounding
  *           constant ff_pw_64_local and the extra bias ff_pw_1_local on the
  *           second output of each macro call; results are stored directly.
  *
  * Each pass is split into two "parts" that differ only in byte offset:
  * part 1 loads 8 bytes at offsets 0x00..0x70 (step 0x10), part 2 at
  * 0x08..0x78, i.e. the first and second group of four int16_t in each
  * 16-byte row.
  *
  * The *_local constants each hold the same value in both 32-bit halves of a
  * 64-bit word (they are consumed by paddw inside the macros).
  *
  * block  64 coefficients, overwritten with the transformed values
  */
 197 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
 198 {
 199     DECLARE_ALIGNED(16, int16_t, temp[64]);
 200     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
 201     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 202     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 203     int16_t *src = block;
 204     int16_t *dst = temp;
 205     double ftmp[24];
 206     uint64_t tmp[1];
 207
 208     // 1st loop
 209     __asm__ volatile (
             /* ftmp0 = shift count for psraw, ftmp23 = pshufh selector used
              * by the VC1_INV_TRANCS_8_TYPE1 macro. */
 210         "li         %[tmp0],    0x03                                    \n\t"
 211         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 212         "li         %[tmp0],    0x44                                    \n\t"
 213         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
 214
 215        // 1st part
             /* Rows at 0x00/0x20/0x40/0x60 are interleaved into ftmp5..8. */
 216         MMI_LDC1(%[ftmp1], %[src], 0x00)
 217         MMI_LDC1(%[ftmp2], %[src], 0x20)
 218         MMI_LDC1(%[ftmp3], %[src], 0x40)
 219         MMI_LDC1(%[ftmp4], %[src], 0x60)
 220         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 221         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 222         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 223         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 224
             /* Rows at 0x10/0x30/0x50/0x70 are interleaved into ftmp9..12. */
 225         MMI_LDC1(%[ftmp1], %[src], 0x10)
 226         MMI_LDC1(%[ftmp2], %[src], 0x30)
 227         MMI_LDC1(%[ftmp3], %[src], 0x50)
 228         MMI_LDC1(%[ftmp4], %[src], 0x70)
 229         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 230         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 231         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
 232         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
 233
 234         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 235         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 236                                0x000f0010, 0x00040009, %[ff_pw_4])
 237
 238         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 239         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 240                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
 241
 242         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 243         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 244                                0xfff00009, 0x000f0004, %[ff_pw_4])
 245
 246         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 247         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 248                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 249
             /* Transpose each group of four result registers before storing
              * (ftmp1..4 serve as scratch for TRANSPOSE_4H). */
 250         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
 251                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 252
 253         MMI_SDC1(%[ftmp15], %[dst], 0x00)
 254         MMI_SDC1(%[ftmp16], %[dst], 0x10)
 255         MMI_SDC1(%[ftmp17], %[dst], 0x20)
 256         MMI_SDC1(%[ftmp18], %[dst], 0x30)
 257
 258         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
 259                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 260
 261         MMI_SDC1(%[ftmp19], %[dst], 0x08)
 262         MMI_SDC1(%[ftmp20], %[dst], 0x18)
 263         MMI_SDC1(%[ftmp21], %[dst], 0x28)
 264         MMI_SDC1(%[ftmp22], %[dst], 0x38)
 265
 266        // 2nd part
             /* Same sequence as the 1st part with every load offset moved by
              * 8 bytes and every store offset moved by 0x40. */
 267         MMI_LDC1(%[ftmp1], %[src], 0x08)
 268         MMI_LDC1(%[ftmp2], %[src], 0x28)
 269         MMI_LDC1(%[ftmp3], %[src], 0x48)
 270         MMI_LDC1(%[ftmp4], %[src], 0x68)
 271         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 272         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 273         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 274         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 275
 276         MMI_LDC1(%[ftmp1], %[src], 0x18)
 277         MMI_LDC1(%[ftmp2], %[src], 0x38)
 278         MMI_LDC1(%[ftmp3], %[src], 0x58)
 279         MMI_LDC1(%[ftmp4], %[src], 0x78)
 280         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 281         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 282         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
 283         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
 284
 285         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 286         VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 287                                0x000f0010, 0x00040009, %[ff_pw_4])
 288
 289         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 290         VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 291                                0xfffc000f, 0xfff7fff0, %[ff_pw_4])
 292
 293         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 294         VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 295                                0xfff00009, 0x000f0004, %[ff_pw_4])
 296
 297         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 298         VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 299                                0xfff70004, 0xfff0000f, %[ff_pw_4])
 300
 301         TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
 302                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 303
 304         MMI_SDC1(%[ftmp15], %[dst], 0x40)
 305         MMI_SDC1(%[ftmp16], %[dst], 0x50)
 306         MMI_SDC1(%[ftmp17], %[dst], 0x60)
 307         MMI_SDC1(%[ftmp18], %[dst], 0x70)
 308
 309         TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
 310                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 311
 312         MMI_SDC1(%[ftmp19], %[dst], 0x48)
 313         MMI_SDC1(%[ftmp20], %[dst], 0x58)
 314         MMI_SDC1(%[ftmp21], %[dst], 0x68)
 315         MMI_SDC1(%[ftmp22], %[dst], 0x78)
 316
 317         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 318           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 319           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 320           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 321           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 322           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 323           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 324           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 325           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
 326           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
 327           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
 328           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
 329           [tmp0]"=&r"(tmp[0])
 330         : [ff_pw_4]"f"(ff_pw_4_local), [src]"r"(src), [dst]"r"(dst)
 331         : "memory"
 332     );
 333
         /* Second pass consumes the intermediate buffer and writes the final
          * values back into the caller's block. */
 334     src = temp;
 335     dst = block;
 336
 337     // 2nd loop
 338     __asm__ volatile (
             /* Shift count is 7 for this pass; the pshufh selector is reloaded
              * because each asm statement gets its own register assignment. */
 339         "li         %[tmp0],    0x07                                    \n\t"
 340         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 341         "li         %[tmp0],    0x44                                    \n\t"
 342         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
 343
 344         // 1st part
 345         MMI_LDC1(%[ftmp1], %[src], 0x00)
 346         MMI_LDC1(%[ftmp2], %[src], 0x20)
 347         MMI_LDC1(%[ftmp3], %[src], 0x40)
 348         MMI_LDC1(%[ftmp4], %[src], 0x60)
 349         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 350         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 351         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 352         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 353
 354         MMI_LDC1(%[ftmp1], %[src], 0x10)
 355         MMI_LDC1(%[ftmp2], %[src], 0x30)
 356         MMI_LDC1(%[ftmp3], %[src], 0x50)
 357         MMI_LDC1(%[ftmp4], %[src], 0x70)
 358         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 359         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 360         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
 361         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
 362
 363         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 364         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 365                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
 366
 367         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 368         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 369                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
 370
 371         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 372         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 373                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
 374
 375         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 376         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 377                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
 378
             /* No transpose in this pass: ftmp15..22 go to rows 0..7 (16-byte
              * stride), first 8 bytes of each row. */
 379         MMI_SDC1(%[ftmp15], %[dst], 0x00)
 380         MMI_SDC1(%[ftmp16], %[dst], 0x10)
 381         MMI_SDC1(%[ftmp17], %[dst], 0x20)
 382         MMI_SDC1(%[ftmp18], %[dst], 0x30)
 383         MMI_SDC1(%[ftmp19], %[dst], 0x40)
 384         MMI_SDC1(%[ftmp20], %[dst], 0x50)
 385         MMI_SDC1(%[ftmp21], %[dst], 0x60)
 386         MMI_SDC1(%[ftmp22], %[dst], 0x70)
 387
 388        // 2nd part
 389         MMI_LDC1(%[ftmp1], %[src], 0x08)
 390         MMI_LDC1(%[ftmp2], %[src], 0x28)
 391         MMI_LDC1(%[ftmp3], %[src], 0x48)
 392         MMI_LDC1(%[ftmp4], %[src], 0x68)
 393         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 394         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 395         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 396         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 397
 398         MMI_LDC1(%[ftmp1], %[src], 0x18)
 399         MMI_LDC1(%[ftmp2], %[src], 0x38)
 400         MMI_LDC1(%[ftmp3], %[src], 0x58)
 401         MMI_LDC1(%[ftmp4], %[src], 0x78)
 402         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 403         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 404         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
 405         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
 406
 407         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
 408         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
 409                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
 410
 411         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
 412         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
 413                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
 414
 415         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
 416         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
 417                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
 418
 419         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
 420         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
 421                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
 422
             /* Second 8 bytes of each of the eight rows. */
 423         MMI_SDC1(%[ftmp15], %[dst], 0x08)
 424         MMI_SDC1(%[ftmp16], %[dst], 0x18)
 425         MMI_SDC1(%[ftmp17], %[dst], 0x28)
 426         MMI_SDC1(%[ftmp18], %[dst], 0x38)
 427         MMI_SDC1(%[ftmp19], %[dst], 0x48)
 428         MMI_SDC1(%[ftmp20], %[dst], 0x58)
 429         MMI_SDC1(%[ftmp21], %[dst], 0x68)
 430         MMI_SDC1(%[ftmp22], %[dst], 0x78)
 431
 432         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 433           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 434           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 435           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 436           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 437           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 438           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 439           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 440           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
 441           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
 442           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
 443           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
 444           [tmp0]"=&r"(tmp[0])
 445         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
 446           [src]"r"(src), [dst]"r"(dst)
 447         : "memory"
 448     );
 449 }
 450 #endif
 451
 452 /* Do inverse transform on 8x4 part of block */
     /*
      * DC-only variant: only block[0] is read. The scaled DC value is added
      * to an 8-byte-wide, 4-row area of dest, with the result saturated back
      * to unsigned bytes. Unlike the 8x8 DC routine there is no loop; the
      * four row addresses are computed in C and passed in as separate
      * operands.
      *
      * dest      top-left byte of the destination area, updated in place
      * linesize  byte distance between consecutive destination rows
      * block     coefficient block; only element 0 is used
      *
      * NOTE(review): %[dc] is listed as an input-only operand but is
      * overwritten by pshufh, and it binds an int to an "f" constraint.
      * GCC's extended-asm rules say input operands must not be modified --
      * confirm whether this should be a read-write operand.
      */
 453 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 454 {
 455     int dc = block[0];
 456     double ftmp[9];
 457
         /* Two-stage scaling of the DC coefficient, each with its own rounding. */
 458     dc = ( 3 * dc +  1) >> 1;
 459     dc = (17 * dc + 64) >> 7;
 460
 461     __asm__ volatile(
             /* ftmp0 = 0: pshufh selector and zero source for the unpacks. */
 462         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 463         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 464
             /* One 8-byte load per destination row. */
 465         MMI_LDC1(%[ftmp1], %[dest0], 0x00)
 466         MMI_LDC1(%[ftmp2], %[dest1], 0x00)
 467         MMI_LDC1(%[ftmp3], %[dest2], 0x00)
 468         MMI_LDC1(%[ftmp4], %[dest3], 0x00)
 469
             /* Unpack each row against zero: high half -> ftmp5..8, low half
              * stays in ftmp1..4. */
 470         "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
 471         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 472         "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
 473         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 474         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
 475         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 476         "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
 477         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 478
             /* Add the DC term to every halfword (paddsh). */
 479         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 480         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 481         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 482         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 483         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 484         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 485         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 486         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 487
             /* Repack low/high halves into one 8-byte row (packushb). */
 488         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
 489         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
 490         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 491         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
 492
             /* Write the rows back to where they were loaded from. */
 493         MMI_SDC1(%[ftmp1], %[dest0], 0x00)
 494         MMI_SDC1(%[ftmp2], %[dest1], 0x00)
 495         MMI_SDC1(%[ftmp3], %[dest2], 0x00)
 496         MMI_SDC1(%[ftmp4], %[dest3], 0x00)
 497         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 498           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 499           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 500           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 501           [ftmp8]"=&f"(ftmp[8])
 502         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 503           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 504           [dc]"f"(dc)
 505         : "memory"
 506     );
 507 }
 508
 509 #if _MIPS_SIM != _ABIO32
 510 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 511 {
 512     int16_t *src = block;
 513     int16_t *dst = block;
 514     double ftmp[16];
 515     uint32_t tmp[1];
 516     int16_t count = 4;
 517     DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 518     DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 519     int16_t coeff[64] = {12, 16,  16,  15,  12,   9,   6,   4,
 520                          12, 15,   6,  -4, -12, -16, -16,  -9,
 521                          12,  9,  -6, -16, -12,   4,  16,  15,
 522                          12,  4, -16,  -9,  12,  15,  -6, -16,
 523                          12, -4, -16,   9,  12, -15,  -6,  16,
 524                          12, -9,  -6,  16, -12,  -4,  16, -15,
 525                          12, -15,  6,   4, -12,  16, -16,   9,
 526                          12, -16, 16, -15,  12,  -9,   6,  -4};
 527
 528     // 1st loop
 529     __asm__ volatile (
 530         "li         %[tmp0],    0x03                                    \n\t"
 531         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 532
 533         "1:                                                             \n\t"
 534         MMI_LDC1(%[ftmp1], %[src], 0x00)
 535         MMI_LDC1(%[ftmp2], %[src], 0x08)
 536
 537         /* ftmp11: dst1,dst0 */
 538         MMI_LDC1(%[ftmp3], %[coeff], 0x00)
 539         MMI_LDC1(%[ftmp4], %[coeff], 0x08)
 540         MMI_LDC1(%[ftmp5], %[coeff], 0x10)
 541         MMI_LDC1(%[ftmp6], %[coeff], 0x18)
 542         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 543         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 544         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 545         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 546         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 547         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 548         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 549         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 550         "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]                    \n\t"
 551         "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]                  \n\t"
 552
 553         /* ftmp12: dst3,dst2 */
 554         MMI_LDC1(%[ftmp3], %[coeff], 0x20)
 555         MMI_LDC1(%[ftmp4], %[coeff], 0x28)
 556         MMI_LDC1(%[ftmp5], %[coeff], 0x30)
 557         MMI_LDC1(%[ftmp6], %[coeff], 0x38)
 558         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 559         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 560         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 561         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 562         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 563         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 564         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 565         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 566         "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]                    \n\t"
 567         "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]                  \n\t"
 568
 569         /* ftmp13: dst5,dst4 */
 570         MMI_LDC1(%[ftmp3], %[coeff], 0x40)
 571         MMI_LDC1(%[ftmp4], %[coeff], 0x48)
 572         MMI_LDC1(%[ftmp5], %[coeff], 0x50)
 573         MMI_LDC1(%[ftmp6], %[coeff], 0x58)
 574         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 575         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 576         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 577         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 578         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 579         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 580         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 581         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 582         "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]                    \n\t"
 583         "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]                  \n\t"
 584
 585         /* ftmp14: dst7,dst6 */
 586         MMI_LDC1(%[ftmp3], %[coeff], 0x60)
 587         MMI_LDC1(%[ftmp4], %[coeff], 0x68)
 588         MMI_LDC1(%[ftmp5], %[coeff], 0x70)
 589         MMI_LDC1(%[ftmp6], %[coeff], 0x78)
 590         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]                    \n\t"
 591         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]                    \n\t"
 592         "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 593         "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]                    \n\t"
 594         "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]                    \n\t"
 595         "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 596         "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]                   \n\t"
 597         "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]                   \n\t"
 598         "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]                    \n\t"
 599         "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]                  \n\t"
 600
 601         /* ftmp9: dst3,dst2,dst1,dst0    ftmp10: dst7,dst6,dst5,dst4 */
 602         "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]                    \n\t"
 603         "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]                    \n\t"
 604         "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]                    \n\t"
 605         "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]                    \n\t"
 606         "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]                   \n\t"
 607         "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]                   \n\t"
 608         "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]                    \n\t"
 609         "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]                   \n\t"
 610         "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]                   \n\t"
 611         "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]                    \n\t"
 612         MMI_SDC1(%[ftmp9], %[dst], 0x00)
 613         MMI_SDC1(%[ftmp10], %[dst], 0x08)
 614
 615         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
 616         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
 617         "addiu      %[count],   %[count],   -0x01                       \n\t"
 618         "bnez       %[count],   1b                                      \n\t"
 619         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 620           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 621           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 622           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 623           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 624           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 625           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 626           [ftmp14]"=&f"(ftmp[14]),      [tmp0]"=&r"(tmp[0]),
 627           [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
 628         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
 629         : "memory"
 630     );
 631
 632     src = block;
 633
 634     // 2nd loop
 635     __asm__ volatile (
 636         "li         %[tmp0],    0x44                                    \n\t"
 637         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
 638
 639         // 1st part
 640         "li         %[tmp0],    0x07                                    \n\t"
 641         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 642         MMI_LDC1(%[ftmp1], %[src], 0x00)
 643         MMI_LDC1(%[ftmp2], %[src], 0x10)
 644         MMI_LDC1(%[ftmp3], %[src], 0x20)
 645         MMI_LDC1(%[ftmp4], %[src], 0x30)
 646         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 647         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 648         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 649         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 650
 651         /* ftmp11: dst03,dst02,dst01,dst00 */
 652         "li         %[tmp0],    0x00160011                              \n\t"
 653         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 654         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 655         "li         %[tmp0],    0x000a0011                              \n\t"
 656         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 657         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 658         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 659         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 660         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 661         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 662         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 663         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 664         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 665         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 666         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 667         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 668         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 669         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 670         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
 671
 672         /* ftmp12: dst13,dst12,dst11,dst10 */
 673         "li         %[tmp0],    0x000a0011                              \n\t"
 674         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 675         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 676         "li         %[tmp0],    0xffeaffef                              \n\t"
 677         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 678         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 679         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 680         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 681         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 682         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 683         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 684         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 685         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 686         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 687         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 688         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 689         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 690         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 691         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
 692
 693         /* ftmp13: dst23,dst22,dst21,dst20 */
 694         "li         %[tmp0],    0xfff60011                              \n\t"
 695         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 696         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 697         "li         %[tmp0],    0x0016ffef                              \n\t"
 698         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 699         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 700         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 701         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 702         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 703         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 704         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 705         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 706         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 707         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 708         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 709         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 710         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 711         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 712         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
 713
 714         /* ftmp14: dst33,dst32,dst31,dst30 */
 715         "li         %[tmp0],    0xffea0011                              \n\t"
 716         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 717         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 718         "li         %[tmp0],    0xfff60011                              \n\t"
 719         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 720         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 721         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 722         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 723         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 724         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 725         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 726         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 727         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 728         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 729         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 730         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 731         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 732         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 733         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
 734
 735         MMI_LWC1(%[ftmp1], %[dest], 0x00)
 736         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
 737         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
 738         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 739         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
 740         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 741         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
 742         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 743         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 744         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 745         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 746         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 747         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
 748         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
 749         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
 750         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
 751         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 752         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 753         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 754         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 755         MMI_SWC1(%[ftmp1], %[dest], 0x00)
 756         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
 757         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
 758         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 759         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
 760         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 761         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
 762
 763         // 2nd part
 764         "li         %[tmp0],    0x07                                    \n\t"
 765         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 766         MMI_LDC1(%[ftmp1], %[src], 0x08)
 767         MMI_LDC1(%[ftmp2], %[src], 0x18)
 768         MMI_LDC1(%[ftmp3], %[src], 0x28)
 769         MMI_LDC1(%[ftmp4], %[src], 0x38)
 770         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
 771         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
 772         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
 773         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
 774
 775         /* ftmp11: dst03,dst02,dst01,dst00 */
 776         "li         %[tmp0],    0x00160011                              \n\t"
 777         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 778         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 779         "li         %[tmp0],    0x000a0011                              \n\t"
 780         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 781         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 782         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 783         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 784         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 785         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 786         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 787         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 788         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 789         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 790         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 791         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 792         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 793         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 794         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
 795
 796         /* ftmp12: dst13,dst12,dst11,dst10 */
 797         "li         %[tmp0],    0x000a0011                              \n\t"
 798         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 799         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 800         "li         %[tmp0],    0xffeaffef                              \n\t"
 801         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 802         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 803         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 804         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 805         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 806         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 807         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 808         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 809         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 810         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 811         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 812         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 813         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 814         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 815         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
 816
 817         /* ftmp13: dst23,dst22,dst21,dst20 */
 818         "li         %[tmp0],    0xfff60011                              \n\t"
 819         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 820         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 821         "li         %[tmp0],    0x0016ffef                              \n\t"
 822         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 823         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 824         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 825         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 826         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 827         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 828         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 829         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 830         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 831         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 832         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 833         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 834         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 835         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 836         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
 837
 838         /* ftmp14: dst33,dst32,dst31,dst30 */
 839         "li         %[tmp0],    0xffea0011                              \n\t"
 840         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
 841         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
 842         "li         %[tmp0],    0xfff60011                              \n\t"
 843         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
 844         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
 845         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
 846         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
 847         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
 848         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
 849         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
 850         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
 851         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
 852         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
 853         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
 854         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
 855         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
 856         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
 857         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
 858
 859         MMI_LWC1(%[ftmp1], %[dest], 0x04)
 860         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
 861         MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
 862         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 863         MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
 864         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
 865         MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
 866         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
 867         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 868         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 869         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 870         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 871         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
 872         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
 873         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
 874         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
 875         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
 876         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
 877         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
 878         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
 879         MMI_SWC1(%[ftmp1], %[dest], 0x04)
 880         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
 881         MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
 882         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 883         MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
 884         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
 885         MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
 886
 887         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 888           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 889           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 890           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 891           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
 892           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
 893           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
 894           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
 895           [tmp0]"=&r"(tmp[0])
 896         : [ff_pw_64]"f"(ff_pw_64_local),
 897           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
 898         :"memory"
 899     );
 900 }
 901 #endif
 902
 903 /* Do inverse transform on 4x8 parts of block */
 904 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 905 {
 906     int dc = block[0];
 907     double ftmp[9];
 908     DECLARE_VAR_LOW32;
 909
 910     dc = (17 * dc +  4) >> 3;
 911     dc = (12 * dc + 64) >> 7;
 912
 913     __asm__ volatile(
 914         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
 915         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
 916
 917         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
 918         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
 919         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
 920         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
 921         MMI_LWC1(%[ftmp5], %[dest4], 0x00)
 922         MMI_LWC1(%[ftmp6], %[dest5], 0x00)
 923         MMI_LWC1(%[ftmp7], %[dest6], 0x00)
 924         MMI_LWC1(%[ftmp8], %[dest7], 0x00)
 925
 926         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 927         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 928         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 929         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 930         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 931         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 932         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 933         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 934
 935         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
 936         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
 937         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
 938         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
 939         "paddsh     %[ftmp5],   %[ftmp5],       %[dc]                   \n\t"
 940         "paddsh     %[ftmp6],   %[ftmp6],       %[dc]                   \n\t"
 941         "paddsh     %[ftmp7],   %[ftmp7],       %[dc]                   \n\t"
 942         "paddsh     %[ftmp8],   %[ftmp8],       %[dc]                   \n\t"
 943
 944         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
 945         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
 946         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
 947         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
 948         "packushb   %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
 949         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
 950         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
 951         "packushb   %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
 952
 953         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
 954         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
 955         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
 956         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
 957         MMI_SWC1(%[ftmp5], %[dest4], 0x00)
 958         MMI_SWC1(%[ftmp6], %[dest5], 0x00)
 959         MMI_SWC1(%[ftmp7], %[dest6], 0x00)
 960         MMI_SWC1(%[ftmp8], %[dest7], 0x00)
 961         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
 962           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
 963           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
 964           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
 965           RESTRICT_ASM_LOW32
 966           [ftmp8]"=&f"(ftmp[8])
 967         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
 968           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
 969           [dest4]"r"(dest+4*linesize),  [dest5]"r"(dest+5*linesize),
 970           [dest6]"r"(dest+6*linesize),  [dest7]"r"(dest+7*linesize),
 971           [dc]"f"(dc)
 972         : "memory"
 973     );
 974 }
 975
 976 #if _MIPS_SIM != _ABIO32
 977 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 978 {
 979     int16_t *src = block;
 980     int16_t *dst = block;
 981     double ftmp[24];
 982     uint32_t count = 8, tmp[1];
 983     int16_t coeff[16] = {17, 22, 17, 10,
 984                          17, 10,-17,-22,
 985                          17,-10,-17, 22,
 986                          17,-22, 17,-10};
 987     DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
 988     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
 989     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
 990
 991     // 1st loop
 992     __asm__ volatile (
 993
 994         "li         %[tmp0],    0x03                                    \n\t"
 995         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
 996
 997         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
 998         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
 999         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1000         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1001         "1:                                                             \n\t"
1002         /* ftmp8: dst3,dst2,dst1,dst0 */
1003         MMI_LDC1(%[ftmp1], %[src], 0x00)
1004         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1005         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1006         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1007         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1008         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1009         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1010         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1011         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1012         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1013         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1014         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1015         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1016         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1017         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1018         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1019         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1020         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1021         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1022
1023         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1024         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1025         "addiu      %[count],   %[count],   -0x01                       \n\t"
1026         "bnez       %[count],   1b                                      \n\t"
1027         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1028           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1029           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1030           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1031           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1032           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1033           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1034           [src]"+&r"(src),              [dst]"+&r"(dst)
1035         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1036         : "memory"
1037     );
1038
1039     src = block;
1040
1041     // 2nd loop
1042     __asm__ volatile (
1043         "li         %[tmp0],    0x07                                    \n\t"
1044         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1045         "li         %[tmp0],    0x44                                    \n\t"
1046         "mtc1       %[tmp0],    %[ftmp23]                               \n\t"
1047
1048         MMI_LDC1(%[ftmp1], %[src], 0x00)
1049         MMI_LDC1(%[ftmp2], %[src], 0x20)
1050         MMI_LDC1(%[ftmp3], %[src], 0x40)
1051         MMI_LDC1(%[ftmp4], %[src], 0x60)
1052         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1053         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1054         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1055         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1056
1057         MMI_LDC1(%[ftmp1], %[src], 0x10)
1058         MMI_LDC1(%[ftmp2], %[src], 0x30)
1059         MMI_LDC1(%[ftmp3], %[src], 0x50)
1060         MMI_LDC1(%[ftmp4], %[src], 0x70)
1061         "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1062         "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1063         "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]                    \n\t"
1064         "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]                    \n\t"
1065
1066         /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
1067         VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
1068                                0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])
1069
1070         /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
1071         VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
1072                                0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])
1073
1074         /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
1075         VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
1076                                0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])
1077
1078         /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
1079         VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
1080                                0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])
1081
1082         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1083         PTR_ADDU  "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1084         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1085         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1086         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1087         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1088         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1089         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1090         MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1091         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1092         MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1093         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1094         MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1095         PTR_ADDU  "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1096         MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1097         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1098         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1099         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1100         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1101         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1102         "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1103         "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1104         "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1105         "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1106
1107         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]                   \n\t"
1108         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]                   \n\t"
1109         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]                   \n\t"
1110         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]                   \n\t"
1111         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]                   \n\t"
1112         "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]                   \n\t"
1113         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]                   \n\t"
1114         "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]                   \n\t"
1115
1116         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1117         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1118         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1119         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1120         "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]                    \n\t"
1121         "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]                    \n\t"
1122         "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]                    \n\t"
1123         "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1124
1125         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1126         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1127         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1128         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1129         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1130         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1131         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1132         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1133         MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1134         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1135         MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1136         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1137         MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1138         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1139         MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1140
1141         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1142           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1143           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1144           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1145           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1146           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1147           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1148           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1149           [ftmp16]"=&f"(ftmp[16]),      [ftmp17]"=&f"(ftmp[17]),
1150           [ftmp18]"=&f"(ftmp[18]),      [ftmp19]"=&f"(ftmp[19]),
1151           [ftmp20]"=&f"(ftmp[20]),      [ftmp21]"=&f"(ftmp[21]),
1152           [ftmp22]"=&f"(ftmp[22]),      [ftmp23]"=&f"(ftmp[23]),
1153           [tmp0]"=&r"(tmp[0])
1154         : [ff_pw_1]"f"(ff_pw_1_local),  [ff_pw_64]"f"(ff_pw_64_local),
1155           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1156         : "memory"
1157     );
1158 }
1159 #endif
1160
1161 /* Do inverse transform on 4x4 part of block */
1162 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1163 {
1164     int dc = block[0];
1165     double ftmp[5];
1166     DECLARE_VAR_LOW32;
1167
1168     dc = (17 * dc +  4) >> 3;
1169     dc = (17 * dc + 64) >> 7;
1170
1171     __asm__ volatile(
1172         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
1173         "pshufh     %[dc],      %[dc],          %[ftmp0]                \n\t"
1174
1175         MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1176         MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1177         MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1178         MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1179
1180         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1181         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1182         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1183         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1184
1185         "paddsh     %[ftmp1],   %[ftmp1],       %[dc]                   \n\t"
1186         "paddsh     %[ftmp2],   %[ftmp2],       %[dc]                   \n\t"
1187         "paddsh     %[ftmp3],   %[ftmp3],       %[dc]                   \n\t"
1188         "paddsh     %[ftmp4],   %[ftmp4],       %[dc]                   \n\t"
1189
1190         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
1191         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
1192         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
1193         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1194
1195         MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1196         MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1197         MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1198         MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1199         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1200           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1201           RESTRICT_ASM_LOW32
1202           [ftmp4]"=&f"(ftmp[4])
1203         : [dest0]"r"(dest+0*linesize),  [dest1]"r"(dest+1*linesize),
1204           [dest2]"r"(dest+2*linesize),  [dest3]"r"(dest+3*linesize),
1205           [dc]"f"(dc)
1206         : "memory"
1207     );
1208 }
1209
1210 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
1211 {
1212     int16_t *src = block;
1213     int16_t *dst = block;
1214     double ftmp[16];
1215     uint32_t count = 4, tmp[1];
1216     int16_t coeff[16] = {17, 22, 17, 10,
1217                          17, 10,-17,-22,
1218                          17,-10,-17, 22,
1219                          17,-22, 17,-10};
1220     DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1221     DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1222     // 1st loop
1223     __asm__ volatile (
1224
1225         "li         %[tmp0],    0x03                                    \n\t"
1226         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1227         MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1228         MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1229         MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1230         MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1231         "1:                                                             \n\t"
1232         /* ftmp8: dst3,dst2,dst1,dst0 */
1233         MMI_LDC1(%[ftmp1], %[src], 0x00)
1234         "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]                    \n\t"
1235         "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]                    \n\t"
1236         "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]                    \n\t"
1237         "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]                    \n\t"
1238         "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]                    \n\t"
1239         "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]                    \n\t"
1240         "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1241         "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1242         "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]                   \n\t"
1243         "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]                    \n\t"
1244         "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]                  \n\t"
1245         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]                  \n\t"
1246         "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]                    \n\t"
1247         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1248         "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]                    \n\t"
1249         "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]                    \n\t"
1250         "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]                    \n\t"
1251         MMI_SDC1(%[ftmp8], %[dst], 0x00)
1252
1253         PTR_ADDIU  "%[src],     %[src],     0x10                        \n\t"
1254         PTR_ADDIU  "%[dst],     %[dst],     0x10                        \n\t"
1255         "addiu      %[count],   %[count],   -0x01                       \n\t"
1256         "bnez       %[count],   1b                                      \n\t"
1257         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1258           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1259           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1260           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1261           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1262           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1263           [tmp0]"=&r"(tmp[0]),          [count]"+&r"(count),
1264           [src]"+&r"(src),              [dst]"+&r"(dst)
1265         : [ff_pw_4]"f"(ff_pw_4_local),  [coeff]"r"(coeff)
1266         : "memory"
1267     );
1268
1269     src = block;
1270
1271     // 2nd loop
1272     __asm__ volatile (
1273         "li         %[tmp0],    0x07                                    \n\t"
1274         "mtc1       %[tmp0],    %[ftmp0]                                \n\t"
1275         "li         %[tmp0],    0x44                                    \n\t"
1276         "mtc1       %[tmp0],    %[ftmp15]                               \n\t"
1277
1278         MMI_LDC1(%[ftmp1], %[src], 0x00)
1279         MMI_LDC1(%[ftmp2], %[src], 0x10)
1280         MMI_LDC1(%[ftmp3], %[src], 0x20)
1281         MMI_LDC1(%[ftmp4], %[src], 0x30)
1282         "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]                    \n\t"
1283         "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]                    \n\t"
1284         "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]                    \n\t"
1285         "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]                    \n\t"
1286
1287         /* ftmp11: dst03,dst02,dst01,dst00 */
1288         "li         %[tmp0],    0x00160011                              \n\t"
1289         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1290         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1291         "li         %[tmp0],    0x000a0011                              \n\t"
1292         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1293         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1294         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1295         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1296         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1297         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1298         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1299         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1300         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1301         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1302         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1303         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1304         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1305         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1306         "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]                    \n\t"
1307
1308         /* ftmp12: dst13,dst12,dst11,dst10 */
1309         "li         %[tmp0],    0x000a0011                              \n\t"
1310         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1311         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1312         "li         %[tmp0],    0xffeaffef                              \n\t"
1313         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1314         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1315         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1316         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1317         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1318         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1319         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1320         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1321         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1322         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1323         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1324         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1325         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1326         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1327         "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]                    \n\t"
1328
1329         /* ftmp13: dst23,dst22,dst21,dst20 */
1330         "li         %[tmp0],    0xfff60011                              \n\t"
1331         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1332         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1333         "li         %[tmp0],    0x0016ffef                              \n\t"
1334         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1335         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1336         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1337         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1338         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1339         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1340         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1341         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1342         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1343         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1344         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1345         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1346         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1347         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1348         "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]                    \n\t"
1349
1350         /* ftmp14: dst33,dst32,dst31,dst30 */
1351         "li         %[tmp0],    0xffea0011                              \n\t"
1352         "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
1353         "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]                   \n\t"
1354         "li         %[tmp0],    0xfff60011                              \n\t"
1355         "mtc1       %[tmp0],    %[ftmp4]                                \n\t"
1356         "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]                   \n\t"
1357         "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]                    \n\t"
1358         "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]                    \n\t"
1359         "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]                    \n\t"
1360         "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]                    \n\t"
1361         "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]                    \n\t"
1362         "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]                    \n\t"
1363         "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]                 \n\t"
1364         "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]                 \n\t"
1365         "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]                    \n\t"
1366         "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]                    \n\t"
1367         "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]                   \n\t"
1368         "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]                   \n\t"
1369         "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]                    \n\t"
1370
1371         MMI_LWC1(%[ftmp1], %[dest], 0x00)
1372         PTR_ADDU    "%[tmp0],   %[dest],    %[linesize]                 \n\t"
1373         MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1374         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1375         MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1376         PTR_ADDU    "%[tmp0],   %[tmp0],    %[linesize]                 \n\t"
1377         MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1378         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
1379         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1380         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1381         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1382         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1383         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]                   \n\t"
1384         "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]                   \n\t"
1385         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]                   \n\t"
1386         "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]                   \n\t"
1387         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                    \n\t"
1388         "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]                    \n\t"
1389         "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]                    \n\t"
1390         "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]                    \n\t"
1391
1392         MMI_SWC1(%[ftmp1], %[dest], 0x00)
1393         PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]                 \n\t"
1394         MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1395         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1396         MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1397         PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]                 \n\t"
1398         MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1399
1400         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
1401           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
1402           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
1403           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
1404           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
1405           [ftmp10]"=&f"(ftmp[10]),      [ftmp11]"=&f"(ftmp[11]),
1406           [ftmp12]"=&f"(ftmp[12]),      [ftmp13]"=&f"(ftmp[13]),
1407           [ftmp14]"=&f"(ftmp[14]),      [ftmp15]"=&f"(ftmp[15]),
1408           [tmp0]"=&r"(tmp[0])
1409         : [ff_pw_64]"f"(ff_pw_64_local),
1410           [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
1411         :"memory"
1412     );
1413 }
1414
/**
 * Apply overlap transform to horizontal edge.
 *
 * For each of 8 rows, the four pixels src[-2..1] are smoothed in place.
 * The two inner pixels are clipped to 0..255; the two outer ones are
 * stored without clipping. The rounding term alternates between rows,
 * starting at 1.
 *
 * @param src    pixel immediately to the right of the edge, first row
 * @param stride distance between consecutive rows
 */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int row;
    int rnd = 1;

    for (row = 0; row < 8; row++, src += stride) {
        const int p0 = src[-2];
        const int p1 = src[-1];
        const int p2 = src[0];
        const int p3 = src[1];
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        src[-2] = p0 - outer;
        src[-1] = av_clip_uint8(p1 - inner);
        src[0]  = av_clip_uint8(p2 + inner);
        src[1]  = p3 + outer;

        rnd ^= 1; /* rnd is always 0 or 1, so this is the same toggle as !rnd */
    }
}
1438
/**
 * Overlap smoothing across a horizontal edge on 16-bit data.
 *
 * For each of 8 rows, left[6], left[7], right[0] and right[1] are
 * replaced by their smoothed values.
 *
 * @param left         block left of the edge
 * @param right        block right of the edge
 * @param left_stride  distance in elements between rows of left
 * @param right_stride distance in elements between rows of right
 * @param flags        bit 1 selects the initial rounding pair (3/4 when
 *                     set, 4/3 otherwise); bit 0 makes the pair swap
 *                     after every row
 */
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        /* "* 8" rather than "<< 3": the samples may be negative, and
         * left-shifting a negative value is undefined behaviour. */
        left[6]  = ((a * 8) - d1 + rnd1) >> 3;
        left[7]  = ((b * 8) - d2 + rnd2) >> 3;
        right[0] = ((c * 8) + d2 + rnd1) >> 3;
        right[1] = ((d * 8) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2   = 7 - rnd2;
            rnd1   = 7 - rnd1;
        }
    }
}
1467
/**
 * Apply overlap transform to vertical edge.
 *
 * For each of 8 columns, the four pixels at rows -2..1 relative to src
 * are smoothed in place. The two inner pixels are clipped to 0..255;
 * the two outer ones are stored without clipping. The rounding term
 * alternates between columns, starting at 1.
 *
 * @param src    pixel immediately below the edge, first column
 * @param stride distance between consecutive rows
 */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int col;

    for (col = 0; col < 8; col++) {
        uint8_t *const px = src + col;
        const int rnd = !(col & 1);          /* 1, 0, 1, 0, ... */
        const int p0  = px[-2 * stride];
        const int p1  = px[-stride];
        const int p2  = px[0];
        const int p3  = px[stride];
        const int outer = (p0 - p3 + 3 + rnd) >> 3;
        const int inner = (p0 - p3 + p1 - p2 + 4 - rnd) >> 3;

        px[-2 * stride] = p0 - outer;
        px[-stride]     = av_clip_uint8(p1 - inner);
        px[0]           = av_clip_uint8(p2 + inner);
        px[stride]      = p3 + outer;
    }
}
1491
/**
 * Overlap smoothing across a vertical edge on 16-bit data.
 *
 * For each of 8 columns, top[48], top[56], bottom[0] and bottom[8]
 * (offset by the column index) are replaced by their smoothed values.
 * The rounding pair starts at 4/3 and swaps after every column.
 *
 * @param top    block above the edge; elements 48..63 are updated
 * @param bottom block below the edge; elements 0..15 are updated
 */
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        /* "* 8" rather than "<< 3": the samples may be negative, and
         * left-shifting a negative value is undefined behaviour. */
        top[48]   = ((a * 8) - d1 + rnd1) >> 3;
        top[56]   = ((b * 8) - d2 + rnd2) >> 3;
        bottom[0] = ((c * 8) + d2 + rnd1) >> 3;
        bottom[8] = ((d * 8) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}
1517
1518 /**
1519  * VC-1 in-loop deblocking filter for one line
1520  * @param src source block type
1521  * @param stride block stride
1522  * @param pq block quantizer
1523  * @return whether other 3 pairs should be filtered or not
1524  * @see 8.6
1525  */
1526 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1527 {
1528     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1529               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1530     int a0_sign = a0 >> 31;        /* Store sign */
1531
1532     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1533     if (a0 < pq) {
1534         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1535                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1536         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1537                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1538         if (a1 < a0 || a2 < a0) {
1539             int clip      = src[-1 * stride] - src[0 * stride];
1540             int clip_sign = clip >> 31;
1541
1542             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1543             if (clip) {
1544                 int a3     = FFMIN(a1, a2);
1545                 int d      = 5 * (a3 - a0);
1546                 int d_sign = (d >> 31);
1547
1548                 d       = ((d ^ d_sign) - d_sign) >> 3;
1549                 d_sign ^= a0_sign;
1550
1551                 if (d_sign ^ clip_sign)
1552                     d = 0;
1553                 else {
1554                     d = FFMIN(d, clip);
1555                     d = (d ^ d_sign) - d_sign; /* Restore sign */
1556                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1557                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1558                 }
1559                 return 1;
1560             }
1561         }
1562     }
1563     return 0;
1564 }
1565
/**
 * VC-1 in-loop deblocking filter
 *
 * The edge is handled in groups of four positions. In each group the
 * third position is filtered first; the other three are filtered only
 * if vc1_filter_line() returned non-zero for it.
 *
 * @param src source block type
 * @param step distance between horizontally adjacent elements
 * @param stride distance between vertically adjacent elements
 * @param len edge length to filter (4 or 8 pixels)
 * @param pq block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
                                   int len, int pq)
{
    int pos;

    for (pos = 0; pos < len; pos += 4, src += 4 * step) {
        if (!vc1_filter_line(src + 2 * step, stride, pq))
            continue;

        vc1_filter_line(src,            stride, pq);
        vc1_filter_line(src + step,     stride, pq);
        vc1_filter_line(src + 3 * step, stride, pq);
    }
}
1591
/**
 * Loop-filter an edge of length 4: vc1_loop_filter() with step 1 and
 * the caller's stride as the distance between filter taps.
 */
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 4;

    vc1_loop_filter(src, step, stride, len, pq);
}
1596
/**
 * Loop-filter an edge of length 4: vc1_loop_filter() with the caller's
 * stride as step and 1 as the distance between filter taps.
 */
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    const int tap_pitch = 1;
    const int len       = 4;

    vc1_loop_filter(src, stride, tap_pitch, len, pq);
}
1601
/**
 * Loop-filter an edge of length 8: vc1_loop_filter() with step 1 and
 * the caller's stride as the distance between filter taps.
 */
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 8;

    vc1_loop_filter(src, step, stride, len, pq);
}
1606
/**
 * Loop-filter an edge of length 8: vc1_loop_filter() with the caller's
 * stride as step and 1 as the distance between filter taps.
 */
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    const int tap_pitch = 1;
    const int len       = 8;

    vc1_loop_filter(src, stride, tap_pitch, len, pq);
}
1611
/**
 * Loop-filter an edge of length 16: vc1_loop_filter() with step 1 and
 * the caller's stride as the distance between filter taps.
 */
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int step = 1;
    const int len  = 16;

    vc1_loop_filter(src, step, stride, len, pq);
}
1616
/**
 * Loop-filter an edge of length 16: vc1_loop_filter() with the caller's
 * stride as step and 1 as the distance between filter taps.
 */
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    const int tap_pitch = 1;
    const int len       = 16;

    vc1_loop_filter(src, stride, tap_pitch, len, pq);
}
1621
/**
 * mc00 "put" case: forwards to ff_put_pixels8_8_mmi().
 * rnd is accepted for signature compatibility and not used.
 */
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    (void)rnd;
    /* last argument: presumably the block height -- confirm in hpeldsp_mips.h */
    ff_put_pixels8_8_mmi(dst, src, stride, 8);
}
/**
 * mc00 "put" case, 16-wide variant: forwards to ff_put_pixels16_8_mmi().
 * rnd is accepted for signature compatibility and not used.
 */
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    (void)rnd;
    /* last argument: presumably the block height -- confirm in hpeldsp_mips.h */
    ff_put_pixels16_8_mmi(dst, src, stride, 16);
}
/**
 * mc00 "avg" case: forwards to ff_avg_pixels8_8_mmi().
 * rnd is accepted for signature compatibility and not used.
 */
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    (void)rnd;
    /* last argument: presumably the block height -- confirm in hpeldsp_mips.h */
    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
}
/**
 * mc00 "avg" case, 16-wide variant: forwards to ff_avg_pixels16_8_mmi().
 * rnd is accepted for signature compatibility and not used.
 */
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    (void)rnd;
    /* last argument: presumably the block height -- confirm in hpeldsp_mips.h */
    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
}
1642
1643 #define OP_PUT(S, D)
1644 #define OP_AVG(S, D)                                                        \
1645     "ldc1       $f16,   "#S"                        \n\t"                   \
1646     "pavgb      "#D",   "#D",   $f16                \n\t"
1647
1648 /** Add the rounder held in $f14 to $f6 and $f8, then shift both right (arithmetic, per halfword) by the count in FP register SHIFT; packing is left to the TRANSFER_* macros */
1649 #define NORMALIZE_MMI(SHIFT)                                                \
1650     "paddh      $f6,    $f6,    $f14                \n\t" /* +bias-r */     \
1651     "paddh      $f8,    $f8,    $f14                \n\t" /* +bias-r */     \
1652     "psrah      $f6,    $f6,    "SHIFT"             \n\t"                   \
1653     "psrah      $f8,    $f8,    "SHIFT"             \n\t"
1654
1655 #define TRANSFER_DO_PACK(OP) /* 8 halfwords in $f6/$f8 -> 8 bytes at dst */ \
1656     "packushb   $f6,    $f6,    $f8                 \n\t" /* pack to bytes */ \
1657     OP((%[dst]), $f6) /* OP_AVG blends with the bytes already at dst */     \
1658     "sdc1       $f6,    0x00(%[dst])                \n\t"
1659
1660 #define TRANSFER_DONT_PACK(OP) /* store $f6 and $f8 unpacked: 16 bytes at dst */ \
1661      OP(0(%[dst]), $f6)                                                     \
1662      OP(8(%[dst]), $f8)                                                     \
1663      "sdc1      $f6,    0x00(%[dst])                \n\t"                   \
1664      "sdc1      $f8,    0x08(%[dst])                \n\t"
1665
1666 /** Widen the low bytes of reg to halfwords by interleaving with $f0 (callers zero $f0 first); reg is passed as a quoted string. @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1667 #define DO_UNPACK(reg)                                                      \
1668     "punpcklbh  "reg",  "reg",  $f0                 \n\t"
1669 #define DONT_UNPACK(reg) /* data already 16-bit: expands to nothing */
1670
1671 /** Load the 32-bit value at ROUND (callers pre-compute it, e.g. 32-r or 8-r) and replicate its low halfword into all four halfwords of $f14 */
1672 #define LOAD_ROUNDER_MMI(ROUND)                                             \
1673     "lwc1       $f14,   "ROUND"                     \n\t"                   \
1674     "punpcklhw  $f14,   $f14,   $f14                \n\t" /* h0 h0 h1 h1 */ \
1675     "punpcklwd  $f14,   $f14,   $f14                \n\t" /* h0 h0 h0 h0 */
1676
1677
1678 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) /* one row: (9*(R1+R2) - R0 - R3 + $f14) >> shift -> dst+OFF */ \
1679     "paddh      "#R1",      "#R1",  "#R2"           \n\t" /* two centre rows */ \
1680     PTR_ADDU    "$9,        %[src], %[stride1]      \n\t"                   \
1681     MMI_ULWC1(R0, $9, 0x00) /* outer row at src + stride1 */                \
1682     "pmullh     "#R1",      "#R1",  $f12            \n\t" /* *9: constant lives in $f12, not in a data reg */ \
1683     "punpcklbh  "#R0",      "#R0",  $f0             \n\t"                   \
1684     PTR_ADDU    "$9,        %[src], %[stride]       \n\t"                   \
1685     MMI_ULWC1(R3, $9, 0x00) /* outer row at src + stride; reused as data by the next line */ \
1686     "psubh      "#R1",      "#R1",  "#R0"           \n\t"                   \
1687     "punpcklbh  "#R3",      "#R3",  $f0             \n\t"                   \
1688     "paddh      "#R1",      "#R1",  $f14            \n\t" /* + rounder */   \
1689     "psubh      "#R1",      "#R1",  "#R3"           \n\t"                   \
1690     "psrah      "#R1",      "#R1",  %[shift]        \n\t"                   \
1691     MMI_SDC1(R1, %[dst], OFF) /* 4 halfwords */                             \
1692     PTR_ADDU    "%[src],    %[src], %[stride]       \n\t" /* next row */
1693
1694 /** Vertical 1/2-shift pass producing 16-bit intermediates: 3 passes of 4 columns, 8 rows each, rows 24 bytes apart in dst. Sacrificing $f12 makes it possible to pipeline loads from src */
1695 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1696                                        const uint8_t *src, mips_reg stride,
1697                                        int rnd, int64_t shift)
1698 {
1699     DECLARE_VAR_LOW32;
1700     DECLARE_VAR_ADDRT;
1701
1702     __asm__ volatile(
1703         "xor        $f0,    $f0,    $f0             \n\t" /* zero, used when unpacking bytes */
1704         "li         $8,     0x03                    \n\t" /* pass counter */
1705         LOAD_ROUNDER_MMI("%[rnd]")
1706         "ldc1       $f12,   %[ff_pw_9]              \n\t" /* $f12 = ff_pw_9 */
1707         "1:                                         \n\t"
1708         MMI_ULWC1($f4, %[src], 0x00) /* prime the pipeline with two rows */
1709         PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
1710         MMI_ULWC1($f6, %[src], 0x00)
1711         "punpcklbh  $f4,    $f4,    $f0             \n\t"
1712         "punpcklbh  $f6,    $f6,    $f0             \n\t"
1713         SHIFT2_LINE(  0, $f2, $f4, $f6, $f8) /* registers rotate by one slot per row */
1714         SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1715         SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1716         SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1717         SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1718         SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1719         SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1720         SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1721         PTR_SUBU   "%[src], %[src], %[stride2]      \n\t" /* back 9 rows, forward 4 bytes */
1722         PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t" /* next 4 output columns */
1723         "addiu      $8,     $8,    -0x01            \n\t"
1724         "bnez       $8,     1b                      \n\t"
1725         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT
1726           [src]"+r"(src),               [dst]"+r"(dst)
1727         : [stride]"r"(stride),          [stride1]"r"(-2*stride),
1728           [shift]"f"(shift),            [rnd]"m"(rnd),
1729           [stride2]"r"(9*stride-4),     [ff_pw_9]"m"(ff_pw_9)
1730         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1731           "$f14", "$f16", "memory"
1732     );
1733 }
1734
1735 /**
1736  * Data is already unpacked, so some operations can directly be made from
1737  * memory.
1738  */
1739 #define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                      \
1740 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1741                                              const int16_t *src, int rnd)   \
1742 {                                                                           \
1743     int h = 8;                                                              \
1744     DECLARE_VAR_ALL64;                                                      \
1745     DECLARE_VAR_ADDRT;                                                      \
1746                                                                             \
1747     src -= 1;                                                               \
1748     rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                            \
1749                                                                             \
1750     __asm__ volatile(                                                       \
1751         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1752         "ldc1       $f12,   %[ff_pw_128]            \n\t"                   \
1753         "ldc1       $f10,   %[ff_pw_9]              \n\t"                   \
1754         "1:                                         \n\t"                   \
1755         MMI_ULDC1($f2, %[src], 0x00)                                        \
1756         MMI_ULDC1($f4, %[src], 0x08)                                        \
1757         MMI_ULDC1($f6, %[src], 0x02)                                        \
1758         MMI_ULDC1($f8, %[src], 0x0a)                                        \
1759         MMI_ULDC1($f0, %[src], 0x06)                                        \
1760         "paddh      $f2,    $f2,    $f0             \n\t"                   \
1761         MMI_ULDC1($f0, %[src], 0x0e)                                        \
1762         "paddh      $f4,    $f4,    $f0             \n\t"                   \
1763         MMI_ULDC1($f0, %[src], 0x04)                                        \
1764         "paddh      $f6,    $f6,    $f0             \n\t"                   \
1765         MMI_ULDC1($f0, %[src], 0x0b)                                        \
1766         "paddh      $f8,    $f8,    $f0             \n\t"                   \
1767         "pmullh     $f6,    $f6,    $f10            \n\t"                   \
1768         "pmullh     $f8,    $f8,    $f10            \n\t"                   \
1769         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1770         "psubh      $f8,    $f8,    $f4             \n\t"                   \
1771         "li         $8,     0x07                    \n\t"                   \
1772         "mtc1       $8,     $f16                    \n\t"                   \
1773         NORMALIZE_MMI("$f16")                                               \
1774         /* Remove bias */                                                   \
1775         "paddh      $f6,    $f6,    $f12            \n\t"                   \
1776         "paddh      $f8,    $f8,    $f12            \n\t"                   \
1777         TRANSFER_DO_PACK(OP)                                                \
1778         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1779         PTR_ADDIU  "%[src], %[src], 0x18            \n\t"                   \
1780         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1781         "bnez       %[h],   1b                      \n\t"                   \
1782         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
1783           [h]"+r"(h),                                                       \
1784           [src]"+r"(src),               [dst]"+r"(dst)                      \
1785         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1786           [ff_pw_9]"m"(ff_pw_9),        [ff_pw_128]"m"(ff_pw_128)           \
1787         : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",  \
1788           "$f16", "memory"                                                  \
1789     );                                                                      \
1790 }
1791
1792 VC1_HOR_16B_SHIFT2(OP_PUT, put_) /* defines put_vc1_hor_16b_shift2_mmi() */
1793 VC1_HOR_16B_SHIFT2(OP_AVG, avg_) /* defines avg_vc1_hor_16b_shift2_mmi() */
1794
1795 /**
1796  * Purely vertical or horizontal 1/2 shift interpolation (taps -1, 9, 9, -1) over 8 rows of 8 pixels; offset is the byte distance between taps.
1797  * $f12 is sacrificed to hold the *9 factor; rnd becomes 8 - rnd before use and the sum is shifted right by 4.
1798  */
1799 #define VC1_SHIFT2(OP, OPNAME)\
1800 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,      \
1801                                      mips_reg stride, int rnd,              \
1802                                      mips_reg offset)                       \
1803 {                                                                           \
1804     DECLARE_VAR_LOW32;                                                      \
1805     DECLARE_VAR_ADDRT;                                                      \
1806                                                                             \
1807     rnd = 8 - rnd;                                                          \
1808                                                                             \
1809     __asm__ volatile(                                                       \
1810         "xor        $f0,    $f0,    $f0             \n\t"                   \
1811         "li         $10,    0x08                    \n\t" /* row counter */ \
1812         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1813         "ldc1       $f12,   %[ff_pw_9]              \n\t"                   \
1814         "1:                                         \n\t"                   \
1815         MMI_ULWC1($f6, %[src], 0x00) /* tap at src, pixels 0-3 */           \
1816         MMI_ULWC1($f8, %[src], 0x04) /* tap at src, pixels 4-7 */           \
1817         PTR_ADDU   "$9,     %[src], %[offset]       \n\t"                   \
1818         MMI_ULWC1($f2, $9, 0x00) /* tap at src + offset */                  \
1819         MMI_ULWC1($f4, $9, 0x04)                                            \
1820         PTR_ADDU   "%[src], %[src], %[offset]       \n\t" /* src now sits on the second centre tap */ \
1821         "punpcklbh  $f6,    $f6,    $f0             \n\t"                   \
1822         "punpcklbh  $f8,    $f8,    $f0             \n\t"                   \
1823         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1824         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1825         "paddh      $f6,    $f6,    $f2             \n\t"                   \
1826         "paddh      $f8,    $f8,    $f4             \n\t"                   \
1827         PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t" /* one tap before the first centre tap */ \
1828         MMI_ULWC1($f2, $9, 0x00)                                            \
1829         MMI_ULWC1($f4, $9, 0x04)                                            \
1830         "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0*/      \
1831         "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0*/      \
1832         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1833         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1834         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,0*/      \
1835         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,0*/      \
1836         PTR_ADDU   "$9,     %[src], %[offset]       \n\t" /* one tap after the second centre tap */ \
1837         MMI_ULWC1($f2, $9, 0x00)                                            \
1838         MMI_ULWC1($f4, $9, 0x04)                                            \
1839         "punpcklbh  $f2,    $f2,    $f0             \n\t"                   \
1840         "punpcklbh  $f4,    $f4,    $f0             \n\t"                   \
1841         "psubh      $f6,    $f6,    $f2             \n\t" /*-1,9,9,-1*/     \
1842         "psubh      $f8,    $f8,    $f4             \n\t" /*-1,9,9,-1*/     \
1843         "li         $8,     0x04                    \n\t"                   \
1844         "mtc1       $8,     $f16                    \n\t"                   \
1845         NORMALIZE_MMI("$f16")                                               \
1846         "packushb   $f6,    $f6,    $f8             \n\t"                   \
1847         OP((%[dst]), $f6)                                                   \
1848         "sdc1       $f6,    0x00(%[dst])            \n\t"                   \
1849         "addiu      $10,    $10,   -0x01            \n\t"                   \
1850         PTR_ADDU   "%[src], %[src], %[stride1]      \n\t" /* stride - offset: net advance is one stride */ \
1851         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
1852         "bnez       $10,    1b                      \n\t"                   \
1853         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1854           [src]"+r"(src),               [dst]"+r"(dst)                      \
1855         : [offset]"r"(offset),          [offset_x2n]"r"(-2*offset),         \
1856           [stride]"r"(stride),          [rnd]"m"(rnd),                      \
1857           [stride1]"r"(stride-offset),                                      \
1858           [ff_pw_9]"m"(ff_pw_9)                                             \
1859         : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",     \
1860           "$f12", "$f14", "$f16", "memory"                                  \
1861     );                                                                      \
1862 }
1863
1864 VC1_SHIFT2(OP_PUT, put_) /* defines put_vc1_shift2_mmi() */
1865 VC1_SHIFT2(OP_AVG, avg_) /* defines avg_vc1_shift2_mmi() */
1866
1867 /**
1868  * Core of the 1/4 and 3/4 shift bicubic interpolation: leaves the unrounded sum -3*A1 + 18*A2 + 53*A3 - 4*A4 in $f6 (first 4 values) and $f8 (next 4).
1869  * Expects $f10 = 53 and $f12 = 18 per halfword plus an %[ff_pw_3] operand; overwrites $8, $9, $f2, $f4 and $f16.
1870  * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
1871  * @param LOAD    MMI_ULWC1 for packed bytes, or MMI_ULDC1 if data read is already unpacked.
1872  * @param M       1 for MMI_ULWC1, 2 for MMI_ULDC1 (the second load of each tap is at byte offset M*4).
1873  * @param A1      Register or immediate added to %[src] to address the 1st tap, weight -3 (beware of unpacked/packed).
1874  * @param A2      Same for the 2nd tap, weight 18
1875  * @param A3      Same for the 3rd tap, weight 53
1876  * @param A4      Same for the 4th tap, weight -4
1877  */
1878 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)                \
1879     PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                       \
1880     LOAD($f2, $9, M*0)                                                      \
1881     LOAD($f4, $9, M*4)                                                      \
1882     UNPACK("$f2")                                                           \
1883     UNPACK("$f4")                                                           \
1884     "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t" /* *3 */              \
1885     "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t" /* *3 */              \
1886     PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                       \
1887     LOAD($f6, $9, M*0)                                                      \
1888     LOAD($f8, $9, M*4)                                                      \
1889     UNPACK("$f6")                                                           \
1890     UNPACK("$f8")                                                           \
1891     "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */             \
1892     "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */             \
1893     "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */         \
1894     "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */         \
1895     PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                       \
1896     LOAD($f2, $9, M*0)                                                      \
1897     LOAD($f4, $9, M*4)                                                      \
1898     UNPACK("$f2")                                                           \
1899     UNPACK("$f4")                                                           \
1900     "li         $8,     0x02                    \n\t"                       \
1901     "mtc1       $8,     $f16                    \n\t"                       \
1902     "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */              \
1903     "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */              \
1904     "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */        \
1905     "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */        \
1906     PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                       \
1907     LOAD($f2, $9, M*0)                                                      \
1908     LOAD($f4, $9, M*4)                                                      \
1909     UNPACK("$f2")                                                           \
1910     UNPACK("$f4")                                                           \
1911     "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */             \
1912     "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */             \
1913     "paddh      $f6,    $f6,    $f2             \n\t" /* -4,53,18,-3 */     \
1914     "paddh      $f8,    $f8,    $f4             \n\t" /* -4,53,18,-3 */
1915
1916 /**
1917  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1918  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1919  * %3 (src_stride), %4 (2*src_stride) and %5 (3*src_stride).
1920  *
1921  * @param  NAME   Either 1 or 3
1922  * @see MSPEL_FILTER13_CORE for information on A1->A4
1923  */
1924 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                        \
1925 static void                                                                 \
1926 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,          \
1927                                  mips_reg src_stride,                       \
1928                                  int rnd, int64_t shift)                    \
1929 {                                                                           \
1930     int h = 8;                                                              \
1931     DECLARE_VAR_LOW32;                                                      \
1932     DECLARE_VAR_ADDRT;                                                      \
1933                                                                             \
1934     src -= src_stride;                                                      \
1935                                                                             \
1936     __asm__ volatile(                                                       \
1937         "xor        $f0,    $f0,    $f0             \n\t"                   \
1938         LOAD_ROUNDER_MMI("%[rnd]")                                          \
1939         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
1940         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
1941         ".p2align 3                                 \n\t"                   \
1942         "1:                                         \n\t"                   \
1943         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
1944         NORMALIZE_MMI("%[shift]")                                           \
1945         TRANSFER_DONT_PACK(OP_PUT)                                          \
1946         /* Last 3 (in fact 4) bytes on the line */                          \
1947         PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
1948         MMI_ULWC1($f2, $9, 0x08)                                            \
1949         DO_UNPACK("$f2")                                                    \
1950         "mov.d      $f6,    $f2                     \n\t"                   \
1951         "paddh      $f2,    $f2,    $f2             \n\t"                   \
1952         "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */          \
1953         PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
1954         MMI_ULWC1($f6, $9, 0x08)                                            \
1955         DO_UNPACK("$f6")                                                    \
1956         "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
1957         "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */      \
1958         PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
1959         MMI_ULWC1($f2, $9, 0x08)                                            \
1960         DO_UNPACK("$f2")                                                    \
1961         "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
1962         "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */   \
1963         PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
1964         MMI_ULWC1($f2, $9, 0x08)                                            \
1965         DO_UNPACK("$f2")                                                    \
1966         "li         $8,     0x02                    \n\t"                   \
1967         "mtc1       $8,     $f16                    \n\t"                   \
1968         "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
1969         "psubh      $f6,    $f6,    $f2             \n\t"                   \
1970         "paddh      $f6,    $f6,    $f14            \n\t"                   \
1971         "li         $8,     0x06                    \n\t"                   \
1972         "mtc1       $8,     $f16                    \n\t"                   \
1973         "psrah      $f6,    $f6,    $f16            \n\t"                   \
1974         "sdc1       $f6,    0x10(%[dst])            \n\t"                   \
1975         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
1976         PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"                   \
1977         PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"                   \
1978         "bnez       %[h],   1b                      \n\t"                   \
1979         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
1980           [h]"+r"(h),                                                       \
1981           [src]"+r"(src),               [dst]"+r"(dst)                      \
1982         : [stride_x1]"r"(src_stride),   [stride_x2]"r"(2*src_stride),       \
1983           [stride_x3]"r"(3*src_stride),                                     \
1984           [rnd]"m"(rnd),                [shift]"f"(shift),                  \
1985           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
1986           [ff_pw_3]"f"(ff_pw_3)                                             \
1987         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
1988           "$f14", "$f16", "memory"                                          \
1989     );                                                                      \
1990 }
1991
1992 /**
1993  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1994  * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1995  *
1996  * @param  NAME   Either 1 or 3
1997  * @see MSPEL_FILTER13_CORE for information on A1->A4
1998  */
1999 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)            \
2000 static void                                                                 \
2001 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,       \
2002                                        const int16_t *src, int rnd)         \
2003 {                                                                           \
2004     int h = 8;                                                              \
2005     DECLARE_VAR_ALL64;                                                      \
2006     DECLARE_VAR_ADDRT;                                                      \
2007                                                                             \
2008     src -= 1;                                                               \
2009     rnd -= (-4+58+13-3)*256; /* Add -256 bias */                            \
2010                                                                             \
2011     __asm__ volatile(                                                       \
2012         "xor        $f0,    $f0,    $f0             \n\t"                   \
2013         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2014         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2015         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2016         ".p2align 3                                 \n\t"                   \
2017         "1:                                         \n\t"                   \
2018         MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)      \
2019         "li         $8,     0x07                    \n\t"                   \
2020         "mtc1       $8,     $f16                    \n\t"                   \
2021         NORMALIZE_MMI("$f16")                                               \
2022         /* Remove bias */                                                   \
2023         "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"                   \
2024         "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"                   \
2025         TRANSFER_DO_PACK(OP)                                                \
2026         "addiu      %[h],   %[h],  -0x01            \n\t"                   \
2027         PTR_ADDU   "%[src], %[src], 0x18            \n\t"                   \
2028         PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"                   \
2029         "bnez       %[h],   1b                      \n\t"                   \
2030         : RESTRICT_ASM_ALL64            RESTRICT_ASM_ADDRT                  \
2031           [h]"+r"(h),                                                       \
2032           [src]"+r"(src),               [dst]"+r"(dst)                      \
2033         : [stride]"r"(stride),          [rnd]"m"(rnd),                      \
2034           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2035           [ff_pw_3]"f"(ff_pw_3),        [ff_pw_128]"f"(ff_pw_128)           \
2036         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2037           "$f14", "$f16", "memory"                                          \
2038     );                                                                      \
2039 }
2040
2041 /**
2042  * Macro to build the 8bits, any direction, version of vc1_put_shift[13]: 8 rows of 8 pixels, rounded with 32-rnd and shifted right by 6.
2043  * Here, offset is the byte distance between taps, passed separately from stride. Parameters passed A1 to A4 must use $0 (no offset),
2044  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset); src is moved back by offset first.
2045  *
2046  * @param  NAME   Either shift1 or shift3 (pasted into the function name)
2047  * @see MSPEL_FILTER13_CORE for information on A1->A4
2048  */
2049 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)                 \
2050 static void                                                                 \
2051 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,             \
2052                               mips_reg stride, int rnd, mips_reg offset)    \
2053 {                                                                           \
2054     int h = 8;                                                              \
2055     DECLARE_VAR_LOW32;                                                      \
2056     DECLARE_VAR_ADDRT;                                                      \
2057                                                                             \
2058     src -= offset;                                                          \
2059     rnd = 32-rnd;                                                           \
2060                                                                             \
2061     __asm__ volatile (                                                      \
2062         "xor        $f0,    $f0,    $f0             \n\t"                   \
2063         LOAD_ROUNDER_MMI("%[rnd]")                                          \
2064         "ldc1       $f10,   %[ff_pw_53]             \n\t"                   \
2065         "ldc1       $f12,   %[ff_pw_18]             \n\t"                   \
2066         ".p2align 3                                 \n\t"                   \
2067         "1:                                         \n\t"                   \
2068         MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)        \
2069         "li         $8,     0x06                    \n\t"                   \
2070         "mtc1       $8,     $f16                    \n\t"                   \
2071         NORMALIZE_MMI("$f16")                                               \
2072         TRANSFER_DO_PACK(OP)                                                \
2073         "addiu      %[h],   %[h],      -0x01        \n\t"                   \
2074         PTR_ADDU   "%[src], %[src],     %[stride]   \n\t"                   \
2075         PTR_ADDU   "%[dst], %[dst],     %[stride]   \n\t"                   \
2076         "bnez       %[h],   1b                      \n\t"                   \
2077         : RESTRICT_ASM_LOW32            RESTRICT_ASM_ADDRT                  \
2078           [h]"+r"(h),                                                       \
2079           [src]"+r"(src),               [dst]"+r"(dst)                      \
2080         : [offset_x1]"r"(offset),       [offset_x2]"r"(2*offset),           \
2081           [offset_x3]"r"(3*offset),     [stride]"r"(stride),                \
2082           [rnd]"m"(rnd),                                                    \
2083           [ff_pw_53]"m"(ff_pw_53),      [ff_pw_18]"m"(ff_pw_18),            \
2084           [ff_pw_3]"f"(ff_pw_3)                                             \
2085         : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",    \
2086           "$f14", "$f16", "memory"                                          \
2087     );                                                                      \
2088 }
2089
2090
2091 /** 1/4 shift bicubic interpolation: tap addresses run from the farthest one (A1, weight -3) down to $0, the zero register, i.e. no offset from the already pre-decremented src (A4, weight -4) */
2092 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2093 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2094 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2095 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2096 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2097
2098 /** 3/4 shift bicubic interpolation: same macros with the tap addresses in the opposite order, from $0 (A1, weight -3) up to the farthest one (A4, weight -4) */
2099 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2100 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2101 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2102 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2103 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2104
2105 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2106              (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2107               int64_t shift);   /* first pass: 8-bit src -> 16-bit intermediate dst */
2108 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2109              (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
     /* ^ second pass: 16-bit intermediate src -> 8-bit dst */
2110 typedef void (*vc1_mspel_mc_filter_8bits)
2111              (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2112               mips_reg offset); /* single pass; VC1_MSPEL_MC passes offset = stride
                                      * for a vertical filter and 1 for a horizontal one */
2113
2114 /**
2115  * Interpolate fractional pel values by applying proper vertical then
2116  * horizontal filter.
2117  *
      * VC1_MSPEL_MC(OP) defines two static functions:
      *  - OP##vc1_mspel_mc     dispatches on hmode/vmode to the single-pass
      *                         8-bit filters or to the two-pass (vertical into
      *                         a 16-bit temporary, then horizontal) filters;
      *  - OP##vc1_mspel_mc_16  calls OP##vc1_mspel_mc four times, at column
      *                         offsets 0 and 8 for two groups of 8 rows.
      *
      * Slot 0 of every dispatch table is NULL, so hmode == 0 && vmode == 0
      * would call through a NULL pointer; DECLARE_FUNCTION never instantiates
      * that combination.
      *
2118  * @param  dst     Destination buffer for interpolated pels.
2119  * @param  src     Source buffer.
2120  * @param  stride  Stride for both src and dst buffers.
2121  * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
2122  * @param  vmode   Vertical filter (expressed in quarter pixels shift).
2123  * @param  rnd     Rounding bias.
2124  */
2125 #define VC1_MSPEL_MC(OP)                                                    \
2126 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2127                                int hmode, int vmode, int rnd)               \
2128 {                                                                           \
2129     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2130          { NULL, vc1_put_ver_16b_shift1_mmi,                                \
2131                  vc1_put_ver_16b_shift2_mmi,                                \
2132                  vc1_put_ver_16b_shift3_mmi };                              \
2133     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2134          { NULL, OP ## vc1_hor_16b_shift1_mmi,                              \
2135                  OP ## vc1_hor_16b_shift2_mmi,                              \
2136                  OP ## vc1_hor_16b_shift3_mmi };                            \
2137     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =          \
2138          { NULL, OP ## vc1_shift1_mmi,                                      \
2139                  OP ## vc1_shift2_mmi,                                      \
2140                  OP ## vc1_shift3_mmi };                                    \
2141                                                                             \
2142     if (vmode) { /* Vertical filter to apply */                             \
2143         if (hmode) { /* Horizontal filter to apply, output to tmp */        \
2144             static const int shift_value[] = { 0, 5, 1, 5 };                \
2145             int    shift = (shift_value[hmode]+shift_value[vmode])>>1;      \
2146             int    r;                                                       \
2147             LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                        \
2148                                                                             \
                 /* Rounding term handed to the vertical pass */                 \
2149             r = (1<<(shift-1)) + rnd-1;                                     \
2150             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);  \
2151                                                                             \
                 /* tmp was filled starting at src-1, hence tmp+1 here */        \
2152             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);    \
2153             return;                                                         \
2154         }                                                                   \
2155         else { /* No horizontal filter, output 8 lines to dst */            \
2156             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);    \
2157             return;                                                         \
2158         }                                                                   \
2159     }                                                                       \
2160                                                                             \
2161     /* Horizontal mode with no vertical mode */                             \
2162     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);                   \
2163 }                                                                           \
2164 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,         \
2165                                   int stride, int hmode, int vmode, int rnd)\
2166 {                                                                           \
         /* Columns 0 and 8 of the current rows */                               \
2167     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2168     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
         /* Step both pointers down 8 rows and repeat */                         \
2169     dst += 8*stride; src += 8*stride;                                       \
2170     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);        \
2171     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);        \
2172 }
2173
     /* Instantiate put_vc1_mspel_mc{,_16} and avg_vc1_mspel_mc{,_16}. */
2174 VC1_MSPEL_MC(put_)
2175 VC1_MSPEL_MC(avg_)
2176
2177 /**
      * Macro to ease bicubic filter interpolation functions declarations.
      *
      * DECLARE_FUNCTION(a, b) defines four public wrappers,
      * ff_{put,avg}_vc1_mspel_mc<a><b>_mmi and
      * ff_{put,avg}_vc1_mspel_mc<a><b>_16_mmi, each forwarding to the matching
      * static {put,avg}_vc1_mspel_mc{,_16} helper with hmode = a and vmode = b.
      * The wrappers take a ptrdiff_t stride while the helpers take int, so the
      * stride is converted to int at the call.
      */
2178 #define DECLARE_FUNCTION(a, b)                                              \
2179 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2180                                            const uint8_t *src,              \
2181                                            ptrdiff_t stride,                \
2182                                            int rnd)                         \
2183 {                                                                           \
2184      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2185 }                                                                           \
2186 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                    \
2187                                            const uint8_t *src,              \
2188                                            ptrdiff_t stride,                \
2189                                            int rnd)                         \
2190 {                                                                           \
2191      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                         \
2192 }                                                                           \
2193 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2194                                               const uint8_t *src,           \
2195                                               ptrdiff_t stride,             \
2196                                               int rnd)                      \
2197 {                                                                           \
2198      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2199 }                                                                           \
2200 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,                 \
2201                                               const uint8_t *src,           \
2202                                               ptrdiff_t stride,             \
2203                                               int rnd)                      \
2204 {                                                                           \
2205      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                      \
2206 }
2207
     /* Every (a, b) pair with a, b in 0..3 except (0, 0). */
2208 DECLARE_FUNCTION(0, 1)
2209 DECLARE_FUNCTION(0, 2)
2210 DECLARE_FUNCTION(0, 3)
2211
2212 DECLARE_FUNCTION(1, 0)
2213 DECLARE_FUNCTION(1, 1)
2214 DECLARE_FUNCTION(1, 2)
2215 DECLARE_FUNCTION(1, 3)
2216
2217 DECLARE_FUNCTION(2, 0)
2218 DECLARE_FUNCTION(2, 1)
2219 DECLARE_FUNCTION(2, 2)
2220 DECLARE_FUNCTION(2, 3)
2221
2222 DECLARE_FUNCTION(3, 0)
2223 DECLARE_FUNCTION(3, 1)
2224 DECLARE_FUNCTION(3, 2)
2225 DECLARE_FUNCTION(3, 3)
2226
/*
 * Inline-asm fragment shared by the two *_chroma_mc8_mmi functions: weighted
 * blend of four 8-byte inputs.
 *
 * Reads:    ftmp1..ftmp4 (the four inputs), ftmp0 (the callers clear it with
 *           xor before the loop), A/B/C/D (weights applied to ftmp1..ftmp4
 *           respectively), ff_pw_28 (added before the shift), ftmp9 (shift
 *           count).
 * Scratch:  ftmp5..ftmp8 hold the high halves of ftmp1..ftmp4.
 * Result:   ftmp1.
 *
 * NOTE(review): going by the mnemonics this is, per 16-bit lane,
 * (A*in1 + B*in2 + C*in3 + D*in4 + ff_pw_28) >> ftmp9, packed back to bytes
 * with unsigned saturation -- confirm against the Loongson MMI manual.
 */
2227 #define CHROMA_MC_8_MMI                                                     \
             /* Interleave each input's bytes with ftmp0: high half, low half */ \
2228         "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]                \n\t"   \
2229         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2230         "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]                \n\t"   \
2231         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2232         "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t"   \
2233         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2234         "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t"   \
2235         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2236                                                                             \
             /* Scale both halves of each input by its weight */                 \
2237         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2238         "pmullh     %[ftmp5],   %[ftmp5],   %[A]                    \n\t"   \
2239         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2240         "pmullh     %[ftmp6],   %[ftmp6],   %[B]                    \n\t"   \
2241         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2242         "pmullh     %[ftmp7],   %[ftmp7],   %[C]                    \n\t"   \
2243         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2244         "pmullh     %[ftmp8],   %[ftmp8],   %[D]                    \n\t"   \
2245                                                                             \
             /* Low halves: sum the four products, add ff_pw_28 */               \
2246         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2247         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2248         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2249         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2250                                                                             \
             /* High halves: same accumulation */                                \
2251         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t"   \
2252         "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]                \n\t"   \
2253         "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]                \n\t"   \
2254         "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28]             \n\t"   \
2255                                                                             \
             /* Shift right by ftmp9, then pack both halves back into ftmp1 */   \
2256         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]                \n\t"   \
2257         "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]                \n\t"   \
2258         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"
2259
2260
/*
 * Inline-asm fragment shared by the two *_chroma_mc4_mmi functions: weighted
 * blend of four inputs, using only the low half of each register.
 *
 * Reads:    ftmp1..ftmp4 (the four inputs), ftmp0 (the callers clear it with
 *           xor before the loop), A/B/C/D (weights applied to ftmp1..ftmp4
 *           respectively), ff_pw_28 (added before the shift), ftmp5 (shift
 *           count).
 * Result:   ftmp1 (packed together with ftmp0).
 *
 * NOTE(review): going by the mnemonics this is, per 16-bit lane,
 * (A*in1 + B*in2 + C*in3 + D*in4 + ff_pw_28) >> ftmp5, packed back to bytes
 * with unsigned saturation -- confirm against the Loongson MMI manual.
 */
2261 #define CHROMA_MC_4_MMI                                                     \
             /* Interleave the low bytes of each input with ftmp0 */             \
2262         "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"   \
2263         "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t"   \
2264         "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t"   \
2265         "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]                \n\t"   \
2266                                                                             \
             /* Scale each input by its weight */                                \
2267         "pmullh     %[ftmp1],   %[ftmp1],   %[A]                    \n\t"   \
2268         "pmullh     %[ftmp2],   %[ftmp2],   %[B]                    \n\t"   \
2269         "pmullh     %[ftmp3],   %[ftmp3],   %[C]                    \n\t"   \
2270         "pmullh     %[ftmp4],   %[ftmp4],   %[D]                    \n\t"   \
2271                                                                             \
             /* Sum the four products, add ff_pw_28 */                           \
2272         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]                \n\t"   \
2273         "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t"   \
2274         "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]                \n\t"   \
2275         "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28]             \n\t"   \
2276                                                                             \
             /* Shift right by ftmp5, then pack with ftmp0 into ftmp1 */         \
2277         "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]                \n\t"   \
2278         "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t"
2279
2280
/**
 * Bilinear chroma interpolation, mc8 "put" variant: the blended row is
 * stored to dst.
 *
 * For each of the h rows the asm loads from src and src+1 on the current row
 * and on the row one stride further, blends the four loads through
 * CHROMA_MC_8_MMI with weights A..D and a shift count of 6, and stores the
 * result with MMI_SDC1.
 *
 * @param dst     Destination (annotated "align 8" in the signature).
 * @param src     Source (annotated "align 1"; read with MMI_ULDC1).
 * @param stride  Step added to both src and dst after each row.
 * @param h       Row count. The loop tests h only after decrementing it, so
 *                h must be positive.
 * @param x       Horizontal fraction, asserted to be in 0..7.
 * @param y       Vertical fraction, asserted to be in 0..7.
 *
 * NOTE(review): %[A]..%[D] are declared as input-only operands but are
 * overwritten by the pshufh instructions; GCC documents that input operands
 * must not be modified by the asm -- worth revisiting.
 */
2281 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2282                                       uint8_t *src /* align 1 */,
2283                                       int stride, int h, int x, int y)
2284 {
         /* Weights of the four neighbouring samples; they always sum to 64. */
2285     const int A = (8 - x) * (8 - y);
2286     const int B =     (x) * (8 - y);
2287     const int C = (8 - x) *     (y);
2288     const int D =     (x) *     (y);
2289     double ftmp[10];
2290     uint32_t tmp[1];
2291     DECLARE_VAR_ALL64;
2292     DECLARE_VAR_ADDRT;
2293
2294     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2295
2296     __asm__ volatile(
             /* ftmp9 = 6 (shift count used by CHROMA_MC_8_MMI), ftmp0 = 0 */
2297         "li         %[tmp0],    0x06                                    \n\t"
2298         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2299         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
             /* pshufh with an all-zero selector: presumably replicates each
              * weight's low halfword into every lane -- TODO confirm */
2300         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2301         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2302         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2303         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2304
2305         "1:                                                             \n\t"
             /* Current row at offsets 0 and 1, then the next row at 0 and 1;
              * src is left on the next row for the following iteration. */
2306         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2307         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2308         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2309         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2310         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2311
2312         CHROMA_MC_8_MMI
2313
             /* Store the blended row, then advance dst and count down h. */
2314         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2315         "addiu      %[h],       %[h],      -0x01                        \n\t"
2316         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2317         "bnez       %[h],       1b                                      \n\t"
2318         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2319           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2320           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2321           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2322           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2323           RESTRICT_ASM_ALL64
2324           RESTRICT_ASM_ADDRT
2325           [tmp0]"=&r"(tmp[0]),
2326           [src]"+&r"(src),              [dst]"+&r"(dst),
2327           [h]"+&r"(h)
2328         : [stride]"r"((mips_reg)stride),
2329           [A]"f"(A),                    [B]"f"(B),
2330           [C]"f"(C),                    [D]"f"(D),
2331           [ff_pw_28]"f"(ff_pw_28)
2332         : "memory"
2333     );
2334 }
2335
/**
 * Bilinear chroma interpolation, mc4 "put" variant: the blended row is
 * stored to dst.
 *
 * For each of the h rows the asm loads from src and src+1 on the current row
 * and on the row one stride further (MMI_ULWC1), blends the four loads
 * through CHROMA_MC_4_MMI with weights A..D and a shift count of 6, and
 * stores the result with MMI_SWC1.
 *
 * @param dst     Destination (annotated "align 8" in the signature).
 * @param src     Source (annotated "align 1").
 * @param stride  Step added to both src and dst after each row.
 * @param h       Row count. The loop tests h only after decrementing it, so
 *                h must be positive.
 * @param x       Horizontal fraction, asserted to be in 0..7.
 * @param y       Vertical fraction, asserted to be in 0..7.
 *
 * NOTE(review): %[A]..%[D] are declared as input-only operands but are
 * overwritten by the pshufh instructions; GCC documents that input operands
 * must not be modified by the asm -- worth revisiting.
 */
2336 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2337                                       uint8_t *src /* align 1 */,
2338                                       int stride, int h, int x, int y)
2339 {
         /* Weights of the four neighbouring samples; they always sum to 64. */
2340     const int A = (8 - x) * (8 - y);
2341     const int B =     (x) * (8 - y);
2342     const int C = (8 - x) *     (y);
2343     const int D =     (x) *     (y);
2344     double ftmp[6];
2345     uint32_t tmp[1];
2346     DECLARE_VAR_LOW32;
2347     DECLARE_VAR_ADDRT;
2348
2349     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2350
2351     __asm__ volatile(
             /* ftmp5 = 6 (shift count used by CHROMA_MC_4_MMI), ftmp0 = 0 */
2352         "li         %[tmp0],    0x06                                    \n\t"
2353         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2354         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
             /* pshufh with an all-zero selector: presumably replicates each
              * weight's low halfword into every lane -- TODO confirm */
2355         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2356         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2357         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2358         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2359
2360         "1:                                                             \n\t"
             /* Current row at offsets 0 and 1, then the next row at 0 and 1;
              * src is left on the next row for the following iteration. */
2361         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2362         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2363         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2364         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2365         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2366
2367         CHROMA_MC_4_MMI
2368
             /* Store the blended row, then advance dst and count down h. */
2369         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2370         "addiu      %[h],       %[h],      -0x01                        \n\t"
2371         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2372         "bnez       %[h],       1b                                      \n\t"
2373         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2374           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2375           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2376           [tmp0]"=&r"(tmp[0]),
2377           RESTRICT_ASM_LOW32
2378           RESTRICT_ASM_ADDRT
2379           [src]"+&r"(src),              [dst]"+&r"(dst),
2380           [h]"+&r"(h)
2381         : [stride]"r"((mips_reg)stride),
2382           [A]"f"(A),                    [B]"f"(B),
2383           [C]"f"(C),                    [D]"f"(D),
2384           [ff_pw_28]"f"(ff_pw_28)
2385         : "memory"
2386     );
2387 }
2388
/**
 * Bilinear chroma interpolation, mc8 "avg" variant: the blended row is
 * combined with the existing contents of dst (pavgb) before being stored.
 *
 * For each of the h rows the asm loads from src and src+1 on the current row
 * and on the row one stride further, blends the four loads through
 * CHROMA_MC_8_MMI with weights A..D and a shift count of 6, loads the current
 * dst row, applies pavgb and stores the result with MMI_SDC1.
 *
 * @param dst     Destination, read and written (annotated "align 8").
 * @param src     Source (annotated "align 1"; read with MMI_ULDC1).
 * @param stride  Step added to both src and dst after each row.
 * @param h       Row count. The loop tests h only after decrementing it, so
 *                h must be positive.
 * @param x       Horizontal fraction, asserted to be in 0..7.
 * @param y       Vertical fraction, asserted to be in 0..7.
 *
 * NOTE(review): %[A]..%[D] are declared as input-only operands but are
 * overwritten by the pshufh instructions; GCC documents that input operands
 * must not be modified by the asm -- worth revisiting.
 */
2389 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2390                                       uint8_t *src /* align 1 */,
2391                                       int stride, int h, int x, int y)
2392 {
         /* Weights of the four neighbouring samples; they always sum to 64. */
2393     const int A = (8 - x) * (8 - y);
2394     const int B =     (x) * (8 - y);
2395     const int C = (8 - x) *     (y);
2396     const int D =     (x) *     (y);
2397     double ftmp[10];
2398     uint32_t tmp[1];
2399     DECLARE_VAR_ALL64;
2400     DECLARE_VAR_ADDRT;
2401
2402     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2403
2404     __asm__ volatile(
             /* ftmp9 = 6 (shift count used by CHROMA_MC_8_MMI), ftmp0 = 0 */
2405         "li         %[tmp0],    0x06                                    \n\t"
2406         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2407         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
             /* pshufh with an all-zero selector: presumably replicates each
              * weight's low halfword into every lane -- TODO confirm */
2408         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2409         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2410         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2411         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2412
2413         "1:                                                             \n\t"
             /* Current row at offsets 0 and 1, then the next row at 0 and 1;
              * src is left on the next row for the following iteration. */
2414         MMI_ULDC1(%[ftmp1], %[src], 0x00)
2415         MMI_ULDC1(%[ftmp2], %[src], 0x01)
2416         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2417         MMI_ULDC1(%[ftmp3], %[src], 0x00)
2418         MMI_ULDC1(%[ftmp4], %[src], 0x01)
2419
2420         CHROMA_MC_8_MMI
2421
             /* Combine with what is already at dst; ftmp2 is free to reuse
              * once CHROMA_MC_8_MMI has produced ftmp1. */
2422         MMI_LDC1(%[ftmp2], %[dst], 0x00)
2423         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2424
             /* Store the row, then advance dst and count down h. */
2425         MMI_SDC1(%[ftmp1], %[dst], 0x00)
2426         "addiu      %[h],       %[h],      -0x01                        \n\t"
2427         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2428         "bnez       %[h],       1b                                      \n\t"
2429         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2430           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2431           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2432           [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
2433           [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
2434           [tmp0]"=&r"(tmp[0]),
2435           RESTRICT_ASM_ALL64
2436           RESTRICT_ASM_ADDRT
2437           [src]"+&r"(src),              [dst]"+&r"(dst),
2438           [h]"+&r"(h)
2439         : [stride]"r"((mips_reg)stride),
2440           [A]"f"(A),                    [B]"f"(B),
2441           [C]"f"(C),                    [D]"f"(D),
2442           [ff_pw_28]"f"(ff_pw_28)
2443         : "memory"
2444     );
2445 }
2446
/**
 * Bilinear chroma interpolation, mc4 "avg" variant: the blended row is
 * combined with the existing contents of dst (pavgb) before being stored.
 *
 * For each of the h rows the asm loads from src and src+1 on the current row
 * and on the row one stride further (MMI_ULWC1), blends the four loads
 * through CHROMA_MC_4_MMI with weights A..D and a shift count of 6, loads the
 * current dst row (MMI_LWC1), applies pavgb and stores the result with
 * MMI_SWC1.
 *
 * @param dst     Destination, read and written (annotated "align 8").
 * @param src     Source (annotated "align 1").
 * @param stride  Step added to both src and dst after each row.
 * @param h       Row count. The loop tests h only after decrementing it, so
 *                h must be positive.
 * @param x       Horizontal fraction, asserted to be in 0..7.
 * @param y       Vertical fraction, asserted to be in 0..7.
 *
 * NOTE(review): %[A]..%[D] are declared as input-only operands but are
 * overwritten by the pshufh instructions; GCC documents that input operands
 * must not be modified by the asm -- worth revisiting.
 */
2447 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2448                                       uint8_t *src /* align 1 */,
2449                                       int stride, int h, int x, int y)
2450 {
         /* Weights of the four neighbouring samples; they always sum to 64. */
2451     const int A = (8 - x) * (8 - y);
2452     const int B = (    x) * (8 - y);
2453     const int C = (8 - x) * (    y);
2454     const int D = (    x) * (    y);
2455     double ftmp[6];
2456     uint32_t tmp[1];
2457     DECLARE_VAR_LOW32;
2458     DECLARE_VAR_ADDRT;
2459
2460     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2461
2462     __asm__ volatile(
             /* ftmp5 = 6 (shift count used by CHROMA_MC_4_MMI), ftmp0 = 0 */
2463         "li         %[tmp0],    0x06                                    \n\t"
2464         "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                    \n\t"
2465         "mtc1       %[tmp0],    %[ftmp5]                                \n\t"
             /* pshufh with an all-zero selector: presumably replicates each
              * weight's low halfword into every lane -- TODO confirm */
2466         "pshufh     %[A],       %[A],       %[ftmp0]                    \n\t"
2467         "pshufh     %[B],       %[B],       %[ftmp0]                    \n\t"
2468         "pshufh     %[C],       %[C],       %[ftmp0]                    \n\t"
2469         "pshufh     %[D],       %[D],       %[ftmp0]                    \n\t"
2470
2471         "1:                                                             \n\t"
             /* Current row at offsets 0 and 1, then the next row at 0 and 1;
              * src is left on the next row for the following iteration. */
2472         MMI_ULWC1(%[ftmp1], %[src], 0x00)
2473         MMI_ULWC1(%[ftmp2], %[src], 0x01)
2474         PTR_ADDU   "%[src],     %[src],     %[stride]                   \n\t"
2475         MMI_ULWC1(%[ftmp3], %[src], 0x00)
2476         MMI_ULWC1(%[ftmp4], %[src], 0x01)
2477
2478         CHROMA_MC_4_MMI
2479
             /* Combine with what is already at dst; ftmp2 is free to reuse
              * once CHROMA_MC_4_MMI has produced ftmp1. */
2480         MMI_LWC1(%[ftmp2], %[dst], 0x00)
2481         "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]                    \n\t"
2482
             /* Store the row, then advance dst and count down h. */
2483         MMI_SWC1(%[ftmp1], %[dst], 0x00)
2484         "addiu      %[h],       %[h],      -0x01                        \n\t"
2485         PTR_ADDU   "%[dst],     %[dst],     %[stride]                   \n\t"
2486         "bnez       %[h],       1b                                      \n\t"
2487         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2488           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2489           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2490           [tmp0]"=&r"(tmp[0]),
2491           RESTRICT_ASM_LOW32
2492           RESTRICT_ASM_ADDRT
2493           [src]"+&r"(src),              [dst]"+&r"(dst),
2494           [h]"+&r"(h)
2495         : [stride]"r"((mips_reg)stride),
2496           [A]"f"(A),                    [B]"f"(B),
2497           [C]"f"(C),                    [D]"f"(D),
2498           [ff_pw_28]"f"(ff_pw_28)
2499         : "memory"
2500     );
2501 }