git.sesse.net Git - ffmpeg/blob - libavcodec/mips/vp8dsp_mmi.c

   1 /*
   2  * Loongson SIMD optimized vp8dsp
   3  *
   4  * Copyright (c) 2016 Loongson Technology Corporation Limited
   5  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "vp8dsp_mips.h"
  25 #include "constants.h"
  26 #include "libavutil/attributes.h"
  27 #include "libavutil/mips/mmiutils.h"
  28 #include "libavutil/mem_internal.h"
  29
  /*
   * Scratch variables for the asm macros in this file and the operand
   * bindings that expose them to an inline-asm statement.
   *
   * DECLARE_* expands to a local variable declaration (db_1 and db_2 as
   * double, it_1 as uint32_t).  The matching RESTRICT_ASM_* expands to a
   * named output operand of the form [name]"=&X"(name): "=&" marks a
   * write-only, early-clobber output and "f" / "r" select the register
   * class.
   *
   * %[db_1] and %[db_2] are the temporaries used by MMI_PCMPGTUB and
   * MMI_BTOH.  %[it_1] is not referenced in this part of the file
   * (presumably it is consumed by helpers from mmiutils.h -- confirm).
   */
  30 #define DECLARE_DOUBLE_1            double db_1
  31 #define DECLARE_DOUBLE_2            double db_2
  32 #define DECLARE_UINT32_T            uint32_t  it_1
  33 #define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)
  34 #define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)
  35 #define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)
  36
  /*
   * MMI_PCMPGTUB(dst, src1, src2): per-byte unsigned "greater than" mask.
   *
   * The result is assembled from the compares that are used here as
   *     (max_u(src1, src2) == src1)  XOR  (src1 == src2)
   * i.e. the "src1 >= src2" lanes with the "equal" lanes removed, which
   * leaves a mask set exactly in the lanes where src1 > src2.
   *
   * The arguments are pasted into the asm text with #, so callers pass
   * operand references such as %[ftmp0].  %[db_1] and %[db_2] are clobbered
   * and should not be passed as src1 / src2.  dst is written only by the
   * last instruction, so it may name the same register as src1 or src2.
   */
  37 #define MMI_PCMPGTUB(dst, src1, src2)                                       \
  38         "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   \
  39         "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
  40         "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   \
  41         "xor        "#dst",     %[db_2],        %[db_1]             \n\t"
  42
  /*
   * MMI_BTOH(dst_l, dst_r, src): widen eight signed bytes to halfwords.
   *
   * %[db_1] is cleared and compared against src with pcmpgtb, so %[db_2]
   * becomes a per-byte mask of the negative lanes, i.e. the byte needed to
   * sign-extend each lane.  Interleaving src with that mask puts the low
   * four bytes of src into dst_r (punpcklbh) and the high four bytes into
   * dst_l (punpckhbh).
   *
   * Clobbers %[db_1] and %[db_2].  dst_r is written before src is read for
   * dst_l, so dst_r must not be the same register as src.
   */
  43 #define MMI_BTOH(dst_l, dst_r, src)                                         \
  44         "xor        %[db_1],    %[db_1],        %[db_1]             \n\t"   \
  45         "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   \
  46         "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
  47         "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"
  48
  /*
   * MMI_VP8_LOOP_FILTER: asm text of the loop filter core.  It works on the
   * byte lanes of the pixel registers p3..p0 / q0..q3 (the names suggest
   * the pixels on either side of an edge).
   *
   * Operands the enclosing asm statement has to provide:
   *   %[p3] %[p2] %[p1] %[p0] %[q0] %[q1] %[q2] %[q3]
   *       pixel registers; p2..q2 are rewritten in place, p3 and q3 are
   *       only read.
   *   %[thresh] %[e] %[i]
   *       limits; each is moved in with dmtc1 and its lowest byte is
   *       replicated into every byte lane by the three punpckl* steps.
   *   %[hev] %[mask]
   *       computed here; %[hev] is inverted part-way through.
   *   %[ftmp0]..%[ftmp7], %[tmp0], %[db_1], %[db_2]
   *       scratch (%[ftmp7] holds 0x80 in every byte from the MBFILTER
   *       part onwards).
   *
   * Steps:
   *   1. hev = max(|p1 - p0|, |q1 - q0|) > thresh.
   *   2. 2 * |p0 - q0| (saturating) plus the shifted |p1 - q1| is compared
   *      against e.  That compare result is merged via pmaxub with
   *      |p1 - p0|, |q1 - q0|, |p3 - p2|, |p2 - p1|, |q3 - q2|, |q2 - q1|,
   *      the maximum is compared against i and the outcome is inverted to
   *      give mask.
   *   3. p2..q2 are XORed with 0x80 and treated as signed bytes.
   *      filt_val = packss((p1 - q1) + 3 * (q0 - p0)) & mask, with the sum
   *      formed in halfwords.
   *   4. With f = filt_val & hev: q0 -= PSRAB(f + 4), p0 += PSRAB(f + 3),
   *      all saturating.
   *   5. filt_val &= ~hev; for k = 27, 18, 9 the value
   *      (filt_val * k + 63) >> 7 is subtracted from q0 / q1 / q2 and added
   *      to p0 / p1 / p2 (saturating), and each pixel is XORed with 0x80
   *      again to restore its original representation.
   *
   * PSRLB_MMI / PSRAB_MMI come from mmiutils.h and are not visible here;
   * presumably they are per-byte logical / arithmetic right shifts and the
   * counts 0x09 / 0x0B encode 8 + 1 / 8 + 3 -- confirm against that header.
   * "Right part" / "Left part" below are the dst_r (low four bytes) and
   * dst_l (high four bytes) halves produced by MMI_BTOH.
   */
  49 #define MMI_VP8_LOOP_FILTER                                                 \
  50         /* hev = max(|p1 - p0|, |q1 - q0|) > thresh */                      \
  51         "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
  52         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  53         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  54         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  55         "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
  56         "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
  57         "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
  58         MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
  59         /* mask: compare against e, then against i, then invert */          \
  60         "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
  61         "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
  62         "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
  63         "li         %[tmp0],    0x09                                \n\t"   \
  64         "dmtc1      %[tmp0],    %[ftmp3]                            \n\t"   \
  65         PSRLB_MMI(%[ftmp2],  %[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp2])     \
  66         "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
  67         "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
  68         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  69         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  70         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  71         MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
  72         "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
  73         "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
  74         "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
  75         "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
  76         "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
  77         "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
  78         "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
  79         "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
  80         "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
  81         "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
  82         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  83         "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  84         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  85         MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
  86         "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
  87         "xor        %[mask],    %[mask],        %[ftmp3]            \n\t"   \
  88         /* VP8_MBFILTER: work on pixels XORed with 0x80 */                  \
  89         "li         %[tmp0],    0x80808080                          \n\t"   \
  90         "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
  91         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
  92         "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"   \
  93         "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
  94         "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
  95         "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
  96         "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
  97         "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
  98         "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
  99         "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
 100         MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
 101         MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
 102         /* Right part (low bytes): (p1 - q1) + 3 * (q0 - p0) */             \
 103         "paddh      %[ftmp5],   %[ftmp0],       %[ftmp0]            \n\t"   \
 104         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"   \
 105         "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
 106         /* Left part (high bytes) */                                        \
 107         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp1]            \n\t"   \
 108         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"   \
 109         "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
 110         /* Combine left and right part */                                   \
 111         "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
 112         "and        %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
 113         "and        %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
 114         "li         %[tmp0],    0x04040404                          \n\t"   \
 115         "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
 116         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
 117         "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
 118         "li         %[tmp0],    0x0B                                \n\t"   \
 119         "dmtc1      %[tmp0],    %[ftmp4]                            \n\t"   \
 120         PSRAB_MMI(%[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp6],  %[ftmp3])     \
 121         "li         %[tmp0],    0x03030303                          \n\t"   \
 122         "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
 123         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
 124         "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
 125         "li         %[tmp0],    0x0B                                \n\t"   \
 126         "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
 127         PSRAB_MMI(%[ftmp4],  %[ftmp2],  %[ftmp5],  %[ftmp6],  %[ftmp4])     \
 128         "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
 129         "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
 130         /* filt_val &= ~hev */                                              \
 131         "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
 132         "xor        %[hev],     %[hev],         %[ftmp0]            \n\t"   \
 133         "and        %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
 134         MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
 135         "li         %[tmp0],    0x07                                \n\t"   \
 136         "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
 137         "li         %[tmp0],    0x001b001b                          \n\t"   \
 138         "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
 139         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
 140         "li         %[tmp0],    0x003f003f                          \n\t"   \
 141         "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
 142         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
 143         /* Right part: (filt_val * 27 + 63) >> 7 */                         \
 144         "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
 145         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
 146         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 147         /* Left part */                                                     \
 148         "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
 149         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
 150         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
 151         /* Combine left and right part */                                   \
 152         "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
 153         "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
 154         "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
 155         "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
 156         "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
 157         "li         %[tmp0],    0x00120012                          \n\t"   \
 158         "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
 159         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
 160         /* Right part: (filt_val * 18 + 63) >> 7 */                         \
 161         "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
 162         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
 163         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 164         /* Left part */                                                     \
 165         "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
 166         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
 167         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
 168         /* Combine left and right part */                                   \
 169         "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
 170         "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
 171         "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
 172         "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
 173         "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
 174         "li         %[tmp0],    0x03                                \n\t"   \
 175         "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
 176         /* Right part: ((filt_val << 3) + filt_val + 63) >> 7 */            \
 177         "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
 178         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
 179         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
 180         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 181         /* Left part */                                                     \
 182         "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
 183         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
 184         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
 185         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
 186         /* Combine left and right part */                                   \
 187         "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
 188         "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
 189         "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
 190         "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
 191         "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"
 192
 /*
  * PUT_VP8_EPEL4_H6_MMI(src, dst): six-tap filter along a row, one word of
  * output.
  *
  * src and dst are address operands.  Six loads are done with MMI_ULWC1
  * (from mmiutils.h; by its name an unaligned word load) at byte offsets
  * -2 .. +3 from src.  Each is widened to halfwords by interleaving with
  * %[ftmp0] and the products are combined with saturating halfword
  * adds / subtracts as
  *     (f2*s[0] - f1*s[-1] + f0*s[-2]) + (f3*s[1] - f4*s[2] + f5*s[3])
  * where fN is %[filterN] and s[k] is the load at src + k.  %[ff_pw_64] is
  * added, the sum is shifted right by %[ftmp4], packed to unsigned bytes
  * and written to dst with MMI_SWC1.
  *
  * Expected from the enclosing asm statement: %[ftmp0] presumably holds
  * zero (it supplies the upper byte of every widened halfword), %[ftmp4]
  * holds the shift count, and %[filter1] / %[filter4] presumably hold the
  * magnitudes of the negative taps since they are subtracted -- confirm at
  * the call sites.  Clobbers %[ftmp1], %[ftmp2], %[ftmp3] and %[ftmp5].
  */
 193 #define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
 194         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 195         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 196         "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
 197         /* minus filter1 * src[-1] */                                       \
 198         MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
 199         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 200         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 201         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 202         /* plus filter0 * src[-2], giving partial sum ftmp5 */              \
 203         MMI_ULWC1(%[ftmp1], src, -0x02)                                     \
 204         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 205         "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
 206         "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
 207         /* second partial sum starts with filter3 * src[1] */               \
 208         MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
 209         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 210         "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
 211         /* minus filter4 * src[2] */                                        \
 212         MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
 213         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 214         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 215         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 216         /* plus filter5 * src[3] */                                         \
 217         MMI_ULWC1(%[ftmp1], src, 0x03)                                      \
 218         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 219         "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
 220         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 221         /* combine partial sums, add ff_pw_64, shift right by ftmp4 */      \
 222         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
 223         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
 224         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 225         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 226         /* write the packed bytes to dst */                                 \
 227         MMI_SWC1(%[ftmp1], dst, 0x00)
 228
 229
 /*
  * PUT_VP8_EPEL4_H4_MMI(src, dst): four-tap filter along a row, one word
  * of output.
  *
  * Four MMI_ULWC1 loads at byte offsets -1 .. +2 from src are widened to
  * halfwords by interleaving with %[ftmp0] and combined as
  *     (f2*s[0] - f1*s[-1]) + (f3*s[1] - f4*s[2])
  * where fN is %[filterN] and s[k] is the load at src + k.  %[ff_pw_64] is
  * added, the sum is shifted right by %[ftmp4], packed to unsigned bytes
  * and written to dst with MMI_SWC1.
  *
  * %[ftmp0] presumably holds zero and %[ftmp4] the shift count, both set
  * up by the enclosing asm statement -- confirm at the call sites.
  * Clobbers %[ftmp1], %[ftmp2], %[ftmp3] and %[ftmp5].
  *
  * NOTE(review): the filter4 term is subtracted with psubh (wrapping)
  * while every other subtract in the sibling EPEL macros visible here uses
  * psubsh (saturating) -- confirm whether that difference is intentional.
  */
 230 #define PUT_VP8_EPEL4_H4_MMI(src, dst)                                      \
 231         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 232         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 233         "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
 234         /* minus filter1 * src[-1], giving partial sum ftmp5 */             \
 235         MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
 236         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 237         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 238         "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
 239         /* second partial sum starts with filter3 * src[1] */               \
 240         MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
 241         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 242         "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
 243         /* minus filter4 * src[2] (plain psubh here, not psubsh) */         \
 244         MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
 245         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 246         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 247         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 248         /* combine both partial sums */                                     \
 249         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
 250         /* add ff_pw_64, then shift right by ftmp4 */                       \
 251         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
 252         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 253         /* pack to bytes and write to dst */                                \
 254         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 255         MMI_SWC1(%[ftmp1], dst, 0x00)
 256
 257
 /*
  * PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride): six-tap filter down a
  * column, one word of output.
  *
  * Same arithmetic as the H6 variant, but the six MMI_ULWC1 loads come
  * from the rows at src - 2*srcstride .. src + 3*srcstride:
  *     (f2*r[0] - f1*r[-1] + f0*r[-2]) + (f3*r[1] - f4*r[2] + f5*r[3])
  * where fN is %[filterN] and r[k] is the load at src + k*srcstride.
  * %[ff_pw_64] is added, the sum is shifted right by %[ftmp4], packed to
  * unsigned bytes and written to dst with MMI_SWC1.
  *
  * src1 is a scratch address register: it is recomputed from src with
  * PTR_SUBU / PTR_ADDU (defined outside this file section) before each
  * load and is left pointing at src + 3*srcstride.  src itself is not
  * modified.  %[ftmp0] presumably holds zero and %[ftmp4] the shift count
  * -- confirm at the call sites.  Clobbers %[ftmp1], %[ftmp2], %[ftmp3]
  * and %[ftmp5].
  */
 258 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)                     \
 259         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 260         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 261         "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
 262         /* minus filter1 * row at src - srcstride */                        \
 263         PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 264         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 265         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 266         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 267         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 268         /* plus filter0 * row at src - 2 * srcstride -> ftmp5 */            \
 269         PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 270         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 271         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 272         "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
 273         "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
 274         /* second partial sum: filter3 * row at src + srcstride */          \
 275         PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 276         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 277         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 278         "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
 279         /* minus filter4 * row at src + 2 * srcstride */                    \
 280         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 281         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 282         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 283         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 284         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 285         /* plus filter5 * row at src + 3 * srcstride */                     \
 286         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 287         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 288         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 289         "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
 290         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 291         /* combine both partial sums */                                     \
 292         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
 293         /* add ff_pw_64, shift right by ftmp4, pack to bytes */             \
 294         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
 295         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 296         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 297         /* write the packed bytes to dst */                                 \
 298         MMI_SWC1(%[ftmp1], dst, 0x00)
 299
 300
 /*
  * PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride): four-tap filter down a
  * column, one word of output.
  *
  * Four MMI_ULWC1 loads from the rows at src - srcstride .. src +
  * 2*srcstride are widened to halfwords by interleaving with %[ftmp0] and
  * combined with saturating halfword adds / subtracts as
  *     (f2*r[0] - f1*r[-1]) + (f3*r[1] - f4*r[2])
  * where fN is %[filterN] and r[k] is the load at src + k*srcstride.
  * %[ff_pw_64] is added, the sum is shifted right by %[ftmp4], packed to
  * unsigned bytes and written to dst with MMI_SWC1.
  *
  * src1 is a scratch address register, recomputed from src with
  * PTR_SUBU / PTR_ADDU (defined outside this file section) and left at
  * src + 2*srcstride; src itself is not modified.  %[ftmp0] presumably
  * holds zero and %[ftmp4] the shift count -- confirm at the call sites.
  * Clobbers %[ftmp1], %[ftmp2], %[ftmp3] and %[ftmp5].
  */
 301 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)                     \
 302         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 303         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 304         "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
 305         /* minus filter1 * row at src - srcstride -> ftmp5 */               \
 306         PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 307         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 308         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 309         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 310         "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
 311         /* second partial sum: filter3 * row at src + srcstride */          \
 312         PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 313         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 314         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 315         "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
 316         /* minus filter4 * row at src + 2 * srcstride */                    \
 317         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 318         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 319         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 320         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 321         "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 322         /* combine both partial sums */                                     \
 323         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
 324         /* add ff_pw_64, shift right by ftmp4, pack to bytes */             \
 325         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
 326         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 327         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 328         /* write the packed bytes to dst */                                 \
 329         MMI_SWC1(%[ftmp1], dst, 0x00)
 330
 331
 /*
  * PUT_VP8_EPEL8_H6_MMI(src, dst): six-tap filter along a row, one
  * doubleword of output.
  *
  * Doubleword counterpart of PUT_VP8_EPEL4_H6_MMI.  Each MMI_ULDC1 load
  * (from mmiutils.h; by its name an unaligned doubleword load) at byte
  * offsets -2 .. +3 from src is split into a low half (%[ftmp2],
  * punpcklbh) and a high half (%[ftmp3], punpckhbh), both widened to
  * halfwords by interleaving with %[ftmp0].  The two halves are
  * accumulated separately as
  *     (f2*s[0] - f1*s[-1] + f0*s[-2]) + (f3*s[1] - f4*s[2] + f5*s[3])
  * where fN is %[filterN] and s[k] is the load at src + k.  The first
  * bracket is kept in %[ftmp7] / %[ftmp8], the second in %[ftmp5] /
  * %[ftmp6].  %[ff_pw_64] is added, both halves are shifted right by
  * %[ftmp4], packed together to unsigned bytes and written to dst with
  * MMI_SDC1.
  *
  * %[ftmp0] presumably holds zero and %[ftmp4] the shift count -- confirm
  * at the call sites.  Clobbers %[ftmp1]..%[ftmp3] and %[ftmp5]..%[ftmp8].
  */
 332 #define PUT_VP8_EPEL8_H6_MMI(src, dst)                                      \
 333         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 334         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 335         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 336         "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
 337         "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
 338         /* minus filter1 * src[-1] */                                       \
 339         MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
 340         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 341         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 342         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 343         "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
 344         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 345         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 346         /* plus filter0 * src[-2], giving partial sums ftmp7/ftmp8 */       \
 347         MMI_ULDC1(%[ftmp1], src, -0x02)                                     \
 348         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 349         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 350         "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
 351         "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
 352         "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
 353         "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
 354         /* second partial sums start with filter3 * src[1] */               \
 355         MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
 356         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 357         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 358         "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
 359         "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
 360         /* minus filter4 * src[2] */                                        \
 361         MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
 362         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 363         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 364         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 365         "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
 366         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 367         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 368         /* plus filter5 * src[3] */                                         \
 369         MMI_ULDC1(%[ftmp1], src, 0x03)                                      \
 370         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 371         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 372         "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
 373         "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
 374         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 375         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 376         /* combine both partial sums */                                     \
 377         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
 378         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
 379         /* add ff_pw_64, shift right by ftmp4, pack to bytes */             \
 380         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
 381         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
 382         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 383         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 384         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 385         /* write the packed bytes to dst */                                 \
 386         MMI_SDC1(%[ftmp1], dst, 0x00)
 387
 388
 /*
  * PUT_VP8_EPEL8_H4_MMI(src, dst): four-tap filter along a row, one
  * doubleword of output.
  *
  * Each MMI_ULDC1 load at byte offsets -1 .. +2 from src is split into a
  * low half (%[ftmp2], punpcklbh) and a high half (%[ftmp3], punpckhbh),
  * both widened to halfwords by interleaving with %[ftmp0].  The halves
  * are accumulated separately with saturating halfword adds / subtracts as
  *     (f2*s[0] - f1*s[-1]) + (f3*s[1] - f4*s[2])
  * where fN is %[filterN] and s[k] is the load at src + k.  The first
  * bracket is kept in %[ftmp7] / %[ftmp8], the second in %[ftmp5] /
  * %[ftmp6].  %[ff_pw_64] is added, both halves are shifted right by
  * %[ftmp4], packed together to unsigned bytes and written to dst with
  * MMI_SDC1.
  *
  * %[ftmp0] presumably holds zero and %[ftmp4] the shift count -- confirm
  * at the call sites.  Clobbers %[ftmp1]..%[ftmp3] and %[ftmp5]..%[ftmp8].
  */
 389 #define PUT_VP8_EPEL8_H4_MMI(src, dst)                                      \
 390         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 391         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 392         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 393         "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
 394         "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
 395         /* minus filter1 * src[-1], giving partial sums ftmp7/ftmp8 */      \
 396         MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
 397         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 398         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 399         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 400         "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
 401         "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
 402         "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
 403         /* second partial sums start with filter3 * src[1] */               \
 404         MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
 405         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 406         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 407         "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
 408         "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
 409         /* minus filter4 * src[2] */                                        \
 410         MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
 411         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 412         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 413         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 414         "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
 415         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 416         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 417         /* combine both partial sums */                                     \
 418         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
 419         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
 420         /* add ff_pw_64, then shift right by ftmp4 */                       \
 421         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
 422         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
 423         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 424         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 425         /* pack to bytes and write to dst */                                \
 426         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 427         MMI_SDC1(%[ftmp1], dst, 0x00)
 428
 429
 /*
  * Inline-asm fragment: vertical 6-tap sub-pixel filter for one row of
  * 8 pixels.
  *
  * Loads 8 bytes at src and at src - 1, - 2, + 1, + 2 and + 3 times srcstride
  * (src1 is a scratch address register and is overwritten), unpacks each
  * load against %[ftmp0] and accumulates, with saturating 16-bit adds and
  * subtracts,
  *
  *     filter2 * s[0] - filter1 * s[-1] + filter0 * s[-2]
  *   + filter3 * s[1] - filter4 * s[2]  + filter5 * s[3] + ff_pw_64
  *
  * The sum is shifted right arithmetically by %[ftmp4], packed with
  * packushb and stored as 8 bytes at dst.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp8],
  * %[filter0]..%[filter5] and %[ff_pw_64].
  * Presumably %[ftmp0] is zero and %[ftmp4] is the shift count 7 (compare
  * the ">> 7" in FILTER_6TAP); both are set up by the caller -- confirm there.
  */
 430 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)                     \
 431         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 432         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 433         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 434         "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
 435         "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
 436                                                                             \
 437         PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 438         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 439         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 440         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 441         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 442         "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
 443         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 444         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 445                                                                             \
 446         PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 447         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 448         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 449         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 450         "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
 451         "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
 452         "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
 453         "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
 454                                                                             \
 455         PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 456         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 457         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 458         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 459         "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
 460         "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
 461                                                                             \
 462         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 463         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 464         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 465         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 466         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 467         "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
 468         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 469         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 470                                                                             \
 471         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 472         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 473         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 474         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 475         "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
 476         "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
 477         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 478         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 479                                                                             \
 480         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
 481         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
 482                                                                             \
 483         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
 484         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
 485         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 486         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 487         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 488                                                                             \
 489         MMI_SDC1(%[ftmp1], dst, 0x00)
 490
 491
 /*
  * Inline-asm fragment: vertical 4-tap sub-pixel filter for one row of
  * 8 pixels.
  *
  * Loads 8 bytes at src and at src - 1, + 1 and + 2 times srcstride (src1 is
  * a scratch address register and is overwritten), unpacks each load against
  * %[ftmp0] and accumulates, with saturating 16-bit adds and subtracts,
  *
  *     filter2 * s[0] - filter1 * s[-1]
  *   + filter3 * s[1] - filter4 * s[2] + ff_pw_64
  *
  * The sum is shifted right arithmetically by %[ftmp4], packed with
  * packushb and stored as 8 bytes at dst.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp8],
  * %[filter1]..%[filter4] and %[ff_pw_64].
  * Presumably %[ftmp0] is zero and %[ftmp4] is the shift count 7 (compare
  * the ">> 7" in FILTER_4TAP); both are set up by the caller -- confirm there.
  */
 492 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)                     \
 493         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 494         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 495         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 496         "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
 497         "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
 498                                                                             \
 499         PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 500         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 501         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 502         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 503         "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
 504         "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
 505         "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
 506         "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
 507                                                                             \
 508         PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
 509         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 510         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 511         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 512         "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
 513         "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
 514                                                                             \
 515         PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
 516         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 517         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 518         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 519         "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
 520         "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
 521         "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 522         "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 523                                                                             \
 524         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
 525         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
 526                                                                             \
 527         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
 528         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
 529         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 530         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 531         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 532                                                                             \
 533         MMI_SDC1(%[ftmp1], dst, 0x00)
 534
 535
 /*
  * Inline-asm fragment: horizontal bilinear interpolation of 8 pixels.
  *
  * Loads 8 bytes at src and 8 bytes at src + 1, unpacks both against
  * %[ftmp0] and computes per 16-bit lane
  *
  *     (a * s[x] + b * s[x + 1] + ff_pw_4) >> %[ftmp4]
  *
  * using saturating adds, then packs with packushb and stores 8 bytes at dst.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp6], %[a], %[b]
  * and %[ff_pw_4]. Presumably %[ftmp0] is zero and %[ftmp4] holds the shift
  * count matching the +4 rounding term (i.e. 3) -- confirm at the call site.
  */
 536 #define PUT_VP8_BILINEAR8_H_MMI(src, dst)                                   \
 537         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 538         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 539         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 540         "pmullh     %[ftmp5],   %[ftmp2],       %[a]                \n\t"   \
 541         "pmullh     %[ftmp6],   %[ftmp3],       %[a]                \n\t"   \
 542                                                                             \
 543         MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
 544         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 545         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 546         "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
 547         "pmullh     %[ftmp3],   %[ftmp3],       %[b]                \n\t"   \
 548         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 549         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 550                                                                             \
 551         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
 552         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
 553         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 554         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 555                                                                             \
 556         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 557         MMI_SDC1(%[ftmp1], dst, 0x00)
 558
 559
 /*
  * Inline-asm fragment: horizontal bilinear interpolation, 4-pixel variant
  * of PUT_VP8_BILINEAR8_H_MMI.
  *
  * Uses MMI_ULWC1 / MMI_SWC1 (presumably 32-bit load/store helpers -- see
  * mmiutils.h) on src, src + 1 and dst, and only the low unpack
  * (punpcklbh), so a single register of 16-bit lanes carries
  *
  *     (a * s[x] + b * s[x + 1] + ff_pw_4) >> %[ftmp4]
  *
  * The result is packed with %[ftmp0] as the second packushb source.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp4], %[a], %[b]
  * and %[ff_pw_4]. Presumably %[ftmp0] is zero and %[ftmp4] is the shift
  * count -- confirm at the call site.
  */
 560 #define PUT_VP8_BILINEAR4_H_MMI(src, dst)                                   \
 561         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 562         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 563         "pmullh     %[ftmp3],   %[ftmp2],       %[a]                \n\t"   \
 564                                                                             \
 565         MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
 566         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 567         "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
 568         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 569                                                                             \
 570         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
 571         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 572                                                                             \
 573         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 574         MMI_SWC1(%[ftmp1], dst, 0x00)
 575
 576
 /*
  * Inline-asm fragment: vertical bilinear interpolation of 8 pixels.
  *
  * Loads 8 bytes at src and 8 bytes at src + sstride (src1 is a scratch
  * address register and is overwritten), unpacks both against %[ftmp0] and
  * computes per 16-bit lane
  *
  *     (c * s[0] + d * s[sstride] + ff_pw_4) >> %[ftmp4]
  *
  * using saturating adds, then packs with packushb and stores 8 bytes at dst.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp6], %[c], %[d]
  * and %[ff_pw_4]. Presumably %[ftmp0] is zero and %[ftmp4] holds the shift
  * count matching the +4 rounding term (i.e. 3) -- confirm at the call site.
  */
 577 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)                    \
 578         MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
 579         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 580         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 581         "pmullh     %[ftmp5],   %[ftmp2],       %[c]                \n\t"   \
 582         "pmullh     %[ftmp6],   %[ftmp3],       %[c]                \n\t"   \
 583                                                                             \
 584         PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
 585         MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
 586         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 587         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
 588         "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
 589         "pmullh     %[ftmp3],   %[ftmp3],       %[d]                \n\t"   \
 590         "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
 591         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
 592                                                                             \
 593         "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
 594         "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
 595         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
 596         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
 597                                                                             \
 598         "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
 599         MMI_SDC1(%[ftmp1], dst, 0x00)
 600
 601
 /*
  * Inline-asm fragment: vertical bilinear interpolation, 4-pixel variant of
  * PUT_VP8_BILINEAR8_V_MMI.
  *
  * Uses MMI_ULWC1 / MMI_SWC1 (presumably 32-bit load/store helpers -- see
  * mmiutils.h) on src, src + sstride (via the scratch register src1) and
  * dst, and only the low unpack (punpcklbh), so a single register of 16-bit
  * lanes carries
  *
  *     (c * s[0] + d * s[sstride] + ff_pw_4) >> %[ftmp4]
  *
  * The result is packed with %[ftmp0] as the second packushb source.
  *
  * The enclosing asm statement must supply %[ftmp0]..%[ftmp4], %[c], %[d]
  * and %[ff_pw_4]. Presumably %[ftmp0] is zero and %[ftmp4] is the shift
  * count -- confirm at the call site.
  */
 602 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)                    \
 603         MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
 604         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 605         "pmullh     %[ftmp3],   %[ftmp2],       %[c]                \n\t"   \
 606                                                                             \
 607         PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
 608         MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
 609         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
 610         "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
 611         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
 612                                                                             \
 613         "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
 614         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
 615                                                                             \
 616         "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
 617         MMI_SWC1(%[ftmp1], dst, 0x00)
 618
 619
 620 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
 621    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
 622     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
 623
 624    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
 625     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
 626
 627    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
 628     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
 629
 630    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
 631     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
 632
 633    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
 634     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
 635
 636    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
 637     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
 638
 639    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
 640     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
 641 };
 642
 /*
  * Everything between this #if 0 and its #endif is compiled out.
  *
  * It holds scalar reference definitions: FILTER_6TAP / FILTER_4TAP spell
  * out the per-pixel tap arithmetic (rounded with +64, shifted by 7 and
  * looked up in cm[]), subpel_filters lists the coefficients in plain
  * decimal (the same values that fourtap_subpel_filters stores replicated
  * across 16-bit lanes), and MUL_20091 / MUL_35468 are 16.16 fixed-point
  * multiplications.
  */
 643 #if 0
 644 #define FILTER_6TAP(src, F, stride)                                           \
 645     cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
 646         F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
 647         F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
 648
 649 #define FILTER_4TAP(src, F, stride)                                           \
 650     cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
 651         F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
 652
 653 static const uint8_t subpel_filters[7][6] = {
 654     { 0,  6, 123,  12,  1, 0 },
 655     { 2, 11, 108,  36,  8, 1 },
 656     { 0,  9,  93,  50,  6, 0 },
 657     { 3, 16,  77,  77, 16, 3 },
 658     { 0,  6,  50,  93,  9, 0 },
 659     { 1,  8,  36, 108, 11, 2 },
 660     { 0,  1,  12, 123,  6, 0 },
 661 };
 662
 663 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
 664 #define MUL_35468(a)  (((a) * 35468) >> 16)
 665 #endif
 666
 /*
  * Looks up (n + 0x80) in a table named `cm` that must be in scope at the
  * point of use, then subtracts 0x80 again. The users in this file set
  * cm = ff_crop_tab + MAX_NEG_CROP; with that table this is presumably a
  * clamp of n to the signed 8-bit range (table contents are defined
  * elsewhere -- confirm).
  */
 667 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
 668 static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
 669         ptrdiff_t stride)
 670 {
 671     int av_unused p1 = p[-2 * stride];
 672     int av_unused p0 = p[-1 * stride];
 673     int av_unused q0 = p[ 0 * stride];
 674     int av_unused q1 = p[ 1 * stride];
 675     int a, f1, f2;
 676     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 677
 678     a = 3 * (q0 - p0);
 679     a += clip_int8(p1 - q1);
 680     a = clip_int8(a);
 681
 682     // We deviate from the spec here with c(a+3) >> 3
 683     // since that's what libvpx does.
 684     f1 = FFMIN(a + 4, 127) >> 3;
 685     f2 = FFMIN(a + 3, 127) >> 3;
 686
 687     // Despite what the spec says, we do need to clamp here to
 688     // be bitexact with libvpx.
 689     p[-1 * stride] = cm[p0 + f2];
 690     p[ 0 * stride] = cm[q0 - f1];
 691 }
 692
 693 static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
 694         ptrdiff_t stride)
 695 {
 696     int av_unused p1 = p[-2 * stride];
 697     int av_unused p0 = p[-1 * stride];
 698     int av_unused q0 = p[ 0 * stride];
 699     int av_unused q1 = p[ 1 * stride];
 700     int a, f1, f2;
 701     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 702
 703     a = 3 * (q0 - p0);
 704     a = clip_int8(a);
 705
 706     // We deviate from the spec here with c(a+3) >> 3
 707     // since that's what libvpx does.
 708     f1 = FFMIN(a + 4, 127) >> 3;
 709     f2 = FFMIN(a + 3, 127) >> 3;
 710
 711     // Despite what the spec says, we do need to clamp here to
 712     // be bitexact with libvpx.
 713     p[-1 * stride] = cm[p0 + f2];
 714     p[ 0 * stride] = cm[q0 - f1];
 715     a              = (f1 + 1) >> 1;
 716     p[-2 * stride] = cm[p1 + a];
 717     p[ 1 * stride] = cm[q1 - a];
 718 }
 719
 720 static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
 721         int flim)
 722 {
 723     int av_unused p1 = p[-2 * stride];
 724     int av_unused p0 = p[-1 * stride];
 725     int av_unused q0 = p[ 0 * stride];
 726     int av_unused q1 = p[ 1 * stride];
 727
 728     return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
 729 }
 730
 731 static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
 732 {
 733     int av_unused p1 = p[-2 * stride];
 734     int av_unused p0 = p[-1 * stride];
 735     int av_unused q0 = p[ 0 * stride];
 736     int av_unused q1 = p[ 1 * stride];
 737
 738     return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
 739 }
 740
 741 static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
 742 {
 743     int a0, a1, a2, w;
 744     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
 745
 746     int av_unused p2 = p[-3 * stride];
 747     int av_unused p1 = p[-2 * stride];
 748     int av_unused p0 = p[-1 * stride];
 749     int av_unused q0 = p[ 0 * stride];
 750     int av_unused q1 = p[ 1 * stride];
 751     int av_unused q2 = p[ 2 * stride];
 752
 753     w = clip_int8(p1 - q1);
 754     w = clip_int8(w + 3 * (q0 - p0));
 755
 756     a0 = (27 * w + 63) >> 7;
 757     a1 = (18 * w + 63) >> 7;
 758     a2 =  (9 * w + 63) >> 7;
 759
 760     p[-3 * stride] = cm[p2 + a2];
 761     p[-2 * stride] = cm[p1 + a1];
 762     p[-1 * stride] = cm[p0 + a0];
 763     p[ 0 * stride] = cm[q0 - a0];
 764     p[ 1 * stride] = cm[q1 - a1];
 765     p[ 2 * stride] = cm[q2 - a2];
 766 }
 767
 768 static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
 769         int E, int I)
 770 {
 771     int av_unused p3 = p[-4 * stride];
 772     int av_unused p2 = p[-3 * stride];
 773     int av_unused p1 = p[-2 * stride];
 774     int av_unused p0 = p[-1 * stride];
 775     int av_unused q0 = p[ 0 * stride];
 776     int av_unused q1 = p[ 1 * stride];
 777     int av_unused q2 = p[ 2 * stride];
 778     int av_unused q3 = p[ 3 * stride];
 779
 780     return vp8_simple_limit(p, stride, E) &&
 781            FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
 782            FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
 783            FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
 784 }
 785
 /*
  * MMI loop filter across a horizontal edge, eight pixel columns per call.
  *
  * Loads 8 bytes from each of the eight rows dst - 4 * stride through
  * dst + 3 * stride into the operands p3..p0 / q0..q3, runs
  * MMI_VP8_LOOP_FILTER (defined elsewhere) on them and writes back the six
  * rows p2..q2. The p3 and q3 rows are read but not stored.
  *
  * flim_E, flim_I and hev_thresh are passed into the asm as the %[e], %[i]
  * and %[thresh] operands; how they are applied is up to
  * MMI_VP8_LOOP_FILTER.
  *
  * NOTE(review): %[tmp0] receives row addresses from PTR_SUBU / PTR_ADDU but
  * is bound to a uint32_t variable -- confirm this is fine on 64-bit ABIs.
  */
 786 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
 787         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 788 {
 789     double ftmp[18];
 790     uint32_t tmp[1];
 791     DECLARE_DOUBLE_1;
 792     DECLARE_DOUBLE_2;
 793     DECLARE_UINT32_T;
 794     __asm__ volatile(
 795         /* Get data from dst */
 796         "gsldlc1    %[q0],      0x07(%[dst])                      \n\t"
 797         "gsldrc1    %[q0],      0x00(%[dst])                      \n\t"
 798         PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
 799         "gsldlc1    %[p0],      0x07(%[tmp0])                     \n\t"
 800         "gsldrc1    %[p0],      0x00(%[tmp0])                     \n\t"
 801         PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 802         "gsldlc1    %[p1],      0x07(%[tmp0])                     \n\t"
 803         "gsldrc1    %[p1],      0x00(%[tmp0])                     \n\t"
 804         PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 805         "gsldlc1    %[p2],      0x07(%[tmp0])                     \n\t"
 806         "gsldrc1    %[p2],      0x00(%[tmp0])                     \n\t"
 807         PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 808         "gsldlc1    %[p3],      0x07(%[tmp0])                     \n\t"
 809         "gsldrc1    %[p3],      0x00(%[tmp0])                     \n\t"
 810         PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
 811         "gsldlc1    %[q1],      0x07(%[tmp0])                     \n\t"
 812         "gsldrc1    %[q1],      0x00(%[tmp0])                     \n\t"
 813         PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 814         "gsldlc1    %[q2],      0x07(%[tmp0])                     \n\t"
 815         "gsldrc1    %[q2],      0x00(%[tmp0])                     \n\t"
 816         PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 817         "gsldlc1    %[q3],      0x07(%[tmp0])                     \n\t"
 818         "gsldrc1    %[q3],      0x00(%[tmp0])                     \n\t"
 819         MMI_VP8_LOOP_FILTER
 820         /* Move to dst */
 821         "gssdlc1    %[q0],      0x07(%[dst])                      \n\t"
 822         "gssdrc1    %[q0],      0x00(%[dst])                      \n\t"
 823         PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
 824         "gssdlc1    %[p0],      0x07(%[tmp0])                     \n\t"
 825         "gssdrc1    %[p0],      0x00(%[tmp0])                     \n\t"
 826         PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 827         "gssdlc1    %[p1],      0x07(%[tmp0])                     \n\t"
 828         "gssdrc1    %[p1],      0x00(%[tmp0])                     \n\t"
 829         PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 830         "gssdlc1    %[p2],      0x07(%[tmp0])                     \n\t"
 831         "gssdrc1    %[p2],      0x00(%[tmp0])                     \n\t"
 832         PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
 833         "gssdlc1    %[q1],      0x07(%[tmp0])                     \n\t"
 834         "gssdrc1    %[q1],      0x00(%[tmp0])                     \n\t"
 835         PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
 836         "gssdlc1    %[q2],      0x07(%[tmp0])                     \n\t"
 837         "gssdrc1    %[q2],      0x00(%[tmp0])                     \n\t"
 838         : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
 839           [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
 840           [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
 841           [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
 842           [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
 843           [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
 844           [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
 845           [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
 846           [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
 847           [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
 848           RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
 849           RESTRICT_ASM_UINT32_T
 850         : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
 851           [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
 852         : "memory"
 853     );
 854 }
 855
 856 static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
 857         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 858 {
 859     int i;
 860
 861     for (i = 0; i < 8; i++)
 862         if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
 863             int hv = hev(dst + i * 1, stride, hev_thresh);
 864             if (hv)
 865                 vp8_filter_common_is4tap(dst + i * 1, stride);
 866             else
 867                 vp8_filter_common_isnot4tap(dst + i * 1, stride);
 868         }
 869 }
 870
 /*
  * MMI loop filter across a vertical edge, eight pixel rows per call.
  *
  * For each of the eight rows dst, dst + stride, ..., dst + 7 * stride it
  * loads the 8 bytes starting 4 bytes before the row pointer. The 8x8 block
  * is transposed with TRANSPOSE_8B, passed through MMI_VP8_LOOP_FILTER
  * (defined elsewhere), transposed back and all eight rows are stored to
  * the locations they were loaded from.
  *
  * The store sequence advances %[dst] by stride after every row, which is
  * why dst is a read-write ("+&r") operand.
  *
  * flim_E, flim_I and hev_thresh are passed into the asm as the %[e], %[i]
  * and %[thresh] operands; how they are applied is up to
  * MMI_VP8_LOOP_FILTER.
  *
  * NOTE(review): %[tmp0] receives row addresses from PTR_ADDU but is bound
  * to a uint32_t variable -- confirm this is fine on 64-bit ABIs.
  */
 871 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
 872         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 873 {
 874     double ftmp[18];
 875     uint32_t tmp[1];
 876     DECLARE_DOUBLE_1;
 877     DECLARE_DOUBLE_2;
 878     DECLARE_UINT32_T;
 879     __asm__ volatile(
 880         /* Get data from dst */
 881         "gsldlc1    %[p3],        0x03(%[dst])                    \n\t"
 882         "gsldrc1    %[p3],        -0x04(%[dst])                   \n\t"
 883         PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
 884         "gsldlc1    %[p2],        0x03(%[tmp0])                   \n\t"
 885         "gsldrc1    %[p2],        -0x04(%[tmp0])                  \n\t"
 886         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 887         "gsldlc1    %[p1],        0x03(%[tmp0])                   \n\t"
 888         "gsldrc1    %[p1],        -0x04(%[tmp0])                  \n\t"
 889         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 890         "gsldlc1    %[p0],        0x03(%[tmp0])                   \n\t"
 891         "gsldrc1    %[p0],        -0x04(%[tmp0])                  \n\t"
 892         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 893         "gsldlc1    %[q0],        0x03(%[tmp0])                   \n\t"
 894         "gsldrc1    %[q0],        -0x04(%[tmp0])                  \n\t"
 895         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 896         "gsldlc1    %[q1],        0x03(%[tmp0])                   \n\t"
 897         "gsldrc1    %[q1],        -0x04(%[tmp0])                  \n\t"
 898         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 899         "gsldlc1    %[q2],        0x03(%[tmp0])                   \n\t"
 900         "gsldrc1    %[q2],        -0x04(%[tmp0])                  \n\t"
 901         PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
 902         "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
 903         "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
 904         /* Matrix transpose */
 905         TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
 906                      %[q0], %[q1], %[q2], %[q3],
 907                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 908         MMI_VP8_LOOP_FILTER
 909         /* Matrix transpose */
 910         TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
 911                      %[q0], %[q1], %[q2], %[q3],
 912                      %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
 913         /* Move to dst */
 914         "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
 915         "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
 916         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 917         "gssdlc1    %[p2],        0x03(%[dst])                    \n\t"
 918         "gssdrc1    %[p2],        -0x04(%[dst])                   \n\t"
 919         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 920         "gssdlc1    %[p1],        0x03(%[dst])                    \n\t"
 921         "gssdrc1    %[p1],        -0x04(%[dst])                   \n\t"
 922         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 923         "gssdlc1    %[p0],        0x03(%[dst])                    \n\t"
 924         "gssdrc1    %[p0],        -0x04(%[dst])                   \n\t"
 925         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 926         "gssdlc1    %[q0],        0x03(%[dst])                    \n\t"
 927         "gssdrc1    %[q0],        -0x04(%[dst])                   \n\t"
 928         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 929         "gssdlc1    %[q1],        0x03(%[dst])                    \n\t"
 930         "gssdrc1    %[q1],        -0x04(%[dst])                   \n\t"
 931         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 932         "gssdlc1    %[q2],        0x03(%[dst])                    \n\t"
 933         "gssdrc1    %[q2],        -0x04(%[dst])                   \n\t"
 934         PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
 935         "gssdlc1    %[q3],        0x03(%[dst])                    \n\t"
 936         "gssdrc1    %[q3],        -0x04(%[dst])                   \n\t"
 937         : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
 938           [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
 939           [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
 940           [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
 941           [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
 942           [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
 943           [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
 944           [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
 945           [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
 946           [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
 947           RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
 948           RESTRICT_ASM_UINT32_T
 949         : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
 950           [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
 951         : "memory"
 952     );
 953 }
 954
 /*
  * Inner-edge filter over 8 consecutive rows (dst + i * stride), with a
  * step of 1 between neighbouring samples.
  *
  * A row is only touched when vp8_normal_limit() accepts it; hev() then
  * selects vp8_filter_common_is4tap() (non-zero result) or
  * vp8_filter_common_isnot4tap() (zero result).
  */
 955 static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
 956         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
 957 {
 958     int i;
 959
 960     for (i = 0; i < 8; i++)
 961         if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
 962             int hv = hev(dst + i * stride, 1, hev_thresh);
 963             if (hv)
 964                 vp8_filter_common_is4tap(dst + i * stride, 1);
 965             else
 966                 vp8_filter_common_isnot4tap(dst + i * stride, 1);
 967         }
 968 }
 969
 /*
  * Two-pass 4x4 transform of the 16 coefficients in dc[].
  *
  * Pass 1 combines the four 4-element groups element-wise (dc[i], dc[i + 4],
  * dc[i + 8], dc[i + 12]) and writes the result back into dc[]. Pass 2
  * combines the four values inside each group, adds 3, shifts right by 3 and
  * stores each result in coefficient 0 of block[group][j]. dc[] is zeroed
  * before returning.
  *
  * The "#if 1" branch is the one that gets compiled: pass 1 and the final
  * clear use MMI inline assembly, pass 2 is plain C. The "#else" branch is
  * the scalar equivalent.
  */
 970 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
 971 {
 972 #if 1
 973     double ftmp[8];
 974     DECLARE_VAR_ALL64;
 975
         /* Pass 1 (mirrors t00/t03/t01/t02 and the dc[] stores of the #else branch). */
 976     __asm__ volatile (
             // ftmp0..ftmp3 <- dc[0..3], dc[4..7], dc[8..11], dc[12..15]
 977         MMI_LDC1(%[ftmp0], %[dc], 0x00)
 978         MMI_LDC1(%[ftmp1], %[dc], 0x08)
 979         MMI_LDC1(%[ftmp2], %[dc], 0x10)
 980         MMI_LDC1(%[ftmp3], %[dc], 0x18)
             // ftmp4 = g0 + g3, ftmp5 = g0 - g3, ftmp6 = g1 + g2, ftmp7 = g1 - g2
             // NOTE(review): paddsh/psubsh look like saturating halfword ops while
             // the #else branch uses plain int arithmetic - confirm this is fine.
 981         "paddsh     %[ftmp4],   %[ftmp0],       %[ftmp3]            \n\t"
 982         "psubsh     %[ftmp5],   %[ftmp0],       %[ftmp3]            \n\t"
 983         "paddsh     %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
 984         "psubsh     %[ftmp7],   %[ftmp1],       %[ftmp2]            \n\t"
             // new g0 = ftmp4 + ftmp6, g1 = ftmp5 + ftmp7,
             // new g2 = ftmp4 - ftmp6, g3 = ftmp5 - ftmp7
 985         "paddsh     %[ftmp0],   %[ftmp4],       %[ftmp6]            \n\t"
 986         "paddsh     %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
 987         "psubsh     %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
 988         "psubsh     %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
             // write the four groups back to dc[]
 989         MMI_SDC1(%[ftmp0], %[dc], 0x00)
 990         MMI_SDC1(%[ftmp1], %[dc], 0x08)
 991         MMI_SDC1(%[ftmp2], %[dc], 0x10)
 992         MMI_SDC1(%[ftmp3], %[dc], 0x18)
 993         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
 994           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
 995           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
 996           [ftmp6]"=&f"(ftmp[6]),
 997           RESTRICT_ASM_ALL64
 998           [ftmp7]"=&f"(ftmp[7])
 999         : [dc]"r"((uint8_t*)dc)
1000         : "memory"
1001     );
1002
         /* Pass 2: combine the four values of each group, round (+3) and scale (>> 3). */
1003     block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1004     block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1005     block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1006     block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1007
1008     block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1009     block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1010     block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1011     block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1012
1013     block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1014     block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1015     block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1016     block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1017
1018     block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1019     block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1020     block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1021     block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1022
         /*
          * Clear dc[]: four stores of a zeroed register at offsets 0x00..0x18,
          * the counterpart of the four AV_ZERO64() calls in the #else branch.
          */
1023     __asm__ volatile (
1024         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1025         MMI_SDC1(%[ftmp0], %[dc], 0x00)
1026         MMI_SDC1(%[ftmp0], %[dc], 0x08)
1027         MMI_SDC1(%[ftmp0], %[dc], 0x10)
1028         MMI_SDC1(%[ftmp0], %[dc], 0x18)
1029         : RESTRICT_ASM_ALL64
1030           [ftmp0]"=&f"(ftmp[0])
1031         : [dc]"r"((uint8_t *)dc)
1032         : "memory"
1033     );
1034 #else
         /* Scalar equivalent; t<col><k> holds the pass-1 intermediates. */
1035     int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1036
1037     t00 = dc[0] + dc[12];
1038     t10 = dc[1] + dc[13];
1039     t20 = dc[2] + dc[14];
1040     t30 = dc[3] + dc[15];
1041
1042     t03 = dc[0] - dc[12];
1043     t13 = dc[1] - dc[13];
1044     t23 = dc[2] - dc[14];
1045     t33 = dc[3] - dc[15];
1046
1047     t01 = dc[4] + dc[ 8];
1048     t11 = dc[5] + dc[ 9];
1049     t21 = dc[6] + dc[10];
1050     t31 = dc[7] + dc[11];
1051
1052     t02 = dc[4] - dc[ 8];
1053     t12 = dc[5] - dc[ 9];
1054     t22 = dc[6] - dc[10];
1055     t32 = dc[7] - dc[11];
1056
1057     dc[ 0] = t00 + t01;
1058     dc[ 1] = t10 + t11;
1059     dc[ 2] = t20 + t21;
1060     dc[ 3] = t30 + t31;
1061
1062     dc[ 4] = t03 + t02;
1063     dc[ 5] = t13 + t12;
1064     dc[ 6] = t23 + t22;
1065     dc[ 7] = t33 + t32;
1066
1067     dc[ 8] = t00 - t01;
1068     dc[ 9] = t10 - t11;
1069     dc[10] = t20 - t21;
1070     dc[11] = t30 - t31;
1071
1072     dc[12] = t03 - t02;
1073     dc[13] = t13 - t12;
1074     dc[14] = t23 - t22;
1075     dc[15] = t33 - t32;
1076
1077     block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1078     block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1079     block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1080     block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1081
1082     block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1083     block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1084     block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1085     block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1086
1087     block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1088     block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1089     block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1090     block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1091
1092     block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1093     block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1094     block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1095     block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
1096
1097     AV_ZERO64(dc + 0);
1098     AV_ZERO64(dc + 4);
1099     AV_ZERO64(dc + 8);
1100     AV_ZERO64(dc + 12);
1101 #endif
1102 }
1103
/*
 * DC-only variant of the luma DC transform: only dc[0] is read.
 *
 * (dc[0] + 3) >> 3 is written to coefficient 0 of all 16 block[i][j]
 * entries, and dc[0] is reset to 0. The other dc[] entries are left
 * untouched.
 */
1104 void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
1105 {
1106     int val = (dc[0] + 3) >> 3;
1107
1108     dc[0] = 0;
1109
1110     block[0][0][0] = val;
1111     block[0][1][0] = val;
1112     block[0][2][0] = val;
1113     block[0][3][0] = val;
1114     block[1][0][0] = val;
1115     block[1][1][0] = val;
1116     block[1][2][0] = val;
1117     block[1][3][0] = val;
1118     block[2][0][0] = val;
1119     block[2][1][0] = val;
1120     block[2][2][0] = val;
1121     block[2][3][0] = val;
1122     block[3][0][0] = val;
1123     block[3][1][0] = val;
1124     block[3][2][0] = val;
1125     block[3][3][0] = val;
1126 }
1127
/*
 * 4x4 inverse transform of block[], added to the 4x4 pixel area at dst
 * (rows dst, dst + stride, dst + 2 * stride, dst + 3 * stride).
 *
 * block[0..15] is cleared as a side effect. The "#if 1" branch is the MMI
 * implementation that gets compiled; the "#else" branch is the scalar
 * equivalent (two passes, then (x + 4) >> 3 and av_clip_uint8()).
 */
1128 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1129 {
1130 #if 1
         /*
          * 0x4e7b == 20091 and 0x22a3 == 8867 == 35468 / 4, replicated into
          * four 16-bit lanes. MUL_20091(x) is formed below as
          * pmulhh(x, 0x4e7b) + x, MUL_35468(x) as pmulhh(x << 2, 0x22a3).
          */
1131     DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
1132     DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
1133     double ftmp[12];
1134     uint32_t tmp[1];
1135     DECLARE_VAR_LOW32;
1136     DECLARE_VAR_ALL64;
1137
1138     __asm__ volatile (
             // ftmp0 = 0 (used to clear block[] and to widen/narrow pixels)
1139         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
             // ftmp1..ftmp4 <- block[0...3], [4...7], [8...11], [12...15]
1140         MMI_LDC1(%[ftmp1], %[block], 0x00)
1141         MMI_LDC1(%[ftmp2], %[block], 0x08)
1142         MMI_LDC1(%[ftmp3], %[block], 0x10)
1143         MMI_LDC1(%[ftmp4], %[block], 0x18)
1144
             // ftmp11 = 2: left-shift count used for the MUL_35468 products
1145         "li         %[tmp0],    0x02                                \n\t"
1146         "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1147
1148         // block[0...3] + block[8...11]
1149         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1150         // block[0...3] - block[8...11]
1151         "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1152         // MUL_35468(block[12...15])
1153         "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1154         "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1155         // MUL_35468(block[4...7])
1156         "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1157         "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1158         // MUL_20091(block[4...7])
1159         "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1160         "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
1161         // MUL_20091(block[12...15])
1162         "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1163         "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
1164
1165         // tmp[0 4  8 12]
1166         "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
1167         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
1168         // tmp[1 5  9 13]
1169         "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
1170         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
1171         // tmp[2 6 10 14]
1172         "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
1173         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
1174         // tmp[3 7 11 15]
1175         "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
1176         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
1177
             // clear block[0...15] (ftmp0 is still zero)
1178         MMI_SDC1(%[ftmp0], %[block], 0x00)
1179         MMI_SDC1(%[ftmp0], %[block], 0x08)
1180         MMI_SDC1(%[ftmp0], %[block], 0x10)
1181         MMI_SDC1(%[ftmp0], %[block], 0x18)
1182
1183         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1184                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1185
1186         // t[0 4  8 12]
1187         "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
1188         // t[1 5  9 13]
1189         "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
1190         // t[2 6 10 14]
1191         "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
1192         "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1193         "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
1194         "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
1195         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
1196         // t[3 7 11 15]
1197         "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
1198         "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
1199         "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
1200         "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
1201         "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
1202
             // ftmp11 = 3: each result below is (x + ff_pw_4) >> 3, the
             // counterpart of "(... + 4) >> 3" in the #else branch
1203         "li         %[tmp0],    0x03                                \n\t"
1204         "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
1205         "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
1206         "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_4]          \n\t"
1207         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
1208         "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
1209         "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_4]          \n\t"
1210         "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
1211         "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
1212         "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"
1213         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
1214         "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
1215         "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_4]          \n\t"
1216         "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
1217
1218         TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1219                      %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
1220
             // load 4 pixels from each of the 4 destination rows
1221         MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1222         MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1223         MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1224         MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1225
             // widen the pixels to 16 bit by interleaving with the zero register
1226         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
1227         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
1228         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
1229         "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
1230
             // add the residual to the pixels
1231         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1232         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
1233         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
1234         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
1235
             // narrow back to bytes; presumably with unsigned saturation, which
             // would match av_clip_uint8() in the #else branch - TODO confirm
1236         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1237         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1238         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1239         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
1240
             // store the 4 result rows
1241         MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1242         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1243         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1244         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1245         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1246           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1247           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1248           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1249           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1250           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
1251           RESTRICT_ASM_LOW32
1252           RESTRICT_ASM_ALL64
1253           [tmp0]"=&r"(tmp[0])
1254         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1255           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1256           [block]"r"(block),                [ff_pw_4]"f"(ff_pw_4),
1257           [ff_ph_4e7b]"f"(ff_ph_4e7b),      [ff_ph_22a3]"f"(ff_ph_22a3)
1258         : "memory"
1259     );
1260 #else
1261     int i, t0, t1, t2, t3;
1262     int16_t tmp[16];
1263
         /* First pass: read block[] column-wise, clear it, write tmp[] transposed. */
1264     for (i = 0; i < 4; i++) {
1265         t0 = block[0 + i] + block[8 + i];
1266         t1 = block[0 + i] - block[8 + i];
1267         t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1268         t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1269         block[ 0 + i] = 0;
1270         block[ 4 + i] = 0;
1271         block[ 8 + i] = 0;
1272         block[12 + i] = 0;
1273
1274         tmp[i * 4 + 0] = t0 + t3;
1275         tmp[i * 4 + 1] = t1 + t2;
1276         tmp[i * 4 + 2] = t1 - t2;
1277         tmp[i * 4 + 3] = t0 - t3;
1278     }
1279
         /* Second pass: one output row per iteration, rounded, added to dst and clipped. */
1280     for (i = 0; i < 4; i++) {
1281         t0 = tmp[0 + i] + tmp[8 + i];
1282         t1 = tmp[0 + i] - tmp[8 + i];
1283         t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1284         t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1285
1286         dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1287         dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1288         dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1289         dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
1290         dst   += stride;
1291     }
1292 #endif
1293 }
1294
/*
 * DC-only inverse transform: adds dc = (block[0] + 4) >> 3 to every pixel
 * of the 4x4 area at dst (4 rows, stride bytes apart) and resets block[0]
 * to 0. Only block[0] is read or written.
 *
 * The "#if 1" branch is the MMI implementation that gets compiled; the
 * "#else" branch is the scalar equivalent using av_clip_uint8().
 */
1295 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1296 {
1297 #if 1
1298     int dc = (block[0] + 4) >> 3;
1299     double ftmp[6];
1300     DECLARE_VAR_LOW32;
1301
1302     block[0] = 0;
1303
1304     __asm__ volatile (
             // ftmp0 = 0, ftmp5 = dc (moved in from the integer register)
1305         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1306         "mtc1       %[dc],      %[ftmp5]                            \n\t"
             // load 4 pixels from each of the 4 destination rows
1307         MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1308         MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1309         MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1310         MMI_LWC1(%[ftmp4], %[dst3], 0x00)
             // shuffle with an all-zero selector - presumably broadcasts the low
             // halfword (dc) into all four lanes; TODO confirm
1311         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
             // widen the pixels to 16 bit by interleaving with the zero register
1312         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1313         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1314         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1315         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
             // pixel + dc
1316         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
1317         "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
1318         "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
1319         "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
             // narrow back to bytes (counterpart of av_clip_uint8() below)
1320         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
1321         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
1322         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
1323         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
             // store the 4 result rows
1324         MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1325         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1326         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1327         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1328         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1329           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1330           [ftmp4]"=&f"(ftmp[4]),
1331           RESTRICT_ASM_LOW32
1332           [ftmp5]"=&f"(ftmp[5])
1333         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
1334           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
1335           [dc]"r"(dc)
1336         : "memory"
1337     );
1338 #else
1339     int i, dc = (block[0] + 4) >> 3;
1340
1341     block[0] = 0;
1342
1343     for (i = 0; i < 4; i++) {
1344         dst[0] = av_clip_uint8(dst[0] + dc);
1345         dst[1] = av_clip_uint8(dst[1] + dc);
1346         dst[2] = av_clip_uint8(dst[2] + dc);
1347         dst[3] = av_clip_uint8(dst[3] + dc);
1348         dst   += stride;
1349     }
1350 #endif
1351 }
1352
/*
 * Runs ff_vp8_idct_dc_add_mmi() on four 4x4 blocks laid out side by side
 * in one row: block[k] goes to dst + 4 * k, for k = 0..3.
 */
1353 void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
1354         ptrdiff_t stride)
1355 {
1356     ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
1357     ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
1358     ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
1359     ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
1360 }
1361
/*
 * Runs ff_vp8_idct_dc_add_mmi() on four 4x4 blocks arranged as a 2x2 grid:
 * block[0] and block[1] cover the top row (columns 0 and 4), block[2] and
 * block[3] the row starting 4 lines further down.
 */
1362 void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
1363         ptrdiff_t stride)
1364 {
1365     ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
1366     ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
1367     ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
1368     ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
1369 }
1370
1371 // loop filter applied to edges between macroblocks
/*
 * 16-wide variant built from two vp8_v_loop_filter8_mmi() calls: one at
 * dst and one 8 bytes to the right (dst + 8), with identical thresholds.
 */
1372 void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1373         int flim_I, int hev_thresh)
1374 {
1375     vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1376     vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
1377 }
1378
/*
 * 16-row variant built from two vp8_h_loop_filter8_mmi() calls: one at dst
 * and one 8 rows further down (dst + 8 * stride), with identical thresholds.
 */
1379 void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
1380         int flim_I, int hev_thresh)
1381 {
1382     vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
1383     vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
1384                            hev_thresh);
1385 }
1386
/*
 * Applies vp8_v_loop_filter8_mmi() to dstU and then to dstV, using the same
 * stride and thresholds for both.
 */
1387 void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1388         int flim_E, int flim_I, int hev_thresh)
1389 {
1390     vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1391     vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1392 }
1393
/*
 * Applies vp8_h_loop_filter8_mmi() to dstU and then to dstV, using the same
 * stride and thresholds for both.
 */
1394 void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
1395         int flim_E, int flim_I, int hev_thresh)
1396 {
1397     vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1398     vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1399 }
1400
1401 // loop filter applied to inner macroblock edges
/*
 * Inner-edge filter over 16 adjacent columns (dst + i), with a step of
 * stride between neighbouring samples.
 *
 * A column is only touched when vp8_normal_limit() accepts it; hev() then
 * selects vp8_filter_common_is4tap() (non-zero result) or
 * vp8_filter_common_isnot4tap() (zero result).
 */
1402 void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1403         int flim_E, int flim_I, int hev_thresh)
1404 {
1405     int i;
1406
1407     for (i = 0; i < 16; i++)
1408         if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
1409             int hv = hev(dst + i * 1, stride, hev_thresh);
1410             if (hv)
1411                 vp8_filter_common_is4tap(dst + i * 1, stride);
1412             else
1413                 vp8_filter_common_isnot4tap(dst + i * 1, stride);
1414         }
1415 }
1416
/*
 * Inner-edge filter over 16 consecutive rows (dst + i * stride), with a
 * step of 1 between neighbouring samples.
 *
 * A row is only touched when vp8_normal_limit() accepts it; hev() then
 * selects vp8_filter_common_is4tap() (non-zero result) or
 * vp8_filter_common_isnot4tap() (zero result).
 */
1417 void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
1418         int flim_E, int flim_I, int hev_thresh)
1419 {
1420     int i;
1421
1422     for (i = 0; i < 16; i++)
1423         if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
1424             int hv = hev(dst + i * stride, 1, hev_thresh);
1425             if (hv)
1426                 vp8_filter_common_is4tap(dst + i * stride, 1);
1427             else
1428                 vp8_filter_common_isnot4tap(dst + i * stride, 1);
1429         }
1430 }
1431
/*
 * Applies vp8_v_loop_filter8_inner_mmi() to dstU and then to dstV, using
 * the same stride and thresholds for both.
 */
1432 void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1433         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1434 {
1435     vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1436     vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1437 }
1438
/*
 * Applies vp8_h_loop_filter8_inner_mmi() to dstU and then to dstV, using
 * the same stride and thresholds for both.
 */
1439 void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
1440         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
1441 {
1442     vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
1443     vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
1444 }
1445
/*
 * Simple filter over 16 adjacent columns (dst + i), with a step of stride
 * between neighbouring samples: vp8_filter_common_is4tap() is applied to
 * every column for which vp8_simple_limit() passes with the given flim.
 */
1446 void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1447 {
1448     int i;
1449
1450     for (i = 0; i < 16; i++)
1451         if (vp8_simple_limit(dst + i, stride, flim))
1452             vp8_filter_common_is4tap(dst + i, stride);
1453 }
1454
/*
 * Simple filter over 16 consecutive rows (dst + i * stride), with a step
 * of 1 between neighbouring samples: vp8_filter_common_is4tap() is applied
 * to every row for which vp8_simple_limit() passes with the given flim.
 */
1455 void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
1456 {
1457     int i;
1458
1459     for (i = 0; i < 16; i++)
1460         if (vp8_simple_limit(dst + i * stride, 1, flim))
1461             vp8_filter_common_is4tap(dst + i * stride, 1);
1462 }
1463
/*
 * Copies h rows of 16 bytes from src to dst (rows srcstride / dststride
 * bytes apart). x and y are not used.
 *
 * The compiled ("#if 1") path copies two rows per loop iteration and
 * decrements h by 2 until it is exactly zero, so it relies on h being a
 * positive even number. The "#else" branch is the one-row-at-a-time
 * memcpy() equivalent.
 */
1464 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1465         ptrdiff_t srcstride, int h, int x, int y)
1466 {
1467 #if 1
1468     double ftmp[2];
1469     uint64_t tmp[2];
1470     mips_reg addr[2];
1471     DECLARE_VAR_ALL64;
1472
1473     __asm__ volatile (
1474         "1:                                                         \n\t"
             // addr0 = second source row of this iteration
1475         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
             // row 0: bytes 0-7 via an FP register, bytes 8-15 via a GPR (ldl/ldr)
1476         MMI_ULDC1(%[ftmp0], %[src], 0x00)
1477         "ldl        %[tmp0],    0x0f(%[src])                        \n\t"
1478         "ldr        %[tmp0],    0x08(%[src])                        \n\t"
             // row 1: same split, read from addr0
1479         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1480         "ldl        %[tmp1],    0x0f(%[addr0])                      \n\t"
1481         "ldr        %[tmp1],    0x08(%[addr0])                      \n\t"
             // addr1 = second destination row of this iteration
1482         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
             // NOTE(review): sources use the unaligned load macro, the low half
             // of each destination row uses MMI_SDC1 - confirm dst alignment
1483         MMI_SDC1(%[ftmp0], %[dst], 0x00)
1484         "sdl        %[tmp0],    0x0f(%[dst])                        \n\t"
1485         "sdr        %[tmp0],    0x08(%[dst])                        \n\t"
             // two rows are consumed per iteration
1486         "addiu      %[h],       %[h],           -0x02               \n\t"
1487         MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1488         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1489         "sdl        %[tmp1],    0x0f(%[addr1])                      \n\t"
1490         "sdr        %[tmp1],    0x08(%[addr1])                      \n\t"
1491         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1492         "bnez       %[h],       1b                                  \n\t"
1493         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1494           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
1495           RESTRICT_ASM_ALL64
1496           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1497           [dst]"+&r"(dst),                  [src]"+&r"(src),
1498           [h]"+&r"(h)
1499         : [dststride]"r"((mips_reg)dststride),
1500           [srcstride]"r"((mips_reg)srcstride)
1501         : "memory"
1502     );
1503 #else
1504     int i;
1505
1506     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1507         memcpy(dst, src, 16);
1508 #endif
1509 }
1510
/*
 * Copies h rows of 8 bytes from src to dst (rows srcstride / dststride
 * bytes apart). x and y are not used.
 *
 * The compiled ("#if 1") path copies two rows per loop iteration and
 * decrements h by 2 until it is exactly zero, so it relies on h being a
 * positive even number. The "#else" branch is the one-row-at-a-time
 * memcpy() equivalent.
 */
1511 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1512         ptrdiff_t srcstride, int h, int x, int y)
1513 {
1514 #if 1
1515     double ftmp[1];
1516     uint64_t tmp[1];
1517     mips_reg addr[2];
1518     DECLARE_VAR_ALL64;
1519
1520     __asm__ volatile (
1521         "1:                                                         \n\t"
             // addr0 = second source row of this iteration
1522         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
             // row 0 travels through an FP register, row 1 through a GPR (ldl/ldr)
1523         MMI_ULDC1(%[ftmp0], %[src], 0x00)
1524         "ldl        %[tmp0],    0x07(%[addr0])                      \n\t"
1525         "ldr        %[tmp0],    0x00(%[addr0])                      \n\t"
             // addr1 = second destination row of this iteration
1526         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1527         MMI_SDC1(%[ftmp0], %[dst], 0x00)
             // two rows are consumed per iteration
1528         "addiu      %[h],       %[h],           -0x02               \n\t"
1529         "sdl        %[tmp0],    0x07(%[addr1])                      \n\t"
1530         "sdr        %[tmp0],    0x00(%[addr1])                      \n\t"
1531         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1532         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1533         "bnez       %[h],       1b                                  \n\t"
1534         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1535           RESTRICT_ASM_ALL64
1536           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1537           [dst]"+&r"(dst),                  [src]"+&r"(src),
1538           [h]"+&r"(h)
1539         : [dststride]"r"((mips_reg)dststride),
1540           [srcstride]"r"((mips_reg)srcstride)
1541         : "memory"
1542     );
1543 #else
1544     int i;
1545
1546     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1547         memcpy(dst, src, 8);
1548 #endif
1549 }
1550
/*
 * Copies h rows of 4 bytes from src to dst (rows srcstride / dststride
 * bytes apart). x and y are not used.
 *
 * The compiled ("#if 1") path copies two rows per loop iteration and
 * decrements h by 2 until it is exactly zero, so it relies on h being a
 * positive even number. The "#else" branch is the one-row-at-a-time
 * memcpy() equivalent.
 */
1551 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1552         ptrdiff_t srcstride, int h, int x, int y)
1553 {
1554 #if 1
1555     double ftmp[1];
1556     uint64_t tmp[1];
1557     mips_reg addr[2];
1558     DECLARE_VAR_LOW32;
1559
1560     __asm__ volatile (
1561         "1:                                                         \n\t"
             // addr0 = second source row of this iteration
1562         PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
             // row 0 travels through an FP register, row 1 through a GPR (lwl/lwr)
1563         MMI_LWC1(%[ftmp0], %[src], 0x00)
1564         "lwl        %[tmp0],    0x03(%[addr0])                      \n\t"
1565         "lwr        %[tmp0],    0x00(%[addr0])                      \n\t"
             // addr1 = second destination row of this iteration
1566         PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
1567         MMI_SWC1(%[ftmp0], %[dst], 0x00)
             // two rows are consumed per iteration
1568         "addiu      %[h],       %[h],           -0x02               \n\t"
1569         "swl        %[tmp0],    0x03(%[addr1])                      \n\t"
1570         "swr        %[tmp0],    0x00(%[addr1])                      \n\t"
1571         PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
1572         PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
1573         "bnez       %[h],       1b                                  \n\t"
1574         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
1575           RESTRICT_ASM_LOW32
1576           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1577           [dst]"+&r"(dst),                  [src]"+&r"(src),
1578           [h]"+&r"(h)
1579         : [dststride]"r"((mips_reg)dststride),
1580           [srcstride]"r"((mips_reg)srcstride)
1581         : "memory"
1582     );
1583 #else
1584     int i;
1585
1586     for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1587         memcpy(dst, src, 4);
1588 #endif
1589 }
1590
/*
 * Horizontal 4-tap filter over h rows of 16 pixels, written to dst.
 *
 * The taps come from fourtap_subpel_filters[mx - 1], so mx is expected to
 * be at least 1 (the upper bound is not visible here). my is not used.
 * The compiled ("#if 1") path processes one row per loop iteration as two
 * 8-pixel halves and stops when h reaches zero, so h must be positive.
 * The "#else" branch is the scalar equivalent built on FILTER_4TAP().
 */
1591 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1592         ptrdiff_t srcstride, int h, int mx, int my)
1593 {
1594 #if 1
1595     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1596     double ftmp[9];
1597     uint32_t tmp[1];
1598     mips_reg src1, dst1;
1599     DECLARE_VAR_ALL64;
1600
1601     /*
1602     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1603     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1604     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1605     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1606     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1607     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1608     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1609     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1610
1611     dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1612     dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1613     dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1614     dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1615     dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1616     dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1617     dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1618     dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1619     */
1620     __asm__ volatile (
             // ftmp0 = 0 and ftmp4 = 7 are set once before the loop; presumably
             // PUT_VP8_EPEL8_H4_MMI (not visible here) uses them as the zero
             // register and the ">> 7" shift count - TODO confirm
1621         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1622         "li         %[tmp0],    0x07                                \n\t"
1623         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1624
1625         "1:                                                         \n\t"
1626         // 0 - 7
1627         PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1628         PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1629         PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1630         // 8 - 15
1631         PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1632
             // next row; loop until h reaches zero
1633         "addiu      %[h],       %[h],           -0x01               \n\t"
1634         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1635         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1636         "bnez       %[h],       1b                                  \n\t"
1637         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1638           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1639           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1640           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1641           [ftmp8]"=&f"(ftmp[8]),
1642           [tmp0]"=&r"(tmp[0]),
1643           RESTRICT_ASM_ALL64
1644           [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1645           [h]"+&r"(h),
1646           [dst]"+&r"(dst),                  [src]"+&r"(src)
1647         : [ff_pw_64]"f"(ff_pw_64),
1648           [srcstride]"r"((mips_reg)srcstride),
1649           [dststride]"r"((mips_reg)dststride),
1650           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
1651           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
1652         : "memory"
1653     );
1654 #else
1655     const uint8_t *filter = subpel_filters[mx - 1];
1656     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1657     int x, y;
1658
1659     for (y = 0; y < h; y++) {
1660         for (x = 0; x < 16; x++)
1661             dst[x] = FILTER_4TAP(src, filter, 1);
1662         dst += dststride;
1663         src += srcstride;
1664     }
1665 #endif
1666 }
1667
/*
 * Horizontal 4-tap VP8 sub-pixel (epel) prediction for an 8-pixel-wide
 * block, implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references src[-1] .. src[9].
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: horizontal sub-pixel position; selects row mx - 1 of
 *     fourtap_subpel_filters.
 * my: not referenced by this function.
 */
1668 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1669         ptrdiff_t srcstride, int h, int mx, int my)
1670 {
1671 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out. */
1672     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1673     double ftmp[9];
1674     uint32_t tmp[1];
1675     DECLARE_VAR_ALL64;
1676
         /* Per-pixel result the assembly is meant to produce for one row: */
1677     /*
1678     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1679     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1680     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1681     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1682     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1683     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1684     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1685     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1686     */
1687     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL8_H4_MMI. */
1688         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1689         "li         %[tmp0],    0x07                                \n\t"
1690         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1691
             /* One loop iteration per row. */
1692         "1:                                                         \n\t"
1693         PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1694
             /* --h, step src/dst to the next row, repeat while h != 0. */
1695         "addiu      %[h],       %[h],           -0x01               \n\t"
1696         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1697         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1698         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
1699         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1700           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1701           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1702           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1703           [ftmp8]"=&f"(ftmp[8]),
1704           [tmp0]"=&r"(tmp[0]),
1705           RESTRICT_ASM_ALL64
1706           [h]"+&r"(h),
1707           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the four filter taps,
              * the taps being passed in floating-point ("f") registers. */
1708         : [ff_pw_64]"f"(ff_pw_64),
1709           [srcstride]"r"((mips_reg)srcstride),
1710           [dststride]"r"((mips_reg)dststride),
1711           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
1712           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
1713         : "memory"
1714     );
1715 #else
         /* Plain-C version (compiled out): 8 pixels per row via FILTER_4TAP
          * with a horizontal step of 1. */
1716     const uint8_t *filter = subpel_filters[mx - 1];
1717     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1718     int x, y;
1719
1720     for (y = 0; y < h; y++) {
1721         for (x = 0; x < 8; x++)
1722             dst[x] = FILTER_4TAP(src, filter, 1);
1723         dst += dststride;
1724         src += srcstride;
1725     }
1726 #endif
1727 }
1728
/*
 * Horizontal 4-tap VP8 sub-pixel (epel) prediction for a 4-pixel-wide
 * block, implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references src[-1] .. src[5].
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: horizontal sub-pixel position; selects row mx - 1 of
 *     fourtap_subpel_filters.
 * my: not referenced by this function.
 */
1729 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1730         ptrdiff_t srcstride, int h, int mx, int my)
1731 {
1732 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out. */
1733     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1734     double ftmp[6];
1735     uint32_t tmp[1];
1736     DECLARE_VAR_LOW32;
1737
         /* Per-pixel result the assembly is meant to produce for one row: */
1738     /*
1739     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1740     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1741     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1742     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1743     */
1744     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL4_H4_MMI. */
1745         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1746         "li         %[tmp0],    0x07                                \n\t"
1747         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1748
             /* One loop iteration per row. */
1749         "1:                                                         \n\t"
1750         PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1751
             /* --h, step src/dst to the next row, repeat while h != 0. */
1752         "addiu      %[h],       %[h],           -0x01               \n\t"
1753         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1754         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1755         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
1756         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1757           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1758           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1759           [tmp0]"=&r"(tmp[0]),
1760           RESTRICT_ASM_LOW32
1761           [h]"+&r"(h),
1762           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the four filter taps,
              * the taps being passed in floating-point ("f") registers. */
1763         : [ff_pw_64]"f"(ff_pw_64),
1764           [srcstride]"r"((mips_reg)srcstride),
1765           [dststride]"r"((mips_reg)dststride),
1766           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
1767           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
1768         : "memory"
1769     );
1770 #else
         /* Plain-C version (compiled out): 4 pixels per row via FILTER_4TAP
          * with a horizontal step of 1. */
1771     const uint8_t *filter = subpel_filters[mx - 1];
1772     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1773     int x, y;
1774
1775     for (y = 0; y < h; y++) {
1776         for (x = 0; x < 4; x++)
1777             dst[x] = FILTER_4TAP(src, filter, 1);
1778         dst += dststride;
1779         src += srcstride;
1780     }
1781 #endif
1782 }
1783
/*
 * Horizontal 6-tap VP8 sub-pixel (epel) prediction for a 16-pixel-wide
 * block, implemented with Loongson MMI inline assembly. Each row is handled
 * as two 8-pixel halves.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references src[-2] .. src[18].
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: horizontal sub-pixel position; selects row mx - 1 of
 *     fourtap_subpel_filters.
 * my: not referenced by this function.
 */
1784 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1785         ptrdiff_t srcstride, int h, int mx, int my)
1786 {
1787 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out.
          * Despite the table's name, all six entries filter[0..5] of the
          * selected row are used here. */
1788     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1789     double ftmp[9];
1790     uint32_t tmp[1];
1791     mips_reg src1, dst1;
1792     DECLARE_VAR_ALL64;
1793
         /* Per-pixel result the assembly is meant to produce for one row: */
1794     /*
1795     dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1796     dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1797     dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1798     dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1799     dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1800     dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1801     dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1802     dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1803
1804     dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1805     dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1806     dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1807     dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1808     dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1809     dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1810     dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1811     dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1812     */
1813     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL8_H6_MMI. */
1814         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1815         "li         %[tmp0],    0x07                                \n\t"
1816         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1817
             /* One loop iteration per row. */
1818         "1:                                                         \n\t"
1819         // 0 - 7
1820         PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
             /* src1/dst1 = src/dst + 8: addresses of the right 8-pixel half. */
1821         PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
1822         PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
1823         // 8 - 15
1824         PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1825
             /* --h, step src/dst to the next row, repeat while h != 0. */
1826         "addiu      %[h],       %[h],           -0x01               \n\t"
1827         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1828         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1829         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
1830         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1831           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1832           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1833           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1834           [ftmp8]"=&f"(ftmp[8]),
1835           [tmp0]"=&r"(tmp[0]),
1836           RESTRICT_ASM_ALL64
1837           [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
1838           [h]"+&r"(h),
1839           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the six filter taps,
              * the taps being passed in floating-point ("f") registers. */
1840         : [ff_pw_64]"f"(ff_pw_64),
1841           [srcstride]"r"((mips_reg)srcstride),
1842           [dststride]"r"((mips_reg)dststride),
1843           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
1844           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
1845           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
1846         : "memory"
1847     );
1848 #else
         /* Plain-C version (compiled out): 16 pixels per row via FILTER_6TAP
          * with a horizontal step of 1. */
1849     const uint8_t *filter = subpel_filters[mx - 1];
1850     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1851     int x, y;
1852
1853     for (y = 0; y < h; y++) {
1854         for (x = 0; x < 16; x++)
1855             dst[x] = FILTER_6TAP(src, filter, 1);
1856         dst += dststride;
1857         src += srcstride;
1858     }
1859 #endif
1860 }
1861
/*
 * Horizontal 6-tap VP8 sub-pixel (epel) prediction for an 8-pixel-wide
 * block, implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references src[-2] .. src[10].
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: horizontal sub-pixel position; selects row mx - 1 of
 *     fourtap_subpel_filters.
 * my: not referenced by this function.
 */
1862 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1863         ptrdiff_t srcstride, int h, int mx, int my)
1864 {
1865 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out.
          * Despite the table's name, all six entries filter[0..5] of the
          * selected row are used here. */
1866     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1867     double ftmp[9];
1868     uint32_t tmp[1];
1869     DECLARE_VAR_ALL64;
1870
         /* Per-pixel result the assembly is meant to produce for one row: */
1871     /*
1872     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1873     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1874     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1875     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1876     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1877     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1878     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1879     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1880     */
1881     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL8_H6_MMI. */
1882         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1883         "li         %[tmp0],    0x07                                \n\t"
1884         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1885
             /* One loop iteration per row. */
1886         "1:                                                         \n\t"
1887         PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1888
             /* --h, step src/dst to the next row, repeat while h != 0. */
1889         "addiu      %[h],       %[h],           -0x01               \n\t"
1890         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1891         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1892         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
1893         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1894           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1895           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1896           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1897           [ftmp8]"=&f"(ftmp[8]),
1898           [tmp0]"=&r"(tmp[0]),
1899           RESTRICT_ASM_ALL64
1900           [h]"+&r"(h),
1901           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the six filter taps,
              * the taps being passed in floating-point ("f") registers. */
1902         : [ff_pw_64]"f"(ff_pw_64),
1903           [srcstride]"r"((mips_reg)srcstride),
1904           [dststride]"r"((mips_reg)dststride),
1905           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
1906           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
1907           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
1908         : "memory"
1909     );
1910 #else
         /* Plain-C version (compiled out): 8 pixels per row via FILTER_6TAP
          * with a horizontal step of 1. */
1911     const uint8_t *filter = subpel_filters[mx - 1];
1912     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1913     int x, y;
1914
1915     for (y = 0; y < h; y++) {
1916         for (x = 0; x < 8; x++)
1917             dst[x] = FILTER_6TAP(src, filter, 1);
1918         dst += dststride;
1919         src += srcstride;
1920     }
1921 #endif
1922 }
1923
/*
 * Horizontal 6-tap VP8 sub-pixel (epel) prediction for a 4-pixel-wide
 * block, implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references src[-2] .. src[6].
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: horizontal sub-pixel position; selects row mx - 1 of
 *     fourtap_subpel_filters.
 * my: not referenced by this function.
 */
1924 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1925         ptrdiff_t srcstride, int h, int mx, int my)
1926 {
1927 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out.
          * Despite the table's name, all six entries filter[0..5] of the
          * selected row are used here. */
1928     const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1929     double ftmp[6];
1930     uint32_t tmp[1];
1931     DECLARE_VAR_LOW32;
1932
         /* Per-pixel result the assembly is meant to produce for one row: */
1933     /*
1934     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1935     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1936     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1937     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1938     */
1939     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL4_H6_MMI. */
1940         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
1941         "li         %[tmp0],    0x07                                \n\t"
1942         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
1943
             /* One loop iteration per row. */
1944         "1:                                                         \n\t"
1945         PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1946
             /* --h, step src/dst to the next row, repeat while h != 0. */
1947         "addiu      %[h],       %[h],           -0x01               \n\t"
1948         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
1949         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
1950         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
1951         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1952           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1953           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1954           [tmp0]"=&r"(tmp[0]),
1955           RESTRICT_ASM_LOW32
1956           [h]"+&r"(h),
1957           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the six filter taps,
              * the taps being passed in floating-point ("f") registers. */
1958         : [ff_pw_64]"f"(ff_pw_64),
1959           [srcstride]"r"((mips_reg)srcstride),
1960           [dststride]"r"((mips_reg)dststride),
1961           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
1962           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
1963           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
1964         : "memory"
1965     );
1966 #else
         /* Plain-C version (compiled out): 4 pixels per row via FILTER_6TAP
          * with a horizontal step of 1. */
1967     const uint8_t *filter = subpel_filters[mx - 1];
1968     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
1969     int x, y;
1970
1971     for (y = 0; y < h; y++) {
1972         for (x = 0; x < 4; x++)
1973             dst[x] = FILTER_6TAP(src, filter, 1);
1974         dst += dststride;
1975         src += srcstride;
1976     }
1977 #endif
1978 }
1979
/*
 * Vertical 4-tap VP8 sub-pixel (epel) prediction for a 16-pixel-wide block,
 * implemented with Loongson MMI inline assembly. Each row is handled as two
 * 8-pixel halves: columns 0-7 through src/dst, columns 8-15 through
 * src0/dst0 (= src/dst + 8).
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references the rows at
 *                 -srcstride, 0, +srcstride and +2*srcstride.
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: not referenced by this function.
 * my: vertical sub-pixel position; selects row my - 1 of
 *     fourtap_subpel_filters.
 */
1980 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1981         ptrdiff_t srcstride, int h, int mx, int my)
1982 {
1983 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out. */
1984     const uint64_t *filter = fourtap_subpel_filters[my - 1];
1985     double ftmp[9];
1986     uint32_t tmp[1];
1987     mips_reg src0, src1, dst0;
1988     DECLARE_VAR_ALL64;
1989
         /* Per-pixel result the assembly is meant to produce for one row: */
1990     /*
1991     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
1992     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1993     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1994     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1995     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1996     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1997     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1998     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1999
2000     dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2001     dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2002     dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2003     dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2004     dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2005     dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2006     dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2007     dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2008     */
2009     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL8_V4_MMI. */
2010         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2011         "li         %[tmp0],    0x07                                \n\t"
2012         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2013
             /* One loop iteration per row. src1 is a scratch address register
              * handed to the macro; its use inside the macro is defined
              * elsewhere in this file. */
2014         "1:                                                         \n\t"
2015         // 0 - 7
2016         PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
             /* src0/dst0 = src/dst + 8: addresses of the right 8-pixel half. */
2017         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2018         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2019         // 8 - 15
             /* The right half must be stored through dst0; storing through
              * dst would overwrite columns 0-7 and leave 8-15 unwritten. */
2020         PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2021
             /* --h, step src/dst to the next row, repeat while h != 0. */
2022         "addiu      %[h],       %[h],           -0x01               \n\t"
2023         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2024         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2025         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
2026         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2027           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2028           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2029           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2030           [ftmp8]"=&f"(ftmp[8]),
2031           [tmp0]"=&r"(tmp[0]),
2032           RESTRICT_ASM_ALL64
2033           [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2034           [src1]"=&r"(src1),
2035           [h]"+&r"(h),
2036           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the four filter taps,
              * the taps being passed in floating-point ("f") registers. */
2037         : [ff_pw_64]"f"(ff_pw_64),
2038           [srcstride]"r"((mips_reg)srcstride),
2039           [dststride]"r"((mips_reg)dststride),
2040           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2041           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2042         : "memory"
2043     );
2044 #else
         /* Plain-C version (compiled out): 16 pixels per row via FILTER_4TAP
          * with a vertical step of srcstride. */
2045     const uint8_t *filter = subpel_filters[my - 1];
2046     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2047     int x, y;
2048
2049     for (y = 0; y < h; y++) {
2050         for (x = 0; x < 16; x++)
2051             dst[x] = FILTER_4TAP(src, filter, srcstride);
2052         dst += dststride;
2053         src += srcstride;
2054     }
2055 #endif
2056 }
2057
/*
 * Vertical 4-tap VP8 sub-pixel (epel) prediction for an 8-pixel-wide block,
 * implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references the rows at
 *                 -srcstride, 0, +srcstride and +2*srcstride.
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: not referenced by this function.
 * my: vertical sub-pixel position; selects row my - 1 of
 *     fourtap_subpel_filters.
 */
2058 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2059         ptrdiff_t srcstride, int h, int mx, int my)
2060 {
2061 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out. */
2062     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2063     double ftmp[9];
2064     uint32_t tmp[1];
2065     mips_reg src1;
2066     DECLARE_VAR_ALL64;
2067
         /* Per-pixel result the assembly is meant to produce for one row: */
2068     /*
2069     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2070     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2071     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2072     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2073     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2074     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2075     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2076     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2077     */
2078     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL8_V4_MMI. */
2079         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2080         "li         %[tmp0],    0x07                                \n\t"
2081         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2082
             /* One loop iteration per row. src1 is a scratch address register
              * handed to the macro; its use inside the macro is defined
              * elsewhere in this file. */
2083         "1:                                                         \n\t"
2084         PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2085
             /* --h, step src/dst to the next row, repeat while h != 0. */
2086         "addiu      %[h],       %[h],           -0x01               \n\t"
2087         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2088         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2089         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
2090         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2091           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2092           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2093           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2094           [ftmp8]"=&f"(ftmp[8]),
2095           [tmp0]"=&r"(tmp[0]),
2096           RESTRICT_ASM_ALL64
2097           [src1]"=&r"(src1),
2098           [h]"+&r"(h),
2099           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the four filter taps,
              * the taps being passed in floating-point ("f") registers. */
2100         : [ff_pw_64]"f"(ff_pw_64),
2101           [srcstride]"r"((mips_reg)srcstride),
2102           [dststride]"r"((mips_reg)dststride),
2103           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2104           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2105         : "memory"
2106     );
2107 #else
         /* Plain-C version (compiled out): 8 pixels per row via FILTER_4TAP
          * with a vertical step of srcstride. */
2108     const uint8_t *filter = subpel_filters[my - 1];
2109     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2110     int x, y;
2111
2112     for (y = 0; y < h; y++) {
2113         for (x = 0; x < 8; x++)
2114             dst[x] = FILTER_4TAP(src, filter, srcstride);
2115         dst += dststride;
2116         src += srcstride;
2117     }
2118 #endif
2119 }
2120
/*
 * Vertical 4-tap VP8 sub-pixel (epel) prediction for a 4-pixel-wide block,
 * implemented with Loongson MMI inline assembly.
 *
 * dst, dststride: output pointer and the amount added to it after each row.
 * src, srcstride: input pointer and the amount added to it after each row.
 *                 The per-pixel formula below references the rows at
 *                 -srcstride, 0, +srcstride and +2*srcstride.
 * h:  number of rows. The asm loop decrements and tests h after the row
 *     body, so the body always runs at least once -- presumably callers
 *     always pass h >= 1 (TODO confirm).
 * mx: not referenced by this function.
 * my: vertical sub-pixel position; selects row my - 1 of
 *     fourtap_subpel_filters.
 */
2121 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2122         ptrdiff_t srcstride, int h, int mx, int my)
2123 {
2124 #if 1
         /* MMI path. The #else branch further down is a plain-C loop that the
          * "#if 1" compiles out. */
2125     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2126     double ftmp[6];
2127     uint32_t tmp[1];
2128     mips_reg src1;
2129     DECLARE_VAR_LOW32;
2130
         /* Per-pixel result the assembly is meant to produce for one row: */
2131     /*
2132     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2133     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2134     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2135     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2136     */
2137     __asm__ volatile (
             /* Loop-invariant setup: ftmp0 = 0, ftmp4 = 7. The 7 is presumably
              * the shift count for the ">> 7" above, consumed inside the
              * macro -- TODO confirm against PUT_VP8_EPEL4_V4_MMI. */
2138         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2139         "li         %[tmp0],    0x07                                \n\t"
2140         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2141
             /* One loop iteration per row. src1 is a scratch address register
              * handed to the macro; its use inside the macro is defined
              * elsewhere in this file. */
2142         "1:                                                         \n\t"
2143         PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2144
             /* --h, step src/dst to the next row, repeat while h != 0. */
2145         "addiu      %[h],       %[h],           -0x01               \n\t"
2146         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2147         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2148         "bnez       %[h],       1b                                  \n\t"
             /* Outputs: "=&" operands are early-clobber scratch registers;
              * h, dst and src are "+&" (read-write) because the loop
              * modifies them. */
2149         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2150           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2151           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2152           [tmp0]"=&r"(tmp[0]),
2153           RESTRICT_ASM_LOW32
2154           [src1]"=&r"(src1),
2155           [h]"+&r"(h),
2156           [dst]"+&r"(dst),                  [src]"+&r"(src)
             /* Inputs: rounding constant, strides and the four filter taps,
              * the taps being passed in floating-point ("f") registers. */
2157         : [ff_pw_64]"f"(ff_pw_64),
2158           [srcstride]"r"((mips_reg)srcstride),
2159           [dststride]"r"((mips_reg)dststride),
2160           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2161           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2162         : "memory"
2163     );
2164 #else
         /* Plain-C version (compiled out): 4 pixels per row via FILTER_4TAP
          * with a vertical step of srcstride. */
2165     const uint8_t *filter = subpel_filters[my - 1];
2166     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2167     int x, y;
2168
2169     for (y = 0; y < h; y++) {
2170         for (x = 0; x < 4; x++)
2171             dst[x] = FILTER_4TAP(src, filter, srcstride);
2172         dst += dststride;
2173         src += srcstride;
2174     }
2175 #endif
2176 }
2177
2178 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2179         ptrdiff_t srcstride, int h, int mx, int my)
2180 {
2181 #if 1
2182     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2183     double ftmp[9];
2184     uint32_t tmp[1];
2185     mips_reg src0, src1, dst0;
2186     DECLARE_VAR_ALL64;
2187
2188     /*
2189     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2190     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2191     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2192     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2193     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2194     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2195     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2196     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2197
2198     dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2199     dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2200     dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2201     dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2202     dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2203     dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2204     dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2205     dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2206     */
2207     __asm__ volatile (
2208         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2209         "li         %[tmp0],    0x07                                \n\t"
2210         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2211
2212         "1:                                                         \n\t"
2213         // 0 - 7
2214         PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2215         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2216         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2217         // 8 - 15
2218         PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2219
2220         "addiu      %[h],       %[h],           -0x01               \n\t"
2221         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2222         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2223         "bnez       %[h],       1b                                  \n\t"
2224         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2225           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2226           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2227           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2228           [ftmp8]"=&f"(ftmp[8]),
2229           [tmp0]"=&r"(tmp[0]),
2230           RESTRICT_ASM_ALL64
2231           [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2232           [src1]"=&r"(src1),
2233           [h]"+&r"(h),
2234           [dst]"+&r"(dst),                  [src]"+&r"(src)
2235         : [ff_pw_64]"f"(ff_pw_64),
2236           [srcstride]"r"((mips_reg)srcstride),
2237           [dststride]"r"((mips_reg)dststride),
2238           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2239           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2240           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2241         : "memory"
2242     );
2243 #else
2244     const uint8_t *filter = subpel_filters[my - 1];
2245     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2246     int x, y;
2247
2248     for (y = 0; y < h; y++) {
2249         for (x = 0; x < 16; x++)
2250             dst[x] = FILTER_6TAP(src, filter, srcstride);
2251         dst += dststride;
2252         src += srcstride;
2253     }
2254 #endif
2255 }
2256
2257 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2258         ptrdiff_t srcstride, int h, int mx, int my)
2259 {
2260 #if 1
2261     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2262     double ftmp[9];
2263     uint32_t tmp[1];
2264     mips_reg src1;
2265     DECLARE_VAR_ALL64;
2266
2267     /*
2268     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2269     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2270     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2271     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2272     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2273     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2274     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2275     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2276     */
2277     __asm__ volatile (
2278         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2279         "li         %[tmp0],    0x07                                \n\t"
2280         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2281
2282         "1:                                                         \n\t"
2283         PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2284
2285         "addiu      %[h],       %[h],           -0x01               \n\t"
2286         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2287         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2288         "bnez       %[h],       1b                                  \n\t"
2289         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2290           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2291           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2292           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2293           [ftmp8]"=&f"(ftmp[8]),
2294           [tmp0]"=&r"(tmp[0]),
2295           RESTRICT_ASM_ALL64
2296           [src1]"=&r"(src1),
2297           [h]"+&r"(h),
2298           [dst]"+&r"(dst),                  [src]"+&r"(src)
2299         : [ff_pw_64]"f"(ff_pw_64),
2300           [srcstride]"r"((mips_reg)srcstride),
2301           [dststride]"r"((mips_reg)dststride),
2302           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2303           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2304           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2305         : "memory"
2306     );
2307 #else
2308     const uint8_t *filter = subpel_filters[my - 1];
2309     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2310     int x, y;
2311
2312     for (y = 0; y < h; y++) {
2313         for (x = 0; x < 8; x++)
2314             dst[x] = FILTER_6TAP(src, filter, srcstride);
2315         dst += dststride;
2316         src += srcstride;
2317     }
2318 #endif
2319 }
2320
2321 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2322         ptrdiff_t srcstride, int h, int mx, int my)
2323 {
2324 #if 1
2325     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2326     double ftmp[6];
2327     uint32_t tmp[1];
2328     mips_reg src1;
2329     DECLARE_VAR_LOW32;
2330
2331     /*
2332     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2333     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2334     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2335     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2336     */
2337     __asm__ volatile (
2338         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2339         "li         %[tmp0],    0x07                                \n\t"
2340         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2341
2342         "1:                                                         \n\t"
2343         PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2344
2345         "addiu      %[h],       %[h],           -0x01               \n\t"
2346         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2347         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2348         "bnez       %[h],       1b                                  \n\t"
2349         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2350           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2351           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2352           [tmp0]"=&r"(tmp[0]),
2353           RESTRICT_ASM_LOW32
2354           [src1]"=&r"(src1),
2355           [h]"+&r"(h),
2356           [dst]"+&r"(dst),                  [src]"+&r"(src)
2357         : [ff_pw_64]"f"(ff_pw_64),
2358           [srcstride]"r"((mips_reg)srcstride),
2359           [dststride]"r"((mips_reg)dststride),
2360           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2361           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2362           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2363         : "memory"
2364     );
2365 #else
2366     const uint8_t *filter = subpel_filters[my - 1];
2367     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2368     int x, y;
2369
2370     for (y = 0; y < h; y++) {
2371         for (x = 0; x < 4; x++)
2372             dst[x] = FILTER_6TAP(src, filter, srcstride);
2373         dst += dststride;
2374         src += srcstride;
2375     }
2376 #endif
2377 }
2378
2379 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2380         ptrdiff_t srcstride, int h, int mx, int my)
2381 {
2382 #if 1
2383     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2384     uint8_t *tmp = tmp_array;
2385
2386     src -= srcstride;
2387     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2388     tmp = tmp_array + 16;
2389     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2390 #else
2391     const uint8_t *filter = subpel_filters[mx - 1];
2392     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2393     int x, y;
2394     uint8_t tmp_array[560];
2395     uint8_t *tmp = tmp_array;
2396
2397     src -= srcstride;
2398
2399     for (y = 0; y < h + 3; y++) {
2400         for (x = 0; x < 16; x++)
2401             tmp[x] = FILTER_4TAP(src, filter, 1);
2402         tmp += 16;
2403         src += srcstride;
2404     }
2405
2406     tmp    = tmp_array + 16;
2407     filter = subpel_filters[my - 1];
2408
2409     for (y = 0; y < h; y++) {
2410         for (x = 0; x < 16; x++)
2411             dst[x] = FILTER_4TAP(tmp, filter, 16);
2412         dst += dststride;
2413         tmp += 16;
2414     }
2415 #endif
2416 }
2417
2418 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2419         ptrdiff_t srcstride, int h, int mx, int my)
2420 {
2421 #if 1
2422     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2423     uint8_t *tmp = tmp_array;
2424
2425     src -= srcstride;
2426     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2427     tmp = tmp_array + 8;
2428     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2429 #else
2430     const uint8_t *filter = subpel_filters[mx - 1];
2431     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2432     int x, y;
2433     uint8_t tmp_array[152];
2434     uint8_t *tmp = tmp_array;
2435
2436     src -= srcstride;
2437
2438     for (y = 0; y < h + 3; y++) {
2439         for (x = 0; x < 8; x++)
2440             tmp[x] = FILTER_4TAP(src, filter, 1);
2441         tmp += 8;
2442         src += srcstride;
2443     }
2444
2445     tmp    = tmp_array + 8;
2446     filter = subpel_filters[my - 1];
2447
2448     for (y = 0; y < h; y++) {
2449         for (x = 0; x < 8; x++)
2450             dst[x] = FILTER_4TAP(tmp, filter, 8);
2451         dst += dststride;
2452         tmp += 8;
2453     }
2454 #endif
2455 }
2456
2457 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2458         ptrdiff_t srcstride, int h, int mx, int my)
2459 {
2460 #if 1
2461     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2462     uint8_t *tmp = tmp_array;
2463
2464     src -= srcstride;
2465     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2466     tmp = tmp_array + 4;
2467     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2468 #else
2469     const uint8_t *filter = subpel_filters[mx - 1];
2470     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2471     int x, y;
2472     uint8_t tmp_array[44];
2473     uint8_t *tmp = tmp_array;
2474
2475     src -= srcstride;
2476
2477     for (y = 0; y < h + 3; y++) {
2478         for (x = 0; x < 4; x++)
2479             tmp[x] = FILTER_4TAP(src, filter, 1);
2480         tmp += 4;
2481         src += srcstride;
2482     }
2483     tmp    = tmp_array + 4;
2484     filter = subpel_filters[my - 1];
2485
2486     for (y = 0; y < h; y++) {
2487         for (x = 0; x < 4; x++)
2488             dst[x] = FILTER_4TAP(tmp, filter, 4);
2489         dst += dststride;
2490         tmp += 4;
2491     }
2492 #endif
2493 }
2494
2495 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2496         ptrdiff_t srcstride, int h, int mx, int my)
2497 {
2498 #if 1
2499     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2500     uint8_t *tmp = tmp_array;
2501
2502     src -= 2 * srcstride;
2503     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2504     tmp    = tmp_array + 32;
2505     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2506 #else
2507     const uint8_t *filter = subpel_filters[mx - 1];
2508     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2509     int x, y;
2510     uint8_t tmp_array[592];
2511     uint8_t *tmp = tmp_array;
2512
2513     src -= 2 * srcstride;
2514
2515     for (y = 0; y < h + 5; y++) {
2516         for (x = 0; x < 16; x++)
2517             tmp[x] = FILTER_4TAP(src, filter, 1);
2518         tmp += 16;
2519         src += srcstride;
2520     }
2521
2522     tmp    = tmp_array + 32;
2523     filter = subpel_filters[my - 1];
2524
2525     for (y = 0; y < h; y++) {
2526         for (x = 0; x < 16; x++)
2527             dst[x] = FILTER_6TAP(tmp, filter, 16);
2528         dst += dststride;
2529         tmp += 16;
2530     }
2531 #endif
2532 }
2533
2534 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2535         ptrdiff_t srcstride, int h, int mx, int my)
2536 {
2537 #if 1
2538     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2539     uint8_t *tmp = tmp_array;
2540
2541     src -= 2 * srcstride;
2542     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2543     tmp    = tmp_array + 16;
2544     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2545 #else
2546     const uint8_t *filter = subpel_filters[mx - 1];
2547     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2548     int x, y;
2549     uint8_t tmp_array[168];
2550     uint8_t *tmp = tmp_array;
2551
2552     src -= 2 * srcstride;
2553
2554     for (y = 0; y < h + 5; y++) {
2555         for (x = 0; x < 8; x++)
2556             tmp[x] = FILTER_4TAP(src, filter, 1);
2557         tmp += 8;
2558         src += srcstride;
2559     }
2560
2561     tmp    = tmp_array + 16;
2562     filter = subpel_filters[my - 1];
2563
2564     for (y = 0; y < h; y++) {
2565         for (x = 0; x < 8; x++)
2566             dst[x] = FILTER_6TAP(tmp, filter, 8);
2567         dst += dststride;
2568         tmp += 8;
2569     }
2570 #endif
2571 }
2572
2573 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2574         ptrdiff_t srcstride, int h, int mx, int my)
2575 {
2576 #if 1
2577     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2578     uint8_t *tmp = tmp_array;
2579
2580     src -= 2 * srcstride;
2581     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2582     tmp    = tmp_array + 8;
2583     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2584 #else
2585     const uint8_t *filter = subpel_filters[mx - 1];
2586     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2587     int x, y;
2588     uint8_t tmp_array[52];
2589     uint8_t *tmp = tmp_array;
2590
2591     src -= 2 * srcstride;
2592
2593     for (y = 0; y < h + 5; y++) {
2594         for (x = 0; x < 4; x++)
2595             tmp[x] = FILTER_4TAP(src, filter, 1);
2596         tmp += 4;
2597         src += srcstride;
2598     }
2599
2600     tmp    = tmp_array + 8;
2601     filter = subpel_filters[my - 1];
2602
2603     for (y = 0; y < h; y++) {
2604         for (x = 0; x < 4; x++)
2605             dst[x] = FILTER_6TAP(tmp, filter, 4);
2606         dst += dststride;
2607         tmp += 4;
2608     }
2609 #endif
2610 }
2611
2612 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2613         ptrdiff_t srcstride, int h, int mx, int my)
2614 {
2615 #if 1
2616     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2617     uint8_t *tmp = tmp_array;
2618
2619     src -= srcstride;
2620     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2621     tmp    = tmp_array + 16;
2622     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2623 #else
2624     const uint8_t *filter = subpel_filters[mx - 1];
2625     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2626     int x, y;
2627     uint8_t tmp_array[560];
2628     uint8_t *tmp = tmp_array;
2629
2630     src -= srcstride;
2631
2632     for (y = 0; y < h + 3; y++) {
2633         for (x = 0; x < 16; x++)
2634             tmp[x] = FILTER_6TAP(src, filter, 1);
2635         tmp += 16;
2636         src += srcstride;
2637     }
2638
2639     tmp    = tmp_array + 16;
2640     filter = subpel_filters[my - 1];
2641
2642     for (y = 0; y < h; y++) {
2643         for (x = 0; x < 16; x++)
2644             dst[x] = FILTER_4TAP(tmp, filter, 16);
2645         dst += dststride;
2646         tmp += 16;
2647     }
2648 #endif
2649 }
2650
2651 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2652         ptrdiff_t srcstride, int h, int mx, int my)
2653 {
2654 #if 1
2655     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2656     uint8_t *tmp = tmp_array;
2657
2658     src -= srcstride;
2659     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2660     tmp    = tmp_array + 8;
2661     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2662 #else
2663     const uint8_t *filter = subpel_filters[mx - 1];
2664     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2665     int x, y;
2666     uint8_t tmp_array[152];
2667     uint8_t *tmp = tmp_array;
2668
2669     src -= srcstride;
2670
2671     for (y = 0; y < h + 3; y++) {
2672         for (x = 0; x < 8; x++)
2673             tmp[x] = FILTER_6TAP(src, filter, 1);
2674         tmp += 8;
2675         src += srcstride;
2676     }
2677
2678     tmp    = tmp_array + 8;
2679     filter = subpel_filters[my - 1];
2680
2681     for (y = 0; y < h; y++) {
2682         for (x = 0; x < 8; x++)
2683             dst[x] = FILTER_4TAP(tmp, filter, 8);
2684         dst += dststride;
2685         tmp += 8;
2686     }
2687 #endif
2688 }
2689
2690 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2691         ptrdiff_t srcstride, int h, int mx, int my)
2692 {
2693 #if 1
2694     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2695     uint8_t *tmp = tmp_array;
2696
2697     src -= srcstride;
2698     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2699     tmp    = tmp_array + 4;
2700     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2701 #else
2702     const uint8_t *filter = subpel_filters[mx - 1];
2703     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2704     int x, y;
2705     uint8_t tmp_array[44];
2706     uint8_t *tmp = tmp_array;
2707
2708     src -= srcstride;
2709
2710     for (y = 0; y < h + 3; y++) {
2711         for (x = 0; x < 4; x++)
2712             tmp[x] = FILTER_6TAP(src, filter, 1);
2713         tmp += 4;
2714         src += srcstride;
2715     }
2716
2717     tmp    = tmp_array + 4;
2718     filter = subpel_filters[my - 1];
2719
2720     for (y = 0; y < h; y++) {
2721         for (x = 0; x < 4; x++)
2722             dst[x] = FILTER_4TAP(tmp, filter, 4);
2723         dst += dststride;
2724         tmp += 4;
2725     }
2726 #endif
2727 }
2728
2729 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2730         ptrdiff_t srcstride, int h, int mx, int my)
2731 {
2732 #if 1
2733     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2734     uint8_t *tmp = tmp_array;
2735
2736     src -= 2 * srcstride;
2737     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2738     tmp    = tmp_array + 32;
2739     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2740 #else
2741     const uint8_t *filter = subpel_filters[mx - 1];
2742     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2743     int x, y;
2744     uint8_t tmp_array[592];
2745     uint8_t *tmp = tmp_array;
2746
2747     src -= 2 * srcstride;
2748
2749     for (y = 0; y < h + 5; y++) {
2750         for (x = 0; x < 16; x++)
2751             tmp[x] = FILTER_6TAP(src, filter, 1);
2752         tmp += 16;
2753         src += srcstride;
2754     }
2755
2756     tmp    = tmp_array + 32;
2757     filter = subpel_filters[my - 1];
2758
2759     for (y = 0; y < h; y++) {
2760         for (x = 0; x < 16; x++)
2761             dst[x] = FILTER_6TAP(tmp, filter, 16);
2762         dst += dststride;
2763         tmp += 16;
2764     }
2765 #endif
2766 }
2767
2768 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2769         ptrdiff_t srcstride, int h, int mx, int my)
2770 {
2771 #if 1
2772     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2773     uint8_t *tmp = tmp_array;
2774
2775     src -= 2 * srcstride;
2776     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2777     tmp    = tmp_array + 16;
2778     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2779 #else
2780     const uint8_t *filter = subpel_filters[mx - 1];
2781     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2782     int x, y;
2783     uint8_t tmp_array[168];
2784     uint8_t *tmp = tmp_array;
2785
2786     src -= 2 * srcstride;
2787
2788     for (y = 0; y < h + 5; y++) {
2789         for (x = 0; x < 8; x++)
2790             tmp[x] = FILTER_6TAP(src, filter, 1);
2791         tmp += 8;
2792         src += srcstride;
2793     }
2794
2795     tmp    = tmp_array + 16;
2796     filter = subpel_filters[my - 1];
2797
2798     for (y = 0; y < h; y++) {
2799         for (x = 0; x < 8; x++)
2800             dst[x] = FILTER_6TAP(tmp, filter, 8);
2801         dst += dststride;
2802         tmp += 8;
2803     }
2804 #endif
2805 }
2806
2807 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2808         ptrdiff_t srcstride, int h, int mx, int my)
2809 {
2810 #if 1
2811     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2812     uint8_t *tmp = tmp_array;
2813
2814     src -= 2 * srcstride;
2815     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2816     tmp    = tmp_array + 8;
2817     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2818 #else
2819     const uint8_t *filter = subpel_filters[mx - 1];
2820     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2821     int x, y;
2822     uint8_t tmp_array[52];
2823     uint8_t *tmp = tmp_array;
2824
2825     src -= 2 * srcstride;
2826
2827     for (y = 0; y < h + 5; y++) {
2828         for (x = 0; x < 4; x++)
2829             tmp[x] = FILTER_6TAP(src, filter, 1);
2830         tmp += 4;
2831         src += srcstride;
2832     }
2833
2834     tmp    = tmp_array + 8;
2835     filter = subpel_filters[my - 1];
2836
2837     for (y = 0; y < h; y++) {
2838         for (x = 0; x < 4; x++)
2839             dst[x] = FILTER_6TAP(tmp, filter, 4);
2840         dst += dststride;
2841         tmp += 4;
2842     }
2843 #endif
2844 }
2845
2846 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2847         ptrdiff_t sstride, int h, int mx, int my)
2848 {
2849 #if 1
2850     int a = 8 - mx, b = mx;
2851     double ftmp[7];
2852     uint32_t tmp[1];
2853     mips_reg dst0, src0;
2854     DECLARE_VAR_ALL64;
2855
2856     /*
2857     dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2858     dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2859     dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2860     dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2861     dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2862     dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2863     dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2864     dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2865
2866     dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2867     dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2868     dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2869     dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2870     dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2871     dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2872     dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2873     dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2874     */
2875     __asm__ volatile (
2876         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2877         "li         %[tmp0],    0x03                                \n\t"
2878         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2879         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
2880         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
2881
2882         "1:                                                         \n\t"
2883         // 0 - 7
2884         PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2885         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2886         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2887         // 8 - 15
2888         PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2889
2890         "addiu      %[h],       %[h],           -0x01               \n\t"
2891         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
2892         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
2893         "bnez       %[h],       1b                                  \n\t"
2894         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2895           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2896           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2897           [ftmp6]"=&f"(ftmp[6]),
2898           [tmp0]"=&r"(tmp[0]),
2899           RESTRICT_ASM_ALL64
2900           [dst0]"=&r"(dst0),            [src0]"=&r"(src0),
2901           [h]"+&r"(h),
2902           [dst]"+&r"(dst),              [src]"+&r"(src),
2903           [a]"+&f"(a),                  [b]"+&f"(b)
2904         : [sstride]"r"((mips_reg)sstride),
2905           [dstride]"r"((mips_reg)dstride),
2906           [ff_pw_4]"f"(ff_pw_4)
2907         : "memory"
2908     );
2909 #else
2910     int a = 8 - mx, b = mx;
2911     int x, y;
2912
2913     for (y = 0; y < h; y++) {
2914         for (x = 0; x < 16; x++)
2915             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2916         dst += dstride;
2917         src += sstride;
2918     }
2919 #endif
2920 }
2921
2922 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2923         ptrdiff_t sstride, int h, int mx, int my)
2924 {
2925 #if 1
2926     int c = 8 - my, d = my;
2927     double ftmp[7];
2928     uint32_t tmp[1];
2929     mips_reg src0, src1, dst0;
2930     DECLARE_VAR_ALL64;
2931
2932     /*
2933     dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
2934     dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2935     dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2936     dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2937     dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2938     dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2939     dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2940     dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
2941     */
2942     __asm__ volatile (
2943         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2944         "li         %[tmp0],    0x03                                \n\t"
2945         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2946         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
2947         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
2948
2949         "1:                                                         \n\t"
2950         // 0 - 7
2951         PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2952         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2953         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2954         // 8 - 15
2955         PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2956
2957         "addiu      %[h],       %[h],           -0x01               \n\t"
2958         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
2959         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
2960         "bnez       %[h],       1b                                  \n\t"
2961         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2962           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2963           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2964           [ftmp6]"=&f"(ftmp[6]),
2965           [tmp0]"=&r"(tmp[0]),
2966           RESTRICT_ASM_ALL64
2967           [src0]"=&r"(src0),            [dst0]"=&r"(dst0),
2968           [src1]"=&r"(src1),
2969           [h]"+&r"(h),
2970           [dst]"+&r"(dst),              [src]"+&r"(src),
2971           [c]"+&f"(c),                  [d]"+&f"(d)
2972         : [sstride]"r"((mips_reg)sstride),
2973           [dstride]"r"((mips_reg)dstride),
2974           [ff_pw_4]"f"(ff_pw_4)
2975         : "memory"
2976     );
2977 #else
2978     int c = 8 - my, d = my;
2979     int x, y;
2980
2981     for (y = 0; y < h; y++) {
2982         for (x = 0; x < 16; x++)
2983             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
2984         dst += dstride;
2985         src += sstride;
2986     }
2987 #endif
2988 }
2989
2990 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2991         ptrdiff_t sstride, int h, int mx, int my)
2992 {
2993 #if 1
2994     DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2995     uint8_t *tmp = tmp_array;
2996
2997     ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2998     ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
2999 #else
3000     int a = 8 - mx, b = mx;
3001     int c = 8 - my, d = my;
3002     int x, y;
3003     uint8_t tmp_array[528];
3004     uint8_t *tmp = tmp_array;
3005
3006     for (y = 0; y < h + 1; y++) {
3007         for (x = 0; x < 16; x++)
3008             tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3009         tmp += 16;
3010         src += sstride;
3011     }
3012
3013     tmp = tmp_array;
3014
3015     for (y = 0; y < h; y++) {
3016         for (x = 0; x < 16; x++)
3017             dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3018         dst += dstride;
3019         tmp += 16;
3020     }
3021 #endif
3022 }
3023
/**
 * Horizontal bilinear "put" for an 8-pixel-wide block (Loongson MMI).
 *
 * Each output row follows the reference formula kept in the body:
 *     dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3,  x = 0..7
 * with a = 8 - mx and b = mx; the formula references src[0..8] per row.
 *
 * @param dst     destination; advanced by dstride after each row
 * @param dstride byte step between destination rows
 * @param src     source; advanced by sstride after each row
 * @param sstride byte step between source rows
 * @param h       row count; the asm loop tests h only after a row has been
 *                processed and h decremented, so h must be at least 1
 * @param mx      weight of the src[x + 1] tap (src[x] gets 8 - mx)
 * @param my      not referenced by this function
 */
3024 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3025         ptrdiff_t sstride, int h, int mx, int my)
3026 {
3027 #if 1
         /* Tap weights: a multiplies src[x], b multiplies src[x + 1]. */
3028     int a = 8 - mx, b = mx;
         /* Scratch storage bound to the asm's FP and integer temporaries. */
3029     double ftmp[7];
3030     uint32_t tmp[1];
3031     DECLARE_VAR_ALL64;
3032
         /* Reference computation for one output row, implemented by the asm below: */
3033     /*
3034     dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3035     dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3036     dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3037     dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3038     dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3039     dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3040     dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3041     dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3042     */
3043     __asm__ volatile (
             /*
              * Setup: ftmp0 is cleared and ftmp4 receives the constant 3
              * (presumably the shift count for the ">> 3"; the row macro is
              * defined outside this view). pshufh with the all-zero selector
              * presumably replicates the low halfword of a and b across the
              * register -- confirm against the Loongson MMI manual.
              */
3044         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3045         "li         %[tmp0],    0x03                                \n\t"
3046         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3047         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3048         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3049
             /* Row loop: filter one row, then step h, src and dst. */
3050         "1:                                                         \n\t"
3051         PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3052
3053         "addiu      %[h],       %[h],           -0x01               \n\t"
3054         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3055         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3056         "bnez       %[h],       1b                                  \n\t"
3057         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3058           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3059           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3060           [ftmp6]"=&f"(ftmp[6]),
3061           [tmp0]"=&r"(tmp[0]),
3062           RESTRICT_ASM_ALL64
               /* h, dst and src are read-write: the loop updates them in place. */
3063           [h]"+&r"(h),
3064           [dst]"+&r"(dst),              [src]"+&r"(src),
               /*
                * NOTE(review): a and b are ints bound to "f" (FP register)
                * operands; whether every supported compiler accepts that is
                * not visible from here -- confirm.
                */
3065           [a]"+&f"(a),                  [b]"+&f"(b)
3066         : [sstride]"r"((mips_reg)sstride),
3067           [dstride]"r"((mips_reg)dstride),
               /* ff_pw_4: presumably the "+ 4" rounding term -- defined elsewhere. */
3068           [ff_pw_4]"f"(ff_pw_4)
3069         : "memory"
3070     );
3071 #else
         /* Plain C reference for the asm above; compiled out by the #if 1. */
3072     int a = 8 - mx, b = mx;
3073     int x, y;
3074
3075     for (y = 0; y < h; y++) {
3076         for (x = 0; x < 8; x++)
3077             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3078         dst += dstride;
3079         src += sstride;
3080     }
3081 #endif
3082 }
3083
/**
 * Vertical bilinear "put" for an 8-pixel-wide block (Loongson MMI).
 *
 * Each output row follows the reference formula kept in the body:
 *     dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3,  x = 0..7
 * with c = 8 - my and d = my, i.e. a source row is blended with the row
 * one sstride further on.
 *
 * @param dst     destination; advanced by dstride after each row
 * @param dstride byte step between destination rows
 * @param src     source; advanced by sstride after each row
 * @param sstride byte step between source rows, also the distance to the
 *                second tap
 * @param h       row count; the asm loop tests h only after a row has been
 *                processed and h decremented, so h must be at least 1
 * @param mx      not referenced by this function
 * @param my      weight of the src[x + sstride] tap (src[x] gets 8 - my)
 */
3084 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3085         ptrdiff_t sstride, int h, int mx, int my)
3086 {
3087 #if 1
         /* Tap weights: c multiplies src[x], d multiplies src[x + sstride]. */
3088     int c = 8 - my, d = my;
         /* Scratch storage bound to the asm's FP and integer temporaries. */
3089     double ftmp[7];
3090     uint32_t tmp[1];
         /* Scratch address register handed to the row macro (presumably src + sstride). */
3091     mips_reg src1;
3092     DECLARE_VAR_ALL64;
3093
         /* Reference computation for one output row, implemented by the asm below: */
3094     /*
3095     dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3096     dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3097     dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3098     dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3099     dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3100     dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3101     dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3102     dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3103     */
3104     __asm__ volatile (
             /*
              * Setup: ftmp0 is cleared and ftmp4 receives the constant 3
              * (presumably the shift count for the ">> 3"; the row macro is
              * defined outside this view). pshufh with the all-zero selector
              * presumably replicates the low halfword of c and d across the
              * register -- confirm against the Loongson MMI manual.
              */
3105         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3106         "li         %[tmp0],    0x03                                \n\t"
3107         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3108         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3109         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3110
             /* Row loop: filter one row, then step h, src and dst. */
3111         "1:                                                         \n\t"
3112         PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3113
3114         "addiu      %[h],       %[h],           -0x01               \n\t"
3115         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3116         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3117         "bnez       %[h],       1b                                  \n\t"
3118         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3119           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3120           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3121           [ftmp6]"=&f"(ftmp[6]),
3122           [tmp0]"=&r"(tmp[0]),
3123           RESTRICT_ASM_ALL64
3124           [src1]"=&r"(src1),
               /* h, dst and src are read-write: the loop updates them in place. */
3125           [h]"+&r"(h),
3126           [dst]"+&r"(dst),              [src]"+&r"(src),
               /*
                * NOTE(review): c and d are ints bound to "f" (FP register)
                * operands; whether every supported compiler accepts that is
                * not visible from here -- confirm.
                */
3127           [c]"+&f"(c),                  [d]"+&f"(d)
3128         : [sstride]"r"((mips_reg)sstride),
3129           [dstride]"r"((mips_reg)dstride),
               /* ff_pw_4: presumably the "+ 4" rounding term -- defined elsewhere. */
3130           [ff_pw_4]"f"(ff_pw_4)
3131         : "memory"
3132     );
3133 #else
         /* Plain C reference for the asm above; compiled out by the #if 1. */
3134     int c = 8 - my, d = my;
3135     int x, y;
3136
3137     for (y = 0; y < h; y++) {
3138         for (x = 0; x < 8; x++)
3139             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3140         dst += dstride;
3141         src += sstride;
3142     }
3143 #endif
3144 }
3145
3146 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3147         ptrdiff_t sstride, int h, int mx, int my)
3148 {
3149 #if 1
3150     DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3151     uint8_t *tmp = tmp_array;
3152
3153     ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3154     ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3155 #else
3156     int a = 8 - mx, b = mx;
3157     int c = 8 - my, d = my;
3158     int x, y;
3159     uint8_t tmp_array[136];
3160     uint8_t *tmp = tmp_array;
3161
3162     for (y = 0; y < h + 1; y++) {
3163         for (x = 0; x < 8; x++)
3164             tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3165         tmp += 8;
3166         src += sstride;
3167     }
3168
3169     tmp = tmp_array;
3170
3171     for (y = 0; y < h; y++) {
3172         for (x = 0; x < 8; x++)
3173             dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3174         dst += dstride;
3175         tmp += 8;
3176     }
3177 #endif
3178 }
3179
/**
 * Horizontal bilinear "put" for a 4-pixel-wide block (Loongson MMI).
 *
 * Each output row follows the reference formula kept in the body:
 *     dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3,  x = 0..3
 * with a = 8 - mx and b = mx; the formula references src[0..4] per row.
 *
 * @param dst     destination; advanced by dstride after each row
 * @param dstride byte step between destination rows
 * @param src     source; advanced by sstride after each row
 * @param sstride byte step between source rows
 * @param h       row count; the asm loop tests h only after a row has been
 *                processed and h decremented, so h must be at least 1
 * @param mx      weight of the src[x + 1] tap (src[x] gets 8 - mx)
 * @param my      not referenced by this function
 */
3180 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3181         ptrdiff_t sstride, int h, int mx, int my)
3182 {
3183 #if 1
         /* Tap weights: a multiplies src[x], b multiplies src[x + 1]. */
3184     int a = 8 - mx, b = mx;
         /* Scratch storage bound to the asm's FP and integer temporaries. */
3185     double ftmp[5];
3186     uint32_t tmp[1];
3187     DECLARE_VAR_LOW32;
3188     DECLARE_VAR_ALL64;
3189
         /* Reference computation for one output row, implemented by the asm below: */
3190     /*
3191     dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3192     dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3193     dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3194     dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3195     */
3196     __asm__ volatile (
             /*
              * Setup: ftmp0 is cleared and ftmp4 receives the constant 3
              * (presumably the shift count for the ">> 3"; the row macro is
              * defined outside this view). pshufh with the all-zero selector
              * presumably replicates the low halfword of a and b across the
              * register -- confirm against the Loongson MMI manual.
              */
3197         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3198         "li         %[tmp0],    0x03                                \n\t"
3199         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3200         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3201         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3202
             /* Row loop: filter one row, then step h, src and dst. */
3203         "1:                                                         \n\t"
3204         PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3205
3206         "addiu      %[h],       %[h],           -0x01               \n\t"
3207         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3208         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3209         "bnez       %[h],       1b                                  \n\t"
3210         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3211           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3212           [ftmp4]"=&f"(ftmp[4]),
3213           [tmp0]"=&r"(tmp[0]),
3214           RESTRICT_ASM_LOW32
3215           RESTRICT_ASM_ALL64
               /* h, dst and src are read-write: the loop updates them in place. */
3216           [h]"+&r"(h),
3217           [dst]"+&r"(dst),              [src]"+&r"(src),
               /*
                * NOTE(review): a and b are ints bound to "f" (FP register)
                * operands; whether every supported compiler accepts that is
                * not visible from here -- confirm.
                */
3218           [a]"+&f"(a),                  [b]"+&f"(b)
3219         : [sstride]"r"((mips_reg)sstride),
3220           [dstride]"r"((mips_reg)dstride),
               /* ff_pw_4: presumably the "+ 4" rounding term -- defined elsewhere. */
3221           [ff_pw_4]"f"(ff_pw_4)
3222         : "memory"
3223     );
3224 #else
         /* Plain C reference for the asm above; compiled out by the #if 1. */
3225     int a = 8 - mx, b = mx;
3226     int x, y;
3227
3228     for (y = 0; y < h; y++) {
3229         for (x = 0; x < 4; x++)
3230             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3231         dst += dstride;
3232         src += sstride;
3233     }
3234 #endif
3235 }
3236
/**
 * Vertical bilinear "put" for a 4-pixel-wide block (Loongson MMI).
 *
 * Each output row follows the reference formula kept in the body:
 *     dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3,  x = 0..3
 * with c = 8 - my and d = my, i.e. a source row is blended with the row
 * one sstride further on.
 *
 * @param dst     destination; advanced by dstride after each row
 * @param dstride byte step between destination rows
 * @param src     source; advanced by sstride after each row
 * @param sstride byte step between source rows, also the distance to the
 *                second tap
 * @param h       row count; the asm loop tests h only after a row has been
 *                processed and h decremented, so h must be at least 1
 * @param mx      not referenced by this function
 * @param my      weight of the src[x + sstride] tap (src[x] gets 8 - my)
 */
3237 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3238         ptrdiff_t sstride, int h, int mx, int my)
3239 {
3240 #if 1
         /* Tap weights: c multiplies src[x], d multiplies src[x + sstride]. */
3241     int c = 8 - my, d = my;
         /*
          * NOTE(review): seven slots are declared, but only ftmp[0]..ftmp[4]
          * are bound as asm operands below (the 4-wide horizontal kernel
          * declares ftmp[5]).
          */
3242     double ftmp[7];
3243     uint32_t tmp[1];
         /* Scratch address register handed to the row macro (presumably src + sstride). */
3244     mips_reg src1;
3245     DECLARE_VAR_LOW32;
3246     DECLARE_VAR_ALL64;
3247
         /* Reference computation for one output row, implemented by the asm below: */
3248     /*
3249     dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3250     dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3251     dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3252     dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3253     */
3254     __asm__ volatile (
             /*
              * Setup: ftmp0 is cleared and ftmp4 receives the constant 3
              * (presumably the shift count for the ">> 3"; the row macro is
              * defined outside this view). pshufh with the all-zero selector
              * presumably replicates the low halfword of c and d across the
              * register -- confirm against the Loongson MMI manual.
              */
3255         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3256         "li         %[tmp0],    0x03                                \n\t"
3257         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3258         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3259         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3260
             /* Row loop: filter one row, then step h, src and dst. */
3261         "1:                                                         \n\t"
3262         PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3263
3264         "addiu      %[h],       %[h],           -0x01               \n\t"
3265         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3266         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3267         "bnez       %[h],       1b                                  \n\t"
3268         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3269           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3270           [ftmp4]"=&f"(ftmp[4]),
3271           [tmp0]"=&r"(tmp[0]),
3272           RESTRICT_ASM_LOW32
3273           RESTRICT_ASM_ALL64
3274           [src1]"=&r"(src1),
               /* h, dst and src are read-write: the loop updates them in place. */
3275           [h]"+&r"(h),
3276           [dst]"+&r"(dst),              [src]"+&r"(src),
               /*
                * NOTE(review): c and d are ints bound to "f" (FP register)
                * operands; whether every supported compiler accepts that is
                * not visible from here -- confirm.
                */
3277           [c]"+&f"(c),                  [d]"+&f"(d)
3278         : [sstride]"r"((mips_reg)sstride),
3279           [dstride]"r"((mips_reg)dstride),
               /* ff_pw_4: presumably the "+ 4" rounding term -- defined elsewhere. */
3280           [ff_pw_4]"f"(ff_pw_4)
3281         : "memory"
3282     );
3283 #else
         /* Plain C reference for the asm above; compiled out by the #if 1. */
3284     int c = 8 - my, d = my;
3285     int x, y;
3286
3287     for (y = 0; y < h; y++) {
3288         for (x = 0; x < 4; x++)
3289             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3290         dst += dstride;
3291         src += sstride;
3292     }
3293 #endif
3294 }
3295
3296 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3297         ptrdiff_t sstride, int h, int mx, int my)
3298 {
3299 #if 1
3300     DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3301     uint8_t *tmp = tmp_array;
3302
3303     ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3304     ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3305 #else
3306     int a = 8 - mx, b = mx;
3307     int c = 8 - my, d = my;
3308     int x, y;
3309     uint8_t tmp_array[36];
3310     uint8_t *tmp = tmp_array;
3311
3312     for (y = 0; y < h + 1; y++) {
3313         for (x = 0; x < 4; x++)
3314             tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3315         tmp += 4;
3316         src += sstride;
3317     }
3318
3319     tmp = tmp_array;
3320
3321     for (y = 0; y < h; y++) {
3322         for (x = 0; x < 4; x++)
3323             dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3324         dst += dstride;
3325         tmp += 4;
3326     }
3327 #endif
3328 }