git.sesse.net Git - ffmpeg/blob - libavcodec/mips/h264dsp_mmi.c

   1 /*
   2  * Loongson SIMD optimized h264dsp
   3  *
   4  * Copyright (c) 2015 Loongson Technology Corporation Limited
   5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
   6  *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavcodec/bit_depth_template.c"
  26 #include "h264dsp_mips.h"
  27
  28 void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
  29         int height, int log2_denom, int weight, int offset)
  30 {
  31     int y;
  32
  33     offset <<= log2_denom;
  34
  35     if (log2_denom)
  36         offset += 1 << (log2_denom - 1);
  37
  38     for (y=0; y<height; y++, block+=stride) {
  39         __asm__ volatile (
  40             "ldc1 $f2, %0                   \r\n"
  41             "ldc1 $f4, %1                   \r\n"
  42             "dmtc1 $0, $f20                 \r\n"
  43             "mtc1 %2, $f6                   \r\n"
  44             "mtc1 %3, $f8                   \r\n"
  45             "mtc1 %4, $f10                  \r\n"
  46             "pshufh $f6, $f6, $f20          \r\n"
  47             "pshufh $f8, $f8, $f20          \r\n"
  48             "punpckhbh $f14, $f2, $f20      \r\n"
  49             "punpckhbh $f16, $f4, $f20      \r\n"
  50             "punpcklbh $f2, $f2, $f20       \r\n"
  51             "punpcklbh $f4, $f4, $f20       \r\n"
  52             "pmullh $f14, $f14, $f6         \r\n"
  53             "pmullh $f16, $f16, $f6         \r\n"
  54             "pmullh $f2, $f2, $f6           \r\n"
  55             "pmullh $f4, $f4, $f6           \r\n"
  56             "paddsh $f14, $f14, $f8         \r\n"
  57             "paddsh $f16, $f16, $f8         \r\n"
  58             "paddsh $f2, $f2, $f8           \r\n"
  59             "paddsh $f4, $f4, $f8           \r\n"
  60             "psrah $f14, $f14, $f10         \r\n"
  61             "psrah $f16, $f16, $f10         \r\n"
  62             "psrah $f2, $f2, $f10           \r\n"
  63             "psrah $f4, $f4, $f10           \r\n"
  64             "packushb $f2, $f2, $f14        \r\n"
  65             "packushb $f4, $f4, $f16        \r\n"
  66             "sdc1 $f2, %0                   \r\n"
  67             "sdc1 $f4, %1                   \r\n"
  68             : "=m"(*block),"=m"(*(block + 8))
  69             : "r"(weight),"r"(offset),"r"(log2_denom)
  70         );
  71     }
  72 }
  73
  74 void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
  75         int stride, int height, int log2_denom, int weightd, int weights,
  76         int offset)
  77 {
  78     int y;
  79
  80     offset = ((offset + 1) | 1) << log2_denom;
  81
  82     for (y=0; y<height; y++, dst+=stride, src+=stride) {
  83         __asm__ volatile (
  84             "ldc1 $f2, %2                   \r\n"
  85             "ldc1 $f4, %3                   \r\n"
  86             "dmtc1 $0, $f20                 \r\n"
  87             "mtc1 %6, $f6                   \r\n"
  88             "mtc1 %7, $f8                   \r\n"
  89             "mtc1 %8, $f10                  \r\n"
  90             "mtc1 %9, $f12                  \r\n"
  91             "pshufh $f6, $f6, $f20          \r\n"
  92             "pshufh $f8, $f8, $f20          \r\n"
  93             "pshufh $f10, $f10, $f20        \r\n"
  94             "punpckhbh $f14, $f2, $f20      \r\n"
  95             "punpckhbh $f16, $f4, $f20      \r\n"
  96             "punpcklbh $f2, $f2, $f20       \r\n"
  97             "punpcklbh $f4, $f4, $f20       \r\n"
  98             "pmullh $f14, $f14, $f6         \r\n"
  99             "pmullh $f16, $f16, $f8         \r\n"
 100             "pmullh $f2, $f2, $f6           \r\n"
 101             "pmullh $f4, $f4, $f8           \r\n"
 102             "paddsh $f14, $f14, $f10        \r\n"
 103             "paddsh $f2, $f2, $f10          \r\n"
 104             "paddsh $f14, $f14, $f16        \r\n"
 105             "paddsh $f2, $f2, $f4           \r\n"
 106             "psrah $f14, $f14, $f12         \r\n"
 107             "psrah $f2, $f2, $f12           \r\n"
 108             "packushb $f2, $f2, $f14        \r\n"
 109             "sdc1 $f2, %0                   \r\n"
 110             "ldc1 $f2, %4                   \r\n"
 111             "ldc1 $f4, %5                   \r\n"
 112             "punpckhbh $f14, $f2, $f20      \r\n"
 113             "punpckhbh $f16, $f4, $f20      \r\n"
 114             "punpcklbh $f2, $f2, $f20       \r\n"
 115             "punpcklbh $f4, $f4, $f20       \r\n"
 116             "pmullh $f14, $f14, $f6         \r\n"
 117             "pmullh $f16, $f16, $f8         \r\n"
 118             "pmullh $f2, $f2, $f6           \r\n"
 119             "pmullh $f4, $f4, $f8           \r\n"
 120             "paddsh $f14, $f14, $f10        \r\n"
 121             "paddsh $f2, $f2, $f10          \r\n"
 122             "paddsh $f14, $f14, $f16        \r\n"
 123             "paddsh $f2, $f2, $f4           \r\n"
 124             "psrah $f14, $f14, $f12         \r\n"
 125             "psrah $f2, $f2, $f12           \r\n"
 126             "packushb $f2, $f2, $f14        \r\n"
 127             "sdc1 $f2, %1                   \r\n"
 128             : "=m"(*dst),"=m"(*(dst+8))
 129             : "m"(*src),"m"(*dst),"m"(*(src+8)),"m"(*(dst+8)),
 130               "r"(weights),"r"(weightd),"r"(offset),"r"(log2_denom+1)
 131         );
 132     }
 133 }
 134
 135 void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
 136         int log2_denom, int weight, int offset)
 137 {
 138     int y;
 139
 140     offset <<= log2_denom;
 141
 142     if (log2_denom)
 143         offset += 1 << (log2_denom - 1);
 144
 145     for (y=0; y<height; y++, block+=stride) {
 146         __asm__ volatile (
 147             "ldc1 $f2, %0                   \r\n"
 148             "mtc1 %1, $f6                   \r\n"
 149             "mtc1 %2, $f8                   \r\n"
 150             "mtc1 %3, $f10                  \r\n"
 151             "dmtc1 $0, $f20                 \r\n"
 152             "pshufh $f6, $f6, $f20          \r\n"
 153             "pshufh $f8, $f8, $f20          \r\n"
 154             "punpckhbh $f14, $f2, $f20      \r\n"
 155             "punpcklbh $f2, $f2, $f20       \r\n"
 156             "pmullh $f14, $f14, $f6         \r\n"
 157             "pmullh $f2, $f2, $f6           \r\n"
 158             "paddsh $f14, $f14, $f8         \r\n"
 159             "paddsh $f2, $f2, $f8           \r\n"
 160             "psrah $f14, $f14, $f10         \r\n"
 161             "psrah $f2, $f2, $f10           \r\n"
 162             "packushb $f2, $f2, $f14        \r\n"
 163             "sdc1 $f2, %0                   \r\n"
 164             : "=m"(*block)
 165             : "r"(weight),"r"(offset),"r"(log2_denom)
 166         );
 167     }
 168 }
 169
 170 void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
 171         int stride, int height, int log2_denom, int weightd, int weights,
 172         int offset)
 173 {
 174     int y;
 175
 176     offset = ((offset + 1) | 1) << log2_denom;
 177
 178     for (y=0; y<height; y++, dst+=stride, src+=stride) {
 179         __asm__ volatile (
 180             "ldc1 $f2, %1                   \r\n"
 181             "ldc1 $f4, %2                   \r\n"
 182             "dmtc1 $0, $f20                 \r\n"
 183             "mtc1 %3, $f6                   \r\n"
 184             "mtc1 %4, $f8                   \r\n"
 185             "mtc1 %5, $f10                  \r\n"
 186             "mtc1 %6, $f12                  \r\n"
 187             "pshufh $f6, $f6, $f20          \r\n"
 188             "pshufh $f8, $f8, $f20          \r\n"
 189             "pshufh $f10, $f10, $f20        \r\n"
 190             "punpckhbh $f14, $f2, $f20      \r\n"
 191             "punpckhbh $f16, $f4, $f20      \r\n"
 192             "punpcklbh $f2, $f2, $f20       \r\n"
 193             "punpcklbh $f4, $f4, $f20       \r\n"
 194             "pmullh $f14, $f14, $f6         \r\n"
 195             "pmullh $f16, $f16, $f8         \r\n"
 196             "pmullh $f2, $f2, $f6           \r\n"
 197             "pmullh $f4, $f4, $f8           \r\n"
 198             "paddsh $f14, $f14, $f10        \r\n"
 199             "paddsh $f2, $f2, $f10          \r\n"
 200             "paddsh $f14, $f14, $f16        \r\n"
 201             "paddsh $f2, $f2, $f4           \r\n"
 202             "psrah $f14, $f14, $f12         \r\n"
 203             "psrah $f2, $f2, $f12           \r\n"
 204             "packushb $f2, $f2, $f14        \r\n"
 205             "sdc1 $f2, %0                   \r\n"
 206             : "=m"(*dst)
 207             : "m"(*src),"m"(*dst),"r"(weights),
 208               "r"(weightd),"r"(offset),"r"(log2_denom+1)
 209         );
 210     }
 211 }
 212
 213 void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
 214         int log2_denom, int weight, int offset)
 215 {
 216     int y;
 217
 218     offset <<= log2_denom;
 219
 220     if (log2_denom)
 221         offset += 1 << (log2_denom - 1);
 222
 223     for (y=0; y<height; y++, block+=stride) {
 224         __asm__ volatile (
 225             "lwc1 $f2, %0                   \r\n"
 226             "mtc1 %1, $f6                   \r\n"
 227             "mtc1 %2, $f8                   \r\n"
 228             "mtc1 %3, $f10                  \r\n"
 229             "dmtc1 $0, $f20                 \r\n"
 230             "pshufh $f6, $f6, $f20          \r\n"
 231             "pshufh $f8, $f8, $f20          \r\n"
 232             "punpcklbh $f2, $f2, $f20       \r\n"
 233             "pmullh $f2, $f2, $f6           \r\n"
 234             "paddsh $f2, $f2, $f8           \r\n"
 235             "psrah $f2, $f2, $f10           \r\n"
 236             "packushb $f2, $f2, $f20        \r\n"
 237             "swc1 $f2, %0                   \r\n"
 238             : "=m"(*block)
 239             : "r"(weight),"r"(offset),"r"(log2_denom)
 240         );
 241     }
 242 }
 243
 244 void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
 245         int stride, int height, int log2_denom, int weightd, int weights,
 246         int offset)
 247 {
 248     int y;
 249
 250     offset = ((offset + 1) | 1) << log2_denom;
 251
 252     for (y=0; y<height; y++, dst+=stride, src+=stride) {
 253         __asm__ volatile (
 254             "lwc1 $f2, %1                   \r\n"
 255             "lwc1 $f4, %2                   \r\n"
 256             "dmtc1 $0, $f20                 \r\n"
 257             "mtc1 %3, $f6                   \r\n"
 258             "mtc1 %4, $f8                   \r\n"
 259             "mtc1 %5, $f10                  \r\n"
 260             "mtc1 %6, $f12                  \r\n"
 261             "pshufh $f6, $f6, $f20          \r\n"
 262             "pshufh $f8, $f8, $f20          \r\n"
 263             "pshufh $f10, $f10, $f20        \r\n"
 264             "punpcklbh $f2, $f2, $f20       \r\n"
 265             "punpcklbh $f4, $f4, $f20       \r\n"
 266             "pmullh $f2, $f2, $f6           \r\n"
 267             "pmullh $f4, $f4, $f8           \r\n"
 268             "paddsh $f2, $f2, $f10          \r\n"
 269             "paddsh $f2, $f2, $f4           \r\n"
 270             "psrah $f2, $f2, $f12           \r\n"
 271             "packushb $f2, $f2, $f20        \r\n"
 272             "swc1 $f2, %0                   \r\n"
 273             : "=m"(*dst)
 274             : "m"(*src),"m"(*dst),"r"(weights),
 275               "r"(weightd),"r"(offset),"r"(log2_denom+1)
 276         );
 277     }
 278 }