2 * Loongson SIMD optimized h264dsp
4 * Copyright (c) 2015 Loongson Technology Corporation Limited
5 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6 * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavcodec/bit_depth_template.c"
26 #include "h264dsp_mips.h"
/*
 * Weight rows of 8-bit pixels using Loongson MMI inline assembly. The results
 * are written through block, the only pointer argument.
 *
 * block      - first row; each row is accessed through the two memory
 *              operands *block and *(block + 8)
 * stride     - byte step added to block after every row
 * height     - number of rows processed
 * log2_denom - scales offset below and is passed to the asm as an operand
 * weight     - passed to the asm as an operand
 * offset     - scaled and biased below, then passed to the asm as an operand
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((pixel * $f6 + $f8) >> $f10)
 *
 * NOTE(review): lines are missing from this copy (no opening brace, loop
 * counter declaration, asm statement header, register loads, stores or
 * closing lines are visible). Presumably $f6, $f8 and $f10 carry weight,
 * offset and log2_denom, $f2 and $f4 carry the pixel bytes and $f20 is
 * zero - confirm against the complete file.
 */
28 void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
29 int height, int log2_denom, int weight, int offset)
/* Bring offset to the same scale as the weighted pixel products. */
33 offset <<= log2_denom;
/*
 * Add half of the divisor so that a later right shift by log2_denom rounds
 * to nearest.
 * NOTE(review): log2_denom - 1 is negative when log2_denom is 0, which makes
 * the shift undefined. The line before this one is not visible; presumably it
 * guards that case - confirm.
 */
36 offset += 1 << (log2_denom - 1);
/* One row per iteration; block advances by stride bytes. */
38 for (y=0; y<height; y++, block+=stride) {
/* Shuffle the halfwords of $f6 and $f8, using $f20 as the control word. */
46 "pshufh $f6, $f6, $f20 \r\n"
47 "pshufh $f8, $f8, $f20 \r\n"
/*
 * Interleave the bytes of $f2 and $f4 with $f20, producing halfword lanes:
 * high halves go to $f14 and $f16, low halves stay in $f2 and $f4.
 */
48 "punpckhbh $f14, $f2, $f20 \r\n"
49 "punpckhbh $f16, $f4, $f20 \r\n"
50 "punpcklbh $f2, $f2, $f20 \r\n"
51 "punpcklbh $f4, $f4, $f20 \r\n"
/* Multiply every lane by $f6. */
52 "pmullh $f14, $f14, $f6 \r\n"
53 "pmullh $f16, $f16, $f6 \r\n"
54 "pmullh $f2, $f2, $f6 \r\n"
55 "pmullh $f4, $f4, $f6 \r\n"
/* Add $f8 to every lane with signed saturation. */
56 "paddsh $f14, $f14, $f8 \r\n"
57 "paddsh $f16, $f16, $f8 \r\n"
58 "paddsh $f2, $f2, $f8 \r\n"
59 "paddsh $f4, $f4, $f8 \r\n"
/* Arithmetic right shift of every lane by $f10. */
60 "psrah $f14, $f14, $f10 \r\n"
61 "psrah $f16, $f16, $f10 \r\n"
62 "psrah $f2, $f2, $f10 \r\n"
63 "psrah $f4, $f4, $f10 \r\n"
/* Pack low and high halves back into bytes with unsigned saturation. */
64 "packushb $f2, $f2, $f14 \r\n"
65 "packushb $f4, $f4, $f16 \r\n"
/*
 * Outputs: the two memory operands of the current row.
 * NOTE(review): both are declared write-only (=m) and no memory inputs are
 * listed, yet the pixels to weight can only come from block. If the asm
 * loads through these operands they should be read-write (+m) - confirm.
 * NOTE(review): no clobber list is visible in this copy although $f2..$f20
 * are written - confirm that they are declared.
 */
68 : "=m"(*block),"=m"(*(block + 8))
/* Inputs: the three scalars, each in a general-purpose register. */
69 : "r"(weight),"r"(offset),"r"(log2_denom)
/*
 * Blend rows of 8-bit pixels from src into dst with two weights, using
 * Loongson MMI inline assembly. Each row is handled as two passes over the
 * memory operands at offsets 0 and 8.
 *
 * dst        - destination rows; also listed as an asm input
 * src        - source rows
 * stride     - byte step added to both dst and src after every row
 * height     - number of rows processed
 * log2_denom - scales offset below; log2_denom + 1 is passed to the asm
 * weightd    - passed to the asm as an operand
 * weights    - passed to the asm as an operand
 * offset     - used below; its declaration is on a line not visible here
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((a * $f6 + b * $f8 + $f10) >> $f12)
 * where a and b are the unpacked bytes of $f2 and $f4.
 *
 * NOTE(review): lines are missing from this copy (end of the parameter list,
 * opening brace, asm statement header, register loads, stores and closing
 * lines). Presumably $f2/$f4 hold src/dst bytes, $f6/$f8 hold
 * weights/weightd, $f10 holds offset, $f12 holds log2_denom + 1 and $f20 is
 * zero - confirm against the complete file.
 */
74 void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
75 int stride, int height, int log2_denom, int weightd, int weights,
/*
 * Add one, force bit 0 on, then scale by log2_denom. Bit log2_denom of the
 * result is therefore always set; presumably it serves as the rounding term
 * for the final shift by log2_denom + 1.
 */
80 offset = ((offset + 1) | 1) << log2_denom;
/* One row per iteration; dst and src both advance by stride bytes. */
82 for (y=0; y<height; y++, dst+=stride, src+=stride) {
/* Shuffle the halfwords of $f6, $f8 and $f10 with $f20 as control word. */
91 "pshufh $f6, $f6, $f20 \r\n"
92 "pshufh $f8, $f8, $f20 \r\n"
93 "pshufh $f10, $f10, $f20 \r\n"
/*
 * First pass. Interleave the bytes of $f2 and $f4 with $f20: high halves go
 * to $f14 and $f16, low halves stay in $f2 and $f4.
 */
94 "punpckhbh $f14, $f2, $f20 \r\n"
95 "punpckhbh $f16, $f4, $f20 \r\n"
96 "punpcklbh $f2, $f2, $f20 \r\n"
97 "punpcklbh $f4, $f4, $f20 \r\n"
/* Lanes that came from $f2 are scaled by $f6, lanes from $f4 by $f8. */
98 "pmullh $f14, $f14, $f6 \r\n"
99 "pmullh $f16, $f16, $f8 \r\n"
100 "pmullh $f2, $f2, $f6 \r\n"
101 "pmullh $f4, $f4, $f8 \r\n"
/* Add $f10, then the other product, both with signed saturation. */
102 "paddsh $f14, $f14, $f10 \r\n"
103 "paddsh $f2, $f2, $f10 \r\n"
104 "paddsh $f14, $f14, $f16 \r\n"
105 "paddsh $f2, $f2, $f4 \r\n"
/* Arithmetic right shift by $f12, then pack to bytes with saturation. */
106 "psrah $f14, $f14, $f12 \r\n"
107 "psrah $f2, $f2, $f12 \r\n"
108 "packushb $f2, $f2, $f14 \r\n"
/*
 * Second pass: the same sequence again.
 * NOTE(review): the lines between the two passes are not visible; presumably
 * they store the first result and load the bytes at src + 8 and dst + 8 -
 * confirm.
 */
112 "punpckhbh $f14, $f2, $f20 \r\n"
113 "punpckhbh $f16, $f4, $f20 \r\n"
114 "punpcklbh $f2, $f2, $f20 \r\n"
115 "punpcklbh $f4, $f4, $f20 \r\n"
116 "pmullh $f14, $f14, $f6 \r\n"
117 "pmullh $f16, $f16, $f8 \r\n"
118 "pmullh $f2, $f2, $f6 \r\n"
119 "pmullh $f4, $f4, $f8 \r\n"
120 "paddsh $f14, $f14, $f10 \r\n"
121 "paddsh $f2, $f2, $f10 \r\n"
122 "paddsh $f14, $f14, $f16 \r\n"
123 "paddsh $f2, $f2, $f4 \r\n"
124 "psrah $f14, $f14, $f12 \r\n"
125 "psrah $f2, $f2, $f12 \r\n"
126 "packushb $f2, $f2, $f14 \r\n"
/* Outputs: the two destination memory operands of the current row. */
128 : "=m"(*dst),"=m"(*(dst+8))
/*
 * Inputs: source and destination memory at offsets 0 and 8, followed by the
 * scalars in general-purpose registers. The shift operand is log2_denom + 1.
 * NOTE(review): no clobber list is visible in this copy although $f2..$f20
 * are written - confirm that they are declared.
 */
129 : "m"(*src),"m"(*dst),"m"(*(src+8)),"m"(*(dst+8)),
130 "r"(weights),"r"(weightd),"r"(offset),"r"(log2_denom+1)
/*
 * Weight rows of 8-bit pixels using Loongson MMI inline assembly; one 64-bit
 * register ($f2) of pixel bytes is processed per row.
 *
 * block      - first row of pixels
 * stride     - byte step added to block after every row
 * height     - number of rows processed
 * log2_denom - scales offset below and is passed to the asm as an operand
 * weight     - passed to the asm as an operand
 * offset     - scaled and biased below, then passed to the asm as an operand
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((pixel * $f6 + $f8) >> $f10)
 *
 * NOTE(review): lines are missing from this copy (opening brace, loop counter
 * declaration, asm statement header, register loads, store, output
 * constraints and closing lines). Presumably $f6, $f8 and $f10 carry weight,
 * offset and log2_denom - confirm against the complete file.
 */
135 void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
136 int log2_denom, int weight, int offset)
/* Bring offset to the same scale as the weighted pixel products. */
140 offset <<= log2_denom;
/*
 * Add half of the divisor so that a later right shift by log2_denom rounds
 * to nearest.
 * NOTE(review): log2_denom - 1 is negative when log2_denom is 0, which makes
 * the shift undefined. The line before this one is not visible; presumably it
 * guards that case - confirm.
 */
143 offset += 1 << (log2_denom - 1);
/* One row per iteration; block advances by stride bytes. */
145 for (y=0; y<height; y++, block+=stride) {
/* Clear $f20 by moving integer register $0 into it. */
151 "dmtc1 $0, $f20 \r\n"
/* Shuffle the halfwords of $f6 and $f8 with the zero control word. */
152 "pshufh $f6, $f6, $f20 \r\n"
153 "pshufh $f8, $f8, $f20 \r\n"
/* Interleave the bytes of $f2 with zero: high half to $f14, low half to $f2. */
154 "punpckhbh $f14, $f2, $f20 \r\n"
155 "punpcklbh $f2, $f2, $f20 \r\n"
/* Multiply every lane by $f6. */
156 "pmullh $f14, $f14, $f6 \r\n"
157 "pmullh $f2, $f2, $f6 \r\n"
/* Add $f8 to every lane with signed saturation. */
158 "paddsh $f14, $f14, $f8 \r\n"
159 "paddsh $f2, $f2, $f8 \r\n"
/* Arithmetic right shift of every lane by $f10. */
160 "psrah $f14, $f14, $f10 \r\n"
161 "psrah $f2, $f2, $f10 \r\n"
/* Pack both halves back into bytes with unsigned saturation. */
162 "packushb $f2, $f2, $f14 \r\n"
/*
 * Inputs: the three scalars, each in a general-purpose register.
 * NOTE(review): the output constraint line and any clobber list are not
 * visible in this copy - confirm them in the complete file.
 */
165 : "r"(weight),"r"(offset),"r"(log2_denom)
/*
 * Blend rows of 8-bit pixels from src into dst with two weights, using
 * Loongson MMI inline assembly; one 64-bit register per operand and row.
 *
 * dst        - destination rows; also listed as an asm input
 * src        - source rows
 * stride     - byte step added to both dst and src after every row
 * height     - number of rows processed
 * log2_denom - scales offset below; log2_denom + 1 is passed to the asm
 * weightd    - passed to the asm as an operand
 * weights    - passed to the asm as an operand
 * offset     - used below; its declaration is on a line not visible here
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((a * $f6 + b * $f8 + $f10) >> $f12)
 * where a and b are the unpacked bytes of $f2 and $f4.
 *
 * NOTE(review): lines are missing from this copy (end of the parameter list,
 * opening brace, asm statement header, register loads, store, output
 * constraints and closing lines). Presumably $f2/$f4 hold src/dst bytes,
 * $f6/$f8 hold weights/weightd, $f10 holds offset and $f12 holds
 * log2_denom + 1 - confirm against the complete file.
 */
170 void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
171 int stride, int height, int log2_denom, int weightd, int weights,
/*
 * Add one, force bit 0 on, then scale by log2_denom. Bit log2_denom of the
 * result is therefore always set; presumably it serves as the rounding term
 * for the final shift by log2_denom + 1.
 */
176 offset = ((offset + 1) | 1) << log2_denom;
/* One row per iteration; dst and src both advance by stride bytes. */
178 for (y=0; y<height; y++, dst+=stride, src+=stride) {
/* Clear $f20 by moving integer register $0 into it. */
182 "dmtc1 $0, $f20 \r\n"
/* Shuffle the halfwords of $f6, $f8 and $f10 with the zero control word. */
187 "pshufh $f6, $f6, $f20 \r\n"
188 "pshufh $f8, $f8, $f20 \r\n"
189 "pshufh $f10, $f10, $f20 \r\n"
/*
 * Interleave the bytes of $f2 and $f4 with zero: high halves go to $f14 and
 * $f16, low halves stay in $f2 and $f4.
 */
190 "punpckhbh $f14, $f2, $f20 \r\n"
191 "punpckhbh $f16, $f4, $f20 \r\n"
192 "punpcklbh $f2, $f2, $f20 \r\n"
193 "punpcklbh $f4, $f4, $f20 \r\n"
/* Lanes that came from $f2 are scaled by $f6, lanes from $f4 by $f8. */
194 "pmullh $f14, $f14, $f6 \r\n"
195 "pmullh $f16, $f16, $f8 \r\n"
196 "pmullh $f2, $f2, $f6 \r\n"
197 "pmullh $f4, $f4, $f8 \r\n"
/* Add $f10, then the other product, both with signed saturation. */
198 "paddsh $f14, $f14, $f10 \r\n"
199 "paddsh $f2, $f2, $f10 \r\n"
200 "paddsh $f14, $f14, $f16 \r\n"
201 "paddsh $f2, $f2, $f4 \r\n"
/* Arithmetic right shift by $f12, then pack to bytes with saturation. */
202 "psrah $f14, $f14, $f12 \r\n"
203 "psrah $f2, $f2, $f12 \r\n"
204 "packushb $f2, $f2, $f14 \r\n"
/*
 * Inputs: source and destination memory, followed by the scalars in
 * general-purpose registers. The shift operand is log2_denom + 1.
 * NOTE(review): the output constraint line and any clobber list are not
 * visible in this copy - confirm them in the complete file.
 */
207 : "m"(*src),"m"(*dst),"r"(weights),
208 "r"(weightd),"r"(offset),"r"(log2_denom+1)
/*
 * Weight rows of 8-bit pixels using Loongson MMI inline assembly; only the
 * low half of $f2 is unpacked, so four halfword lanes are processed per row.
 *
 * block      - first row of pixels
 * stride     - byte step added to block after every row
 * height     - number of rows processed
 * log2_denom - scales offset below and is passed to the asm as an operand
 * weight     - passed to the asm as an operand
 * offset     - scaled and biased below, then passed to the asm as an operand
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((pixel * $f6 + $f8) >> $f10)
 *
 * NOTE(review): lines are missing from this copy (opening brace, loop counter
 * declaration, asm statement header, register loads, store, output
 * constraints and closing lines). Presumably $f6, $f8 and $f10 carry weight,
 * offset and log2_denom - confirm against the complete file.
 */
213 void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
214 int log2_denom, int weight, int offset)
/* Bring offset to the same scale as the weighted pixel products. */
218 offset <<= log2_denom;
/*
 * Add half of the divisor so that a later right shift by log2_denom rounds
 * to nearest.
 * NOTE(review): log2_denom - 1 is negative when log2_denom is 0, which makes
 * the shift undefined. The line before this one is not visible; presumably it
 * guards that case - confirm.
 */
221 offset += 1 << (log2_denom - 1);
/* One row per iteration; block advances by stride bytes. */
223 for (y=0; y<height; y++, block+=stride) {
/* Clear $f20 by moving integer register $0 into it. */
229 "dmtc1 $0, $f20 \r\n"
/* Shuffle the halfwords of $f6 and $f8 with the zero control word. */
230 "pshufh $f6, $f6, $f20 \r\n"
231 "pshufh $f8, $f8, $f20 \r\n"
/* Interleave the low bytes of $f2 with zero to get four halfword lanes. */
232 "punpcklbh $f2, $f2, $f20 \r\n"
/* Multiply by $f6, add $f8 with signed saturation, shift right by $f10. */
233 "pmullh $f2, $f2, $f6 \r\n"
234 "paddsh $f2, $f2, $f8 \r\n"
235 "psrah $f2, $f2, $f10 \r\n"
/* Pack to bytes with unsigned saturation; the other half comes from zero. */
236 "packushb $f2, $f2, $f20 \r\n"
/*
 * Inputs: the three scalars, each in a general-purpose register.
 * NOTE(review): the output constraint line and any clobber list are not
 * visible in this copy - confirm them in the complete file.
 */
239 : "r"(weight),"r"(offset),"r"(log2_denom)
/*
 * Blend rows of 8-bit pixels from src into dst with two weights, using
 * Loongson MMI inline assembly; only the low halves of $f2 and $f4 are
 * unpacked, so four halfword lanes are processed per row.
 *
 * dst        - destination rows; also listed as an asm input
 * src        - source rows
 * stride     - byte step added to both dst and src after every row
 * height     - number of rows processed
 * log2_denom - scales offset below; log2_denom + 1 is passed to the asm
 * weightd    - passed to the asm as an operand
 * weights    - passed to the asm as an operand
 * offset     - used below; its declaration is on a line not visible here
 *
 * Per halfword lane the visible instructions compute
 *     pack_unsigned_saturate((a * $f6 + b * $f8 + $f10) >> $f12)
 * where a and b are the unpacked low bytes of $f2 and $f4.
 *
 * NOTE(review): lines are missing from this copy (end of the parameter list,
 * opening brace, asm statement header, register loads, store, output
 * constraints and closing lines). Presumably $f2/$f4 hold src/dst bytes,
 * $f6/$f8 hold weights/weightd, $f10 holds offset and $f12 holds
 * log2_denom + 1 - confirm against the complete file.
 */
244 void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
245 int stride, int height, int log2_denom, int weightd, int weights,
/*
 * Add one, force bit 0 on, then scale by log2_denom. Bit log2_denom of the
 * result is therefore always set; presumably it serves as the rounding term
 * for the final shift by log2_denom + 1.
 */
250 offset = ((offset + 1) | 1) << log2_denom;
/* One row per iteration; dst and src both advance by stride bytes. */
252 for (y=0; y<height; y++, dst+=stride, src+=stride) {
/* Clear $f20 by moving integer register $0 into it. */
256 "dmtc1 $0, $f20 \r\n"
/* Shuffle the halfwords of $f6, $f8 and $f10 with the zero control word. */
261 "pshufh $f6, $f6, $f20 \r\n"
262 "pshufh $f8, $f8, $f20 \r\n"
263 "pshufh $f10, $f10, $f20 \r\n"
/* Interleave the low bytes of $f2 and $f4 with zero into halfword lanes. */
264 "punpcklbh $f2, $f2, $f20 \r\n"
265 "punpcklbh $f4, $f4, $f20 \r\n"
/* Scale the $f2 lanes by $f6 and the $f4 lanes by $f8. */
266 "pmullh $f2, $f2, $f6 \r\n"
267 "pmullh $f4, $f4, $f8 \r\n"
/* Add $f10, then the other product, both with signed saturation. */
268 "paddsh $f2, $f2, $f10 \r\n"
269 "paddsh $f2, $f2, $f4 \r\n"
/* Arithmetic right shift by $f12. */
270 "psrah $f2, $f2, $f12 \r\n"
/* Pack to bytes with unsigned saturation; the other half comes from zero. */
271 "packushb $f2, $f2, $f20 \r\n"
/*
 * Inputs: source and destination memory, followed by the scalars in
 * general-purpose registers. The shift operand is log2_denom + 1.
 * NOTE(review): the output constraint line and any clobber list are not
 * visible in this copy - confirm them in the complete file.
 */
274 : "m"(*src),"m"(*dst),"r"(weights),
275 "r"(weightd),"r"(offset),"r"(log2_denom+1)