2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/cpu.h"
22 #include "libavutil/x86_cpu.h"
23 #include "libavcodec/h264dsp.h"
24 #include "dsputil_mmx.h"
/* 8-byte-aligned packed-byte constants: alternating {1,3,1,3,...} and
 * {7,3,7,3,...} byte pairs.
 * NOTE(review): not referenced in the visible part of this listing —
 * presumably consumed by the deblocking asm; confirm against the .asm files. */
26 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
27 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
29 /***********************************/
/* Prototypes for the H.264 inverse-transform routines implemented in x86
 * assembly.  Naming: _mmx / _mmx2 / _sse2 select the instruction set; the
 * function pointers are wired up in ff_h264dsp_init_x86() below. */

/* Single 4x4 / 8x8 IDCT + add to destination. */
32 void ff_h264_idct_add_mmx     (uint8_t *dst, int16_t *block, int stride);
33 void ff_h264_idct8_add_mmx    (uint8_t *dst, int16_t *block, int stride);
34 void ff_h264_idct8_add_sse2   (uint8_t *dst, int16_t *block, int stride);
/* DC-only fast paths. */
35 void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
36 void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);

/* Batched variants: iterate over block_offset[] and skip blocks whose
 * non-zero-coefficient count (nnzc) is zero. */
38 void ff_h264_idct_add16_mmx      (uint8_t *dst, const int *block_offset,
39                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
40 void ff_h264_idct8_add4_mmx      (uint8_t *dst, const int *block_offset,
41                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
42 void ff_h264_idct_add16_mmx2     (uint8_t *dst, const int *block_offset,
43                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
44 void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
45                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
46 void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
47                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
48 void ff_h264_idct8_add4_mmx2     (uint8_t *dst, const int *block_offset,
49                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
50 void ff_h264_idct8_add4_sse2     (uint8_t *dst, const int *block_offset,
51                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
/* Chroma variants take an array of plane pointers (uint8_t **dest). */
52 void ff_h264_idct_add8_mmx       (uint8_t **dest, const int *block_offset,
53                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
54 void ff_h264_idct_add8_mmx2      (uint8_t **dest, const int *block_offset,
55                                   DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

57 void ff_h264_idct_add16_sse2     (uint8_t *dst, const int *block_offset, DCTELEM *block,
58                                   int stride, const uint8_t nnzc[6*8]);
59 void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block,
60                                   int stride, const uint8_t nnzc[6*8]);
61 void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
62                                   int stride, const uint8_t nnzc[6*8]);
64 /***********************************/
/* Compute H.264 deblocking-filter boundary strengths (bS) with MMX2 SIMD.
 *
 * For each edge of the macroblock, a per-4-pixel strength is derived from:
 *   - non-zero coefficient flags (nnz)        -> clamped via pminub/pmaxub,
 *   - reference-index differences (ref)       -> psubb comparisons,
 *   - motion-vector differences >= limit (mv) -> psubw/psubusb saturation.
 * dir==1 compares against the block above (d_idx = -8 entries), dir==0
 * against the block to the left (d_idx = -1).  mask_mv0/mask_mv1 let the
 * caller skip the ref/mv test on selected edges (e.g. MBAFF handling).
 *
 * NOTE(review): this listing omits some original lines (asm statement
 * open/close, loop braces); the comments below describe only the visible
 * instructions. */
67 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
68                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
/* Inputs to a (partially elided) asm setup block: byte constants 1 and 3
 * loaded into MMX registers for later clamping. */
73         ::"m"(ff_pb_1), "m"(ff_pb_3)
/* Field mode: double the mv-difference threshold held in mm5 (4 -> for
 * field macroblocks the vertical mv limit differs). */
81             "movq %%mm6, %%mm5 \n"
82             "paddb %%mm5, %%mm5 \n"
85     // could do a special case for dir==0 && edges==1, but it only reduces the
86     // average filter time by 1.2%
87     for( dir=1; dir>=0; dir-- ) {
        /* Neighbour offset: -8 = row above, -1 = column to the left. */
88         const x86_reg d_idx = dir ? -8 : -1;
89         const int mask_mv = dir ? mask_mv1 : mask_mv0;
        /* All-ones mask for dir==1 so the first (top) edge can be forced. */
90         DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
        /* b_idx starts at 12: the scan skips the 3-entry border rows of the
         * 8-wide nnz/ref/mv layout. */
92         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
97             if(!(mask_mv & edge)) {
                /* Bidirectional case: refs/mvs must be compared in both
                 * L0/L1 pairings; pshufw $0x4E swaps the two halves so both
                 * orders are tested. */
100                     "movd         (%1,%0), %%mm2 \n"
101                     "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
102                     "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
103                     "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
104                     "pshufw $0x4E, %%mm2, %%mm3 \n"
105                     "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
106                     "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
                /* mv difference: |mv[b]-mv[bn]| >= 4 tested via biased
                 * unsigned saturating subtract (paddb mm6 / psubusb mm5). */
108                     "por           %%mm1, %%mm0 \n"
109                     "movq   (%2,%0,4), %%mm1 \n"
110                     "movq  8(%2,%0,4), %%mm2 \n"
111                     "movq          %%mm1, %%mm3 \n"
112                     "movq          %%mm2, %%mm4 \n"
113                     "psubw          (%2), %%mm1 \n"
114                     "psubw         8(%2), %%mm2 \n"
115                     "psubw       160(%2), %%mm3 \n"
116                     "psubw       168(%2), %%mm4 \n"
117                     "packsswb      %%mm2, %%mm1 \n"
118                     "packsswb      %%mm4, %%mm3 \n"
119                     "paddb         %%mm6, %%mm1 \n"
120                     "paddb         %%mm6, %%mm3 \n"
121                     "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
122                     "psubusb       %%mm5, %%mm3 \n"
123                     "packsswb      %%mm3, %%mm1 \n"
                /* Combine both pairings; pminub keeps an edge unfiltered only
                 * if BOTH L0/L1 orderings matched. */
128                     "pshufw $0x4E, %%mm1, %%mm1 \n"
129                     "por           %%mm1, %%mm0 \n"
130                     "pshufw $0x4E, %%mm0, %%mm1 \n"
131                     "pminub        %%mm1, %%mm0 \n"
                /* Unidirectional case: single ref/mv comparison. */
138                     "movd        (%1), %%mm0 \n"
139                     "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
140                     "movq        (%2), %%mm1 \n"
141                     "movq       8(%2), %%mm2 \n"
142                     "psubw  (%2,%0,4), %%mm1 \n"
143                     "psubw 8(%2,%0,4), %%mm2 \n"
144                     "packsswb   %%mm2, %%mm1 \n"
145                     "paddb      %%mm6, %%mm1 \n"
146                     "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
147                     "packsswb   %%mm1, %%mm1 \n"
148                     "por        %%mm1, %%mm0 \n"
            /* nnz contribution: any non-zero coefficient on either side of
             * the edge yields the stronger bS=2; clamp and widen to int16. */
157                 "por     %1, %%mm1 \n" // nnz[b] || nnz[bn]
159                  "m"(nnz[b_idx+d_idx])
162                 "pminub    %%mm7, %%mm1 \n"
163                 "pminub    %%mm7, %%mm0 \n"
165                 "pxor      %%mm2, %%mm2 \n"
166                 "pmaxub    %%mm0, %%mm1 \n"
167                 "punpcklbw %%mm2, %%mm1 \n"
            /* Output operand: one row of strengths for this edge. */
169                 :"=m"(*bS[dir][edge])
    /* Vertical edges were computed transposed; swap the 4x4 group of 16-bit
     * strengths back into per-edge order (output register order of
     * TRANSPOSE4 is mm0,mm3,mm4,mm2). */
177         "movq   (%0), %%mm0 \n\t"
178         "movq  8(%0), %%mm1 \n\t"
179         "movq 16(%0), %%mm2 \n\t"
180         "movq 24(%0), %%mm3 \n\t"
181         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
182         "movq %%mm0,   (%0) \n\t"
183         "movq %%mm3,  8(%0) \n\t"
184         "movq %%mm4, 16(%0) \n\t"
185         "movq %%mm2, 24(%0) \n\t"
/* Declaration generators for the yasm deblocking-filter entry points.
 * LF_FUNC declares a normal (tc0-driven) filter, LF_IFUNC an intra filter
 * (no tc0 parameter). */
191 #define LF_FUNC(DIR, TYPE, OPT) \
192 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
193                                                int alpha, int beta, int8_t *tc0);
194 #define LF_IFUNC(DIR, TYPE, OPT) \
195 void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
196                                                int alpha, int beta);

/* mmxext chroma and horizontal-luma filters (asm-implemented). */
198 LF_FUNC (h,  chroma,       mmxext)
199 LF_IFUNC(h,  chroma_intra, mmxext)
200 LF_FUNC (v,  chroma,       mmxext)
201 LF_IFUNC(v,  chroma_intra, mmxext)

203 LF_FUNC (h,  luma,         mmxext)
204 LF_IFUNC(h,  luma_intra,   mmxext)
/* On 32-bit x86 only 8 MMX registers are available, so the 16-pixel-wide
 * vertical luma filter is built from two calls to an 8-pixel asm helper.
 * NOTE(review): the wrappers' braces / #endif are elided in this listing. */
205 #if HAVE_YASM && ARCH_X86_32
206 LF_FUNC (v8, luma,         mmxext)
207 static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
    /* (tc0[a] & tc0[b]) >= 0 is true unless BOTH tc0 values are -1 (sign bit
     * set in both), i.e. each 8-pixel half is skipped only when fully
     * unfiltered. */
209     if((tc0[0] & tc0[1]) >= 0)
210         ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
211     if((tc0[2] & tc0[3]) >= 0)
212         ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
214 LF_IFUNC(v8, luma_intra,   mmxext)
/* Intra variant has no tc0, so both halves are always filtered. */
215 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
217     ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
218     ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
/* SSE2 luma deblocking filters (handle the full 16-pixel width directly). */
222 LF_FUNC (h,  luma,         sse2)
223 LF_IFUNC(h,  luma_intra,   sse2)
224 LF_FUNC (v,  luma,         sse2)
225 LF_IFUNC(v,  luma_intra,   sse2)
227 /***********************************/
228 /* weighted prediction */

/* Declaration generators for the asm weighted-prediction routines:
 * H264_WEIGHT for unidirectional, H264_BIWEIGHT for bidirectional
 * (two-source) weighting, each for a WxH block size. */
230 #define H264_WEIGHT(W, H, OPT) \
231 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
232     int stride, int log2_denom, int weight, int offset);

234 #define H264_BIWEIGHT(W, H, OPT) \
235 void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
236     uint8_t *src, int stride, int log2_denom, int weightd, \
237     int weights, int offset);

239 #define H264_BIWEIGHT_MMX(W,H) \
240 H264_WEIGHT  (W, H, mmx2) \
241 H264_BIWEIGHT(W, H, mmx2)

/* Sizes of width >= 8 additionally get sse2 (and ssse3 biweight) versions. */
243 #define H264_BIWEIGHT_MMX_SSE(W,H) \
244 H264_BIWEIGHT_MMX(W, H) \
245 H264_WEIGHT      (W, H, sse2) \
246 H264_BIWEIGHT    (W, H, sse2) \
247 H264_BIWEIGHT    (W, H, ssse3)

249 H264_BIWEIGHT_MMX_SSE(16, 16)
250 H264_BIWEIGHT_MMX_SSE(16,  8)
251 H264_BIWEIGHT_MMX_SSE( 8, 16)
252 H264_BIWEIGHT_MMX_SSE( 8,  8)
253 H264_BIWEIGHT_MMX_SSE( 8,  4)
254 H264_BIWEIGHT_MMX    ( 4,  8)
255 H264_BIWEIGHT_MMX    ( 4,  4)
256 H264_BIWEIGHT_MMX    ( 4,  2)
/* Populate H264DSPContext with the fastest x86 implementation each CPU
 * supports.  Assignments are layered: MMX first, then MMX2, SSE2 and SSSE3
 * overwrite the slower pointers when the corresponding CPU flag is set.
 * NOTE(review): this function continues beyond the visible end of this
 * listing; comments cover only what is shown. */
258 void ff_h264dsp_init_x86(H264DSPContext *c)
260     int mm_flags = av_get_cpu_flags();

    /* Loop-filter strength is C-with-inline-asm, gated on MMX2 only. */
262     if (mm_flags & AV_CPU_FLAG_MMX2) {
263         c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;

266     if (mm_flags & AV_CPU_FLAG_MMX) {
        /* Baseline MMX IDCTs (the 8x8 DC add shares the full 8x8 routine
         * until MMX2 overrides it below). */
268         c->h264_idct_add= ff_h264_idct_add_mmx;
269         c->h264_idct8_dc_add=
270         c->h264_idct8_add= ff_h264_idct8_add_mmx;

272         c->h264_idct_add16     = ff_h264_idct_add16_mmx;
273         c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
274         c->h264_idct_add8      = ff_h264_idct_add8_mmx;
275         c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;

277         if (mm_flags & AV_CPU_FLAG_MMX2) {
            /* MMX2 upgrades: dedicated DC fast paths + faster batch adds. */
278             c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
279             c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
280             c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
281             c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
282             c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
283             c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;

            /* mmxext deblocking filters. */
285             c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
286             c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
287             c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
288             c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;

290             c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
291             c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
292             c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
293             c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;

            /* mmx2 weighted prediction, indices 0..7 = 16x16 down to 4x2. */
295             c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
296             c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
297             c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
298             c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
299             c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
300             c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
301             c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
302             c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

304             c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
305             c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
306             c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
307             c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
308             c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
309             c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
310             c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
311             c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

313             if (mm_flags&AV_CPU_FLAG_SSE2) {
                /* SSE2 upgrades; only widths >= 8 have SSE2 weight asm,
                 * so tab[5..7] keep the mmx2 pointers. */
314                 c->h264_idct8_add = ff_h264_idct8_add_sse2;
315                 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;

317                 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
318                 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
319                 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
320                 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
321                 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;

323                 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
324                 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
325                 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
326                 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
327                 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

329                 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
330                 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
331                 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
332                 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;

334                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;
335                 c->h264_idct_add8  = ff_h264_idct_add8_sse2;
336                 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;

338                 if (mm_flags&AV_CPU_FLAG_SSSE3) {
                    /* SSSE3 currently improves only the biweight routines. */
339                     c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
340                     c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
341                     c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
342                     c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
343                     c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;