/*****************************************************************************
 * predict.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
24 #include "common/common.h"
28 extern void predict_16x16_v_mmx( uint8_t *src );
29 extern void predict_16x16_h_mmxext( uint8_t *src );
30 extern void predict_16x16_h_ssse3( uint8_t *src );
31 extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
32 extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
33 extern void predict_16x16_dc_top_mmxext( uint8_t *src );
34 extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
35 extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
36 extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
37 extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
38 extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
39 extern void predict_8x8c_v_mmx( uint8_t *src );
40 extern void predict_8x8c_h_mmxext( uint8_t *src );
41 extern void predict_8x8c_h_ssse3( uint8_t *src );
42 extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
43 extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
44 extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
45 extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
46 extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
47 extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
48 extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
49 extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
50 extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
51 extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
52 extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
53 extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
54 extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
55 extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
56 extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
57 extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
58 extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
59 extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
60 extern void predict_8x8_filter_mmxext ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
61 extern void predict_8x8_filter_ssse3 ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
62 extern void predict_4x4_ddl_mmxext( uint8_t *src );
63 extern void predict_4x4_ddr_mmxext( uint8_t *src );
64 extern void predict_4x4_vl_mmxext( uint8_t *src );
65 extern void predict_4x4_vr_mmxext( uint8_t *src );
66 extern void predict_4x4_vr_ssse3( uint8_t *src );
67 extern void predict_4x4_hd_mmxext( uint8_t *src );
68 extern void predict_4x4_hd_ssse3( uint8_t *src );
69 extern void predict_4x4_dc_mmxext( uint8_t *src );
70 extern void predict_4x4_ddr_ssse3( uint8_t *src );
71 extern void predict_4x4_hu_mmxext( uint8_t *src );
72 extern void predict_16x16_dc_top_sse2( uint8_t *src );
73 extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
74 extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
75 extern void predict_16x16_v_sse2( uint8_t *src );
76 extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
78 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
79 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
80 ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
/* Accumulate one weighted term of the plane-fit gradients:
 * H from the top row (centred on column j), V from the left column
 * (centred on row j).  Caller must declare int H, V.
 * NOTE(review): the original last line ended in a stray '\' continuation;
 * dropped here so the macro cannot swallow the following source line. */
#define PREDICT_P_SUM(j,i)\
    H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
    V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
86 #define PREDICT_16x16_P(name)\
87 static void predict_16x16_p_##name( uint8_t *src )\
101 a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
102 b = ( 5 * H + 32 ) >> 6;\
103 c = ( 5 * V + 32 ) >> 6;\
104 i00 = a - b * 7 - c * 7 + 16;\
105 predict_16x16_p_core_##name( src, i00, b, c );\
109 PREDICT_16x16_P( mmxext )
111 PREDICT_16x16_P( sse2 )
114 static void predict_16x16_p_ssse3( uint8_t *src )
120 "movq 8+%1, %%mm0 \n"
121 "palignr $7, -8+%1, %%mm1 \n"
122 "pmaddubsw %2, %%mm0 \n"
123 "pmaddubsw %3, %%mm1 \n"
124 "paddw %%mm1, %%mm0 \n"
125 "pshufw $14, %%mm0, %%mm1 \n"
126 "paddw %%mm1, %%mm0 \n"
127 "pshufw $1, %%mm0, %%mm1 \n"
128 "paddw %%mm1, %%mm0 \n"
132 :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
134 V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
135 + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
136 + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
137 + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )
138 + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )
139 + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )
140 + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )
141 + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
142 a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
143 b = ( 5 * H + 32 ) >> 6;
144 c = ( 5 * V + 32 ) >> 6;
145 i00 = a - b * 7 - c * 7 + 16;
146 predict_16x16_p_core_sse2( src, i00, b, c );
150 #define PREDICT_8x8_P(name)\
151 static void predict_8x8c_p_##name( uint8_t *src )\
161 a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
162 b = ( 17 * H + 16 ) >> 5;\
163 c = ( 17 * V + 16 ) >> 5;\
164 i00 = a -3*b -3*c + 16;\
165 predict_8x8c_p_core_##name( src, i00, b, c );\
169 PREDICT_8x8_P( mmxext )
171 PREDICT_8x8_P( sse2 )
174 static void predict_8x8c_p_ssse3( uint8_t *src )
180 "pmaddubsw %2, %%mm0 \n"
181 "pshufw $14, %%mm0, %%mm1 \n"
182 "paddw %%mm1, %%mm0 \n"
183 "pshufw $1, %%mm0, %%mm1 \n"
184 "paddw %%mm1, %%mm0 \n"
188 :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
190 V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
191 + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
192 + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
193 + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
194 H += -4 * src[-1*FDEC_STRIDE -1];
195 a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
196 b = ( 17 * H + 16 ) >> 5;
197 c = ( 17 * V + 16 ) >> 5;
198 i00 = a -3*b -3*c + 16;
199 predict_8x8c_p_core_sse2( src, i00, b, c );
203 #define PREDICT_16x16_DC(name)\
204 static void predict_16x16_dc_##name( uint8_t *src )\
208 for( i = 0; i < 16; i+=2 )\
210 dc += src[-1 + i * FDEC_STRIDE];\
211 dc += src[-1 + (i+1) * FDEC_STRIDE];\
213 predict_16x16_dc_core_##name( src, dc );\
216 PREDICT_16x16_DC( mmxext )
217 PREDICT_16x16_DC( sse2 )
219 #define PREDICT_16x16_DC_LEFT(name)\
220 static void predict_16x16_dc_left_##name( uint8_t *src )\
224 for( i = 0; i < 16; i+=2 )\
226 dc += src[-1 + i * FDEC_STRIDE];\
227 dc += src[-1 + (i+1) * FDEC_STRIDE];\
229 predict_16x16_dc_left_core_##name( src, dc>>4 );\
232 PREDICT_16x16_DC_LEFT( mmxext )
233 PREDICT_16x16_DC_LEFT( sse2 )
235 static void predict_8x8c_dc_mmxext( uint8_t *src )
238 + src[-1 + 0*FDEC_STRIDE]
239 + src[-1 + 1*FDEC_STRIDE]
240 + src[-1 + 2*FDEC_STRIDE]
241 + src[-1 + 3*FDEC_STRIDE];
244 + src[-1 + 4*FDEC_STRIDE]
245 + src[-1 + 5*FDEC_STRIDE]
246 + src[-1 + 6*FDEC_STRIDE]
247 + src[-1 + 7*FDEC_STRIDE];
249 predict_8x8c_dc_core_mmxext( src, s2, s3 );
253 static void predict_8x8c_dc_left( uint8_t *src )
256 uint32_t s0 = 0, s1 = 0;
259 for( y = 0; y < 4; y++ )
261 s0 += src[y * FDEC_STRIDE - 1];
262 s1 += src[(y+4) * FDEC_STRIDE - 1];
264 dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL;
265 dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL;
267 for( y = 0; y < 4; y++ )
272 for( y = 0; y < 4; y++ )
/****************************************************************************
 * 8x8 prediction for intra luma block
 ****************************************************************************/

/* Load one left (l0..l7) or top (t0..t7) neighbour pixel from the filtered
 * edge buffer.  From the indices used here: edge[14-y] holds the left
 * column (l0 nearest the top-left), edge[15] the top-left corner, and
 * edge[16+x] the top row -- TODO confirm against predict_8x8_filter.
 * NOTE(review): the PL/PT/#define heads and the PREDICT_8x8_DC store
 * lines were missing from this listing; restored. */
#define PL(y) \
    UNUSED int l##y = edge[14-y];
#define PT(x) \
    UNUSED int t##x = edge[16+x];
#define PREDICT_8x8_LOAD_TOPLEFT \
    int lt = edge[15];
#define PREDICT_8x8_LOAD_LEFT \
    PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
#define PREDICT_8x8_LOAD_TOP \
    PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)

/* Fill the whole 8x8 block with the (already byte-replicated) value v,
 * two 32-bit stores per row. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += FDEC_STRIDE; \
    }

/* Pixel accessor inside the decoded-frame scratch block. */
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
307 static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
309 predict_8x8_vr_core_mmxext( src, edge );
311 PREDICT_8x8_LOAD_TOPLEFT
312 PREDICT_8x8_LOAD_LEFT
313 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
314 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
315 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
316 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
317 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
318 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
323 #define SUMSUB(a,b,c,d,e,f,g,h)\
329 #define INTRA_SA8D_X3(cpu) \
330 void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
332 PREDICT_8x8_LOAD_TOP\
333 PREDICT_8x8_LOAD_LEFT\
335 ALIGNED_16( int16_t sa8d_1d[2][8] );\
336 SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
337 SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
338 SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
347 SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\
348 SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\
349 SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\
358 x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\
365 INTRA_SA8D_X3(mmxext)
368 /****************************************************************************
369 * Exported functions:
370 ****************************************************************************/
371 void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
373 if( !(cpu&X264_CPU_MMX) )
375 pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
376 if( !(cpu&X264_CPU_MMXEXT) )
378 pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
379 pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
380 pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
382 pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
384 pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
385 if( !(cpu&X264_CPU_SSE2) )
387 pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
388 pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
389 if( cpu&X264_CPU_SSE2_IS_SLOW )
391 pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
392 pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
393 pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
394 if( !(cpu&X264_CPU_SSSE3) )
396 pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
398 pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
402 void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
404 if( !(cpu&X264_CPU_MMX) )
407 pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
409 pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
410 if( !(cpu&X264_CPU_MMXEXT) )
412 pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top_mmxext;
413 pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
415 pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
417 pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
418 if( !(cpu&X264_CPU_SSE2) )
420 pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
421 if( !(cpu&X264_CPU_SSSE3) )
423 pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
425 pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
429 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
431 if( !(cpu&X264_CPU_MMXEXT) )
433 pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
434 pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
435 pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
436 pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
437 pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
438 pf[I_PRED_8x8_HD] = predict_8x8_hd_mmxext;
439 *predict_8x8_filter = predict_8x8_filter_mmxext;
441 pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
442 pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
443 pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
444 pf[I_PRED_8x8_HU] = predict_8x8_hu_mmxext;
446 if( !(cpu&X264_CPU_SSE2) )
448 pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
449 pf[I_PRED_8x8_VL] = predict_8x8_vl_sse2;
450 pf[I_PRED_8x8_VR] = predict_8x8_vr_sse2;
451 pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
452 pf[I_PRED_8x8_HD] = predict_8x8_hd_sse2;
453 pf[I_PRED_8x8_HU] = predict_8x8_hu_sse2;
454 if( !(cpu&X264_CPU_SSSE3) )
456 pf[I_PRED_8x8_HD] = predict_8x8_hd_ssse3;
457 pf[I_PRED_8x8_HU] = predict_8x8_hu_ssse3;
458 *predict_8x8_filter = predict_8x8_filter_ssse3;
461 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
463 if( !(cpu&X264_CPU_MMXEXT) )
465 pf[I_PRED_4x4_VR] = predict_4x4_vr_mmxext;
466 pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
467 pf[I_PRED_4x4_VL] = predict_4x4_vl_mmxext;
468 pf[I_PRED_4x4_DC] = predict_4x4_dc_mmxext;
469 pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
470 pf[I_PRED_4x4_HD] = predict_4x4_hd_mmxext;
471 pf[I_PRED_4x4_HU] = predict_4x4_hu_mmxext;
472 if( !(cpu&X264_CPU_SSSE3) )
474 pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
475 pf[I_PRED_4x4_VR] = predict_4x4_vr_ssse3;
476 pf[I_PRED_4x4_HD] = predict_4x4_hd_ssse3;