1 /*****************************************************************************
2 * predict-c.c: intra prediction
3 *****************************************************************************
4 * Copyright (C) 2003-2010 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #include "common/common.h"
/* Prototypes for the instruction-set-specific prediction routines used below.
 * None of them is defined in the visible part of this file, so they are
 * presumably implemented elsewhere (likely in assembly -- confirm).
 * The name suffix (mmx, mmxext, sse2, ssse3) mirrors the X264_CPU_* flags
 * tested in the *_init_mmx functions at the end of the file.
 * NOTE(review): the pointer types are a mix of pixel, uint8_t and uint16_t;
 * whether they agree depends on how pixel is defined, which is not shown. */

/* 16x16 predictors. */
32 void x264_predict_16x16_v_mmx( pixel *src );
33 void x264_predict_16x16_v_sse2( pixel *src );
34 void x264_predict_16x16_h_mmxext( pixel *src );
35 void x264_predict_16x16_h_sse2( uint16_t *src );
36 void x264_predict_16x16_h_ssse3( uint8_t *src );
37 void x264_predict_16x16_dc_core_mmxext( pixel *src, int i_dc_left );
38 void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
39 void x264_predict_16x16_dc_left_core_mmxext( pixel *src, int i_dc_left );
40 void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
41 void x264_predict_16x16_dc_top_mmxext( pixel *src );
42 void x264_predict_16x16_dc_top_sse2( pixel *src );
43 void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
44 void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
45 void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
/* 8x8c predictors (installed into the I_PRED_CHROMA_* slots below). */
46 void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
47 void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
48 void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
49 void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
50 void x264_predict_8x8c_v_mmx( uint8_t *src );
51 void x264_predict_8x8c_h_mmxext( uint8_t *src );
52 void x264_predict_8x8c_h_ssse3( uint8_t *src );
/* 8x8 predictors; each takes an edge buffer declared with 33 entries. */
53 void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
54 void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[33] );
55 void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
56 void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[33] );
57 void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
58 void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
59 void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
60 void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[33] );
61 void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
62 void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[33] );
63 void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
64 void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[33] );
65 void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
66 void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
67 void x264_predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
68 void x264_predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
69 void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
70 void x264_predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
71 void x264_predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
72 void x264_predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
73 void x264_predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
74 void x264_predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
75 void x264_predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
/* Edge filters; stored through the predict_8x8_filter pointer in
 * x264_predict_8x8_init_mmx. */
76 void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
77 void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
/* 4x4 predictors. */
78 void x264_predict_4x4_ddl_mmxext( pixel *src );
79 void x264_predict_4x4_ddl_sse2( uint16_t *src );
80 void x264_predict_4x4_ddr_mmxext( uint8_t *src );
81 void x264_predict_4x4_vl_mmxext( pixel *src );
82 void x264_predict_4x4_vl_sse2( uint16_t *src );
83 void x264_predict_4x4_vr_mmxext( uint8_t *src );
84 void x264_predict_4x4_vr_ssse3( pixel *src );
85 void x264_predict_4x4_hd_mmxext( uint8_t *src );
86 void x264_predict_4x4_hd_ssse3( pixel *src );
87 void x264_predict_4x4_dc_mmxext( pixel *src );
88 void x264_predict_4x4_ddr_ssse3( pixel *src );
89 void x264_predict_4x4_hu_mmxext( uint8_t *src );
90 void x264_predict_4x4_hu_sse2( uint16_t *src );
/* PREDICT_16x16_DC(name): emits x264_predict_16x16_dc_<name>( pixel *src ).
 * The visible loop adds the 16 samples src[-1 + row*FDEC_STRIDE] for rows
 * 0..15 (two per iteration) to dc and passes the unshifted total to
 * x264_predict_16x16_dc_core_<name>().
 * Instantiated for mmxext and sse2.
 * NOTE(review): the declaration/initial value of dc and the braces are not
 * visible in this view (the embedded numbering skips lines) -- confirm
 * against the pristine file. */
92 #define PREDICT_16x16_DC(name)\
93 static void x264_predict_16x16_dc_##name( pixel *src )\
96 for( int i = 0; i < 16; i += 2 )\
98 dc += src[-1 + i * FDEC_STRIDE];\
99 dc += src[-1 + (i+1) * FDEC_STRIDE];\
101 x264_predict_16x16_dc_core_##name( src, dc );\
104 PREDICT_16x16_DC( mmxext )
105 PREDICT_16x16_DC( sse2 )
/* PREDICT_16x16_DC_LEFT(name): emits x264_predict_16x16_dc_left_<name>().
 * Accumulates the same 16 samples src[-1 + row*FDEC_STRIDE] (rows 0..15) as
 * PREDICT_16x16_DC, but passes dc>>4 to
 * x264_predict_16x16_dc_left_core_<name>(); no rounding term is added in the
 * visible lines.
 * Instantiated for mmxext and sse2.
 * NOTE(review): the declaration/initial value of dc and the braces are not
 * visible in this view -- confirm against the pristine file. */
107 #define PREDICT_16x16_DC_LEFT(name)\
108 static void x264_predict_16x16_dc_left_##name( pixel *src )\
111 for( int i = 0; i < 16; i += 2 )\
113 dc += src[-1 + i * FDEC_STRIDE];\
114 dc += src[-1 + (i+1) * FDEC_STRIDE];\
116 x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
119 PREDICT_16x16_DC_LEFT( mmxext )
120 PREDICT_16x16_DC_LEFT( sse2 )
/* Tables of eight signed byte multipliers, declared through ALIGNED_8.
 * They are passed as memory operands to the inline assembly of the ssse3
 * plane predictors below: pb_12345678 and pb_m87654321 by
 * x264_predict_16x16_p_ssse3, pb_m32101234 by x264_predict_8x8c_p_ssse3. */
123 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
124 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
125 ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
/* PREDICT_P_SUM(j,i): adds to H the weight i times the difference between
 * the samples at columns j+i and j-i of the row at offset -FDEC_STRIDE, and
 * adds to V the weight i times the difference between the samples at rows
 * j+i and j-i of column -1.  Expects src, H and V in the expanding scope.
 *
 * PREDICT_16x16_P(name): emits x264_predict_16x16_p_<name>( uint8_t *src ).
 * The visible tail computes
 *   a   = 16 * ( src[15*FDEC_STRIDE-1] + src[15-FDEC_STRIDE] )
 *   b   = ( 5*H + 32 ) >> 6,   c = ( 5*V + 32 ) >> 6
 *   i00 = a - 7*b - 7*c + 16
 * and hands i00, b, c to x264_predict_16x16_p_core_<name>().
 * Instantiated for mmxext and sse2.
 * NOTE(review): the local declarations, the statements that accumulate H and
 * V (presumably PREDICT_P_SUM invocations) and any preprocessor guards around
 * the instantiations are not visible in this view -- confirm against the
 * pristine file. */
127 #define PREDICT_P_SUM(j,i)\
128 H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
129 V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
131 #define PREDICT_16x16_P(name)\
132 static void x264_predict_16x16_p_##name( uint8_t *src )\
146 a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
147 b = ( 5 * H + 32 ) >> 6;\
148 c = ( 5 * V + 32 ) >> 6;\
149 i00 = a - b * 7 - c * 7 + 16;\
150 x264_predict_16x16_p_core_##name( src, i00, b, c );\
154 PREDICT_16x16_P( mmxext )
156 PREDICT_16x16_P( sse2 )
/* 16x16 plane prediction front end for SSSE3.
 * The inline assembly reads the row at src[-FDEC_STRIDE] together with the
 * pb_12345678 and pb_m87654321 tables (pmaddubsw followed by shuffle/add
 * steps); presumably its result is H -- the output constraint and the first
 * and last asm lines are not visible in this view.
 * V is the weighted sum (weights 8..1) of differences between samples of
 * column -1 taken symmetrically about row 7.
 * a, b, c and i00 are then derived with the formulas below and passed to
 * x264_predict_16x16_p_core_sse2().
 * NOTE(review): local declarations and braces are missing from this view --
 * confirm against the pristine file. */
159 static void x264_predict_16x16_p_ssse3( uint8_t *src )
165 "movq 8+%1, %%mm0 \n"
166 "palignr $7, -8+%1, %%mm1 \n"
167 "pmaddubsw %2, %%mm0 \n"
168 "pmaddubsw %3, %%mm1 \n"
169 "paddw %%mm1, %%mm0 \n"
170 "pshufw $14, %%mm0, %%mm1 \n"
171 "paddw %%mm1, %%mm0 \n"
172 "pshufw $1, %%mm0, %%mm1 \n"
173 "paddw %%mm1, %%mm0 \n"
177 :"m"(src[-FDEC_STRIDE]), "m"(*pb_12345678), "m"(*pb_m87654321)
179 V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )
180 + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )
181 + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )
182 + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )
183 + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )
184 + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )
185 + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )
186 + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
187 a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
188 b = ( 5 * H + 32 ) >> 6;
189 c = ( 5 * V + 32 ) >> 6;
190 i00 = a - b * 7 - c * 7 + 16;
191 x264_predict_16x16_p_core_sse2( src, i00, b, c );
/* PREDICT_8x8_P(name): emits x264_predict_8x8c_p_<name>( uint8_t *src ).
 * The visible tail computes
 *   a   = 16 * ( src[7*FDEC_STRIDE-1] + src[7-FDEC_STRIDE] )
 *   b   = ( 17*H + 16 ) >> 5,   c = ( 17*V + 16 ) >> 5
 *   i00 = a - 3*b - 3*c + 16
 * and hands i00, b, c to x264_predict_8x8c_p_core_<name>().
 * Instantiated for mmxext and sse2.
 * NOTE(review): the local declarations and the code that accumulates H and V
 * are not visible in this view -- confirm against the pristine file. */
195 #define PREDICT_8x8_P(name)\
196 static void x264_predict_8x8c_p_##name( uint8_t *src )\
206 a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
207 b = ( 17 * H + 16 ) >> 5;\
208 c = ( 17 * V + 16 ) >> 5;\
209 i00 = a -3*b -3*c + 16;\
210 x264_predict_8x8c_p_core_##name( src, i00, b, c );\
214 PREDICT_8x8_P( mmxext )
216 PREDICT_8x8_P( sse2 )
/* 8x8c plane prediction front end for SSSE3.
 * The inline assembly combines the row at src[-FDEC_STRIDE] with the
 * pb_m32101234 table (pmaddubsw followed by shuffle/add steps); presumably
 * its result is H -- the load instruction and the output constraint are not
 * visible in this view.
 * V is the weighted sum (weights 1..4) of differences between samples of
 * column -1 taken symmetrically about row 3.
 * After the asm, H additionally receives -4 * src[-FDEC_STRIDE-1]
 * (presumably the column -1 term that the eight-entry table does not cover
 * -- confirm).
 * a, b, c and i00 are then derived with the formulas below and passed to
 * x264_predict_8x8c_p_core_sse2().
 * NOTE(review): local declarations and braces are missing from this view. */
219 static void x264_predict_8x8c_p_ssse3( uint8_t *src )
225 "pmaddubsw %2, %%mm0 \n"
226 "pshufw $14, %%mm0, %%mm1 \n"
227 "paddw %%mm1, %%mm0 \n"
228 "pshufw $1, %%mm0, %%mm1 \n"
229 "paddw %%mm1, %%mm0 \n"
233 :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
235 V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
236 + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
237 + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
238 + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
239 H += -4 * src[-1*FDEC_STRIDE -1];
240 a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
241 b = ( 17 * H + 16 ) >> 5;
242 c = ( 17 * V + 16 ) >> 5;
243 i00 = a -3*b -3*c + 16;
244 x264_predict_8x8c_p_core_sse2( src, i00, b, c );
/* 8x8c DC prediction front end for MMXEXT.
 * Two sums over column -1 are built -- one over rows 0..3, one over rows
 * 4..7 -- and passed as s2 and s3 to x264_predict_8x8c_dc_core_mmxext().
 * NOTE(review): the declarations of s2/s3 and the leading term of each sum
 * are not visible in this view -- confirm against the pristine file. */
248 static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
/* Rows 0..3 of column -1. */
251 + src[-1 + 0*FDEC_STRIDE]
252 + src[-1 + 1*FDEC_STRIDE]
253 + src[-1 + 2*FDEC_STRIDE]
254 + src[-1 + 3*FDEC_STRIDE];
/* Rows 4..7 of column -1. */
257 + src[-1 + 4*FDEC_STRIDE]
258 + src[-1 + 5*FDEC_STRIDE]
259 + src[-1 + 6*FDEC_STRIDE]
260 + src[-1 + 7*FDEC_STRIDE];
262 x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
/* 8x8c DC prediction from column -1 only (plain C).
 * s0 sums rows 0..3 and s1 sums rows 4..7 of column -1; each sum is turned
 * into a rounded average of four and replicated into all eight bytes of a
 * 64-bit value (dc0 and dc1).
 * NOTE(review): the bodies of the two trailing loops, the declarations of
 * y/dc0/dc1 and the braces are not visible in this view; presumably the loops
 * store dc0 and dc1 into four rows each -- confirm against the pristine
 * file. */
266 static void x264_predict_8x8c_dc_left( uint8_t *src )
269 uint32_t s0 = 0, s1 = 0;
272 for( y = 0; y < 4; y++ )
274 s0 += src[y * FDEC_STRIDE - 1];
275 s1 += src[(y+4) * FDEC_STRIDE - 1];
/* (s + 2) >> 2 is the rounded mean of four samples; multiplying by
 * 0x0101010101010101 copies that byte value into every byte lane. */
277 dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL;
278 dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL;
280 for( y = 0; y < 4; y++ )
285 for( y = 0; y < 4; y++ )
294 /****************************************************************************
295 * 8x8 prediction for intra luma block
296 ****************************************************************************/
/* Helper macros for the 8x8 predictors.
 *  - l<y> is loaded from edge[14-y] and t<x> from edge[16+x]; the
 *    PREDICT_8x8_LOAD_LEFT / PREDICT_8x8_LOAD_TOP macros declare l0..l7 and
 *    t0..t7 through PL() and PT().
 *  - PREDICT_8x8_DC(v) loops over 8 rows, advancing src by FDEC_STRIDE.
 *  - SRC(x,y) addresses the sample at column x, row y of src.
 * NOTE(review): the #define lines of PL/PT, the body of
 * PREDICT_8x8_LOAD_TOPLEFT and the per-row statement of PREDICT_8x8_DC are
 * not visible in this view -- confirm against the pristine file. */
299 UNUSED int l##y = edge[14-y];
301 UNUSED int t##x = edge[16+x];
302 #define PREDICT_8x8_LOAD_TOPLEFT \
304 #define PREDICT_8x8_LOAD_LEFT \
305 PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
306 #define PREDICT_8x8_LOAD_TOP \
307 PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
309 #define PREDICT_8x8_DC(v) \
311 for( y = 0; y < 8; y++ ) { \
314 src += FDEC_STRIDE; \
317 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
/* 8x8 vertical-right prediction for MMXEXT.
 * Calls x264_predict_8x8_vr_core_mmxext() first, then overwrites twelve
 * samples in columns 0..2 (rows 2..7) in C.  Each value is the 3-tap filter
 * (p0 + 2*p1 + p2 + 2) >> 2 over consecutive left-edge samples (lt, l0..l6),
 * and the same value is reused one column right / two rows down.
 * NOTE(review): braces are missing from this view, and the definition of lt
 * (from PREDICT_8x8_LOAD_TOPLEFT) is not visible. */
320 static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
322 x264_predict_8x8_vr_core_mmxext( src, edge );
/* Load lt and l0..l7 from edge[] for the fix-up below. */
324 PREDICT_8x8_LOAD_TOPLEFT
325 PREDICT_8x8_LOAD_LEFT
326 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
327 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
328 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
329 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
330 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
331 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
/* SUMSUB(a,b,c,d,e,f,g,h): helper taking four pairs of variables; its body
 * is not visible in this view (presumably a sum/difference butterfly on each
 * pair -- confirm).
 *
 * INTRA_SA8D_X3(cpu): emits x264_intra_sa8d_x3_8x8_<cpu>( fenc, edge, res ).
 * It loads t0..t7 and l0..l7 from edge[], applies three SUMSUB stages with
 * pair distances 4, 2 and 1 to the left samples and then to the top samples,
 * and calls x264_intra_sa8d_x3_8x8_core_<cpu>( fenc, sa8d_1d, res ) with the
 * 16-byte aligned 2x8 int16_t buffer sa8d_1d.
 * Instantiated for mmxext.
 * NOTE(review): the statements that fill sa8d_1d and the braces are missing
 * from this view -- confirm against the pristine file. */
336 #define SUMSUB(a,b,c,d,e,f,g,h)\
342 #define INTRA_SA8D_X3(cpu)\
343 void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
345 PREDICT_8x8_LOAD_TOP\
346 PREDICT_8x8_LOAD_LEFT\
348 ALIGNED_16( int16_t sa8d_1d[2][8] );\
349 SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
350 SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
351 SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
360 SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\
361 SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\
362 SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\
371 x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\
378 INTRA_SA8D_X3(mmxext)
380 #endif // !HIGH_BIT_DEPTH
382 /****************************************************************************
383 * Exported functions:
384 ****************************************************************************/
/* Installs x86-specific 16x16 predictors into pf[], selected by the
 * X264_CPU_* bits in cpu.  Later assignments override earlier ones for the
 * same I_PRED_16x16_* slot.
 * NOTE(review): this view is missing lines (the embedded numbering skips,
 * e.g. 387 -> 389 right after the first guard), including the braces and
 * whatever each negated if() originally controlled.  Read literally, every
 * negated guard controls the assignment that follows it; presumably the
 * missing lines are early exits -- confirm against the pristine file before
 * touching the conditions. */
385 void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
387 if( !(cpu&X264_CPU_MMX) )
389 pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
390 if( cpu&X264_CPU_MMXEXT )
392 pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmxext;
393 pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmxext;
394 pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
395 pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmxext;
/* NOTE(review): two SSE2 sequences follow that set overlapping slots; the
 * trailing "#endif // HIGH_BIT_DEPTH" suggests they were alternatives chosen
 * by preprocessor conditionals that are not visible here. */
398 if( !(cpu&X264_CPU_SSE2) )
400 pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
401 pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
402 pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
403 pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
404 pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2;
407 pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmxext;
409 if( !(cpu&X264_CPU_SSE2) )
411 pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
412 pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2;
413 if( cpu&X264_CPU_SSE2_IS_SLOW )
415 pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
416 pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
417 pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
418 if( !(cpu&X264_CPU_SSSE3) )
420 pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
422 pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
424 #endif // HIGH_BIT_DEPTH
/* Installs x86-specific 8x8c predictors into the I_PRED_CHROMA_* slots of
 * pf[], selected by the X264_CPU_* bits in cpu.  Later assignments override
 * earlier ones for the same slot (H and P are each set more than once).
 * NOTE(review): this view is missing lines (the embedded numbering skips
 * after every guard), including the braces, whatever each negated if()
 * originally controlled, and the opening of the conditional closed by the
 * trailing "#endif // !HIGH_BIT_DEPTH".  Presumably the negated guards were
 * followed by early exits -- confirm against the pristine file before
 * touching the conditions. */
427 void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
429 if( !(cpu&X264_CPU_MMX) )
/* The only non-suffixed routine: the plain C version defined in this file. */
433 pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
435 pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
436 if( !(cpu&X264_CPU_MMXEXT) )
438 pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmxext;
439 pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmxext;
441 pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_mmxext;
443 pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmxext;
444 if( !(cpu&X264_CPU_SSE2) )
446 pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
447 if( !(cpu&X264_CPU_SSSE3) )
449 pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
451 pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
453 #endif // !HIGH_BIT_DEPTH
/* Installs x86-specific 8x8 predictors into pf[] and stores an edge-filter
 * routine through predict_8x8_filter, selected by the X264_CPU_* bits in
 * cpu.  Later assignments override earlier ones for the same slot.
 * NOTE(review): this view is missing lines (the embedded numbering skips
 * after every guard), including the braces and whatever each negated if()
 * originally controlled; presumably early exits -- confirm against the
 * pristine file before touching the conditions. */
456 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
458 if( !(cpu&X264_CPU_MMXEXT) )
/* NOTE(review): the first sequence below installs only the sse2 routines
 * whose prototypes take uint16_t; the second installs the uint8_t routines.
 * The trailing "#endif // HIGH_BIT_DEPTH" suggests they were alternatives
 * chosen by preprocessor conditionals that are not visible here. */
461 if( !(cpu&X264_CPU_SSE2) )
463 pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse2;
464 pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2;
465 pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2;
466 pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
467 pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
469 pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext;
470 pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext;
471 pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext;
472 pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
473 pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
474 pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmxext;
475 *predict_8x8_filter = x264_predict_8x8_filter_mmxext;
477 pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmxext;
478 pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmxext;
/* VR uses the C wrapper defined in this file, not a bare asm routine. */
479 pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmxext;
480 pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmxext;
482 if( !(cpu&X264_CPU_SSE2) )
484 pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
485 pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
486 pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
487 pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
488 pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
489 pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
490 if( !(cpu&X264_CPU_SSSE3) )
492 pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
493 pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
494 *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
495 #endif // HIGH_BIT_DEPTH
/* Installs x86-specific 4x4 predictors into pf[], selected by the
 * X264_CPU_* bits in cpu.  Later assignments override earlier ones for the
 * same I_PRED_4x4_* slot.
 * NOTE(review): this view is missing lines (the embedded numbering skips
 * after every guard), including the braces and whatever each negated if()
 * originally controlled; presumably early exits -- confirm against the
 * pristine file before touching the conditions. */
498 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
500 if( !(cpu&X264_CPU_MMXEXT) )
502 pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmxext;
503 pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
504 pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext;
/* NOTE(review): the ssse3 DDR/VR/HD assignments appear twice below, once
 * after the sse2 group and once after the mmxext group; the trailing
 * "#endif // HIGH_BIT_DEPTH" suggests the two groups were alternatives chosen
 * by preprocessor conditionals that are not visible here. */
506 if( !(cpu&X264_CPU_SSE2) )
508 pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
509 pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_sse2;
510 pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
511 if( !(cpu&X264_CPU_SSSE3) )
513 pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
514 pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
515 pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
517 pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext;
518 pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
519 pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmxext;
520 pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmxext;
521 if( !(cpu&X264_CPU_SSSE3) )
523 pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
524 pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
525 pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
526 #endif // HIGH_BIT_DEPTH