1 /* Lookahead lowres intra analysis
3 * Each intra analysis function has been implemented twice, once for scalar GPUs
4 * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
5 * the GPU type and sets the -DVECTORIZE compile flag accordingly.
7 * All the intra analysis functions were based on their C versions in pixel.c
8 * and produce the exact same results.
11 /* force all clamp arguments and return value to int, prevent ambiguous types */
/* Wraps the OpenCL built-in clamp(); the explicit (int) casts pin a single
 * overload so mixed pixel/int arguments cannot select a vector or float
 * variant of clamp(). */
12 #define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
/* SATD (sum of absolute transformed differences) of an 8x4 block for the
 * vectorized (VECTORIZE) intra path.
 *
 * data        : top-left of the 8x4 source block in local memory
 * data_stride : row stride of data, in pixels
 * pr0..pr3    : the four 8-wide predicted rows, one int8 per row
 * returns     : the SATD cost of (source - prediction)
 *
 * Each source row is loaded, the predicted row subtracted, and the residual's
 * even/odd lanes interleaved (.s04152637) before a 4-point Hadamard transform
 * per int2 pair (HADAMARD4V); a second HADAMARD4V pass across the four rows
 * completes the 2-D transform, and the summed magnitudes are halved, matching
 * the C satd in pixel.c.
 *
 * NOTE(review): this chunk looks like an extract -- the a_v/d_v declarations,
 * the `data += data_stride` advances between the four row loads, and the
 * abs/accumulate into sum_v are not visible here; confirm against the full
 * file before editing. */
15 int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
18 int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
19 int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
/* Row 0: residual, interleave, horizontal Hadamard */
21 d_v = convert_int8( vload8( 0, data ) );
22 a_v.s01234567 = (d_v - pr0).s04152637;
23 HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
/* Row 1 */
26 d_v = convert_int8( vload8( 0, data ) );
27 a_v.s01234567 = (d_v - pr1).s04152637;
28 HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
/* Row 2 */
31 d_v = convert_int8( vload8( 0, data ) );
32 a_v.s01234567 = (d_v - pr2).s04152637;
33 HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
/* Row 3 */
36 d_v = convert_int8( vload8( 0, data ) );
37 a_v.s01234567 = (d_v - pr3).s04152637;
38 HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
/* Vertical Hadamard across the four transformed rows, one column group at a time */
42 HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
45 HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
48 HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
51 HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
/* Horizontal reduction of the accumulated magnitudes; >>1 matches the C SATD scaling */
54 uint4 sum2 = sum_v.hi + sum_v.lo;
55 uint2 sum3 = sum2.hi + sum2.lo;
56 return ( sum3.hi + sum3.lo ) >> 1;
/* Instantiate the generic 8x4 SATD template for local-memory source and
 * private-memory prediction, used by the scalar (non-VECTORIZE) path below. */
59 SATD_C_8x4_Q( satd_8x4_lp, const local, private )
62 /****************************************************************************
63  * 8x8 prediction for intra luma block
64  ****************************************************************************/
/* H.264 {1,2,1} three-tap rounding filter used by the diagonal prediction
 * modes.  NOTE(review): a companion two-tap F1(a,b) is used below but its
 * #define is not visible in this extract -- presumably (a+b+1)>>1; confirm. */
67 #define F2( a, b, c ) ( a+2*b+c+2 )>>2
/* SATD cost of the 8x8 diagonal-down-left (DDL) intra mode, vectorized path.
 *
 * src        : 8x8 source block in local memory
 * src_stride : row stride of src, in pixels
 * top        : the 16 above/above-right neighbor pixels top[0..15]
 * returns    : SATD of (source - DDL prediction) over the whole 8x8 block
 *
 * Each predicted sample is the {1,2,1}-filtered average of three consecutive
 * top[] neighbors; row r, column x reads top[x+r .. x+r+2].  The prediction
 * is built four rows at a time (pr0..pr3) and costed with satd_8x4_intra_lr,
 * then the registers are reused for the lower four rows. */
70 int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
72 int8 pr0, pr1, pr2, pr3;
74 // Upper half of pred[]
75 pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
76 pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
77 pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
78 pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
79 pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
80 pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
81 pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
82 pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
84 pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
85 pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
86 pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
87 pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
88 pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
89 pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
90 pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
91 pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
93 pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
94 pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
95 pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
96 pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
97 pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
98 pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
99 pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
100 pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
102 pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
103 pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
104 pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
105 pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
106 pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
107 pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
108 pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
109 pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
110 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
112 // Lower half of pred[]
113 pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
114 pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
115 pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
116 pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
117 pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
118 pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
119 pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
120 pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
122 pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
123 pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
124 pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
125 pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
126 pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
127 pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
128 pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
129 pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
131 pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
132 pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
133 pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
134 pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
135 pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
136 pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
137 pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
138 pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
140 pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
141 pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
142 pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
143 pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
144 pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
145 pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
146 pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
/* Bottom-right corner: no top[16], so the spec substitutes top[15] twice
 * (top[14] + 3*top[15] + 2) >> 2 */
147 pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
149 return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 diagonal-down-right (DDR) intra mode, vectorized path.
 *
 * src        : 8x8 source block in local memory
 * src_stride : row stride of src, in pixels
 * top        : above neighbor pixels
 * left       : left neighbor pixels
 * left_top   : the above-left corner pixel
 * returns    : SATD of (source - DDR prediction)
 *
 * Samples are constant along the down-right diagonal, so each F2() value is
 * assigned to the whole chained diagonal (e.g. pr0.s0 = pr1.s1 = ...).  The
 * upper four rows are costed first, then pr0..pr3 are rebuilt for the lower
 * four rows, whose diagonals start four pixels further down left[]. */
152 int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
154 int8 pr0, pr1, pr2, pr3;
156 // Upper half of pred[]
157 pr3.s0 = F2( left[1], left[2], left[3] );
158 pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
159 pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
160 pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
161 pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
162 pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
163 pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
164 pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
165 pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
166 pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
167 pr0.s7 = F2( top[5], top[6], top[7] );
168 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
170 // Lower half of pred[]
171 pr3.s0 = F2( left[5], left[6], left[7] );
172 pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
173 pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
174 pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
175 pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
176 pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
177 pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
178 pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
179 pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
180 pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
181 pr0.s7 = F2( top[1], top[2], top[3] );
182 return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 vertical-right (VR) intra mode, vectorized path.
 *
 * Parameters as in x264_predict_8x8_ddr.  VR alternates two-tap (F1) and
 * three-tap (F2) filtered samples; each value repeats two rows further down,
 * one column to the left, hence the paired assignments (pr0.sN = pr2.sN+1,
 * pr1.sN = pr3.sN+1).  Upper four rows are costed, then the registers are
 * rebuilt for the lower four rows. */
185 int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
187 int8 pr0, pr1, pr2, pr3;
189 // Upper half of pred[]
190 pr2.s0 = F2( left[1], left[0], left_top );
191 pr3.s0 = F2( left[2], left[1], left[0] );
192 pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
193 pr0.s0 = pr2.s1 = F1( left_top, top[0] );
194 pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
195 pr0.s1 = pr2.s2 = F1( top[0], top[1] );
196 pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
197 pr0.s2 = pr2.s3 = F1( top[1], top[2] );
198 pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
199 pr0.s3 = pr2.s4 = F1( top[2], top[3] );
200 pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
201 pr0.s4 = pr2.s5 = F1( top[3], top[4] );
202 pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
203 pr0.s5 = pr2.s6 = F1( top[4], top[5] );
204 pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
205 pr0.s6 = pr2.s7 = F1( top[5], top[6] );
206 pr1.s7 = F2( top[5], top[6], top[7] );
207 pr0.s7 = F1( top[6], top[7] );
208 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
210 // Lower half of pred[]
211 pr2.s0 = F2( left[5], left[4], left[3] );
212 pr3.s0 = F2( left[6], left[5], left[4] );
213 pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
214 pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
215 pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
216 pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
217 pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
218 pr0.s2 = pr2.s3 = F1( left_top, top[0] );
219 pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
220 pr0.s3 = pr2.s4 = F1( top[0], top[1] );
221 pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
222 pr0.s4 = pr2.s5 = F1( top[1], top[2] );
223 pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
224 pr0.s5 = pr2.s6 = F1( top[2], top[3] );
225 pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
226 pr0.s6 = pr2.s7 = F1( top[3], top[4] );
227 pr1.s7 = F2( top[3], top[4], top[5] );
228 pr0.s7 = F1( top[4], top[5] );
229 return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 horizontal-down (HD) intra mode, vectorized path.
 *
 * Parameters as in x264_predict_8x8_ddr.  Each row starts with an F1/F2 pair
 * from left[]/left_top and shifts the previous row's samples two columns to
 * the right; the explicit (a + 2*b + c + 2) >> 2 expressions are F2 with the
 * middle tap written second, kept expanded here because the tap order differs
 * from the macro's argument order.  Upper four rows costed first, then the
 * registers are rebuilt for the lower four rows from deeper left[] samples. */
233 int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
235 int8 pr0, pr1, pr2, pr3;
237 // Upper half of pred[]
238 pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
239 pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
240 pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
241 pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
243 pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
244 pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
245 pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
246 pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
248 pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
249 pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
250 pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
251 pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
253 pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
254 pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
255 pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
256 pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
257 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
259 // Lower half of pred[]
260 pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
261 pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
262 pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
263 pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
265 pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
266 pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
267 pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
268 pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
270 pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
271 pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
272 pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
273 pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
275 pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
276 pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
277 pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
278 pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
279 return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 vertical-left (VL) intra mode, vectorized path.
 *
 * src        : 8x8 source block in local memory
 * src_stride : row stride of src, in pixels
 * top        : above/above-right neighbor pixels top[0..12]
 * returns    : SATD of (source - VL prediction)
 *
 * Even rows (pr0, pr2) use the two-tap F1 filter, odd rows (pr1, pr3) the
 * three-tap F2 filter; each value repeats two rows down, one column left.
 * The lower half reuses the same pattern shifted two top[] samples right. */
282 int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
284 int8 pr0, pr1, pr2, pr3;
286 // Upper half of pred[]
287 pr0.s0 = F1( top[0], top[1] );
288 pr1.s0 = F2( top[0], top[1], top[2] );
289 pr2.s0 = pr0.s1 = F1( top[1], top[2] );
290 pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
291 pr2.s1 = pr0.s2 = F1( top[2], top[3] );
292 pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
293 pr2.s2 = pr0.s3 = F1( top[3], top[4] );
294 pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
295 pr2.s3 = pr0.s4 = F1( top[4], top[5] );
296 pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
297 pr2.s4 = pr0.s5 = F1( top[5], top[6] );
298 pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
299 pr2.s5 = pr0.s6 = F1( top[6], top[7] );
300 pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
301 pr2.s6 = pr0.s7 = F1( top[7], top[8] );
302 pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
303 pr2.s7 = F1( top[8], top[9] );
304 pr3.s7 = F2( top[8], top[9], top[10] );
305 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
307 // Lower half of pred[]
308 pr0.s0 = F1( top[2], top[3] );
309 pr1.s0 = F2( top[2], top[3], top[4] );
310 pr2.s0 = pr0.s1 = F1( top[3], top[4] );
311 pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
312 pr2.s1 = pr0.s2 = F1( top[4], top[5] );
313 pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
314 pr2.s2 = pr0.s3 = F1( top[5], top[6] );
315 pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
316 pr2.s3 = pr0.s4 = F1( top[6], top[7] );
317 pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
318 pr2.s4 = pr0.s5 = F1( top[7], top[8] );
319 pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
320 pr2.s5 = pr0.s6 = F1( top[8], top[9] );
321 pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
322 pr2.s6 = pr0.s7 = F1( top[9], top[10] );
323 pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
324 pr2.s7 = F1( top[10], top[11] );
325 pr3.s7 = F2( top[10], top[11], top[12] );
326 return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 horizontal-up (HU) intra mode, vectorized path.
 *
 * src        : 8x8 source block in local memory
 * src_stride : row stride of src, in pixels
 * left       : the 8 left neighbor pixels left[0..7]
 * returns    : SATD of (source - HU prediction)
 *
 * Rows alternate F1 two-tap and expanded {1,2,1} three-tap samples walking
 * down left[]; once the references run out, the prediction saturates to the
 * bottom neighbor left[7].
 * NOTE(review): the lower-half pr3 assignments (all-left[7] rows) are not
 * visible in this extract -- confirm against the full file. */
329 int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
331 int8 pr0, pr1, pr2, pr3;
333 // Upper half of pred[]
334 pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
335 pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
336 pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
337 pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
339 pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
340 pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
341 pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
342 pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
344 pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
345 pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
346 pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
347 pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
349 pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
350 pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
351 pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
352 pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
353 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
355 // Lower half of pred[]
356 pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
357 pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
358 pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
359 pr0.s6 = left[7]; pr0.s7 = left[7];
361 pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
362 pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
363 pr1.s4 = left[7]; pr1.s5 = left[7];
364 pr1.s6 = left[7]; pr1.s7 = left[7];
366 pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
367 pr2.s2 = left[7]; pr2.s3 = left[7];
368 pr2.s4 = left[7]; pr2.s5 = left[7];
369 pr2.s6 = left[7]; pr2.s7 = left[7];
373 return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 chroma horizontal intra mode: each predicted row is
 * the left neighbor pixel src[-1] of that row, broadcast across all 8 lanes.
 *
 * src        : 8x8 chroma source block in local memory
 * src_stride : row stride of src, in pixels
 * returns    : SATD of (source - horizontal prediction)
 *
 * NOTE(review): the final `pr3 = (int8)src[-1];` load for the lower half is
 * not visible in this extract -- confirm against the full file. */
376 int x264_predict_8x8c_h( const local pixel *src, int src_stride )
378 const local pixel *src_l = src;
380 int8 pr0, pr1, pr2, pr3;
381 // Upper half of pred[]
382 pr0 = (int8)src[-1]; src += src_stride;
383 pr1 = (int8)src[-1]; src += src_stride;
384 pr2 = (int8)src[-1]; src += src_stride;
385 pr3 = (int8)src[-1]; src += src_stride;
386 int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
388 //Lower half of pred[]
389 pr0 = (int8)src[-1]; src += src_stride;
390 pr1 = (int8)src[-1]; src += src_stride;
391 pr2 = (int8)src[-1]; src += src_stride;
393 return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 chroma vertical intra mode: every predicted row is a
 * copy of the row of 8 pixels directly above the block, so one int8 load
 * serves all eight rows (both satd_8x4_intra_lr halves). */
396 int x264_predict_8x8c_v( const local pixel *src, int src_stride )
398 int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
399 return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
400 satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
/* SATD cost of the 8x8 chroma planar intra mode.
 *
 * src        : 8x8 chroma source block in local memory
 * src_stride : row stride of src, in pixels
 * returns    : SATD of (source - planar prediction)
 *
 * H and V are weighted gradients over the top and left neighbors; b and c
 * are the per-column and per-row slopes and i00 the top-left plane value
 * (all fixed-point, scaled by 32).  Each predicted sample is
 * clip(pix >> 5): pix steps by b along a row, i00 steps by c per row.
 * NOTE(review): the `int H = 0, V = 0;` initialization and the
 * `int pix = i00;` reload before each row group are not visible in this
 * extract -- confirm against the full file. */
403 int x264_predict_8x8c_p( const local pixel *src, int src_stride )
406 for( int i = 0; i < 4; i++ )
408 H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
409 V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
412 int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
413 int b = (17 * H + 16) >> 5;
414 int c = (17 * V + 16) >> 5;
415 int i00 = a - 3 * b - 3 * c + 16;
417 // Upper half of pred[]
419 int8 pr0, pr1, pr2, pr3;
420 pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
421 pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
422 pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
423 pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
424 pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
425 pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
426 pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
427 pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
430 pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
431 pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
432 pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
433 pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
434 pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
435 pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
436 pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
437 pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
440 pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
441 pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
442 pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
443 pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
444 pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
445 pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
446 pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
447 pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
450 pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
451 pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
452 pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
453 pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
454 pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
455 pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
456 pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
457 pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
458 int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
460 //Lower half of pred[]
462 pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
463 pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
464 pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
465 pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
466 pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
467 pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
468 pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
469 pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
472 pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
473 pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
474 pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
475 pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
476 pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
477 pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
478 pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
479 pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
482 pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
483 pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
484 pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
485 pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
486 pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
487 pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
488 pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
489 pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
492 pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
493 pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
494 pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
495 pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
496 pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
497 pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
498 pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
499 pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
500 return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
/* SATD cost of the 8x8 chroma DC intra mode.
 *
 * s0/s1 sum the left and right 4-pixel halves of the top neighbors; s2/s3 sum
 * the upper and lower halves of the left neighbors.  Per H.264, each 4x4
 * quadrant gets its own DC: corner quadrants average 8 neighbors (>>3), edge
 * quadrants average the 4 available ones (>>2).  dc0.lo/.hi hold the left and
 * right quadrant DCs for one half at a time.
 * NOTE(review): the `int8 dc0;` declaration is not visible in this extract --
 * confirm against the full file. */
503 int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
505 int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
506 for( int i = 0; i < 4; i++ )
508 s0 += src[i - src_stride];
509 s1 += src[i + 4 - src_stride];
510 s2 += src[-1 + i * src_stride];
511 s3 += src[-1 + (i+4)*src_stride];
514 // Upper half of pred[]
516 dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
517 dc0.hi = (int4)( (s1 + 2) >> 2 );
518 int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
520 // Lower half of pred[]
521 dc0.lo = (int4)( (s3 + 2) >> 2 );
522 dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
523 return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
526 #else /* not vectorized: private is cheap, registers are scarce */
/* Scalar-path SATD cost of the 8x8 diagonal-down-left (DDL) intra mode.
 *
 * Builds the prediction into a private 4x8 scratch buffer one half at a time.
 * clamp_int(x+y, 0, 13) caps the diagonal index so the last sample's F2
 * neighbors stay within top[0..15]; pred[31] is then overwritten with the
 * spec's corner case (top[14] + 3*top[15] + 2) >> 2.
 * NOTE(review): the closing `return satd;` is not visible in this extract --
 * confirm against the full file. */
528 int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
530 private pixel pred[32];
532 // Upper half of pred[]
533 for( int y = 0; y < 4; y++ )
535 for( int x = 0; x < 8; x++ )
537 pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
538 pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
541 int satd = satd_8x4_lp( src, src_stride, pred, 8 );
542 //Lower half of pred[]
543 for( int y = 4; y < 8; y++ )
545 for( int x = 0; x < 8; x++ )
547 pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
548 pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
551 pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
552 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Scalar-path SATD cost of the 8x8 diagonal-down-right (DDR) intra mode.
 * Same diagonal-constant structure as the vectorized version, but written
 * into a private 4x8 scratch buffer via the PRED(x,y) accessor, one half at
 * a time.  NOTE(review): the matching #undef PRED and `return satd;` are not
 * visible in this extract -- confirm against the full file. */
556 int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
558 private pixel pred[32];
559 #define PRED( x, y ) pred[(x) + (y)*8]
560 // Upper half of pred[]
561 PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
562 PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
563 PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
564 PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
565 PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
566 PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
567 PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
568 PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
569 PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
570 PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
571 PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
572 int satd = satd_8x4_lp( src, src_stride, pred, 8 );
574 // Lower half of pred[]
575 PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
576 PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
577 PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
578 PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
579 PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
580 PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
581 PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
582 PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
583 PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
584 PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
585 PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
586 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Scalar-path SATD cost of the 8x8 vertical-right (VR) intra mode.
 * Mirrors the vectorized version: F1/F2-filtered samples repeat two rows down
 * and one column left, written into a private 4x8 scratch buffer via
 * PRED(x,y), one half at a time.  NOTE(review): the matching #undef PRED and
 * `return satd;` are not visible in this extract -- confirm. */
591 int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
593 private pixel pred[32];
594 #define PRED( x, y ) pred[(x) + (y)*8]
595 // Upper half of pred[]
596 PRED( 0, 2 ) = F2( left[1], left[0], left_top );
597 PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
598 PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
599 PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
600 PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
601 PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
602 PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
603 PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
604 PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
605 PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
606 PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
607 PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
608 PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
609 PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
610 PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
611 PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
612 PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
613 PRED( 7, 0 ) = F1( top[6], top[7] );
614 int satd = satd_8x4_lp( src, src_stride, pred, 8 );
616 //Lower half of pred[]
617 PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
618 PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
619 PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
620 PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
621 PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
622 PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
623 PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
624 PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
625 PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
626 PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
627 PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
628 PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
629 PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
630 PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
631 PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
632 PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
633 PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
634 PRED( 7, 0 ) = F1( top[4], top[5] );
635 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Pack a into the low 16 bits and b into the high 16 bits of one uint32_t.
 * Uses + rather than |, so a must fit in 16 bits for the halves to stay
 * disjoint -- callers here pass pack8to16 results. */
640 inline uint32_t pack16to32( uint32_t a, uint32_t b )
642 return a + (b << 16);
/* NOTE(review): body not visible in this extract; by analogy with pack16to32
 * this presumably returns a + (b << 8) -- confirm against the full file. */
645 inline uint32_t pack8to16( uint32_t a, uint32_t b )
/* Scalar-path SATD cost of the 8x8 horizontal-down (HD) intra mode.
 *
 * Precomputes the 11 distinct sample pairs p1..p11 (each an F1/three-tap pair
 * packed into 16 bits), then assembles every prediction row as two 4-byte
 * vstore4 writes -- each row is simply the previous row's pattern shifted by
 * one pair.  NOTE(review): the `int satd;` declaration for line 674 and the
 * closing `return satd;` are not visible in this extract -- confirm. */
650 int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
652 private pixel pred[32];
654 int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
655 int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
656 int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
657 int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
658 int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
659 int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
660 int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
661 int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
662 int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
663 int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
664 int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
665 // Upper half of pred[]
666 vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
667 vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
668 vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
669 vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
670 vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
671 vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
672 vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
673 vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
674 satd = satd_8x4_lp( src, src_stride, pred, 8 );
675 // Lower half of pred[]
676 vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
677 vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
678 vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
679 vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
680 vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
681 vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
682 vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
683 vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
684 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Scalar-path SATD cost of the 8x8 vertical-left (VL) intra mode.
 * Mirrors the vectorized version: even rows use the F1 two-tap filter, odd
 * rows the F2 three-tap filter, each value repeating two rows down and one
 * column left, written into a private 4x8 scratch buffer via PRED(x,y).
 * NOTE(review): the `int satd;` declaration for line 712, the matching
 * #undef PRED, and `return satd;` are not visible in this extract --
 * confirm against the full file. */
688 int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
690 private pixel pred[32];
692 #define PRED( x, y ) pred[(x) + (y)*8]
693 // Upper half of pred[]
694 PRED( 0, 0 ) = F1( top[0], top[1] );
695 PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
696 PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
697 PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
698 PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
699 PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
700 PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
701 PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
702 PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
703 PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
704 PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
705 PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
706 PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
707 PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
708 PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
709 PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
710 PRED( 7, 2 ) = F1( top[8], top[9] );
711 PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
712 satd = satd_8x4_lp( src, src_stride, pred, 8 );
713 // Lower half of pred[]
714 PRED( 0, 0 ) = F1( top[2], top[3] );
715 PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
716 PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
717 PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
718 PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
719 PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
720 PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
721 PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
722 PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
723 PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
724 PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
725 PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
726 PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
727 PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
728 PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
729 PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
730 PRED( 7, 2 ) = F1( top[10], top[11] );
731 PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
732 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Horizontal-Up 8x8 intra prediction + SATD cost (lowres lookahead).
 * The prediction is built purely from the left edge.  Each pN packs two
 * adjacent predicted pixels — an F1 2-tap average and a 3-tap lowpass —
 * into 16 bits so a full 4-pixel row segment can be written with one
 * pack16to32 + vstore4.  Positions past the last available left pixel
 * saturate to left[7], matching the C reference. */
737 int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
739 private pixel pred[32];
741 int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
742 int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
743 int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
744 int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
745 int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
746 int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
// p7 reuses left[7] as the (out-of-range) third tap: edge-pixel padding.
747 int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
// p8: both packed pixels are the plain edge value left[7].
748 int p8 = pack8to16( left[7], left[7] );
749 // Upper half of pred[]
// Each successive row shifts the p-sequence up by one (HU walks down-left).
750 vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
751 vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
752 vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
753 vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
754 vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
755 vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
756 vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
757 vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
758 satd = satd_8x4_lp( src, src_stride, pred, 8 );
759 // Lower half of pred[]
// Bottom-right region is entirely saturated to left[7] (all p8).
760 vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
761 vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
762 vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
763 vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
764 vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
765 vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
766 vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
767 vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
768 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* Horizontal intra prediction for an 8x8 block + SATD cost.
 * Each predicted row replicates the reconstructed pixel immediately to
 * the left of that row (src[-1]).  src walks down one row per vstore8
 * while src_l keeps the original pointer for the SATD calls. */
772 int x264_predict_8x8c_h( const local pixel *src, int src_stride )
774 private pixel pred[32];
775 const local pixel *src_l = src;
777 // Upper half of pred[]
// vstore8's second arg is the row index: row i starts at pred[8*i].
778 vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
779 vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
780 vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
781 vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
782 int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
784 // Lower half of pred[]
785 vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
786 vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
787 vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
788 vstore8( (uchar8)(src[-1]), 3, pred );
789 return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
/* Vertical intra prediction for an 8x8 block + SATD cost.
 * Every predicted row is a copy of the 8 pixels directly above the block.
 * v16.lo and v16.hi deliberately load the same top row: one vstore16 then
 * fills two rows of pred[] at once, and storing v16 twice fills all four
 * rows of the scratch buffer, which is reused for both 8x4 halves. */
792 int x264_predict_8x8c_v( const local pixel *src, int src_stride )
794 private pixel pred[32];
796 v16.lo = vload8( 0, &src[-src_stride] );
797 v16.hi = vload8( 0, &src[-src_stride] );
799 vstore16( v16, 0, pred );
800 vstore16( v16, 1, pred );
802 return satd_8x4_lp( src, src_stride, pred, 8 ) +
803 satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
/* Planar intra prediction for an 8x8 block + SATD cost.
 * Fits a plane a + b*x + c*y through the top/left neighbours (H.264
 * 8x8 chroma plane-mode rules) and costs it in two 8x4 halves. */
806 int x264_predict_8x8c_p( const local pixel *src, int src_stride )
809 private pixel pred[32];
// H/V: weighted sums of symmetric pixel differences along the top row
// and left column (weights 1..4).  H and V are declared on lines not
// visible in this chunk — presumably initialized to 0; verify.
812 for( int i = 0; i < 4; i++ )
814 H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
815 V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
// a: 16x the sum of the two far-corner neighbours; b/c: scaled slopes.
818 int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
819 int b = (17 * H + 16) >> 5;
820 int c = (17 * V + 16) >> 5;
// i00: plane value at (0,0) in fixed point (final >> 5 yields the pixel).
821 int i00 = a - 3 * b - 3 * c + 16;
823 // Upper half of pred[]
824 for( int y = 0; y < 4; y++ )
827 for( int x = 0; x < 8; x++ )
// pix is accumulated per-pixel on lines not shown here (presumably
// pix starts at i00 and steps by b per x, i00 by c per y — verify).
829 pred[x + y*8] = x264_clip_pixel( pix >> 5 );
834 satd = satd_8x4_lp( src, src_stride, pred, 8 );
835 // Lower half of pred[]
836 for( int y = 0; y < 4; y++ )
839 for( int x = 0; x < 8; x++ )
841 pred[x + y*8] = x264_clip_pixel( pix >> 5 );
846 satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
/* DC intra prediction for an 8x8 block + SATD cost.
 * s0/s1 sum the left/right 4-pixel halves of the top edge;
 * s2/s3 sum the upper/lower 4-pixel halves of the left edge.
 * Per the H.264 chroma DC rules each 4x4 quadrant gets its own DC:
 * a quadrant with both a top and a left neighbour averages the two
 * sums (+4 >> 3); the others use a single 4-pixel sum (+2 >> 2). */
850 int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
852 private pixel pred[32];
853 int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
854 for( int i = 0; i < 4; i++ )
856 s0 += src[i - src_stride];
857 s1 += src[i + 4 - src_stride];
858 s2 += src[-1 + i * src_stride];
859 s3 += src[-1 + (i+4)*src_stride];
862 // Upper half of pred[]
// dc0.lo = top-left quadrant (top + left), dc0.hi = top-right (top only).
864 dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
865 dc0.hi = (uchar4)( (s1 + 2) >> 2 );
866 vstore8( dc0, 0, pred );
867 vstore8( dc0, 1, pred );
868 vstore8( dc0, 2, pred );
869 vstore8( dc0, 3, pred );
870 int satd = satd_8x4_lp( src, src_stride, pred, 8 );
872 // Lower half of pred[]
// dc0.lo = bottom-left quadrant (left only), dc0.hi = bottom-right (both).
873 dc0.lo = (uchar4)( (s3 + 2) >> 2 );
874 dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
875 vstore8( dc0, 0, pred );
876 vstore8( dc0, 1, pred );
877 vstore8( dc0, 2, pred );
878 vstore8( dc0, 3, pred );
879 return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
883 /* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
885 * Loads 33 macroblocks plus the pixels directly above them into local memory,
886 * padding where necessary with edge pixels. It then cooperatively calculates
887 * smoothed top and left pixels for use in some of the analysis.
889 * Then groups of 32 threads each calculate a single intra mode for each 8x8
890 * block. Since consecutive threads are calculating the same intra mode there
891 * is no code-path divergence. 8 intra costs are calculated simultaneously. If
892 * the "slow" argument is not zero, the final two (least likely) intra modes are
893 * tested in a second pass. The slow mode is only enabled for presets slow,
894 * slower, and placebo.
896 * This allows all of the pixel functions to read pixels from local memory, and
897 * avoids re-fetching edge pixels from global memory. And it allows us to
898 * calculate all of the intra mode costs simultaneously without branch divergence.
900 * Local dimension: [ 32, 8 ]
901 * Global dimensions: [ paddedWidth, height ] */
/* Per-macroblock 8x8 intra SATD cost kernel; see the comment block above
 * for the overall strategy.  One 32x8 workgroup covers 32 macroblocks;
 * each of the 8 local rows (ly) evaluates a different prediction mode on
 * the same macroblock column (lx), so there is no branch divergence. */
902 kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc,
903 global uint16_t *fenc_intra_cost,
904 global int *frame_stats,
// CACHE_STRIDE = 33 MBs * 8 px + 1 left-edge column = 265;
// BLOCK_OFFSET  = CACHE_STRIDE + 1, skipping the top row and left column.
909 #define CACHE_STRIDE 265
910 #define BLOCK_OFFSET 266
// 9 rows (8 MB rows + 1 row above) * CACHE_STRIDE = 2385 pixels.
911 local pixel cache[2385];
912 local int cost_buf[32];
913 local pixel top[32 * 16];
914 local pixel left[32 * 8];
915 local pixel left_top[32];
917 int lx = get_local_id( 0 );
918 int ly = get_local_id( 1 );
919 int gx = get_global_id( 0 );
920 int gy = get_global_id( 1 );
921 int gidx = get_group_id( 0 );
922 int gidy = get_group_id( 1 );
923 int linear_id = ly * get_local_size( 0 ) + lx;
// Each group covers 32 MBs * 8 px = 256 columns; basey starts one row
// above the macroblock row so the top neighbours are cached too.
925 int basex = gidx << 8;
926 int basey = (gidy << 3) - 1;
928 /* Load 33 8x8 macroblocks and the pixels above them into local cache */
// 66 threads (linear_id < 264/4) each fetch 4 pixels per row.
929 for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
931 int x = linear_id << 2;
932 uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
933 cache[y * CACHE_STRIDE + 1 + x] = data.s0;
934 cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
935 cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
936 cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
938 /* load pixels on left edge */
940 cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
942 barrier( CLK_LOCAL_MEM_FENCE );
944 // Cooperatively build the top edge for the macroblock using lowpass filter
// NOTE(review): the same 3-tap filter appears twice below; the lines that
// set the index j between the two copies (presumably j = ly, then j += 8,
// covering all 16 smoothed top pixels) are not visible here — verify.
946 top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
947 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
948 cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
950 top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
951 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
952 cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
953 // Cooperatively build the left edge for the macroblock using lowpass filter
// Each of the 8 rows (ly) smooths one left-edge pixel; ly+1 is clamped to 7
// so the bottom pixel is edge-padded.
954 left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
955 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
956 cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
957 // One left_top per macroblock
960 left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
961 cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
962 cost_buf[lx] = COST_MAX;
964 barrier( CLK_LOCAL_MEM_FENCE );
966 // each warp/wavefront generates a different prediction type; no divergence
// The ly-based dispatch conditions between the calls below are on lines
// not visible in this chunk; each row computes exactly one mode's SATD.
970 satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
973 satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
976 satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
979 satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
982 satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
985 satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
988 satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
991 satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
// All 8 rows race to record the minimum mode cost for their macroblock.
996 atom_min( &cost_buf[lx], satd );
999 // Do the remaining two (least likely) prediction modes
1003 satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
1004 atom_min( &cost_buf[lx], satd );
1007 satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
1008 atom_min( &cost_buf[lx], satd );
1014 barrier( CLK_LOCAL_MEM_FENCE );
// One thread per macroblock (row 0) writes the final cost, adding the
// flat 5*lambda intra-mode signalling cost.
1016 if( (0 == ly) && (gx < mb_width) )
1017 fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
1019 // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
1020 if( gx < 2 && gy == 0 )
1021 frame_stats[gx] = 0;
1027 * parallel sum intra costs
1029 * global launch dimensions: [256, mb_height]
1031 kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
1032 const global uint16_t *inv_qscale_factor,
1033 global int *fenc_row_satds,
1034 global int *frame_stats,
1037 int y = get_global_id( 1 );
1038 int mb_height = get_global_size( 1 );
1042 int cost_est_aq = 0;
1044 for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
1046 int mb_xy = x + y * mb_width;
1047 int cost = fenc_intra_cost[mb_xy];
1048 int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
1049 int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
1051 row_satds += cost_aq;
1052 if( b_frame_score_mb )
1055 cost_est_aq += cost_aq;
1059 local int buffer[256];
1060 int x = get_global_id( 0 );
1062 row_satds = parallel_sum( row_satds, x, buffer );
1063 cost_est = parallel_sum( cost_est, x, buffer );
1064 cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
1066 if( get_global_id( 0 ) == 0 )
1068 fenc_row_satds[y] = row_satds;
1069 atomic_add( frame_stats + COST_EST, cost_est );
1070 atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );