/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#include "common/common.h"
#include "ppccommon.h"
/***********************************************************************
 * SAD routines
 **********************************************************************/
#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, int i_pix1,            \
                 uint8_t *pix2, int i_pix2 )           \
{                                                      \
    int y;                                             \
    DECLARE_ALIGNED_16( int sum );                     \
                                                       \
    LOAD_ZERO;                                         \
    PREP_LOAD;                                         \
    vec_u8_t  pix1v, pix2v;                            \
    vec_s32_t sumv = zero_s32v;                        \
    for( y = 0; y < ly; y++ )                          \
    {                                                  \
        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t );       \
        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t );       \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_sub( vec_max( pix1v, pix2v ),   \
                            vec_min( pix1v, pix2v ) ), \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}
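/* The `a`/`b` arguments select the final reduction.  16-wide blocks
 * leave four valid partial sums per vec_sum4s(), so they use
 * vec_sums() and read element 3; 8-wide blocks only load 8 real bytes
 * per row and the upper half of each load is don't-care data, so only
 * the first two partial sums matter, hence vec_sum2s() and element 1.
 * Scalar equivalent of one row, for reference (|a-b| is computed as
 * max-min because the data is unsigned):
 *     for( x = 0; x < lx; x++ )
 *         sum += abs( pix1[x] - pix2[x] );
 */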
PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,   8, 16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16,  8, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,    8,  8, 2s, 1 )
/***********************************************************************
 * SATD routines
 **********************************************************************/
/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
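/* Sanity check of the butterfly against the table above: with
 * s01 = a0+a1, s23 = a2+a3, d01 = a0-a1, d23 = a2-a3 the macro leaves
 * b0 = s01+s23, b1 = s01-s23, b2 = d01-d23, b3 = d01+d23, which
 * expands to exactly the four sums listed.  Note that a0 and a2 are
 * clobbered in the process. */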
/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
 * actually also calls vec_splat(0), but we already have a null vector.
 **********************************************************************/
#define VEC_ABS(a) \
    a = vec_max( a, vec_sub( zero_s16v, a ) );
/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a:    s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )
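/* vec_sum4s() on signed halfwords folds each pair a[2*i], a[2*i+1]
 * into 32-bit lane i and adds accumulator lane b[i], so the running
 * SATD total stays in four word lanes until the final
 * vec_sum2s()/vec_sums() reduction at the end of each routine. */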
/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    /* Diff pix1 and pix2 */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );

    /* Hadamar H */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    /* Hadamar V */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
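/* All SATD sizes below repeat this pattern: horizontal Hadamard,
 * transpose, vertical Hadamard, sum of absolute values, then a final
 * division by two to match the scale of the scalar satd in
 * common/pixel.c. */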
/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    PREP_LOAD_SRC( pix1 );
    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * SATD 16x8
 **********************************************************************/
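/* The 16-pixel-wide versions split each row difference into 8-pixel
 * high and low halves (VEC_DIFF_HL) and run the same 8x8 Hadamard
 * pipeline on each half: two passes for 16x8, four for 16x16. */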
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( pix2 );
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    PREP_LOAD_SRC( pix2 );

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
/***********************************************************************
 * Interleaved SAD routines
 **********************************************************************/
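/* All x3/x4 routines below share one trick: the loop is unrolled by two
 * rows, with one vec_lvsl() permute mask per pointer for even rows (the
 * *vA masks) and one for odd rows (*vB), both hoisted out of the loop.
 * This is only valid when advancing by 2*i_stride preserves 16-byte
 * alignment, i.e. the stride is a multiple of 8, which is assumed to
 * hold for the frame buffers x264 passes in. */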
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                        uint8_t *pix0, uint8_t *pix1,
                                        uint8_t *pix2, uint8_t *pix3,
                                        int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    //vec_u8_t perm0v, perm1v, perm2v, perm3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv; // temporary load vectors
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
                                       uint8_t *pix0, uint8_t *pix1,
                                       uint8_t *pix2, uint8_t *pix3,
                                       int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
                                      uint8_t *pix0, uint8_t *pix1,
                                      uint8_t *pix2, uint8_t *pix3,
                                      int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                      uint8_t *pix1, uint8_t *pix2,
                                      int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
/***********************************************************************
 * SSD routines
 **********************************************************************/
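/* Squared differences are computed with one vec_msum() per 16 bytes:
 * msum(d, d, acc) squares each byte difference and accumulates groups
 * of four squares into the word lanes of acc.  Overflow is not a
 * concern here: 4 * 255^2 per lane per row over 16 rows stays far
 * below 2^32. */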
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
                                     uint8_t *pix2, int i_stride_pix2 )
{
    DECLARE_ALIGNED_16( int sum );

    int y;
    LOAD_ZERO;
    vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t permA, permB;

    sumv = vec_splat_u32(0);

    permA = vec_lvsl(0, pix2);
    permB = vec_lvsl(0, pix2 + i_stride_pix2);

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vA = vec_perm(temp_lv, temp_hv, permA);
    pix1vA = vec_ld(0, pix1);

    /* Software-pipelined: the A vectors for the next row pair are
     * loaded at the bottom of each iteration, and the last two rows
     * are finished after the loop. */
    for (y = 0; y < 7; y++)
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxA = vec_max(pix1vA, pix2vA);
        minA = vec_min(pix1vA, pix2vA);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vB = vec_perm(temp_lv, temp_hv, permB);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_sub(maxA, minA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxB = vec_max(pix1vB, pix2vB);
        minB = vec_min(pix1vB, pix2vB);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vA = vec_perm(temp_lv, temp_hv, permA);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_sub(maxB, minB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vB = vec_perm(temp_lv, temp_hv, permB);
    pix1vB = vec_ld(0, pix1);

    maxA = vec_max(pix1vA, pix2vA);
    minA = vec_min(pix1vA, pix2vA);

    maxB = vec_max(pix1vB, pix2vB);
    minB = vec_min(pix1vB, pix2vB);

    diffA = vec_sub(maxA, minA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_sub(maxB, minB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2 )
{
    DECLARE_ALIGNED_16( int sum );

    int y;
    LOAD_ZERO;
    vec_u8_t pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t maxv, minv, diffv;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t perm1v, perm2v;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);

    sumv = vec_splat_u32(0);

    perm1v = vec_lvsl(0, pix1);
    perm2v = vec_lvsl(0, pix2);

    for (y = 0; y < 8; y++)
    {
        temp_hv = vec_ld(0, pix1);
        temp_lv = vec_ld(7, pix1);
        pix1v = vec_perm(temp_hv, temp_lv, perm1v);

        temp_hv = vec_ld(0, pix2);
        temp_lv = vec_ld(7, pix2);
        pix2v = vec_perm(temp_hv, temp_lv, perm2v);

        maxv = vec_max(pix1v, pix2v);
        minv = vec_min(pix1v, pix2v);

        diffv = vec_sub(maxv, minv);
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
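    /* Only the first 8 bytes of each row are real data, so the two
     * high word lanes of sumv hold garbage from the permuted tail of
     * the loads; the CV(-1,-1,0,0) mask keeps lanes 0 and 1 and zeroes
     * the rest before the full reduction. */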
    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
{                                                         \
    /* int a0 = SRC(0) + SRC(4) */                        \
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
    /* int a4 = SRC(0) - SRC(4) */                        \
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
    /* int a1 = SRC(1) + SRC(5) */                        \
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
    /* int a5 = SRC(1) - SRC(5) */                        \
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
    /* int a2 = SRC(2) + SRC(6) */                        \
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
    /* int a6 = SRC(2) - SRC(6) */                        \
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
    /* int a3 = SRC(3) + SRC(7) */                        \
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
    /* int a7 = SRC(3) - SRC(7) */                        \
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
                                                          \
    /* int b0 = a0 + a2; */                               \
    vec_s16_t b0v = vec_add(a0v, a2v);                    \
    /* int b2 = a0 - a2; */                               \
    vec_s16_t b2v = vec_sub(a0v, a2v);                    \
    /* int b1 = a1 + a3; */                               \
    vec_s16_t b1v = vec_add(a1v, a3v);                    \
    /* int b3 = a1 - a3; */                               \
    vec_s16_t b3v = vec_sub(a1v, a3v);                    \
    /* int b4 = a4 + a6; */                               \
    vec_s16_t b4v = vec_add(a4v, a6v);                    \
    /* int b6 = a4 - a6; */                               \
    vec_s16_t b6v = vec_sub(a4v, a6v);                    \
    /* int b5 = a5 + a7; */                               \
    vec_s16_t b5v = vec_add(a5v, a7v);                    \
    /* int b7 = a5 - a7; */                               \
    vec_s16_t b7v = vec_sub(a5v, a7v);                    \
                                                          \
    /* DST(0, b0 + b1) */                                 \
    sa8d0v = vec_add(b0v, b1v);                           \
    /* DST(1, b0 - b1) */                                 \
    sa8d1v = vec_sub(b0v, b1v);                           \
    /* DST(2, b2 + b3) */                                 \
    sa8d2v = vec_add(b2v, b3v);                           \
    /* DST(3, b2 - b3) */                                 \
    sa8d3v = vec_sub(b2v, b3v);                           \
    /* DST(4, b4 + b5) */                                 \
    sa8d4v = vec_add(b4v, b5v);                           \
    /* DST(5, b4 - b5) */                                 \
    sa8d5v = vec_sub(b4v, b5v);                           \
    /* DST(6, b6 + b7) */                                 \
    sa8d6v = vec_add(b6v, b7v);                           \
    /* DST(7, b6 - b7) */                                 \
    sa8d7v = vec_sub(b6v, b7v);                           \
}
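/* The SRC()/DST() comments mirror the scalar 8-point butterfly of the
 * C sa8d implementation; one invocation runs it across all eight
 * 16-bit lanes at once, so butterfly + transpose + butterfly yields
 * the full 2-D 8x8 Hadamard transform of the residual. */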
static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1,
                                        uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd = 0;

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    PREP_LOAD_SRC( pix2 );

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulation of the absolute value of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}
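/* The core returns the raw transform sum; the wrappers below divide by
 * four with rounding ((x+2)>>2), the same sa8d normalization the C
 * reference uses, which keeps sa8d scores on roughly the same scale as
 * satd. */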
static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;

    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) + 2) >> 2;

    return i_satd;
}
/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
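/* Each call accumulates the SSIM statistics for two horizontally
 * adjacent 4x4 blocks at once: vec_sum4s()/vec_msum() produce
 * per-4-byte word sums, so lane 0 holds block 0 (pixels 0-3) and
 * lane 1 block 1 (pixels 4-7), while lanes 2 and 3 are discarded.
 * sums[i] receives { sum(pix1), sum(pix2), sum(pix1^2 + pix2^2),
 * sum(pix1*pix2) } for block i, consumed by the C ssim code. */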
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
                                     const uint8_t *pix2, int stride2,
                                     int sums[2][4] )
{
    DECLARE_ALIGNED_16( int temp[4] );

    int y;
    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC (pix1);
    PREP_LOAD_SRC (pix2);

    s1v = s2v = ssv = s12v = zero_u32v;

    for( y = 0; y < 4; y++ )
    {
        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );

        s1v = vec_sum4s( pix1v, s1v );
        s2v = vec_sum4s( pix2v, s2v );
        ssv = vec_msum( pix1v, pix1v, ssv );
        ssv = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}
/****************************************************************************
 * x264_pixel_altivec_init
 ****************************************************************************/
void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
{
    pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]  = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]  = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]   = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
}