/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
 *
 * Authors: Eric Petit <titer@m0k.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"

/***********************************************************************
 * SAD routines
 **********************************************************************/
#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, int i_pix1,            \
                 uint8_t *pix2, int i_pix2 )           \
{                                                      \
    int y;                                             \
    DECLARE_ALIGNED_16( int sum );                     \
                                                       \
    LOAD_ZERO;                                         \
    PREP_LOAD;                                         \
    vec_u8_t  pix1v, pix2v;                            \
    vec_s32_t sumv = zero_s32v;                        \
    for( y = 0; y < ly; y++ )                          \
    {                                                  \
        VEC_LOAD( pix1, pix1v, lx, vec_u8_t );         \
        VEC_LOAD( pix2, pix2v, lx, vec_u8_t );         \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_sub( vec_max( pix1v, pix2v ),   \
                            vec_min( pix1v, pix2v ) ), \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}

PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
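
/* Reference sketch (hypothetical helper, kept out of the build) of what
 * each generated routine computes: the sum of absolute differences over
 * an lx*ly block.  The vector code above gets |a-b| on unsigned bytes
 * without widening via max(a,b)-min(a,b), then reduces horizontally with
 * vec_sum4s() and vec_sums()/vec_sum2s(). */
#if 0
static int sad_scalar_ref( uint8_t *pix1, int i_pix1,
                           uint8_t *pix2, int i_pix2, int lx, int ly )
{
    int x, y, sum = 0;
    for( y = 0; y < ly; y++ )
    {
        for( x = 0; x < lx; x++ )
            sum += pix1[x] > pix2[x] ? pix1[x] - pix2[x]
                                     : pix2[x] - pix1[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    return sum;
}
#endif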

/***********************************************************************
 * SATD routines
 **********************************************************************/

/***********************************************************************
 * VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3)
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
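
/* Worked example of the butterfly above (illustration only): with
 * a = { 1, 2, 3, 4 }, stage one gives sums { 3, 7 } and differences
 * { -1, -1 }, so b = { 10, -4, 0, -2 }, i.e. the four +/- combinations
 * documented above.  A hypothetical scalar equivalent: */
#if 0
static void hadamar4_scalar_ref( const int a[4], int b[4] )
{
    int s01 = a[0] + a[1], s23 = a[2] + a[3]; /* b2, b3 in the macro */
    int d01 = a[0] - a[1], d23 = a[2] - a[3]; /* a0, a2 in the macro */
    b[0] = s01 + s23;
    b[1] = s01 - s23;
    b[2] = d01 - d23;
    b[3] = d01 + d23;
}
#endif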

/***********************************************************************
 * VEC_ABS(a)
 ***********************************************************************
 * a = abs(a)
 *
 * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
 * actually also calls vec_splat(0), but we already have a null vector.
 **********************************************************************/
#define VEC_ABS(a) \
    a = vec_max( a, vec_sub( zero_s16v, a ) )

/***********************************************************************
 * VEC_ADD_ABS(a,b,c)
 ***********************************************************************
 * a:    s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )
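
/* Lane bookkeeping, for reference: vec_sum4s() on a s16 vector folds each
 * adjacent pair of its eight 16-bit lanes into one of four 32-bit lanes,
 * so after VEC_ADD_ABS the accumulator holds
 *   c[0] = |a[0]|+|a[1]| + b[0], ..., c[3] = |a[6]|+|a[7]| + b[3].
 * This is why the reductions below use vec_sum2s()/vec_splat(,1) when only
 * the low lanes carry valid data, and vec_sums()/vec_splat(,3) when all
 * lanes do. */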

/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    /* Diff the four rows */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );

    /* Horizontal Hadamard */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* Transpose, then vertical Hadamard */
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
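
/* Scalar sketch of the same computation (hypothetical, not built):
 * Hadamard the rows, then the columns, and halve the summed magnitudes.
 * The vector version above implements the column pass by transposing and
 * reusing the row butterfly. */
#if 0
static int satd_4x4_scalar_ref( uint8_t *pix1, int i_pix1,
                                uint8_t *pix2, int i_pix2 )
{
    int d[4][4], t[4], i, j, satd = 0;
    for( i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
        for( j = 0; j < 4; j++ )
            d[i][j] = pix1[j] - pix2[j];
    for( i = 0; i < 4; i++ ) /* horizontal 4-point Hadamard */
    {
        t[0] = d[i][0] + d[i][1]; t[1] = d[i][2] + d[i][3];
        t[2] = d[i][0] - d[i][1]; t[3] = d[i][2] - d[i][3];
        d[i][0] = t[0] + t[1]; d[i][1] = t[0] - t[1];
        d[i][2] = t[2] - t[3]; d[i][3] = t[2] + t[3];
    }
    for( j = 0; j < 4; j++ ) /* vertical pass + |.| accumulation */
    {
        t[0] = d[0][j] + d[1][j]; t[1] = d[2][j] + d[3][j];
        t[2] = d[0][j] - d[1][j]; t[3] = d[2][j] - d[3][j];
        satd += abs( t[0] + t[1] ) + abs( t[0] - t[1] )
              + abs( t[2] - t[3] ) + abs( t[2] + t[3] );
    }
    return satd / 2;
}
#endif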

/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    /* first 4x4 half */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    /* second 4x4 half (VEC_DIFF_H has advanced pix1/pix2 four rows) */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}

/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care: the resulting garbage lanes are discarded by
       the vec_sum2s()/vec_splat(,1) reduction below. */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}

/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}

/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    /* first 8x8 half */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* second 8x8 half */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}

/***********************************************************************
 * SATD 16x8
 **********************************************************************/
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    /* high half (first 8 columns) */
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* low half (last 8 columns) */
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}
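
/* Note on the two 16-wide versions (16x8 above, 16x16 below): a 16-pixel
 * row of differences does not fit in one vector of eight s16 lanes, so
 * VEC_DIFF_HL splits each row into a "high" (first 8 columns) and a "low"
 * (last 8 columns) vector, and the 8x8 SATD is simply run twice, once per
 * half, into the same accumulator. */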

/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    /* top 16x8 half */
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* bottom 16x8 half */
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd / 2;
}

/***********************************************************************
 * Interleaved SAD routines
 **********************************************************************/
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    //vec_u8_t perm0v, perm1v, perm2v, perm3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        /* two rows per iteration: the A perms handle even rows, the B
           perms odd rows (alignment alternates with the stride) */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
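
/* Usage note: the interleaved routines exist to amortize the loads of
 * fenc over several candidate SADs; they must return the same numbers as
 * independent calls, i.e. behave like this hypothetical reference
 * (illustration only): */
#if 0
static void sad_x4_16x16_ref( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                              uint8_t *pix2, uint8_t *pix3, int i_stride,
                              int scores[4] )
{
    scores[0] = pixel_sad_16x16_altivec( fenc, FENC_STRIDE, pix0, i_stride );
    scores[1] = pixel_sad_16x16_altivec( fenc, FENC_STRIDE, pix1, i_stride );
    scores[2] = pixel_sad_16x16_altivec( fenc, FENC_STRIDE, pix2, i_stride );
    scores[3] = pixel_sad_16x16_altivec( fenc, FENC_STRIDE, pix3, i_stride );
}
#endif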

static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv; // temporary load vectors
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        /* only the low 8 bytes of fencv are ever used, so reusing the
           stale temp_hv here is harmless */
        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    /* only the low half of each row is valid: vec_sum2s() folds the two
       low s32 lanes, discarding the garbage in the high lanes */
    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );

    int y;
    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

/***********************************************************************
 * SSD routines
 **********************************************************************/
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
                                     uint8_t *pix2, int i_stride_pix2)
{
    DECLARE_ALIGNED_16( int sum );

    int y;
    LOAD_ZERO;
    vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t permA, permB;

    sumv = vec_splat_u32(0);

    permA = vec_lvsl(0, pix2);
    permB = vec_lvsl(0, pix2 + i_stride_pix2);

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vA = vec_perm(temp_lv, temp_hv, permA);
    pix1vA = vec_ld(0, pix1);

    for (y=0; y < 7; y++)
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxA = vec_max(pix1vA, pix2vA);
        minA = vec_min(pix1vA, pix2vA);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vB = vec_perm(temp_lv, temp_hv, permB);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_sub(maxA, minA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxB = vec_max(pix1vB, pix2vB);
        minB = vec_min(pix1vB, pix2vB);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vA = vec_perm(temp_lv, temp_hv, permA);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_sub(maxB, minB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    /* software-pipelined epilogue: the last two rows */
    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vB = vec_perm(temp_lv, temp_hv, permB);
    pix1vB = vec_ld(0, pix1);

    maxA = vec_max(pix1vA, pix2vA);
    minA = vec_min(pix1vA, pix2vA);

    maxB = vec_max(pix1vB, pix2vB);
    minB = vec_min(pix1vB, pix2vB);

    diffA = vec_sub(maxA, minA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_sub(maxB, minB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}

static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2)
{
    DECLARE_ALIGNED_16( int sum );

    int y;
    LOAD_ZERO;
    vec_u8_t pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t maxv, minv, diffv;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t perm1v, perm2v;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);

    sumv = vec_splat_u32(0);

    perm1v = vec_lvsl(0, pix1);
    perm2v = vec_lvsl(0, pix2);

    for (y=0; y < 8; y++)
    {
        temp_hv = vec_ld(0, pix1);
        temp_lv = vec_ld(7, pix1);
        pix1v = vec_perm(temp_hv, temp_lv, perm1v);

        temp_hv = vec_ld(0, pix2);
        temp_lv = vec_ld(7, pix2);
        pix2v = vec_perm(temp_hv, temp_lv, perm2v);

        maxv = vec_max(pix1v, pix2v);
        minv = vec_min(pix1v, pix2v);

        diffv = vec_sub(maxv, minv);
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }

    /* only the low 8 bytes hold the block; mask out the garbage lanes */
    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
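
/* Scalar reference for both SSD routines (hypothetical, not built):
 * the sum of squared byte differences.  The vector code computes |a-b|
 * as max-min, then squares and accumulates with one vec_msum() per row. */
#if 0
static int ssd_scalar_ref( uint8_t *pix1, int i_pix1,
                           uint8_t *pix2, int i_pix2, int lx, int ly )
{
    int x, y, sum = 0;
    for( y = 0; y < ly; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( x = 0; x < lx; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
    return sum;
}
#endif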

/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v )\
{\
    /* int a0 = SRC(0) + SRC(4) */\
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
    /* int a4 = SRC(0) - SRC(4) */\
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
    /* int a1 = SRC(1) + SRC(5) */\
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
    /* int a5 = SRC(1) - SRC(5) */\
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
    /* int a2 = SRC(2) + SRC(6) */\
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
    /* int a6 = SRC(2) - SRC(6) */\
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
    /* int a3 = SRC(3) + SRC(7) */\
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
    /* int a7 = SRC(3) - SRC(7) */\
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
\
    /* int b0 = a0 + a2 */\
    vec_s16_t b0v = vec_add(a0v, a2v); \
    /* int b2 = a0 - a2; */\
    vec_s16_t b2v = vec_sub(a0v, a2v); \
    /* int b1 = a1 + a3; */\
    vec_s16_t b1v = vec_add(a1v, a3v); \
    /* int b3 = a1 - a3; */\
    vec_s16_t b3v = vec_sub(a1v, a3v); \
    /* int b4 = a4 + a6; */\
    vec_s16_t b4v = vec_add(a4v, a6v); \
    /* int b6 = a4 - a6; */\
    vec_s16_t b6v = vec_sub(a4v, a6v); \
    /* int b5 = a5 + a7; */\
    vec_s16_t b5v = vec_add(a5v, a7v); \
    /* int b7 = a5 - a7; */\
    vec_s16_t b7v = vec_sub(a5v, a7v); \
\
    /* DST(0, b0 + b1) */\
    sa8d0v = vec_add(b0v, b1v); \
    /* DST(1, b0 - b1) */\
    sa8d1v = vec_sub(b0v, b1v); \
    /* DST(2, b2 + b3) */\
    sa8d2v = vec_add(b2v, b3v); \
    /* DST(3, b2 - b3) */\
    sa8d3v = vec_sub(b2v, b3v); \
    /* DST(4, b4 + b5) */\
    sa8d4v = vec_add(b4v, b5v); \
    /* DST(5, b4 - b5) */\
    sa8d5v = vec_sub(b4v, b5v); \
    /* DST(6, b6 + b7) */\
    sa8d6v = vec_add(b6v, b7v); \
    /* DST(7, b6 - b7) */\
    sa8d7v = vec_sub(b6v, b7v); \
}

static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd = 0;

    PREP_DIFF;

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}

static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;

    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;

    return i_satd;
}
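
/* Both wrappers normalize the raw sum of Hadamard magnitudes from the
 * core with a rounded divide by 4, e.g. a core result of 10 becomes
 * (10+2)>>2 = 3, which keeps sa8d roughly on the same scale as satd. */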

/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
                                     const uint8_t *pix2, int stride2,
                                     int sums[2][4] )
{
    DECLARE_ALIGNED_16( int temp[4] );

    int y;
    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    LOAD_ZERO;
    PREP_LOAD;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( y = 0; y < 4; y++ )
    {
        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t );
        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t );

        s1v = vec_sum4s( pix1v, s1v );
        s2v = vec_sum4s( pix2v, s2v );
        ssv = vec_msum( pix1v, pix1v, ssv );
        ssv = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}
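
/* For reference: per 4x4 block this leaves sums[b] = { sum(x), sum(y),
 * sum(x*x)+sum(y*y), sum(x*y) } for the two horizontally adjacent blocks
 * b = 0, 1; the caller combines these moments into the SSIM value. */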

/****************************************************************************
 * x264_pixel_altivec_init:
 ****************************************************************************/
void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
{
    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
}