/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Eric Petit <titer@m0k.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#include "common/common.h"
#include "ppccommon.h"
/***********************************************************************
 * SAD routines
 **********************************************************************/
#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, int i_pix1,            \
                 uint8_t *pix2, int i_pix2 )           \
{                                                      \
    int y;                                             \
    DECLARE_ALIGNED_16( int sum );                     \
                                                       \
    LOAD_ZERO;                                         \
    PREP_LOAD;                                         \
    vec_u8_t  pix1v, pix2v;                            \
    vec_s32_t sumv = zero_s32v;                        \
    for( y = 0; y < ly; y++ )                          \
    {                                                  \
        VEC_LOAD( pix1, pix1v, lx, vec_u8_t );         \
        VEC_LOAD( pix2, pix2v, lx, vec_u8_t );         \
        /* |p1 - p2| on unsigned bytes = max - min */  \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_sub( vec_max( pix1v, pix2v ),   \
                            vec_min( pix1v, pix2v ) ), \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}
PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
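
/* A reading aid for the (a, b) parameters above, not part of the build:
 * 16-wide blocks fill all four 32-bit partial sums, so they finish with
 * vec_sums and read the total from element 3; 8-wide blocks load a full
 * 16 bytes but only the low 8 belong to the block, so vec_sum2s plus
 * element 1 keeps the two low partial sums and drops the junk from the
 * high half.  In scalar terms each instance computes:
 *
 *     sum = 0;
 *     for( y = 0; y < ly; y++ )
 *         for( x = 0; x < lx; x++ )
 *             sum += abs( pix1[y * i_pix1 + x] - pix2[y * i_pix2 + x] );
 */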
/***********************************************************************
 * SATD routines
 **********************************************************************/
/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
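
/* Scalar sketch of one VEC_HADAMAR instance (applied per vector lane),
 * showing how b2/b3 double as scratch so only six registers are live:
 *
 *     b2 = a0 + a1;   b3 = a2 + a3;   // pairwise sums
 *     a0 = a0 - a1;   a2 = a2 - a3;   // pairwise differences
 *     b0 = b2 + b3;   b1 = b2 - b3;   // combine the sums
 *     b2 = a0 - a2;   b3 = a0 + a2;   // combine the differences
 */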
/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
 * actually also calls vec_splat(0), but we already have a null vector.
 **********************************************************************/
#define VEC_ABS(a) \
    a = vec_max( a, vec_sub( zero_s16v, a ) )
/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a:    s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )
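
/* The accumulation chains below seed the first VEC_ADD_ABS with
 * zero_s32v and then thread the same s32 accumulator through every
 * call, folding eight s16 coefficients per call into four running
 * 32-bit sums; a single horizontal reduction (vec_sum2s or vec_sums)
 * finishes the job at the end of each function. */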
/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    /* Diff 4x4 */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );

    /* Hadamar H */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );

    /* Hadamar V */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv,     satdv );
    VEC_ADD_ABS( temp2v, satdv,     satdv );
    VEC_ADD_ABS( temp3v, satdv,     satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
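
/* For reference, a scalar sketch of what the function above computes
 * (x264's SATD convention halves the summed Hadamard magnitudes):
 *
 *     for each (x,y): d[y][x] = pix1[y*i_pix1+x] - pix2[y*i_pix2+x];
 *     h = hadamard4_rows( hadamard4_cols( d ) );
 *     satd = ( sum of |h[y][x]| ) >> 1;
 */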
/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    /* upper 4x4 half */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    /* lower 4x4 half (VEC_DIFF_H has advanced pix1/pix2 four rows) */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* only the low two partial sums are valid for a 4-tall block */
    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    /* upper 8x8 half */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* lower 8x8 half (pointers were advanced by VEC_DIFF_H) */
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 16x8
 **********************************************************************/
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    /* each 16-wide row is split into a high (left) and low (right) 8x8 */
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    /* left 8x8 */
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* right 8x8 */
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    LOAD_ZERO;
    PREP_LOAD;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    /* rows 0-7, split into left (high) and right (low) 8x8 halves */
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    /* rows 8-15 (pointers were advanced by VEC_DIFF_HL) */
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * Interleaved SAD routines
 **********************************************************************/
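
/* Design note (a sketch of the shared structure, not part of the build):
 * each *_x3/_x4 routine scores one encoder block (fenc, stride
 * FENC_STRIDE) against several candidate positions that share a single
 * stride.  Because the stride is constant, a candidate row's
 * misalignment depends only on row parity, so two permute vectors per
 * candidate (perm*vA for even rows, perm*vB for odd rows) are computed
 * once with vec_lvsl and each loop iteration handles two rows.  In
 * scalar terms each score is
 *
 *     scores[k] = sum over the block of abs( fenc[y][x] - pixk[y][x] );
 */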
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    //vec_u8_t perm0v, perm1v, perm2v, perm3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv; // temporary load vectors
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 8; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        /* only the low 8 bytes of fencv are part of the block; the high
         * half is junk and is discarded by the vec_sum2s reduction below */
        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 8; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        /* high half of fencv is junk; dropped by vec_sum2s below */
        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    DECLARE_ALIGNED_16( int sum3 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);
    perm3vA = vec_lvsl(0, pix3);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);
    perm3vB = vec_lvsl(0, pix3 + i_stride);

    for (y = 0; y < 4; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        /* high half of fencv is junk; dropped by vec_sum2s below */
        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        temp_lv = vec_ld(0, pix3);
        temp_hv = vec_ld(16, pix3);
        pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
{
    DECLARE_ALIGNED_16( int sum0 );
    DECLARE_ALIGNED_16( int sum1 );
    DECLARE_ALIGNED_16( int sum2 );
    int y;

    LOAD_ZERO;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;

    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    permEncv = vec_lvsl(0, fenc);
    perm0vA = vec_lvsl(0, pix0);
    perm1vA = vec_lvsl(0, pix1);
    perm2vA = vec_lvsl(0, pix2);

    perm0vB = vec_lvsl(0, pix0 + i_stride);
    perm1vB = vec_lvsl(0, pix1 + i_stride);
    perm2vB = vec_lvsl(0, pix2 + i_stride);

    for (y = 0; y < 4; y++)
    {
        /* even row: the A permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
        pix1 += i_stride;

        /* high half of fencv is junk; dropped by vec_sum2s below */
        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );

        /* odd row: the B permutes apply */
        temp_lv = vec_ld(0, pix0);
        temp_hv = vec_ld(16, pix0);
        pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
        pix0 += i_stride;

        temp_lv = vec_ld(0, pix1);
        temp_hv = vec_ld(16, pix1);
        pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
        pix1 += i_stride;

        temp_lv = vec_ld(0, fenc);
        fencv = vec_perm(temp_lv, temp_hv, permEncv);
        fenc += FENC_STRIDE;

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
/***********************************************************************
 * SSD routines
 **********************************************************************/
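
/* Scalar sketch of what both SSD routines compute (sum of squared
 * differences); the vector versions get the squaring and the partial
 * horizontal add in one step by applying vec_msum to the absolute
 * difference:
 *
 *     ssd = 0;
 *     for( y = 0; y < h; y++ )
 *         for( x = 0; x < w; x++ )
 *         {
 *             int d = pix1[y*i_stride_pix1+x] - pix2[y*i_stride_pix2+x];
 *             ssd += d * d;
 *         }
 */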
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
                                     uint8_t *pix2, int i_stride_pix2 )
{
    DECLARE_ALIGNED_16( int sum );
    int y;

    LOAD_ZERO;
    vec_u8_t  pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t permA, permB;

    sumv = vec_splat_u32(0);

    permA = vec_lvsl(0, pix2);
    permB = vec_lvsl(0, pix2 + i_stride_pix2);

    /* software-pipelined: the first row pair is loaded before the loop */
    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vA = vec_perm(temp_lv, temp_hv, permA);
    pix1vA = vec_ld(0, pix1);

    for (y=0; y < 7; y++)
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxA = vec_max(pix1vA, pix2vA);
        minA = vec_min(pix1vA, pix2vA);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vB = vec_perm(temp_lv, temp_hv, permB);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_sub(maxA, minA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        maxB = vec_max(pix1vB, pix2vB);
        minB = vec_min(pix1vB, pix2vB);

        temp_lv = vec_ld(0, pix2);
        temp_hv = vec_ld(16, pix2);
        pix2vA = vec_perm(temp_lv, temp_hv, permA);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_sub(maxB, minB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    /* epilogue: finish the last two rows left over from the pipeline */
    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    temp_lv = vec_ld(0, pix2);
    temp_hv = vec_ld(16, pix2);
    pix2vB = vec_perm(temp_lv, temp_hv, permB);
    pix1vB = vec_ld(0, pix1);

    maxA = vec_max(pix1vA, pix2vA);
    minA = vec_min(pix1vA, pix2vA);

    maxB = vec_max(pix1vB, pix2vB);
    minB = vec_min(pix1vB, pix2vB);

    diffA = vec_sub(maxA, minA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_sub(maxB, minB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2 )
{
    DECLARE_ALIGNED_16( int sum );
    int y;

    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t maxv, minv, diffv;
    vec_u8_t temp_lv, temp_hv;
    vec_u8_t perm1v, perm2v;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);

    sumv = vec_splat_u32(0);

    perm1v = vec_lvsl(0, pix1);
    perm2v = vec_lvsl(0, pix2);

    for (y=0; y < 8; y++)
    {
        temp_hv = vec_ld(0, pix1);
        temp_lv = vec_ld(7, pix1);
        pix1v = vec_perm(temp_hv, temp_lv, perm1v);

        temp_hv = vec_ld(0, pix2);
        temp_lv = vec_ld(7, pix2);
        pix2v = vec_perm(temp_hv, temp_lv, perm2v);

        maxv = vec_max(pix1v, pix2v);
        minv = vec_min(pix1v, pix2v);

        diffv = vec_sub(maxv, minv);
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }

    /* only the low 8 bytes of each row belong to the block:
     * keep the two low 32-bit sums, zero the junk from the high half */
    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v )\
{\
    /* int    a0  =        SRC(0) + SRC(4) */\
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
    /* int    a4  =        SRC(0) - SRC(4) */\
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
    /* int    a1  =        SRC(1) + SRC(5) */\
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
    /* int    a5  =        SRC(1) - SRC(5) */\
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
    /* int    a2  =        SRC(2) + SRC(6) */\
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
    /* int    a6  =        SRC(2) - SRC(6) */\
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
    /* int    a3  =        SRC(3) + SRC(7) */\
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
    /* int    a7  =        SRC(3) - SRC(7) */\
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
\
    /* int    b0  =         a0 + a2  */\
    vec_s16_t b0v = vec_add(a0v, a2v); \
    /* int    b2  =         a0 - a2; */\
    vec_s16_t b2v = vec_sub(a0v, a2v); \
    /* int    b1  =         a1 + a3; */\
    vec_s16_t b1v = vec_add(a1v, a3v); \
    /* int    b3  =         a1 - a3; */\
    vec_s16_t b3v = vec_sub(a1v, a3v); \
    /* int    b4  =         a4 + a6; */\
    vec_s16_t b4v = vec_add(a4v, a6v); \
    /* int    b6  =         a4 - a6; */\
    vec_s16_t b6v = vec_sub(a4v, a6v); \
    /* int    b5  =         a5 + a7; */\
    vec_s16_t b5v = vec_add(a5v, a7v); \
    /* int    b7  =         a5 - a7; */\
    vec_s16_t b7v = vec_sub(a5v, a7v); \
\
    /* DST(0,        b0 + b1) */\
    sa8d0v = vec_add(b0v, b1v); \
    /* DST(1,        b0 - b1) */\
    sa8d1v = vec_sub(b0v, b1v); \
    /* DST(2,        b2 + b3) */\
    sa8d2v = vec_add(b2v, b3v); \
    /* DST(3,        b2 - b3) */\
    sa8d3v = vec_sub(b2v, b3v); \
    /* DST(4,        b4 + b5) */\
    sa8d4v = vec_add(b4v, b5v); \
    /* DST(5,        b4 - b5) */\
    sa8d5v = vec_sub(b4v, b5v); \
    /* DST(6,        b6 + b7) */\
    sa8d6v = vec_add(b6v, b7v); \
    /* DST(7,        b6 - b7) */\
    sa8d7v = vec_sub(b6v, b7v); \
}
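
/* A reading aid, not built: the macro above is one full 1-D 8-point
 * Hadamard transform written as three butterfly stages (distance 4,
 * then 2, then 1), applied to all 8 vector lanes at once.  The caller
 * runs it once over rows and once over columns, with VEC_TRANSPOSE_8
 * in between, to obtain the 2-D transform. */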
static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    DECLARE_ALIGNED_16( int i_satd );

    PREP_DIFF;

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    /* first 1-D transform, over rows */
    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* second 1-D transform, over columns */
    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulation of the absolute value of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}
static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    /* sa8d is the rounded quarter of the summed Hadamard magnitudes */
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;

    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int32_t i_satd;

    /* four 8x8 cores cover the 16x16 block, rounded and scaled once */
    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;

    return i_satd;
}
/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
                                     const uint8_t *pix2, int stride2,
                                     int sums[2][4] )
{
    DECLARE_ALIGNED_16( int temp[4] );

    int y;
    LOAD_ZERO;
    PREP_LOAD;
    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( y = 0; y < 4; y++ )
    {
        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t );
        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t );

        s1v = vec_sum4s( pix1v, s1v );
        s2v = vec_sum4s( pix2v, s2v );
        ssv = vec_msum( pix1v, pix1v, ssv );
        ssv = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}
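
/* Note on the layout above (derived from the code): one call covers two
 * horizontally adjacent 4x4 windows using a single 16-byte load per
 * row.  For window k (k=0: bytes 0-3, k=1: bytes 4-7), sums[k] holds
 * { sum(pix1), sum(pix2), sum(pix1^2) + sum(pix2^2), sum(pix1*pix2) },
 * the running moments the SSIM formula consumes. */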
/****************************************************************************
 * x264_pixel_altivec_init:
 ****************************************************************************/
void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
{
    pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]  = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]  = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]   = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
}