/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/

#include "common.h"

#ifdef HAVE_MMX
#   include "x86/pixel.h"
#endif
#ifdef ARCH_PPC
#   include "ppc/pixel.h"
#endif
#ifdef ARCH_UltraSparc
#   include "sparc/pixel.h"
#endif

/****************************************************************************
 * pixel_sad_WxH
 ****************************************************************************/
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    int x, y;                                       \
    for( y = 0; y < ly; y++ )                       \
    {                                               \
        for( x = 0; x < lx; x++ )                   \
        {                                           \
            i_sum += abs( pix1[x] - pix2[x] );      \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
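
/* SAD (sum of absolute differences) is the workhorse cost of motion estimation;
 * these C loops are the portable reference that x264_pixel_init() below replaces
 * with asm implementations when the CPU allows. */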

/****************************************************************************
 * pixel_ssd_WxH
 ****************************************************************************/
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    int x, y;                                       \
    for( y = 0; y < ly; y++ )                       \
    {                                               \
        for( x = 0; x < lx; x++ )                   \
        {                                           \
            int d = pix1[x] - pix2[x];              \
            i_sum += d*d;                           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}
PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )

int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
    int64_t i_ssd = 0;
    int x, y;
    int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    for( y = 0; y < i_height-15; y += 16 )
    {
        x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_16x8);
    }
    if( y < i_height-7 )
        for( x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD
#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    if( i_width % 8 != 0 )
        for( y = 0; y < (i_height & ~7); y++ )
            for( x = i_width & ~7; x < i_width; x++ )
                SSD1;
    if( i_height % 8 != 0 )
        for( y = i_height & ~7; y < i_height; y++ )
            for( x = 0; x < i_width; x++ )
                SSD1;
#undef SSD1
    return i_ssd;
}
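
/* The SSD accumulated here feeds the encoder's PSNR statistics;
 * for an 8-bit WxH plane, psnr = 10*log10( 255.0*255.0*W*H / ssd ). */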

static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int y, x;
    for( y = 0; y < i_size; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
}

/****************************************************************************
 * pixel_var_wxh
 ****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
{                                             \
    uint32_t var = 0, sum = 0, sqr = 0;       \
    int x, y;                                 \
    for( y = 0; y < w; y++ )                  \
    {                                         \
        for( x = 0; x < w; x++ )              \
        {                                     \
            sum += pix[x];                    \
            sqr += pix[x] * pix[x];           \
        }                                     \
        pix += i_stride;                      \
    }                                         \
    var = sqr - (sum * sum >> shift);         \
    *sad = sum;                               \
    return var;                               \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
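
/* In PIXEL_VAR_C, shift is log2 of the pixel count N, so the return value is
 * sum(x^2) - (sum(x))^2/N, i.e. N times the sample variance:
 * 16x16 -> N=256 (shift 8), 8x8 -> N=64 (shift 6).
 * The block sum is returned through the *sad pointer. */
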
/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
    int16_t tmp[4][4];
    int16_t diff[4][4];
    int x, y;
    int i_satd = 0;
    for( y = 0; y < i_height; y += 4 )
    {
        for( x = 0; x < i_width; x += 4 )
        {
            int d;
            pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );
            for( d = 0; d < 4; d++ )
            {
                int s01 = diff[d][0] + diff[d][1], s23 = diff[d][2] + diff[d][3];
                int d01 = diff[d][0] - diff[d][1], d23 = diff[d][2] - diff[d][3];
                tmp[d][0] = s01 + s23;
                tmp[d][1] = s01 - s23;
                tmp[d][2] = d01 - d23;
                tmp[d][3] = d01 + d23;
            }
            for( d = 0; d < 4; d++ )
            {
                int s01 = tmp[0][d] + tmp[1][d], s23 = tmp[2][d] + tmp[3][d];
                int d01 = tmp[0][d] - tmp[1][d], d23 = tmp[2][d] - tmp[3][d];
                i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 );
            }
        }
        pix1 += 4 * i_pix1;
        pix2 += 4 * i_pix2;
    }
    return i_satd / 2;
}

#define PIXEL_SATD_C( name, width, height ) \
static int name( uint8_t *pix1, int i_stride_pix1, \
                 uint8_t *pix2, int i_stride_pix2 ) \
{ \
    return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
}

PIXEL_SATD_C( x264_pixel_satd_16x16, 16, 16 )
PIXEL_SATD_C( x264_pixel_satd_16x8,  16,  8 )
PIXEL_SATD_C( x264_pixel_satd_8x16,   8, 16 )
PIXEL_SATD_C( x264_pixel_satd_8x8,    8,  8 )
PIXEL_SATD_C( x264_pixel_satd_8x4,    8,  4 )
PIXEL_SATD_C( x264_pixel_satd_4x8,    4,  8 )
PIXEL_SATD_C( x264_pixel_satd_4x4,    4,  4 )
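
/* SATD runs an (unnormalized) 4x4 Hadamard transform over each residual block
 * before summing absolute values, which approximates the post-transform coding
 * cost better than plain SAD; the /2 in pixel_satd_wxh compensates for the
 * transform's gain. */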

/****************************************************************************
 * pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
 ****************************************************************************/
#define SA8D_1D {\
    const int a0 = SRC(0) + SRC(4);\
    const int a4 = SRC(0) - SRC(4);\
    const int a1 = SRC(1) + SRC(5);\
    const int a5 = SRC(1) - SRC(5);\
    const int a2 = SRC(2) + SRC(6);\
    const int a6 = SRC(2) - SRC(6);\
    const int a3 = SRC(3) + SRC(7);\
    const int a7 = SRC(3) - SRC(7);\
    const int b0 = a0 + a2;\
    const int b2 = a0 - a2;\
    const int b1 = a1 + a3;\
    const int b3 = a1 - a3;\
    const int b4 = a4 + a6;\
    const int b6 = a4 - a6;\
    const int b5 = a5 + a7;\
    const int b7 = a5 - a7;\
    DST(0, b0 + b1);\
    DST(1, b0 - b1);\
    DST(2, b2 + b3);\
    DST(3, b2 - b3);\
    DST(4, b4 + b5);\
    DST(5, b4 - b5);\
    DST(6, b6 + b7);\
    DST(7, b6 - b7);\
}
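
/* SA8D_1D is a single 8-point Hadamard butterfly. pixel_sa8d_wxh below applies
 * it first along rows and then along columns by redefining the SRC/DST accessor
 * macros, accumulating absolute values on the second (column) pass. */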

static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
                                  int i_width, int i_height )
{
    int16_t diff[8][8];
    int i_satd = 0;
    int x, y;
    for( y = 0; y < i_height; y += 8 )
    {
        for( x = 0; x < i_width; x += 8 )
        {
            int i;
            pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );
#define SRC(x)     diff[i][x]
#define DST(x,rhs) diff[i][x] = (rhs)
            for( i = 0; i < 8; i++ )
                SA8D_1D
#undef SRC
#undef DST
#define SRC(x)     diff[x][i]
#define DST(x,rhs) i_satd += abs(rhs)
            for( i = 0; i < 8; i++ )
                SA8D_1D
#undef SRC
#undef DST
        }
        pix1 += 8 * i_pix1;
        pix2 += 8 * i_pix2;
    }
    return i_satd;
}

#define PIXEL_SA8D_C( width, height ) \
static int x264_pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
                                               uint8_t *pix2, int i_stride_pix2 ) \
{ \
    return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \
}
PIXEL_SA8D_C( 16, 16 )
PIXEL_SA8D_C( 16, 8 )
PIXEL_SA8D_C( 8, 16 )
PIXEL_SA8D_C( 8, 8 )
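
/* The (+2)>>2 rounding in PIXEL_SA8D_C rescales the raw 8x8 Hadamard sum so
 * that sa8d scores sit on roughly the same scale as the 4x4-based satd scores. */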

/****************************************************************************
 * pixel_sad_x4
 ****************************************************************************/
#define SAD_X( size ) \
static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )
#ifdef ARCH_UltraSparc
SAD_X( 16x16_vis )
SAD_X( 16x8_vis )
SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif

/****************************************************************************
 * pixel_satd_x4
 * no faster than single satd, but needed for satd to be a drop-in replacement for sad
 ****************************************************************************/
#define SATD_X( size, cpu ) \
static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
#define SATD_X_DECL5( cpu )\
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
SATD_X( 8x4, cpu )
#define SATD_X_DECL7( cpu )\
SATD_X_DECL5( cpu )\
SATD_X( 4x8, cpu )\
SATD_X( 4x4, cpu )

SATD_X_DECL7()
#ifdef HAVE_MMX
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL5( _ssse3_phadd )
#endif

/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
                             const uint8_t *pix2, int stride2,
                             int sums[2][4] )
{
    int x, y, z;
    for( z = 0; z < 2; z++ )
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
            {
                int a = pix1[x+y*stride1];
                int b = pix2[x+y*stride2];
                s1  += a;
                s2  += b;
                ss  += a*a + b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
    int vars = ss*64 - s1*s1 - s2*s2;
    int covar = s12*64 - s1*s2;
    return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
}
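
/* ssim_end1() evaluates the usual SSIM expression
 *   (2*mu1*mu2 + C1)*(2*cov + C2) / ((mu1^2 + mu2^2 + C1)*(var1 + var2 + C2))
 * on integer sums taken over one 8x8 window (64 pixels). The constants derive
 * from the standard C1 = (0.01*255)^2 and C2 = (0.03*255)^2, scaled by the 64
 * and 64*63 factors for use with the un-normalized sums computed above. */
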
static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    int i;
    float ssim = 0.0;
    for( i = 0; i < width; i++ )
        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
    return ssim;
}

float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           uint8_t *pix1, int stride1,
                           uint8_t *pix2, int stride2,
                           int width, int height )
{
    int x, y, z = 0;
    float ssim = 0.0;
    int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
    int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
    width >>= 2;
    height >>= 2;
    for( y = 1; y < height; y++ )
    {
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        for( x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    x264_free( sum0 );
    x264_free( sum1 );
    return ssim / ((height-1) * (width-1));
}
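
/* x264_pixel_ssim_wxh() averages SSIM over 8x8 windows spaced on a 4-pixel grid:
 * each window combines four neighbouring 4x4 sums, two from the current row of
 * blocks and two from the previous one. */
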
/****************************************************************************
 * successive elimination
 ****************************************************************************/
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0, i;
    for( i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] ) + cost_mvx[i];
        if( ads < thresh ) mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0, i;
    for( i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[delta] ) + cost_mvx[i];
        if( ads < thresh ) mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0, i;
    for( i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] ) + cost_mvx[i];
        if( ads < thresh ) mvs[nmv++] = i;
    }
    return nmv;
}
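
/* Successive elimination: enc_dc[] holds sub-block sums of the source block and
 * sums[] the matching running sums at each candidate position. |sum(a) - sum(b)|
 * is a quick lower bound on the SAD, so candidates whose bound plus MV cost
 * already reaches thresh are skipped; the indices of the survivors are written
 * to mvs[] and their count returned. */
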
/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
    memset( pixf, 0, sizeof(*pixf) );

#define INIT2( name, cpu ) \
    pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
    pixf->name[PIXEL_16x8]  = x264_pixel_##name##_16x8##cpu;
#define INIT4( name, cpu ) \
    INIT2( name, cpu ) \
    pixf->name[PIXEL_8x16]  = x264_pixel_##name##_8x16##cpu;\
    pixf->name[PIXEL_8x8]   = x264_pixel_##name##_8x8##cpu;
#define INIT5( name, cpu ) \
    INIT4( name, cpu ) \
    pixf->name[PIXEL_8x4]   = x264_pixel_##name##_8x4##cpu;
#define INIT7( name, cpu ) \
    INIT5( name, cpu ) \
    pixf->name[PIXEL_4x8]   = x264_pixel_##name##_4x8##cpu;\
    pixf->name[PIXEL_4x4]   = x264_pixel_##name##_4x4##cpu;

#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8]  = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8]   = x264_pixel_ads1##cpu;

    INIT7( sad, );
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT7( ssd, );
    INIT7( satd, );
    INIT7( satd_x3, );
    INIT7( satd_x4, );
    INIT4( sa8d, );
    INIT_ADS( );

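    /* The assignments that follow keep to one pattern: portable C versions are
     * registered first, then selectively overridden by the asm implementations
     * as the cpu flag bits allow. */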
    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;
#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        INIT7( ssd, _mmx );
    }

    if( cpu&X264_CPU_MMXEXT )
    {
        INIT7( sad, _mmxext );
        INIT7( sad_x3, _mmxext );
        INIT7( sad_x4, _mmxext );
        INIT7( satd, _mmxext );
        INIT7( satd_x3, _mmxext );
        INIT7( satd_x4, _mmxext );
        INIT_ADS( _mmxext );

        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmxext;

        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_mmxext;
#ifdef ARCH_X86
        if( cpu&X264_CPU_CACHELINE_32 )
        {
            INIT5( sad, _cache32_mmxext );
            INIT4( sad_x3, _cache32_mmxext );
            INIT4( sad_x4, _cache32_mmxext );
        }
        else if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT5( sad, _cache64_mmxext );
            INIT4( sad_x3, _cache64_mmxext );
            INIT4( sad_x4, _cache64_mmxext );
        }
#else
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmxext;
        }
#endif
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
    }

    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
        }
    }

    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( satd, _sse2 );
        INIT5( satd_x3, _sse2 );
        INIT5( satd_x4, _sse2 );
        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_sse2;
        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4         = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
    }

    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        INIT7( satd, _ssse3 );
        INIT7( satd_x3, _ssse3 );
        INIT7( satd_x4, _ssse3 );

        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
        if( cpu&X264_CPU_PHADD_IS_FAST )
        {
            INIT5( satd, _ssse3_phadd );
            INIT5( satd_x3, _ssse3_phadd );
            INIT5( satd_x4, _ssse3_phadd );
        }
    }
#endif //HAVE_MMX

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif

#ifdef ARCH_UltraSparc
    INIT4( sad, _vis );
    INIT4( sad_x3, _vis );
    INIT4( sad_x4, _vis );
#endif

    pixf->ads[PIXEL_8x16] =
    pixf->ads[PIXEL_8x4] =
    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
}