/* common/pixel.c — x264 project (gitweb blob 27575b5ca0f40f2bb2ffae94a33b3f1d99f2396f) */
1 /*****************************************************************************
2  * pixel.c: h264 encoder
3  *****************************************************************************
4  * Copyright (C) 2003-2008 x264 project
5  *
6  * Authors: Loren Merritt <lorenm@u.washington.edu>
7  *          Laurent Aimar <fenrir@via.ecp.fr>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include "common.h"
25
26 #ifdef HAVE_MMX
27 #   include "x86/pixel.h"
28 #endif
29 #ifdef ARCH_PPC
30 #   include "ppc/pixel.h"
31 #endif
32 #ifdef ARCH_UltraSparc
33 #   include "sparc/pixel.h"
34 #endif
35
36
37 /****************************************************************************
38  * pixel_sad_WxH
39  ****************************************************************************/
40 #define PIXEL_SAD_C( name, lx, ly ) \
41 static int name( uint8_t *pix1, int i_stride_pix1,  \
42                  uint8_t *pix2, int i_stride_pix2 ) \
43 {                                                   \
44     int i_sum = 0;                                  \
45     int x, y;                                       \
46     for( y = 0; y < ly; y++ )                       \
47     {                                               \
48         for( x = 0; x < lx; x++ )                   \
49         {                                           \
50             i_sum += abs( pix1[x] - pix2[x] );      \
51         }                                           \
52         pix1 += i_stride_pix1;                      \
53         pix2 += i_stride_pix2;                      \
54     }                                               \
55     return i_sum;                                   \
56 }
57
58
59 PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
60 PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
61 PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
62 PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
63 PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
64 PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
65 PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
66
67
68 /****************************************************************************
69  * pixel_ssd_WxH
70  ****************************************************************************/
71 #define PIXEL_SSD_C( name, lx, ly ) \
72 static int name( uint8_t *pix1, int i_stride_pix1,  \
73                  uint8_t *pix2, int i_stride_pix2 ) \
74 {                                                   \
75     int i_sum = 0;                                  \
76     int x, y;                                       \
77     for( y = 0; y < ly; y++ )                       \
78     {                                               \
79         for( x = 0; x < lx; x++ )                   \
80         {                                           \
81             int d = pix1[x] - pix2[x];              \
82             i_sum += d*d;                           \
83         }                                           \
84         pix1 += i_stride_pix1;                      \
85         pix2 += i_stride_pix2;                      \
86     }                                               \
87     return i_sum;                                   \
88 }
89
90 PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
91 PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
92 PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
93 PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
94 PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
95 PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
96 PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )
97
98 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
99 {
100     int64_t i_ssd = 0;
101     int x, y;
102     int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
103
104 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
105                                           pix2 + y*i_pix2 + x, i_pix2 );
106     for( y = 0; y < i_height-15; y += 16 )
107     {
108         x = 0;
109         if( align )
110             for( ; x < i_width-15; x += 16 )
111                 SSD(PIXEL_16x16);
112         for( ; x < i_width-7; x += 8 )
113             SSD(PIXEL_8x16);
114     }
115     if( y < i_height-7 )
116         for( x = 0; x < i_width-7; x += 8 )
117             SSD(PIXEL_8x8);
118 #undef SSD
119
120 #define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
121     if( i_width % 8 != 0 )
122     {
123         for( y = 0; y < (i_height & ~7); y++ )
124             for( x = i_width & ~7; x < i_width; x++ )
125                 SSD1;
126     }
127     if( i_height % 8 != 0 )
128     {
129         for( y = i_height & ~7; y < i_height; y++ )
130             for( x = 0; x < i_width; x++ )
131                 SSD1;
132     }
133 #undef SSD1
134
135     return i_ssd;
136 }
137
138
139 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
140                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
141 {
142     int y, x;
143     for( y = 0; y < i_size; y++ )
144     {
145         for( x = 0; x < i_size; x++ )
146         {
147             diff[x + y*i_size] = pix1[x] - pix2[x];
148         }
149         pix1 += i_pix1;
150         pix2 += i_pix2;
151     }
152 }
153
154
155 /****************************************************************************
156  * pixel_var_wxh
157  ****************************************************************************/
158 #define PIXEL_VAR_C( name, w, shift ) \
159 static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
160 {                                             \
161     uint32_t var = 0, sum = 0, sqr = 0;       \
162     int x, y;                                 \
163     for( y = 0; y < w; y++ )                  \
164     {                                         \
165         for( x = 0; x < w; x++ )              \
166         {                                     \
167             sum += pix[x];                    \
168             sqr += pix[x] * pix[x];           \
169         }                                     \
170         pix += i_stride;                      \
171     }                                         \
172     var = sqr - (sum * sum >> shift);         \
173     *sad = sum;                               \
174     return var;                               \
175 }
176
177 PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
178 PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
179
180
181
182 /****************************************************************************
183  * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
184  ****************************************************************************/
/* SATD over an i_width x i_height region, computed as the sum of the absolute
 * coefficients of a 4x4 Hadamard transform applied to each 4x4 difference
 * block.  Dimensions must be multiples of 4.  The unnormalized transform has
 * a net gain of 2, hence the final division. */
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
    int16_t tmp[4][4];
    int16_t diff[4][4];
    int x, y;
    int i_satd = 0;

    for( y = 0; y < i_height; y += 4 )
    {
        for( x = 0; x < i_width; x += 4 )
        {
            int d;

            pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );

            /* horizontal pass: 4-point Hadamard butterfly on each row */
            for( d = 0; d < 4; d++ )
            {
                int s01, s23;
                int d01, d23;

                s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
                d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];

                tmp[d][0] = s01 + s23;
                tmp[d][1] = s01 - s23;
                tmp[d][2] = d01 - d23;
                tmp[d][3] = d01 + d23;
            }
            /* vertical pass on each column, accumulating |coefficient| directly */
            for( d = 0; d < 4; d++ )
            {
                int s01, s23;
                int d01, d23;

                s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
                d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];

                i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 );
            }

        }
        pix1 += 4 * i_pix1;
        pix2 += 4 * i_pix2;
    }

    /* normalize: the two transform passes double the magnitude */
    return i_satd / 2;
}
231 #define PIXEL_SATD_C( name, width, height ) \
232 static int name( uint8_t *pix1, int i_stride_pix1, \
233                  uint8_t *pix2, int i_stride_pix2 ) \
234 { \
235     return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
236 }
237 PIXEL_SATD_C( x264_pixel_satd_16x16, 16, 16 )
238 PIXEL_SATD_C( x264_pixel_satd_16x8,  16, 8 )
239 PIXEL_SATD_C( x264_pixel_satd_8x16,  8, 16 )
240 PIXEL_SATD_C( x264_pixel_satd_8x8,   8, 8 )
241 PIXEL_SATD_C( x264_pixel_satd_8x4,   8, 4 )
242 PIXEL_SATD_C( x264_pixel_satd_4x8,   4, 8 )
243 PIXEL_SATD_C( x264_pixel_satd_4x4,   4, 4 )
244
245
246 /****************************************************************************
247  * pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
248  ****************************************************************************/
/* One 8-point Hadamard transform ("butterfly" network).  Element access is
 * abstracted through SRC(i) and DST(i, value), which the caller must #define
 * at the point of use (row vs. column access, store vs. accumulate). */
#define SA8D_1D {\
    const int t0 = SRC(0) + SRC(4);\
    const int t4 = SRC(0) - SRC(4);\
    const int t1 = SRC(1) + SRC(5);\
    const int t5 = SRC(1) - SRC(5);\
    const int t2 = SRC(2) + SRC(6);\
    const int t6 = SRC(2) - SRC(6);\
    const int t3 = SRC(3) + SRC(7);\
    const int t7 = SRC(3) - SRC(7);\
    const int u0 = t0 + t2;\
    const int u2 = t0 - t2;\
    const int u1 = t1 + t3;\
    const int u3 = t1 - t3;\
    const int u4 = t4 + t6;\
    const int u6 = t4 - t6;\
    const int u5 = t5 + t7;\
    const int u7 = t5 - t7;\
    DST(0, u0 + u1);\
    DST(1, u0 - u1);\
    DST(2, u2 + u3);\
    DST(3, u2 - u3);\
    DST(4, u4 + u5);\
    DST(5, u4 - u5);\
    DST(6, u6 + u7);\
    DST(7, u6 - u7);\
}
275
/* Sum of 8x8 Hadamard-transformed differences over an i_width x i_height
 * region; dimensions must be multiples of 8.  Returns the raw (unnormalized)
 * coefficient sum — the PIXEL_SA8D_C wrappers apply the rounding shift. */
static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
                                  int i_width, int i_height )
{
    int16_t diff[8][8];
    int i_satd = 0;
    int x, y;

    for( y = 0; y < i_height; y += 8 )
    {
        for( x = 0; x < i_width; x += 8 )
        {
            int i;
            pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );

            /* horizontal pass: transform each row of the difference block in place */
#define SRC(x)     diff[i][x]
#define DST(x,rhs) diff[i][x] = (rhs)
            for( i = 0; i < 8; i++ )
                SA8D_1D
#undef SRC
#undef DST

            /* vertical pass: transform each column, accumulating |coefficient|
             * directly instead of storing the result */
#define SRC(x)     diff[x][i]
#define DST(x,rhs) i_satd += abs(rhs)
            for( i = 0; i < 8; i++ )
                SA8D_1D
#undef SRC
#undef DST
        }
        pix1 += 8 * i_pix1;
        pix2 += 8 * i_pix2;
    }

    return i_satd;
}
310
311 #define PIXEL_SA8D_C( width, height ) \
312 static int x264_pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
313                                                uint8_t *pix2, int i_stride_pix2 ) \
314 { \
315     return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \
316 }
317 PIXEL_SA8D_C( 16, 16 )
318 PIXEL_SA8D_C( 16, 8 )
319 PIXEL_SA8D_C( 8, 16 )
320 PIXEL_SA8D_C( 8, 8 )
321
322 /****************************************************************************
323  * pixel_sad_x4
324  ****************************************************************************/
325 #define SAD_X( size ) \
326 static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
327 {\
328     scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
329     scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
330     scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
331 }\
332 static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
333 {\
334     scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
335     scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
336     scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
337     scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
338 }
339
340 SAD_X( 16x16 )
341 SAD_X( 16x8 )
342 SAD_X( 8x16 )
343 SAD_X( 8x8 )
344 SAD_X( 8x4 )
345 SAD_X( 4x8 )
346 SAD_X( 4x4 )
347
348 #ifdef ARCH_UltraSparc
349 SAD_X( 16x16_vis )
350 SAD_X( 16x8_vis )
351 SAD_X( 8x16_vis )
352 SAD_X( 8x8_vis )
353 #endif
354
355 /****************************************************************************
356  * pixel_satd_x4
357  * no faster than single satd, but needed for satd to be a drop-in replacement for sad
358  ****************************************************************************/
359
360 #define SATD_X( size, cpu ) \
361 static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
362 {\
363     scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
364     scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
365     scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
366 }\
367 static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
368 {\
369     scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
370     scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
371     scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
372     scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
373 }
374 #define SATD_X_DECL5( cpu )\
375 SATD_X( 16x16, cpu )\
376 SATD_X( 16x8, cpu )\
377 SATD_X( 8x16, cpu )\
378 SATD_X( 8x8, cpu )\
379 SATD_X( 8x4, cpu )
380 #define SATD_X_DECL7( cpu )\
381 SATD_X_DECL5( cpu )\
382 SATD_X( 4x8, cpu )\
383 SATD_X( 4x4, cpu )
384
385 SATD_X_DECL7()
386 #ifdef HAVE_MMX
387 SATD_X_DECL7( _mmxext )
388 SATD_X_DECL5( _sse2 )
389 SATD_X_DECL7( _ssse3 )
390 SATD_X_DECL5( _ssse3_phadd )
391 #endif
392
393 /****************************************************************************
394  * structural similarity metric
395  ****************************************************************************/
396 static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
397                              const uint8_t *pix2, int stride2,
398                              int sums[2][4])
399 {
400     int x, y, z;
401     for(z=0; z<2; z++)
402     {
403         uint32_t s1=0, s2=0, ss=0, s12=0;
404         for(y=0; y<4; y++)
405             for(x=0; x<4; x++)
406             {
407                 int a = pix1[x+y*stride1];
408                 int b = pix2[x+y*stride2];
409                 s1  += a;
410                 s2  += b;
411                 ss  += a*a;
412                 ss  += b*b;
413                 s12 += a*b;
414             }
415         sums[z][0] = s1;
416         sums[z][1] = s2;
417         sums[z][2] = ss;
418         sums[z][3] = s12;
419         pix1 += 4;
420         pix2 += 4;
421     }
422 }
423
/* SSIM of one 8x8 window given its accumulated statistics (sums scaled by
 * the 64-pixel window size, hence the *64 de-scaling below). */
static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    /* stabilizing constants (C1, C2) scaled to the integer statistics domain */
    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
    const int vars  = ss*64 - s1*s1 - s2*s2;
    const int covar = s12*64 - s1*s2;
    const float num = (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2);
    const float den = (float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2);
    return num / den;
}

/* Sum SSIM over up to `width` 8x8 windows; each window combines the stats of
 * four 4x4 blocks (two per row from sum0/sum1, overlapping horizontally). */
static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float ssim = 0.0;
    int i;
    for( i = 0; i < width; i++ )
    {
        const int s1  = sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0];
        const int s2  = sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1];
        const int ss  = sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2];
        const int s12 = sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3];
        ssim += ssim_end1( s1, s2, ss, s12 );
    }
    return ssim;
}
445
/* Sum of per-window SSIM scores over a width x height plane (caller is
 * responsible for dividing by the window count to obtain mean SSIM).
 * Works in the 4x4-block domain: sum0/sum1 hold one row of per-block
 * statistics each, swapped as a rolling pair as y advances. */
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           uint8_t *pix1, int stride1,
                           uint8_t *pix2, int stride2,
                           int width, int height )
{
    int x, y, z;
    float ssim = 0.0;
    /* NOTE(review): x264_malloc results are not NULL-checked here —
     * presumably allocation failure is handled (or aborts) elsewhere in the
     * project; confirm before relying on this in new call sites. */
    int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
    int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
    /* switch to 4x4-block units */
    width >>= 2;
    height >>= 2;
    z = 0;
    for( y = 1; y < height; y++ )
    {
        /* ensure sum1 holds block row y-1 and sum0 holds block row y */
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        /* score overlapping 8x8 windows, up to 4 per ssim_end4 call */
        for( x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    x264_free(sum0);
    x264_free(sum1);
    return ssim;
}
473
474
475 /****************************************************************************
476  * successive elimination
477  ****************************************************************************/
478 static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
479                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
480 {
481     int nmv=0, i;
482     for( i=0; i<width; i++, sums++ )
483     {
484         int ads = abs( enc_dc[0] - sums[0] )
485                 + abs( enc_dc[1] - sums[8] )
486                 + abs( enc_dc[2] - sums[delta] )
487                 + abs( enc_dc[3] - sums[delta+8] )
488                 + cost_mvx[i];
489         if( ads < thresh )
490             mvs[nmv++] = i;
491     }
492     return nmv;
493 }
494
495 static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
496                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
497 {
498     int nmv=0, i;
499     for( i=0; i<width; i++, sums++ )
500     {
501         int ads = abs( enc_dc[0] - sums[0] )
502                 + abs( enc_dc[1] - sums[delta] )
503                 + cost_mvx[i];
504         if( ads < thresh )
505             mvs[nmv++] = i;
506     }
507     return nmv;
508 }
509
510 static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
511                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
512 {
513     int nmv=0, i;
514     for( i=0; i<width; i++, sums++ )
515     {
516         int ads = abs( enc_dc[0] - sums[0] )
517                 + cost_mvx[i];
518         if( ads < thresh )
519             mvs[nmv++] = i;
520     }
521     return nmv;
522 }
523
524
525 /****************************************************************************
526  * x264_pixel_init:
527  ****************************************************************************/
/* Populate *pixf with the best available implementation of every pixel
 * metric for the capabilities in the cpu flag word.  C fallbacks are
 * installed first, then progressively overwritten by MMX / SSE2 / SSE3 /
 * SSSE3 / AltiVec / VIS versions as each capability bit is present. */
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
    memset( pixf, 0, sizeof(*pixf) );

/* register one function family for the 2/4/5/7 most common block sizes */
#define INIT2( name, cpu ) \
    pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
    pixf->name[PIXEL_16x8]  = x264_pixel_##name##_16x8##cpu;
#define INIT4( name, cpu ) \
    INIT2( name, cpu ) \
    pixf->name[PIXEL_8x16]  = x264_pixel_##name##_8x16##cpu;\
    pixf->name[PIXEL_8x8]   = x264_pixel_##name##_8x8##cpu;
#define INIT5( name, cpu ) \
    INIT4( name, cpu ) \
    pixf->name[PIXEL_8x4]   = x264_pixel_##name##_8x4##cpu;
#define INIT7( name, cpu ) \
    INIT5( name, cpu ) \
    pixf->name[PIXEL_4x8]   = x264_pixel_##name##_4x8##cpu;\
    pixf->name[PIXEL_4x4]   = x264_pixel_##name##_4x4##cpu;

/* ads slots map block sizes to the number of DC terms (4/2/1) they need */
#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;

    /* portable C implementations — always available baseline */
    INIT7( sad, );
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT7( ssd, );
    INIT7( satd, );
    INIT7( satd_x3, );
    INIT7( satd_x4, );
    INIT4( sa8d, );
    INIT_ADS( );

    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;

#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        INIT7( ssd, _mmx );
    }

    if( cpu&X264_CPU_MMXEXT )
    {
        INIT7( sad, _mmxext );
        INIT7( sad_x3, _mmxext );
        INIT7( sad_x4, _mmxext );
        INIT7( satd, _mmxext );
        INIT7( satd_x3, _mmxext );
        INIT7( satd_x4, _mmxext );
        INIT_ADS( _mmxext );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmxext;
#ifdef ARCH_X86
        /* 32-bit x86 only: these mmxext kernels are not built for x86-64 */
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;

        /* cacheline-split-aware SAD variants, keyed on cacheline size */
        if( cpu&X264_CPU_CACHELINE_32 )
        {
            INIT5( sad, _cache32_mmxext );
            INIT4( sad_x3, _cache32_mmxext );
            INIT4( sad_x4, _cache32_mmxext );
        }
        else if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT5( sad, _cache64_mmxext );
            INIT4( sad_x3, _cache64_mmxext );
            INIT4( sad_x4, _cache64_mmxext );
        }
#else
        /* x86-64: only the 8-wide cacheline-split variants exist */
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmxext;
        }
#endif
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
    }

    /* SSE2 SAD only when SSE2 is not flagged as slow on this chip */
    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        INIT_ADS( _sse2 );
        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;

#ifdef ARCH_X86
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
        }
#endif
    }
    /* these SSE2 kernels are worthwhile even on slow-SSE2 chips */
    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( ssd, _sse2 );
        INIT5( satd, _sse2 );
        INIT5( satd_x3, _sse2 );
        INIT5( satd_x4, _sse2 );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
    }

    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        INIT7( satd, _ssse3 );
        INIT7( satd_x3, _ssse3 );
        INIT7( satd_x4, _ssse3 );
        INIT_ADS( _ssse3 );
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
        if( cpu&X264_CPU_PHADD_IS_FAST )
        {
            INIT5( satd, _ssse3_phadd );
            INIT5( satd_x3, _ssse3_phadd );
            INIT5( satd_x4, _ssse3_phadd );
        }
    }
#endif //HAVE_MMX

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif
#ifdef ARCH_UltraSparc
    INIT4( sad, _vis );
    INIT4( sad_x3, _vis );
    INIT4( sad_x4, _vis );
#endif

    /* remaining ads slots alias the variant with the matching DC-term count */
    pixf->ads[PIXEL_8x16] =
    pixf->ads[PIXEL_8x4] =
    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
}
706