git.sesse.net Git - x264/blob - common/pixel.c
support pkg-config.
[x264] / common / pixel.c
1 /*****************************************************************************
2  * pixel.c: h264 encoder
3  *****************************************************************************
4  * Copyright (C) 2003 Laurent Aimar
5  * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
6  *
7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #ifdef HAVE_STDINT_H
25 #include <stdint.h>
26 #else
27 #include <inttypes.h>
28 #endif
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdarg.h>
32
33 #include "x264.h"
34 #include "pixel.h"
35 #include "clip1.h"
36
37 #ifdef HAVE_MMXEXT
38 #   include "i386/pixel.h"
39 #endif
40 #ifdef ARCH_PPC
41 #   include "ppc/pixel.h"
42 #endif
43 #ifdef ARCH_UltraSparc
44 #   include "sparc/pixel.h"
45 #endif
46
47
/****************************************************************************
 * pixel_sad_WxH: sum of absolute differences between two WxH pixel blocks
 ****************************************************************************/
/* Generates a static function `name` that compares an lx-wide, ly-high block
 * at pix1 with the block at pix2.  Each block has its own stride, which is
 * added to the block pointer to move down one row.  The function returns the
 * sum of |pix1 - pix2| over every pixel of the block. */
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_total = 0;                                \
    int i_rows_left;                                \
    for( i_rows_left = ly; i_rows_left > 0; i_rows_left-- ) \
    {                                               \
        int i_col;                                  \
        for( i_col = 0; i_col < lx; i_col++ )       \
        {                                           \
            const int i_delta = pix1[i_col] - pix2[i_col]; \
            i_total += i_delta < 0 ? -i_delta : i_delta;   \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_total;                                 \
}

/* One instance per block size used by the function table. */
PIXEL_SAD_C( pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( pixel_sad_4x4,    4,  4 )
77
78
/****************************************************************************
 * pixel_ssd_WxH: sum of squared differences between two WxH pixel blocks
 ****************************************************************************/
/* Generates a static function `name` that compares an lx-wide, ly-high block
 * at pix1 with the block at pix2.  Each block has its own stride, which is
 * added to the block pointer to move down one row.  The function returns the
 * sum of (pix1 - pix2)^2 over every pixel of the block. */
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_total = 0;                                \
    int i_rows_left;                                \
    for( i_rows_left = ly; i_rows_left > 0; i_rows_left-- ) \
    {                                               \
        int i_col;                                  \
        for( i_col = 0; i_col < lx; i_col++ )       \
        {                                           \
            const int i_delta = pix1[i_col] - pix2[i_col]; \
            i_total += i_delta * i_delta;           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_total;                                 \
}

/* One instance per block size used by the function table. */
PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( pixel_ssd_4x4,    4,  4 )
108
109 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
110 {
111     int64_t i_ssd = 0;
112     int x, y;
113
114 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
115                                           pix2 + y*i_pix2 + x, i_pix2 );
116     for( y = 0; y < i_height-15; y += 16 )
117     {
118         for( x = 0; x < i_width-15; x += 16 )
119             SSD(PIXEL_16x16);
120         if( x < i_width-7 )
121             SSD(PIXEL_8x16);
122     }
123     if( y < i_height-7 )
124         for( x = 0; x < i_width-7; x += 8 )
125             SSD(PIXEL_8x8);
126 #undef SSD
127
128 #define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
129     if( i_width % 8 != 0 )
130     {
131         for( y = 0; y < (i_height & ~7); y++ )
132             for( x = i_width & ~7; x < i_width; x++ )
133                 SSD1;
134     }
135     if( i_height % 8 != 0 )
136     {
137         for( y = i_height & ~7; y < i_height; y++ )
138             for( x = 0; x < i_width; x++ )
139                 SSD1;
140     }
141 #undef SSD1
142
143     return i_ssd;
144 }
145
146
/* Writes the i_size x i_size block of differences pix1 - pix2 into diff.
 *
 * diff          output, i_size*i_size values stored row after row with no
 *               padding between rows
 * i_size        width and height of the block
 * pix1, i_pix1  first block and its row stride
 * pix2, i_pix2  second block and its row stride */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int16_t *p_out = diff;
    int i_row, i_col;

    for( i_row = 0; i_row < i_size; i_row++, pix1 += i_pix1, pix2 += i_pix2 )
        for( i_col = 0; i_col < i_size; i_col++ )
            *p_out++ = pix1[i_col] - pix2[i_col];
}
161
162
/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/
/* Walks the i_width x i_height area in steps of 4 in both directions.  Every
 * step takes the full 4x4 block of differences pix1 - pix2, runs the
 * add/subtract transform along its rows and then along its columns, and
 * accumulates the absolute values of the 16 results.  Half of the accumulated
 * sum is returned. */
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
    int i_sum = 0;
    int bx, by;

    for( by = 0; by < i_height; by += 4, pix1 += 4 * i_pix1, pix2 += 4 * i_pix2 )
    {
        for( bx = 0; bx < i_width; bx += 4 )
        {
            int16_t residual[4][4];
            int16_t rows[4][4];
            int k;

            pixel_sub_wxh( &residual[0][0], 4, pix1 + bx, i_pix1, pix2 + bx, i_pix2 );

            /* First pass: transform each row of the residual. */
            for( k = 0; k < 4; k++ )
            {
                const int sum01 = residual[k][0] + residual[k][1];
                const int sum23 = residual[k][2] + residual[k][3];
                const int dif01 = residual[k][0] - residual[k][1];
                const int dif23 = residual[k][2] - residual[k][3];

                rows[k][0] = sum01 + sum23;
                rows[k][1] = sum01 - sum23;
                rows[k][2] = dif01 - dif23;
                rows[k][3] = dif01 + dif23;
            }

            /* Second pass: transform each column; only the magnitudes of the
             * results are needed, so they are summed directly. */
            for( k = 0; k < 4; k++ )
            {
                const int sum01 = rows[0][k] + rows[1][k];
                const int sum23 = rows[2][k] + rows[3][k];
                const int dif01 = rows[0][k] - rows[1][k];
                const int dif23 = rows[2][k] - rows[3][k];

                i_sum += abs( sum01 + sum23 );
                i_sum += abs( sum01 - sum23 );
                i_sum += abs( dif01 - dif23 );
                i_sum += abs( dif01 + dif23 );
            }
        }
    }

    return i_sum / 2;
}
/* Generates a static function `name` with the common
 * (pix1, stride1, pix2, stride2) signature that forwards to pixel_satd_wxh
 * with a fixed block width and height. */
#define PIXEL_SATD_C( name, width, height ) \
static int name( uint8_t *pix1, int i_stride_pix1, \
                 uint8_t *pix2, int i_stride_pix2 ) \
{ \
    const int i_cost = pixel_satd_wxh( pix1, i_stride_pix1, \
                                       pix2, i_stride_pix2, \
                                       width, height ); \
    return i_cost; \
}

/* One instance per block size used by the function table. */
PIXEL_SATD_C( pixel_satd_16x16, 16, 16 )
PIXEL_SATD_C( pixel_satd_16x8,  16,  8 )
PIXEL_SATD_C( pixel_satd_8x16,   8, 16 )
PIXEL_SATD_C( pixel_satd_8x8,    8,  8 )
PIXEL_SATD_C( pixel_satd_8x4,    8,  4 )
PIXEL_SATD_C( pixel_satd_4x8,    4,  8 )
PIXEL_SATD_C( pixel_satd_4x4,    4,  4 )
225
226
/****************************************************************************
 * pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
 ****************************************************************************/
/* One 8-point add/subtract transform.  The user supplies two macros before
 * expanding this one: SRC(n) yields input n and DST(n, value) consumes
 * output n.  The expansion is a brace-enclosed block, so no trailing
 * semicolon is needed at the point of use. */
#define SA8D_1D {\
    /* All inputs are read before any DST, so SRC and DST may share storage. */ \
    const int v0 = SRC(0), v1 = SRC(1), v2 = SRC(2), v3 = SRC(3);\
    const int v4 = SRC(4), v5 = SRC(5), v6 = SRC(6), v7 = SRC(7);\
    /* Stage 1: combine elements 4 apart. */ \
    const int p0 = v0 + v4, p1 = v1 + v5, p2 = v2 + v6, p3 = v3 + v7;\
    const int p4 = v0 - v4, p5 = v1 - v5, p6 = v2 - v6, p7 = v3 - v7;\
    /* Stage 2: combine elements 2 apart. */ \
    const int q0 = p0 + p2, q1 = p1 + p3, q2 = p0 - p2, q3 = p1 - p3;\
    const int q4 = p4 + p6, q5 = p5 + p7, q6 = p4 - p6, q7 = p5 - p7;\
    /* Stage 3: combine neighbours and emit. */ \
    DST(0, q0 + q1);\
    DST(1, q0 - q1);\
    DST(2, q2 + q3);\
    DST(3, q2 - q3);\
    DST(4, q4 + q5);\
    DST(5, q4 - q5);\
    DST(6, q6 + q7);\
    DST(7, q6 - q7);\
}
256
257 static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
258                                   int i_width, int i_height )
259 {
260     int16_t diff[8][8];
261     int i_satd = 0;
262     int x, y;
263
264     for( y = 0; y < i_height; y += 8 )
265     {
266         for( x = 0; x < i_width; x += 8 )
267         {
268             int i;
269             pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );
270
271 #define SRC(x)     diff[i][x]
272 #define DST(x,rhs) diff[i][x] = (rhs)
273             for( i = 0; i < 8; i++ )
274                 SA8D_1D
275 #undef SRC
276 #undef DST
277
278 #define SRC(x)     diff[x][i]
279 #define DST(x,rhs) i_satd += abs(rhs)
280             for( i = 0; i < 8; i++ )
281                 SA8D_1D
282 #undef SRC
283 #undef DST
284         }
285         pix1 += 8 * i_pix1;
286         pix2 += 8 * i_pix2;
287     }
288
289     return i_satd;
290 }
291
/* Generates pixel_sa8d_<width>x<height> with the common
 * (pix1, stride1, pix2, stride2) signature.  The raw sum from pixel_sa8d_wxh
 * for the whole block is divided by 4 with rounding ((sum + 2) >> 2). */
#define PIXEL_SA8D_C( width, height ) \
static int pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
                 uint8_t *pix2, int i_stride_pix2 ) \
{ \
    const int i_raw = pixel_sa8d_wxh( pix1, i_stride_pix1, \
                                      pix2, i_stride_pix2, \
                                      width, height ); \
    return ( i_raw + 2 ) >> 2; \
}

/* One instance per block size used by the function table. */
PIXEL_SA8D_C( 16, 16 )
PIXEL_SA8D_C( 16, 8 )
PIXEL_SA8D_C( 8, 16 )
PIXEL_SA8D_C( 8, 8 )
302
303
304 /****************************************************************************
305  * x264_pixel_init:
306  ****************************************************************************/
307 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
308 {
309     pixf->sad[PIXEL_16x16] = pixel_sad_16x16;
310     pixf->sad[PIXEL_16x8]  = pixel_sad_16x8;
311     pixf->sad[PIXEL_8x16]  = pixel_sad_8x16;
312     pixf->sad[PIXEL_8x8]   = pixel_sad_8x8;
313     pixf->sad[PIXEL_8x4]   = pixel_sad_8x4;
314     pixf->sad[PIXEL_4x8]   = pixel_sad_4x8;
315     pixf->sad[PIXEL_4x4]   = pixel_sad_4x4;
316
317     pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16;
318     pixf->ssd[PIXEL_16x8]  = pixel_ssd_16x8;
319     pixf->ssd[PIXEL_8x16]  = pixel_ssd_8x16;
320     pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8;
321     pixf->ssd[PIXEL_8x4]   = pixel_ssd_8x4;
322     pixf->ssd[PIXEL_4x8]   = pixel_ssd_4x8;
323     pixf->ssd[PIXEL_4x4]   = pixel_ssd_4x4;
324
325     pixf->satd[PIXEL_16x16]= pixel_satd_16x16;
326     pixf->satd[PIXEL_16x8] = pixel_satd_16x8;
327     pixf->satd[PIXEL_8x16] = pixel_satd_8x16;
328     pixf->satd[PIXEL_8x8]  = pixel_satd_8x8;
329     pixf->satd[PIXEL_8x4]  = pixel_satd_8x4;
330     pixf->satd[PIXEL_4x8]  = pixel_satd_4x8;
331     pixf->satd[PIXEL_4x4]  = pixel_satd_4x4;
332
333     pixf->sa8d[PIXEL_16x16]= pixel_sa8d_16x16;
334     pixf->sa8d[PIXEL_16x8] = pixel_sa8d_16x8;
335     pixf->sa8d[PIXEL_8x16] = pixel_sa8d_8x16;
336     pixf->sa8d[PIXEL_8x8]  = pixel_sa8d_8x8;
337
338 #ifdef HAVE_MMXEXT
339     if( cpu&X264_CPU_MMXEXT )
340     {
341         pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
342         pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext;
343         pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext;
344         pixf->sad[PIXEL_8x8  ] = x264_pixel_sad_8x8_mmxext;
345         pixf->sad[PIXEL_8x4  ] = x264_pixel_sad_8x4_mmxext;
346         pixf->sad[PIXEL_4x8  ] = x264_pixel_sad_4x8_mmxext;
347         pixf->sad[PIXEL_4x4]   = x264_pixel_sad_4x4_mmxext;
348
349         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext;
350         pixf->ssd[PIXEL_16x8]  = x264_pixel_ssd_16x8_mmxext;
351         pixf->ssd[PIXEL_8x16]  = x264_pixel_ssd_8x16_mmxext;
352         pixf->ssd[PIXEL_8x8]   = x264_pixel_ssd_8x8_mmxext;
353         pixf->ssd[PIXEL_8x4]   = x264_pixel_ssd_8x4_mmxext;
354         pixf->ssd[PIXEL_4x8]   = x264_pixel_ssd_4x8_mmxext;
355         pixf->ssd[PIXEL_4x4]   = x264_pixel_ssd_4x4_mmxext;
356   
357         pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext;
358         pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext;
359         pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext;
360         pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_mmxext;
361         pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_mmxext;
362         pixf->satd[PIXEL_4x8]  = x264_pixel_satd_4x8_mmxext;
363         pixf->satd[PIXEL_4x4]  = x264_pixel_satd_4x4_mmxext;
364     }
365 #endif
366
367 #ifdef HAVE_SSE2
368     // disable on AMD processors since it is slower
369     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
370     {
371         pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_sse2;
372         pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_sse2;
373
374         pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_sse2;
375         pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_sse2;
376         pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2;
377         pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_sse2;
378         pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_sse2;
379     }
380     // these are faster on both Intel and AMD
381     if( cpu&X264_CPU_SSE2 )
382     {
383         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
384         pixf->ssd[PIXEL_16x8]  = x264_pixel_ssd_16x8_sse2;
385     }
386 #endif
387
388 #ifdef ARCH_PPC
389     if( cpu&X264_CPU_ALTIVEC )
390     {
391         x264_pixel_altivec_init( pixf );
392     }
393 #endif
394 #ifdef ARCH_UltraSparc
395       pixf->sad[PIXEL_8x8]   = x264_pixel_sad_8x8_vis;
396       pixf->sad[PIXEL_8x16]  = x264_pixel_sad_8x16_vis;
397       pixf->sad[PIXEL_16x8]  = x264_pixel_sad_16x8_vis;
398       pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis;
399 #endif
400 }
401