]> git.sesse.net Git - x264/blob - common/csp.c
tweak x264_pixel_sad_x4_16x16_sse2 horizontal sum. 168 -> 166 cycles on core2.
[x264] / common / csp.c
1 /*****************************************************************************
2  * csp.c: h264 encoder library
3  *****************************************************************************
4  * Copyright (C) 2004 Laurent Aimar
5  * $Id: csp.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
6  *
7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include "common.h"
25
26 static inline void plane_copy_vflip( x264_mc_functions_t *mc,
27                                      uint8_t *dst, int i_dst,
28                                      uint8_t *src, int i_src, int w, int h)
29 {
30     mc->plane_copy( dst, i_dst, src + (h -1)*i_src, -i_src, w, h );
31 }
32
/* Halve a plane vertically.
 * Produces h output rows of w bytes each.  Output byte x of a row is
 * ( src[x] + src[x + i_src] + 1 ) >> 1, i.e. the average of two vertically
 * adjacent source bytes, rounded half up.  After every output row dst moves
 * on by i_dst and src by two source rows (2 * i_src).  i_src may be
 * negative, in which case the source is consumed backwards. */
static inline void plane_subsamplev2( uint8_t *dst, int i_dst,
                                      uint8_t *src, int i_src, int w, int h)
{
    int x;

    while( h > 0 )
    {
        for( x = 0; x < w; x++ )
            dst[x] = ( src[x] + src[x + i_src] + 1 ) >> 1;

        dst += i_dst;
        src += 2 * i_src;
        h--;
    }
}
50
/* Bottom-up variant of plane_subsamplev2.
 * The source is entered at row 2*h-1 and the stride is negated, so the
 * row pairs are consumed from the end of the source towards its start
 * while dst is still filled from its first row onwards. */
static inline void plane_subsamplev2_vlip( uint8_t *dst, int i_dst,
                                           uint8_t *src, int i_src, int w, int h)
{
    uint8_t *bottom_row = src + (2*h - 1) * i_src;

    plane_subsamplev2( dst, i_dst, bottom_row, -i_src, w, h );
}
56
/* Halve a plane both horizontally and vertically.
 * Produces h output rows of w bytes each.  Output byte x of a row is the
 * average of the 2x2 source block whose top-left byte is src[2*x]; the
 * second row of the block is reached through i_src.  After every output
 * row dst moves on by i_dst and src by two source rows (2 * i_src).
 * i_src may be negative, in which case the source is consumed backwards.
 *
 * The average is rounded to nearest (half up): the bias added before the
 * shift by 2 is 2.  A bias of 1 would round e.g. {1,1,0,0} down to 0 and
 * would be inconsistent with the half-up rounding of plane_subsamplev2. */
static inline void plane_subsamplehv2( uint8_t *dst, int i_dst,
                                       uint8_t *src, int i_src, int w, int h)
{
    int x;

    for( ; h > 0; h-- )
    {
        for( x = 0; x < w; x++ )
        {
            const uint8_t *blk = src + 2 * x;

            dst[x] = ( blk[0] + blk[1] + blk[i_src] + blk[i_src+1] + 2 ) >> 2;
        }
        dst += i_dst;
        src += 2 * i_src;
    }
}
74
/* Bottom-up variant of plane_subsamplehv2.
 * The source is entered at row 2*h-1 and the stride is negated, so the
 * row pairs are consumed from the end of the source towards its start
 * while dst is still filled from its first row onwards. */
static inline void plane_subsamplehv2_vlip( uint8_t *dst, int i_dst,
                                            uint8_t *src, int i_src, int w, int h)
{
    uint8_t *bottom_row = src + (2*h - 1) * i_src;

    plane_subsamplehv2( dst, i_dst, bottom_row, -i_src, w, h );
}
80
81 static void i420_to_i420( x264_mc_functions_t *mc,
82                           x264_frame_t *frm, x264_image_t *img,
83                           int i_width, int i_height )
84 {
85     if( img->i_csp & X264_CSP_VFLIP )
86     {
87         plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
88                           img->plane[0], img->i_stride[0],
89                           i_width, i_height );
90         plane_copy_vflip( mc, frm->plane[1], frm->i_stride[1],
91                           img->plane[1], img->i_stride[1],
92                           i_width / 2, i_height / 2 );
93         plane_copy_vflip( mc, frm->plane[2], frm->i_stride[2],
94                           img->plane[2], img->i_stride[2],
95                           i_width / 2, i_height / 2 );
96     }
97     else
98     {
99         mc->plane_copy( frm->plane[0], frm->i_stride[0],
100                         img->plane[0], img->i_stride[0],
101                         i_width, i_height );
102         mc->plane_copy( frm->plane[1], frm->i_stride[1],
103                         img->plane[1], img->i_stride[1],
104                         i_width / 2, i_height / 2 );
105         mc->plane_copy( frm->plane[2], frm->i_stride[2],
106                         img->plane[2], img->i_stride[2],
107                         i_width / 2, i_height / 2 );
108     }
109 }
110
111 static void yv12_to_i420( x264_mc_functions_t *mc,
112                           x264_frame_t *frm, x264_image_t *img,
113                           int i_width, int i_height )
114 {
115     if( img->i_csp & X264_CSP_VFLIP )
116     {
117         plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
118                           img->plane[0], img->i_stride[0],
119                           i_width, i_height );
120         plane_copy_vflip( mc, frm->plane[2], frm->i_stride[2],
121                           img->plane[1], img->i_stride[1],
122                           i_width / 2, i_height / 2 );
123         plane_copy_vflip( mc, frm->plane[1], frm->i_stride[1],
124                           img->plane[2], img->i_stride[2],
125                           i_width / 2, i_height / 2 );
126     }
127     else
128     {
129         mc->plane_copy( frm->plane[0], frm->i_stride[0],
130                         img->plane[0], img->i_stride[0],
131                         i_width, i_height );
132         mc->plane_copy( frm->plane[2], frm->i_stride[2],
133                         img->plane[1], img->i_stride[1],
134                         i_width / 2, i_height / 2 );
135         mc->plane_copy( frm->plane[1], frm->i_stride[1],
136                         img->plane[2], img->i_stride[2],
137                         i_width / 2, i_height / 2 );
138     }
139 }
140
141 static void i422_to_i420( x264_mc_functions_t *mc,
142                           x264_frame_t *frm, x264_image_t *img,
143                           int i_width, int i_height )
144 {
145     if( img->i_csp & X264_CSP_VFLIP )
146     {
147         plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
148                           img->plane[0], img->i_stride[0],
149                           i_width, i_height );
150
151         plane_subsamplev2_vlip( frm->plane[1], frm->i_stride[1],
152                                 img->plane[1], img->i_stride[1],
153                                 i_width / 2, i_height / 2 );
154         plane_subsamplev2_vlip( frm->plane[2], frm->i_stride[2],
155                                 img->plane[2], img->i_stride[2],
156                                 i_width / 2, i_height / 2 );
157     }
158     else
159     {
160         mc->plane_copy( frm->plane[0], frm->i_stride[0],
161                         img->plane[0], img->i_stride[0],
162                         i_width, i_height );
163
164         plane_subsamplev2( frm->plane[1], frm->i_stride[1],
165                            img->plane[1], img->i_stride[1],
166                            i_width / 2, i_height / 2 );
167         plane_subsamplev2( frm->plane[2], frm->i_stride[2],
168                            img->plane[2], img->i_stride[2],
169                            i_width / 2, i_height / 2 );
170     }
171 }
172
173 static void i444_to_i420( x264_mc_functions_t *mc,
174                           x264_frame_t *frm, x264_image_t *img,
175                           int i_width, int i_height )
176 {
177     if( img->i_csp & X264_CSP_VFLIP )
178     {
179         plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
180                           img->plane[0], img->i_stride[0],
181                           i_width, i_height );
182
183         plane_subsamplehv2_vlip( frm->plane[1], frm->i_stride[1],
184                                  img->plane[1], img->i_stride[1],
185                                  i_width / 2, i_height / 2 );
186         plane_subsamplehv2_vlip( frm->plane[2], frm->i_stride[2],
187                                  img->plane[2], img->i_stride[2],
188                                  i_width / 2, i_height / 2 );
189     }
190     else
191     {
192         mc->plane_copy( frm->plane[0], frm->i_stride[0],
193                         img->plane[0], img->i_stride[0],
194                         i_width, i_height );
195
196         plane_subsamplehv2( frm->plane[1], frm->i_stride[1],
197                             img->plane[1], img->i_stride[1],
198                             i_width / 2, i_height / 2 );
199         plane_subsamplehv2( frm->plane[2], frm->i_stride[2],
200                             img->plane[2], img->i_stride[2],
201                             i_width / 2, i_height / 2 );
202     }
203 }
204 static void yuyv_to_i420( x264_mc_functions_t *mc,
205                           x264_frame_t *frm, x264_image_t *img,
206                           int i_width, int i_height )
207 {
208     uint8_t *src = img->plane[0];
209     int     i_src= img->i_stride[0];
210
211     uint8_t *y   = frm->plane[0];
212     uint8_t *u   = frm->plane[1];
213     uint8_t *v   = frm->plane[2];
214
215     if( img->i_csp & X264_CSP_VFLIP )
216     {
217         src += ( i_height - 1 ) * i_src;
218         i_src = -i_src;
219     }
220
221     for( ; i_height > 0; i_height -= 2 )
222     {
223         uint8_t *ss = src;
224         uint8_t *yy = y;
225         uint8_t *uu = u;
226         uint8_t *vv = v;
227         int w;
228
229         for( w = i_width; w > 0; w -= 2 )
230         {
231             *yy++ = ss[0];
232             *yy++ = ss[2];
233
234             *uu++ = ( ss[1] + ss[1+i_src] + 1 ) >> 1;
235             *vv++ = ( ss[3] + ss[3+i_src] + 1 ) >> 1;
236
237             ss += 4;
238         }
239         src += i_src;
240         y += frm->i_stride[0];
241         u += frm->i_stride[1];
242         v += frm->i_stride[2];
243
244         ss = src;
245         yy = y;
246         for( w = i_width; w > 0; w -= 2 )
247         {
248             *yy++ = ss[0];
249             *yy++ = ss[2];
250             ss += 4;
251         }
252         src += i_src;
253         y += frm->i_stride[0];
254     }
255 }
256
/* Fixed-point RGB -> YUV coefficients, same values as in XviD.
 * FIX() scales a real coefficient by 2^BITS and rounds it to the nearest
 * integer; the *_ADD constants are the offsets added after the weighted sum
 * has been shifted back down (see RGB_TO_I420). */
#define BITS 8
#define FIX(f) ((int)((f) * (1 << BITS) + 0.5))

/* luma */
#define Y_ADD 16
#define Y_R   FIX(0.257)
#define Y_G   FIX(0.504)
#define Y_B   FIX(0.098)

/* first chroma plane (U) */
#define U_ADD 128
#define U_R   FIX(0.148)
#define U_G   FIX(0.291)
#define U_B   FIX(0.439)

/* second chroma plane (V) */
#define V_ADD 128
#define V_R   FIX(0.439)
#define V_G   FIX(0.368)
#define V_B   FIX(0.071)
/* RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB )
 * Expands to a static converter `name` from packed RGB (img->plane[0]) to
 * the three planes of an I420 frame.  POS_R/POS_G/POS_B are the byte
 * offsets of the components inside one pixel, S_RGB the pixel size in bytes.
 * The image is processed in 2x2 pixel blocks: every pixel yields one luma
 * byte, and the component sums of the four pixels yield one U and one V
 * byte (the extra >> 2 divides the sums by 4).
 * With X264_CSP_VFLIP set the source is walked from its last row backwards.
 * The loops step by two rows / two pixels without clamping, so even
 * i_width and i_height are assumed -- TODO confirm against the callers.
 * The chroma sums can be negative before the shift; the result therefore
 * relies on >> of a negative int behaving as an arithmetic shift. */
#define RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) \
static void name( x264_mc_functions_t *mc,              \
                  x264_frame_t *frm, x264_image_t *img, \
                  int i_width, int i_height )           \
{                                                       \
    uint8_t *p_src    = img->plane[0];                  \
    int      i_stride = img->i_stride[0];               \
    int      i_luma   = frm->i_stride[0];               \
    uint8_t *p_y = frm->plane[0];                       \
    uint8_t *p_u = frm->plane[1];                       \
    uint8_t *p_v = frm->plane[2];                       \
    int rows, cols;                                     \
                                                        \
    if( img->i_csp & X264_CSP_VFLIP )                   \
    {                                                   \
        p_src   += ( i_height - 1 ) * i_stride;         \
        i_stride = -i_stride;                           \
    }                                                   \
                                                        \
    for( rows = i_height; rows > 0; rows -= 2 )         \
    {                                                   \
        uint8_t *in    = p_src;                         \
        uint8_t *out_y = p_y;                           \
        uint8_t *out_u = p_u;                           \
        uint8_t *out_v = p_v;                           \
                                                        \
        for( cols = i_width; cols > 0; cols -= 2 )      \
        {                                               \
            int sum_r = 0, sum_g = 0, sum_b = 0;        \
            int px;                                     \
                                                        \
            /* two columns of the 2x2 block */          \
            for( px = 0; px < 2; px++ )                 \
            {                                           \
                int r, g, b;                            \
                                                        \
                /* upper pixel */                       \
                r = in[POS_R];                          \
                g = in[POS_G];                          \
                b = in[POS_B];                          \
                sum_r += r;                             \
                sum_g += g;                             \
                sum_b += b;                             \
                out_y[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);      \
                                                        \
                /* lower pixel */                       \
                r = in[POS_R+i_stride];                 \
                g = in[POS_G+i_stride];                 \
                b = in[POS_B+i_stride];                 \
                sum_r += r;                             \
                sum_g += g;                             \
                sum_b += b;                             \
                out_y[i_luma] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS); \
                                                        \
                out_y++;                                \
                in += S_RGB;                            \
            }                                           \
                                                        \
            /* one chroma sample pair per 2x2 block */  \
            *out_u++ = (uint8_t)(U_ADD + ((-U_R * sum_r - U_G * sum_g + U_B * sum_b) >> (BITS+2)) ); \
            *out_v++ = (uint8_t)(V_ADD + (( V_R * sum_r - V_G * sum_g - V_B * sum_b) >> (BITS+2)) ); \
        }                                               \
                                                        \
        p_src += 2*i_stride;                            \
        p_y   += 2*frm->i_stride[0];                    \
        p_u   += frm->i_stride[1];                      \
        p_v   += frm->i_stride[2];                      \
    }                                                   \
}
344
345 RGB_TO_I420( rgb_to_i420,  0, 1, 2, 3 );
346 RGB_TO_I420( bgr_to_i420,  2, 1, 0, 3 );
347 RGB_TO_I420( bgra_to_i420, 2, 1, 0, 4 );
348
349 void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf )
350 {
351     switch( i_csp )
352     {
353         case X264_CSP_I420:
354             pf->convert[X264_CSP_I420] = i420_to_i420;
355             pf->convert[X264_CSP_I422] = i422_to_i420;
356             pf->convert[X264_CSP_I444] = i444_to_i420;
357             pf->convert[X264_CSP_YV12] = yv12_to_i420;
358             pf->convert[X264_CSP_YUYV] = yuyv_to_i420;
359             pf->convert[X264_CSP_RGB ] =  rgb_to_i420;
360             pf->convert[X264_CSP_BGR ] =  bgr_to_i420;
361             pf->convert[X264_CSP_BGRA] = bgra_to_i420;
362             break;
363
364         default:
365             /* For now, can't happen */
366             fprintf( stderr, "arg in x264_csp_init\n" );
367             exit( -1 );
368             break;
369     }
370 }
371