1 /*****************************************************************************
2 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
35 #if defined (MODULE_NAME_IS_i420_rgb)
36 # include "i420_rgb_c.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
38 # include "i420_rgb_mmx.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
40 # include "i420_rgb_mmx.h"
/* Forward declaration of the scaling helper used by every converter below.
 * At the call sites in this file the arguments are, in order: render width,
 * render height, output width, output height, the address of the caller's
 * b_hscale flag, the address of its i_vscale value, and the chroma module's
 * p_offset array. The definition itself is not visible in this listing. */
43 static void SetOffset( int, int, int, int, vlc_bool_t *,
44 unsigned int *, int * );
46 #if defined (MODULE_NAME_IS_i420_rgb)
47 /*****************************************************************************
48 * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
49 *****************************************************************************
50 * Horizontal alignment needed:
51 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
52 * - output: 1 pixel (2 bytes), margins allowed
53 * Vertical alignment needed:
54 * - input: 2 lines (2 Y lines, 1 U/V line)
56 *****************************************************************************/
/* Converts the planar Y/U/V data of p_src into 16-bit pixels written through
 * p_dest->p->p_pixels, using four small dither tables.
 * NOTE(review): this listing skips original lines (the embedded numbering has
 * gaps): the end of the signature, the braces, and the declarations of
 * i_right_margin, i_rewind and p_buffer are not visible here. */
57 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
60 /* We got this one from the old arguments */
61 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
62 uint8_t *p_y = p_src->Y_PIXELS;
63 uint8_t *p_u = p_src->U_PIXELS;
64 uint8_t *p_v = p_src->V_PIXELS;
66 vlc_bool_t b_hscale; /* horizontal scaling type */
67 unsigned int i_vscale; /* vertical scaling type */
68 unsigned int i_x, i_y; /* horizontal and vertical indexes */
69 unsigned int i_real_y; /* y % 4 */
73 int i_scale_count; /* scale modulo counter */
74 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
75 uint16_t * p_pic_start; /* beginning of the current line for copy */
76 int i_uval, i_vval; /* U and V samples */
77 int i_red, i_green, i_blue; /* U and V modified samples */
78 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
79 uint16_t * p_ybase; /* Y dependent conversion table */
81 /* Conversion buffer pointer */
82 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
85 /* Offset array pointer */
86 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding: full pitch minus visible pitch, for the first plane
 * and for the second plane (used below for both p_u and p_v). */
89 const int i_source_margin = p_src->p[0].i_pitch
90 - p_src->p[0].i_visible_pitch;
91 const int i_source_margin_c = p_src->p[1].i_pitch
92 - p_src->p[1].i_visible_pitch;
94 /* The dithering matrices */
95 int dither10[4] = { 0x0, 0x8, 0x2, 0xa };
96 int dither11[4] = { 0xc, 0x4, 0xe, 0x6 };
97 int dither12[4] = { 0x3, 0xb, 0x1, 0x9 };
98 int dither13[4] = { 0xf, 0x7, 0xd, 0x5 };
/* Left-shift every table entry by (SHIFT - 4 + output.i_rrshift) bits. */
100 for(i_x = 0; i_x < 4; i_x++)
102 dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
103 dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
104 dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
105 dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
/* Destination line padding: pitch minus visible pitch. */
108 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * the last 8-pixel group has to step back so that it ends on the line end. */
110 if( p_vout->render.i_width & 7 )
112 i_rewind = 8 - ( p_vout->render.i_width & 7 );
119 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
120 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
121 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
122 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
123 p_vout->output.i_width, p_vout->output.i_height,
124 &b_hscale, &i_vscale, p_offset_start );
/* Starts from the output height when i_vscale == 1, otherwise from the
 * render height; presumably consumed by SCALE_HEIGHT - TODO confirm. */
129 i_scale_count = ( i_vscale == 1 ) ?
130 p_vout->output.i_height : p_vout->render.i_height;
/* One pass per source line. */
131 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
133 i_real_y = i_y & 0x3;
/* Convert into the scratch buffer when b_hscale is set, otherwise straight
 * into the destination picture. */
135 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels (width / 8 groups). */
137 for ( i_x = p_vout->render.i_width / 8; i_x--; )
139 int *p_dither = dither10;
140 CONVERT_YUV_PIXEL_DITHER(2);
142 CONVERT_Y_PIXEL_DITHER(2);
144 CONVERT_YUV_PIXEL_DITHER(2);
146 CONVERT_Y_PIXEL_DITHER(2);
148 CONVERT_YUV_PIXEL_DITHER(2);
150 CONVERT_Y_PIXEL_DITHER(2);
152 CONVERT_YUV_PIXEL_DITHER(2);
154 CONVERT_Y_PIXEL_DITHER(2);
157 /* Here we do some unaligned reads and duplicate conversions, but
158 * at least we have all the pixels */
161 int *p_dither = dither10;
/* Chroma pointers step back by half of i_rewind, the output pointer by all
 * of it. NOTE(review): no matching p_y adjustment is visible in this
 * listing - confirm against the full file. */
163 p_u -= i_rewind >> 1;
164 p_v -= i_rewind >> 1;
165 p_buffer -= i_rewind;
166 CONVERT_YUV_PIXEL_DITHER(2);
168 CONVERT_Y_PIXEL_DITHER(2);
170 CONVERT_YUV_PIXEL_DITHER(2);
172 CONVERT_Y_PIXEL_DITHER(2);
174 CONVERT_YUV_PIXEL_DITHER(2);
176 CONVERT_Y_PIXEL_DITHER(2);
178 CONVERT_YUV_PIXEL_DITHER(2);
180 CONVERT_Y_PIXEL_DITHER(2);
183 SCALE_HEIGHT( 420, 2 );
/* Skip the source padding to reach the next line of each plane. */
185 p_y += i_source_margin;
188 p_u += i_source_margin_c;
189 p_v += i_source_margin_c;
195 /*****************************************************************************
196 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
197 *****************************************************************************
198 * Horizontal alignment needed:
199 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
200 * - output: 1 pixel (2 bytes), margins allowed
201 * Vertical alignment needed:
202 * - input: 2 lines (2 Y lines, 1 U/V line)
204 *****************************************************************************/
206 #if defined (MODULE_NAME_IS_i420_rgb)
/* Plain C converter: turns the planar Y/U/V data of p_src into 16-bit pixels
 * written through p_dest->p->p_pixels, 8 pixels per inner-loop iteration.
 * NOTE(review): this listing skips original lines (signature tail, braces,
 * declarations of i_right_margin, i_rewind and p_buffer are not visible). */
208 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
211 /* We got this one from the old arguments */
212 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
213 uint8_t *p_y = p_src->Y_PIXELS;
214 uint8_t *p_u = p_src->U_PIXELS;
215 uint8_t *p_v = p_src->V_PIXELS;
217 vlc_bool_t b_hscale; /* horizontal scaling type */
218 unsigned int i_vscale; /* vertical scaling type */
219 unsigned int i_x, i_y; /* horizontal and vertical indexes */
223 int i_scale_count; /* scale modulo counter */
224 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
225 uint16_t * p_pic_start; /* beginning of the current line for copy */
226 int i_uval, i_vval; /* U and V samples */
227 int i_red, i_green, i_blue; /* U and V modified samples */
228 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
229 uint16_t * p_ybase; /* Y dependent conversion table */
231 /* Conversion buffer pointer */
232 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
235 /* Offset array pointer */
236 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
239 const int i_source_margin = p_src->p[0].i_pitch
240 - p_src->p[0].i_visible_pitch;
241 const int i_source_margin_c = p_src->p[1].i_pitch
242 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
244 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Width not a multiple of 8: remember how far the last group steps back. */
246 if( p_vout->render.i_width & 7 )
248 i_rewind = 8 - ( p_vout->render.i_width & 7 );
255 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
256 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
257 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
258 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
259 p_vout->output.i_width, p_vout->output.i_height,
260 &b_hscale, &i_vscale, p_offset_start );
/* Output height when i_vscale == 1, render height otherwise. */
265 i_scale_count = ( i_vscale == 1 ) ?
266 p_vout->output.i_height : p_vout->render.i_height;
/* One pass per source line. */
267 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Scratch buffer when b_hscale is set, destination picture otherwise. */
270 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels. */
272 for ( i_x = p_vout->render.i_width / 8; i_x--; )
274 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
275 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
276 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
277 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
280 /* Here we do some unaligned reads and duplicate conversions, but
281 * at least we have all the pixels */
/* NOTE(review): the guard around this tail and any p_y adjustment are not
 * visible in this listing. */
285 p_u -= i_rewind >> 1;
286 p_v -= i_rewind >> 1;
287 p_buffer -= i_rewind;
289 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
290 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
291 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
292 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
295 SCALE_HEIGHT( 420, 2 );
/* Skip the source padding to reach the next line of each plane. */
297 p_y += i_source_margin;
300 p_u += i_source_margin_c;
301 p_v += i_source_margin_c;
306 #else // ! defined (MODULE_NAME_IS_i420_rgb)
/* SIMD converter from planar Y/U/V (p_src) to 16-bit pixels (p_dest); the
 * pack macros used are the SSE2_UNPACK_15_* family (15-bit layout, judging by
 * the function and macro names). Compiled with either an SSE2 body working on
 * groups of 16 pixels or an MMX body working on groups of 8.
 * NOTE(review): this listing skips original lines: the signature tail, braces,
 * several declarations, most SIMD macro invocations and the #endif lines are
 * not visible here. */
308 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
311 /* We got this one from the old arguments */
312 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
313 uint8_t *p_y = p_src->Y_PIXELS;
314 uint8_t *p_u = p_src->U_PIXELS;
315 uint8_t *p_v = p_src->V_PIXELS;
317 vlc_bool_t b_hscale; /* horizontal scaling type */
318 unsigned int i_vscale; /* vertical scaling type */
319 unsigned int i_x, i_y; /* horizontal and vertical indexes */
323 int i_scale_count; /* scale modulo counter */
324 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
325 uint16_t * p_pic_start; /* beginning of the current line for copy */
327 /* Conversion buffer pointer */
328 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
331 /* Offset array pointer */
332 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
335 const int i_source_margin = p_src->p[0].i_pitch
336 - p_src->p[0].i_visible_pitch;
337 const int i_source_margin_c = p_src->p[1].i_pitch
338 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
340 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
342 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
343 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
344 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
345 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
346 p_vout->output.i_width, p_vout->output.i_height,
347 &b_hscale, &i_vscale, p_offset_start );
/* Output height when i_vscale == 1, render height otherwise. */
353 i_scale_count = ( i_vscale == 1 ) ?
354 p_vout->output.i_height : p_vout->render.i_height;
356 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 body: width not a multiple of 16 -> remember how far the last
 * 16-pixel group steps back. */
358 if( p_vout->render.i_width & 15 )
360 i_rewind = 16 - ( p_vout->render.i_width & 15 );
368 ** SSE2 128 bits fetch/store instructions are faster
369 ** if memory access is 16 bytes aligned
372 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Alignment test on the low 4 bits; only the Y pitch operand of the OR-ed
 * expression is visible in this listing. */
373 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
378 /* use faster SSE2 aligned fetch and store */
379 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels. */
383 for ( i_x = p_vout->render.i_width/16; i_x--; )
389 SSE2_UNPACK_15_ALIGNED
396 /* Here we do some unaligned reads and duplicate conversions, but
397 * at least we have all the pixels */
/* The tail group uses the unaligned macro variants after stepping back. */
401 p_u -= i_rewind >> 1;
402 p_v -= i_rewind >> 1;
403 p_buffer -= i_rewind;
406 SSE2_INIT_16_UNALIGNED
409 SSE2_UNPACK_15_UNALIGNED
416 SCALE_HEIGHT( 420, 2 );
/* Skip the source padding, then re-select the output target for the next
 * line. */
418 p_y += i_source_margin;
421 p_u += i_source_margin_c;
422 p_v += i_source_margin_c;
424 p_buffer = b_hscale ? p_buffer_start : p_pic;
429 /* use slower SSE2 unaligned fetch and store */
430 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
433 p_buffer = b_hscale ? p_buffer_start : p_pic;
435 for ( i_x = p_vout->render.i_width/16; i_x--; )
438 SSE2_INIT_16_UNALIGNED
441 SSE2_UNPACK_15_UNALIGNED
448 /* Here we do some unaligned reads and duplicate conversions, but
449 * at least we have all the pixels */
453 p_u -= i_rewind >> 1;
454 p_v -= i_rewind >> 1;
455 p_buffer -= i_rewind;
458 SSE2_INIT_16_UNALIGNED
461 SSE2_UNPACK_15_UNALIGNED
468 SCALE_HEIGHT( 420, 2 );
470 p_y += i_source_margin;
473 p_u += i_source_margin_c;
474 p_v += i_source_margin_c;
476 p_buffer = b_hscale ? p_buffer_start : p_pic;
480 /* make sure all SSE2 stores are visible thereafter */
483 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX body: same structure with groups of 8 pixels. The conversion
 * statements inside the loops are not visible in this listing. */
485 if( p_vout->render.i_width & 7 )
487 i_rewind = 8 - ( p_vout->render.i_width & 7 );
494 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
497 p_buffer = b_hscale ? p_buffer_start : p_pic;
499 for ( i_x = p_vout->render.i_width / 8; i_x--; )
513 /* Here we do some unaligned reads and duplicate conversions, but
514 * at least we have all the pixels */
518 p_u -= i_rewind >> 1;
519 p_v -= i_rewind >> 1;
520 p_buffer -= i_rewind;
534 SCALE_HEIGHT( 420, 2 );
536 p_y += i_source_margin;
539 p_u += i_source_margin_c;
540 p_v += i_source_margin_c;
543 /* re-enable FPU registers */
/* SIMD converter from planar Y/U/V (p_src) to 16-bit pixels (p_dest); the
 * pack macros used are the SSE2_UNPACK_16_* family (5-6-5 layout, judging by
 * the function name). Compiled with either an SSE2 body working on groups of
 * 16 pixels or an MMX body working on groups of 8.
 * NOTE(review): this listing skips original lines: the signature tail, braces,
 * several declarations, most SIMD macro invocations and the #endif lines are
 * not visible here. */
549 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
552 /* We got this one from the old arguments */
553 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
554 uint8_t *p_y = p_src->Y_PIXELS;
555 uint8_t *p_u = p_src->U_PIXELS;
556 uint8_t *p_v = p_src->V_PIXELS;
558 vlc_bool_t b_hscale; /* horizontal scaling type */
559 unsigned int i_vscale; /* vertical scaling type */
560 unsigned int i_x, i_y; /* horizontal and vertical indexes */
564 int i_scale_count; /* scale modulo counter */
565 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
566 uint16_t * p_pic_start; /* beginning of the current line for copy */
568 /* Conversion buffer pointer */
569 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
572 /* Offset array pointer */
573 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
576 const int i_source_margin = p_src->p[0].i_pitch
577 - p_src->p[0].i_visible_pitch;
578 const int i_source_margin_c = p_src->p[1].i_pitch
579 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
581 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
583 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
584 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
585 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
586 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
587 p_vout->output.i_width, p_vout->output.i_height,
588 &b_hscale, &i_vscale, p_offset_start );
/* Output height when i_vscale == 1, render height otherwise. */
594 i_scale_count = ( i_vscale == 1 ) ?
595 p_vout->output.i_height : p_vout->render.i_height;
597 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 body: width not a multiple of 16 -> remember how far the last
 * 16-pixel group steps back. */
599 if( p_vout->render.i_width & 15 )
601 i_rewind = 16 - ( p_vout->render.i_width & 15 );
609 ** SSE2 128 bits fetch/store instructions are faster
610 ** if memory access is 16 bytes aligned
613 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Alignment test on the low 4 bits; only the Y pitch operand of the OR-ed
 * expression is visible in this listing. */
614 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
619 /* use faster SSE2 aligned fetch and store */
620 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels. */
624 for ( i_x = p_vout->render.i_width/16; i_x--; )
630 SSE2_UNPACK_16_ALIGNED
637 /* Here we do some unaligned reads and duplicate conversions, but
638 * at least we have all the pixels */
/* The tail group uses the unaligned macro variants after stepping back. */
642 p_u -= i_rewind >> 1;
643 p_v -= i_rewind >> 1;
644 p_buffer -= i_rewind;
647 SSE2_INIT_16_UNALIGNED
650 SSE2_UNPACK_16_UNALIGNED
657 SCALE_HEIGHT( 420, 2 );
/* Skip the source padding, then re-select the output target for the next
 * line. */
659 p_y += i_source_margin;
662 p_u += i_source_margin_c;
663 p_v += i_source_margin_c;
665 p_buffer = b_hscale ? p_buffer_start : p_pic;
670 /* use slower SSE2 unaligned fetch and store */
671 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
674 p_buffer = b_hscale ? p_buffer_start : p_pic;
676 for ( i_x = p_vout->render.i_width/16; i_x--; )
679 SSE2_INIT_16_UNALIGNED
682 SSE2_UNPACK_16_UNALIGNED
689 /* Here we do some unaligned reads and duplicate conversions, but
690 * at least we have all the pixels */
694 p_u -= i_rewind >> 1;
695 p_v -= i_rewind >> 1;
696 p_buffer -= i_rewind;
699 SSE2_INIT_16_UNALIGNED
702 SSE2_UNPACK_16_UNALIGNED
709 SCALE_HEIGHT( 420, 2 );
711 p_y += i_source_margin;
714 p_u += i_source_margin_c;
715 p_v += i_source_margin_c;
717 p_buffer = b_hscale ? p_buffer_start : p_pic;
721 /* make sure all SSE2 stores are visible thereafter */
724 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX body: same structure with groups of 8 pixels. The conversion
 * statements inside the loops are not visible in this listing. */
726 if( p_vout->render.i_width & 7 )
728 i_rewind = 8 - ( p_vout->render.i_width & 7 );
735 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
738 p_buffer = b_hscale ? p_buffer_start : p_pic;
740 for ( i_x = p_vout->render.i_width / 8; i_x--; )
754 /* Here we do some unaligned reads and duplicate conversions, but
755 * at least we have all the pixels */
759 p_u -= i_rewind >> 1;
760 p_v -= i_rewind >> 1;
761 p_buffer -= i_rewind;
775 SCALE_HEIGHT( 420, 2 );
777 p_y += i_source_margin;
780 p_u += i_source_margin_c;
781 p_v += i_source_margin_c;
784 /* re-enable FPU registers */
792 /*****************************************************************************
793 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
794 *****************************************************************************
795 * Horizontal alignment needed:
796 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
797 * - output: 1 pixel (4 bytes), margins allowed
798 * Vertical alignment needed:
799 * - input: 2 lines (2 Y lines, 1 U/V line)
801 *****************************************************************************/
803 #if defined (MODULE_NAME_IS_i420_rgb)
/* Plain C converter: turns the planar Y/U/V data of p_src into 32-bit pixels
 * written through p_dest->p->p_pixels, 8 pixels per inner-loop iteration.
 * NOTE(review): this listing skips original lines (signature tail, braces,
 * declarations of i_right_margin, i_rewind and p_buffer are not visible). */
805 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
808 /* We got this one from the old arguments */
809 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
810 uint8_t *p_y = p_src->Y_PIXELS;
811 uint8_t *p_u = p_src->U_PIXELS;
812 uint8_t *p_v = p_src->V_PIXELS;
814 vlc_bool_t b_hscale; /* horizontal scaling type */
815 unsigned int i_vscale; /* vertical scaling type */
816 unsigned int i_x, i_y; /* horizontal and vertical indexes */
820 int i_scale_count; /* scale modulo counter */
821 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
822 uint32_t * p_pic_start; /* beginning of the current line for copy */
823 int i_uval, i_vval; /* U and V samples */
824 int i_red, i_green, i_blue; /* U and V modified samples */
825 uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32;
826 uint32_t * p_ybase; /* Y dependent conversion table */
828 /* Conversion buffer pointer */
829 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
832 /* Offset array pointer */
833 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
836 const int i_source_margin = p_src->p[0].i_pitch
837 - p_src->p[0].i_visible_pitch;
838 const int i_source_margin_c = p_src->p[1].i_pitch
839 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
841 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Width not a multiple of 8: remember how far the last group steps back. */
843 if( p_vout->render.i_width & 7 )
845 i_rewind = 8 - ( p_vout->render.i_width & 7 );
852 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
853 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
854 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
855 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
856 p_vout->output.i_width, p_vout->output.i_height,
857 &b_hscale, &i_vscale, p_offset_start );
/* Output height when i_vscale == 1, render height otherwise. */
862 i_scale_count = ( i_vscale == 1 ) ?
863 p_vout->output.i_height : p_vout->render.i_height;
/* One pass per source line. */
864 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Scratch buffer when b_hscale is set, destination picture otherwise. */
867 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels. */
869 for ( i_x = p_vout->render.i_width / 8; i_x--; )
871 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
872 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
873 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
874 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
877 /* Here we do some unaligned reads and duplicate conversions, but
878 * at least we have all the pixels */
/* NOTE(review): the guard around this tail and any p_y adjustment are not
 * visible in this listing. */
882 p_u -= i_rewind >> 1;
883 p_v -= i_rewind >> 1;
884 p_buffer -= i_rewind;
885 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
886 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
887 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
888 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
891 SCALE_HEIGHT( 420, 4 );
/* Skip the source padding to reach the next line of each plane. */
893 p_y += i_source_margin;
896 p_u += i_source_margin_c;
897 p_v += i_source_margin_c;
902 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SIMD converter from planar Y/U/V (p_src) to 32-bit pixels (p_dest); the
 * SSE2 body packs with the SSE2_UNPACK_32_ARGB_* macros. Compiled with
 * either an SSE2 body working on groups of 16 pixels or an MMX body working
 * on groups of 8.
 * NOTE(review): this listing skips original lines: the signature tail, braces,
 * several declarations (including p_buffer), most SIMD macro invocations and
 * the #endif lines are not visible here. */
904 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
907 /* We got this one from the old arguments */
908 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
909 uint8_t *p_y = p_src->Y_PIXELS;
910 uint8_t *p_u = p_src->U_PIXELS;
911 uint8_t *p_v = p_src->V_PIXELS;
913 vlc_bool_t b_hscale; /* horizontal scaling type */
914 unsigned int i_vscale; /* vertical scaling type */
915 unsigned int i_x, i_y; /* horizontal and vertical indexes */
919 int i_scale_count; /* scale modulo counter */
920 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
921 uint32_t * p_pic_start; /* beginning of the current line for copy */
922 /* Conversion buffer pointer */
923 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
926 /* Offset array pointer */
927 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
930 const int i_source_margin = p_src->p[0].i_pitch
931 - p_src->p[0].i_visible_pitch;
932 const int i_source_margin_c = p_src->p[1].i_pitch
933 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
935 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
937 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
938 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
939 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
940 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
941 p_vout->output.i_width, p_vout->output.i_height,
942 &b_hscale, &i_vscale, p_offset_start );
/* Output height when i_vscale == 1, render height otherwise. */
947 i_scale_count = ( i_vscale == 1 ) ?
948 p_vout->output.i_height : p_vout->render.i_height;
950 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 body: width not a multiple of 16 -> remember how far the last
 * 16-pixel group steps back. */
952 if( p_vout->render.i_width & 15 )
954 i_rewind = 16 - ( p_vout->render.i_width & 15 );
962 ** SSE2 128 bits fetch/store instructions are faster
963 ** if memory access is 16 bytes aligned
966 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Alignment test on the low 4 bits; only the Y pitch operand of the OR-ed
 * expression is visible in this listing. */
967 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
972 /* use faster SSE2 aligned fetch and store */
973 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels. */
977 for ( i_x = p_vout->render.i_width / 16; i_x--; )
983 SSE2_UNPACK_32_ARGB_ALIGNED
991 /* Here we do some unaligned reads and duplicate conversions, but
992 * at least we have all the pixels */
/* The tail group uses the unaligned macro variants after stepping back. */
996 p_u -= i_rewind >> 1;
997 p_v -= i_rewind >> 1;
998 p_buffer -= i_rewind;
1000 SSE2_INIT_32_UNALIGNED
1003 SSE2_UNPACK_32_ARGB_UNALIGNED
1010 SCALE_HEIGHT( 420, 4 );
/* Skip the source padding, then re-select the output target for the next
 * line. */
1012 p_y += i_source_margin;
1015 p_u += i_source_margin_c;
1016 p_v += i_source_margin_c;
1018 p_buffer = b_hscale ? p_buffer_start : p_pic;
1023 /* use slower SSE2 unaligned fetch and store */
1024 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Remember where this destination line starts. */
1026 p_pic_start = p_pic;
1027 p_buffer = b_hscale ? p_buffer_start : p_pic;
1029 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1032 SSE2_INIT_32_UNALIGNED
1035 SSE2_UNPACK_32_ARGB_UNALIGNED
1043 /* Here we do some unaligned reads and duplicate conversions, but
1044 * at least we have all the pixels */
1048 p_u -= i_rewind >> 1;
1049 p_v -= i_rewind >> 1;
1050 p_buffer -= i_rewind;
1052 SSE2_INIT_32_UNALIGNED
1055 SSE2_UNPACK_32_ARGB_UNALIGNED
1062 SCALE_HEIGHT( 420, 4 );
1064 p_y += i_source_margin;
1067 p_u += i_source_margin_c;
1068 p_v += i_source_margin_c;
1070 p_buffer = b_hscale ? p_buffer_start : p_pic;
1074 /* make sure all SSE2 stores are visible thereafter */
1077 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX body: same structure with groups of 8 pixels. The conversion
 * statements inside the loops are not visible in this listing. */
1079 if( p_vout->render.i_width & 7 )
1081 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1088 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1090 p_pic_start = p_pic;
1091 p_buffer = b_hscale ? p_buffer_start : p_pic;
1093 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1107 /* Here we do some unaligned reads and duplicate conversions, but
1108 * at least we have all the pixels */
1112 p_u -= i_rewind >> 1;
1113 p_v -= i_rewind >> 1;
1114 p_buffer -= i_rewind;
1127 SCALE_HEIGHT( 420, 4 );
1129 p_y += i_source_margin;
1132 p_u += i_source_margin_c;
1133 p_v += i_source_margin_c;
1137 /* re-enable FPU registers */
/* SIMD converter from planar Y/U/V (p_src) to 32-bit pixels (p_dest); the
 * SSE2 body packs with the SSE2_UNPACK_32_RGBA_* macros. Compiled with
 * either an SSE2 body working on groups of 16 pixels or an MMX body working
 * on groups of 8.
 * NOTE(review): this listing skips original lines: the signature tail, braces,
 * some declarations, several SIMD macro invocations and the #endif lines are
 * not visible here. */
1143 void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
1146 /* We got this one from the old arguments */
1147 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1148 uint8_t *p_y = p_src->Y_PIXELS;
1149 uint8_t *p_u = p_src->U_PIXELS;
1150 uint8_t *p_v = p_src->V_PIXELS;
1152 vlc_bool_t b_hscale; /* horizontal scaling type */
1153 unsigned int i_vscale; /* vertical scaling type */
1154 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1158 int i_scale_count; /* scale modulo counter */
1159 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1160 uint32_t * p_pic_start; /* beginning of the current line for copy */
1161 /* Conversion buffer pointer */
1162 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1163 uint32_t * p_buffer;
1165 /* Offset array pointer */
1166 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
1169 const int i_source_margin = p_src->p[0].i_pitch
1170 - p_src->p[0].i_visible_pitch;
1171 const int i_source_margin_c = p_src->p[1].i_pitch
1172 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
1174 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1176 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1177 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1178 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1179 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1180 p_vout->output.i_width, p_vout->output.i_height,
1181 &b_hscale, &i_vscale, p_offset_start );
1184 * Perform conversion
1186 i_scale_count = ( i_vscale == 1 ) ?
1187 p_vout->output.i_height : p_vout->render.i_height;
1189 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 body: width not a multiple of 16 -> remember how far the last
 * 16-pixel group steps back. */
1191 if( p_vout->render.i_width & 15 )
1193 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1201 ** SSE2 128 bits fetch/store instructions are faster
1202 ** if memory access is 16 bytes aligned
1205 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Alignment test on the low 4 bits; only the Y pitch operand of the OR-ed
 * expression is visible in this listing. */
1206 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1211 /* use faster SSE2 aligned fetch and store */
1212 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Remember where this destination line starts. */
1214 p_pic_start = p_pic;
/* One iteration per group of 16 pixels. */
1216 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1219 SSE2_INIT_32_ALIGNED
1222 SSE2_UNPACK_32_RGBA_ALIGNED
1230 /* Here we do some unaligned reads and duplicate conversions, but
1231 * at least we have all the pixels */
/* The tail group uses the unaligned macro variants after stepping back. */
1235 p_u -= i_rewind >> 1;
1236 p_v -= i_rewind >> 1;
1237 p_buffer -= i_rewind;
1239 SSE2_INIT_32_UNALIGNED
1242 SSE2_UNPACK_32_RGBA_UNALIGNED
1249 SCALE_HEIGHT( 420, 4 );
/* Skip the source padding, then re-select the output target for the next
 * line. */
1251 p_y += i_source_margin;
1254 p_u += i_source_margin_c;
1255 p_v += i_source_margin_c;
1257 p_buffer = b_hscale ? p_buffer_start : p_pic;
1262 /* use slower SSE2 unaligned fetch and store */
1263 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1265 p_pic_start = p_pic;
1266 p_buffer = b_hscale ? p_buffer_start : p_pic;
1268 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1271 SSE2_INIT_32_UNALIGNED
1274 SSE2_UNPACK_32_RGBA_UNALIGNED
1282 /* Here we do some unaligned reads and duplicate conversions, but
1283 * at least we have all the pixels */
1287 p_u -= i_rewind >> 1;
1288 p_v -= i_rewind >> 1;
1289 p_buffer -= i_rewind;
1291 SSE2_INIT_32_UNALIGNED
1294 SSE2_UNPACK_32_RGBA_UNALIGNED
1301 SCALE_HEIGHT( 420, 4 );
1303 p_y += i_source_margin;
1306 p_u += i_source_margin_c;
1307 p_v += i_source_margin_c;
1309 p_buffer = b_hscale ? p_buffer_start : p_pic;
1313 /* make sure all SSE2 stores are visible thereafter */
1316 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX body: same structure with groups of 8 pixels. The conversion
 * statements inside the loops are not visible in this listing. */
1318 if( p_vout->render.i_width & 7 )
1320 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1327 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1329 p_pic_start = p_pic;
1330 p_buffer = b_hscale ? p_buffer_start : p_pic;
1332 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1346 /* Here we do some unaligned reads and duplicate conversions, but
1347 * at least we have all the pixels */
1351 p_u -= i_rewind >> 1;
1352 p_v -= i_rewind >> 1;
1353 p_buffer -= i_rewind;
1366 SCALE_HEIGHT( 420, 4 );
1368 p_y += i_source_margin;
1371 p_u += i_source_margin_c;
1372 p_v += i_source_margin_c;
1376 /* re-enable FPU registers */
/* SIMD converter from planar Y/U/V (p_src) to 32-bit pixels (p_dest); the
 * SSE2 body packs with the SSE2_UNPACK_32_BGRA_* macros. A second body
 * working on groups of 8 pixels follows the SSE2 one.
 * NOTE(review): this listing skips original lines: the signature tail, braces,
 * some declarations, several SIMD macro invocations and the preprocessor
 * lines that end the SSE2 body are not visible here. */
1382 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1385 /* We got this one from the old arguments */
1386 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1387 uint8_t *p_y = p_src->Y_PIXELS;
1388 uint8_t *p_u = p_src->U_PIXELS;
1389 uint8_t *p_v = p_src->V_PIXELS;
1391 vlc_bool_t b_hscale; /* horizontal scaling type */
1392 unsigned int i_vscale; /* vertical scaling type */
1393 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1397 int i_scale_count; /* scale modulo counter */
1398 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1399 uint32_t * p_pic_start; /* beginning of the current line for copy */
1400 /* Conversion buffer pointer */
1401 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1402 uint32_t * p_buffer;
1404 /* Offset array pointer */
1405 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Source line padding (pitch minus visible pitch) for planes 0 and 1. */
1408 const int i_source_margin = p_src->p[0].i_pitch
1409 - p_src->p[0].i_visible_pitch;
1410 const int i_source_margin_c = p_src->p[1].i_pitch
1411 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch. */
1413 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1415 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1416 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1417 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1418 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1419 p_vout->output.i_width, p_vout->output.i_height,
1420 &b_hscale, &i_vscale, p_offset_start );
1423 * Perform conversion
1425 i_scale_count = ( i_vscale == 1 ) ?
1426 p_vout->output.i_height : p_vout->render.i_height;
1428 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 body: width not a multiple of 16 -> remember how far the last
 * 16-pixel group steps back. */
1430 if( p_vout->render.i_width & 15 )
1432 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1440 ** SSE2 128 bits fetch/store instructions are faster
1441 ** if memory access is 16 bytes aligned
1444 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Alignment test on the low 4 bits; only the Y pitch operand of the OR-ed
 * expression is visible in this listing. */
1445 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1450 /* use faster SSE2 aligned fetch and store */
1451 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Remember where this destination line starts. */
1453 p_pic_start = p_pic;
/* One iteration per group of 16 pixels. */
1455 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1458 SSE2_INIT_32_ALIGNED
1461 SSE2_UNPACK_32_BGRA_ALIGNED
1469 /* Here we do some unaligned reads and duplicate conversions, but
1470 * at least we have all the pixels */
/* The tail group uses the unaligned macro variants after stepping back. */
1474 p_u -= i_rewind >> 1;
1475 p_v -= i_rewind >> 1;
1476 p_buffer -= i_rewind;
1478 SSE2_INIT_32_UNALIGNED
1481 SSE2_UNPACK_32_BGRA_UNALIGNED
1488 SCALE_HEIGHT( 420, 4 );
/* Skip the source padding, then re-select the output target for the next
 * line. */
1490 p_y += i_source_margin;
1493 p_u += i_source_margin_c;
1494 p_v += i_source_margin_c;
1496 p_buffer = b_hscale ? p_buffer_start : p_pic;
1501 /* use slower SSE2 unaligned fetch and store */
1502 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1504 p_pic_start = p_pic;
1505 p_buffer = b_hscale ? p_buffer_start : p_pic;
1507 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1510 SSE2_INIT_32_UNALIGNED
1513 SSE2_UNPACK_32_BGRA_UNALIGNED
1521 /* Here we do some unaligned reads and duplicate conversions, but
1522 * at least we have all the pixels */
1526 p_u -= i_rewind >> 1;
1527 p_v -= i_rewind >> 1;
1528 p_buffer -= i_rewind;
1530 SSE2_INIT_32_UNALIGNED
1533 SSE2_UNPACK_32_BGRA_UNALIGNED
1540 SCALE_HEIGHT( 420, 4 );
1542 p_y += i_source_margin;
1545 p_u += i_source_margin_c;
1546 p_v += i_source_margin_c;
1548 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* NOTE(review): presumably the non-SSE2 (MMX) body starts here, as in the
 * sibling converters; the preprocessor line separating the two bodies is not
 * visible in this listing. Groups of 8 pixels; the conversion statements
 * inside the loops are not visible either. */
1554 if( p_vout->render.i_width & 7 )
1556 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1563 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1565 p_pic_start = p_pic;
1566 p_buffer = b_hscale ? p_buffer_start : p_pic;
1568 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1582 /* Here we do some unaligned reads and duplicate conversions, but
1583 * at least we have all the pixels */
1587 p_u -= i_rewind >> 1;
1588 p_v -= i_rewind >> 1;
1589 p_buffer -= i_rewind;
1602 SCALE_HEIGHT( 420, 4 );
1604 p_y += i_source_margin;
1607 p_u += i_source_margin_c;
1608 p_v += i_source_margin_c;
1612 /* re-enable FPU registers */
/*****************************************************************************
 * I420_A8B8G8R8: convert planar YUV (Y, U and V planes of p_src) to 32-bit
 * pixels written through p_dest
 *****************************************************************************
 * The source is read from p_src->Y_PIXELS / U_PIXELS / V_PIXELS and the
 * result is stored as uint32_t pixels starting at p_dest->p->p_pixels.
 * Chroma is handled at half the luma width (i_chroma_width = width / 2).
 * When horizontal scaling is requested by SetOffset() (b_hscale), each line
 * is first converted into the intermediate buffer p_vout->chroma.p_sys->p_buffer
 * instead of directly into the destination picture.
 * NOTE(review): the channel order of the output word is presumably A,B,G,R as
 * suggested by the function name and the SSE2_UNPACK_32_ABGR_* macros, which
 * are defined elsewhere - confirm against those macro definitions.
 *****************************************************************************/
1618 void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
1621 /* We got this one from the old arguments */
1622 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1623 uint8_t *p_y = p_src->Y_PIXELS;
1624 uint8_t *p_u = p_src->U_PIXELS;
1625 uint8_t *p_v = p_src->V_PIXELS;
1627 vlc_bool_t b_hscale; /* horizontal scaling type */
1628 unsigned int i_vscale; /* vertical scaling type */
1629 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1633 int i_scale_count; /* scale modulo counter */
1634 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1635 uint32_t * p_pic_start; /* beginning of the current line for copy */
1636 /* Conversion buffer pointer */
1637 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1638 uint32_t * p_buffer;
1640 /* Offset array pointer */
1641 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Padding at the end of each source line (pitch minus visible pitch).
 * The plane 0 value is later added to p_y, the plane 1 value to both
 * p_u and p_v. */
1644 const int i_source_margin = p_src->p[0].i_pitch
1645 - p_src->p[0].i_visible_pitch;
1646 const int i_source_margin_c = p_src->p[1].i_pitch
1647 - p_src->p[1].i_visible_pitch;
/* Padding at the end of each destination line, computed the same way */
1649 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1651 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1652 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1653 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1654 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1655 p_vout->output.i_width, p_vout->output.i_height,
1656 &b_hscale, &i_vscale, p_offset_start );
1659 * Perform conversion
/* The vertical scaling counter starts from the output height when
 * i_vscale is 1, and from the source (render) height otherwise */
1661 i_scale_count = ( i_vscale == 1 ) ?
1662 p_vout->output.i_height : p_vout->render.i_height;
1664 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 build: lines are processed in groups of 16 pixels. If the width is
 * not a multiple of 16, i_rewind is the number of pixels missing from the
 * last group; it is used below to step back and convert one extra,
 * overlapping group so that the end of the line is covered. */
1666 if( p_vout->render.i_width & 15 )
1668 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1676 ** SSE2 128 bits fetch/store instructions are faster
1677 ** if memory access is 16 bytes aligned
/* Conversion target for the first line: the intermediate buffer when
 * scaling horizontally, the destination picture otherwise */
1680 p_buffer = b_hscale ? p_buffer_start : p_pic;
1681 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1686 /* use faster SSE2 aligned fetch and store */
1687 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1689 p_pic_start = p_pic;
/* One iteration per complete group of 16 pixels */
1691 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1694 SSE2_INIT_32_ALIGNED
1697 SSE2_UNPACK_32_ABGR_ALIGNED
1705 /* Here we do some unaligned reads and duplicate conversions, but
1706 * at least we have all the pixels */
/* Step back: chroma pointers by half the rewind (chroma is half as wide
 * as luma), output pointer by the full rewind. The stepped-back position
 * is handled with the unaligned variants of the macros. */
1710 p_u -= i_rewind >> 1;
1711 p_v -= i_rewind >> 1;
1712 p_buffer -= i_rewind;
1714 SSE2_INIT_32_UNALIGNED
1717 SSE2_UNPACK_32_ABGR_UNALIGNED
/* NOTE(review): SCALE_HEIGHT is a macro defined elsewhere; presumably it
 * applies vertical scaling for 4:2:0 input with 4 bytes per output pixel -
 * confirm against its definition */
1724 SCALE_HEIGHT( 420, 4 );
/* Skip the padding at the end of the source line in each plane */
1726 p_y += i_source_margin;
1729 p_u += i_source_margin_c;
1730 p_v += i_source_margin_c;
/* Re-select the conversion target for the next line */
1732 p_buffer = b_hscale ? p_buffer_start : p_pic;
1737 /* use slower SSE2 unaligned fetch and store */
1738 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1740 p_pic_start = p_pic;
1741 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Same per-line structure as the aligned loop above, using only the
 * unaligned variants of the macros */
1743 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1746 SSE2_INIT_32_UNALIGNED
1749 SSE2_UNPACK_32_ABGR_UNALIGNED
1757 /* Here we do some unaligned reads and duplicate conversions, but
1758 * at least we have all the pixels */
1762 p_u -= i_rewind >> 1;
1763 p_v -= i_rewind >> 1;
1764 p_buffer -= i_rewind;
1766 SSE2_INIT_32_UNALIGNED
1769 SSE2_UNPACK_32_ABGR_UNALIGNED
1776 SCALE_HEIGHT( 420, 4 );
1778 p_y += i_source_margin;
1781 p_u += i_source_margin_c;
1782 p_v += i_source_margin_c;
1784 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* From here on lines are processed in groups of 8 pixels instead of 16,
 * with the same rewind scheme for widths that are not a multiple of 8.
 * NOTE(review): presumably this is the non-SSE2 (MMX) variant of the
 * conversion - confirm against the surrounding preprocessor conditionals. */
1790 if( p_vout->render.i_width & 7 )
1792 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1799 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1801 p_pic_start = p_pic;
1802 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per complete group of 8 pixels */
1804 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1818 /* Here we do some unaligned reads and duplicate conversions, but
1819 * at least we have all the pixels */
1823 p_u -= i_rewind >> 1;
1824 p_v -= i_rewind >> 1;
1825 p_buffer -= i_rewind;
1838 SCALE_HEIGHT( 420, 4 );
/* Skip the padding at the end of the source line in each plane */
1840 p_y += i_source_margin;
1843 p_u += i_source_margin_c;
1844 p_v += i_source_margin_c;
1848 /* re-enable FPU registers */
1856 /* Following functions are local */
1858 /*****************************************************************************
1859 * SetOffset: build offset array for conversion functions
1860 *****************************************************************************
1861 * This function will build an offset array used in later conversion functions.
1862 * It will also set horizontal and vertical scaling indicators.
1863 *****************************************************************************/
1864 static void SetOffset( int i_width, int i_height, int i_pic_width,
1865 int i_pic_height, vlc_bool_t *pb_hscale,
1866 unsigned int *pi_vscale, int *p_offset )
1868 int i_x; /* x position in destination */
1869 int i_scale_count; /* modulo counter */
1872 * Prepare horizontal offset array
1874 if( i_pic_width - i_width == 0 )
1876 /* No horizontal scaling: YUV conversion is done directly to picture */
1879 else if( i_pic_width - i_width > 0 )
1881 /* Prepare scaling array for horizontal extension */
1883 i_scale_count = i_pic_width;
1884 for( i_x = i_width; i_x--; )
1886 while( (i_scale_count -= i_width) > 0 )
1891 i_scale_count += i_pic_width;
1894 else /* if( i_pic_width - i_width < 0 ) */
1896 /* Prepare scaling array for horizontal reduction */
1898 i_scale_count = i_width;
1899 for( i_x = i_pic_width; i_x--; )
1902 while( (i_scale_count -= i_pic_width) > 0 )
1907 i_scale_count += i_width;
1912 * Set vertical scaling indicator
1914 if( i_pic_height - i_height == 0 )
1918 else if( i_pic_height - i_height > 0 )
1922 else /* if( i_pic_height - i_height < 0 ) */