1 /*****************************************************************************
2 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #if defined (MODULE_NAME_IS_i420_rgb)
34 # include "i420_rgb_c.h"
35 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
36 # include "i420_rgb_mmx.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
38 # include "i420_rgb_mmx.h"
/* Forward declaration of the file-local helper used by every converter in
 * this file.  The callers pass, in order: render width, render height,
 * output width, output height, a pointer to the horizontal-scaling flag,
 * a pointer to the vertical-scaling value, and the offset array. */
41 static void SetOffset( int, int, int, int, vlc_bool_t *,
42 unsigned int *, int * );
44 #if defined (MODULE_NAME_IS_i420_rgb)
45 /*****************************************************************************
46 * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
47 *****************************************************************************
48 * Horizontal alignment needed:
49 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
50 * - output: 1 pixel (2 bytes), margins allowed
51 * Vertical alignment needed:
52 * - input: 2 lines (2 Y lines, 1 U/V line)
54 *****************************************************************************/
/* Converts the planar Y/U/V data of p_src into 16-bit pixels stored in
 * p_dest, using the dithering variants of the conversion macros.  Four
 * dither rows are prepared once per call and i_real_y holds the source
 * line index modulo 4. */
55 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
58 /* We got this one from the old arguments */
59 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
60 uint8_t *p_y = p_src->Y_PIXELS;
61 uint8_t *p_u = p_src->U_PIXELS;
62 uint8_t *p_v = p_src->V_PIXELS;
64 vlc_bool_t b_hscale; /* horizontal scaling type */
65 unsigned int i_vscale; /* vertical scaling type */
66 unsigned int i_x, i_y; /* horizontal and vertical indexes */
67 unsigned int i_real_y; /* y % 4 */
71 int i_scale_count; /* scale modulo counter */
72 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
73 uint16_t * p_pic_start; /* beginning of the current line for copy */
74 int i_uval, i_vval; /* U and V samples */
75 int i_red, i_green, i_blue; /* U and V modified samples */
76 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
77 uint16_t * p_ybase; /* Y dependent conversion table */
79 /* Conversion buffer pointer */
80 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
83 /* Offset array pointer */
84 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
87 const int i_source_margin = p_src->p[0].i_pitch
88 - p_src->p[0].i_visible_pitch;
89 const int i_source_margin_c = p_src->p[1].i_pitch
90 - p_src->p[1].i_visible_pitch;
92 /* The dithering matrices */
93 int dither10[4] = { 0x0, 0x8, 0x2, 0xa };
94 int dither11[4] = { 0xc, 0x4, 0xe, 0x6 };
95 int dither12[4] = { 0x3, 0xb, 0x1, 0x9 };
96 int dither13[4] = { 0xf, 0x7, 0xd, 0x5 };
/* Shift every dither value left by (SHIFT - 4 + output.i_rrshift) bits.
 * NOTE(review): presumably this lines the 4-bit dither values up with the
 * bits discarded by the output format -- confirm against the
 * CONVERT_*_DITHER macros, which are defined outside this file. */
98 for(i_x = 0; i_x < 4; i_x++)
100 dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
101 dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
102 dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
103 dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
/* Destination pitch minus destination visible pitch. */
106 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to complete the last group of 8. */
108 if( p_vout->render.i_width & 7 )
110 i_rewind = 8 - ( p_vout->render.i_width & 7 );
117 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
118 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
119 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
120 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
121 p_vout->output.i_width, p_vout->output.i_height,
122 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
127 i_scale_count = ( i_vscale == 1 ) ?
128 p_vout->output.i_height : p_vout->render.i_height;
/* One iteration per source line. */
129 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
131 i_real_y = i_y & 0x3;
/* Write into the intermediate buffer when horizontal scaling is requested,
 * straight into the destination picture otherwise. */
133 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per complete group of 8 source pixels. */
135 for ( i_x = p_vout->render.i_width / 8; i_x--; )
137 int *p_dither = dither10;
138 CONVERT_YUV_PIXEL_DITHER(2);
140 CONVERT_Y_PIXEL_DITHER(2);
142 CONVERT_YUV_PIXEL_DITHER(2);
144 CONVERT_Y_PIXEL_DITHER(2);
146 CONVERT_YUV_PIXEL_DITHER(2);
148 CONVERT_Y_PIXEL_DITHER(2);
150 CONVERT_YUV_PIXEL_DITHER(2);
152 CONVERT_Y_PIXEL_DITHER(2);
155 /* Here we do some unaligned reads and duplicate conversions, but
156 * at least we have all the pixels */
159 int *p_dither = dither10;
/* Chroma pointers step back half as far as the output pointer because the
 * chroma planes are half as wide (see i_chroma_width). */
161 p_u -= i_rewind >> 1;
162 p_v -= i_rewind >> 1;
163 p_buffer -= i_rewind;
164 CONVERT_YUV_PIXEL_DITHER(2);
166 CONVERT_Y_PIXEL_DITHER(2);
168 CONVERT_YUV_PIXEL_DITHER(2);
170 CONVERT_Y_PIXEL_DITHER(2);
172 CONVERT_YUV_PIXEL_DITHER(2);
174 CONVERT_Y_PIXEL_DITHER(2);
176 CONVERT_YUV_PIXEL_DITHER(2);
178 CONVERT_Y_PIXEL_DITHER(2);
181 SCALE_HEIGHT( 420, 2 );
/* Move the source pointers over the pitch / visible-pitch difference. */
183 p_y += i_source_margin;
186 p_u += i_source_margin_c;
187 p_v += i_source_margin_c;
193 /*****************************************************************************
194 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
195 *****************************************************************************
196 * Horizontal alignment needed:
197 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
198 * - output: 1 pixel (2 bytes), margins allowed
199 * Vertical alignment needed:
200 * - input: 2 lines (2 Y lines, 1 U/V line)
202 *****************************************************************************/
204 #if defined (MODULE_NAME_IS_i420_rgb)
/* Converts the planar Y/U/V data of p_src into 16-bit pixels stored in
 * p_dest, without dithering.  Compiled only when MODULE_NAME_IS_i420_rgb
 * is defined. */
206 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
209 /* We got this one from the old arguments */
210 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
211 uint8_t *p_y = p_src->Y_PIXELS;
212 uint8_t *p_u = p_src->U_PIXELS;
213 uint8_t *p_v = p_src->V_PIXELS;
215 vlc_bool_t b_hscale; /* horizontal scaling type */
216 unsigned int i_vscale; /* vertical scaling type */
217 unsigned int i_x, i_y; /* horizontal and vertical indexes */
221 int i_scale_count; /* scale modulo counter */
222 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
223 uint16_t * p_pic_start; /* beginning of the current line for copy */
224 int i_uval, i_vval; /* U and V samples */
225 int i_red, i_green, i_blue; /* U and V modified samples */
226 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
227 uint16_t * p_ybase; /* Y dependent conversion table */
229 /* Conversion buffer pointer */
230 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
233 /* Offset array pointer */
234 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
237 const int i_source_margin = p_src->p[0].i_pitch
238 - p_src->p[0].i_visible_pitch;
239 const int i_source_margin_c = p_src->p[1].i_pitch
240 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
242 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to complete the last group of 8. */
244 if( p_vout->render.i_width & 7 )
246 i_rewind = 8 - ( p_vout->render.i_width & 7 );
253 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
254 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
255 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
256 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
257 p_vout->output.i_width, p_vout->output.i_height,
258 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
263 i_scale_count = ( i_vscale == 1 ) ?
264 p_vout->output.i_height : p_vout->render.i_height;
/* One iteration per source line. */
265 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Write into the intermediate buffer when horizontal scaling is requested,
 * straight into the destination picture otherwise. */
268 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per complete group of 8 source pixels. */
270 for ( i_x = p_vout->render.i_width / 8; i_x--; )
272 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
273 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
274 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
275 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
278 /* Here we do some unaligned reads and duplicate conversions, but
279 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer because the
 * chroma planes are half as wide (see i_chroma_width). */
283 p_u -= i_rewind >> 1;
284 p_v -= i_rewind >> 1;
285 p_buffer -= i_rewind;
287 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
288 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
289 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
290 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
293 SCALE_HEIGHT( 420, 2 );
/* Move the source pointers over the pitch / visible-pitch difference. */
295 p_y += i_source_margin;
298 p_u += i_source_margin_c;
299 p_v += i_source_margin_c;
304 #else // ! defined (MODULE_NAME_IS_i420_rgb)
/* Converts the planar Y/U/V data of p_src into 16-bit pixels stored in
 * p_dest.  This variant is compiled when MODULE_NAME_IS_i420_rgb is not
 * defined; the body is selected at compile time between an SSE2 section
 * (groups of 16 pixels) and an MMX section (groups of 8 pixels).
 * NOTE(review): the name and the SSE2_UNPACK_15_* macros suggest a 5-5-5
 * pixel layout -- confirm against the macro definitions. */
306 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
309 /* We got this one from the old arguments */
310 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
311 uint8_t *p_y = p_src->Y_PIXELS;
312 uint8_t *p_u = p_src->U_PIXELS;
313 uint8_t *p_v = p_src->V_PIXELS;
315 vlc_bool_t b_hscale; /* horizontal scaling type */
316 unsigned int i_vscale; /* vertical scaling type */
317 unsigned int i_x, i_y; /* horizontal and vertical indexes */
321 int i_scale_count; /* scale modulo counter */
322 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
323 uint16_t * p_pic_start; /* beginning of the current line for copy */
325 /* Conversion buffer pointer */
326 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
329 /* Offset array pointer */
330 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
333 const int i_source_margin = p_src->p[0].i_pitch
334 - p_src->p[0].i_visible_pitch;
335 const int i_source_margin_c = p_src->p[1].i_pitch
336 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
338 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
340 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
341 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
342 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
343 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
344 p_vout->output.i_width, p_vout->output.i_height,
345 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
351 i_scale_count = ( i_vscale == 1 ) ?
352 p_vout->output.i_height : p_vout->render.i_height;
354 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 section works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 16. */
356 if( p_vout->render.i_width & 15 )
358 i_rewind = 16 - ( p_vout->render.i_width & 15 );
366 ** SSE2 128 bits fetch/store instructions are faster
367 ** if memory access is 16 bytes aligned
370 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned path is taken only when the OR of the tested values
 * (including the Y pitch and the output pointer) has its low 4 bits clear,
 * i.e. every tested value is a multiple of 16. */
371 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
374 ((intptr_t)p_buffer))) )
376 /* use faster SSE2 aligned fetch and store */
377 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
381 for ( i_x = p_vout->render.i_width/16; i_x--; )
387 SSE2_UNPACK_15_ALIGNED
394 /* Here we do some unaligned reads and duplicate conversions, but
395 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer. */
399 p_u -= i_rewind >> 1;
400 p_v -= i_rewind >> 1;
401 p_buffer -= i_rewind;
404 SSE2_INIT_16_UNALIGNED
407 SSE2_UNPACK_15_UNALIGNED
414 SCALE_HEIGHT( 420, 2 );
/* Move the source pointers over the pitch / visible-pitch difference. */
416 p_y += i_source_margin;
419 p_u += i_source_margin_c;
420 p_v += i_source_margin_c;
422 p_buffer = b_hscale ? p_buffer_start : p_pic;
427 /* use slower SSE2 unaligned fetch and store */
428 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
431 p_buffer = b_hscale ? p_buffer_start : p_pic;
433 for ( i_x = p_vout->render.i_width/16; i_x--; )
436 SSE2_INIT_16_UNALIGNED
439 SSE2_UNPACK_15_UNALIGNED
446 /* Here we do some unaligned reads and duplicate conversions, but
447 * at least we have all the pixels */
451 p_u -= i_rewind >> 1;
452 p_v -= i_rewind >> 1;
453 p_buffer -= i_rewind;
456 SSE2_INIT_16_UNALIGNED
459 SSE2_UNPACK_15_UNALIGNED
466 SCALE_HEIGHT( 420, 2 );
468 p_y += i_source_margin;
471 p_u += i_source_margin_c;
472 p_v += i_source_margin_c;
474 p_buffer = b_hscale ? p_buffer_start : p_pic;
478 /* make sure all SSE2 stores are visible thereafter */
481 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX section works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 8. */
483 if( p_vout->render.i_width & 7 )
485 i_rewind = 8 - ( p_vout->render.i_width & 7 );
/* One iteration per source line. */
492 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
495 p_buffer = b_hscale ? p_buffer_start : p_pic;
497 for ( i_x = p_vout->render.i_width / 8; i_x--; )
511 /* Here we do some unaligned reads and duplicate conversions, but
512 * at least we have all the pixels */
516 p_u -= i_rewind >> 1;
517 p_v -= i_rewind >> 1;
518 p_buffer -= i_rewind;
532 SCALE_HEIGHT( 420, 2 );
534 p_y += i_source_margin;
537 p_u += i_source_margin_c;
538 p_v += i_source_margin_c;
541 /* re-enable FPU registers */
/* Converts the planar Y/U/V data of p_src into 16-bit pixels stored in
 * p_dest.  Same structure as the sibling 16-bit SSE2/MMX converter, but
 * the SSE2 section uses the SSE2_UNPACK_16_* macros.
 * NOTE(review): the name suggests a 5-6-5 pixel layout -- confirm against
 * the macro definitions, which are outside this file. */
547 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
550 /* We got this one from the old arguments */
551 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
552 uint8_t *p_y = p_src->Y_PIXELS;
553 uint8_t *p_u = p_src->U_PIXELS;
554 uint8_t *p_v = p_src->V_PIXELS;
556 vlc_bool_t b_hscale; /* horizontal scaling type */
557 unsigned int i_vscale; /* vertical scaling type */
558 unsigned int i_x, i_y; /* horizontal and vertical indexes */
562 int i_scale_count; /* scale modulo counter */
563 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
564 uint16_t * p_pic_start; /* beginning of the current line for copy */
566 /* Conversion buffer pointer */
567 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
570 /* Offset array pointer */
571 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
574 const int i_source_margin = p_src->p[0].i_pitch
575 - p_src->p[0].i_visible_pitch;
576 const int i_source_margin_c = p_src->p[1].i_pitch
577 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
579 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
581 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
582 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
583 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
584 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
585 p_vout->output.i_width, p_vout->output.i_height,
586 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
592 i_scale_count = ( i_vscale == 1 ) ?
593 p_vout->output.i_height : p_vout->render.i_height;
595 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 section works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 16. */
597 if( p_vout->render.i_width & 15 )
599 i_rewind = 16 - ( p_vout->render.i_width & 15 );
607 ** SSE2 128 bits fetch/store instructions are faster
608 ** if memory access is 16 bytes aligned
611 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned path is taken only when the OR of the tested values
 * (including the Y pitch and the output pointer) has its low 4 bits clear,
 * i.e. every tested value is a multiple of 16. */
612 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
615 ((intptr_t)p_buffer))) )
617 /* use faster SSE2 aligned fetch and store */
618 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
622 for ( i_x = p_vout->render.i_width/16; i_x--; )
628 SSE2_UNPACK_16_ALIGNED
635 /* Here we do some unaligned reads and duplicate conversions, but
636 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer. */
640 p_u -= i_rewind >> 1;
641 p_v -= i_rewind >> 1;
642 p_buffer -= i_rewind;
645 SSE2_INIT_16_UNALIGNED
648 SSE2_UNPACK_16_UNALIGNED
655 SCALE_HEIGHT( 420, 2 );
/* Move the source pointers over the pitch / visible-pitch difference. */
657 p_y += i_source_margin;
660 p_u += i_source_margin_c;
661 p_v += i_source_margin_c;
663 p_buffer = b_hscale ? p_buffer_start : p_pic;
668 /* use slower SSE2 unaligned fetch and store */
669 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
672 p_buffer = b_hscale ? p_buffer_start : p_pic;
674 for ( i_x = p_vout->render.i_width/16; i_x--; )
677 SSE2_INIT_16_UNALIGNED
680 SSE2_UNPACK_16_UNALIGNED
687 /* Here we do some unaligned reads and duplicate conversions, but
688 * at least we have all the pixels */
692 p_u -= i_rewind >> 1;
693 p_v -= i_rewind >> 1;
694 p_buffer -= i_rewind;
697 SSE2_INIT_16_UNALIGNED
700 SSE2_UNPACK_16_UNALIGNED
707 SCALE_HEIGHT( 420, 2 );
709 p_y += i_source_margin;
712 p_u += i_source_margin_c;
713 p_v += i_source_margin_c;
715 p_buffer = b_hscale ? p_buffer_start : p_pic;
719 /* make sure all SSE2 stores are visible thereafter */
722 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX section works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 8. */
724 if( p_vout->render.i_width & 7 )
726 i_rewind = 8 - ( p_vout->render.i_width & 7 );
/* One iteration per source line. */
733 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
736 p_buffer = b_hscale ? p_buffer_start : p_pic;
738 for ( i_x = p_vout->render.i_width / 8; i_x--; )
752 /* Here we do some unaligned reads and duplicate conversions, but
753 * at least we have all the pixels */
757 p_u -= i_rewind >> 1;
758 p_v -= i_rewind >> 1;
759 p_buffer -= i_rewind;
773 SCALE_HEIGHT( 420, 2 );
775 p_y += i_source_margin;
778 p_u += i_source_margin_c;
779 p_v += i_source_margin_c;
782 /* re-enable FPU registers */
790 /*****************************************************************************
791 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
792 *****************************************************************************
793 * Horizontal alignment needed:
794 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
795 * - output: 1 pixel (4 bytes), margins allowed
796 * Vertical alignment needed:
797 * - input: 2 lines (2 Y lines, 1 U/V line)
799 *****************************************************************************/
801 #if defined (MODULE_NAME_IS_i420_rgb)
/* Converts the planar Y/U/V data of p_src into 32-bit pixels stored in
 * p_dest, using the p_rgb32 table and the CONVERT_*_PIXEL(4) macros.
 * Compiled only when MODULE_NAME_IS_i420_rgb is defined. */
803 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
806 /* We got this one from the old arguments */
807 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
808 uint8_t *p_y = p_src->Y_PIXELS;
809 uint8_t *p_u = p_src->U_PIXELS;
810 uint8_t *p_v = p_src->V_PIXELS;
812 vlc_bool_t b_hscale; /* horizontal scaling type */
813 unsigned int i_vscale; /* vertical scaling type */
814 unsigned int i_x, i_y; /* horizontal and vertical indexes */
818 int i_scale_count; /* scale modulo counter */
819 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
820 uint32_t * p_pic_start; /* beginning of the current line for copy */
821 int i_uval, i_vval; /* U and V samples */
822 int i_red, i_green, i_blue; /* U and V modified samples */
823 uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32;
824 uint32_t * p_ybase; /* Y dependent conversion table */
826 /* Conversion buffer pointer */
827 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
830 /* Offset array pointer */
831 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
834 const int i_source_margin = p_src->p[0].i_pitch
835 - p_src->p[0].i_visible_pitch;
836 const int i_source_margin_c = p_src->p[1].i_pitch
837 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
839 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to complete the last group of 8. */
841 if( p_vout->render.i_width & 7 )
843 i_rewind = 8 - ( p_vout->render.i_width & 7 );
850 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
851 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
852 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
853 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
854 p_vout->output.i_width, p_vout->output.i_height,
855 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
860 i_scale_count = ( i_vscale == 1 ) ?
861 p_vout->output.i_height : p_vout->render.i_height;
/* One iteration per source line. */
862 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Write into the intermediate buffer when horizontal scaling is requested,
 * straight into the destination picture otherwise. */
865 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per complete group of 8 source pixels. */
867 for ( i_x = p_vout->render.i_width / 8; i_x--; )
869 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
870 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
871 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
872 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
875 /* Here we do some unaligned reads and duplicate conversions, but
876 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer because the
 * chroma planes are half as wide (see i_chroma_width). */
880 p_u -= i_rewind >> 1;
881 p_v -= i_rewind >> 1;
882 p_buffer -= i_rewind;
883 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
884 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
885 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
886 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
889 SCALE_HEIGHT( 420, 4 );
/* Move the source pointers over the pitch / visible-pitch difference. */
891 p_y += i_source_margin;
894 p_u += i_source_margin_c;
895 p_v += i_source_margin_c;
900 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
/* Converts the planar Y/U/V data of p_src into 32-bit pixels stored in
 * p_dest.  This variant is compiled when MODULE_NAME_IS_i420_rgb is not
 * defined; the body is selected at compile time between an SSE2 section
 * (groups of 16 pixels, SSE2_UNPACK_32_ARGB_* macros) and an MMX section
 * (groups of 8 pixels).
 * NOTE(review): the component order implied by the name is realised by
 * macros defined outside this file -- confirm there. */
902 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
905 /* We got this one from the old arguments */
906 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
907 uint8_t *p_y = p_src->Y_PIXELS;
908 uint8_t *p_u = p_src->U_PIXELS;
909 uint8_t *p_v = p_src->V_PIXELS;
911 vlc_bool_t b_hscale; /* horizontal scaling type */
912 unsigned int i_vscale; /* vertical scaling type */
913 unsigned int i_x, i_y; /* horizontal and vertical indexes */
917 int i_scale_count; /* scale modulo counter */
918 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
919 uint32_t * p_pic_start; /* beginning of the current line for copy */
920 /* Conversion buffer pointer */
921 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
924 /* Offset array pointer */
925 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
928 const int i_source_margin = p_src->p[0].i_pitch
929 - p_src->p[0].i_visible_pitch;
930 const int i_source_margin_c = p_src->p[1].i_pitch
931 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
933 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
935 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
936 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
937 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
938 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
939 p_vout->output.i_width, p_vout->output.i_height,
940 &b_hscale, &i_vscale, p_offset_start );
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
945 i_scale_count = ( i_vscale == 1 ) ?
946 p_vout->output.i_height : p_vout->render.i_height;
948 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 section works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 16. */
950 if( p_vout->render.i_width & 15 )
952 i_rewind = 16 - ( p_vout->render.i_width & 15 );
960 ** SSE2 128 bits fetch/store instructions are faster
961 ** if memory access is 16 bytes aligned
964 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned path is taken only when the OR of the tested values
 * (including the Y pitch and the output pointer) has its low 4 bits clear,
 * i.e. every tested value is a multiple of 16. */
965 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
968 ((intptr_t)p_buffer))) )
970 /* use faster SSE2 aligned fetch and store */
971 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
975 for ( i_x = p_vout->render.i_width / 16; i_x--; )
981 SSE2_UNPACK_32_ARGB_ALIGNED
989 /* Here we do some unaligned reads and duplicate conversions, but
990 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer. */
994 p_u -= i_rewind >> 1;
995 p_v -= i_rewind >> 1;
996 p_buffer -= i_rewind;
998 SSE2_INIT_32_UNALIGNED
1001 SSE2_UNPACK_32_ARGB_UNALIGNED
1008 SCALE_HEIGHT( 420, 4 );
/* Move the source pointers over the pitch / visible-pitch difference. */
1010 p_y += i_source_margin;
1013 p_u += i_source_margin_c;
1014 p_v += i_source_margin_c;
1016 p_buffer = b_hscale ? p_buffer_start : p_pic;
1021 /* use slower SSE2 unaligned fetch and store */
1022 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1024 p_pic_start = p_pic;
1025 p_buffer = b_hscale ? p_buffer_start : p_pic;
1027 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1030 SSE2_INIT_32_UNALIGNED
1033 SSE2_UNPACK_32_ARGB_UNALIGNED
1041 /* Here we do some unaligned reads and duplicate conversions, but
1042 * at least we have all the pixels */
1046 p_u -= i_rewind >> 1;
1047 p_v -= i_rewind >> 1;
1048 p_buffer -= i_rewind;
1050 SSE2_INIT_32_UNALIGNED
1053 SSE2_UNPACK_32_ARGB_UNALIGNED
1060 SCALE_HEIGHT( 420, 4 );
1062 p_y += i_source_margin;
1065 p_u += i_source_margin_c;
1066 p_v += i_source_margin_c;
1068 p_buffer = b_hscale ? p_buffer_start : p_pic;
1072 /* make sure all SSE2 stores are visible thereafter */
1075 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX section works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 8. */
1077 if( p_vout->render.i_width & 7 )
1079 i_rewind = 8 - ( p_vout->render.i_width & 7 );
/* One iteration per source line. */
1086 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1088 p_pic_start = p_pic;
1089 p_buffer = b_hscale ? p_buffer_start : p_pic;
1091 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1105 /* Here we do some unaligned reads and duplicate conversions, but
1106 * at least we have all the pixels */
1110 p_u -= i_rewind >> 1;
1111 p_v -= i_rewind >> 1;
1112 p_buffer -= i_rewind;
1125 SCALE_HEIGHT( 420, 4 );
1127 p_y += i_source_margin;
1130 p_u += i_source_margin_c;
1131 p_v += i_source_margin_c;
1135 /* re-enable FPU registers */
/* Converts the planar Y/U/V data of p_src into 32-bit pixels stored in
 * p_dest.  Same structure as the sibling 32-bit SSE2/MMX converters, but
 * the SSE2 section uses the SSE2_UNPACK_32_RGBA_* macros.
 * NOTE(review): the component order implied by the name is realised by
 * macros defined outside this file -- confirm there. */
1141 void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
1144 /* We got this one from the old arguments */
1145 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1146 uint8_t *p_y = p_src->Y_PIXELS;
1147 uint8_t *p_u = p_src->U_PIXELS;
1148 uint8_t *p_v = p_src->V_PIXELS;
1150 vlc_bool_t b_hscale; /* horizontal scaling type */
1151 unsigned int i_vscale; /* vertical scaling type */
1152 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1156 int i_scale_count; /* scale modulo counter */
1157 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1158 uint32_t * p_pic_start; /* beginning of the current line for copy */
1159 /* Conversion buffer pointer */
1160 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1161 uint32_t * p_buffer;
1163 /* Offset array pointer */
1164 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
1167 const int i_source_margin = p_src->p[0].i_pitch
1168 - p_src->p[0].i_visible_pitch;
1169 const int i_source_margin_c = p_src->p[1].i_pitch
1170 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
1172 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1174 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1175 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1176 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1177 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1178 p_vout->output.i_width, p_vout->output.i_height,
1179 &b_hscale, &i_vscale, p_offset_start );
1182 * Perform conversion
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
1184 i_scale_count = ( i_vscale == 1 ) ?
1185 p_vout->output.i_height : p_vout->render.i_height;
1187 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 section works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 16. */
1189 if( p_vout->render.i_width & 15 )
1191 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1199 ** SSE2 128 bits fetch/store instructions are faster
1200 ** if memory access is 16 bytes aligned
1203 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned path is taken only when the OR of the tested values
 * (including the Y pitch and the output pointer) has its low 4 bits clear,
 * i.e. every tested value is a multiple of 16. */
1204 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1207 ((intptr_t)p_buffer))) )
1209 /* use faster SSE2 aligned fetch and store */
1210 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1212 p_pic_start = p_pic;
1214 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1217 SSE2_INIT_32_ALIGNED
1220 SSE2_UNPACK_32_RGBA_ALIGNED
1228 /* Here we do some unaligned reads and duplicate conversions, but
1229 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer. */
1233 p_u -= i_rewind >> 1;
1234 p_v -= i_rewind >> 1;
1235 p_buffer -= i_rewind;
1237 SSE2_INIT_32_UNALIGNED
1240 SSE2_UNPACK_32_RGBA_UNALIGNED
1247 SCALE_HEIGHT( 420, 4 );
/* Move the source pointers over the pitch / visible-pitch difference. */
1249 p_y += i_source_margin;
1252 p_u += i_source_margin_c;
1253 p_v += i_source_margin_c;
1255 p_buffer = b_hscale ? p_buffer_start : p_pic;
1260 /* use slower SSE2 unaligned fetch and store */
1261 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1263 p_pic_start = p_pic;
1264 p_buffer = b_hscale ? p_buffer_start : p_pic;
1266 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1269 SSE2_INIT_32_UNALIGNED
1272 SSE2_UNPACK_32_RGBA_UNALIGNED
1280 /* Here we do some unaligned reads and duplicate conversions, but
1281 * at least we have all the pixels */
1285 p_u -= i_rewind >> 1;
1286 p_v -= i_rewind >> 1;
1287 p_buffer -= i_rewind;
1289 SSE2_INIT_32_UNALIGNED
1292 SSE2_UNPACK_32_RGBA_UNALIGNED
1299 SCALE_HEIGHT( 420, 4 );
1301 p_y += i_source_margin;
1304 p_u += i_source_margin_c;
1305 p_v += i_source_margin_c;
1307 p_buffer = b_hscale ? p_buffer_start : p_pic;
1311 /* make sure all SSE2 stores are visible thereafter */
1314 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX section works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 8. */
1316 if( p_vout->render.i_width & 7 )
1318 i_rewind = 8 - ( p_vout->render.i_width & 7 );
/* One iteration per source line. */
1325 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1327 p_pic_start = p_pic;
1328 p_buffer = b_hscale ? p_buffer_start : p_pic;
1330 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1344 /* Here we do some unaligned reads and duplicate conversions, but
1345 * at least we have all the pixels */
1349 p_u -= i_rewind >> 1;
1350 p_v -= i_rewind >> 1;
1351 p_buffer -= i_rewind;
1364 SCALE_HEIGHT( 420, 4 );
1366 p_y += i_source_margin;
1369 p_u += i_source_margin_c;
1370 p_v += i_source_margin_c;
1374 /* re-enable FPU registers */
/* Converts the planar Y/U/V data of p_src into 32-bit pixels stored in
 * p_dest.  Same structure as the sibling 32-bit SSE2/MMX converters, but
 * the SSE2 section uses the SSE2_UNPACK_32_BGRA_* macros.
 * NOTE(review): the component order implied by the name is realised by
 * macros defined outside this file -- confirm there. */
1380 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1383 /* We got this one from the old arguments */
1384 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1385 uint8_t *p_y = p_src->Y_PIXELS;
1386 uint8_t *p_u = p_src->U_PIXELS;
1387 uint8_t *p_v = p_src->V_PIXELS;
1389 vlc_bool_t b_hscale; /* horizontal scaling type */
1390 unsigned int i_vscale; /* vertical scaling type */
1391 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1395 int i_scale_count; /* scale modulo counter */
1396 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1397 uint32_t * p_pic_start; /* beginning of the current line for copy */
1398 /* Conversion buffer pointer */
1399 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1400 uint32_t * p_buffer;
1402 /* Offset array pointer */
1403 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line difference between pitch and visible pitch for the luma plane
 * (index 0) and for the first chroma plane (index 1). */
1406 const int i_source_margin = p_src->p[0].i_pitch
1407 - p_src->p[0].i_visible_pitch;
1408 const int i_source_margin_c = p_src->p[1].i_pitch
1409 - p_src->p[1].i_visible_pitch;
/* Destination pitch minus destination visible pitch. */
1411 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1413 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1414 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1415 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1416 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1417 p_vout->output.i_width, p_vout->output.i_height,
1418 &b_hscale, &i_vscale, p_offset_start );
1421 * Perform conversion
/* Start the scale counter from the output height when i_vscale is 1 and
 * from the render height otherwise. */
1423 i_scale_count = ( i_vscale == 1 ) ?
1424 p_vout->output.i_height : p_vout->render.i_height;
1426 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 section works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to complete the last group of 16. */
1428 if( p_vout->render.i_width & 15 )
1430 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1438 ** SSE2 128 bits fetch/store instructions are faster
1439 ** if memory access is 16 bytes aligned
1442 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned path is taken only when the OR of the tested values
 * (including the Y pitch and the output pointer) has its low 4 bits clear,
 * i.e. every tested value is a multiple of 16. */
1443 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1446 ((intptr_t)p_buffer))) )
1448 /* use faster SSE2 aligned fetch and store */
1449 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1451 p_pic_start = p_pic;
1453 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1456 SSE2_INIT_32_ALIGNED
1459 SSE2_UNPACK_32_BGRA_ALIGNED
1467 /* Here we do some unaligned reads and duplicate conversions, but
1468 * at least we have all the pixels */
/* Chroma pointers step back half as far as the output pointer. */
1472 p_u -= i_rewind >> 1;
1473 p_v -= i_rewind >> 1;
1474 p_buffer -= i_rewind;
1476 SSE2_INIT_32_UNALIGNED
1479 SSE2_UNPACK_32_BGRA_UNALIGNED
1486 SCALE_HEIGHT( 420, 4 );
/* Move the source pointers over the pitch / visible-pitch difference. */
1488 p_y += i_source_margin;
1491 p_u += i_source_margin_c;
1492 p_v += i_source_margin_c;
1494 p_buffer = b_hscale ? p_buffer_start : p_pic;
1499 /* use slower SSE2 unaligned fetch and store */
1500 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1502 p_pic_start = p_pic;
1503 p_buffer = b_hscale ? p_buffer_start : p_pic;
1505 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1508 SSE2_INIT_32_UNALIGNED
1511 SSE2_UNPACK_32_BGRA_UNALIGNED
1519 /* Here we do some unaligned reads and duplicate conversions, but
1520 * at least we have all the pixels */
1524 p_u -= i_rewind >> 1;
1525 p_v -= i_rewind >> 1;
1526 p_buffer -= i_rewind;
1528 SSE2_INIT_32_UNALIGNED
1531 SSE2_UNPACK_32_BGRA_UNALIGNED
1538 SCALE_HEIGHT( 420, 4 );
1540 p_y += i_source_margin;
1543 p_u += i_source_margin_c;
1544 p_v += i_source_margin_c;
1546 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* NOTE(review): the sibling converters separate the SSE2 section from the
 * 8-pixel section with an "#else" directive; none is visible at this point
 * in this function -- confirm the preprocessor structure is intact. */
/* 8-pixel section: i_rewind is the number of pixels missing to complete
 * the last group of 8. */
1552 if( p_vout->render.i_width & 7 )
1554 i_rewind = 8 - ( p_vout->render.i_width & 7 );
/* One iteration per source line. */
1561 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1563 p_pic_start = p_pic;
1564 p_buffer = b_hscale ? p_buffer_start : p_pic;
1566 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1580 /* Here we do some unaligned reads and duplicate conversions, but
1581 * at least we have all the pixels */
1585 p_u -= i_rewind >> 1;
1586 p_v -= i_rewind >> 1;
1587 p_buffer -= i_rewind;
1600 SCALE_HEIGHT( 420, 4 );
1602 p_y += i_source_margin;
1605 p_u += i_source_margin_c;
1606 p_v += i_source_margin_c;
1610 /* re-enable FPU registers */
/*****************************************************************************
 * I420_A8B8G8R8: convert a planar YUV 4:2:0 picture to 32-bit pixels
 *****************************************************************************
 * Reads the Y, U and V planes of p_src and writes 32-bit pixels to
 * p_dest->p->p_pixels, one source line at a time. The SSE2 code path emits
 * its pixels through the SSE2_UNPACK_32_ABGR_* macros.
 *
 * - p_vout: supplies the render (source) and output (destination) sizes,
 *   plus the conversion buffer and offset array kept in chroma.p_sys
 * - p_src: source picture
 * - p_dest: destination picture
 *
 * SetOffset() decides whether horizontal and/or vertical scaling applies.
 * With horizontal scaling, lines are converted into the intermediate buffer
 * rather than directly into the destination (see the p_buffer selection).
 *****************************************************************************/
1616 void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
1619 /* We got this one from the old arguments */
1620 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1621 uint8_t *p_y = p_src->Y_PIXELS;
1622 uint8_t *p_u = p_src->U_PIXELS;
1623 uint8_t *p_v = p_src->V_PIXELS;
1625 vlc_bool_t b_hscale; /* horizontal scaling type */
1626 unsigned int i_vscale; /* vertical scaling type */
1627 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1631 int i_scale_count; /* scale modulo counter */
1632 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1633 uint32_t * p_pic_start; /* beginning of the current line for copy */
1634 /* Conversion buffer pointer */
1635 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1636 uint32_t * p_buffer;
1638 /* Offset array pointer */
1639 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of the source planes: the difference between a plane's
 * pitch and its visible pitch. Plane 0 gives the margin applied to p_y,
 * plane 1 the margin applied to both p_u and p_v. */
1642 const int i_source_margin = p_src->p[0].i_pitch
1643 - p_src->p[0].i_visible_pitch;
1644 const int i_source_margin_c = p_src->p[1].i_pitch
1645 - p_src->p[1].i_visible_pitch;
/* Same pitch-minus-visible-pitch difference, for the destination plane */
1647 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1649 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1650 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1651 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
/* Fills b_hscale, i_vscale and the offset array from the render and output
 * dimensions */
1652 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1653 p_vout->output.i_width, p_vout->output.i_height,
1654 &b_hscale, &i_vscale, p_offset_start );
1657 * Perform conversion
/* The modulo counter starts from the output height when i_vscale is 1 and
 * from the render height otherwise.
 * NOTE(review): the meaning of i_vscale == 1 is set by SetOffset() - confirm
 * there which scaling direction it denotes. */
1659 i_scale_count = ( i_vscale == 1 ) ?
1660 p_vout->output.i_height : p_vout->render.i_height;
1662 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* When the width is not a multiple of 16, i_rewind is the number of pixels
 * the pointers are moved back at the end of a line so that one extra
 * 16-pixel block covers the leftover pixels. */
1664 if( p_vout->render.i_width & 15 )
1666 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1674 ** SSE2 128 bits fetch/store instructions are faster
1675 ** if memory access is 16 bytes aligned
/* Pick the conversion target, then OR the pitch and address values together
 * so that a single test of the low four bits tells whether every one of them
 * is a multiple of 16. */
1678 p_buffer = b_hscale ? p_buffer_start : p_pic;
1679 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1682 ((intptr_t)p_buffer))) )
1684 /* use faster SSE2 aligned fetch and store */
1685 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1687 p_pic_start = p_pic;
/* i_x counts down the full 16-pixel blocks of the line */
1689 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1692 SSE2_INIT_32_ALIGNED
1695 SSE2_UNPACK_32_ABGR_ALIGNED
1703 /* Here we do some unaligned reads and duplicate conversions, but
1704 * at least we have all the pixels */
/* The chroma planes are half as wide as the picture (see i_chroma_width),
 * hence the halved rewind for p_u and p_v. The rewound position is no longer
 * guaranteed to be 16-byte aligned, so the unaligned macros are used. */
1708 p_u -= i_rewind >> 1;
1709 p_v -= i_rewind >> 1;
1710 p_buffer -= i_rewind;
1712 SSE2_INIT_32_UNALIGNED
1715 SSE2_UNPACK_32_ABGR_UNALIGNED
/* NOTE(review): SCALE_HEIGHT is defined outside this function; presumably it
 * applies the vertical scaling for the line just converted, with 4 being the
 * bytes per destination pixel - confirm against the macro definition. */
1722 SCALE_HEIGHT( 420, 4 );
/* Skip the padding at the end of the line in each source plane */
1724 p_y += i_source_margin;
1727 p_u += i_source_margin_c;
1728 p_v += i_source_margin_c;
/* Restart from the conversion target for the next line */
1730 p_buffer = b_hscale ? p_buffer_start : p_pic;
1735 /* use slower SSE2 unaligned fetch and store */
1736 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1738 p_pic_start = p_pic;
1739 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Same 16-pixel block loop as above, using the unaligned macros throughout */
1741 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1744 SSE2_INIT_32_UNALIGNED
1747 SSE2_UNPACK_32_ABGR_UNALIGNED
1755 /* Here we do some unaligned reads and duplicate conversions, but
1756 * at least we have all the pixels */
1760 p_u -= i_rewind >> 1;
1761 p_v -= i_rewind >> 1;
1762 p_buffer -= i_rewind;
1764 SSE2_INIT_32_UNALIGNED
1767 SSE2_UNPACK_32_ABGR_UNALIGNED
1774 SCALE_HEIGHT( 420, 4 );
/* Skip the padding at the end of the line in each source plane */
1776 p_y += i_source_margin;
1779 p_u += i_source_margin_c;
1780 p_v += i_source_margin_c;
1782 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Variant of the same per-line loop that works on 8-pixel blocks, so the
 * rewind is computed modulo 8 instead of 16.
 * NOTE(review): presumably the branch compiled when the SSE2 module is not
 * selected - confirm against the enclosing preprocessor conditionals. */
1788 if( p_vout->render.i_width & 7 )
1790 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1797 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1799 p_pic_start = p_pic;
1800 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* i_x counts down the full 8-pixel blocks of the line */
1802 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1816 /* Here we do some unaligned reads and duplicate conversions, but
1817 * at least we have all the pixels */
1821 p_u -= i_rewind >> 1;
1822 p_v -= i_rewind >> 1;
1823 p_buffer -= i_rewind;
1836 SCALE_HEIGHT( 420, 4 );
/* Skip the padding at the end of the line in each source plane */
1838 p_y += i_source_margin;
1841 p_u += i_source_margin_c;
1842 p_v += i_source_margin_c;
1846 /* re-enable FPU registers */
1854 /* Following functions are local */
1856 /*****************************************************************************
1857 * SetOffset: build offset array for conversion functions
1858 *****************************************************************************
1859 * This function will build an offset array used in later conversion functions.
1860 * It will also set horizontal and vertical scaling indicators.
1861 *****************************************************************************/
1862 static void SetOffset( int i_width, int i_height, int i_pic_width,
1863 int i_pic_height, vlc_bool_t *pb_hscale,
1864 unsigned int *pi_vscale, int *p_offset )
1866 int i_x; /* x position in destination */
1867 int i_scale_count; /* modulo counter */
1870 * Prepare horizontal offset array
1872 if( i_pic_width - i_width == 0 )
1874 /* No horizontal scaling: YUV conversion is done directly to picture */
1877 else if( i_pic_width - i_width > 0 )
1879 /* Prepare scaling array for horizontal extension */
1881 i_scale_count = i_pic_width;
1882 for( i_x = i_width; i_x--; )
1884 while( (i_scale_count -= i_width) > 0 )
1889 i_scale_count += i_pic_width;
1892 else /* if( i_pic_width - i_width < 0 ) */
1894 /* Prepare scaling array for horizontal reduction */
1896 i_scale_count = i_width;
1897 for( i_x = i_pic_width; i_x--; )
1900 while( (i_scale_count -= i_pic_width) > 0 )
1905 i_scale_count += i_width;
1910 * Set vertical scaling indicator
1912 if( i_pic_height - i_height == 0 )
1916 else if( i_pic_height - i_height > 0 )
1920 else /* if( i_pic_height - i_height < 0 ) */