1 /*****************************************************************************
2 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
37 #if defined (MODULE_NAME_IS_i420_rgb)
38 # include "i420_rgb_c.h"
39 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
40 # include "i420_rgb_mmx.h"
41 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
42 # include "i420_rgb_mmx.h"
/* Forward declaration of SetOffset. The conversion routines in this file call
 * it with the render width and height, the output width and height, pointers
 * to their b_hscale and i_vscale locals, and the p_offset array taken from
 * p_vout->chroma.p_sys. */
45 static void SetOffset( int, int, int, int, vlc_bool_t *,
46 unsigned int *, int * );
48 #if defined (MODULE_NAME_IS_i420_rgb)
49 /*****************************************************************************
50 * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
51 *****************************************************************************
52 * Horizontal alignment needed:
53 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
54 * - output: 1 pixel (2 bytes), margins allowed
55 * Vertical alignment needed:
56 * - input: 2 lines (2 Y lines, 1 U/V line)
58 *****************************************************************************/
/*
 * I420_RGB16_dither: reads the Y, U and V pixels of p_src and writes 16-bit
 * pixels to p_dest->p->p_pixels, using the p_rgb16 table of
 * p_vout->chroma.p_sys and four 4-entry dither rows.
 * The CONVERT_*_DITHER and SCALE_HEIGHT macros are not defined in this part
 * of the file; presumably they come from i420_rgb_c.h -- TODO confirm.
 */
59 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
62 /* We got this one from the old arguments */
63 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
64 uint8_t *p_y = p_src->Y_PIXELS;
65 uint8_t *p_u = p_src->U_PIXELS;
66 uint8_t *p_v = p_src->V_PIXELS;
68 vlc_bool_t b_hscale; /* horizontal scaling type */
69 unsigned int i_vscale; /* vertical scaling type */
70 unsigned int i_x, i_y; /* horizontal and vertical indexes */
71 unsigned int i_real_y; /* y % 4 */
75 int i_scale_count; /* scale modulo counter */
76 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
77 uint16_t * p_pic_start; /* beginning of the current line for copy */
78 int i_uval, i_vval; /* U and V samples */
79 int i_red, i_green, i_blue; /* U and V modified samples */
80 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
81 uint16_t * p_ybase; /* Y dependent conversion table */
83 /* Conversion buffer pointer */
84 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
87 /* Offset array pointer */
88 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
91 const int i_source_margin = p_src->p[0].i_pitch
92 - p_src->p[0].i_visible_pitch;
93 const int i_source_margin_c = p_src->p[1].i_pitch
94 - p_src->p[1].i_visible_pitch;
96 /* The dithering matrices */
97 int dither10[4] = { 0x0, 0x8, 0x2, 0xa };
98 int dither11[4] = { 0xc, 0x4, 0xe, 0x6 };
99 int dither12[4] = { 0x3, 0xb, 0x1, 0x9 };
100 int dither13[4] = { 0xf, 0x7, 0xd, 0x5 };
/* Pre-shift every dither entry left by (SHIFT - 4 + i_rrshift) bits */
102 for(i_x = 0; i_x < 4; i_x++)
104 dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
105 dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
106 dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
107 dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
/* Destination line padding: pitch minus visible pitch */
110 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to reach the next multiple of 8 */
112 if( p_vout->render.i_width & 7 )
114 i_rewind = 8 - ( p_vout->render.i_width & 7 );
121 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
122 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
123 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
124 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
125 p_vout->output.i_width, p_vout->output.i_height,
126 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
131 i_scale_count = ( i_vscale == 1 ) ?
132 p_vout->output.i_height : p_vout->render.i_height;
133 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
135 i_real_y = i_y & 0x3;
/* Convert into the conversion buffer when b_hscale is set, otherwise
 * directly into the destination picture */
137 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
139 for ( i_x = p_vout->render.i_width / 8; i_x--; )
141 int *p_dither = dither10;
142 CONVERT_YUV_PIXEL_DITHER(2);
144 CONVERT_Y_PIXEL_DITHER(2);
146 CONVERT_YUV_PIXEL_DITHER(2);
148 CONVERT_Y_PIXEL_DITHER(2);
150 CONVERT_YUV_PIXEL_DITHER(2);
152 CONVERT_Y_PIXEL_DITHER(2);
154 CONVERT_YUV_PIXEL_DITHER(2);
156 CONVERT_Y_PIXEL_DITHER(2);
159 /* Here we do some unaligned reads and duplicate conversions, but
160 * at least we have all the pixels */
163 int *p_dither = dither10;
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels before converting one more group of 8 */
165 p_u -= i_rewind >> 1;
166 p_v -= i_rewind >> 1;
167 p_buffer -= i_rewind;
168 CONVERT_YUV_PIXEL_DITHER(2);
170 CONVERT_Y_PIXEL_DITHER(2);
172 CONVERT_YUV_PIXEL_DITHER(2);
174 CONVERT_Y_PIXEL_DITHER(2);
176 CONVERT_YUV_PIXEL_DITHER(2);
178 CONVERT_Y_PIXEL_DITHER(2);
180 CONVERT_YUV_PIXEL_DITHER(2);
182 CONVERT_Y_PIXEL_DITHER(2);
185 SCALE_HEIGHT( 420, 2 );
/* Advance p_y past the padding of source plane 0 */
187 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
190 p_u += i_source_margin_c;
191 p_v += i_source_margin_c;
197 /*****************************************************************************
198 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
199 *****************************************************************************
200 * Horizontal alignment needed:
201 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
202 * - output: 1 pixel (2 bytes), margins allowed
203 * Vertical alignment needed:
204 * - input: 2 lines (2 Y lines, 1 U/V line)
206 *****************************************************************************/
208 #if defined (MODULE_NAME_IS_i420_rgb)
/*
 * I420_RGB16: reads the Y, U and V pixels of p_src and writes 16-bit pixels
 * to p_dest->p->p_pixels, using the p_rgb16 table of p_vout->chroma.p_sys.
 * Compiled only when MODULE_NAME_IS_i420_rgb is defined.
 * The CONVERT_* and SCALE_HEIGHT macros are not defined in this part of the
 * file; presumably they come from i420_rgb_c.h -- TODO confirm.
 */
210 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
213 /* We got this one from the old arguments */
214 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
215 uint8_t *p_y = p_src->Y_PIXELS;
216 uint8_t *p_u = p_src->U_PIXELS;
217 uint8_t *p_v = p_src->V_PIXELS;
219 vlc_bool_t b_hscale; /* horizontal scaling type */
220 unsigned int i_vscale; /* vertical scaling type */
221 unsigned int i_x, i_y; /* horizontal and vertical indexes */
225 int i_scale_count; /* scale modulo counter */
226 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
227 uint16_t * p_pic_start; /* beginning of the current line for copy */
228 int i_uval, i_vval; /* U and V samples */
229 int i_red, i_green, i_blue; /* U and V modified samples */
230 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
231 uint16_t * p_ybase; /* Y dependent conversion table */
233 /* Conversion buffer pointer */
234 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
237 /* Offset array pointer */
238 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
241 const int i_source_margin = p_src->p[0].i_pitch
242 - p_src->p[0].i_visible_pitch;
243 const int i_source_margin_c = p_src->p[1].i_pitch
244 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
246 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to reach the next multiple of 8 */
248 if( p_vout->render.i_width & 7 )
250 i_rewind = 8 - ( p_vout->render.i_width & 7 );
257 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
258 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
259 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
260 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
261 p_vout->output.i_width, p_vout->output.i_height,
262 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
267 i_scale_count = ( i_vscale == 1 ) ?
268 p_vout->output.i_height : p_vout->render.i_height;
269 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Convert into the conversion buffer when b_hscale is set, otherwise
 * directly into the destination picture */
272 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
274 for ( i_x = p_vout->render.i_width / 8; i_x--; )
276 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
277 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
278 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
279 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
282 /* Here we do some unaligned reads and duplicate conversions, but
283 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels before converting one more group of 8 */
287 p_u -= i_rewind >> 1;
288 p_v -= i_rewind >> 1;
289 p_buffer -= i_rewind;
291 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
292 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
293 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
294 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
297 SCALE_HEIGHT( 420, 2 );
/* Advance p_y past the padding of source plane 0 */
299 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
302 p_u += i_source_margin_c;
303 p_v += i_source_margin_c;
308 #else // ! defined (MODULE_NAME_IS_i420_rgb)
/*
 * I420_R5G5B5: variant built when MODULE_NAME_IS_i420_rgb is not defined.
 * Reads the Y, U and V pixels of p_src and writes 16-bit pixels to
 * p_dest->p->p_pixels. It has an SSE2 body (MODULE_NAME_IS_i420_rgb_sse2)
 * and an MMX body (the #else branch).
 * Presumably the output is a 5-5-5 bit layout, going by the function name and
 * the SSE2_UNPACK_15_* macros -- TODO confirm against the macro definitions.
 */
310 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
313 /* We got this one from the old arguments */
314 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
315 uint8_t *p_y = p_src->Y_PIXELS;
316 uint8_t *p_u = p_src->U_PIXELS;
317 uint8_t *p_v = p_src->V_PIXELS;
319 vlc_bool_t b_hscale; /* horizontal scaling type */
320 unsigned int i_vscale; /* vertical scaling type */
321 unsigned int i_x, i_y; /* horizontal and vertical indexes */
325 int i_scale_count; /* scale modulo counter */
326 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
327 uint16_t * p_pic_start; /* beginning of the current line for copy */
329 /* Conversion buffer pointer */
330 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
333 /* Offset array pointer */
334 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
337 const int i_source_margin = p_src->p[0].i_pitch
338 - p_src->p[0].i_visible_pitch;
339 const int i_source_margin_c = p_src->p[1].i_pitch
340 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
342 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
344 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
345 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
346 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
347 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
348 p_vout->output.i_width, p_vout->output.i_height,
349 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
355 i_scale_count = ( i_vscale == 1 ) ?
356 p_vout->output.i_height : p_vout->render.i_height;
358 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* The SSE2 body works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 16 */
360 if( p_vout->render.i_width & 15 )
362 i_rewind = 16 - ( p_vout->render.i_width & 15 );
370 ** SSE2 128 bits fetch/store instructions are faster
371 ** if memory access is 16 bytes aligned
374 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned loop is used only when none of the OR-ed pitches and
 * addresses has any of its low 4 bits set */
375 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
378 ((intptr_t)p_buffer))) )
380 /* use faster SSE2 aligned fetch and store */
381 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels */
385 for ( i_x = p_vout->render.i_width/16; i_x--; )
391 SSE2_UNPACK_15_ALIGNED
398 /* Here we do some unaligned reads and duplicate conversions, but
399 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels; this extra group uses the unaligned macros */
403 p_u -= i_rewind >> 1;
404 p_v -= i_rewind >> 1;
405 p_buffer -= i_rewind;
408 SSE2_INIT_16_UNALIGNED
411 SSE2_UNPACK_15_UNALIGNED
418 SCALE_HEIGHT( 420, 2 );
/* Advance p_y past the padding of source plane 0 */
420 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
423 p_u += i_source_margin_c;
424 p_v += i_source_margin_c;
426 p_buffer = b_hscale ? p_buffer_start : p_pic;
431 /* use slower SSE2 unaligned fetch and store */
432 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
435 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 16 pixels */
437 for ( i_x = p_vout->render.i_width/16; i_x--; )
440 SSE2_INIT_16_UNALIGNED
443 SSE2_UNPACK_15_UNALIGNED
450 /* Here we do some unaligned reads and duplicate conversions, but
451 * at least we have all the pixels */
455 p_u -= i_rewind >> 1;
456 p_v -= i_rewind >> 1;
457 p_buffer -= i_rewind;
460 SSE2_INIT_16_UNALIGNED
463 SSE2_UNPACK_15_UNALIGNED
470 SCALE_HEIGHT( 420, 2 );
472 p_y += i_source_margin;
475 p_u += i_source_margin_c;
476 p_v += i_source_margin_c;
478 p_buffer = b_hscale ? p_buffer_start : p_pic;
482 /* make sure all SSE2 stores are visible thereafter */
485 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* The MMX body works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 8 */
487 if( p_vout->render.i_width & 7 )
489 i_rewind = 8 - ( p_vout->render.i_width & 7 );
496 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
499 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
501 for ( i_x = p_vout->render.i_width / 8; i_x--; )
515 /* Here we do some unaligned reads and duplicate conversions, but
516 * at least we have all the pixels */
520 p_u -= i_rewind >> 1;
521 p_v -= i_rewind >> 1;
522 p_buffer -= i_rewind;
536 SCALE_HEIGHT( 420, 2 );
538 p_y += i_source_margin;
541 p_u += i_source_margin_c;
542 p_v += i_source_margin_c;
545 /* re-enable FPU registers */
/*
 * I420_R5G6B5: reads the Y, U and V pixels of p_src and writes 16-bit pixels
 * to p_dest->p->p_pixels. It has an SSE2 body (MODULE_NAME_IS_i420_rgb_sse2)
 * and an MMX body (the #else branch).
 * Presumably the output is a 5-6-5 bit layout, going by the function name and
 * the SSE2_UNPACK_16_* macros -- TODO confirm against the macro definitions.
 */
551 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
554 /* We got this one from the old arguments */
555 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
556 uint8_t *p_y = p_src->Y_PIXELS;
557 uint8_t *p_u = p_src->U_PIXELS;
558 uint8_t *p_v = p_src->V_PIXELS;
560 vlc_bool_t b_hscale; /* horizontal scaling type */
561 unsigned int i_vscale; /* vertical scaling type */
562 unsigned int i_x, i_y; /* horizontal and vertical indexes */
566 int i_scale_count; /* scale modulo counter */
567 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
568 uint16_t * p_pic_start; /* beginning of the current line for copy */
570 /* Conversion buffer pointer */
571 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
574 /* Offset array pointer */
575 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
578 const int i_source_margin = p_src->p[0].i_pitch
579 - p_src->p[0].i_visible_pitch;
580 const int i_source_margin_c = p_src->p[1].i_pitch
581 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
583 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
585 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
586 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
587 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
588 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
589 p_vout->output.i_width, p_vout->output.i_height,
590 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
596 i_scale_count = ( i_vscale == 1 ) ?
597 p_vout->output.i_height : p_vout->render.i_height;
599 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* The SSE2 body works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 16 */
601 if( p_vout->render.i_width & 15 )
603 i_rewind = 16 - ( p_vout->render.i_width & 15 );
611 ** SSE2 128 bits fetch/store instructions are faster
612 ** if memory access is 16 bytes aligned
615 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned loop is used only when none of the OR-ed pitches and
 * addresses has any of its low 4 bits set */
616 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
619 ((intptr_t)p_buffer))) )
621 /* use faster SSE2 aligned fetch and store */
622 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels */
626 for ( i_x = p_vout->render.i_width/16; i_x--; )
632 SSE2_UNPACK_16_ALIGNED
639 /* Here we do some unaligned reads and duplicate conversions, but
640 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels; this extra group uses the unaligned macros */
644 p_u -= i_rewind >> 1;
645 p_v -= i_rewind >> 1;
646 p_buffer -= i_rewind;
649 SSE2_INIT_16_UNALIGNED
652 SSE2_UNPACK_16_UNALIGNED
659 SCALE_HEIGHT( 420, 2 );
/* Advance p_y past the padding of source plane 0 */
661 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
664 p_u += i_source_margin_c;
665 p_v += i_source_margin_c;
667 p_buffer = b_hscale ? p_buffer_start : p_pic;
672 /* use slower SSE2 unaligned fetch and store */
673 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
676 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 16 pixels */
678 for ( i_x = p_vout->render.i_width/16; i_x--; )
681 SSE2_INIT_16_UNALIGNED
684 SSE2_UNPACK_16_UNALIGNED
691 /* Here we do some unaligned reads and duplicate conversions, but
692 * at least we have all the pixels */
696 p_u -= i_rewind >> 1;
697 p_v -= i_rewind >> 1;
698 p_buffer -= i_rewind;
701 SSE2_INIT_16_UNALIGNED
704 SSE2_UNPACK_16_UNALIGNED
711 SCALE_HEIGHT( 420, 2 );
713 p_y += i_source_margin;
716 p_u += i_source_margin_c;
717 p_v += i_source_margin_c;
719 p_buffer = b_hscale ? p_buffer_start : p_pic;
723 /* make sure all SSE2 stores are visible thereafter */
726 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* The MMX body works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 8 */
728 if( p_vout->render.i_width & 7 )
730 i_rewind = 8 - ( p_vout->render.i_width & 7 );
737 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
740 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
742 for ( i_x = p_vout->render.i_width / 8; i_x--; )
756 /* Here we do some unaligned reads and duplicate conversions, but
757 * at least we have all the pixels */
761 p_u -= i_rewind >> 1;
762 p_v -= i_rewind >> 1;
763 p_buffer -= i_rewind;
777 SCALE_HEIGHT( 420, 2 );
779 p_y += i_source_margin;
782 p_u += i_source_margin_c;
783 p_v += i_source_margin_c;
786 /* re-enable FPU registers */
794 /*****************************************************************************
795 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
796 *****************************************************************************
797 * Horizontal alignment needed:
798 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
799 * - output: 1 pixel (4 bytes), margins allowed
800 * Vertical alignment needed:
801 * - input: 2 lines (2 Y lines, 1 U/V line)
803 *****************************************************************************/
805 #if defined (MODULE_NAME_IS_i420_rgb)
/*
 * I420_RGB32: reads the Y, U and V pixels of p_src and writes 32-bit pixels
 * to p_dest->p->p_pixels, using the p_rgb32 table of p_vout->chroma.p_sys.
 * Compiled only when MODULE_NAME_IS_i420_rgb is defined.
 * The CONVERT_* and SCALE_HEIGHT macros are not defined in this part of the
 * file; presumably they come from i420_rgb_c.h -- TODO confirm.
 */
807 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
810 /* We got this one from the old arguments */
811 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
812 uint8_t *p_y = p_src->Y_PIXELS;
813 uint8_t *p_u = p_src->U_PIXELS;
814 uint8_t *p_v = p_src->V_PIXELS;
816 vlc_bool_t b_hscale; /* horizontal scaling type */
817 unsigned int i_vscale; /* vertical scaling type */
818 unsigned int i_x, i_y; /* horizontal and vertical indexes */
822 int i_scale_count; /* scale modulo counter */
823 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
824 uint32_t * p_pic_start; /* beginning of the current line for copy */
825 int i_uval, i_vval; /* U and V samples */
826 int i_red, i_green, i_blue; /* U and V modified samples */
827 uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32;
828 uint32_t * p_ybase; /* Y dependent conversion table */
830 /* Conversion buffer pointer */
831 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
834 /* Offset array pointer */
835 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
838 const int i_source_margin = p_src->p[0].i_pitch
839 - p_src->p[0].i_visible_pitch;
840 const int i_source_margin_c = p_src->p[1].i_pitch
841 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
843 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* When the width is not a multiple of 8, i_rewind is the number of pixels
 * missing to reach the next multiple of 8 */
845 if( p_vout->render.i_width & 7 )
847 i_rewind = 8 - ( p_vout->render.i_width & 7 );
854 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
855 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
856 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
857 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
858 p_vout->output.i_width, p_vout->output.i_height,
859 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
864 i_scale_count = ( i_vscale == 1 ) ?
865 p_vout->output.i_height : p_vout->render.i_height;
866 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Convert into the conversion buffer when b_hscale is set, otherwise
 * directly into the destination picture */
869 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
871 for ( i_x = p_vout->render.i_width / 8; i_x--; )
873 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
874 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
875 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
876 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
879 /* Here we do some unaligned reads and duplicate conversions, but
880 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels before converting one more group of 8 */
884 p_u -= i_rewind >> 1;
885 p_v -= i_rewind >> 1;
886 p_buffer -= i_rewind;
887 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
888 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
889 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
890 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
893 SCALE_HEIGHT( 420, 4 );
/* Advance p_y past the padding of source plane 0 */
895 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
898 p_u += i_source_margin_c;
899 p_v += i_source_margin_c;
904 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
/*
 * I420_A8R8G8B8: variant built when MODULE_NAME_IS_i420_rgb is not defined.
 * Reads the Y, U and V pixels of p_src and writes 32-bit pixels to
 * p_dest->p->p_pixels. It has an SSE2 body (MODULE_NAME_IS_i420_rgb_sse2)
 * and an MMX body (the #else branch).
 * Presumably the component order is A,R,G,B, going by the function name and
 * the SSE2_UNPACK_32_ARGB_* macros -- TODO confirm against the macros.
 */
906 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
909 /* We got this one from the old arguments */
910 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
911 uint8_t *p_y = p_src->Y_PIXELS;
912 uint8_t *p_u = p_src->U_PIXELS;
913 uint8_t *p_v = p_src->V_PIXELS;
915 vlc_bool_t b_hscale; /* horizontal scaling type */
916 unsigned int i_vscale; /* vertical scaling type */
917 unsigned int i_x, i_y; /* horizontal and vertical indexes */
921 int i_scale_count; /* scale modulo counter */
922 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
923 uint32_t * p_pic_start; /* beginning of the current line for copy */
924 /* Conversion buffer pointer */
925 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
928 /* Offset array pointer */
929 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
932 const int i_source_margin = p_src->p[0].i_pitch
933 - p_src->p[0].i_visible_pitch;
934 const int i_source_margin_c = p_src->p[1].i_pitch
935 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
937 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
939 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
940 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
941 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
942 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
943 p_vout->output.i_width, p_vout->output.i_height,
944 &b_hscale, &i_vscale, p_offset_start );
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
949 i_scale_count = ( i_vscale == 1 ) ?
950 p_vout->output.i_height : p_vout->render.i_height;
952 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* The SSE2 body works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 16 */
954 if( p_vout->render.i_width & 15 )
956 i_rewind = 16 - ( p_vout->render.i_width & 15 );
964 ** SSE2 128 bits fetch/store instructions are faster
965 ** if memory access is 16 bytes aligned
968 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned loop is used only when none of the OR-ed pitches and
 * addresses has any of its low 4 bits set */
969 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
972 ((intptr_t)p_buffer))) )
974 /* use faster SSE2 aligned fetch and store */
975 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* One iteration per group of 16 pixels */
979 for ( i_x = p_vout->render.i_width / 16; i_x--; )
985 SSE2_UNPACK_32_ARGB_ALIGNED
993 /* Here we do some unaligned reads and duplicate conversions, but
994 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels; this extra group uses the unaligned macros */
998 p_u -= i_rewind >> 1;
999 p_v -= i_rewind >> 1;
1000 p_buffer -= i_rewind;
1002 SSE2_INIT_32_UNALIGNED
1005 SSE2_UNPACK_32_ARGB_UNALIGNED
1012 SCALE_HEIGHT( 420, 4 );
/* Advance p_y past the padding of source plane 0 */
1014 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
1017 p_u += i_source_margin_c;
1018 p_v += i_source_margin_c;
1020 p_buffer = b_hscale ? p_buffer_start : p_pic;
1025 /* use slower SSE2 unaligned fetch and store */
1026 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1028 p_pic_start = p_pic;
1029 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 16 pixels */
1031 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1034 SSE2_INIT_32_UNALIGNED
1037 SSE2_UNPACK_32_ARGB_UNALIGNED
1045 /* Here we do some unaligned reads and duplicate conversions, but
1046 * at least we have all the pixels */
1050 p_u -= i_rewind >> 1;
1051 p_v -= i_rewind >> 1;
1052 p_buffer -= i_rewind;
1054 SSE2_INIT_32_UNALIGNED
1057 SSE2_UNPACK_32_ARGB_UNALIGNED
1064 SCALE_HEIGHT( 420, 4 );
1066 p_y += i_source_margin;
1069 p_u += i_source_margin_c;
1070 p_v += i_source_margin_c;
1072 p_buffer = b_hscale ? p_buffer_start : p_pic;
1076 /* make sure all SSE2 stores are visible thereafter */
1079 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* The MMX body works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 8 */
1081 if( p_vout->render.i_width & 7 )
1083 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1090 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1092 p_pic_start = p_pic;
1093 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
1095 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1109 /* Here we do some unaligned reads and duplicate conversions, but
1110 * at least we have all the pixels */
1114 p_u -= i_rewind >> 1;
1115 p_v -= i_rewind >> 1;
1116 p_buffer -= i_rewind;
1129 SCALE_HEIGHT( 420, 4 );
1131 p_y += i_source_margin;
1134 p_u += i_source_margin_c;
1135 p_v += i_source_margin_c;
1139 /* re-enable FPU registers */
/*
 * I420_R8G8B8A8: reads the Y, U and V pixels of p_src and writes 32-bit
 * pixels to p_dest->p->p_pixels. It has an SSE2 body
 * (MODULE_NAME_IS_i420_rgb_sse2) and an MMX body (the #else branch).
 * Presumably the component order is R,G,B,A, going by the function name and
 * the SSE2_UNPACK_32_RGBA_* macros -- TODO confirm against the macros.
 */
1145 void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
1148 /* We got this one from the old arguments */
1149 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1150 uint8_t *p_y = p_src->Y_PIXELS;
1151 uint8_t *p_u = p_src->U_PIXELS;
1152 uint8_t *p_v = p_src->V_PIXELS;
1154 vlc_bool_t b_hscale; /* horizontal scaling type */
1155 unsigned int i_vscale; /* vertical scaling type */
1156 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1160 int i_scale_count; /* scale modulo counter */
1161 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1162 uint32_t * p_pic_start; /* beginning of the current line for copy */
1163 /* Conversion buffer pointer */
1164 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1165 uint32_t * p_buffer;
1167 /* Offset array pointer */
1168 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
1171 const int i_source_margin = p_src->p[0].i_pitch
1172 - p_src->p[0].i_visible_pitch;
1173 const int i_source_margin_c = p_src->p[1].i_pitch
1174 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
1176 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1178 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1179 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1180 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1181 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1182 p_vout->output.i_width, p_vout->output.i_height,
1183 &b_hscale, &i_vscale, p_offset_start );
1186 * Perform conversion
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
1188 i_scale_count = ( i_vscale == 1 ) ?
1189 p_vout->output.i_height : p_vout->render.i_height;
1191 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* The SSE2 body works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 16 */
1193 if( p_vout->render.i_width & 15 )
1195 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1203 ** SSE2 128 bits fetch/store instructions are faster
1204 ** if memory access is 16 bytes aligned
1207 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned loop is used only when none of the OR-ed pitches and
 * addresses has any of its low 4 bits set */
1208 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1211 ((intptr_t)p_buffer))) )
1213 /* use faster SSE2 aligned fetch and store */
1214 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1216 p_pic_start = p_pic;
/* One iteration per group of 16 pixels */
1218 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1221 SSE2_INIT_32_ALIGNED
1224 SSE2_UNPACK_32_RGBA_ALIGNED
1232 /* Here we do some unaligned reads and duplicate conversions, but
1233 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels; this extra group uses the unaligned macros */
1237 p_u -= i_rewind >> 1;
1238 p_v -= i_rewind >> 1;
1239 p_buffer -= i_rewind;
1241 SSE2_INIT_32_UNALIGNED
1244 SSE2_UNPACK_32_RGBA_UNALIGNED
1251 SCALE_HEIGHT( 420, 4 );
/* Advance p_y past the padding of source plane 0 */
1253 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
1256 p_u += i_source_margin_c;
1257 p_v += i_source_margin_c;
1259 p_buffer = b_hscale ? p_buffer_start : p_pic;
1264 /* use slower SSE2 unaligned fetch and store */
1265 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1267 p_pic_start = p_pic;
1268 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 16 pixels */
1270 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1273 SSE2_INIT_32_UNALIGNED
1276 SSE2_UNPACK_32_RGBA_UNALIGNED
1284 /* Here we do some unaligned reads and duplicate conversions, but
1285 * at least we have all the pixels */
1289 p_u -= i_rewind >> 1;
1290 p_v -= i_rewind >> 1;
1291 p_buffer -= i_rewind;
1293 SSE2_INIT_32_UNALIGNED
1296 SSE2_UNPACK_32_RGBA_UNALIGNED
1303 SCALE_HEIGHT( 420, 4 );
1305 p_y += i_source_margin;
1308 p_u += i_source_margin_c;
1309 p_v += i_source_margin_c;
1311 p_buffer = b_hscale ? p_buffer_start : p_pic;
1315 /* make sure all SSE2 stores are visible thereafter */
1318 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* The MMX body works on groups of 8 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 8 */
1320 if( p_vout->render.i_width & 7 )
1322 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1329 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1331 p_pic_start = p_pic;
1332 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
1334 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1348 /* Here we do some unaligned reads and duplicate conversions, but
1349 * at least we have all the pixels */
1353 p_u -= i_rewind >> 1;
1354 p_v -= i_rewind >> 1;
1355 p_buffer -= i_rewind;
1368 SCALE_HEIGHT( 420, 4 );
1370 p_y += i_source_margin;
1373 p_u += i_source_margin_c;
1374 p_v += i_source_margin_c;
1378 /* re-enable FPU registers */
/*
 * I420_B8G8R8A8: reads the Y, U and V pixels of p_src and writes 32-bit
 * pixels to p_dest->p->p_pixels. It has an SSE2 body
 * (MODULE_NAME_IS_i420_rgb_sse2) followed by a body working on groups of
 * 8 pixels.
 * Presumably the component order is B,G,R,A, going by the function name and
 * the SSE2_UNPACK_32_BGRA_* macros -- TODO confirm against the macros.
 */
1384 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1387 /* We got this one from the old arguments */
1388 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1389 uint8_t *p_y = p_src->Y_PIXELS;
1390 uint8_t *p_u = p_src->U_PIXELS;
1391 uint8_t *p_v = p_src->V_PIXELS;
1393 vlc_bool_t b_hscale; /* horizontal scaling type */
1394 unsigned int i_vscale; /* vertical scaling type */
1395 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1399 int i_scale_count; /* scale modulo counter */
1400 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1401 uint32_t * p_pic_start; /* beginning of the current line for copy */
1402 /* Conversion buffer pointer */
1403 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1404 uint32_t * p_buffer;
1406 /* Offset array pointer */
1407 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of source planes 0 and 1: pitch minus visible pitch */
1410 const int i_source_margin = p_src->p[0].i_pitch
1411 - p_src->p[0].i_visible_pitch;
1412 const int i_source_margin_c = p_src->p[1].i_pitch
1413 - p_src->p[1].i_visible_pitch;
/* Destination line padding: pitch minus visible pitch */
1415 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1417 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1418 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1419 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1420 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1421 p_vout->output.i_width, p_vout->output.i_height,
1422 &b_hscale, &i_vscale, p_offset_start );
1425 * Perform conversion
/* The scale counter starts at the output height when i_vscale is 1,
 * otherwise at the render height */
1427 i_scale_count = ( i_vscale == 1 ) ?
1428 p_vout->output.i_height : p_vout->render.i_height;
1430 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* The SSE2 body works on groups of 16 pixels: i_rewind is the number of
 * pixels missing to reach the next multiple of 16 */
1432 if( p_vout->render.i_width & 15 )
1434 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1442 ** SSE2 128 bits fetch/store instructions are faster
1443 ** if memory access is 16 bytes aligned
1446 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* The aligned loop is used only when none of the OR-ed pitches and
 * addresses has any of its low 4 bits set */
1447 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1450 ((intptr_t)p_buffer))) )
1452 /* use faster SSE2 aligned fetch and store */
1453 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1455 p_pic_start = p_pic;
/* One iteration per group of 16 pixels */
1457 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1460 SSE2_INIT_32_ALIGNED
1463 SSE2_UNPACK_32_BGRA_ALIGNED
1471 /* Here we do some unaligned reads and duplicate conversions, but
1472 * at least we have all the pixels */
/* Step the chroma pointers back by i_rewind / 2 samples and the output
 * pointer by i_rewind pixels; this extra group uses the unaligned macros */
1476 p_u -= i_rewind >> 1;
1477 p_v -= i_rewind >> 1;
1478 p_buffer -= i_rewind;
1480 SSE2_INIT_32_UNALIGNED
1483 SSE2_UNPACK_32_BGRA_UNALIGNED
1490 SCALE_HEIGHT( 420, 4 );
/* Advance p_y past the padding of source plane 0 */
1492 p_y += i_source_margin;
/* Advance p_u and p_v past the padding of source plane 1 */
1495 p_u += i_source_margin_c;
1496 p_v += i_source_margin_c;
1498 p_buffer = b_hscale ? p_buffer_start : p_pic;
1503 /* use slower SSE2 unaligned fetch and store */
1504 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1506 p_pic_start = p_pic;
1507 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 16 pixels */
1509 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1512 SSE2_INIT_32_UNALIGNED
1515 SSE2_UNPACK_32_BGRA_UNALIGNED
1523 /* Here we do some unaligned reads and duplicate conversions, but
1524 * at least we have all the pixels */
1528 p_u -= i_rewind >> 1;
1529 p_v -= i_rewind >> 1;
1530 p_buffer -= i_rewind;
1532 SSE2_INIT_32_UNALIGNED
1535 SSE2_UNPACK_32_BGRA_UNALIGNED
1542 SCALE_HEIGHT( 420, 4 );
1544 p_y += i_source_margin;
1547 p_u += i_source_margin_c;
1548 p_v += i_source_margin_c;
1550 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* From here on the code works on groups of 8 pixels: i_rewind is the number
 * of pixels missing to reach the next multiple of 8 */
1556 if( p_vout->render.i_width & 7 )
1558 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1565 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1567 p_pic_start = p_pic;
1568 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per group of 8 pixels */
1570 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1584 /* Here we do some unaligned reads and duplicate conversions, but
1585 * at least we have all the pixels */
1589 p_u -= i_rewind >> 1;
1590 p_v -= i_rewind >> 1;
1591 p_buffer -= i_rewind;
1604 SCALE_HEIGHT( 420, 4 );
1606 p_y += i_source_margin;
1609 p_u += i_source_margin_c;
1610 p_v += i_source_margin_c;
1614 /* re-enable FPU registers */
/*****************************************************************************
 * I420_A8B8G8R8: convert an I420 (YUV 4:2:0) picture to 32 bits per pixel
 *****************************************************************************
 * Reads the Y, U and V planes of p_src and writes 32-bit pixels through
 * p_dest->p->p_pixels, or through the intermediate conversion buffer when
 * horizontal scaling is active (b_hscale). The channel order of the output
 * is whatever the SSE2_UNPACK_32_ABGR_* macros produce on the SSE2 path.
 * NOTE(review): the remainder of the parameter list (p_dest) and several
 * structural lines (braces, #else/#endif) are not visible in this extract;
 * confirm against the complete file before relying on these notes.
 *****************************************************************************/
1620 void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
1623 /* We got this one from the old arguments */
1624 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1625 uint8_t *p_y = p_src->Y_PIXELS;
1626 uint8_t *p_u = p_src->U_PIXELS;
1627 uint8_t *p_v = p_src->V_PIXELS;
1629 vlc_bool_t b_hscale; /* horizontal scaling type */
1630 unsigned int i_vscale; /* vertical scaling type */
1631 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1635 int i_scale_count; /* scale modulo counter */
1636 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1637 uint32_t * p_pic_start; /* beginning of the current line for copy */
1638 /* Conversion buffer pointer */
1639 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1640 uint32_t * p_buffer;
1642 /* Offset array pointer */
1643 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding of the source planes: pitch minus visible pitch, computed
 * separately for plane 0 (added to p_y after each row) and plane 1 (added to
 * p_u and p_v after each row). */
1646 const int i_source_margin = p_src->p[0].i_pitch
1647 - p_src->p[0].i_visible_pitch;
1648 const int i_source_margin_c = p_src->p[1].i_pitch
1649 - p_src->p[1].i_visible_pitch;
/* Same pitch-minus-visible-pitch computation for the destination plane. */
1651 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1653 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1654 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1655 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
/* SetOffset fills b_hscale, i_vscale and the offset array from the render
 * (source) and output (destination) dimensions. */
1656 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1657 p_vout->output.i_width, p_vout->output.i_height,
1658 &b_hscale, &i_vscale, p_offset_start );
1661 * Perform conversion
/* The modulo counter starts from the output height when i_vscale == 1 and
 * from the render height otherwise. */
1663 i_scale_count = ( i_vscale == 1 ) ?
1664 p_vout->output.i_height : p_vout->render.i_height;
1666 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 build: pixels are processed 16 at a time. When the width is not a
 * multiple of 16, i_rewind is the number of pixels by which the pointers are
 * stepped back so that one extra 16-pixel pass covers the end of the row. */
1668 if( p_vout->render.i_width & 15 )
1670 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1678 ** SSE2 128 bits fetch/store instructions are faster
1679 ** if memory access is 16 bytes aligned
/* The aligned path is taken only when the low four bits of the OR-ed values
 * are all zero. The Y pitch and the destination buffer address are visible
 * here; the other operands of the OR are not shown in this extract. */
1682 p_buffer = b_hscale ? p_buffer_start : p_pic;
1683 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1686 ((intptr_t)p_buffer))) )
1688 /* use faster SSE2 aligned fetch and store */
1689 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1691 p_pic_start = p_pic;
/* One iteration per full group of 16 pixels in the row. */
1693 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1696 SSE2_INIT_32_ALIGNED
1699 SSE2_UNPACK_32_ABGR_ALIGNED
1707 /* Here we do some unaligned reads and duplicate conversions, but
1708 * at least we have all the pixels */
/* Chroma pointers move back by half of i_rewind (chroma width is half the
 * render width, see i_chroma_width); the output pointer moves back by the
 * full amount. The tail pass then uses the unaligned macros. */
1712 p_u -= i_rewind >> 1;
1713 p_v -= i_rewind >> 1;
1714 p_buffer -= i_rewind;
1716 SSE2_INIT_32_UNALIGNED
1719 SSE2_UNPACK_32_ABGR_UNALIGNED
/* SCALE_HEIGHT is a macro defined outside this extract; presumably it applies
 * the vertical scaling selected by i_vscale - TODO confirm. */
1726 SCALE_HEIGHT( 420, 4 );
/* Skip the source padding at the end of the row. */
1728 p_y += i_source_margin;
1731 p_u += i_source_margin_c;
1732 p_v += i_source_margin_c;
1734 p_buffer = b_hscale ? p_buffer_start : p_pic;
1739 /* use slower SSE2 unaligned fetch and store */
1740 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1742 p_pic_start = p_pic;
1743 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Same row structure as the aligned loop, using the unaligned macros for
 * both the main groups and the tail pass. */
1745 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1748 SSE2_INIT_32_UNALIGNED
1751 SSE2_UNPACK_32_ABGR_UNALIGNED
1759 /* Here we do some unaligned reads and duplicate conversions, but
1760 * at least we have all the pixels */
1764 p_u -= i_rewind >> 1;
1765 p_v -= i_rewind >> 1;
1766 p_buffer -= i_rewind;
1768 SSE2_INIT_32_UNALIGNED
1771 SSE2_UNPACK_32_ABGR_UNALIGNED
1778 SCALE_HEIGHT( 420, 4 );
1780 p_y += i_source_margin;
1783 p_u += i_source_margin_c;
1784 p_v += i_source_margin_c;
1786 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* NOTE(review): presumably the alternative (non-SSE2) branch of the #if
 * above; the preprocessor line separating the branches is not visible here.
 * This path works on groups of 8 pixels, so i_rewind is computed against a
 * multiple of 8 instead of 16. */
1792 if( p_vout->render.i_width & 7 )
1794 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1801 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1803 p_pic_start = p_pic;
1804 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* One iteration per full group of 8 pixels; the per-group conversion
 * statements are not visible in this extract. */
1806 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1820 /* Here we do some unaligned reads and duplicate conversions, but
1821 * at least we have all the pixels */
1825 p_u -= i_rewind >> 1;
1826 p_v -= i_rewind >> 1;
1827 p_buffer -= i_rewind;
1840 SCALE_HEIGHT( 420, 4 );
1842 p_y += i_source_margin;
1845 p_u += i_source_margin_c;
1846 p_v += i_source_margin_c;
1850 /* re-enable FPU registers */
1858 /* Following functions are local */
1860 /*****************************************************************************
1861 * SetOffset: build offset array for conversion functions
1862 *****************************************************************************
1863 * This function will build an offset array used in later conversion functions.
1864 * It will also set horizontal and vertical scaling indicators.
1865 *****************************************************************************/
1866 static void SetOffset( int i_width, int i_height, int i_pic_width,
1867 int i_pic_height, vlc_bool_t *pb_hscale,
1868 unsigned int *pi_vscale, int *p_offset )
1870 int i_x; /* x position in destination */
1871 int i_scale_count; /* modulo counter */
1874 * Prepare horizontal offset array
1876 if( i_pic_width - i_width == 0 )
1878 /* No horizontal scaling: YUV conversion is done directly to picture */
1881 else if( i_pic_width - i_width > 0 )
1883 /* Prepare scaling array for horizontal extension */
1885 i_scale_count = i_pic_width;
1886 for( i_x = i_width; i_x--; )
1888 while( (i_scale_count -= i_width) > 0 )
1893 i_scale_count += i_pic_width;
1896 else /* if( i_pic_width - i_width < 0 ) */
1898 /* Prepare scaling array for horizontal reduction */
1900 i_scale_count = i_width;
1901 for( i_x = i_pic_width; i_x--; )
1904 while( (i_scale_count -= i_pic_width) > 0 )
1909 i_scale_count += i_width;
1914 * Set vertical scaling indicator
1916 if( i_pic_height - i_height == 0 )
1920 else if( i_pic_height - i_height > 0 )
1924 else /* if( i_pic_height - i_height < 0 ) */