/*****************************************************************************
 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000 the VideoLAN team
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damienf@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <string.h>                                            /* strerror() */
#include <stdlib.h>                                      /* malloc(), free() */

#include <vlc/vlc.h>
#include <vlc/vout.h>

#include "i420_rgb.h"
#if defined (MODULE_NAME_IS_i420_rgb)
#   include "i420_rgb_c.h"
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
#   if defined(HAVE_MMX_INTRINSICS)
#       include <mmintrin.h>
#   endif
#   include "i420_rgb_mmx.h"
#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
#   if defined(HAVE_SSE2_INTRINSICS)
#       include <emmintrin.h>
#   endif
#   include "i420_rgb_mmx.h"
#endif

static void SetOffset( int, int, int, int, vlc_bool_t *,
                       unsigned int *, int * );

#if defined (MODULE_NAME_IS_i420_rgb)
/*****************************************************************************
 * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (2 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/
void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;
    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
    unsigned int i_real_y;                                          /* y % 4 */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
    uint16_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    /* The dithering matrices */
    int dither10[4] = { 0x0, 0x8, 0x2, 0xa };
    int dither11[4] = { 0xc, 0x4, 0xe, 0x6 };
    int dither12[4] = { 0x3, 0xb, 0x1, 0x9 };
    int dither13[4] = { 0xf, 0x7, 0xd, 0x5 };
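    /* These four rows form the classic 4x4 Bayer ordered-dither matrix
     * (thresholds 0..15): a threshold is added to each sample before its
     * low-order bits are truncated, spreading the quantization error over
     * a 4x4 tile instead of producing visible banding. */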

    for(i_x = 0; i_x < 4; i_x++)
    {
        dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
        dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
    }

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }
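    /* Example: a 70-pixel-wide picture leaves 70 & 7 == 6 trailing pixels,
     * so i_rewind == 2: the last 8-pixel stretch is converted after
     * stepping the pointers 2 pixels back, re-converting those 2 pixels. */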

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );
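    /* For instance, rendering a 720x576 source onto a 360x288 output makes
     * SetOffset select horizontal and vertical reduction, while identical
     * render and output sizes leave both scalers disabled, so conversion
     * writes straight into the destination picture. */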

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        i_real_y = i_y & 0x3;
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            int *p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
        }
        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            int *p_dither = dither10;
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither10;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither11;
            CONVERT_Y_PIXEL_DITHER(2);
            p_dither = dither12;
            CONVERT_YUV_PIXEL_DITHER(2);
            p_dither = dither13;
            CONVERT_Y_PIXEL_DITHER(2);
        }
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}
#endif

/*****************************************************************************
 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (2 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/

#if defined (MODULE_NAME_IS_i420_rgb)

void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
                     picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
    uint16_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
            CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
        }
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
                      picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
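    /* The test below ORs the source and destination pitches together with
     * their base addresses: only if none of the low four bits is set can
     * every line be processed with 16-byte aligned accesses. */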
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((int)p_src->p[Y_PLANE].p_pixels)|
                    ((int)p_dest->p->p_pixels))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
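        /* (the aligned unpack macros use non-temporal movntdq stores that
         * bypass the cache, which is why the fence is needed before the
         * picture is read back) */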
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_15_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_15
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;
            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_15
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_15
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_15
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
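    /* (MMX registers alias the x87 FPU stack, so emms must execute before
     * any floating-point code runs again) */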
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
                      picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint16_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
    uint16_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((int)p_src->p[Y_PLANE].p_pixels)|
                    ((int)p_dest->p->p_pixels))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width/16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;

#if defined (CAN_COMPILE_SSE2)
                __asm__( ".p2align 3"
                         SSE2_INIT_16_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_16_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_16_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 2 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_16
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;
            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_16
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;

#if defined (CAN_COMPILE_MMX)
            __asm__( ".p2align 3"
                     MMX_INIT_16
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_16
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_16
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_16
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_HEIGHT( 420, 2 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

#endif

/*****************************************************************************
 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
 *****************************************************************************
 * Horizontal alignment needed:
 *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 *  - output: 1 pixel (4 bytes), margins allowed
 * Vertical alignment needed:
 *  - input: 2 lines (2 Y lines, 1 U/V line)
 *  - output: 1 line
 *****************************************************************************/

#if defined (MODULE_NAME_IS_i420_rgb)

void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
                     picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    int         i_uval, i_vval;                           /* U and V samples */
    int         i_red, i_green, i_blue;          /* U and V modified samples */
    uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
    uint32_t *  p_ybase;                     /* Y dependent conversion table */

    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;
    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
        }
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }
}

#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)

void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
                        picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((int)p_src->p[Y_PLANE].p_pixels)|
                    ((int)p_dest->p->p_pixels))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */

                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_ARGB_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */

                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( MMX_INIT_32
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );

            __asm__( ".p2align 3"
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_ARGB
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX C intrinsics wrappers */
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_ARGB
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( ".p2align 3"
                     MMX_INIT_32
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_ARGB
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX intrinsics wrappers */

            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_ARGB
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
                        picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    p_buffer = b_hscale ? p_buffer_start : p_pic;
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((int)p_src->p[Y_PLANE].p_pixels)|
                    ((int)p_dest->p->p_pixels))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_ALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_ALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_ALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */

                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
        /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2)
        __asm__ __volatile__ ( "sfence" );
#else
        _mm_sfence();
#endif
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 C intrinsics wrappers */
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
#if defined (CAN_COMPILE_SSE2)
                /* use inline SSE2 assembly */
                __asm__( ".p2align 3"
                         SSE2_INIT_32_UNALIGNED
                         SSE2_YUV_MUL
                         SSE2_YUV_ADD
                         SSE2_UNPACK_32_BGRA_UNALIGNED
                         : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
#else
                /* otherwise use SSE2 intrinsics wrappers */

                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

                SSE2_INTRINSICS_INIT_32_UNALIGNED
                SSE2_INTRINSICS_YUV_MUL
                SSE2_INTRINSICS_YUV_ADD
                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
#endif
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( MMX_INIT_32
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );

            __asm__( ".p2align 3"
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_BGRA
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX C intrinsics wrappers */
            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_BGRA
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
#if defined (CAN_COMPILE_MMX)
            /* use inline MMX assembly */
            __asm__( ".p2align 3"
                     MMX_INIT_32
                     MMX_YUV_MUL
                     MMX_YUV_ADD
                     MMX_UNPACK_32_BGRA
                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
#else
            /* otherwise use MMX intrinsics wrappers */

            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
            uint64_t tmp64;

            MMX_INTRINSICS_INIT_32
            MMX_INTRINSICS_YUV_MUL
            MMX_INTRINSICS_YUV_ADD
            MMX_INTRINSICS_UNPACK_32_BGRA
#endif

            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
#if defined (CAN_COMPILE_MMX)
    __asm__ __volatile__ ( "emms" );
#else
    _mm_empty();
#endif

#endif
}

#endif

/* Following functions are local */

/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * This function will build an offset array used in later conversion functions.
 * It will also set horizontal and vertical scaling indicators.
 *****************************************************************************/
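/* Illustrative example (assuming the copy-then-advance semantics of the
 * SCALE_WIDTH macros): stretching a 4-pixel line to 6 pixels yields the
 * offsets {0,1,1,0,1,1}; each output pixel copies the current source pixel,
 * then the source pointer advances by the offset, so source pixels 0 and 2
 * are duplicated. */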
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, vlc_bool_t *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    int i_x;                                    /* x position in destination */
    int i_scale_count;                                     /* modulo counter */

    /*
     * Prepare horizontal offset array
     */
    if( i_pic_width - i_width == 0 )
    {
        /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = 0;
    }
    else if( i_pic_width - i_width > 0 )
    {
        /* Prepare scaling array for horizontal extension */
        *pb_hscale = 1;
        i_scale_count = i_pic_width;
        for( i_x = i_width; i_x--; )
        {
            while( (i_scale_count -= i_width) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else /* if( i_pic_width - i_width < 0 ) */
    {
        /* Prepare scaling array for horizontal reduction */
        *pb_hscale = 1;
        i_scale_count = i_width;
        for( i_x = i_pic_width; i_x--; )
        {
            *p_offset = 0;
            while( (i_scale_count -= i_pic_width) > 0 )
            {
                *p_offset += 1;
            }
            p_offset++;
            i_scale_count += i_width;
        }
    }

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height - i_height == 0 )
    {
        *pi_vscale = 0;
    }
    else if( i_pic_height - i_height > 0 )
    {
        *pi_vscale = 1;
    }
    else /* if( i_pic_height - i_height < 0 ) */
    {
        *pi_vscale = -1;
    }
}