1 /*****************************************************************************
2 * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <string.h> /* strerror() */
29 #include <stdlib.h> /* malloc(), free() */
35 #if defined (MODULE_NAME_IS_i420_rgb)
36 # include "i420_rgb_c.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
38 # if defined(HAVE_MMX_INTRINSICS)
39 # include <mmintrin.h>
41 # include "i420_rgb_mmx.h"
42 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
43 # if defined(HAVE_SSE2_INTRINSICS)
44 # include <emmintrin.h>
46 # include "i420_rgb_mmx.h"
49 static void SetOffset( int, int, int, int, vlc_bool_t *,
50 unsigned int *, int * );
52 #if defined (MODULE_NAME_IS_i420_rgb)
53 /*****************************************************************************
54 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp with dithering
55 *****************************************************************************
56 * Horizontal alignment needed:
57 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
58 * - output: 1 pixel (2 bytes), margins allowed
59 * Vertical alignment needed:
60 * - input: 2 lines (2 Y lines, 1 U/V line)
62 *****************************************************************************/
63 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert a planar YUV 4:2:0 picture to 16 bpp RGB with 4x4 ordered
 * (Bayer-pattern) dithering — plain C version.  Eight output pixels
 * (four shared chroma samples) are produced per inner-loop iteration.
 * NOTE(review): this extract omits some original lines (braces and the
 * declarations of p_buffer / i_right_margin / i_rewind); annotations
 * below describe only the visible logic. */
66 /* We got this one from the old arguments */
67 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
68 uint8_t *p_y = p_src->Y_PIXELS;
69 uint8_t *p_u = p_src->U_PIXELS;
70 uint8_t *p_v = p_src->V_PIXELS;
72 vlc_bool_t b_hscale; /* horizontal scaling type */
73 unsigned int i_vscale; /* vertical scaling type */
74 unsigned int i_x, i_y; /* horizontal and vertical indexes */
75 unsigned int i_real_y; /* y % 4 */
79 int i_scale_count; /* scale modulo counter */
80 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
81 uint16_t * p_pic_start; /* beginning of the current line for copy */
82 int i_uval, i_vval; /* U and V samples */
83 int i_red, i_green, i_blue; /* U and V modified samples */
84 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
85 uint16_t * p_ybase; /* Y dependant conversion table */
87 /* Conversion buffer pointer */
88 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
91 /* Offset array pointer */
92 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding bytes between visible pixels and the pitch end,
 * for the luma plane and the (half-width) chroma planes. */
95 const int i_source_margin = p_src->p[0].i_pitch
96 - p_src->p[0].i_visible_pitch;
97 const int i_source_margin_c = p_src->p[1].i_pitch
98 - p_src->p[1].i_visible_pitch;
100 /* The dithering matrices */
101 int dither10[4] = { 0x0, 0x8, 0x2, 0xa };
102 int dither11[4] = { 0xc, 0x4, 0xe, 0x6 };
103 int dither12[4] = { 0x3, 0xb, 0x1, 0x9 };
104 int dither13[4] = { 0xf, 0x7, 0xd, 0x5 };
/* Pre-scale the dither amplitude to the output's red-channel depth
 * (SHIFT - 4 + i_rrshift bits), so the noise matches quantization. */
106 for(i_x = 0; i_x < 4; i_x++)
108 dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
109 dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
110 dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
111 dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
114 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Width not a multiple of 8: the final group is produced by rewinding
 * and re-converting a few overlapping input pixels (see below). */
116 if( p_vout->render.i_width & 7 )
118 i_rewind = 8 - ( p_vout->render.i_width & 7 );
125 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
126 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
127 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
128 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
129 p_vout->output.i_width, p_vout->output.i_height,
130 &b_hscale, &i_vscale, p_offset_start );
135 i_scale_count = ( i_vscale == 1 ) ?
136 p_vout->output.i_height : p_vout->render.i_height;
137 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
139 i_real_y = i_y & 0x3; /* selects the dither matrix row for this line */
141 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Main loop: full YUV lookup alternated with a cheaper Y-only lookup,
 * since each chroma sample is shared by a horizontal pixel pair. */
143 for ( i_x = p_vout->render.i_width / 8; i_x--; )
145 int *p_dither = dither10;
146 CONVERT_YUV_PIXEL_DITHER(2);
148 CONVERT_Y_PIXEL_DITHER(2);
150 CONVERT_YUV_PIXEL_DITHER(2);
152 CONVERT_Y_PIXEL_DITHER(2);
154 CONVERT_YUV_PIXEL_DITHER(2);
156 CONVERT_Y_PIXEL_DITHER(2);
158 CONVERT_YUV_PIXEL_DITHER(2);
160 CONVERT_Y_PIXEL_DITHER(2);
163 /* Here we do some unaligned reads and duplicate conversions, but
164 * at least we have all the pixels */
167 int *p_dither = dither10;
169 p_u -= i_rewind >> 1;
170 p_v -= i_rewind >> 1;
171 p_buffer -= i_rewind;
172 CONVERT_YUV_PIXEL_DITHER(2);
174 CONVERT_Y_PIXEL_DITHER(2);
176 CONVERT_YUV_PIXEL_DITHER(2);
178 CONVERT_Y_PIXEL_DITHER(2);
180 CONVERT_YUV_PIXEL_DITHER(2);
182 CONVERT_Y_PIXEL_DITHER(2);
184 CONVERT_YUV_PIXEL_DITHER(2);
186 CONVERT_Y_PIXEL_DITHER(2);
189 SCALE_HEIGHT( 420, 2 ); /* vertical scale / copy of the finished line */
191 p_y += i_source_margin;
/* Chroma advances only every other line in 4:2:0 — presumably the
 * elided lines guard these with "if( i_y % 2 )"; confirm upstream. */
194 p_u += i_source_margin_c;
195 p_v += i_source_margin_c;
201 /*****************************************************************************
202 * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
203 *****************************************************************************
204 * Horizontal alignment needed:
205 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
206 * - output: 1 pixel (2 bytes), margins allowed
207 * Vertical alignment needed:
208 * - input: 2 lines (2 Y lines, 1 U/V line)
210 *****************************************************************************/
212 #if defined (MODULE_NAME_IS_i420_rgb)
214 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert planar YUV 4:2:0 to 16 bpp RGB, plain C version without
 * dithering.  Uses the precomputed p_rgb16 lookup table; 8 pixels per
 * inner-loop iteration.  NOTE(review): extract omits lines (braces and
 * declarations of p_buffer / i_right_margin / i_rewind). */
217 /* We got this one from the old arguments */
218 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
219 uint8_t *p_y = p_src->Y_PIXELS;
220 uint8_t *p_u = p_src->U_PIXELS;
221 uint8_t *p_v = p_src->V_PIXELS;
223 vlc_bool_t b_hscale; /* horizontal scaling type */
224 unsigned int i_vscale; /* vertical scaling type */
225 unsigned int i_x, i_y; /* horizontal and vertical indexes */
229 int i_scale_count; /* scale modulo counter */
230 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
231 uint16_t * p_pic_start; /* beginning of the current line for copy */
232 int i_uval, i_vval; /* U and V samples */
233 int i_red, i_green, i_blue; /* U and V modified samples */
234 uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16;
235 uint16_t * p_ybase; /* Y dependant conversion table */
237 /* Conversion buffer pointer */
238 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
241 /* Offset array pointer */
242 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Pitch padding past the visible pixels, luma / chroma planes. */
245 const int i_source_margin = p_src->p[0].i_pitch
246 - p_src->p[0].i_visible_pitch;
247 const int i_source_margin_c = p_src->p[1].i_pitch
248 - p_src->p[1].i_visible_pitch;
250 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Non-multiple-of-8 width: last group redone with overlapping input. */
252 if( p_vout->render.i_width & 7 )
254 i_rewind = 8 - ( p_vout->render.i_width & 7 );
261 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
262 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
263 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
264 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
265 p_vout->output.i_width, p_vout->output.i_height,
266 &b_hscale, &i_vscale, p_offset_start );
271 i_scale_count = ( i_vscale == 1 ) ?
272 p_vout->output.i_height : p_vout->render.i_height;
273 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
/* Convert into the temp buffer only when horizontal scaling is on;
 * otherwise write straight into the destination picture. */
276 p_buffer = b_hscale ? p_buffer_start : p_pic;
278 for ( i_x = p_vout->render.i_width / 8; i_x--; )
280 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
281 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
282 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
283 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
286 /* Here we do some unaligned reads and duplicate conversions, but
287 * at least we have all the pixels */
291 p_u -= i_rewind >> 1;
292 p_v -= i_rewind >> 1;
293 p_buffer -= i_rewind;
295 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
296 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
297 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
298 CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
301 SCALE_HEIGHT( 420, 2 ); /* vertical scale / copy of the finished line */
303 p_y += i_source_margin;
/* Chroma advances every other line in 4:2:0 (guard elided here). */
306 p_u += i_source_margin_c;
307 p_v += i_source_margin_c;
312 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
314 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert planar YUV 4:2:0 to RGB 15 bpp (5-5-5), SIMD version.
 * SSE2 build: 16 pixels per iteration, with an aligned fast path when
 * all pitches and base pointers are 16-byte aligned.  MMX build: 8
 * pixels per iteration.  NOTE(review): extract omits lines (#endif,
 * braces, asm macro bodies, declarations of p_buffer / i_rewind /
 * i_right_margin); annotations describe only the visible logic. */
317 /* We got this one from the old arguments */
318 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
319 uint8_t *p_y = p_src->Y_PIXELS;
320 uint8_t *p_u = p_src->U_PIXELS;
321 uint8_t *p_v = p_src->V_PIXELS;
323 vlc_bool_t b_hscale; /* horizontal scaling type */
324 unsigned int i_vscale; /* vertical scaling type */
325 unsigned int i_x, i_y; /* horizontal and vertical indexes */
329 int i_scale_count; /* scale modulo counter */
330 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
331 uint16_t * p_pic_start; /* beginning of the current line for copy */
333 /* Conversion buffer pointer */
334 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
337 /* Offset array pointer */
338 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Pitch padding past the visible pixels, luma / chroma planes. */
341 const int i_source_margin = p_src->p[0].i_pitch
342 - p_src->p[0].i_visible_pitch;
343 const int i_source_margin_c = p_src->p[1].i_pitch
344 - p_src->p[1].i_visible_pitch;
346 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
348 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
349 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
350 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
351 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
352 p_vout->output.i_width, p_vout->output.i_height,
353 &b_hscale, &i_vscale, p_offset_start );
359 i_scale_count = ( i_vscale == 1 ) ?
360 p_vout->output.i_height : p_vout->render.i_height;
362 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 path works on 16-pixel groups; rewind for the remainder. */
364 if( p_vout->render.i_width & 15 )
366 i_rewind = 16 - ( p_vout->render.i_width & 15 );
374 ** SSE2 128 bits fetch/store instructions are faster
375 ** if memory access is 16 bytes aligned
378 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path requires every pitch AND base pointer involved to
 * be 16-byte aligned (condition continues on elided lines). */
379 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
384 /* use faster SSE2 aligned fetch and store */
385 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
389 for ( i_x = p_vout->render.i_width/16; i_x--; )
391 #if defined (CAN_COMPILE_SSE2)
392 __asm__( ".p2align 3"
396 SSE2_UNPACK_15_ALIGNED
397 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
399 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
400 SSE2_INTRINSICS_INIT_16_ALIGNED
401 SSE2_INTRINSICS_YUV_MUL
402 SSE2_INTRINSICS_YUV_ADD
403 SSE2_INTRINSICS_UNPACK_15_ALIGNED
410 /* Here we do some unaligned reads and duplicate conversions, but
411 * at least we have all the pixels */
415 p_u -= i_rewind >> 1;
416 p_v -= i_rewind >> 1;
417 p_buffer -= i_rewind;
419 #if defined (CAN_COMPILE_SSE2)
420 __asm__( ".p2align 3"
421 SSE2_INIT_16_UNALIGNED
424 SSE2_UNPACK_15_UNALIGNED
425 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
428 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
430 SSE2_INTRINSICS_INIT_16_UNALIGNED
431 SSE2_INTRINSICS_YUV_MUL
432 SSE2_INTRINSICS_YUV_ADD
433 SSE2_INTRINSICS_UNPACK_15_UNALIGNED
441 SCALE_HEIGHT( 420, 2 );
443 p_y += i_source_margin;
446 p_u += i_source_margin_c;
447 p_v += i_source_margin_c;
449 p_buffer = b_hscale ? p_buffer_start : p_pic;
454 /* use slower SSE2 unaligned fetch and store */
455 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
458 p_buffer = b_hscale ? p_buffer_start : p_pic;
460 for ( i_x = p_vout->render.i_width/16; i_x--; )
462 #if defined (CAN_COMPILE_SSE2)
463 __asm__( ".p2align 3"
464 SSE2_INIT_16_UNALIGNED
467 SSE2_UNPACK_15_UNALIGNED
468 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
470 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
471 SSE2_INTRINSICS_INIT_16_UNALIGNED
472 SSE2_INTRINSICS_YUV_MUL
473 SSE2_INTRINSICS_YUV_ADD
474 SSE2_INTRINSICS_UNPACK_15_UNALIGNED
481 /* Here we do some unaligned reads and duplicate conversions, but
482 * at least we have all the pixels */
486 p_u -= i_rewind >> 1;
487 p_v -= i_rewind >> 1;
488 p_buffer -= i_rewind;
490 #if defined (CAN_COMPILE_SSE2)
491 __asm__( ".p2align 3"
492 SSE2_INIT_16_UNALIGNED
495 SSE2_UNPACK_15_UNALIGNED
496 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
499 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
501 SSE2_INTRINSICS_INIT_16_UNALIGNED
502 SSE2_INTRINSICS_YUV_MUL
503 SSE2_INTRINSICS_YUV_ADD
504 SSE2_INTRINSICS_UNPACK_15_UNALIGNED
512 SCALE_HEIGHT( 420, 2 );
514 p_y += i_source_margin;
517 p_u += i_source_margin_c;
518 p_v += i_source_margin_c;
520 p_buffer = b_hscale ? p_buffer_start : p_pic;
524 /* make sure all SSE2 stores are visible thereafter */
525 #if defined (CAN_COMPILE_SSE2)
526 __asm__ __volatile__ ( "sfence" ::: "memory" );
531 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX path works on 8-pixel groups; rewind for the remainder. */
533 if( p_vout->render.i_width & 7 )
535 i_rewind = 8 - ( p_vout->render.i_width & 7 );
542 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
545 p_buffer = b_hscale ? p_buffer_start : p_pic;
547 for ( i_x = p_vout->render.i_width / 8; i_x--; )
549 #if defined (CAN_COMPILE_MMX)
550 __asm__( ".p2align 3"
555 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
557 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
559 MMX_INTRINSICS_INIT_16
560 MMX_INTRINSICS_YUV_MUL
561 MMX_INTRINSICS_YUV_ADD
562 MMX_INTRINSICS_UNPACK_15
571 /* Here we do some unaligned reads and duplicate conversions, but
572 * at least we have all the pixels */
576 p_u -= i_rewind >> 1;
577 p_v -= i_rewind >> 1;
578 p_buffer -= i_rewind;
580 #if defined (CAN_COMPILE_MMX)
581 __asm__( ".p2align 3"
586 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
589 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
592 MMX_INTRINSICS_INIT_16
593 MMX_INTRINSICS_YUV_MUL
594 MMX_INTRINSICS_YUV_ADD
595 MMX_INTRINSICS_UNPACK_15
604 SCALE_HEIGHT( 420, 2 );
606 p_y += i_source_margin;
609 p_u += i_source_margin_c;
610 p_v += i_source_margin_c;
613 /* re-enable FPU registers */
614 #if defined (CAN_COMPILE_MMX)
615 __asm__ __volatile__ ( "emms" );
623 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert planar YUV 4:2:0 to RGB 16 bpp (5-6-5), SIMD version.
 * Identical structure to I420_R5G5B5 above, but packs with the
 * *_UNPACK_16 macros (6-bit green) instead of *_UNPACK_15.
 * NOTE(review): extract omits lines (#endif, braces, declarations of
 * p_buffer / i_rewind / i_right_margin). */
626 /* We got this one from the old arguments */
627 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
628 uint8_t *p_y = p_src->Y_PIXELS;
629 uint8_t *p_u = p_src->U_PIXELS;
630 uint8_t *p_v = p_src->V_PIXELS;
632 vlc_bool_t b_hscale; /* horizontal scaling type */
633 unsigned int i_vscale; /* vertical scaling type */
634 unsigned int i_x, i_y; /* horizontal and vertical indexes */
638 int i_scale_count; /* scale modulo counter */
639 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
640 uint16_t * p_pic_start; /* beginning of the current line for copy */
642 /* Conversion buffer pointer */
643 uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
646 /* Offset array pointer */
647 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Pitch padding past the visible pixels, luma / chroma planes. */
650 const int i_source_margin = p_src->p[0].i_pitch
651 - p_src->p[0].i_visible_pitch;
652 const int i_source_margin_c = p_src->p[1].i_pitch
653 - p_src->p[1].i_visible_pitch;
655 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
657 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
658 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
659 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
660 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
661 p_vout->output.i_width, p_vout->output.i_height,
662 &b_hscale, &i_vscale, p_offset_start );
668 i_scale_count = ( i_vscale == 1 ) ?
669 p_vout->output.i_height : p_vout->render.i_height;
671 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 path works on 16-pixel groups; rewind for the remainder. */
673 if( p_vout->render.i_width & 15 )
675 i_rewind = 16 - ( p_vout->render.i_width & 15 );
683 ** SSE2 128 bits fetch/store instructions are faster
684 ** if memory access is 16 bytes aligned
687 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path requires 16-byte alignment of all pitches and
 * base pointers (condition continues on elided lines). */
688 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
693 /* use faster SSE2 aligned fetch and store */
694 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
698 for ( i_x = p_vout->render.i_width/16; i_x--; )
700 #if defined (CAN_COMPILE_SSE2)
701 __asm__( ".p2align 3"
705 SSE2_UNPACK_16_ALIGNED
706 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
708 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
709 SSE2_INTRINSICS_INIT_16_ALIGNED
710 SSE2_INTRINSICS_YUV_MUL
711 SSE2_INTRINSICS_YUV_ADD
712 SSE2_INTRINSICS_UNPACK_16_ALIGNED
719 /* Here we do some unaligned reads and duplicate conversions, but
720 * at least we have all the pixels */
724 p_u -= i_rewind >> 1;
725 p_v -= i_rewind >> 1;
726 p_buffer -= i_rewind;
728 #if defined (CAN_COMPILE_SSE2)
729 __asm__( ".p2align 3"
730 SSE2_INIT_16_UNALIGNED
733 SSE2_UNPACK_16_UNALIGNED
734 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
737 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
739 SSE2_INTRINSICS_INIT_16_UNALIGNED
740 SSE2_INTRINSICS_YUV_MUL
741 SSE2_INTRINSICS_YUV_ADD
742 SSE2_INTRINSICS_UNPACK_16_UNALIGNED
750 SCALE_HEIGHT( 420, 2 );
752 p_y += i_source_margin;
755 p_u += i_source_margin_c;
756 p_v += i_source_margin_c;
758 p_buffer = b_hscale ? p_buffer_start : p_pic;
763 /* use slower SSE2 unaligned fetch and store */
764 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
767 p_buffer = b_hscale ? p_buffer_start : p_pic;
769 for ( i_x = p_vout->render.i_width/16; i_x--; )
771 #if defined (CAN_COMPILE_SSE2)
772 __asm__( ".p2align 3"
773 SSE2_INIT_16_UNALIGNED
776 SSE2_UNPACK_16_UNALIGNED
777 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
779 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
780 SSE2_INTRINSICS_INIT_16_UNALIGNED
781 SSE2_INTRINSICS_YUV_MUL
782 SSE2_INTRINSICS_YUV_ADD
783 SSE2_INTRINSICS_UNPACK_16_UNALIGNED
790 /* Here we do some unaligned reads and duplicate conversions, but
791 * at least we have all the pixels */
795 p_u -= i_rewind >> 1;
796 p_v -= i_rewind >> 1;
797 p_buffer -= i_rewind;
799 #if defined (CAN_COMPILE_SSE2)
800 __asm__( ".p2align 3"
801 SSE2_INIT_16_UNALIGNED
804 SSE2_UNPACK_16_UNALIGNED
805 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
808 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
810 SSE2_INTRINSICS_INIT_16_UNALIGNED
811 SSE2_INTRINSICS_YUV_MUL
812 SSE2_INTRINSICS_YUV_ADD
813 SSE2_INTRINSICS_UNPACK_16_UNALIGNED
821 SCALE_HEIGHT( 420, 2 );
823 p_y += i_source_margin;
826 p_u += i_source_margin_c;
827 p_v += i_source_margin_c;
829 p_buffer = b_hscale ? p_buffer_start : p_pic;
833 /* make sure all SSE2 stores are visible thereafter */
834 #if defined (CAN_COMPILE_SSE2)
835 __asm__ __volatile__ ( "sfence" ::: "memory" );
840 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX path works on 8-pixel groups; rewind for the remainder. */
842 if( p_vout->render.i_width & 7 )
844 i_rewind = 8 - ( p_vout->render.i_width & 7 );
851 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
854 p_buffer = b_hscale ? p_buffer_start : p_pic;
856 for ( i_x = p_vout->render.i_width / 8; i_x--; )
858 #if defined (CAN_COMPILE_MMX)
859 __asm__( ".p2align 3"
864 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
866 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
868 MMX_INTRINSICS_INIT_16
869 MMX_INTRINSICS_YUV_MUL
870 MMX_INTRINSICS_YUV_ADD
871 MMX_INTRINSICS_UNPACK_16
880 /* Here we do some unaligned reads and duplicate conversions, but
881 * at least we have all the pixels */
885 p_u -= i_rewind >> 1;
886 p_v -= i_rewind >> 1;
887 p_buffer -= i_rewind;
889 #if defined (CAN_COMPILE_MMX)
890 __asm__( ".p2align 3"
895 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
898 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
901 MMX_INTRINSICS_INIT_16
902 MMX_INTRINSICS_YUV_MUL
903 MMX_INTRINSICS_YUV_ADD
904 MMX_INTRINSICS_UNPACK_16
913 SCALE_HEIGHT( 420, 2 );
915 p_y += i_source_margin;
918 p_u += i_source_margin_c;
919 p_v += i_source_margin_c;
922 /* re-enable FPU registers */
923 #if defined (CAN_COMPILE_MMX)
924 __asm__ __volatile__ ( "emms" );
934 /*****************************************************************************
935 * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
936 *****************************************************************************
937 * Horizontal alignment needed:
938 * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
939 * - output: 1 pixel (4 bytes), margins allowed
940 * Vertical alignment needed:
941 * - input: 2 lines (2 Y lines, 1 U/V line)
943 *****************************************************************************/
945 #if defined (MODULE_NAME_IS_i420_rgb)
947 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert planar YUV 4:2:0 to RGB 32 bpp, plain C version.  Same
 * structure as I420_RGB16 but with 4-byte pixels (CONVERT_*_PIXEL(4))
 * and the p_rgb32 lookup table.  NOTE(review): extract omits lines
 * (braces and declarations of p_buffer / i_rewind / i_right_margin). */
950 /* We got this one from the old arguments */
951 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
952 uint8_t *p_y = p_src->Y_PIXELS;
953 uint8_t *p_u = p_src->U_PIXELS;
954 uint8_t *p_v = p_src->V_PIXELS;
956 vlc_bool_t b_hscale; /* horizontal scaling type */
957 unsigned int i_vscale; /* vertical scaling type */
958 unsigned int i_x, i_y; /* horizontal and vertical indexes */
962 int i_scale_count; /* scale modulo counter */
963 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
964 uint32_t * p_pic_start; /* beginning of the current line for copy */
965 int i_uval, i_vval; /* U and V samples */
966 int i_red, i_green, i_blue; /* U and V modified samples */
967 uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32;
968 uint32_t * p_ybase; /* Y dependant conversion table */
970 /* Conversion buffer pointer */
971 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
974 /* Offset array pointer */
975 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Pitch padding past the visible pixels, luma / chroma planes. */
978 const int i_source_margin = p_src->p[0].i_pitch
979 - p_src->p[0].i_visible_pitch;
980 const int i_source_margin_c = p_src->p[1].i_pitch
981 - p_src->p[1].i_visible_pitch;
983 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Non-multiple-of-8 width: last group redone with overlapping input. */
985 if( p_vout->render.i_width & 7 )
987 i_rewind = 8 - ( p_vout->render.i_width & 7 );
994 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
995 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
996 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
997 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
998 p_vout->output.i_width, p_vout->output.i_height,
999 &b_hscale, &i_vscale, p_offset_start );
1002 * Perform conversion
1004 i_scale_count = ( i_vscale == 1 ) ?
1005 p_vout->output.i_height : p_vout->render.i_height;
1006 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1008 p_pic_start = p_pic;
1009 p_buffer = b_hscale ? p_buffer_start : p_pic;
1011 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1013 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1014 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1015 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1016 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1019 /* Here we do some unaligned reads and duplicate conversions, but
1020 * at least we have all the pixels */
1024 p_u -= i_rewind >> 1;
1025 p_v -= i_rewind >> 1;
1026 p_buffer -= i_rewind;
1027 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1028 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1029 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1030 CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
1033 SCALE_HEIGHT( 420, 4 ); /* vertical scale / copy of the finished line */
1035 p_y += i_source_margin;
/* Chroma advances every other line in 4:2:0 (guard elided here). */
1038 p_u += i_source_margin_c;
1039 p_v += i_source_margin_c;
1044 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
1046 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
/* Convert planar YUV 4:2:0 to 32 bpp A8R8G8B8, SIMD version.
 * SSE2 build: 16 pixels per iteration with an aligned fast path;
 * MMX build: 8 pixels per iteration.  NOTE(review): extract omits
 * lines (#endif, braces, declarations of p_buffer-as-shown / i_rewind /
 * i_right_margin); annotations describe only the visible logic. */
1049 /* We got this one from the old arguments */
1050 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1051 uint8_t *p_y = p_src->Y_PIXELS;
1052 uint8_t *p_u = p_src->U_PIXELS;
1053 uint8_t *p_v = p_src->V_PIXELS;
1055 vlc_bool_t b_hscale; /* horizontal scaling type */
1056 unsigned int i_vscale; /* vertical scaling type */
1057 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1061 int i_scale_count; /* scale modulo counter */
1062 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1063 uint32_t * p_pic_start; /* beginning of the current line for copy */
1064 /* Conversion buffer pointer */
1065 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1066 uint32_t * p_buffer;
1068 /* Offset array pointer */
1069 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Pitch padding past the visible pixels, luma / chroma planes. */
1072 const int i_source_margin = p_src->p[0].i_pitch
1073 - p_src->p[0].i_visible_pitch;
1074 const int i_source_margin_c = p_src->p[1].i_pitch
1075 - p_src->p[1].i_visible_pitch;
1077 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1079 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1080 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1081 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1082 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1083 p_vout->output.i_width, p_vout->output.i_height,
1084 &b_hscale, &i_vscale, p_offset_start );
1087 * Perform conversion
1089 i_scale_count = ( i_vscale == 1 ) ?
1090 p_vout->output.i_height : p_vout->render.i_height;
1092 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* SSE2 path works on 16-pixel groups; rewind for the remainder. */
1094 if( p_vout->render.i_width & 15 )
1096 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1104 ** SSE2 128 bits fetch/store instructions are faster
1105 ** if memory access is 16 bytes aligned
1108 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path requires 16-byte alignment of all pitches and
 * base pointers (condition continues on elided lines). */
1109 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1114 /* use faster SSE2 aligned fetch and store */
1115 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1117 p_pic_start = p_pic;
1119 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1121 #if defined (CAN_COMPILE_SSE2)
1122 /* use inline SSE2 assembly */
1123 __asm__( ".p2align 3"
1124 SSE2_INIT_32_ALIGNED
1127 SSE2_UNPACK_32_ARGB_ALIGNED
1128 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1130 /* otherwise use SSE2 C intrinsics wrappers */
1131 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1133 SSE2_INTRINSICS_INIT_32_ALIGNED
1134 SSE2_INTRINSICS_YUV_MUL
1135 SSE2_INTRINSICS_YUV_ADD
1136 SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
1144 /* Here we do some unaligned reads and duplicate conversions, but
1145 * at least we have all the pixels */
1149 p_u -= i_rewind >> 1;
1150 p_v -= i_rewind >> 1;
1151 p_buffer -= i_rewind;
1152 #if defined (CAN_COMPILE_SSE2)
1153 /* use inline SSE2 assembly */
1154 __asm__( ".p2align 3"
1155 SSE2_INIT_32_UNALIGNED
1158 SSE2_UNPACK_32_ARGB_UNALIGNED
1159 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1161 /* otherwise use SSE2 intrinsics wrappers */
1163 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1165 SSE2_INTRINSICS_INIT_32_UNALIGNED
1166 SSE2_INTRINSICS_YUV_MUL
1167 SSE2_INTRINSICS_YUV_ADD
1168 SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1176 SCALE_HEIGHT( 420, 4 );
1178 p_y += i_source_margin;
1181 p_u += i_source_margin_c;
1182 p_v += i_source_margin_c;
1184 p_buffer = b_hscale ? p_buffer_start : p_pic;
1189 /* use slower SSE2 unaligned fetch and store */
1190 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1192 p_pic_start = p_pic;
1193 p_buffer = b_hscale ? p_buffer_start : p_pic;
1195 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1197 #if defined (CAN_COMPILE_SSE2)
1198 /* use inline SSE2 assembly */
1199 __asm__( ".p2align 3"
1200 SSE2_INIT_32_UNALIGNED
1203 SSE2_UNPACK_32_ARGB_UNALIGNED
1204 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1206 /* otherwise use SSE2 C intrinsics wrappers */
1207 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1209 SSE2_INTRINSICS_INIT_32_UNALIGNED
1210 SSE2_INTRINSICS_YUV_MUL
1211 SSE2_INTRINSICS_YUV_ADD
1212 SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1220 /* Here we do some unaligned reads and duplicate conversions, but
1221 * at least we have all the pixels */
1225 p_u -= i_rewind >> 1;
1226 p_v -= i_rewind >> 1;
1227 p_buffer -= i_rewind;
1228 #if defined (CAN_COMPILE_SSE2)
1229 /* use inline SSE2 assembly */
1230 __asm__( ".p2align 3"
1231 SSE2_INIT_32_UNALIGNED
1234 SSE2_UNPACK_32_ARGB_UNALIGNED
1235 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1237 /* otherwise use SSE2 intrinsics wrappers */
1239 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1241 SSE2_INTRINSICS_INIT_32_UNALIGNED
1242 SSE2_INTRINSICS_YUV_MUL
1243 SSE2_INTRINSICS_YUV_ADD
1244 SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1252 SCALE_HEIGHT( 420, 4 );
1254 p_y += i_source_margin;
1257 p_u += i_source_margin_c;
1258 p_v += i_source_margin_c;
1260 p_buffer = b_hscale ? p_buffer_start : p_pic;
1264 /* make sure all SSE2 stores are visible thereafter */
1265 #if defined (CAN_COMPILE_SSE2)
1266 __asm__ __volatile__ ( "sfence" ::: "memory" );
1271 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
/* MMX path works on 8-pixel groups; rewind for the remainder. */
1273 if( p_vout->render.i_width & 7 )
1275 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1282 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1284 p_pic_start = p_pic;
1285 p_buffer = b_hscale ? p_buffer_start : p_pic;
1287 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1289 #if defined (CAN_COMPILE_MMX)
1290 /* use inline MMX assembly */
1291 __asm__( MMX_INIT_32
1292 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1294 __asm__( ".p2align 3"
1298 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1300 /* otherwise use MMX C intrinsics wrappers */
1301 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1304 MMX_INTRINSICS_INIT_32
1305 MMX_INTRINSICS_YUV_MUL
1306 MMX_INTRINSICS_YUV_ADD
1307 MMX_INTRINSICS_UNPACK_32_ARGB
1315 /* Here we do some unaligned reads and duplicate conversions, but
1316 * at least we have all the pixels */
1320 p_u -= i_rewind >> 1;
1321 p_v -= i_rewind >> 1;
1322 p_buffer -= i_rewind;
1323 #if defined (CAN_COMPILE_MMX)
1324 /* use inline MMX assembly */
1325 __asm__( ".p2align 3"
1330 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1332 /* otherwise use MMX intrinsics wrappers */
1334 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1337 MMX_INTRINSICS_INIT_32
1338 MMX_INTRINSICS_YUV_MUL
1339 MMX_INTRINSICS_YUV_ADD
1340 MMX_INTRINSICS_UNPACK_32_ARGB
1349 SCALE_HEIGHT( 420, 4 );
1351 p_y += i_source_margin;
1354 p_u += i_source_margin_c;
1355 p_v += i_source_margin_c;
1358 /* re-enable FPU registers */
1359 #if defined (CAN_COMPILE_MMX)
1360 __asm__ __volatile__ ( "emms" );
/*****************************************************************************
 * I420_B8G8R8A8: planar YUV 4:2:0 to packed 32-bpp BGRA
 *****************************************************************************
 * Converts the planar Y/U/V planes of p_src into packed 32-bit BGRA pixels
 * in the destination picture, with optional horizontal scaling (through an
 * intermediate conversion buffer plus a precomputed offset table) and
 * vertical scaling (via the SCALE_HEIGHT macro).  Which pixel pipeline runs
 * is decided at build time: SSE2 (16 pixels per inner iteration, with a
 * faster variant when all pitches and plane pointers are 16-byte aligned)
 * or MMX (8 pixels per iteration), each with inline-asm and intrinsics
 * flavors selected by CAN_COMPILE_* / HAVE_*_INTRINSICS.
 * NOTE(review): this excerpt is missing interleaved lines (the rest of the
 * signature declaring p_dest, the #else/#endif lines, closing braces, and
 * some conditionals), so the comments below describe only what is visible.
 *****************************************************************************/
1368 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1371 /* We got this one from the old arguments */
1372 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
/* Source plane pointers: Y at full resolution, U/V horizontally and
 * vertically subsampled 2:1 (4:2:0) */
1373 uint8_t *p_y = p_src->Y_PIXELS;
1374 uint8_t *p_u = p_src->U_PIXELS;
1375 uint8_t *p_v = p_src->V_PIXELS;
/* Scaling state, filled in by SetOffset() below */
1377 vlc_bool_t b_hscale; /* horizontal scaling type */
1378 unsigned int i_vscale; /* vertical scaling type */
1379 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1383 int i_scale_count; /* scale modulo counter */
1384 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1385 uint32_t * p_pic_start; /* beginning of the current line for copy */
1386 /* Conversion buffer pointer */
1387 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1388 uint32_t * p_buffer;
1390 /* Offset array pointer */
1391 int * p_offset_start = p_vout->chroma.p_sys->p_offset;
/* Per-line padding in the source planes: pitch minus visible pitch, in
 * bytes; added after each processed line to step to the next one */
1394 const int i_source_margin = p_src->p[0].i_pitch
1395 - p_src->p[0].i_visible_pitch;
1396 const int i_source_margin_c = p_src->p[1].i_pitch
1397 - p_src->p[1].i_visible_pitch;
/* Destination line padding (i_right_margin is declared on a line not
 * visible in this excerpt) */
1399 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1401 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1402 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1403 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1404 SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1405 p_vout->output.i_width, p_vout->output.i_height,
1406 &b_hscale, &i_vscale, p_offset_start );
1409 * Perform conversion
/* When stretching vertically (i_vscale == 1) the loop is driven by the
 * output height, otherwise by the source (render) height */
1411 i_scale_count = ( i_vscale == 1 ) ?
1412 p_vout->output.i_height : p_vout->render.i_height;
/* ---------- SSE2 build: 16 pixels per inner iteration ---------- */
1414 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
/* If the width is not a multiple of 16, i_rewind is how many pixels to
 * step back so the final, partial 16-pixel group is re-converted flush
 * against the end of the line (some pixels converted twice, none lost) */
1416 if( p_vout->render.i_width & 15 )
1418 i_rewind = 16 - ( p_vout->render.i_width & 15 );
1426 ** SSE2 128 bits fetch/store instructions are faster
1427 ** if memory access is 16 bytes aligned
1430 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Fast path only if every pitch and start address has its low 4 bits
 * clear, i.e. is 16-byte aligned (the OR'ed terms continue on lines not
 * visible here) */
1431 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1436 /* use faster SSE2 aligned fetch and store */
1437 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1439 p_pic_start = p_pic;
1441 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1443 #if defined (CAN_COMPILE_SSE2)
1444 /* use inline SSE2 assembly */
1445 __asm__( ".p2align 3"
1446 SSE2_INIT_32_ALIGNED
1449 SSE2_UNPACK_32_BGRA_ALIGNED
1450 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1452 /* otherwise use SSE2 C intrinsics wrappers */
1453 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1455 SSE2_INTRINSICS_INIT_32_ALIGNED
1456 SSE2_INTRINSICS_YUV_MUL
1457 SSE2_INTRINSICS_YUV_ADD
1458 SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
1466 /* Here we do some unaligned reads and duplicate conversions, but
1467 * at least we have all the pixels */
/* Chroma is subsampled 2:1 horizontally, so rewind half as far */
1471 p_u -= i_rewind >> 1;
1472 p_v -= i_rewind >> 1;
1473 p_buffer -= i_rewind;
1474 #if defined (CAN_COMPILE_SSE2)
1475 /* use inline SSE2 assembly */
1476 __asm__( ".p2align 3"
1477 SSE2_INIT_32_UNALIGNED
1480 SSE2_UNPACK_32_BGRA_UNALIGNED
1481 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1483 /* otherwise use SSE2 intrinsics wrappers */
1485 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1487 SSE2_INTRINSICS_INIT_32_UNALIGNED
1488 SSE2_INTRINSICS_YUV_MUL
1489 SSE2_INTRINSICS_YUV_ADD
1490 SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
/* Vertical scaling / line duplication; args look like (chroma format,
 * bytes per output pixel) -- TODO confirm against the macro definition */
1498 SCALE_HEIGHT( 420, 4 );
/* Step over source padding.  NOTE(review): the conditional that advances
 * the chroma planes only every other source line is not visible in this
 * excerpt -- confirm in the full file */
1500 p_y += i_source_margin;
1503 p_u += i_source_margin_c;
1504 p_v += i_source_margin_c;
1506 p_buffer = b_hscale ? p_buffer_start : p_pic;
1511 /* use slower SSE2 unaligned fetch and store */
1512 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1514 p_pic_start = p_pic;
1515 p_buffer = b_hscale ? p_buffer_start : p_pic;
1517 for ( i_x = p_vout->render.i_width / 16; i_x--; )
1519 #if defined (CAN_COMPILE_SSE2)
1520 /* use inline SSE2 assembly */
1521 __asm__( ".p2align 3"
1522 SSE2_INIT_32_UNALIGNED
1525 SSE2_UNPACK_32_BGRA_UNALIGNED
1526 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1528 /* otherwise use SSE2 C intrinsics wrappers */
1529 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1531 SSE2_INTRINSICS_INIT_32_UNALIGNED
1532 SSE2_INTRINSICS_YUV_MUL
1533 SSE2_INTRINSICS_YUV_ADD
1534 SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1542 /* Here we do some unaligned reads and duplicate conversions, but
1543 * at least we have all the pixels */
/* Chroma rewind is halved, same as in the aligned path above */
1547 p_u -= i_rewind >> 1;
1548 p_v -= i_rewind >> 1;
1549 p_buffer -= i_rewind;
1550 #if defined (CAN_COMPILE_SSE2)
1551 /* use inline SSE2 assembly */
1552 __asm__( ".p2align 3"
1553 SSE2_INIT_32_UNALIGNED
1556 SSE2_UNPACK_32_BGRA_UNALIGNED
1557 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1559 /* otherwise use SSE2 intrinsics wrappers */
1561 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1563 SSE2_INTRINSICS_INIT_32_UNALIGNED
1564 SSE2_INTRINSICS_YUV_MUL
1565 SSE2_INTRINSICS_YUV_ADD
1566 SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1574 SCALE_HEIGHT( 420, 4 );
1576 p_y += i_source_margin;
1579 p_u += i_source_margin_c;
1580 p_v += i_source_margin_c;
1582 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* ---------- MMX build: 8 pixels per inner iteration ---------- */
/* Same rewind trick as SSE2, but on an 8-pixel granularity */
1588 if( p_vout->render.i_width & 7 )
1590 i_rewind = 8 - ( p_vout->render.i_width & 7 );
1597 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1599 p_pic_start = p_pic;
1600 p_buffer = b_hscale ? p_buffer_start : p_pic;
1602 for ( i_x = p_vout->render.i_width / 8; i_x--; )
1604 #if defined (CAN_COMPILE_MMX)
1605 /* use inline MMX assembly */
1606 __asm__( MMX_INIT_32
1607 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1609 __asm__( ".p2align 3"
1613 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1615 /* otherwise use MMX C intrinsics wrappers */
1616 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1619 MMX_INTRINSICS_INIT_32
1620 MMX_INTRINSICS_YUV_MUL
1621 MMX_INTRINSICS_YUV_ADD
1622 MMX_INTRINSICS_UNPACK_32_BGRA
1630 /* Here we do some unaligned reads and duplicate conversions, but
1631 * at least we have all the pixels */
1635 p_u -= i_rewind >> 1;
1636 p_v -= i_rewind >> 1;
1637 p_buffer -= i_rewind;
1638 #if defined (CAN_COMPILE_MMX)
1639 /* use inline MMX assembly */
1640 __asm__( ".p2align 3"
1645 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1647 /* otherwise use MMX intrinsics wrappers */
1649 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1652 MMX_INTRINSICS_INIT_32
1653 MMX_INTRINSICS_YUV_MUL
1654 MMX_INTRINSICS_YUV_ADD
1655 MMX_INTRINSICS_UNPACK_32_BGRA
1664 SCALE_HEIGHT( 420, 4 );
1666 p_y += i_source_margin;
1669 p_u += i_source_margin_c;
1670 p_v += i_source_margin_c;
1673 /* re-enable FPU registers */
1674 #if defined (CAN_COMPILE_MMX)
/* EMMS clears the MMX tag state so later x87 floating-point code can use
 * the shared MMX/x87 register file again */
1675 __asm__ __volatile__ ( "emms" );
1685 /* Following functions are local */
1687 /*****************************************************************************
1688 * SetOffset: build offset array for conversion functions
1689 *****************************************************************************
1690 * This function will build an offset array used in later conversion functions.
1691 * It will also set horizontal and vertical scaling indicators.
1692 *****************************************************************************/
1693 static void SetOffset( int i_width, int i_height, int i_pic_width,
1694 int i_pic_height, vlc_bool_t *pb_hscale,
1695 unsigned int *pi_vscale, int *p_offset )
1697 int i_x; /* x position in destination */
1698 int i_scale_count; /* modulo counter */
1701 * Prepare horizontal offset array
1703 if( i_pic_width - i_width == 0 )
1705 /* No horizontal scaling: YUV conversion is done directly to picture */
1708 else if( i_pic_width - i_width > 0 )
1710 /* Prepare scaling array for horizontal extension */
1712 i_scale_count = i_pic_width;
1713 for( i_x = i_width; i_x--; )
1715 while( (i_scale_count -= i_width) > 0 )
1720 i_scale_count += i_pic_width;
1723 else /* if( i_pic_width - i_width < 0 ) */
1725 /* Prepare scaling array for horizontal reduction */
1727 i_scale_count = i_width;
1728 for( i_x = i_pic_width; i_x--; )
1731 while( (i_scale_count -= i_pic_width) > 0 )
1736 i_scale_count += i_width;
1741 * Set vertical scaling indicator
1743 if( i_pic_height - i_height == 0 )
1747 else if( i_pic_height - i_height > 0 )
1751 else /* if( i_pic_height - i_height < 0 ) */