3 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
5 * Copyright (C) 2000, Silicon Integrated System Corp.
8 * Author: Olie Lho <ollie@sis.com.tw>
10 * This file is part of mpeg2dec, a free MPEG-2 video decoder
12 * mpeg2dec is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2, or (at your option)
17 * mpeg2dec is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; see the file COPYING. If not, write to
24 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26 * 15 and 24 bpp support from Michael Niedermayer (michaelni@gmx.at)
32 #include "../config.h"
34 //#include "libmpeg2/mpeg2.h"
35 //#include "libmpeg2/mpeg2_internal.h"
39 #include "../mmx_defs.h"
/* Constant operands for the inline MMX code below.  They are referenced by
 * symbol name from the asm strings, so they must remain global, non-static,
 * non-const data with these exact names.
 * NOTE(review): the original comments hoped for cache-line alignment, but
 * aligned(8) only guarantees 8-byte alignment -- sufficient for the movq /
 * pmulhw memory operands, which is all the asm requires. */
uint64_t __attribute__ ((aligned (8))) mmx_80w     = 0x0080008000800080ULL; /* 4 x 128: chroma zero level */
uint64_t __attribute__ ((aligned (8))) mmx_10w     = 0x1010101010101010ULL; /* 8 x 16: luma black level */
uint64_t __attribute__ ((aligned (8))) mmx_00ffw   = 0x00ff00ff00ff00ffULL; /* mask to extract even-indexed Y bytes */
uint64_t __attribute__ ((aligned (8))) mmx_Y_coeff = 0x253f253f253f253fULL; /* 4 x luma scale coefficient */

/* BT.601-style chroma coefficients, 4 x signed 16-bit each. */
uint64_t __attribute__ ((aligned (8))) mmx_U_green = 0xf37df37df37df37dULL; /* Cb contribution to green (negative) */
uint64_t __attribute__ ((aligned (8))) mmx_U_blue  = 0x4093409340934093ULL; /* Cb contribution to blue */
uint64_t __attribute__ ((aligned (8))) mmx_V_red   = 0x3312331233123312ULL; /* Cr contribution to red */
uint64_t __attribute__ ((aligned (8))) mmx_V_green = 0xe5fce5fce5fce5fcULL; /* Cr contribution to green (negative) */

/* Bit masks used when packing 8-bit channels down to 15/16 bpp. */
uint64_t __attribute__ ((aligned (8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8ULL; /* keep top 5 bits of each byte */
uint64_t __attribute__ ((aligned (8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfcULL; /* keep top 6 bits of each byte */
58 /* Do the multiply part of the conversion for even and odd pixels,
60 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
61 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
62 mm6 -> Y even, mm7 -> Y odd; mm4 must hold zero on entry */\
63 /* convert the chroma part */\
64 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 (mm4 is zero) */ \
65 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 (mm4 is zero) */ \
67 "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */ \
68 "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */ \
70 "psllw $3, %%mm0;" /* Promote precision (pmulhw keeps only the high 16 bits) */ \
71 "psllw $3, %%mm1;" /* Promote precision (pmulhw keeps only the high 16 bits) */ \
73 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
74 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
76 "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
77 "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
79 "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
80 "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
82 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
84 /* convert the luma part */\
85 "psubusb mmx_10w, %%mm6;" /* Y -= 16 (unsigned saturation clamps at 0) */\
87 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
88 "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
90 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
92 "psllw $3, %%mm6;" /* Promote precision */\
93 "psllw $3, %%mm7;" /* Promote precision */\
95 "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
96 "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
98 /* Do the addition part of the conversion for even and odd pixels,
100 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
101 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
102 mm6 -> Y even, mm7 -> Y odd */\
103 "movq %%mm0, %%mm3;" /* Copy Cblue */\
104 "movq %%mm1, %%mm4;" /* Copy Cred */\
105 "movq %%mm2, %%mm5;" /* Copy Cgreen */\
107 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
108 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
110 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
111 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
113 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
114 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
116 /* Limit RGB even to 0..255 (packuswb saturates to unsigned bytes) */\
117 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
118 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
119 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
121 /* Limit RGB odd to 0..255 */\
122 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
123 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
124 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
126 /* Interleave RGB even and odd */\
127 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
128 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
129 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
/* Convert a YUV 4:2:0 planar frame to packed 16 bpp RGB (5:6:5 judging by
 * the f8/fc masks and shift amounts below), 8 pixels per inner iteration.
 * NOTE(review): some interior lines (locals x/y, per-line pointer advances,
 * loop closers) are not visible in this chunk; comments cover visible code. */
132 static void yuv420_rgb16_mmx (uint8_t * image, uint8_t * py,
133 uint8_t * pu, uint8_t * pv,
134 int h_size, int v_size,
135 int rgb_stride, int y_stride, int uv_stride)
140 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 -- the YUV2RGB code assumes this */ );
142 for (y = v_size; --y >= 0; ) {
143 uint8_t *_image = image;
148 /* load data for start of next scan line */
149 __asm__ __volatile__ (
150 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
151 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
152 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
154 : : "r" (_py), "r" (_pu), "r" (_pv));
156 for (x = h_size >> 3; --x >= 0; ) {
157 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8
158 pixels in each iteration */
160 __asm__ __volatile__ (
161 /* no speed difference on my p3@500 with prefetch,
162 * if it is faster for anyone with -benchmark then tell me
163 PREFETCH" 64(%0) \n\t"
164 PREFETCH" 64(%1) \n\t"
165 PREFETCH" 64(%2) \n\t"
169 /* mask unneeded bits off */
170 "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
171 "pand mmx_grnmask, %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
172 "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
174 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
175 "pxor %%mm4, %%mm4;" /* zero mm4 */
177 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
178 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
180 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
181 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
182 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
184 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
185 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
187 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
188 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
190 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */
191 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
192 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
194 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
195 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
197 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
198 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
200 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
201 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
220 __asm__ __volatile__ (EMMS); /* leave MMX state so the FPU is usable again */
/* Convert a YUV 4:2:0 planar frame to packed 15 bpp RGB (5:5:5 judging by
 * the f8 masks and the extra psrlw $1 on red), 8 pixels per inner iteration.
 * NOTE(review): some interior lines (locals x/y, per-line pointer advances,
 * loop closers) are not visible in this chunk; comments cover visible code. */
223 static void yuv420_rgb15_mmx (uint8_t * image, uint8_t * py,
224 uint8_t * pu, uint8_t * pv,
225 int h_size, int v_size,
226 int rgb_stride, int y_stride, int uv_stride)
231 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 -- the YUV2RGB code assumes this */ );
233 for (y = v_size; --y >= 0; ) {
234 uint8_t *_image = image;
239 /* load data for start of next scan line */
240 __asm__ __volatile__ (
241 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
242 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
243 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
245 : : "r" (_py), "r" (_pu), "r" (_pv));
247 for (x = h_size >> 3; --x >= 0; ) {
248 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8
249 pixels in each iteration */
251 __asm__ __volatile__ (
254 /* mask unneeded bits off; green also keeps only 5 bits in 15 bpp */
255 "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
256 "pand mmx_redmask, %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
257 "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
259 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
260 "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
261 "pxor %%mm4, %%mm4;" /* zero mm4 */
263 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
264 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
266 /* convert rgb24 plane to rgb15 pack for pixel 0-3 */
267 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
268 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
270 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
271 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
273 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
274 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
276 /* convert rgb24 plane to rgb15 pack for pixel 4-7 */
277 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
278 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
280 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
281 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
283 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
284 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
286 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
287 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
306 __asm__ __volatile__ (EMMS); /* leave MMX state so the FPU is usable again */
/* Convert a YUV 4:2:0 planar frame to packed 24 bpp RGB.  Eight pixels are
 * first built as four "0RGB" qwords, then shuffled/shifted into three fully
 * packed qwords (24 bytes) streamed out with MOVNTQ at (%3), 8(%3), 16(%3).
 * NOTE(review): some interior lines (locals x/y, per-line pointer advances,
 * loop closers) are not visible in this chunk; comments cover visible code. */
309 static void yuv420_rgb24_mmx (uint8_t * image, uint8_t * py,
310 uint8_t * pu, uint8_t * pv,
311 int h_size, int v_size,
312 int rgb_stride, int y_stride, int uv_stride)
317 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 -- the YUV2RGB code assumes this */ );
319 for (y = v_size; --y >= 0; ) {
320 uint8_t *_image = image;
325 /* load data for start of next scan line */
326 __asm__ __volatile__ (
327 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
328 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
329 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
331 : : "r" (_py), "r" (_pu), "r" (_pv));
333 for (x = h_size >> 3; --x >= 0; ) {
334 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8
335 pixels in each iteration */
337 __asm__ __volatile__ (
340 /* mm0=B, %%mm2=G, %%mm1=R; interleave the three planes into 0RGB dwords */
341 "pxor %%mm4, %%mm4 \n\t"
342 "movq %%mm0, %%mm5 \n\t" /* B */
343 "movq %%mm1, %%mm6 \n\t" /* R */
344 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
345 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
346 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
347 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
348 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
349 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
350 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
351 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
352 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
353 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
355 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
356 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
357 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
358 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
360 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
361 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
362 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
363 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */
365 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
366 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
367 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
368 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
370 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
371 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
372 "psllq $40, %%mm0 \n\t" /* GB000000 1 */
373 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
374 MOVNTQ" %%mm7, (%3) \n\t" /* first 8 of the 24 output bytes */
376 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
378 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
379 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
380 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
381 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
382 MOVNTQ" %%mm6, 8(%3) \n\t" /* middle 8 output bytes */
384 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
386 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
387 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
388 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
389 MOVNTQ" %%mm1, 16(%3) \n\t" /* last 8 output bytes */
391 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
392 "pxor %%mm4, %%mm4 \n\t" /* re-zero mm4 for the next YUV2RGB pass */
395 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
414 __asm__ __volatile__ (EMMS); /* leave MMX state so the FPU is usable again */
/* Convert a YUV 4:2:0 planar frame to packed 32 bpp RGB: each pixel is
 * stored as B, G, R plus a zero high byte (mm3/mm5 provide the zeros), 8
 * pixels (32 output bytes) per inner iteration.
 * NOTE(review): some interior lines (locals x/y, the asm statement opener
 * before line 437, per-line pointer advances, loop closers) are missing
 * from this chunk; comments cover visible code. */
418 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py,
419 uint8_t * pu, uint8_t * pv,
420 int h_size, int v_size,
421 int rgb_stride, int y_stride, int uv_stride)
426 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 -- the YUV2RGB code assumes this */ );
428 for (y = v_size; --y >= 0; ) {
429 uint8_t *_image = image;
434 /* load data for start of next scan line */
437 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
438 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
439 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
440 : : "r" (_py), "r" (_pu), "r" (_pv)
443 for (x = h_size >> 3; --x >= 0; ) {
444 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8
445 pixels in each iteration */
446 __asm__ __volatile__ (
448 /* convert RGB plane to RGB packed format,
449 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
450 mm4 -> GB, mm5 -> AR pixel 4-7,
451 mm6 -> GB, mm7 -> AR pixel 0-3 */
452 "pxor %%mm3, %%mm3;" /* zero mm3 */
454 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
455 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
457 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
458 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
460 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
461 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
463 "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */
464 MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
466 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
467 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
469 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */
470 MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
472 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
473 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
475 "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */
476 MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
478 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
479 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
481 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */
482 MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
484 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
485 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
487 "pxor %%mm4, %%mm4;" /* zero mm4 for the next YUV2RGB pass */
488 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
490 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
509 __asm__ __volatile__ (EMMS); /* leave MMX state so the FPU is usable again */
/* Select the MMX converter matching the requested bit depth and RGB mode.
 * Returns NULL when no MMX routine applies so the caller can fall back to
 * the generic C implementation. */
512 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
514 if (bpp == 15 && mode == MODE_RGB) return yuv420_rgb15_mmx;
515 if (bpp == 16 && mode == MODE_RGB) return yuv420_rgb16_mmx;
516 if (bpp == 24 && mode == MODE_RGB) return yuv420_rgb24_mmx;
517 if (bpp == 32 && mode == MODE_RGB) return yuv420_argb32_mmx;
518 return NULL; // Fallback to C.