2 * rgb2rgb.c, Software RGB to RGB convertor
3 * pluralize by Software PAL8 to RGB convertor
4 * Software YUV to YUV convertor
5 * Software YUV to RGB convertor
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26 * the C code (not assembly, mmx, ...) of this file can be used
27 * under the LGPL license too
/*
 * Instruction-name macros that are spliced into the inline-asm strings used
 * by the conversion routines in this file (PREFETCH, PREFETCHW, PAVGB,
 * MOVNTQ, SFENCE).
 * NOTE(review): most of the #if/#else/#endif lines that choose between the
 * alternative definitions are not visible in this excerpt (only the
 * HAVE_MMX2 #elif is), so the exact condition guarding each group should be
 * confirmed against the full file.
 */
31 #include <inttypes.h> /* for __WORDSIZE */
34 // #warning You have misconfigured system and probably will lose performance!
35 #define __WORDSIZE MP_WORDSIZE
/* First alternative (its guarding condition is not visible here). */
53 #define PREFETCH "prefetch"
54 #define PREFETCHW "prefetchw"
55 #define PAVGB "pavgusb"
56 #elif defined ( HAVE_MMX2 )
57 #define PREFETCH "prefetchnta"
58 #define PREFETCHW "prefetcht0"
/* Fallback: the macros expand to " # nop" - presumably an assembler comment,
   so no prefetch instruction is emitted (confirm). */
65 #define PREFETCH " # nop"
66 #define PREFETCHW " # nop"
71 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* Non-temporal store and store fence; the second SFENCE definition is the
   " # nop" fallback (the matching MOVNTQ fallback is not visible here). */
78 #define MOVNTQ "movntq"
79 #define SFENCE "sfence"
82 #define SFENCE " # nop"
/*
 * rgb24to32: expands packed 3-byte pixels read from src into 4-byte pixels
 * written to dst.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, 4 bytes each
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX path as shown: eight source pixels are fetched per pass (byte offsets
 * 0, 3, ..., 21), one pixel per 32-bit lane, every lane is ANDed with mask32
 * (held in mm7; its value is defined elsewhere) and the quadwords are written
 * with MOVNTQ. SFENCE/EMMS are issued once the asm loop is done.
 * NOTE(review): braces, loop headers, pointer updates, the body of the scalar
 * tail and the conditional-compilation guards are not visible in this
 * excerpt.
 */
85 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
88 const uint8_t *s = src;
91 const uint8_t *mm_end;
95 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
97 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
/* movd loads 4 bytes starting at a pixel, punpckldq puts the pixel 3 bytes
   further on into the upper half of the same register */
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
/* apply mask32 to every lane */
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
114 MOVNTQ" %%mm0, %0 \n\t"
115 MOVNTQ" %%mm1, 8%0 \n\t"
116 MOVNTQ" %%mm2, 16%0 \n\t"
124 __asm __volatile(SFENCE:::"memory");
125 __asm __volatile(EMMS:::"memory");
/* scalar tail; byte order depends on WORDS_BIGENDIAN */
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * rgb32to24: packs 4-byte pixels read from src into 3-byte pixels written to
 * dst.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, 3 bytes each
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX path as shown: four quadwords (eight pixels) are loaded per pass. Each
 * quadword is ANDed with mask24l (%2) and ORed with a copy of itself that was
 * shifted right by 8 bits and ANDed with mask24h (%3); presumably this closes
 * the one-byte gap between the two pixels of the quadword (mask values are
 * defined elsewhere). The four partial results are then shifted by 48/32/16
 * bits and merged through mask24hh/mask24hhh/mask24hhhh (%4..%6) so that they
 * can be stored as consecutive quadwords with MOVNTQ.
 * NOTE(review): braces, loop headers, pointer updates, one of the stores, the
 * scalar tail body and the conditional-compilation guards are not visible in
 * this excerpt.
 */
145 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
148 const uint8_t *s = src;
151 const uint8_t *mm_end;
155 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* load 32 source bytes and keep a second copy of each quadword */
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
/* shifted copy & mask24h, original & mask24l, then combine */
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
/* move data across register boundaries to form the output quadwords */
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
210 __asm __volatile(SFENCE:::"memory");
211 __asm __volatile(EMMS:::"memory");
/* scalar tail; byte order depends on WORDS_BIGENDIAN */
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
232 Original by Strepto/Astral
233 ported to gcc & bugfixed : A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: converts 16-bit words holding 15-bit pixels into 16-bit pixels.
 *
 *   src      - input pixels, one 16-bit word each
 *   dst      - output pixels, one 16-bit word each
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: every word x becomes (x & 0x7FFF) + (x & 0x7FE0). Adding the
 * masked bits 5..14 to the value doubles them, which moves those bits up by
 * one position (to 6..15) while bits 0..4 stay in place; bit 5 ends up zero.
 * The 32-bit statement handles two pixels at once, the 16-bit statement one.
 * MMX form: 16 bytes per pass; each quadword is ANDed with mask15s (mm4,
 * value defined elsewhere) and added word-wise (paddw) to its unmasked copy.
 * NOTE(review): braces, loop headers, pointer updates, one of the stores and
 * the conditional-compilation guards are not visible in this excerpt.
 */
237 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
245 __asm __volatile(PREFETCH" %0"::"m"(*s));
246 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
/* (x & mask15s) + x, per 16-bit word */
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ" %%mm0, %0 \n\t"
268 __asm __volatile(SFENCE:::"memory");
269 __asm __volatile(EMMS:::"memory");
/* two pixels per 32-bit access */
274 register unsigned x= *((uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* single pixel */
281 register unsigned short x= *((uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * rgb16to15: converts 16-bit pixels into 15-bit pixels stored in 16-bit
 * words.
 *
 *   src      - input pixels, one 16-bit word each
 *   dst      - output pixels, one 16-bit word each
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: every word x becomes ((x >> 1) & 0x7FE0) | (x & 0x001F), i.e.
 * bits 6..15 move down by one position (bit 5 is discarded) and bits 0..4
 * are kept. The 32-bit statement handles two pixels at once, the 16-bit
 * statement one.
 * MMX form: 16 bytes per pass; the quadword shifted right by one bit is
 * ANDed with mask15rg (mm7), the unshifted copy with mask15b (mm6), and the
 * two are ORed (mask values are defined elsewhere).
 * NOTE(review): braces, loop headers, pointer updates, one of the stores and
 * the conditional-compilation guards are not visible in this excerpt.
 */
286 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
294 __asm __volatile(PREFETCH" %0"::"m"(*s));
295 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
296 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
/* ((x >> 1) & mask15rg) | (x & mask15b) */
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ" %%mm0, %0 \n\t"
322 __asm __volatile(SFENCE:::"memory");
323 __asm __volatile(EMMS:::"memory");
/* two pixels per 32-bit access */
328 register uint32_t x= *((uint32_t *)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* single pixel */
335 register uint16_t x= *((uint16_t *)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * rgb32to16: reduces 32-bit pixels read from src to 16-bit pixels written
 * to dst.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: for a 32-bit value rgb the result is
 *   bits 3..7   -> bits 0..4,
 *   bits 10..15 -> bits 5..10,
 *   bits 19..23 -> bits 11..15.
 * Two MMX variants are present, selected by the "#if 1" below; both handle
 * four pixels (16 source bytes) per pass:
 *   - the first masks with mask3216br (mm6), multiplies with pmaddwd against
 *     mul3216 (mm7), ORs in the bits selected by mask3216g (mm5) and merges
 *     the two registers after psrld 5 / pslld 11;
 *   - the second uses plain shifts (3, 5 and 8 bits) with blue_16mask (%2),
 *     green_16mask (mm6) and red_16mask (mm7).
 * NOTE(review): braces, loop headers, pointer updates, the #else/#endif
 * lines and part of the asm operand lists are not visible in this excerpt.
 */
342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 const uint8_t *s = src;
347 const uint8_t *mm_end;
349 uint16_t *d = (uint16_t *)dst;
353 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* constants: mm5 = mask3216g, mm6 = mask3216br, mm7 = mul3216 */
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* shift-based variant */
389 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask),"m"(green_16mask));
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ" %%mm0, %0 \n\t"
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
430 __asm __volatile(SFENCE:::"memory");
431 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 32-bit pixel in, one 16-bit pixel out */
435 register int rgb = *(uint32_t*)s; s += 4;
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * rgb32tobgr16: reduces 32-bit pixels read from src to 16-bit pixels written
 * to dst, with the two outer colour fields exchanged compared to the plain
 * 32->16 conversion.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: for a 32-bit value rgb the result is
 *   bits 3..7   -> bits 11..15,
 *   bits 10..15 -> bits 5..10,
 *   bits 19..23 -> bits 0..4.
 * MMX form: four pixels per pass; left shift by 8 & red_16mask (mm7), right
 * shift by 5 & green_16mask (mm6), right shift by 19 & blue_16mask (%2),
 * then the two registers are merged after a 16-bit left shift.
 * NOTE(review): braces, loop headers, pointer updates and the
 * conditional-compilation guards are not visible in this excerpt.
 */
440 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 const uint8_t *s = src;
445 const uint8_t *mm_end;
447 uint16_t *d = (uint16_t *)dst;
450 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask),"m"(green_16mask));
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ" %%mm0, %0 \n\t"
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
491 __asm __volatile(SFENCE:::"memory");
492 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 32-bit pixel in, one 16-bit pixel out */
496 register int rgb = *(uint32_t*)s; s += 4;
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * rgb32to15: reduces 32-bit pixels read from src to 15-bit pixels written to
 * dst as 16-bit words.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: for a 32-bit value rgb the result is
 *   bits 3..7   -> bits 0..4,
 *   bits 11..15 -> bits 5..9,
 *   bits 19..23 -> bits 10..14.
 * Two MMX variants are present, selected by the "#if 1" below; both handle
 * four pixels (16 source bytes) per pass:
 *   - the first masks with mask3216br (mm6), multiplies with pmaddwd against
 *     mul3215 (mm7), ORs in the bits selected by mask3215g (mm5) and merges
 *     the two registers after psrld 6 / pslld 10;
 *   - the second uses plain shifts (3, 6 and 9 bits) with blue_15mask (%2),
 *     green_15mask (mm6) and red_15mask (mm7).
 * NOTE(review): braces, loop headers, pointer updates, the #else/#endif
 * lines and part of the asm operand lists are not visible in this excerpt.
 */
501 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 const uint8_t *s = src;
506 const uint8_t *mm_end;
508 uint16_t *d = (uint16_t *)dst;
512 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* constants: mm5 = mask3215g, mm6 = mask3216br, mm7 = mul3215 */
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
520 PREFETCH" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ" %%mm0, (%0) \n\t"
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* shift-based variant */
548 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask),"m"(green_15mask));
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ" %%mm0, %0 \n\t"
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
589 __asm __volatile(SFENCE:::"memory");
590 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 32-bit pixel in, one 15-bit pixel out */
594 register int rgb = *(uint32_t*)s; s += 4;
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * rgb32tobgr15: reduces 32-bit pixels read from src to 15-bit pixels written
 * to dst as 16-bit words, with the two outer colour fields exchanged compared
 * to the plain 32->15 conversion.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * Scalar form: for a 32-bit value rgb the result is
 *   bits 3..7   -> bits 10..14,
 *   bits 11..15 -> bits 5..9,
 *   bits 19..23 -> bits 0..4.
 * MMX form: four pixels per pass; left shift by 7 & red_15mask (mm7), right
 * shift by 6 & green_15mask (mm6), right shift by 19 & blue_15mask (%2),
 * then the two registers are merged after a 16-bit left shift.
 * NOTE(review): braces, loop headers, pointer updates and the
 * conditional-compilation guards are not visible in this excerpt.
 */
599 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 const uint8_t *s = src;
604 const uint8_t *mm_end;
606 uint16_t *d = (uint16_t *)dst;
609 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ" %%mm0, %0 \n\t"
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
650 __asm __volatile(SFENCE:::"memory");
651 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 32-bit pixel in, one 15-bit pixel out */
655 register int rgb = *(uint32_t*)s; s += 4;
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * rgb24to16: reduces packed 3-byte pixels read from src to 16-bit pixels
 * written to dst.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX form: four pixels per pass, fetched at byte offsets 0, 3, 6 and 9;
 * right shift by 3 & blue_16mask (%2), right shift by 5 & green_16mask (mm6),
 * right shift by 8 & red_16mask (mm7), then the two registers are merged
 * after a 16-bit left shift.
 * Scalar form: the output word is built from the top 5 bits of b (bits
 * 0..4), the top 6 bits of g (bits 5..10) and the top 5 bits of r (bits
 * 11..15).
 * NOTE(review): the lines that read b, g and r, together with braces, loop
 * headers and conditional-compilation guards, are not visible in this
 * excerpt.
 */
660 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662 const uint8_t *s = src;
665 const uint8_t *mm_end;
667 uint16_t *d = (uint16_t *)dst;
670 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask),"m"(green_16mask));
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ" %%mm0, %0 \n\t"
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
711 __asm __volatile(SFENCE:::"memory");
712 __asm __volatile(EMMS:::"memory");
/* scalar tail: 5 bits of b, 6 bits of g, 5 bits of r */
719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24tobgr16: reduces packed 3-byte pixels read from src to 16-bit pixels
 * written to dst, with the two outer colour fields exchanged compared to the
 * plain 24->16 conversion.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX form: four pixels per pass, fetched at byte offsets 0, 3, 6 and 9;
 * left shift by 8 & red_16mask (mm7), right shift by 5 & green_16mask (mm6),
 * right shift by 19 & blue_16mask (%2), then the two registers are merged
 * after a 16-bit left shift.
 * Scalar form: the output word is built from the top 5 bits of b (bits
 * 0..4), the top 6 bits of g (bits 5..10) and the top 5 bits of r (bits
 * 11..15).
 * NOTE(review): the packing expression is textually the same as in the
 * non-swapping 24->16 routine; presumably the exchange comes from the order
 * in which b, g and r are read, but those lines (as well as braces, loop
 * headers and conditional-compilation guards) are not visible in this
 * excerpt - confirm.
 */
723 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725 const uint8_t *s = src;
728 const uint8_t *mm_end;
730 uint16_t *d = (uint16_t *)dst;
733 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask),"m"(green_16mask));
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ" %%mm0, %0 \n\t"
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
774 __asm __volatile(SFENCE:::"memory");
775 __asm __volatile(EMMS:::"memory");
/* scalar tail: 5 bits of b, 6 bits of g, 5 bits of r */
782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: reduces packed 3-byte pixels read from src to 15-bit pixels
 * written to dst as 16-bit words.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX form: four pixels per pass, fetched at byte offsets 0, 3, 6 and 9;
 * right shift by 3 & blue_15mask (%2), right shift by 6 & green_15mask (mm6),
 * right shift by 9 & red_15mask (mm7), then the two registers are merged
 * after a 16-bit left shift.
 * Scalar form: the output word is built from the top 5 bits of b (bits
 * 0..4), the top 5 bits of g (bits 5..9) and the top 5 bits of r (bits
 * 10..14).
 * NOTE(review): the lines that read b, g and r, together with braces, loop
 * headers and conditional-compilation guards, are not visible in this
 * excerpt.
 */
786 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788 const uint8_t *s = src;
791 const uint8_t *mm_end;
793 uint16_t *d = (uint16_t *)dst;
796 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask),"m"(green_15mask));
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ" %%mm0, %0 \n\t"
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
837 __asm __volatile(SFENCE:::"memory");
838 __asm __volatile(EMMS:::"memory");
/* scalar tail: 5 bits each of b, g and r */
845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24tobgr15: reduces packed 3-byte pixels read from src to 15-bit pixels
 * written to dst as 16-bit words, with the two outer colour fields exchanged
 * compared to the plain 24->15 conversion.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, accessed as uint16_t
 *   src_size - size of the input (presumably in bytes - confirm with callers)
 *
 * MMX form: four pixels per pass, fetched at byte offsets 0, 3, 6 and 9;
 * left shift by 7 & red_15mask (mm7), right shift by 6 & green_15mask (mm6),
 * right shift by 19 & blue_15mask (%2), then the two registers are merged
 * after a 16-bit left shift.
 * Scalar form: the output word is built from the top 5 bits of b (bits
 * 0..4), the top 5 bits of g (bits 5..9) and the top 5 bits of r (bits
 * 10..14).
 * NOTE(review): the packing expression is textually the same as in the
 * non-swapping 24->15 routine; presumably the exchange comes from the order
 * in which b, g and r are read, but those lines (as well as braces, loop
 * headers and conditional-compilation guards) are not visible in this
 * excerpt - confirm.
 */
849 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851 const uint8_t *s = src;
854 const uint8_t *mm_end;
856 uint16_t *d = (uint16_t *)dst;
859 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask),"m"(green_15mask));
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ" %%mm0, %0 \n\t"
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
900 __asm __volatile(SFENCE:::"memory");
901 __asm __volatile(EMMS:::"memory");
/* scalar tail: 5 bits each of b, g and r */
908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
913 Here I use a less accurate approximation: simply
914 left-shifting the input
915 value and filling the low-order bits with
916 zeroes. This method improves PNG
917 compression, but this scheme cannot reproduce white exactly, since it does not
918 generate an all-ones maximum value; the net effect is to darken the
921 A better method would be "left bit replication":
931 | Leftmost Bits Repeated to Fill Open Bits
/*
 * rgb15to24: expands 15-bit pixels (one 16-bit word each) read from src into
 * 3-byte pixels written to dst.
 *
 *   src      - input pixels, accessed as uint16_t
 *   dst      - output bytes, three per pixel
 *   src_size - size of the input in bytes (end is computed as s + src_size/2)
 *
 * Scalar form: for each word bgr three bytes are written in this order:
 * (bgr & 0x1F) << 3, (bgr & 0x3E0) >> 2, (bgr & 0x7C00) >> 7. Every 5-bit
 * field therefore lands in the top five bits of its byte and the low three
 * bits are zero.
 * MMX form, two asm statements per pass:
 *   1. Two quadwords (eight pixels) are expanded to 32 bits per pixel: each
 *      field is isolated with mask15b/mask15g/mask15r (%2..%4), shifted to
 *      byte position, widened by interleaving with mmx_null (%5) and the
 *      three fields are ORed at bit offsets 0, 8 and 16. The first quadword's
 *      result is parked in mm6/mm7 while the second one is processed.
 *   2. The "Borrowed 32 to 24" sequence packs the four resulting quadwords
 *      down to 24 bytes using mask24l/mask24h/mask24hh/mask24hhh/mask24hhhh
 *      and stores three quadwords with MOVNTQ.
 * NOTE(review): braces, loop headers, pointer updates, the load of bgr and
 * the conditional-compilation guards are not visible in this excerpt.
 */
935 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
939 const uint16_t *mm_end;
941 uint8_t *d = (uint8_t *)dst;
942 const uint16_t *s = (uint16_t *)src;
943 end = s + src_size/2;
945 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* first quadword: isolate the three fields and move them to byte position */
951 "movq %1, %%mm0 \n\t"
952 "movq %1, %%mm1 \n\t"
953 "movq %1, %%mm2 \n\t"
954 "pand %2, %%mm0 \n\t"
955 "pand %3, %%mm1 \n\t"
956 "pand %4, %%mm2 \n\t"
957 "psllq $3, %%mm0 \n\t"
958 "psrlq $2, %%mm1 \n\t"
959 "psrlq $7, %%mm2 \n\t"
960 "movq %%mm0, %%mm3 \n\t"
961 "movq %%mm1, %%mm4 \n\t"
962 "movq %%mm2, %%mm5 \n\t"
/* widen words to dwords by interleaving with mmx_null */
963 "punpcklwd %5, %%mm0 \n\t"
964 "punpcklwd %5, %%mm1 \n\t"
965 "punpcklwd %5, %%mm2 \n\t"
966 "punpckhwd %5, %%mm3 \n\t"
967 "punpckhwd %5, %%mm4 \n\t"
968 "punpckhwd %5, %%mm5 \n\t"
969 "psllq $8, %%mm1 \n\t"
970 "psllq $16, %%mm2 \n\t"
971 "por %%mm1, %%mm0 \n\t"
972 "por %%mm2, %%mm0 \n\t"
973 "psllq $8, %%mm4 \n\t"
974 "psllq $16, %%mm5 \n\t"
975 "por %%mm4, %%mm3 \n\t"
976 "por %%mm5, %%mm3 \n\t"
/* park the first result, then repeat for the quadword at offset 8 */
978 "movq %%mm0, %%mm6 \n\t"
979 "movq %%mm3, %%mm7 \n\t"
981 "movq 8%1, %%mm0 \n\t"
982 "movq 8%1, %%mm1 \n\t"
983 "movq 8%1, %%mm2 \n\t"
984 "pand %2, %%mm0 \n\t"
985 "pand %3, %%mm1 \n\t"
986 "pand %4, %%mm2 \n\t"
987 "psllq $3, %%mm0 \n\t"
988 "psrlq $2, %%mm1 \n\t"
989 "psrlq $7, %%mm2 \n\t"
990 "movq %%mm0, %%mm3 \n\t"
991 "movq %%mm1, %%mm4 \n\t"
992 "movq %%mm2, %%mm5 \n\t"
993 "punpcklwd %5, %%mm0 \n\t"
994 "punpcklwd %5, %%mm1 \n\t"
995 "punpcklwd %5, %%mm2 \n\t"
996 "punpckhwd %5, %%mm3 \n\t"
997 "punpckhwd %5, %%mm4 \n\t"
998 "punpckhwd %5, %%mm5 \n\t"
999 "psllq $8, %%mm1 \n\t"
1000 "psllq $16, %%mm2 \n\t"
1001 "por %%mm1, %%mm0 \n\t"
1002 "por %%mm2, %%mm0 \n\t"
1003 "psllq $8, %%mm4 \n\t"
1004 "psllq $16, %%mm5 \n\t"
1005 "por %%mm4, %%mm3 \n\t"
1006 "por %%mm5, %%mm3 \n\t"
1009 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1011 /* Borrowed 32 to 24 */
/* bring the four 32-bit-per-pixel quadwords into mm0, mm1, mm4, mm5 */
1013 "movq %%mm0, %%mm4 \n\t"
1014 "movq %%mm3, %%mm5 \n\t"
1015 "movq %%mm6, %%mm0 \n\t"
1016 "movq %%mm7, %%mm1 \n\t"
1018 "movq %%mm4, %%mm6 \n\t"
1019 "movq %%mm5, %%mm7 \n\t"
1020 "movq %%mm0, %%mm2 \n\t"
1021 "movq %%mm1, %%mm3 \n\t"
1023 "psrlq $8, %%mm2 \n\t"
1024 "psrlq $8, %%mm3 \n\t"
1025 "psrlq $8, %%mm6 \n\t"
1026 "psrlq $8, %%mm7 \n\t"
1027 "pand %2, %%mm0 \n\t"
1028 "pand %2, %%mm1 \n\t"
1029 "pand %2, %%mm4 \n\t"
1030 "pand %2, %%mm5 \n\t"
1031 "pand %3, %%mm2 \n\t"
1032 "pand %3, %%mm3 \n\t"
1033 "pand %3, %%mm6 \n\t"
1034 "pand %3, %%mm7 \n\t"
1035 "por %%mm2, %%mm0 \n\t"
1036 "por %%mm3, %%mm1 \n\t"
1037 "por %%mm6, %%mm4 \n\t"
1038 "por %%mm7, %%mm5 \n\t"
1040 "movq %%mm1, %%mm2 \n\t"
1041 "movq %%mm4, %%mm3 \n\t"
1042 "psllq $48, %%mm2 \n\t"
1043 "psllq $32, %%mm3 \n\t"
1044 "pand %4, %%mm2 \n\t"
1045 "pand %5, %%mm3 \n\t"
1046 "por %%mm2, %%mm0 \n\t"
1047 "psrlq $16, %%mm1 \n\t"
1048 "psrlq $32, %%mm4 \n\t"
1049 "psllq $16, %%mm5 \n\t"
1050 "por %%mm3, %%mm1 \n\t"
1051 "pand %6, %%mm5 \n\t"
1052 "por %%mm5, %%mm4 \n\t"
1054 MOVNTQ" %%mm0, %0 \n\t"
1055 MOVNTQ" %%mm1, 8%0 \n\t"
1056 MOVNTQ" %%mm4, 16%0"
1059 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1064 __asm __volatile(SFENCE:::"memory");
1065 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 15-bit pixel in, three bytes out */
1069 register uint16_t bgr;
1071 *d++ = (bgr&0x1F)<<3;
1072 *d++ = (bgr&0x3E0)>>2;
1073 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to24: expands 16-bit pixels read from src into 3-byte pixels written
 * to dst.
 *
 *   src      - input pixels, accessed as uint16_t
 *   dst      - output bytes, three per pixel
 *   src_size - size of the input in bytes (end is computed as s + src_size/2)
 *
 * Scalar form: for each word bgr three bytes are written in this order:
 * (bgr & 0x1F) << 3, (bgr & 0x7E0) >> 3, (bgr & 0xF800) >> 8. The 5-, 6- and
 * 5-bit fields land in the top bits of their bytes and the remaining low
 * bits are zero.
 * MMX form, two asm statements per pass:
 *   1. Two quadwords (eight pixels) are expanded to 32 bits per pixel: each
 *      field is isolated with mask16b/mask16g/mask16r (%2..%4), shifted to
 *      byte position, widened by interleaving with mmx_null (%5) and the
 *      three fields are ORed at bit offsets 0, 8 and 16. The first quadword's
 *      result is parked in mm6/mm7 while the second one is processed.
 *   2. The "Borrowed 32 to 24" sequence packs the four resulting quadwords
 *      down to 24 bytes using mask24l/mask24h/mask24hh/mask24hhh/mask24hhhh
 *      and stores three quadwords with MOVNTQ.
 * NOTE(review): braces, loop headers, pointer updates, the load of bgr and
 * the conditional-compilation guards are not visible in this excerpt.
 */
1077 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1079 const uint16_t *end;
1081 const uint16_t *mm_end;
1083 uint8_t *d = (uint8_t *)dst;
1084 const uint16_t *s = (const uint16_t *)src;
1085 end = s + src_size/2;
1087 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1092 PREFETCH" 32%1 \n\t"
/* first quadword: isolate the three fields and move them to byte position */
1093 "movq %1, %%mm0 \n\t"
1094 "movq %1, %%mm1 \n\t"
1095 "movq %1, %%mm2 \n\t"
1096 "pand %2, %%mm0 \n\t"
1097 "pand %3, %%mm1 \n\t"
1098 "pand %4, %%mm2 \n\t"
1099 "psllq $3, %%mm0 \n\t"
1100 "psrlq $3, %%mm1 \n\t"
1101 "psrlq $8, %%mm2 \n\t"
1102 "movq %%mm0, %%mm3 \n\t"
1103 "movq %%mm1, %%mm4 \n\t"
1104 "movq %%mm2, %%mm5 \n\t"
/* widen words to dwords by interleaving with mmx_null */
1105 "punpcklwd %5, %%mm0 \n\t"
1106 "punpcklwd %5, %%mm1 \n\t"
1107 "punpcklwd %5, %%mm2 \n\t"
1108 "punpckhwd %5, %%mm3 \n\t"
1109 "punpckhwd %5, %%mm4 \n\t"
1110 "punpckhwd %5, %%mm5 \n\t"
1111 "psllq $8, %%mm1 \n\t"
1112 "psllq $16, %%mm2 \n\t"
1113 "por %%mm1, %%mm0 \n\t"
1114 "por %%mm2, %%mm0 \n\t"
1115 "psllq $8, %%mm4 \n\t"
1116 "psllq $16, %%mm5 \n\t"
1117 "por %%mm4, %%mm3 \n\t"
1118 "por %%mm5, %%mm3 \n\t"
/* park the first result, then repeat for the quadword at offset 8 */
1120 "movq %%mm0, %%mm6 \n\t"
1121 "movq %%mm3, %%mm7 \n\t"
1123 "movq 8%1, %%mm0 \n\t"
1124 "movq 8%1, %%mm1 \n\t"
1125 "movq 8%1, %%mm2 \n\t"
1126 "pand %2, %%mm0 \n\t"
1127 "pand %3, %%mm1 \n\t"
1128 "pand %4, %%mm2 \n\t"
1129 "psllq $3, %%mm0 \n\t"
1130 "psrlq $3, %%mm1 \n\t"
1131 "psrlq $8, %%mm2 \n\t"
1132 "movq %%mm0, %%mm3 \n\t"
1133 "movq %%mm1, %%mm4 \n\t"
1134 "movq %%mm2, %%mm5 \n\t"
1135 "punpcklwd %5, %%mm0 \n\t"
1136 "punpcklwd %5, %%mm1 \n\t"
1137 "punpcklwd %5, %%mm2 \n\t"
1138 "punpckhwd %5, %%mm3 \n\t"
1139 "punpckhwd %5, %%mm4 \n\t"
1140 "punpckhwd %5, %%mm5 \n\t"
1141 "psllq $8, %%mm1 \n\t"
1142 "psllq $16, %%mm2 \n\t"
1143 "por %%mm1, %%mm0 \n\t"
1144 "por %%mm2, %%mm0 \n\t"
1145 "psllq $8, %%mm4 \n\t"
1146 "psllq $16, %%mm5 \n\t"
1147 "por %%mm4, %%mm3 \n\t"
1148 "por %%mm5, %%mm3 \n\t"
1150 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1152 /* Borrowed 32 to 24 */
/* bring the four 32-bit-per-pixel quadwords into mm0, mm1, mm4, mm5 */
1154 "movq %%mm0, %%mm4 \n\t"
1155 "movq %%mm3, %%mm5 \n\t"
1156 "movq %%mm6, %%mm0 \n\t"
1157 "movq %%mm7, %%mm1 \n\t"
1159 "movq %%mm4, %%mm6 \n\t"
1160 "movq %%mm5, %%mm7 \n\t"
1161 "movq %%mm0, %%mm2 \n\t"
1162 "movq %%mm1, %%mm3 \n\t"
1164 "psrlq $8, %%mm2 \n\t"
1165 "psrlq $8, %%mm3 \n\t"
1166 "psrlq $8, %%mm6 \n\t"
1167 "psrlq $8, %%mm7 \n\t"
1168 "pand %2, %%mm0 \n\t"
1169 "pand %2, %%mm1 \n\t"
1170 "pand %2, %%mm4 \n\t"
1171 "pand %2, %%mm5 \n\t"
1172 "pand %3, %%mm2 \n\t"
1173 "pand %3, %%mm3 \n\t"
1174 "pand %3, %%mm6 \n\t"
1175 "pand %3, %%mm7 \n\t"
1176 "por %%mm2, %%mm0 \n\t"
1177 "por %%mm3, %%mm1 \n\t"
1178 "por %%mm6, %%mm4 \n\t"
1179 "por %%mm7, %%mm5 \n\t"
1181 "movq %%mm1, %%mm2 \n\t"
1182 "movq %%mm4, %%mm3 \n\t"
1183 "psllq $48, %%mm2 \n\t"
1184 "psllq $32, %%mm3 \n\t"
1185 "pand %4, %%mm2 \n\t"
1186 "pand %5, %%mm3 \n\t"
1187 "por %%mm2, %%mm0 \n\t"
1188 "psrlq $16, %%mm1 \n\t"
1189 "psrlq $32, %%mm4 \n\t"
1190 "psllq $16, %%mm5 \n\t"
1191 "por %%mm3, %%mm1 \n\t"
1192 "pand %6, %%mm5 \n\t"
1193 "por %%mm5, %%mm4 \n\t"
1195 MOVNTQ" %%mm0, %0 \n\t"
1196 MOVNTQ" %%mm1, 8%0 \n\t"
1197 MOVNTQ" %%mm4, 16%0"
1200 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1205 __asm __volatile(SFENCE:::"memory");
1206 __asm __volatile(EMMS:::"memory");
/* scalar tail: one 16-bit pixel in, three bytes out */
1210 register uint16_t bgr;
1212 *d++ = (bgr&0x1F)<<3;
1213 *d++ = (bgr&0x7E0)>>3;
1214 *d++ = (bgr&0xF800)>>8;
/*
 * rgb15to32: expands 15-bit pixels (one 16-bit word each) read from src into
 * 4-byte pixels written to dst.
 *
 *   src      - input pixels, accessed as uint16_t
 *   dst      - output bytes
 *   src_size - size of the input in bytes (end is computed as s + src_size/2)
 *
 * MMX form: one quadword (four pixels) per pass. The three fields are
 * isolated with mask15b/mask15g/mask15r (%2..%4), shifted to byte position
 * (<<3, >>2, >>7), widened to dwords by interleaving with the zeroed mm7 and
 * ORed together at bit offsets 0, 8 and 16; two quadwords are stored.
 * Scalar form: the colour bytes are (bgr & 0x1F) << 3, (bgr & 0x3E0) >> 2 and
 * (bgr & 0x7C00) >> 7, emitted in that order, or in the reverse order when
 * WORDS_BIGENDIAN is defined. A single-store alternative is disabled by
 * "#if 0".
 * NOTE(review): braces, loop headers, pointer updates, the load of bgr, the
 * write of the fourth byte and the #else/#endif lines are not visible in
 * this excerpt.
 */
1218 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1220 const uint16_t *end;
1222 const uint16_t *mm_end;
1224 uint8_t *d = (uint8_t *)dst;
1225 const uint16_t *s = (const uint16_t *)src;
1226 end = s + src_size/2;
1228 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0, used as the zero half when widening words to dwords */
1229 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1234 PREFETCH" 32%1 \n\t"
1235 "movq %1, %%mm0 \n\t"
1236 "movq %1, %%mm1 \n\t"
1237 "movq %1, %%mm2 \n\t"
1238 "pand %2, %%mm0 \n\t"
1239 "pand %3, %%mm1 \n\t"
1240 "pand %4, %%mm2 \n\t"
1241 "psllq $3, %%mm0 \n\t"
1242 "psrlq $2, %%mm1 \n\t"
1243 "psrlq $7, %%mm2 \n\t"
1244 "movq %%mm0, %%mm3 \n\t"
1245 "movq %%mm1, %%mm4 \n\t"
1246 "movq %%mm2, %%mm5 \n\t"
1247 "punpcklwd %%mm7, %%mm0 \n\t"
1248 "punpcklwd %%mm7, %%mm1 \n\t"
1249 "punpcklwd %%mm7, %%mm2 \n\t"
1250 "punpckhwd %%mm7, %%mm3 \n\t"
1251 "punpckhwd %%mm7, %%mm4 \n\t"
1252 "punpckhwd %%mm7, %%mm5 \n\t"
1253 "psllq $8, %%mm1 \n\t"
1254 "psllq $16, %%mm2 \n\t"
1255 "por %%mm1, %%mm0 \n\t"
1256 "por %%mm2, %%mm0 \n\t"
1257 "psllq $8, %%mm4 \n\t"
1258 "psllq $16, %%mm5 \n\t"
1259 "por %%mm4, %%mm3 \n\t"
1260 "por %%mm5, %%mm3 \n\t"
1261 MOVNTQ" %%mm0, %0 \n\t"
1262 MOVNTQ" %%mm3, 8%0 \n\t"
1264 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1269 __asm __volatile(SFENCE:::"memory");
1270 __asm __volatile(EMMS:::"memory");
/* scalar tail */
1274 #if 0 //slightly slower on athlon
1276 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1278 register uint16_t bgr;
1280 #ifdef WORDS_BIGENDIAN
1282 *d++ = (bgr&0x7C00)>>7;
1283 *d++ = (bgr&0x3E0)>>2;
1284 *d++ = (bgr&0x1F)<<3;
/* byte order used when WORDS_BIGENDIAN is not defined (presumably the #else
   branch - the directive itself is not visible here) */
1286 *d++ = (bgr&0x1F)<<3;
1287 *d++ = (bgr&0x3E0)>>2;
1288 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to32: expands 16-bit pixels read from src into 4-byte pixels written
 * to dst.
 *
 *   src      - input pixels, accessed as uint16_t
 *   dst      - output bytes
 *   src_size - size of the input in bytes (end is computed as s + src_size/2)
 *
 * MMX form: one quadword (four pixels) per pass. The three fields are
 * isolated with mask16b/mask16g/mask16r (%2..%4), shifted to byte position
 * (<<3, >>3, >>8), widened to dwords by interleaving with the zeroed mm7 and
 * ORed together at bit offsets 0, 8 and 16; two quadwords are stored.
 * Scalar form: the colour bytes are (bgr & 0x1F) << 3, (bgr & 0x7E0) >> 3 and
 * (bgr & 0xF800) >> 8, emitted in that order, or in the reverse order when
 * WORDS_BIGENDIAN is defined.
 * NOTE(review): braces, loop headers, pointer updates, the load of bgr, the
 * write of the fourth byte and the #else/#endif lines are not visible in
 * this excerpt.
 */
1296 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1298 const uint16_t *end;
1300 const uint16_t *mm_end;
1302 uint8_t *d = (uint8_t *)dst;
1303 const uint16_t *s = (uint16_t *)src;
1304 end = s + src_size/2;
1306 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0, used as the zero half when widening words to dwords */
1307 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1312 PREFETCH" 32%1 \n\t"
1313 "movq %1, %%mm0 \n\t"
1314 "movq %1, %%mm1 \n\t"
1315 "movq %1, %%mm2 \n\t"
1316 "pand %2, %%mm0 \n\t"
1317 "pand %3, %%mm1 \n\t"
1318 "pand %4, %%mm2 \n\t"
1319 "psllq $3, %%mm0 \n\t"
1320 "psrlq $3, %%mm1 \n\t"
1321 "psrlq $8, %%mm2 \n\t"
1322 "movq %%mm0, %%mm3 \n\t"
1323 "movq %%mm1, %%mm4 \n\t"
1324 "movq %%mm2, %%mm5 \n\t"
1325 "punpcklwd %%mm7, %%mm0 \n\t"
1326 "punpcklwd %%mm7, %%mm1 \n\t"
1327 "punpcklwd %%mm7, %%mm2 \n\t"
1328 "punpckhwd %%mm7, %%mm3 \n\t"
1329 "punpckhwd %%mm7, %%mm4 \n\t"
1330 "punpckhwd %%mm7, %%mm5 \n\t"
1331 "psllq $8, %%mm1 \n\t"
1332 "psllq $16, %%mm2 \n\t"
1333 "por %%mm1, %%mm0 \n\t"
1334 "por %%mm2, %%mm0 \n\t"
1335 "psllq $8, %%mm4 \n\t"
1336 "psllq $16, %%mm5 \n\t"
1337 "por %%mm4, %%mm3 \n\t"
1338 "por %%mm5, %%mm3 \n\t"
1339 MOVNTQ" %%mm0, %0 \n\t"
1340 MOVNTQ" %%mm3, 8%0 \n\t"
1342 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1347 __asm __volatile(SFENCE:::"memory");
1348 __asm __volatile(EMMS:::"memory");
/* scalar tail */
1352 register uint16_t bgr;
1354 #ifdef WORDS_BIGENDIAN
1356 *d++ = (bgr&0xF800)>>8;
1357 *d++ = (bgr&0x7E0)>>3;
1358 *d++ = (bgr&0x1F)<<3;
/* byte order used when WORDS_BIGENDIAN is not defined (presumably the #else
   branch - the directive itself is not visible here) */
1360 *d++ = (bgr&0x1F)<<3;
1361 *d++ = (bgr&0x7E0)>>3;
1362 *d++ = (bgr&0xF800)>>8;
/*
 * rgb32tobgr32: copies 32-bit pixels from src to dst while exchanging two of
 * the four bytes of every pixel.
 *
 *   src      - input pixels, 4 bytes each
 *   dst      - output pixels, 4 bytes each
 *   src_size - size of the input (used as a byte count in the index math)
 *
 * Indexing: idx starts at 15 - src_size and both pointers are biased by
 * -idx, so s[idx] is the first source byte and index 15 corresponds to the
 * end of the input. The scalar loop therefore runs while idx < 15, four
 * bytes at a time.
 * MMX form: 16 bytes per pass. mm6 = mask32b ^ mask32r and mm7 = mm6 ^
 * mmx_one are prepared once. Two alternative bodies are visible: one that
 * builds the swapped halves with pshufw $177, and one that uses 16-bit
 * left/right dword shifts instead (presumably chosen by a CPU-feature #ifdef
 * that is not visible here).
 * NOTE(review): at least one statement between the two scalar-loop lines
 * shown below is missing from this excerpt (presumably v is masked before
 * the shifts are combined - confirm against the full file). Braces, asm loop
 * labels and conditional-compilation guards are not visible either.
 */
1368 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1370 long idx = 15 - src_size;
1371 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1376 PREFETCH" (%1, %0) \n\t"
/* build the two complementary byte masks */
1377 "movq %3, %%mm7 \n\t"
1378 "pxor %4, %%mm7 \n\t"
1379 "movq %%mm7, %%mm6 \n\t"
1380 "pxor %5, %%mm7 \n\t"
1383 PREFETCH" 32(%1, %0) \n\t"
1384 "movq (%1, %0), %%mm0 \n\t"
1385 "movq 8(%1, %0), %%mm1 \n\t"
/* pshufw-based body */
1387 "pshufw $177, %%mm0, %%mm3 \n\t"
1388 "pshufw $177, %%mm1, %%mm5 \n\t"
1389 "pand %%mm7, %%mm0 \n\t"
1390 "pand %%mm6, %%mm3 \n\t"
1391 "pand %%mm7, %%mm1 \n\t"
1392 "pand %%mm6, %%mm5 \n\t"
1393 "por %%mm3, %%mm0 \n\t"
1394 "por %%mm5, %%mm1 \n\t"
/* shift-based body */
1396 "movq %%mm0, %%mm2 \n\t"
1397 "movq %%mm1, %%mm4 \n\t"
1398 "pand %%mm7, %%mm0 \n\t"
1399 "pand %%mm6, %%mm2 \n\t"
1400 "pand %%mm7, %%mm1 \n\t"
1401 "pand %%mm6, %%mm4 \n\t"
1402 "movq %%mm2, %%mm3 \n\t"
1403 "movq %%mm4, %%mm5 \n\t"
1404 "pslld $16, %%mm2 \n\t"
1405 "psrld $16, %%mm3 \n\t"
1406 "pslld $16, %%mm4 \n\t"
1407 "psrld $16, %%mm5 \n\t"
1408 "por %%mm2, %%mm0 \n\t"
1409 "por %%mm4, %%mm1 \n\t"
1410 "por %%mm3, %%mm0 \n\t"
1411 "por %%mm5, %%mm1 \n\t"
1413 MOVNTQ" %%mm0, (%2, %0) \n\t"
1414 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1421 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* scalar loop over the remaining pixels; g keeps bytes 1 and 3 of the pixel */
1424 for (; idx<15; idx+=4) {
1425 register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00;
1427 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * rgb24tobgr24: copies packed 3-byte pixels from src to dst while exchanging
 * the first and third byte of every pixel.
 *
 *   src      - input pixels, 3 bytes each
 *   dst      - output pixels, 3 bytes each
 *   src_size - size of the input (used as a byte count in the index math)
 *
 * MMX form: mmx_size starts at 23 - src_size and the pointers handed to the
 * asm are biased by -mmx_size; the index register advances by 24 bytes (eight
 * pixels) per pass. Each output quadword is assembled from three overlapping
 * loads, each masked with one of mask24r/mask24g/mask24b (mm5/mm6/mm7), and
 * written with MOVNTQ.
 * After the asm, the function returns early when mmx_size equals 23;
 * otherwise src_size is recomputed as 23 - mmx_size and the scalar loop
 * handles the remaining pixels three bytes at a time.
 * NOTE(review): this only makes sense if the asm updates mmx_size through an
 * output operand; that operand list, the asm loop labels, the pointer
 * adjustments before the scalar loop, the first byte assignment of the
 * scalar loop and the conditional-compilation guards are not visible in this
 * excerpt.
 */
1431 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1435 long mmx_size= 23 - src_size;
1437 "test %%"REG_a", %%"REG_a" \n\t"
1439 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1440 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1441 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1444 PREFETCH" 32(%1, %%"REG_a") \n\t"
1445 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1446 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1447 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1448 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1449 "pand %%mm5, %%mm0 \n\t"
1450 "pand %%mm6, %%mm1 \n\t"
1451 "pand %%mm7, %%mm2 \n\t"
1452 "por %%mm0, %%mm1 \n\t"
1453 "por %%mm2, %%mm1 \n\t"
1454 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1455 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1456 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1457 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1458 "pand %%mm7, %%mm0 \n\t"
1459 "pand %%mm5, %%mm1 \n\t"
1460 "pand %%mm6, %%mm2 \n\t"
1461 "por %%mm0, %%mm1 \n\t"
1462 "por %%mm2, %%mm1 \n\t"
1463 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1464 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1465 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1466 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1467 "pand %%mm6, %%mm0 \n\t"
1468 "pand %%mm7, %%mm1 \n\t"
1469 "pand %%mm5, %%mm2 \n\t"
1470 "por %%mm0, %%mm1 \n\t"
1471 "por %%mm2, %%mm1 \n\t"
1472 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1473 "add $24, %%"REG_a" \n\t"
1477 : "r" (src-mmx_size), "r"(dst-mmx_size)
1480 __asm __volatile(SFENCE:::"memory");
1481 __asm __volatile(EMMS:::"memory");
1483 if (mmx_size==23) return; //finished, was multiple of 8
/* number of bytes left for the scalar loop */
1487 src_size= 23-mmx_size;
1491 for (i=0; i<src_size; i+=3)
/* middle byte is copied as is, third output byte comes from the first input byte */
1495 dst[i + 1] = src[i + 1];
1496 dst[i + 2] = src[i + 0];
/*
 * Packs planar Y/U/V into YUYV (Y0 U Y1 V) output, one pass per luma row
 * (y < height). chromWidth = width / 2 chroma samples are consumed per row.
 * usrc/vsrc advance by chromStride only after rows where
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so one chroma row is
 * reused for vertLumPerChroma luma rows (the mask test presumably expects a
 * power of two).
 */
1501 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1502 long width, long height,
1503 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1506 const long chromWidth= width>>1;
1507 for (y=0; y<height; y++)
1510 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
// MMX path: REG_a counts chroma samples. Each pass reads 8 U, 8 V and 16 Y
// bytes and stores 32 packed bytes at dst + 4*REG_a.
1512 "xor %%"REG_a", %%"REG_a" \n\t"
1515 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1516 PREFETCH" 32(%2, %%"REG_a") \n\t"
1517 PREFETCH" 32(%3, %%"REG_a") \n\t"
1518 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1519 "movq %%mm0, %%mm2 \n\t" // U(0)
1520 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1521 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1522 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1524 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1525 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1526 "movq %%mm3, %%mm4 \n\t" // Y(0)
1527 "movq %%mm5, %%mm6 \n\t" // Y(8)
1528 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1529 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1530 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1531 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1533 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1534 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1535 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1536 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1538 "add $8, %%"REG_a" \n\t"
1539 "cmp %4, %%"REG_a" \n\t"
1541 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
// Alpha/MVI path: works on the current row and the one after it
// (qdst2 = dst + dstStride, yc2 = ysrc + lumStride).
1546 #if defined ARCH_ALPHA && defined HAVE_MVI
1547 #define pl2yuy2(n) \
1552 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1553 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1554 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1555 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1556 yuv1 = (u << 8) + (v << 24); \
1563 uint64_t *qdst = (uint64_t *) dst;
1564 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1565 const uint32_t *yc = (uint32_t *) ysrc;
1566 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1567 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1568 for (i = 0; i < chromWidth; i += 8){
1569 uint64_t y1, y2, yuv1, yuv2;
1572 asm("ldq $31,64(%0)" :: "r"(yc));
1573 asm("ldq $31,64(%0)" :: "r"(yc2));
1574 asm("ldq $31,64(%0)" :: "r"(uc));
1575 asm("ldq $31,64(%0)" :: "r"(vc));
// Generic 64-bit C path: two chroma samples (four luma bytes) are packed
// into a single 64-bit store.
1593 #elif __WORDSIZE >= 64
1595 uint64_t *ldst = (uint64_t *) dst;
1596 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1597 for (i = 0; i < chromWidth; i += 2){
// NOTE(review): vc[n] << 24 is evaluated as int, so values >= 128 shift into
// the sign bit; if k/l are 64-bit (declaration not visible here) the result
// would be sign-extended before the add below — confirm.
1599 k = yc[0] + (uc[0] << 8) +
1600 (yc[1] << 16) + (vc[0] << 24);
1601 l = yc[2] + (uc[1] << 8) +
1602 (yc[3] << 16) + (vc[1] << 24);
1603 *ldst++ = k + (l << 32);
// 32-bit C path: one 4-byte Y U Y V group per chroma sample; the shift
// amounts are mirrored under WORDS_BIGENDIAN so the byte order in memory
// stays the same.
1610 int i, *idst = (int32_t *) dst;
1611 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1612 for (i = 0; i < chromWidth; i++){
1613 #ifdef WORDS_BIGENDIAN
1614 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1615 (yc[1] << 8) + (vc[0] << 0);
1617 *idst++ = yc[0] + (uc[0] << 8) +
1618 (yc[1] << 16) + (vc[0] << 24);
// Move to the next chroma row only after the last luma row of each group.
1626 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1628 usrc += chromStride;
1629 vsrc += chromStride;
1643 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1644 * problem for anyone then tell me, and I'll fix it)
/*
 * Thin wrapper: forwards all arguments to yuvPlanartoyuy2 with
 * vertLumPerChroma = 2, i.e. each chroma row is shared by two luma rows.
 */
1646 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1647 long width, long height,
1648 long lumStride, long chromStride, long dstStride)
1650 //FIXME interpolate chroma
1651 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Packs planar Y/U/V into UYVY (U Y0 V Y1) output, one pass per luma row
 * (y < height). chromWidth = width / 2 chroma samples are consumed per row.
 * usrc/vsrc advance by chromStride only after rows where
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so one chroma row is
 * reused for vertLumPerChroma luma rows (the mask test presumably expects a
 * power of two).
 */
1654 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1655 long width, long height,
1656 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1659 const long chromWidth= width>>1;
1660 for (y=0; y<height; y++)
1663 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
// MMX path: REG_a counts chroma samples. Each pass reads 8 U, 8 V and 16 Y
// bytes and stores 32 packed bytes at dst + 4*REG_a. Unlike the YUYV variant
// the interleaved UV register is the destination of the unpack, so chroma
// lands in the even bytes.
1665 "xor %%"REG_a", %%"REG_a" \n\t"
1668 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1669 PREFETCH" 32(%2, %%"REG_a") \n\t"
1670 PREFETCH" 32(%3, %%"REG_a") \n\t"
1671 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1672 "movq %%mm0, %%mm2 \n\t" // U(0)
1673 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1674 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1675 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1677 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1678 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1679 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1680 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1681 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1682 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1683 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1684 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1686 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1687 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1688 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1689 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1691 "add $8, %%"REG_a" \n\t"
1692 "cmp %4, %%"REG_a" \n\t"
1694 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1698 //FIXME adapt the alpha asm code from yv12->yuy2
// 64-bit C path: two chroma samples (four luma bytes) per 64-bit store.
1700 #if __WORDSIZE >= 64
1702 uint64_t *ldst = (uint64_t *) dst;
1703 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1704 for (i = 0; i < chromWidth; i += 2){
// NOTE(review): yc[n] << 24 is evaluated as int, so values >= 128 shift into
// the sign bit; if k/l are 64-bit (declaration not visible here) the result
// would be sign-extended before the add below — confirm.
1706 k = uc[0] + (yc[0] << 8) +
1707 (vc[0] << 16) + (yc[1] << 24);
1708 l = uc[1] + (yc[2] << 8) +
1709 (vc[1] << 16) + (yc[3] << 24);
1710 *ldst++ = k + (l << 32);
// 32-bit C path: one 4-byte U Y V Y group per chroma sample; the shift
// amounts are mirrored under WORDS_BIGENDIAN so the byte order in memory
// stays the same.
1717 int i, *idst = (int32_t *) dst;
1718 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1719 for (i = 0; i < chromWidth; i++){
1720 #ifdef WORDS_BIGENDIAN
1721 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1722 (vc[0] << 8) + (yc[1] << 0);
1724 *idst++ = uc[0] + (yc[0] << 8) +
1725 (vc[0] << 16) + (yc[1] << 24);
// Move to the next chroma row only after the last luma row of each group.
1733 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1735 usrc += chromStride;
1736 vsrc += chromStride;
1750 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1751 * problem for anyone then tell me, and I'll fix it)
/*
 * Thin wrapper: forwards all arguments to yuvPlanartouyvy with
 * vertLumPerChroma = 2, i.e. each chroma row is shared by two luma rows.
 */
1753 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1754 long width, long height,
1755 long lumStride, long chromStride, long dstStride)
1757 //FIXME interpolate chroma
1758 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1763 * width should be a multiple of 16
/*
 * Thin wrapper: forwards all arguments to yuvPlanartoyuy2 with
 * vertLumPerChroma = 1, i.e. the chroma pointers advance after every luma row.
 */
1765 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1766 long width, long height,
1767 long lumStride, long chromStride, long dstStride)
1769 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1774 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1775 * problem for anyone then tell me, and I'll fix it)
/*
 * Splits packed YUYV into separate Y, U and V planes. The outer loop steps
 * y by 2 and runs two extraction passes per step: the first writes Y, U and
 * V, the second writes Y only. udst/vdst advance by chromStride once per
 * step. The second pass presumably covers the following source row (the
 * pointer advance between the passes is not visible in this excerpt).
 */
1777 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1778 long width, long height,
1779 long lumStride, long chromStride, long srcStride)
1782 const long chromWidth= width>>1;
1783 for (y=0; y<height; y+=2)
// MMX pass 1: REG_a counts chroma samples; each pass reads 32 packed bytes
// and writes 16 Y, 8 U and 8 V bytes. mm7 is set to 0x00FF in every word.
1787 "xor %%"REG_a", %%"REG_a" \n\t"
1788 "pcmpeqw %%mm7, %%mm7 \n\t"
1789 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1792 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1793 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1794 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1795 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1796 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1797 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1798 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1799 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1800 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1801 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1802 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1804 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1806 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1807 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1808 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1809 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1810 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1811 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1812 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1813 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1814 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1815 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1817 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// Separate the interleaved UV bytes into one U and one V quadword.
1819 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1820 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1821 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1822 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1823 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1824 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1825 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1826 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1828 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1829 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1831 "add $8, %%"REG_a" \n\t"
1832 "cmp %4, %%"REG_a" \n\t"
1834 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1835 : "memory", "%"REG_a
// MMX pass 2: luma only. The visible lines do not reload mm7, so this pass
// relies on the 0x00FF mask left behind by pass 1.
1842 "xor %%"REG_a", %%"REG_a" \n\t"
1845 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1846 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1847 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1848 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1849 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1850 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1851 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1852 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1853 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1854 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1855 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1857 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1858 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1860 "add $8, %%"REG_a" \n\t"
1861 "cmp %4, %%"REG_a" \n\t"
1864 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1865 : "memory", "%"REG_a
// C pass 1: each 4-byte group Y0 U Y1 V yields two luma and one U/V sample.
1869 for (i=0; i<chromWidth; i++)
1871 ydst[2*i+0] = src[4*i+0];
1872 udst[i] = src[4*i+1];
1873 ydst[2*i+1] = src[4*i+2];
1874 vdst[i] = src[4*i+3];
// C pass 2: luma only, the chroma bytes are skipped.
1879 for (i=0; i<chromWidth; i++)
1881 ydst[2*i+0] = src[4*i+0];
1882 ydst[2*i+1] = src[4*i+2];
1885 udst += chromStride;
1886 vdst += chromStride;
1891 asm volatile( EMMS" \n\t"
/*
 * Incomplete converter: the visible body only copies width*height bytes from
 * ysrc to ydst with a single memcpy. usrc, vsrc, udst, vdst, lumStride and
 * chromStride are not used in the visible lines.
 * NOTE(review): one contiguous memcpy is only correct when both luma planes
 * are stored without row padding (stride == width) — confirm with callers.
 */
1897 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1898 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1899 long width, long height, long lumStride, long chromStride)
1902 memcpy(ydst, ysrc, width*height);
1904 /* XXX: implement upscaling for U,V */
/*
 * Upscales one plane by two in each direction. Output samples are weighted
 * 3:1 / 1:3 blends of neighbouring source samples, using the
 * (3*a + b) >> 2 pattern seen throughout the C code.
 * The first and last output rows are blended horizontally only; the rows in
 * between blend the source row at src with the one at src + srcStride.
 */
1907 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
// First output row: horizontal blend only, last sample copied unchanged.
1914 for (x=0; x<srcWidth-1; x++){
1915 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1916 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1918 dst[2*srcWidth-1]= src[srcWidth-1];
// Interior rows: each pass produces two output rows (dst and dst + dstStride).
1922 for (y=1; y<srcHeight; y++){
1923 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// SIMD covers the first srcWidth & ~15 columns; pointers are pre-offset by
// mmxSize and REG_a is loaded from operand %4 (not visible here, presumably -mmxSize).
1924 const long mmxSize= srcWidth&~15;
1926 "mov %4, %%"REG_a" \n\t"
1928 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1929 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1930 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1931 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1932 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1933 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// Averaging twice against the same register weights that register roughly 3:1.
1934 PAVGB" %%mm0, %%mm5 \n\t"
1935 PAVGB" %%mm0, %%mm3 \n\t"
1936 PAVGB" %%mm0, %%mm5 \n\t"
1937 PAVGB" %%mm0, %%mm3 \n\t"
1938 PAVGB" %%mm1, %%mm4 \n\t"
1939 PAVGB" %%mm1, %%mm2 \n\t"
1940 PAVGB" %%mm1, %%mm4 \n\t"
1941 PAVGB" %%mm1, %%mm2 \n\t"
1942 "movq %%mm5, %%mm7 \n\t"
1943 "movq %%mm4, %%mm6 \n\t"
1944 "punpcklbw %%mm3, %%mm5 \n\t"
1945 "punpckhbw %%mm3, %%mm7 \n\t"
1946 "punpcklbw %%mm2, %%mm4 \n\t"
1947 "punpckhbw %%mm2, %%mm6 \n\t"
// Two alternative store sequences follow (MOVNTQ and plain movq); the
// preprocessor lines selecting between them are not visible in this excerpt.
1949 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1950 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1951 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1952 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1954 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1955 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1956 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1957 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1959 "add $8, %%"REG_a" \n\t"
1961 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1962 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// Without the SIMD path mmxSize is 1, so the C loop below starts at x = 0.
1968 const long mmxSize=1;
// Left edge: vertical blend only.
1970 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1971 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
// Remaining columns: each output sample blends two diagonally adjacent source samples.
1973 for (x=mmxSize-1; x<srcWidth-1; x++){
1974 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1975 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1976 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1977 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
// Right edge: vertical blend only.
1979 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1980 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// Last output row: horizontal blend only, same form as the first row.
1990 for (x=0; x<srcWidth-1; x++){
1991 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1992 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1994 dst[2*srcWidth-1]= src[srcWidth-1];
1996 for (x=0; x<srcWidth; x++){
2003 asm volatile( EMMS" \n\t"
2011 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2012 * problem for anyone then tell me, and I'll fix it)
2013 * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version
/*
 * Splits packed UYVY into separate Y, U and V planes. The outer loop steps
 * y by 2 and runs two extraction passes per step: the first writes Y, U and
 * V, the second writes Y only. udst/vdst advance by chromStride once per step.
 *
 * NOTE(review): the asm here hard-codes the 32-bit register (%%eax, xorl,
 * addl, cmpl) while yuy2toyv12 uses REG_a for the same loop; with pointer
 * bases and a `long` chromWidth operand this looks 32-bit-only — confirm.
 * rgb2rgb_init currently leaves the uyvytoyv12 assignment commented out.
 */
2015 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2016 long width, long height,
2017 long lumStride, long chromStride, long srcStride)
2020 const long chromWidth= width>>1;
2021 for (y=0; y<height; y+=2)
// MMX pass 1: eax counts chroma samples; each pass reads 32 packed bytes and
// writes 16 Y, 8 U and 8 V bytes. mm7 is set to 0x00FF in every word.
2025 "xorl %%eax, %%eax \n\t"
2026 "pcmpeqw %%mm7, %%mm7 \n\t"
2027 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2030 PREFETCH" 64(%0, %%eax, 4) \n\t"
2031 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2032 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2033 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2034 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2035 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2036 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2037 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2038 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2039 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2040 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2042 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2044 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2045 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2046 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2047 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2048 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2049 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2050 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2051 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2052 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2053 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2055 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Separate the interleaved UV bytes into one U and one V quadword.
2057 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2058 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2059 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2060 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2061 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2062 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2063 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2064 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2066 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2067 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2069 "addl $8, %%eax \n\t"
2070 "cmpl %4, %%eax \n\t"
2072 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// MMX pass 2: luma only; the high byte of every word is the Y sample.
2080 "xorl %%eax, %%eax \n\t"
2083 PREFETCH" 64(%0, %%eax, 4) \n\t"
2084 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2085 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2086 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2087 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2088 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2089 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2090 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2091 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2092 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2093 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2095 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2096 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2098 "addl $8, %%eax \n\t"
2099 "cmpl %4, %%eax \n\t"
2102 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// C pass 1: each 4-byte group U Y0 V Y1 yields two luma and one U/V sample.
2107 for (i=0; i<chromWidth; i++)
2109 udst[i] = src[4*i+0];
2110 ydst[2*i+0] = src[4*i+1];
2111 vdst[i] = src[4*i+2];
2112 ydst[2*i+1] = src[4*i+3];
// C pass 2: luma only, the chroma bytes are skipped.
2117 for (i=0; i<chromWidth; i++)
2119 ydst[2*i+0] = src[4*i+1];
2120 ydst[2*i+1] = src[4*i+3];
2123 udst += chromStride;
2124 vdst += chromStride;
2129 asm volatile( EMMS" \n\t"
2137 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2138 * problem for anyone then tell me, and I'll fix it)
2139 * chrominance data is only taken from every second line, others are ignored in the C version. FIXME: write HQ version
/*
 * Converts packed 24-bit pixels (byte order b, g, r as read by the C path)
 * into planar Y, U and V output with chroma at half width (chromWidth =
 * width / 2) and one chroma row per two source rows.
 * The MMX loop handles row pairs while y < height-2; the C loop that follows
 * continues from wherever y stands and finishes the remaining rows.
 * Coefficients and offsets (bgr2YCoeff, bgr2UCoeff, bgr2VCoeff, w1111,
 * bgr2YOffset, bgr2UVOffset, RY..BU, RGB2YUV_SHIFT) are defined elsewhere.
 */
2141 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2142 long width, long height,
2143 long lumStride, long chromStride, long srcStride)
2146 const long chromWidth= width>>1;
2148 for (y=0; y<height-2; y+=2)
// Luma pass: REG_a is loaded with -width and advances by 8 pixels per
// iteration; REG_d = 3*REG_a is the matching byte offset into src. Pointers
// are pre-biased to the row end (src+width*3, ydst+width).
2154 "mov %2, %%"REG_a" \n\t"
2155 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2156 "movq "MANGLE(w1111)", %%mm5 \n\t"
2157 "pxor %%mm7, %%mm7 \n\t"
2158 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
// First four pixels: widen bytes to words, multiply-accumulate with the Y
// coefficients, then fold the partial sums with w1111.
2161 PREFETCH" 64(%0, %%"REG_d") \n\t"
2162 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2163 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2164 "punpcklbw %%mm7, %%mm0 \n\t"
2165 "punpcklbw %%mm7, %%mm1 \n\t"
2166 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2167 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2168 "punpcklbw %%mm7, %%mm2 \n\t"
2169 "punpcklbw %%mm7, %%mm3 \n\t"
2170 "pmaddwd %%mm6, %%mm0 \n\t"
2171 "pmaddwd %%mm6, %%mm1 \n\t"
2172 "pmaddwd %%mm6, %%mm2 \n\t"
2173 "pmaddwd %%mm6, %%mm3 \n\t"
2174 #ifndef FAST_BGR2YV12
2175 "psrad $8, %%mm0 \n\t"
2176 "psrad $8, %%mm1 \n\t"
2177 "psrad $8, %%mm2 \n\t"
2178 "psrad $8, %%mm3 \n\t"
2180 "packssdw %%mm1, %%mm0 \n\t"
2181 "packssdw %%mm3, %%mm2 \n\t"
2182 "pmaddwd %%mm5, %%mm0 \n\t"
2183 "pmaddwd %%mm5, %%mm2 \n\t"
2184 "packssdw %%mm2, %%mm0 \n\t"
2185 "psraw $7, %%mm0 \n\t"
// Next four pixels, same sequence with mm4 as the accumulator.
2187 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2188 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2189 "punpcklbw %%mm7, %%mm4 \n\t"
2190 "punpcklbw %%mm7, %%mm1 \n\t"
2191 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2192 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2193 "punpcklbw %%mm7, %%mm2 \n\t"
2194 "punpcklbw %%mm7, %%mm3 \n\t"
2195 "pmaddwd %%mm6, %%mm4 \n\t"
2196 "pmaddwd %%mm6, %%mm1 \n\t"
2197 "pmaddwd %%mm6, %%mm2 \n\t"
2198 "pmaddwd %%mm6, %%mm3 \n\t"
2199 #ifndef FAST_BGR2YV12
2200 "psrad $8, %%mm4 \n\t"
2201 "psrad $8, %%mm1 \n\t"
2202 "psrad $8, %%mm2 \n\t"
2203 "psrad $8, %%mm3 \n\t"
2205 "packssdw %%mm1, %%mm4 \n\t"
2206 "packssdw %%mm3, %%mm2 \n\t"
2207 "pmaddwd %%mm5, %%mm4 \n\t"
2208 "pmaddwd %%mm5, %%mm2 \n\t"
2209 "add $24, %%"REG_d" \n\t"
2210 "packssdw %%mm2, %%mm4 \n\t"
2211 "psraw $7, %%mm4 \n\t"
// Pack the eight results to bytes, add the luma offset and store 8 Y samples.
2213 "packuswb %%mm4, %%mm0 \n\t"
2214 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2216 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2217 "add $8, %%"REG_a" \n\t"
2219 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2220 : "%"REG_a, "%"REG_d
// Chroma pass: REG_a is loaded with -chromWidth and advances by 4 samples per
// iteration; REG_d = 6*REG_a is the byte offset (two source pixels per
// chroma sample). %0 and %1 address two consecutive source rows.
2227 "mov %4, %%"REG_a" \n\t"
2228 "movq "MANGLE(w1111)", %%mm5 \n\t"
2229 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2230 "pxor %%mm7, %%mm7 \n\t"
2231 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2232 "add %%"REG_d", %%"REG_d" \n\t"
2235 PREFETCH" 64(%0, %%"REG_d") \n\t"
2236 PREFETCH" 64(%1, %%"REG_d") \n\t"
2237 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// PAVGB first averages the two rows, then averages that result with itself
// shifted right by one pixel (24 bits) to blend horizontal neighbours.
2238 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2239 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2240 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2241 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2242 PAVGB" %%mm1, %%mm0 \n\t"
2243 PAVGB" %%mm3, %%mm2 \n\t"
2244 "movq %%mm0, %%mm1 \n\t"
2245 "movq %%mm2, %%mm3 \n\t"
2246 "psrlq $24, %%mm0 \n\t"
2247 "psrlq $24, %%mm2 \n\t"
2248 PAVGB" %%mm1, %%mm0 \n\t"
2249 PAVGB" %%mm3, %%mm2 \n\t"
2250 "punpcklbw %%mm7, %%mm0 \n\t"
2251 "punpcklbw %%mm7, %%mm2 \n\t"
// Plain MMX variant: widen to 16 bit, sum each 2x2 pixel block and divide by
// four (psrlw $2).
2253 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2254 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2255 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2256 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2257 "punpcklbw %%mm7, %%mm0 \n\t"
2258 "punpcklbw %%mm7, %%mm1 \n\t"
2259 "punpcklbw %%mm7, %%mm2 \n\t"
2260 "punpcklbw %%mm7, %%mm3 \n\t"
2261 "paddw %%mm1, %%mm0 \n\t"
2262 "paddw %%mm3, %%mm2 \n\t"
2263 "paddw %%mm2, %%mm0 \n\t"
2264 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2265 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2266 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2267 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2268 "punpcklbw %%mm7, %%mm4 \n\t"
2269 "punpcklbw %%mm7, %%mm1 \n\t"
2270 "punpcklbw %%mm7, %%mm2 \n\t"
2271 "punpcklbw %%mm7, %%mm3 \n\t"
2272 "paddw %%mm1, %%mm4 \n\t"
2273 "paddw %%mm3, %%mm2 \n\t"
2274 "paddw %%mm4, %%mm2 \n\t"
2275 "psrlw $2, %%mm0 \n\t"
2276 "psrlw $2, %%mm2 \n\t"
// Apply the V coefficients (mm1/mm3) and the U coefficients (mm6) to the
// averaged pixels.
2278 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2279 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2281 "pmaddwd %%mm0, %%mm1 \n\t"
2282 "pmaddwd %%mm2, %%mm3 \n\t"
2283 "pmaddwd %%mm6, %%mm0 \n\t"
2284 "pmaddwd %%mm6, %%mm2 \n\t"
2285 #ifndef FAST_BGR2YV12
2286 "psrad $8, %%mm0 \n\t"
2287 "psrad $8, %%mm1 \n\t"
2288 "psrad $8, %%mm2 \n\t"
2289 "psrad $8, %%mm3 \n\t"
2291 "packssdw %%mm2, %%mm0 \n\t"
2292 "packssdw %%mm3, %%mm1 \n\t"
2293 "pmaddwd %%mm5, %%mm0 \n\t"
2294 "pmaddwd %%mm5, %%mm1 \n\t"
2295 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2296 "psraw $7, %%mm0 \n\t"
// Second pair of chroma samples (source bytes 12..23), same two variants.
2298 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2299 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2300 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2301 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2302 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2303 PAVGB" %%mm1, %%mm4 \n\t"
2304 PAVGB" %%mm3, %%mm2 \n\t"
2305 "movq %%mm4, %%mm1 \n\t"
2306 "movq %%mm2, %%mm3 \n\t"
2307 "psrlq $24, %%mm4 \n\t"
2308 "psrlq $24, %%mm2 \n\t"
2309 PAVGB" %%mm1, %%mm4 \n\t"
2310 PAVGB" %%mm3, %%mm2 \n\t"
2311 "punpcklbw %%mm7, %%mm4 \n\t"
2312 "punpcklbw %%mm7, %%mm2 \n\t"
2314 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2315 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2316 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2317 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2318 "punpcklbw %%mm7, %%mm4 \n\t"
2319 "punpcklbw %%mm7, %%mm1 \n\t"
2320 "punpcklbw %%mm7, %%mm2 \n\t"
2321 "punpcklbw %%mm7, %%mm3 \n\t"
2322 "paddw %%mm1, %%mm4 \n\t"
2323 "paddw %%mm3, %%mm2 \n\t"
2324 "paddw %%mm2, %%mm4 \n\t"
// mm5 is borrowed as a scratch register here and reloaded with w1111 below.
2325 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2326 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2327 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2328 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2329 "punpcklbw %%mm7, %%mm5 \n\t"
2330 "punpcklbw %%mm7, %%mm1 \n\t"
2331 "punpcklbw %%mm7, %%mm2 \n\t"
2332 "punpcklbw %%mm7, %%mm3 \n\t"
2333 "paddw %%mm1, %%mm5 \n\t"
2334 "paddw %%mm3, %%mm2 \n\t"
2335 "paddw %%mm5, %%mm2 \n\t"
2336 "movq "MANGLE(w1111)", %%mm5 \n\t"
2337 "psrlw $2, %%mm4 \n\t"
2338 "psrlw $2, %%mm2 \n\t"
2340 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2341 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2343 "pmaddwd %%mm4, %%mm1 \n\t"
2344 "pmaddwd %%mm2, %%mm3 \n\t"
2345 "pmaddwd %%mm6, %%mm4 \n\t"
2346 "pmaddwd %%mm6, %%mm2 \n\t"
2347 #ifndef FAST_BGR2YV12
2348 "psrad $8, %%mm4 \n\t"
2349 "psrad $8, %%mm1 \n\t"
2350 "psrad $8, %%mm2 \n\t"
2351 "psrad $8, %%mm3 \n\t"
2353 "packssdw %%mm2, %%mm4 \n\t"
2354 "packssdw %%mm3, %%mm1 \n\t"
2355 "pmaddwd %%mm5, %%mm4 \n\t"
2356 "pmaddwd %%mm5, %%mm1 \n\t"
2357 "add $24, %%"REG_d" \n\t"
2358 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2359 "psraw $7, %%mm4 \n\t"
// Regroup into four U and four V bytes, add the chroma offset and store
// 4 bytes to each plane.
2361 "movq %%mm0, %%mm1 \n\t"
2362 "punpckldq %%mm4, %%mm0 \n\t"
2363 "punpckhdq %%mm4, %%mm1 \n\t"
2364 "packsswb %%mm1, %%mm0 \n\t"
2365 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2366 "movd %%mm0, (%2, %%"REG_a") \n\t"
2367 "punpckhdq %%mm0, %%mm0 \n\t"
2368 "movd %%mm0, (%3, %%"REG_a") \n\t"
2369 "add $4, %%"REG_a" \n\t"
2371 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2372 : "%"REG_a, "%"REG_d
2375 udst += chromStride;
2376 vdst += chromStride;
2380 asm volatile( EMMS" \n\t"
// C conversion for the remaining rows, two rows per iteration.
2386 for (; y<height; y+=2)
// First loop of the pair: Y, V and U are computed from the pixel at
// src[6*i]; Y is computed a second time further down (presumably for the
// neighbouring pixel — the reload lines are not visible in this excerpt).
2389 for (i=0; i<chromWidth; i++)
2391 unsigned int b = src[6*i+0];
2392 unsigned int g = src[6*i+1];
2393 unsigned int r = src[6*i+2];
2395 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2396 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2397 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2407 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second loop of the pair: luma only, no U/V is computed.
2413 for (i=0; i<chromWidth; i++)
2415 unsigned int b = src[6*i+0];
2416 unsigned int g = src[6*i+1];
2417 unsigned int r = src[6*i+2];
2419 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2427 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2430 udst += chromStride;
2431 vdst += chromStride;
/*
 * Interleaves two byte planes: for every row and every w < width,
 * dest[2*w] = src1[w] and dest[2*w+1] = src2[w]. height rows are processed.
 * The SIMD variants handle 16 input bytes from each source per iteration
 * (loop bound width-15); a scalar loop starting at width & ~15 finishes the
 * row. A plain scalar loop over the full width is the non-SIMD variant.
 */
2437 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2438 long width, long height, long src1Stride,
2439 long src2Stride, long dstStride){
2442 for (h=0; h < height; h++)
// SSE2 variant: movdqa/movntdq are the aligned 16-byte forms.
// NOTE(review): confirm callers guarantee 16-byte aligned src1, src2 and dest rows.
2449 "xor %%"REG_a", %%"REG_a" \n\t"
2451 PREFETCH" 64(%1, %%"REG_a") \n\t"
2452 PREFETCH" 64(%2, %%"REG_a") \n\t"
2453 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2454 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2455 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2456 "punpcklbw %%xmm2, %%xmm0 \n\t"
2457 "punpckhbw %%xmm2, %%xmm1 \n\t"
2458 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2459 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2460 "add $16, %%"REG_a" \n\t"
2461 "cmp %3, %%"REG_a" \n\t"
2463 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2464 : "memory", "%"REG_a""
// MMX variant: two 8-byte loads per source, four 8-byte stores per iteration.
2468 "xor %%"REG_a", %%"REG_a" \n\t"
2470 PREFETCH" 64(%1, %%"REG_a") \n\t"
2471 PREFETCH" 64(%2, %%"REG_a") \n\t"
2472 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2473 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2474 "movq %%mm0, %%mm1 \n\t"
2475 "movq %%mm2, %%mm3 \n\t"
2476 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2477 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2478 "punpcklbw %%mm4, %%mm0 \n\t"
2479 "punpckhbw %%mm4, %%mm1 \n\t"
2480 "punpcklbw %%mm5, %%mm2 \n\t"
2481 "punpckhbw %%mm5, %%mm3 \n\t"
2482 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2483 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2484 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2485 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2486 "add $16, %%"REG_a" \n\t"
2487 "cmp %3, %%"REG_a" \n\t"
2489 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2490 : "memory", "%"REG_a
// Scalar tail for the columns past the last full group of 16.
2493 for (w= (width&(~15)); w < width; w++)
2495 dest[2*w+0] = src1[w];
2496 dest[2*w+1] = src2[w];
// Scalar variant covering the whole row.
2499 for (w=0; w < width; w++)
2501 dest[2*w+0] = src1[w];
2502 dest[2*w+1] = src2[w];
/*
 * Doubles two planes (src1 -> dst1, src2 -> dst2) in both directions:
 * output row y reads source row y>>1, and every source byte is written to
 * two adjacent output bytes (d[2*x] = d[2*x+1] = s[x]).
 * w = width/2 and h = height/2 are the working dimensions used by the loops.
 * The MMX blocks expand 32 source bytes into 64 output bytes at a time; the
 * scalar loops finish each row from the current x.
 */
2518 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2519 uint8_t *dst1, uint8_t *dst2,
2520 long width, long height,
2521 long srcStride1, long srcStride2,
2522 long dstStride1, long dstStride2)
2525 w=width/2; h=height/2;
2530 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// First plane.
2533 const uint8_t* s1=src1+srcStride1*(y>>1);
2534 uint8_t* d=dst1+dstStride1*y;
// punpck{l,h}bw of a register with itself duplicates every byte.
2540 PREFETCH" 32%1 \n\t"
2541 "movq %1, %%mm0 \n\t"
2542 "movq 8%1, %%mm2 \n\t"
2543 "movq 16%1, %%mm4 \n\t"
2544 "movq 24%1, %%mm6 \n\t"
2545 "movq %%mm0, %%mm1 \n\t"
2546 "movq %%mm2, %%mm3 \n\t"
2547 "movq %%mm4, %%mm5 \n\t"
2548 "movq %%mm6, %%mm7 \n\t"
2549 "punpcklbw %%mm0, %%mm0 \n\t"
2550 "punpckhbw %%mm1, %%mm1 \n\t"
2551 "punpcklbw %%mm2, %%mm2 \n\t"
2552 "punpckhbw %%mm3, %%mm3 \n\t"
2553 "punpcklbw %%mm4, %%mm4 \n\t"
2554 "punpckhbw %%mm5, %%mm5 \n\t"
2555 "punpcklbw %%mm6, %%mm6 \n\t"
2556 "punpckhbw %%mm7, %%mm7 \n\t"
2557 MOVNTQ" %%mm0, %0 \n\t"
2558 MOVNTQ" %%mm1, 8%0 \n\t"
2559 MOVNTQ" %%mm2, 16%0 \n\t"
2560 MOVNTQ" %%mm3, 24%0 \n\t"
2561 MOVNTQ" %%mm4, 32%0 \n\t"
2562 MOVNTQ" %%mm5, 40%0 \n\t"
2563 MOVNTQ" %%mm6, 48%0 \n\t"
2564 MOVNTQ" %%mm7, 56%0"
// Scalar remainder of the row.
2570 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// Second plane, same procedure.
2573 const uint8_t* s2=src2+srcStride2*(y>>1);
2574 uint8_t* d=dst2+dstStride2*y;
2580 PREFETCH" 32%1 \n\t"
2581 "movq %1, %%mm0 \n\t"
2582 "movq 8%1, %%mm2 \n\t"
2583 "movq 16%1, %%mm4 \n\t"
2584 "movq 24%1, %%mm6 \n\t"
2585 "movq %%mm0, %%mm1 \n\t"
2586 "movq %%mm2, %%mm3 \n\t"
2587 "movq %%mm4, %%mm5 \n\t"
2588 "movq %%mm6, %%mm7 \n\t"
2589 "punpcklbw %%mm0, %%mm0 \n\t"
2590 "punpckhbw %%mm1, %%mm1 \n\t"
2591 "punpcklbw %%mm2, %%mm2 \n\t"
2592 "punpckhbw %%mm3, %%mm3 \n\t"
2593 "punpcklbw %%mm4, %%mm4 \n\t"
2594 "punpckhbw %%mm5, %%mm5 \n\t"
2595 "punpcklbw %%mm6, %%mm6 \n\t"
2596 "punpckhbw %%mm7, %%mm7 \n\t"
2597 MOVNTQ" %%mm0, %0 \n\t"
2598 MOVNTQ" %%mm1, 8%0 \n\t"
2599 MOVNTQ" %%mm2, 16%0 \n\t"
2600 MOVNTQ" %%mm3, 24%0 \n\t"
2601 MOVNTQ" %%mm4, 32%0 \n\t"
2602 MOVNTQ" %%mm5, 40%0 \n\t"
2603 MOVNTQ" %%mm6, 48%0 \n\t"
2604 MOVNTQ" %%mm7, 56%0"
// Scalar remainder of the row.
2610 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Packs a luma plane (src1) and two chroma planes (src2, src3) into YUY2
 * rows. Chroma rows are indexed with y>>2, so one chroma row serves four
 * luma rows; horizontally each chroma sample is paired with four luma
 * samples (x2 = x<<2 in the C path, 8 output bytes per x).
 * w = width/2 and h = height are the working dimensions.
 */
2621 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2623 long width, long height,
2624 long srcStride1, long srcStride2,
2625 long srcStride3, long dstStride)
2628 w=width/2; h=height;
2630 const uint8_t* yp=src1+srcStride1*y;
2631 const uint8_t* up=src2+srcStride2*(y>>2);
2632 const uint8_t* vp=src3+srcStride3*(y>>2);
2633 uint8_t* d=dst+dstStride*y;
// MMX path: operand %0 is the chroma index. Each pass reads 8 U, 8 V and
// 32 Y bytes and writes 64 packed bytes at d + 8*index.
2639 PREFETCH" 32(%1, %0) \n\t"
2640 PREFETCH" 32(%2, %0) \n\t"
2641 PREFETCH" 32(%3, %0) \n\t"
2642 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2643 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2644 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2645 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2646 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2647 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2648 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2649 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2650 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2651 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2653 "movq %%mm1, %%mm6 \n\t"
2654 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2655 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2656 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2657 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2658 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2660 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2661 "movq 8(%1, %0, 4), %%mm0 \n\t"
2662 "movq %%mm0, %%mm3 \n\t"
2663 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2664 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2665 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2666 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2668 "movq %%mm4, %%mm6 \n\t"
2669 "movq 16(%1, %0, 4), %%mm0 \n\t"
2670 "movq %%mm0, %%mm3 \n\t"
2671 "punpcklbw %%mm5, %%mm4 \n\t"
2672 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2673 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2674 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2675 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2677 "punpckhbw %%mm5, %%mm6 \n\t"
2678 "movq 24(%1, %0, 4), %%mm0 \n\t"
2679 "movq %%mm0, %%mm3 \n\t"
2680 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2681 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2682 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2683 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2686 : "r"(yp), "r" (up), "r"(vp), "r"(d)
// Scalar path: the luma bytes land on the even offsets of each 8-byte group;
// only the stores at offsets 2, 4 and 6 are visible in this excerpt.
2692 const long x2 = x<<2;
2695 d[8*x+2] = yp[x2+1];
2697 d[8*x+4] = yp[x2+2];
2699 d[8*x+6] = yp[x2+3];
/*
 * Binds every converter entry point to this template instance's
 * implementation (the RENAME-expanded name). The left-hand names are
 * declared elsewhere, presumably as function pointers.
 * uyvytoyv12 and yvu9toyv12 are deliberately left unassigned: their lines
 * are commented out.
 */
2712 static inline void RENAME(rgb2rgb_init)(void){
// Packed RGB depth conversions.
2713 rgb15to16 = RENAME(rgb15to16);
2714 rgb15to24 = RENAME(rgb15to24);
2715 rgb15to32 = RENAME(rgb15to32);
2716 rgb16to24 = RENAME(rgb16to24);
2717 rgb16to32 = RENAME(rgb16to32);
2718 rgb16to15 = RENAME(rgb16to15);
2719 rgb24to16 = RENAME(rgb24to16);
2720 rgb24to15 = RENAME(rgb24to15);
2721 rgb24to32 = RENAME(rgb24to32);
2722 rgb32to16 = RENAME(rgb32to16);
2723 rgb32to15 = RENAME(rgb32to15);
2724 rgb32to24 = RENAME(rgb32to24);
// Conversions that also swap the channel order.
2725 rgb24tobgr15 = RENAME(rgb24tobgr15);
2726 rgb24tobgr16 = RENAME(rgb24tobgr16);
2727 rgb24tobgr24 = RENAME(rgb24tobgr24);
2728 rgb32tobgr32 = RENAME(rgb32tobgr32);
2729 rgb32tobgr16 = RENAME(rgb32tobgr16);
2730 rgb32tobgr15 = RENAME(rgb32tobgr15);
// Planar/packed YUV conversions and helpers.
2731 yv12toyuy2 = RENAME(yv12toyuy2);
2732 yv12touyvy = RENAME(yv12touyvy);
2733 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2734 yuy2toyv12 = RENAME(yuy2toyv12);
2735 // uyvytoyv12 = RENAME(uyvytoyv12);
2736 // yvu9toyv12 = RENAME(yvu9toyv12);
2737 planar2x = RENAME(planar2x);
2738 rgb24toyv12 = RENAME(rgb24toyv12);
2739 interleaveBytes = RENAME(interleaveBytes);
2740 vu9_to_vu12 = RENAME(vu9_to_vu12);
2741 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);