2 * software RGB to RGB converter
3 * software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
31 #include <inttypes.h> /* for __WORDSIZE */
34 // #warning You have a misconfigured system and will probably lose performance!
35 #define __WORDSIZE MP_WORDSIZE
/*
 * Instruction mnemonics that are spliced into the inline-asm strings of this
 * file (e.g. PREFETCH" %0").  Which set is active depends on preprocessor
 * conditionals that are only partly visible in this excerpt.
 */
53 #define PREFETCH "prefetch"
54 #define PREFETCHW "prefetchw"
55 #define PAVGB "pavgusb"
56 #elif defined (HAVE_MMX2)
/* HAVE_MMX2 variant of the prefetch mnemonics. */
57 #define PREFETCH "prefetchnta"
58 #define PREFETCHW "prefetcht0"
/* Fallback variant: the prefetch macros expand to the inert text " # nop". */
65 #define PREFETCH " # nop"
66 #define PREFETCHW " # nop"
71 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* Store and store-fence mnemonics used at the end of the MMX loops. */
78 #define MOVNTQ "movntq"
79 #define SFENCE "sfence"
/* Variant in which SFENCE expands to the inert text " # nop". */
82 #define SFENCE " # nop"
/*
 * rgb24to32: expands packed 3-byte pixels read from src into 4-byte pixels
 * written to dst.  src_size is the source length (presumably in bytes; the
 * loop bounds are not visible in this excerpt -- confirm).
 *
 * Visible MMX path: one iteration gathers eight 3-byte pixels with
 * movd/punpckldq at source offsets 0, 3, ..., 21, ANDs each register with
 * mask32 (preloaded into mm7) and stores the result with MOVNTQ.
 * SFENCE and EMMS are issued after the asm loop.
 *
 * NOTE(review): braces, loop headers and the scalar tail are missing from
 * this excerpt, so only the visible statements are described.
 */
85 static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size)
88 const uint8_t *s = src;
91 const uint8_t *mm_end;
/* Prefetch the start of the source, then keep mask32 resident in mm7. */
95 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
97 asm volatile("movq %0, %%mm7"::"m"(mask32):"memory");
/* Each movd/punpckldq pair packs two 3-byte pixels (plus one spill byte each) into one register. */
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
/* Apply mask32; presumably this clears the spill byte of every pixel -- TODO confirm against the mask32 definition. */
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
/* Store the converted pixels at destination offsets 0, 8, 16 (the next store is not visible here). */
114 MOVNTQ" %%mm0, %0 \n\t"
115 MOVNTQ" %%mm1, 8%0 \n\t"
116 MOVNTQ" %%mm2, 16%0 \n\t"
/* Fence the MOVNTQ stores and leave MMX state before any scalar code runs. */
124 asm volatile(SFENCE:::"memory");
125 asm volatile(EMMS:::"memory");
/* Byte order of the scalar tail depends on WORDS_BIGENDIAN (tail itself not visible here). */
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * rgb32to24: packs 4-byte pixels read from src into 3-byte pixels written
 * to dst.  src_size is the source length (presumably in bytes -- confirm).
 *
 * Visible MMX path: loads 32 source bytes into mm0/mm1/mm4/mm5, squeezes
 * each register with the mask24l/mask24h pair and then shifts the partial
 * results across registers so that they form contiguous output quadwords.
 * Per the visible input list, and assuming a single output operand %0,
 * the operands are %1 = *s, %2 = mask24l, %3 = mask24h, %4 = mask24hh,
 * %5 = mask24hhh, %6 = mask24hhhh.
 *
 * NOTE(review): braces, loop headers, the output operand list and the
 * scalar tail are missing from this excerpt.
 */
145 static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size)
148 const uint8_t *s = src;
151 const uint8_t *mm_end;
155 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
/* Load four quadwords of source pixels. */
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
/* Keep a copy of each quadword; the copies are shifted right by one byte. */
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
/* Originals are ANDed with %2, shifted copies with %3, then both halves are ORed together. */
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
/* Move the partial results across register boundaries to build contiguous output quadwords. */
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
/* Store the packed output (a further store is presumably on a line not shown). */
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores and leave MMX state. */
210 asm volatile(SFENCE:::"memory");
211 asm volatile(EMMS:::"memory");
/* Byte order of the scalar tail depends on WORDS_BIGENDIAN (tail itself not visible here). */
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
232 original by Strepto/Astral
233 ported to gcc & bugfixed: A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32-bit C version, and the and&add trick by Michael Niedermayer
/*
 * rgb15to16: rewrites 16-bit pixels so that bits 5..14 move up by one
 * position while bits 0..4 stay in place; bit 5 of the result is zero.
 *
 * The trick: (x & 0x7FFF) + (x & 0x7FE0) equals
 * (x & 0x1F) + 2 * (x & 0x7FE0), i.e. adding the upper ten bits to
 * themselves is a one-bit left shift of just those bits.
 *
 * Visible code paths:
 *  - MMX: 16 source bytes per iteration, pand with mask15s (mm4) followed by
 *    paddw with the unmasked copy (presumably the same and&add trick;
 *    mask15s is defined elsewhere -- confirm).
 *  - 32-bit C: two pixels per uint32_t access.
 *  - 16-bit C: one remaining pixel.
 *
 * NOTE(review): braces, loop headers and pointer increments are missing from
 * this excerpt.
 */
237 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
/* Prefetch the source and keep mask15s resident in mm4. */
245 asm volatile(PREFETCH" %0"::"m"(*s));
246 asm volatile("movq %0, %%mm4"::"m"(mask15s));
/* Load two quadwords, keep unmasked copies, mask, then add copy to masked value. */
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ" %%mm0, %0 \n\t"
/* Fence the MOVNTQ stores and leave MMX state. */
268 asm volatile(SFENCE:::"memory");
269 asm volatile(EMMS:::"memory");
/* Two pixels at once: the masks are the 16-bit constants replicated in both halves. */
274 register unsigned x= *((const uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* Single-pixel form of the same computation. */
281 register unsigned short x= *((const uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * rgb16to15: rewrites 16-bit pixels so that bits 6..15 move down by one
 * position (to bits 5..14) while bits 0..4 stay in place.  Input bit 5 is
 * discarded and output bit 15 is zero.
 *
 * Visible code paths:
 *  - MMX: 16 source bytes per iteration; the value shifted right by one is
 *    ANDed with mask15rg (mm7), the unshifted copy with mask15b (mm6), and
 *    the two are ORed.
 *  - 32-bit C: two pixels per uint32_t access.
 *  - 16-bit C: one remaining pixel.
 *
 * NOTE(review): braces, loop headers and pointer increments are missing from
 * this excerpt.
 */
286 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
/* Prefetch the source and keep both masks resident in mm7/mm6. */
294 asm volatile(PREFETCH" %0"::"m"(*s));
295 asm volatile("movq %0, %%mm7"::"m"(mask15rg));
296 asm volatile("movq %0, %%mm6"::"m"(mask15b));
/* mm0/mm2: shifted upper fields; mm1/mm3: untouched low field. */
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ" %%mm0, %0 \n\t"
/* Fence the MOVNTQ stores and leave MMX state. */
322 asm volatile(SFENCE:::"memory");
323 asm volatile(EMMS:::"memory");
/* Two pixels at once: the masks are the 16-bit constants replicated in both halves. */
328 register uint32_t x= *((const uint32_t*)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* Single-pixel form of the same computation. */
335 register uint16_t x= *((const uint16_t*)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * rgb32to16: reduces 32-bit pixels read from src to 16-bit pixels written
 * to dst.
 *
 * Scalar formula (last two visible lines): from each 32-bit word the top
 * 5 bits of bits 0..7 go to output bits 0..4, the top 6 bits of bits 8..15
 * go to output bits 5..10 and the top 5 bits of bits 16..23 go to output
 * bits 11..15.
 *
 * Two MMX variants are visible, selected by the "#if 1" below:
 *  - a pmaddwd based one using mask3216g, mask3216br and mul3216
 *    (loaded into mm5, mm6, mm7, assuming %0/%1 are the destination and
 *    source pointers as their use in "(%0)" and "(%1)" suggests);
 *  - a shift-and-mask one using red_16mask (mm7), green_16mask (mm6) and
 *    blue_16mask (memory operand %2).
 * Both handle four source pixels per iteration.
 *
 * NOTE(review): braces, loop headers, labels and the output operand lists
 * are missing from this excerpt.
 */
342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 const uint8_t *s = src;
347 const uint8_t *mm_end;
349 uint16_t *d = (uint16_t *)dst;
353 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* Variant 1: load the three constants once. */
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3. */
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
/* mm6-masked fields are combined by pmaddwd with mm7; the mm5-masked field is ORed in afterwards. */
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
/* Align the even pixels into the low halves and the odd pixels into the high halves, then merge. */
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* Variant 2: shift-and-mask implementation. */
389 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
/* Shift by 3 and mask with %2 (blue_16mask). */
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
/* Shift by 5 and mask with mm6 (green_16mask). */
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
/* Shift by 8 and mask with mm7 (red_16mask). */
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ" %%mm0, %0 \n\t"
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
430 asm volatile(SFENCE:::"memory");
431 asm volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 16-bit pixel out. */
435 register int rgb = *(const uint32_t*)s; s += 4;
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * rgb32tobgr16: reduces 32-bit pixels read from src to 16-bit pixels written
 * to dst, with the outer two fields swapped relative to rgb32to16.
 *
 * Scalar formula (last visible line): the top 5 bits of bits 0..7 go to
 * output bits 11..15, the top 6 bits of bits 8..15 go to output bits 5..10
 * and the top 5 bits of bits 16..23 go to output bits 0..4.
 *
 * Visible MMX path: four source pixels per iteration, using red_16mask
 * (mm7), green_16mask (mm6) and blue_16mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
440 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 const uint8_t *s = src;
445 const uint8_t *mm_end;
447 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
450 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
/* Shift left by 8 and mask with mm7 (red_16mask). */
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
/* Shift right by 5 and mask with mm6 (green_16mask). */
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
/* Shift right by 19 and mask with %2 (blue_16mask). */
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ" %%mm0, %0 \n\t"
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
491 asm volatile(SFENCE:::"memory");
492 asm volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 16-bit pixel out. */
496 register int rgb = *(const uint32_t*)s; s += 4;
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * rgb32to15: reduces 32-bit pixels read from src to 15-bit pixels (stored in
 * 16 bits) written to dst.
 *
 * Scalar formula (last two visible lines): the top 5 bits of bits 0..7 go to
 * output bits 0..4, the top 5 bits of bits 8..15 go to output bits 5..9 and
 * the top 5 bits of bits 16..23 go to output bits 10..14.
 *
 * Two MMX variants are visible, selected by the "#if 1" below:
 *  - a pmaddwd based one using mask3215g, mask3216br and mul3215 (the
 *    "3216" br mask is what the operand list names here);
 *  - a shift-and-mask one using red_15mask (mm7), green_15mask (mm6) and
 *    blue_15mask (memory operand %2).
 * Both handle four source pixels per iteration.
 *
 * NOTE(review): braces, loop headers, labels and the output operand lists
 * are missing from this excerpt.
 */
501 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 const uint8_t *s = src;
506 const uint8_t *mm_end;
508 uint16_t *d = (uint16_t *)dst;
512 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* Variant 1: load the three constants once. */
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3. */
520 PREFETCH" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
/* mm6-masked fields are combined by pmaddwd with mm7; the mm5-masked field is ORed in afterwards. */
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
/* Align the even pixels into the low halves and the odd pixels into the high halves, then merge. */
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ" %%mm0, (%0) \n\t"
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* Variant 2: shift-and-mask implementation. */
548 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
/* Shift by 3 and mask with %2 (blue_15mask). */
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
/* Shift by 6 and mask with mm6 (green_15mask). */
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
/* Shift by 9 and mask with mm7 (red_15mask). */
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ" %%mm0, %0 \n\t"
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
589 asm volatile(SFENCE:::"memory");
590 asm volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 15-bit pixel out. */
594 register int rgb = *(const uint32_t*)s; s += 4;
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * rgb32tobgr15: reduces 32-bit pixels read from src to 15-bit pixels (stored
 * in 16 bits) written to dst, with the outer two fields swapped relative to
 * rgb32to15.
 *
 * Scalar formula (last visible line): the top 5 bits of bits 0..7 go to
 * output bits 10..14, the top 5 bits of bits 8..15 go to output bits 5..9
 * and the top 5 bits of bits 16..23 go to output bits 0..4.
 *
 * Visible MMX path: four source pixels per iteration, using red_15mask
 * (mm7), green_15mask (mm6) and blue_15mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
599 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 const uint8_t *s = src;
604 const uint8_t *mm_end;
606 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
609 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
/* Shift left by 7 and mask with mm7 (red_15mask). */
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
/* Shift right by 6 and mask with mm6 (green_15mask). */
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
/* Shift right by 19 and mask with %2 (blue_15mask). */
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ" %%mm0, %0 \n\t"
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
650 asm volatile(SFENCE:::"memory");
651 asm volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 15-bit pixel out. */
655 register int rgb = *(const uint32_t*)s; s += 4;
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * rgb24to16: reduces packed 3-byte pixels read from src to 16-bit pixels
 * written to dst.
 *
 * Scalar formula (last visible line): b contributes its top 5 bits to output
 * bits 0..4, g its top 6 bits to bits 5..10 and r its top 5 bits to bits
 * 11..15.  The statements that read b, g and r from the source are on lines
 * not shown in this excerpt.
 *
 * Visible MMX path: four source pixels per iteration, fetched at byte
 * offsets 0, 3, 6 and 9, using red_16mask (mm7), green_16mask (mm6) and
 * blue_16mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
660 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662 const uint8_t *s = src;
665 const uint8_t *mm_end;
667 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
670 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
/* Shift by 3 and mask with %2 (blue_16mask). */
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
/* Shift by 5 and mask with mm6 (green_16mask). */
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
/* Shift by 8 and mask with mm7 (red_16mask). */
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ" %%mm0, %0 \n\t"
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
711 asm volatile(SFENCE:::"memory");
712 asm volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel as 5/6/5 bit fields. */
719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24tobgr16: reduces packed 3-byte pixels read from src to 16-bit pixels
 * written to dst, with the outer two fields swapped relative to rgb24to16.
 *
 * Scalar formula (last visible line): b contributes its top 5 bits to output
 * bits 0..4, g its top 6 bits to bits 5..10 and r its top 5 bits to bits
 * 11..15.  The expression is the same as in rgb24to16; presumably the
 * difference lies in the order in which r, g and b are read, which happens
 * on lines not shown in this excerpt -- TODO confirm.
 *
 * Visible MMX path: four source pixels per iteration, fetched at byte
 * offsets 0, 3, 6 and 9, using red_16mask (mm7), green_16mask (mm6) and
 * blue_16mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
723 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725 const uint8_t *s = src;
728 const uint8_t *mm_end;
730 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
733 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
/* Shift left by 8 and mask with mm7 (red_16mask). */
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
/* Shift right by 5 and mask with mm6 (green_16mask). */
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
/* Shift right by 19 and mask with %2 (blue_16mask). */
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ" %%mm0, %0 \n\t"
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
774 asm volatile(SFENCE:::"memory");
775 asm volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel as 5/6/5 bit fields. */
782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: reduces packed 3-byte pixels read from src to 15-bit pixels
 * (stored in 16 bits) written to dst.
 *
 * Scalar formula (last visible line): b contributes its top 5 bits to output
 * bits 0..4, g its top 5 bits to bits 5..9 and r its top 5 bits to bits
 * 10..14.  The statements that read b, g and r from the source are on lines
 * not shown in this excerpt.
 *
 * Visible MMX path: four source pixels per iteration, fetched at byte
 * offsets 0, 3, 6 and 9, using red_15mask (mm7), green_15mask (mm6) and
 * blue_15mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
786 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788 const uint8_t *s = src;
791 const uint8_t *mm_end;
793 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
796 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
/* Shift by 3 and mask with %2 (blue_15mask). */
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
/* Shift by 6 and mask with mm6 (green_15mask). */
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
/* Shift by 9 and mask with mm7 (red_15mask). */
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ" %%mm0, %0 \n\t"
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
837 asm volatile(SFENCE:::"memory");
838 asm volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel as 5/5/5 bit fields. */
845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24tobgr15: reduces packed 3-byte pixels read from src to 15-bit pixels
 * (stored in 16 bits) written to dst, with the outer two fields swapped
 * relative to rgb24to15.
 *
 * Scalar formula (last visible line): b contributes its top 5 bits to output
 * bits 0..4, g its top 5 bits to bits 5..9 and r its top 5 bits to bits
 * 10..14.  The expression is the same as in rgb24to15; presumably the
 * difference lies in the order in which r, g and b are read, which happens
 * on lines not shown in this excerpt -- TODO confirm.
 *
 * Visible MMX path: four source pixels per iteration, fetched at byte
 * offsets 0, 3, 6 and 9, using red_15mask (mm7), green_15mask (mm6) and
 * blue_15mask (memory operand %2).
 *
 * NOTE(review): braces and loop headers are missing from this excerpt.
 */
849 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851 const uint8_t *s = src;
854 const uint8_t *mm_end;
856 uint16_t *d = (uint16_t *)dst;
/* Prefetch the source and keep two of the masks resident in mm7/mm6. */
859 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels 0 and 2 go to mm0, pixels 1 and 3 to mm3; each gets two working copies. */
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
/* Shift left by 7 and mask with mm7 (red_15mask). */
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
/* Shift right by 6 and mask with mm6 (green_15mask). */
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
/* Shift right by 19 and mask with %2 (blue_15mask). */
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
/* Merge the three fields, then interleave odd pixels into the upper 16 bits. */
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ" %%mm0, %0 \n\t"
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores and leave MMX state. */
900 asm volatile(SFENCE:::"memory");
901 asm volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel as 5/5/5 bit fields. */
908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
913 I use a less accurate approximation here by simply left-shifting the input
914 value and filling the low order bits with zeroes. This method improves PNG
915 compression but this scheme cannot reproduce white exactly, since it does
916 not generate an all-ones maximum value; the net effect is to darken the
919 The better method should be "left bit replication":
929 | leftmost bits repeated to fill open bits
/*
 * rgb15to24: expands 15-bit pixels (stored in 16 bits) read from src into
 * 3-byte pixels written to dst.  src_size is in bytes: the end pointer is
 * computed as s + src_size/2 on a uint16_t pointer.
 *
 * Scalar formula (last three visible lines): bits 0..4, 5..9 and 10..14 of
 * the input are each written as one output byte with the 5-bit value placed
 * in the top 5 bits and the low 3 bits zero.  The load of bgr is on a line
 * not shown in this excerpt.
 *
 * Visible MMX path, two asm statements per iteration:
 *  1. Eight source pixels (two quadwords) are split with mask15b, mask15g
 *     and mask15r (%2, %3, %4), shifted to byte position, widened against
 *     mmx_null (%5) and merged into 32-bit pixels; the first four end up in
 *     mm6/mm7, the last four in mm0/mm3.
 *  2. The "borrowed 32 to 24" sequence packs those 32-bit pixels into
 *     contiguous 3-byte pixels using mask24l..mask24hhhh and stores 24
 *     bytes.
 *
 * NOTE(review): braces, loop headers and output operand lists are missing
 * from this excerpt.
 */
933 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
937 const uint16_t *mm_end;
940 const uint16_t *s = (const uint16_t*)src;
941 end = s + src_size/2;
943 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First quadword: isolate the three fields of four pixels. */
949 "movq %1, %%mm0 \n\t"
950 "movq %1, %%mm1 \n\t"
951 "movq %1, %%mm2 \n\t"
952 "pand %2, %%mm0 \n\t"
953 "pand %3, %%mm1 \n\t"
954 "pand %4, %%mm2 \n\t"
/* Move each field so that its value sits in the top bits of the low byte of its word. */
955 "psllq $3, %%mm0 \n\t"
956 "psrlq $2, %%mm1 \n\t"
957 "psrlq $7, %%mm2 \n\t"
958 "movq %%mm0, %%mm3 \n\t"
959 "movq %%mm1, %%mm4 \n\t"
960 "movq %%mm2, %%mm5 \n\t"
/* Widen 16-bit lanes to 32-bit lanes by interleaving with %5 (mmx_null). */
961 "punpcklwd %5, %%mm0 \n\t"
962 "punpcklwd %5, %%mm1 \n\t"
963 "punpcklwd %5, %%mm2 \n\t"
964 "punpckhwd %5, %%mm3 \n\t"
965 "punpckhwd %5, %%mm4 \n\t"
966 "punpckhwd %5, %%mm5 \n\t"
/* Place the second and third fields in bytes 1 and 2 and merge. */
967 "psllq $8, %%mm1 \n\t"
968 "psllq $16, %%mm2 \n\t"
969 "por %%mm1, %%mm0 \n\t"
970 "por %%mm2, %%mm0 \n\t"
971 "psllq $8, %%mm4 \n\t"
972 "psllq $16, %%mm5 \n\t"
973 "por %%mm4, %%mm3 \n\t"
974 "por %%mm5, %%mm3 \n\t"
/* Park the first four converted pixels in mm6/mm7. */
976 "movq %%mm0, %%mm6 \n\t"
977 "movq %%mm3, %%mm7 \n\t"
/* Second quadword: same sequence for the next four pixels. */
979 "movq 8%1, %%mm0 \n\t"
980 "movq 8%1, %%mm1 \n\t"
981 "movq 8%1, %%mm2 \n\t"
982 "pand %2, %%mm0 \n\t"
983 "pand %3, %%mm1 \n\t"
984 "pand %4, %%mm2 \n\t"
985 "psllq $3, %%mm0 \n\t"
986 "psrlq $2, %%mm1 \n\t"
987 "psrlq $7, %%mm2 \n\t"
988 "movq %%mm0, %%mm3 \n\t"
989 "movq %%mm1, %%mm4 \n\t"
990 "movq %%mm2, %%mm5 \n\t"
991 "punpcklwd %5, %%mm0 \n\t"
992 "punpcklwd %5, %%mm1 \n\t"
993 "punpcklwd %5, %%mm2 \n\t"
994 "punpckhwd %5, %%mm3 \n\t"
995 "punpckhwd %5, %%mm4 \n\t"
996 "punpckhwd %5, %%mm5 \n\t"
997 "psllq $8, %%mm1 \n\t"
998 "psllq $16, %%mm2 \n\t"
999 "por %%mm1, %%mm0 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psllq $8, %%mm4 \n\t"
1002 "psllq $16, %%mm5 \n\t"
1003 "por %%mm4, %%mm3 \n\t"
1004 "por %%mm5, %%mm3 \n\t"
1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1009 /* borrowed 32 to 24 */
/* Rearrange so that mm0, mm1, mm4, mm5 hold the eight 32-bit pixels in order. */
1011 "movq %%mm0, %%mm4 \n\t"
1012 "movq %%mm3, %%mm5 \n\t"
1013 "movq %%mm6, %%mm0 \n\t"
1014 "movq %%mm7, %%mm1 \n\t"
1016 "movq %%mm4, %%mm6 \n\t"
1017 "movq %%mm5, %%mm7 \n\t"
1018 "movq %%mm0, %%mm2 \n\t"
1019 "movq %%mm1, %%mm3 \n\t"
/* Squeeze each register: original masked with %2, byte-shifted copy masked with %3. */
1021 "psrlq $8, %%mm2 \n\t"
1022 "psrlq $8, %%mm3 \n\t"
1023 "psrlq $8, %%mm6 \n\t"
1024 "psrlq $8, %%mm7 \n\t"
1025 "pand %2, %%mm0 \n\t"
1026 "pand %2, %%mm1 \n\t"
1027 "pand %2, %%mm4 \n\t"
1028 "pand %2, %%mm5 \n\t"
1029 "pand %3, %%mm2 \n\t"
1030 "pand %3, %%mm3 \n\t"
1031 "pand %3, %%mm6 \n\t"
1032 "pand %3, %%mm7 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "por %%mm3, %%mm1 \n\t"
1035 "por %%mm6, %%mm4 \n\t"
1036 "por %%mm7, %%mm5 \n\t"
/* Shift partial results across registers to form three contiguous output quadwords. */
1038 "movq %%mm1, %%mm2 \n\t"
1039 "movq %%mm4, %%mm3 \n\t"
1040 "psllq $48, %%mm2 \n\t"
1041 "psllq $32, %%mm3 \n\t"
1042 "pand %4, %%mm2 \n\t"
1043 "pand %5, %%mm3 \n\t"
1044 "por %%mm2, %%mm0 \n\t"
1045 "psrlq $16, %%mm1 \n\t"
1046 "psrlq $32, %%mm4 \n\t"
1047 "psllq $16, %%mm5 \n\t"
1048 "por %%mm3, %%mm1 \n\t"
1049 "pand %6, %%mm5 \n\t"
1050 "por %%mm5, %%mm4 \n\t"
/* Store 24 output bytes. */
1052 MOVNTQ" %%mm0, %0 \n\t"
1053 MOVNTQ" %%mm1, 8%0 \n\t"
1054 MOVNTQ" %%mm4, 16%0"
1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores and leave MMX state. */
1062 asm volatile(SFENCE:::"memory");
1063 asm volatile(EMMS:::"memory");
/* Scalar tail: one 15-bit pixel in, three bytes out (low 3 bits of each byte are zero). */
1067 register uint16_t bgr;
1069 *d++ = (bgr&0x1F)<<3;
1070 *d++ = (bgr&0x3E0)>>2;
1071 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to24: expands 16-bit pixels read from src into 3-byte pixels written
 * to dst.  src_size is in bytes: the end pointer is computed as
 * s + src_size/2 on a uint16_t pointer.
 *
 * Scalar formula (last three visible lines): bits 0..4 (5 bits), 5..10
 * (6 bits) and 11..15 (5 bits) of the input are each written as one output
 * byte with the field placed in the top bits and the remaining low bits
 * zero.  The load of bgr is on a line not shown in this excerpt.
 *
 * Visible MMX path, two asm statements per iteration:
 *  1. Eight source pixels (two quadwords) are split with mask16b, mask16g
 *     and mask16r (%2, %3, %4), shifted to byte position, widened against
 *     mmx_null (%5) and merged into 32-bit pixels; the first four end up in
 *     mm6/mm7, the last four in mm0/mm3.
 *  2. The "borrowed 32 to 24" sequence packs those 32-bit pixels into
 *     contiguous 3-byte pixels using mask24l..mask24hhhh and stores 24
 *     bytes.
 *
 * NOTE(review): braces, loop headers and output operand lists are missing
 * from this excerpt.
 */
1075 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1077 const uint16_t *end;
1079 const uint16_t *mm_end;
1081 uint8_t *d = (uint8_t *)dst;
1082 const uint16_t *s = (const uint16_t *)src;
1083 end = s + src_size/2;
1085 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First quadword: isolate the three fields of four pixels. */
1090 PREFETCH" 32%1 \n\t"
1091 "movq %1, %%mm0 \n\t"
1092 "movq %1, %%mm1 \n\t"
1093 "movq %1, %%mm2 \n\t"
1094 "pand %2, %%mm0 \n\t"
1095 "pand %3, %%mm1 \n\t"
1096 "pand %4, %%mm2 \n\t"
/* Move each field so that its value sits in the top bits of the low byte of its word. */
1097 "psllq $3, %%mm0 \n\t"
1098 "psrlq $3, %%mm1 \n\t"
1099 "psrlq $8, %%mm2 \n\t"
1100 "movq %%mm0, %%mm3 \n\t"
1101 "movq %%mm1, %%mm4 \n\t"
1102 "movq %%mm2, %%mm5 \n\t"
/* Widen 16-bit lanes to 32-bit lanes by interleaving with %5 (mmx_null). */
1103 "punpcklwd %5, %%mm0 \n\t"
1104 "punpcklwd %5, %%mm1 \n\t"
1105 "punpcklwd %5, %%mm2 \n\t"
1106 "punpckhwd %5, %%mm3 \n\t"
1107 "punpckhwd %5, %%mm4 \n\t"
1108 "punpckhwd %5, %%mm5 \n\t"
/* Place the second and third fields in bytes 1 and 2 and merge. */
1109 "psllq $8, %%mm1 \n\t"
1110 "psllq $16, %%mm2 \n\t"
1111 "por %%mm1, %%mm0 \n\t"
1112 "por %%mm2, %%mm0 \n\t"
1113 "psllq $8, %%mm4 \n\t"
1114 "psllq $16, %%mm5 \n\t"
1115 "por %%mm4, %%mm3 \n\t"
1116 "por %%mm5, %%mm3 \n\t"
/* Park the first four converted pixels in mm6/mm7. */
1118 "movq %%mm0, %%mm6 \n\t"
1119 "movq %%mm3, %%mm7 \n\t"
/* Second quadword: same sequence for the next four pixels. */
1121 "movq 8%1, %%mm0 \n\t"
1122 "movq 8%1, %%mm1 \n\t"
1123 "movq 8%1, %%mm2 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %3, %%mm1 \n\t"
1126 "pand %4, %%mm2 \n\t"
1127 "psllq $3, %%mm0 \n\t"
1128 "psrlq $3, %%mm1 \n\t"
1129 "psrlq $8, %%mm2 \n\t"
1130 "movq %%mm0, %%mm3 \n\t"
1131 "movq %%mm1, %%mm4 \n\t"
1132 "movq %%mm2, %%mm5 \n\t"
1133 "punpcklwd %5, %%mm0 \n\t"
1134 "punpcklwd %5, %%mm1 \n\t"
1135 "punpcklwd %5, %%mm2 \n\t"
1136 "punpckhwd %5, %%mm3 \n\t"
1137 "punpckhwd %5, %%mm4 \n\t"
1138 "punpckhwd %5, %%mm5 \n\t"
1139 "psllq $8, %%mm1 \n\t"
1140 "psllq $16, %%mm2 \n\t"
1141 "por %%mm1, %%mm0 \n\t"
1142 "por %%mm2, %%mm0 \n\t"
1143 "psllq $8, %%mm4 \n\t"
1144 "psllq $16, %%mm5 \n\t"
1145 "por %%mm4, %%mm3 \n\t"
1146 "por %%mm5, %%mm3 \n\t"
1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1150 /* borrowed 32 to 24 */
/* Rearrange so that mm0, mm1, mm4, mm5 hold the eight 32-bit pixels in order. */
1152 "movq %%mm0, %%mm4 \n\t"
1153 "movq %%mm3, %%mm5 \n\t"
1154 "movq %%mm6, %%mm0 \n\t"
1155 "movq %%mm7, %%mm1 \n\t"
1157 "movq %%mm4, %%mm6 \n\t"
1158 "movq %%mm5, %%mm7 \n\t"
1159 "movq %%mm0, %%mm2 \n\t"
1160 "movq %%mm1, %%mm3 \n\t"
/* Squeeze each register: original masked with %2, byte-shifted copy masked with %3. */
1162 "psrlq $8, %%mm2 \n\t"
1163 "psrlq $8, %%mm3 \n\t"
1164 "psrlq $8, %%mm6 \n\t"
1165 "psrlq $8, %%mm7 \n\t"
1166 "pand %2, %%mm0 \n\t"
1167 "pand %2, %%mm1 \n\t"
1168 "pand %2, %%mm4 \n\t"
1169 "pand %2, %%mm5 \n\t"
1170 "pand %3, %%mm2 \n\t"
1171 "pand %3, %%mm3 \n\t"
1172 "pand %3, %%mm6 \n\t"
1173 "pand %3, %%mm7 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "por %%mm3, %%mm1 \n\t"
1176 "por %%mm6, %%mm4 \n\t"
1177 "por %%mm7, %%mm5 \n\t"
/* Shift partial results across registers to form three contiguous output quadwords. */
1179 "movq %%mm1, %%mm2 \n\t"
1180 "movq %%mm4, %%mm3 \n\t"
1181 "psllq $48, %%mm2 \n\t"
1182 "psllq $32, %%mm3 \n\t"
1183 "pand %4, %%mm2 \n\t"
1184 "pand %5, %%mm3 \n\t"
1185 "por %%mm2, %%mm0 \n\t"
1186 "psrlq $16, %%mm1 \n\t"
1187 "psrlq $32, %%mm4 \n\t"
1188 "psllq $16, %%mm5 \n\t"
1189 "por %%mm3, %%mm1 \n\t"
1190 "pand %6, %%mm5 \n\t"
1191 "por %%mm5, %%mm4 \n\t"
/* Store 24 output bytes. */
1193 MOVNTQ" %%mm0, %0 \n\t"
1194 MOVNTQ" %%mm1, 8%0 \n\t"
1195 MOVNTQ" %%mm4, 16%0"
1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores and leave MMX state. */
1203 asm volatile(SFENCE:::"memory");
1204 asm volatile(EMMS:::"memory");
/* Scalar tail: one 16-bit pixel in, three bytes out (unused low bits of each byte are zero). */
1208 register uint16_t bgr;
1210 *d++ = (bgr&0x1F)<<3;
1211 *d++ = (bgr&0x7E0)>>3;
1212 *d++ = (bgr&0xF800)>>8;
/*
 * rgb15to32: expands 15-bit pixels (stored in 16 bits) read from src into
 * 4-byte pixels written to dst.  src_size is in bytes: the end pointer is
 * computed as s + src_size/2 on a uint16_t pointer.
 *
 * Scalar path: bits 0..4, 5..9 and 10..14 of the input each become one
 * output byte with the value in the top 5 bits.  Under WORDS_BIGENDIAN the
 * three bytes are written in the opposite order.  Only three of the four
 * byte stores are visible; the fourth is presumably on a line not shown --
 * TODO confirm.  A disabled "#if 0" variant builds the whole pixel with a
 * single 32-bit store.
 *
 * Visible MMX path: four source pixels per iteration are split with
 * mask15b/mask15g/mask15r (%2, %3, %4), widened against a zeroed mm7 and
 * merged into two quadwords of 32-bit pixels.
 *
 * NOTE(review): braces, loop headers and the output operand list are
 * missing from this excerpt.
 */
1216 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1218 const uint16_t *end;
1220 const uint16_t *mm_end;
1223 const uint16_t *s = (const uint16_t *)src;
1224 end = s + src_size/2;
/* Prefetch the source and zero mm7 for use as the widening operand. */
1226 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1227 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
/* Isolate the three fields of four pixels. */
1232 PREFETCH" 32%1 \n\t"
1233 "movq %1, %%mm0 \n\t"
1234 "movq %1, %%mm1 \n\t"
1235 "movq %1, %%mm2 \n\t"
1236 "pand %2, %%mm0 \n\t"
1237 "pand %3, %%mm1 \n\t"
1238 "pand %4, %%mm2 \n\t"
/* Move each field so that its value sits in the top bits of the low byte of its word. */
1239 "psllq $3, %%mm0 \n\t"
1240 "psrlq $2, %%mm1 \n\t"
1241 "psrlq $7, %%mm2 \n\t"
1242 "movq %%mm0, %%mm3 \n\t"
1243 "movq %%mm1, %%mm4 \n\t"
1244 "movq %%mm2, %%mm5 \n\t"
/* Widen 16-bit lanes to 32-bit lanes by interleaving with zero. */
1245 "punpcklwd %%mm7, %%mm0 \n\t"
1246 "punpcklwd %%mm7, %%mm1 \n\t"
1247 "punpcklwd %%mm7, %%mm2 \n\t"
1248 "punpckhwd %%mm7, %%mm3 \n\t"
1249 "punpckhwd %%mm7, %%mm4 \n\t"
1250 "punpckhwd %%mm7, %%mm5 \n\t"
/* Place the second and third fields in bytes 1 and 2 and merge. */
1251 "psllq $8, %%mm1 \n\t"
1252 "psllq $16, %%mm2 \n\t"
1253 "por %%mm1, %%mm0 \n\t"
1254 "por %%mm2, %%mm0 \n\t"
1255 "psllq $8, %%mm4 \n\t"
1256 "psllq $16, %%mm5 \n\t"
1257 "por %%mm4, %%mm3 \n\t"
1258 "por %%mm5, %%mm3 \n\t"
/* Store four 32-bit pixels. */
1259 MOVNTQ" %%mm0, %0 \n\t"
1260 MOVNTQ" %%mm3, 8%0 \n\t"
1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
/* Fence the MOVNTQ stores and leave MMX state. */
1267 asm volatile(SFENCE:::"memory");
1268 asm volatile(EMMS:::"memory");
/* Disabled single-store variant of the scalar tail. */
1272 #if 0 //slightly slower on Athlon
1274 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
/* Active scalar tail: byte-wise stores, order chosen by WORDS_BIGENDIAN. */
1276 register uint16_t bgr;
1278 #ifdef WORDS_BIGENDIAN
1280 *d++ = (bgr&0x7C00)>>7;
1281 *d++ = (bgr&0x3E0)>>2;
1282 *d++ = (bgr&0x1F)<<3;
1284 *d++ = (bgr&0x1F)<<3;
1285 *d++ = (bgr&0x3E0)>>2;
1286 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to32: expands 16-bit pixels read from src into 4-byte pixels written
 * to dst.  src_size is in bytes: the end pointer is computed as
 * s + src_size/2 on a uint16_t pointer.
 *
 * Scalar path: bits 0..4 (5 bits), 5..10 (6 bits) and 11..15 (5 bits) of
 * the input each become one output byte with the field in the top bits.
 * Under WORDS_BIGENDIAN the three bytes are written in the opposite order.
 * Only three of the four byte stores are visible; the fourth is presumably
 * on a line not shown -- TODO confirm.
 *
 * Visible MMX path: four source pixels per iteration are split with
 * mask16b/mask16g/mask16r (%2, %3, %4), widened against a zeroed mm7 and
 * merged into two quadwords of 32-bit pixels.
 *
 * NOTE(review): braces, loop headers and the output operand list are
 * missing from this excerpt.
 */
1294 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1296 const uint16_t *end;
1298 const uint16_t *mm_end;
1301 const uint16_t *s = (const uint16_t*)src;
1302 end = s + src_size/2;
/* Prefetch the source and zero mm7 for use as the widening operand. */
1304 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1305 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
/* Isolate the three fields of four pixels. */
1310 PREFETCH" 32%1 \n\t"
1311 "movq %1, %%mm0 \n\t"
1312 "movq %1, %%mm1 \n\t"
1313 "movq %1, %%mm2 \n\t"
1314 "pand %2, %%mm0 \n\t"
1315 "pand %3, %%mm1 \n\t"
1316 "pand %4, %%mm2 \n\t"
/* Move each field so that its value sits in the top bits of the low byte of its word. */
1317 "psllq $3, %%mm0 \n\t"
1318 "psrlq $3, %%mm1 \n\t"
1319 "psrlq $8, %%mm2 \n\t"
1320 "movq %%mm0, %%mm3 \n\t"
1321 "movq %%mm1, %%mm4 \n\t"
1322 "movq %%mm2, %%mm5 \n\t"
/* Widen 16-bit lanes to 32-bit lanes by interleaving with zero. */
1323 "punpcklwd %%mm7, %%mm0 \n\t"
1324 "punpcklwd %%mm7, %%mm1 \n\t"
1325 "punpcklwd %%mm7, %%mm2 \n\t"
1326 "punpckhwd %%mm7, %%mm3 \n\t"
1327 "punpckhwd %%mm7, %%mm4 \n\t"
1328 "punpckhwd %%mm7, %%mm5 \n\t"
/* Place the second and third fields in bytes 1 and 2 and merge. */
1329 "psllq $8, %%mm1 \n\t"
1330 "psllq $16, %%mm2 \n\t"
1331 "por %%mm1, %%mm0 \n\t"
1332 "por %%mm2, %%mm0 \n\t"
1333 "psllq $8, %%mm4 \n\t"
1334 "psllq $16, %%mm5 \n\t"
1335 "por %%mm4, %%mm3 \n\t"
1336 "por %%mm5, %%mm3 \n\t"
/* Store four 32-bit pixels. */
1337 MOVNTQ" %%mm0, %0 \n\t"
1338 MOVNTQ" %%mm3, 8%0 \n\t"
1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
/* Fence the MOVNTQ stores and leave MMX state. */
1345 asm volatile(SFENCE:::"memory");
1346 asm volatile(EMMS:::"memory");
/* Scalar tail: byte-wise stores, order chosen by WORDS_BIGENDIAN. */
1350 register uint16_t bgr;
1352 #ifdef WORDS_BIGENDIAN
1354 *d++ = (bgr&0xF800)>>8;
1355 *d++ = (bgr&0x7E0)>>3;
1356 *d++ = (bgr&0x1F)<<3;
1358 *d++ = (bgr&0x1F)<<3;
1359 *d++ = (bgr&0x7E0)>>3;
1360 *d++ = (bgr&0xF800)>>8;
/*
 * rgb32tobgr32: copies 4-byte pixels from src to dst while exchanging the
 * bytes at bits 0..7 and bits 16..23 of every 32-bit word; the bytes at
 * bits 8..15 and 24..31 keep their position.
 *
 * Indexing scheme: idx starts at 15 - src_size and s/d are biased by -idx,
 * so s[idx] initially addresses src[0] and idx == 15 corresponds to the end
 * of the source (src_size bytes).  The scalar loop therefore runs
 * "for (; idx<15; idx+=4)".
 *
 * Visible MMX path: 16 bytes per iteration.  mm6/mm7 hold complementary
 * masks derived from mask32b, mask32r and mmx_one (%3, %4, %5, assuming the
 * single output operand %0 is the index as its use in "(%1, %0)" suggests).
 * Two alternative swap sequences are visible (a pshufw based one and a
 * shift based one); the preprocessor lines selecting between them are not
 * shown in this excerpt.
 *
 * NOTE(review): braces, labels, the loop branch and the output operand list
 * are missing from this excerpt.
 */
1366 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1368 long idx = 15 - src_size;
1369 const uint8_t *s = src-idx;
1370 uint8_t *d = dst-idx;
/* Build the two masks: mm6 = %3 ^ %4, mm7 = mm6 ^ %5. */
1375 PREFETCH" (%1, %0) \n\t"
1376 "movq %3, %%mm7 \n\t"
1377 "pxor %4, %%mm7 \n\t"
1378 "movq %%mm7, %%mm6 \n\t"
1379 "pxor %5, %%mm7 \n\t"
/* Load four pixels. */
1382 PREFETCH" 32(%1, %0) \n\t"
1383 "movq (%1, %0), %%mm0 \n\t"
1384 "movq 8(%1, %0), %%mm1 \n\t"
/* Variant A: pshufw $177 exchanges the two 16-bit halves of each pixel; masks pick bytes from each version. */
1386 "pshufw $177, %%mm0, %%mm3 \n\t"
1387 "pshufw $177, %%mm1, %%mm5 \n\t"
1388 "pand %%mm7, %%mm0 \n\t"
1389 "pand %%mm6, %%mm3 \n\t"
1390 "pand %%mm7, %%mm1 \n\t"
1391 "pand %%mm6, %%mm5 \n\t"
1392 "por %%mm3, %%mm0 \n\t"
1393 "por %%mm5, %%mm1 \n\t"
/* Variant B: the same exchange done with 16-bit left/right shifts of the mm6-masked bytes. */
1395 "movq %%mm0, %%mm2 \n\t"
1396 "movq %%mm1, %%mm4 \n\t"
1397 "pand %%mm7, %%mm0 \n\t"
1398 "pand %%mm6, %%mm2 \n\t"
1399 "pand %%mm7, %%mm1 \n\t"
1400 "pand %%mm6, %%mm4 \n\t"
1401 "movq %%mm2, %%mm3 \n\t"
1402 "movq %%mm4, %%mm5 \n\t"
1403 "pslld $16, %%mm2 \n\t"
1404 "psrld $16, %%mm3 \n\t"
1405 "pslld $16, %%mm4 \n\t"
1406 "psrld $16, %%mm5 \n\t"
1407 "por %%mm2, %%mm0 \n\t"
1408 "por %%mm4, %%mm1 \n\t"
1409 "por %%mm3, %%mm0 \n\t"
1410 "por %%mm5, %%mm1 \n\t"
/* Store four converted pixels. */
1412 MOVNTQ" %%mm0, (%2, %0) \n\t"
1413 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1420 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* Scalar tail: g keeps the bytes that stay in place; v>>16 and v<<16 exchange the other two. */
/* NOTE(review): a statement between these lines is not shown; presumably it reduces v to the two bytes being exchanged -- confirm. */
1423 for (; idx<15; idx+=4) {
1424 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1426 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * rgb24tobgr24: copies 3-byte pixels from src to dst while exchanging the
 * first and third byte of every pixel; the middle byte keeps its position.
 * src_size is in bytes (the scalar loop indexes bytes with i += 3).
 *
 * Visible MMX path: the counter in REG_a starts at mmx_size = 23 - src_size
 * and is advanced by 24 per iteration, with the buffer pointers biased by
 * -mmx_size.  Each iteration converts 24 bytes (8 pixels) using overlapping
 * unaligned loads and the three rotating masks mask24r/mask24g/mask24b
 * (mm5/mm6/mm7).
 *
 * NOTE(review): braces, labels, branches, the output operand list and some
 * scalar statements are missing from this excerpt.
 */
1430 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1434 long mmx_size= 23 - src_size;
/* Test the initial counter before entering the loop (the branch itself is not shown). */
1436 "test %%"REG_a", %%"REG_a" \n\t"
1438 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1439 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1440 "movq "MANGLE(mask24b)", %%mm7 \n\t"
/* Output bytes 0..7: combine three differently aligned views of the source. */
1443 PREFETCH" 32(%1, %%"REG_a") \n\t"
1444 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1445 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1446 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1447 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1448 "pand %%mm5, %%mm0 \n\t"
1449 "pand %%mm6, %%mm1 \n\t"
1450 "pand %%mm7, %%mm2 \n\t"
1451 "por %%mm0, %%mm1 \n\t"
1452 "por %%mm2, %%mm1 \n\t"
/* Output bytes 8..15: same idea with the masks rotated. */
1453 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1454 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1455 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1456 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1457 "pand %%mm7, %%mm0 \n\t"
1458 "pand %%mm5, %%mm1 \n\t"
1459 "pand %%mm6, %%mm2 \n\t"
1460 "por %%mm0, %%mm1 \n\t"
1461 "por %%mm2, %%mm1 \n\t"
/* Output bytes 16..23: masks rotated once more. */
1462 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1463 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1464 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1465 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1466 "pand %%mm6, %%mm0 \n\t"
1467 "pand %%mm7, %%mm1 \n\t"
1468 "pand %%mm5, %%mm2 \n\t"
1469 "por %%mm0, %%mm1 \n\t"
1470 "por %%mm2, %%mm1 \n\t"
1471 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
/* Advance by one 24-byte group. */
1472 "add $24, %%"REG_a" \n\t"
1476 : "r" (src-mmx_size), "r"(dst-mmx_size)
/* Fence the MOVNTQ stores and leave MMX state. */
1479 asm volatile(SFENCE:::"memory");
1480 asm volatile(EMMS:::"memory");
/* NOTE(review): this relies on the asm updating mmx_size through REG_a; the output operand is not visible here. */
1482 if (mmx_size==23) return; //finished, src_size was a multiple of 8 pixels (24 bytes)
/* Bytes still to be handled by the scalar loop. */
1486 src_size= 23-mmx_size;
/* Scalar tail: exchange byte 0 and byte 2 of each pixel (the load into the temporary and the dst[i + 0] store are on lines not shown). */
1490 for (i=0; i<src_size; i+=3)
1494 dst[i + 1] = src[i + 1];
1495 dst[i + 2] = src[i + 0];
/*
 * Interleave separate Y, U and V planes into packed 4:2:2 output with byte
 * order Y0 U0 Y1 V0 (YUY2).
 *
 * ysrc, usrc, vsrc  source luma / chroma planes
 * dst               packed destination
 * width, height     picture size in luma samples; chromWidth is width>>1
 * lumStride, chromStride, dstStride
 *                   per-row byte strides of the luma, chroma and output buffers
 * vertLumPerChroma  number of luma rows that share one chroma row. The row test
 *                   at the end of the loop uses it as a bit mask, so it only
 *                   behaves like a modulo when the value is a power of two
 *                   (the callers in this file pass 1 and 2).
 *
 * Several implementations follow (MMX asm, Alpha MVI, 64-bit C, 32-bit C); the
 * preprocessor lines that choose between them are only partly visible here.
 */
1500 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1501 long width, long height,
1502 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1505 const long chromWidth= width>>1;
1506 for (y=0; y<height; y++)
1509 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
// REG_a is the chroma index: it starts at 0, advances by 8 per pass and is
// compared against chromWidth (%4). Each pass reads 8 U, 8 V and 16 Y bytes
// and stores 32 output bytes.
1511 "xor %%"REG_a", %%"REG_a" \n\t"
1514 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1515 PREFETCH" 32(%2, %%"REG_a") \n\t"
1516 PREFETCH" 32(%3, %%"REG_a") \n\t"
1517 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1518 "movq %%mm0, %%mm2 \n\t" // U(0)
1519 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1520 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1521 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1523 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1524 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1525 "movq %%mm3, %%mm4 \n\t" // Y(0)
1526 "movq %%mm5, %%mm6 \n\t" // Y(8)
1527 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1528 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1529 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1530 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1532 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1533 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1534 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1535 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1537 "add $8, %%"REG_a" \n\t"
1538 "cmp %4, %%"REG_a" \n\t"
1540 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
// Alpha MVI variant: works on two output rows at once (dst and dst + dstStride,
// ysrc and ysrc + lumStride), 8 chroma samples per loop pass.
1545 #if defined ARCH_ALPHA && defined HAVE_MVI
1546 #define pl2yuy2(n) \
1551 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1552 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1553 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1554 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1555 yuv1 = (u << 8) + (v << 24); \
1562 uint64_t *qdst = (uint64_t *) dst;
1563 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1564 const uint32_t *yc = (uint32_t *) ysrc;
1565 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1566 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1567 for (i = 0; i < chromWidth; i += 8){
1568 uint64_t y1, y2, yuv1, yuv2;
// NOTE(review): loads into $31 presumably serve as prefetch hints - confirm.
1571 asm("ldq $31,64(%0)" :: "r"(yc));
1572 asm("ldq $31,64(%0)" :: "r"(yc2));
1573 asm("ldq $31,64(%0)" :: "r"(uc));
1574 asm("ldq $31,64(%0)" :: "r"(vc));
// Generic 64-bit C variant: two Y-U-Y-V groups are packed into one 64-bit store.
1592 #elif __WORDSIZE >= 64
1594 uint64_t *ldst = (uint64_t *) dst;
1595 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1596 for (i = 0; i < chromWidth; i += 2){
// The byte shifted into bits 24..31 is cast to unsigned first. As a plain int
// a value >= 128 overflows, turns negative and is sign-extended when stored in
// the 64-bit temporary, so the following k + (l << 32) corrupted the upper
// pixel pair.
1598 k = yc[0] + (uc[0] << 8) +
1599 (yc[1] << 16) + ((unsigned)vc[0] << 24);
1600 l = yc[2] + (uc[1] << 8) +
1601 (yc[3] << 16) + ((unsigned)vc[1] << 24);
1602 *ldst++ = k + (l << 32);
// Generic 32-bit C variant: one Y-U-Y-V group per 32-bit store, with the byte
// order chosen by WORDS_BIGENDIAN. The top byte is again shifted as unsigned
// to avoid signed overflow.
1609 int i, *idst = (int32_t *) dst;
1610 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1611 for (i = 0; i < chromWidth; i++){
1612 #ifdef WORDS_BIGENDIAN
1613 *idst++ = ((unsigned)yc[0] << 24)+ (uc[0] << 16) +
1614 (yc[1] << 8) + (vc[0] << 0);
1616 *idst++ = yc[0] + (uc[0] << 8) +
1617 (yc[1] << 16) + ((unsigned)vc[0] << 24);
// Step to the next chroma row only after the last luma row of each group of
// vertLumPerChroma rows.
1625 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1627 usrc += chromStride;
1628 vsrc += chromStride;
1641 * Height should be a multiple of 2 and width should be a multiple of 16.
1642 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Planar-to-YUY2 wrapper: forwards every argument to yuvPlanartoyuy2 with
 * vertLumPerChroma = 2, i.e. one chroma row is reused for two luma rows.
 */
1644 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1645 long width, long height,
1646 long lumStride, long chromStride, long dstStride)
1648 //FIXME interpolate chroma
1649 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave separate Y, U and V planes into packed 4:2:2 output with byte
 * order U0 Y0 V0 Y1 (UYVY).
 *
 * ysrc, usrc, vsrc  source luma / chroma planes
 * dst               packed destination
 * width, height     picture size in luma samples; chromWidth is width>>1
 * lumStride, chromStride, dstStride
 *                   per-row byte strides (only chromStride is used in the
 *                   lines visible here)
 * vertLumPerChroma  number of luma rows that share one chroma row. It is used
 *                   as a bit mask in the row test below, so it only behaves
 *                   like a modulo for powers of two.
 *
 * An MMX asm variant and two plain C variants (64-bit and 32-bit stores)
 * follow; the preprocessor lines selecting them are only partly visible here.
 */
1652 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1653 long width, long height,
1654 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1657 const long chromWidth= width>>1;
1658 for (y=0; y<height; y++)
1661 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
// REG_a is the chroma index: it starts at 0, advances by 8 per pass and is
// compared against chromWidth (%4). Each pass reads 8 U, 8 V and 16 Y bytes
// and stores 32 output bytes.
1663 "xor %%"REG_a", %%"REG_a" \n\t"
1666 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1667 PREFETCH" 32(%2, %%"REG_a") \n\t"
1668 PREFETCH" 32(%3, %%"REG_a") \n\t"
1669 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1670 "movq %%mm0, %%mm2 \n\t" // U(0)
1671 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1672 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1673 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1675 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1676 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1677 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1678 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1679 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1680 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1681 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1682 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1684 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1685 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1686 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1687 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1689 "add $8, %%"REG_a" \n\t"
1690 "cmp %4, %%"REG_a" \n\t"
1692 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1696 //FIXME adapt the Alpha ASM code from yv12->yuy2
// Generic 64-bit C variant: two U-Y-V-Y groups are packed into one 64-bit store.
1698 #if __WORDSIZE >= 64
1700 uint64_t *ldst = (uint64_t *) dst;
1701 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1702 for (i = 0; i < chromWidth; i += 2){
// The luma byte shifted into bits 24..31 is cast to unsigned first. As a plain
// int any luma value >= 128 overflows, turns negative and is sign-extended
// when stored in the 64-bit temporary, so k + (l << 32) corrupted the upper
// pixel pair.
1704 k = uc[0] + (yc[0] << 8) +
1705 (vc[0] << 16) + ((unsigned)yc[1] << 24);
1706 l = uc[1] + (yc[2] << 8) +
1707 (vc[1] << 16) + ((unsigned)yc[3] << 24);
1708 *ldst++ = k + (l << 32);
// Generic 32-bit C variant: one U-Y-V-Y group per 32-bit store, with the byte
// order chosen by WORDS_BIGENDIAN. The top byte is again shifted as unsigned
// to avoid signed overflow.
1715 int i, *idst = (int32_t *) dst;
1716 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1717 for (i = 0; i < chromWidth; i++){
1718 #ifdef WORDS_BIGENDIAN
1719 *idst++ = ((unsigned)uc[0] << 24)+ (yc[0] << 16) +
1720 (vc[0] << 8) + (yc[1] << 0);
1722 *idst++ = uc[0] + (yc[0] << 8) +
1723 (vc[0] << 16) + ((unsigned)yc[1] << 24);
// Step to the next chroma row only after the last luma row of each group of
// vertLumPerChroma rows.
1731 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1733 usrc += chromStride;
1734 vsrc += chromStride;
1747 * Height should be a multiple of 2 and width should be a multiple of 16
1748 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Planar-to-UYVY wrapper: forwards every argument to yuvPlanartouyvy with
 * vertLumPerChroma = 2, i.e. one chroma row is reused for two luma rows.
 */
1750 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751 long width, long height,
1752 long lumStride, long chromStride, long dstStride)
1754 //FIXME interpolate chroma
1755 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1759 * Width should be a multiple of 16.
/*
 * Planar-to-YUY2 wrapper: forwards every argument to yuvPlanartoyuy2 with
 * vertLumPerChroma = 1, i.e. every luma row has its own chroma row.
 */
1761 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1762 long width, long height,
1763 long lumStride, long chromStride, long dstStride)
1765 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1769 * Height should be a multiple of 2 and width should be a multiple of 16.
1770 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Split packed YUYV input (byte order Y0 U0 Y1 V0) into separate Y, U and V
 * planes. Rows are handled in pairs (y advances by 2): the first row of a pair
 * yields luma and chroma, the second row yields luma only, and the chroma
 * destination pointers advance by chromStride once per pair.
 *
 * src               packed source
 * ydst, udst, vdst  destination planes
 * width, height     picture size in luma samples; chromWidth is width>>1
 * lumStride, chromStride, srcStride
 *                   per-row byte strides (only chromStride is used in the
 *                   lines visible here)
 *
 * An MMX asm variant and a plain C variant follow; the preprocessor lines that
 * choose between them are not visible here.
 */
1772 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1773 long width, long height,
1774 long lumStride, long chromStride, long srcStride)
1777 const long chromWidth= width>>1;
1778 for (y=0; y<height; y+=2)
// First row of the pair: REG_a is the chroma index (0, 8, 16, ... compared
// against chromWidth). mm7 is built as a per-word 0x00FF mask. Each pass reads
// 32 packed bytes and stores 16 Y, 8 U and 8 V bytes.
1782 "xor %%"REG_a", %%"REG_a" \n\t"
1783 "pcmpeqw %%mm7, %%mm7 \n\t"
1784 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1787 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1788 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1789 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1790 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1791 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1792 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1793 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1794 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1795 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1796 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1797 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1799 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1801 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1802 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1803 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1804 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1805 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1806 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1807 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1808 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1809 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1810 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1812 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// Separate the interleaved UV bytes: high byte of each word is V, low byte is U.
1814 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1815 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1816 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1817 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1818 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1819 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1820 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1821 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1823 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1824 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1826 "add $8, %%"REG_a" \n\t"
1827 "cmp %4, %%"REG_a" \n\t"
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830 : "memory", "%"REG_a
// Second row of the pair: luma only, chroma bytes are masked away.
// NOTE(review): mm7 is not set up again in the visible lines of this block, so
// it presumably still holds the 0x00FF mask from the block above - confirm.
1837 "xor %%"REG_a", %%"REG_a" \n\t"
1840 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1841 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1842 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1843 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1844 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1845 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1846 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1847 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1848 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1849 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1850 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1852 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1853 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1855 "add $8, %%"REG_a" \n\t"
1856 "cmp %4, %%"REG_a" \n\t"
1859 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1860 : "memory", "%"REG_a
// Plain C variant, first row of the pair: copy both luma samples and the
// U and V sample of every 4-byte group.
1864 for (i=0; i<chromWidth; i++)
1866 ydst[2*i+0] = src[4*i+0];
1867 udst[i] = src[4*i+1];
1868 ydst[2*i+1] = src[4*i+2];
1869 vdst[i] = src[4*i+3];
// Plain C variant, second row of the pair: luma only.
1874 for (i=0; i<chromWidth; i++)
1876 ydst[2*i+0] = src[4*i+0];
1877 ydst[2*i+1] = src[4*i+2];
// One chroma row has been produced for this pair of luma rows.
1880 udst += chromStride;
1881 vdst += chromStride;
// Leave MMX state after the row loop.
1886 asm volatile( EMMS" \n\t"
/*
 * Incomplete conversion: only the luma plane is copied; the chroma planes are
 * left untouched (see the XXX note below).
 *
 * NOTE(review): the copy is a single memcpy of width*height bytes, so in the
 * lines visible here lumStride is ignored and both luma buffers are treated
 * as tightly packed; usrc, vsrc, udst, vdst and chromStride are not used.
 */
1892 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1893 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1894 long width, long height, long lumStride, long chromStride)
1897 memcpy(ydst, ysrc, width*height);
1899 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane to twice its width and height. Output samples are built
 * from neighbouring source samples with 3:1 weights ((3*a + b) >> 2).
 *
 * src, dst            source and destination planes
 * srcWidth, srcHeight source size in samples
 * srcStride, dstStride per-row byte strides
 *
 * Layout of the body: a horizontal-only pass for the first output row, a loop
 * over source rows 1..srcHeight-1 that blends each row with its neighbour,
 * and a horizontal-only pass for the last output row. Pointer updates between
 * these steps are not visible here.
 */
1902 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
// First output row: interpolate horizontally only.
1909 for (x=0; x<srcWidth-1; x++){
1910 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1911 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1913 dst[2*srcWidth-1]= src[srcWidth-1];
1917 for (y=1; y<srcHeight; y++){
1918 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// The asm covers srcWidth rounded down to a multiple of 16; the C loop below
// continues from mmxSize-1.
1919 const long mmxSize= srcWidth&~15;
1921 "mov %4, %%"REG_a" \n\t"
// %0 / %1 are the two source rows, %2 / %3 the two destination rows.
1923 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1924 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1925 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1926 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1927 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1928 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// Averaging twice with the same operand approximates a 3:1 blend
// (avg(avg(a, b), b) is roughly (a + 3*b) / 4, up to rounding).
1929 PAVGB" %%mm0, %%mm5 \n\t"
1930 PAVGB" %%mm0, %%mm3 \n\t"
1931 PAVGB" %%mm0, %%mm5 \n\t"
1932 PAVGB" %%mm0, %%mm3 \n\t"
1933 PAVGB" %%mm1, %%mm4 \n\t"
1934 PAVGB" %%mm1, %%mm2 \n\t"
1935 PAVGB" %%mm1, %%mm4 \n\t"
1936 PAVGB" %%mm1, %%mm2 \n\t"
// Interleave the left-weighted and right-weighted results into output order.
1937 "movq %%mm5, %%mm7 \n\t"
1938 "movq %%mm4, %%mm6 \n\t"
1939 "punpcklbw %%mm3, %%mm5 \n\t"
1940 "punpckhbw %%mm3, %%mm7 \n\t"
1941 "punpcklbw %%mm2, %%mm4 \n\t"
1942 "punpckhbw %%mm2, %%mm6 \n\t"
// NOTE(review): the MOVNTQ group and the movq group below store the same
// registers to the same addresses; they are presumably alternatives selected
// by preprocessor lines that are not visible here - confirm.
1944 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1945 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1946 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1947 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1949 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1950 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1951 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1952 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1954 "add $8, %%"REG_a" \n\t"
// Row pointers are passed already offset by mmxSize.
1956 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1957 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// Without the asm path the C loop starts at x = 0 (mmxSize-1).
1963 const long mmxSize=1;
// Left edge: vertical blend only.
1965 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1966 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
// Remaining columns: each output sample blends one sample of this row with the
// diagonal neighbour in the next row.
1968 for (x=mmxSize-1; x<srcWidth-1; x++){
1969 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1970 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1971 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1972 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
// Right edge: vertical blend only.
1974 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1975 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// Last output row: horizontal interpolation only, as for the first row.
1985 for (x=0; x<srcWidth-1; x++){
1986 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1987 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1989 dst[2*srcWidth-1]= src[srcWidth-1];
1991 for (x=0; x<srcWidth; x++){
// Leave MMX state.
1998 asm volatile( EMMS" \n\t"
2005 * Height should be a multiple of 2 and width should be a multiple of 16.
2006 * (If this is a problem for anyone then tell me, and I will fix it.)
2007 * Chrominance data is only taken from every second line, others are ignored.
2008 * FIXME: Write HQ version.
/*
 * Split packed UYVY input (byte order U0 Y0 V0 Y1) into separate Y, U and V
 * planes. Rows are handled in pairs (y advances by 2): the first row of a pair
 * yields luma and chroma, the second row yields luma only, and the chroma
 * destination pointers advance by chromStride once per pair.
 *
 * src               packed source
 * ydst, udst, vdst  destination planes
 * width, height     picture size in luma samples; chromWidth is width>>1
 * lumStride, chromStride, srcStride
 *                   per-row byte strides (only chromStride is used in the
 *                   lines visible here)
 *
 * The asm index register is now spelled REG_a with unsuffixed xor/add/cmp,
 * exactly as in yuy2toyv12. The previous hard-coded %eax with xorl/addl/cmpl
 * mixed a 32-bit index with pointer-sized base registers and a long operand,
 * which cannot be assembled on x86_64.
 * NOTE(review): the clobber lists of these asm statements are not visible in
 * this view; confirm they name the same register.
 */
2010 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2011 long width, long height,
2012 long lumStride, long chromStride, long srcStride)
2015 const long chromWidth= width>>1;
2016 for (y=0; y<height; y+=2)
// First row of the pair: REG_a is the chroma index (0, 8, 16, ... compared
// against chromWidth). mm7 is built as a per-word 0x00FF mask. Each pass reads
// 32 packed bytes and stores 16 Y, 8 U and 8 V bytes.
2020 "xor %%"REG_a", %%"REG_a" \n\t"
2021 "pcmpeqw %%mm7, %%mm7 \n\t"
2022 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2025 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2026 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2027 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2028 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2029 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2030 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2031 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2032 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2033 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2034 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2035 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2037 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2039 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2040 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2041 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2042 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2043 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2044 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2045 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2046 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2047 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2048 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2050 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// Separate the interleaved UV bytes: high byte of each word is V, low byte is U.
2052 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2053 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2054 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2055 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2056 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2057 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2058 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2059 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2061 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2062 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2064 "add $8, %%"REG_a" \n\t"
2065 "cmp %4, %%"REG_a" \n\t"
2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// Second row of the pair: luma only (high byte of every word).
2075 "xor %%"REG_a", %%"REG_a" \n\t"
2078 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2079 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2080 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2081 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
2082 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
2083 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2084 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2085 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2086 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2087 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2088 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2090 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2091 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2093 "add $8, %%"REG_a" \n\t"
2094 "cmp %4, %%"REG_a" \n\t"
2097 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// Plain C variant, first row of the pair: copy U, V and both luma samples of
// every 4-byte group.
2102 for (i=0; i<chromWidth; i++)
2104 udst[i] = src[4*i+0];
2105 ydst[2*i+0] = src[4*i+1];
2106 vdst[i] = src[4*i+2];
2107 ydst[2*i+1] = src[4*i+3];
// Plain C variant, second row of the pair: luma only.
2112 for (i=0; i<chromWidth; i++)
2114 ydst[2*i+0] = src[4*i+1];
2115 ydst[2*i+1] = src[4*i+3];
// One chroma row has been produced for this pair of luma rows.
2118 udst += chromStride;
2119 vdst += chromStride;
// Leave MMX state after the row loop.
2124 asm volatile( EMMS" \n\t"
2131 * Height should be a multiple of 2 and width should be a multiple of 2.
2132 * (If this is a problem for anyone then tell me, and I will fix it.)
2133 * Chrominance data is only taken from every second line,
2134 * others are ignored in the C version.
2135 * FIXME: Write HQ version.
/*
 * Convert packed 24-bit pixels (bytes read as B, G, R) into planar Y, U and V
 * with one chroma sample per 2x2 pixel block.
 *
 * src               packed source, 3 bytes per pixel
 * ydst, udst, vdst  destination planes
 * width, height     picture size in pixels; chromWidth is width>>1
 * lumStride, chromStride, srcStride
 *                   per-row byte strides
 *
 * The asm loop handles rows while y < height-2; the C loop that follows
 * continues from the current y up to height. Both step y by 2. The
 * preprocessor lines guarding the asm part are not visible here.
 */
2137 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2138 long width, long height,
2139 long lumStride, long chromStride, long srcStride)
2142 const long chromWidth= width>>1;
2144 for (y=0; y<height-2; y+=2)
// Luma: REG_a starts at -width (%2) and counts upward in steps of 8 pixels;
// REG_d = 3*REG_a is the matching byte offset into the source row. The base
// pointers are passed already advanced to the end of the row.
2150 "mov %2, %%"REG_a" \n\t"
2151 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2152 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2153 "pxor %%mm7, %%mm7 \n\t"
2154 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
// Pixels 0..3: load 4 bytes at each pixel start, widen bytes to words, multiply
// by the Y coefficients and sum.
2157 PREFETCH" 64(%0, %%"REG_d") \n\t"
2158 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2159 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2160 "punpcklbw %%mm7, %%mm0 \n\t"
2161 "punpcklbw %%mm7, %%mm1 \n\t"
2162 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2163 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2164 "punpcklbw %%mm7, %%mm2 \n\t"
2165 "punpcklbw %%mm7, %%mm3 \n\t"
2166 "pmaddwd %%mm6, %%mm0 \n\t"
2167 "pmaddwd %%mm6, %%mm1 \n\t"
2168 "pmaddwd %%mm6, %%mm2 \n\t"
2169 "pmaddwd %%mm6, %%mm3 \n\t"
2170 #ifndef FAST_BGR2YV12
2171 "psrad $8, %%mm0 \n\t"
2172 "psrad $8, %%mm1 \n\t"
2173 "psrad $8, %%mm2 \n\t"
2174 "psrad $8, %%mm3 \n\t"
2176 "packssdw %%mm1, %%mm0 \n\t"
2177 "packssdw %%mm3, %%mm2 \n\t"
2178 "pmaddwd %%mm5, %%mm0 \n\t"
2179 "pmaddwd %%mm5, %%mm2 \n\t"
2180 "packssdw %%mm2, %%mm0 \n\t"
2181 "psraw $7, %%mm0 \n\t"
// Pixels 4..7: same steps, result in mm4.
2183 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2184 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2185 "punpcklbw %%mm7, %%mm4 \n\t"
2186 "punpcklbw %%mm7, %%mm1 \n\t"
2187 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2188 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2189 "punpcklbw %%mm7, %%mm2 \n\t"
2190 "punpcklbw %%mm7, %%mm3 \n\t"
2191 "pmaddwd %%mm6, %%mm4 \n\t"
2192 "pmaddwd %%mm6, %%mm1 \n\t"
2193 "pmaddwd %%mm6, %%mm2 \n\t"
2194 "pmaddwd %%mm6, %%mm3 \n\t"
2195 #ifndef FAST_BGR2YV12
2196 "psrad $8, %%mm4 \n\t"
2197 "psrad $8, %%mm1 \n\t"
2198 "psrad $8, %%mm2 \n\t"
2199 "psrad $8, %%mm3 \n\t"
2201 "packssdw %%mm1, %%mm4 \n\t"
2202 "packssdw %%mm3, %%mm2 \n\t"
2203 "pmaddwd %%mm5, %%mm4 \n\t"
2204 "pmaddwd %%mm5, %%mm2 \n\t"
2205 "add $24, %%"REG_d" \n\t"
2206 "packssdw %%mm2, %%mm4 \n\t"
2207 "psraw $7, %%mm4 \n\t"
// Pack the 8 results to bytes, add the luma offset and store them.
2209 "packuswb %%mm4, %%mm0 \n\t"
2210 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2212 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2213 "add $8, %%"REG_a" \n\t"
2215 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2216 : "%"REG_a, "%"REG_d
// Chroma: REG_a starts at -chromWidth (%4) and counts upward in steps of 4
// chroma samples; REG_d = 6*REG_a is the byte offset (two pixels per chroma
// sample). %0 and %1 are two source rows one srcStride apart.
2223 "mov %4, %%"REG_a" \n\t"
2224 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2225 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2226 "pxor %%mm7, %%mm7 \n\t"
2227 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2228 "add %%"REG_d", %%"REG_d" \n\t"
2231 PREFETCH" 64(%0, %%"REG_d") \n\t"
2232 PREFETCH" 64(%1, %%"REG_d") \n\t"
2233 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// With PAVGB: average the two rows, then average each pixel with its right
// neighbour (shift by 24 bits = one pixel).
2234 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2235 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2236 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2237 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2238 PAVGB" %%mm1, %%mm0 \n\t"
2239 PAVGB" %%mm3, %%mm2 \n\t"
2240 "movq %%mm0, %%mm1 \n\t"
2241 "movq %%mm2, %%mm3 \n\t"
2242 "psrlq $24, %%mm0 \n\t"
2243 "psrlq $24, %%mm2 \n\t"
2244 PAVGB" %%mm1, %%mm0 \n\t"
2245 PAVGB" %%mm3, %%mm2 \n\t"
2246 "punpcklbw %%mm7, %%mm0 \n\t"
2247 "punpcklbw %%mm7, %%mm2 \n\t"
// Without PAVGB: widen the four pixels of each 2x2 block to words, add them
// and divide by 4.
2249 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2250 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2251 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2252 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2253 "punpcklbw %%mm7, %%mm0 \n\t"
2254 "punpcklbw %%mm7, %%mm1 \n\t"
2255 "punpcklbw %%mm7, %%mm2 \n\t"
2256 "punpcklbw %%mm7, %%mm3 \n\t"
2257 "paddw %%mm1, %%mm0 \n\t"
2258 "paddw %%mm3, %%mm2 \n\t"
2259 "paddw %%mm2, %%mm0 \n\t"
2260 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2261 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2262 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2263 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2264 "punpcklbw %%mm7, %%mm4 \n\t"
2265 "punpcklbw %%mm7, %%mm1 \n\t"
2266 "punpcklbw %%mm7, %%mm2 \n\t"
2267 "punpcklbw %%mm7, %%mm3 \n\t"
2268 "paddw %%mm1, %%mm4 \n\t"
2269 "paddw %%mm3, %%mm2 \n\t"
2270 "paddw %%mm4, %%mm2 \n\t"
2271 "psrlw $2, %%mm0 \n\t"
2272 "psrlw $2, %%mm2 \n\t"
// Apply the V coefficients (mm1, mm3) and the U coefficients (mm6) to the
// averaged pixels.
2274 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2275 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2277 "pmaddwd %%mm0, %%mm1 \n\t"
2278 "pmaddwd %%mm2, %%mm3 \n\t"
2279 "pmaddwd %%mm6, %%mm0 \n\t"
2280 "pmaddwd %%mm6, %%mm2 \n\t"
2281 #ifndef FAST_BGR2YV12
2282 "psrad $8, %%mm0 \n\t"
2283 "psrad $8, %%mm1 \n\t"
2284 "psrad $8, %%mm2 \n\t"
2285 "psrad $8, %%mm3 \n\t"
2287 "packssdw %%mm2, %%mm0 \n\t"
2288 "packssdw %%mm3, %%mm1 \n\t"
2289 "pmaddwd %%mm5, %%mm0 \n\t"
2290 "pmaddwd %%mm5, %%mm1 \n\t"
2291 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2292 "psraw $7, %%mm0 \n\t"
// Second half of the pass: the next two 2x2 blocks, same steps, result in mm4.
2294 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2295 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2296 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2297 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2298 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2299 PAVGB" %%mm1, %%mm4 \n\t"
2300 PAVGB" %%mm3, %%mm2 \n\t"
2301 "movq %%mm4, %%mm1 \n\t"
2302 "movq %%mm2, %%mm3 \n\t"
2303 "psrlq $24, %%mm4 \n\t"
2304 "psrlq $24, %%mm2 \n\t"
2305 PAVGB" %%mm1, %%mm4 \n\t"
2306 PAVGB" %%mm3, %%mm2 \n\t"
2307 "punpcklbw %%mm7, %%mm4 \n\t"
2308 "punpcklbw %%mm7, %%mm2 \n\t"
2310 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2311 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2312 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2313 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2314 "punpcklbw %%mm7, %%mm4 \n\t"
2315 "punpcklbw %%mm7, %%mm1 \n\t"
2316 "punpcklbw %%mm7, %%mm2 \n\t"
2317 "punpcklbw %%mm7, %%mm3 \n\t"
2318 "paddw %%mm1, %%mm4 \n\t"
2319 "paddw %%mm3, %%mm2 \n\t"
2320 "paddw %%mm2, %%mm4 \n\t"
// mm5 is used as scratch here and reloaded with ff_w1111 a few lines below.
2321 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2322 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2323 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2324 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2325 "punpcklbw %%mm7, %%mm5 \n\t"
2326 "punpcklbw %%mm7, %%mm1 \n\t"
2327 "punpcklbw %%mm7, %%mm2 \n\t"
2328 "punpcklbw %%mm7, %%mm3 \n\t"
2329 "paddw %%mm1, %%mm5 \n\t"
2330 "paddw %%mm3, %%mm2 \n\t"
2331 "paddw %%mm5, %%mm2 \n\t"
2332 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2333 "psrlw $2, %%mm4 \n\t"
2334 "psrlw $2, %%mm2 \n\t"
2336 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2337 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2339 "pmaddwd %%mm4, %%mm1 \n\t"
2340 "pmaddwd %%mm2, %%mm3 \n\t"
2341 "pmaddwd %%mm6, %%mm4 \n\t"
2342 "pmaddwd %%mm6, %%mm2 \n\t"
2343 #ifndef FAST_BGR2YV12
2344 "psrad $8, %%mm4 \n\t"
2345 "psrad $8, %%mm1 \n\t"
2346 "psrad $8, %%mm2 \n\t"
2347 "psrad $8, %%mm3 \n\t"
2349 "packssdw %%mm2, %%mm4 \n\t"
2350 "packssdw %%mm3, %%mm1 \n\t"
2351 "pmaddwd %%mm5, %%mm4 \n\t"
2352 "pmaddwd %%mm5, %%mm1 \n\t"
2353 "add $24, %%"REG_d" \n\t"
2354 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2355 "psraw $7, %%mm4 \n\t"
// Regroup into four U and four V values, pack to bytes, add the chroma offset
// and store 4 bytes to udst (%2) and 4 bytes to vdst (%3).
2357 "movq %%mm0, %%mm1 \n\t"
2358 "punpckldq %%mm4, %%mm0 \n\t"
2359 "punpckhdq %%mm4, %%mm1 \n\t"
2360 "packsswb %%mm1, %%mm0 \n\t"
2361 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2362 "movd %%mm0, (%2, %%"REG_a") \n\t"
2363 "punpckhdq %%mm0, %%mm0 \n\t"
2364 "movd %%mm0, (%3, %%"REG_a") \n\t"
2365 "add $4, %%"REG_a" \n\t"
2367 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2368 : "%"REG_a, "%"REG_d
2371 udst += chromStride;
2372 vdst += chromStride;
// Leave MMX state after the asm row loop.
2376 asm volatile( EMMS" \n\t"
// Plain C rows: continue from the current y (the rows the asm loop left over,
// or all rows when the asm part is compiled out - guarding lines not visible).
2382 for (; y<height; y+=2)
// First row of the pair: Y for both pixels of each pair; U and V are computed
// from the first pixel of the pair.
2385 for (i=0; i<chromWidth; i++)
2387 unsigned int b = src[6*i+0];
2388 unsigned int g = src[6*i+1];
2389 unsigned int r = src[6*i+2];
2391 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2392 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2393 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2403 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second row of the pair: luma only.
2409 for (i=0; i<chromWidth; i++)
2411 unsigned int b = src[6*i+0];
2412 unsigned int g = src[6*i+1];
2413 unsigned int r = src[6*i+2];
2415 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2423 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2426 udst += chromStride;
2427 vdst += chromStride;
/*
 * Interleave two byte planes row by row: dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w] for every w in 0..width-1, for height rows.
 *
 * src1, src2        source planes
 * dest              destination, two bytes per source column
 * width, height     size of the source planes
 * src1Stride, src2Stride, dstStride
 *                   per-row byte strides (their use is not visible here)
 *
 * Two asm variants (16-byte xmm registers and 8-byte MMX registers) and a
 * plain C loop follow; the preprocessor lines selecting them are not visible.
 * Unlike the other converters in this part of the file this function is not
 * declared static inline.
 */
2433 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2434 long width, long height, long src1Stride,
2435 long src2Stride, long dstStride){
2438 for (h=0; h < height; h++)
// xmm variant: 16 bytes of each source per pass, index compared to width-15.
// NOTE(review): movdqa and movntdq are used for the accesses, which presumably
// requires 16-byte aligned rows - confirm against the callers.
2445 "xor %%"REG_a", %%"REG_a" \n\t"
2447 PREFETCH" 64(%1, %%"REG_a") \n\t"
2448 PREFETCH" 64(%2, %%"REG_a") \n\t"
2449 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2450 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2451 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2452 "punpcklbw %%xmm2, %%xmm0 \n\t"
2453 "punpckhbw %%xmm2, %%xmm1 \n\t"
2454 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2455 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2456 "add $16, %%"REG_a" \n\t"
2457 "cmp %3, %%"REG_a" \n\t"
2459 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2460 : "memory", "%"REG_a""
// MMX variant: also 16 bytes of each source per pass, done as two 8-byte halves.
2464 "xor %%"REG_a", %%"REG_a" \n\t"
2466 PREFETCH" 64(%1, %%"REG_a") \n\t"
2467 PREFETCH" 64(%2, %%"REG_a") \n\t"
2468 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2469 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2470 "movq %%mm0, %%mm1 \n\t"
2471 "movq %%mm2, %%mm3 \n\t"
2472 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2473 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2474 "punpcklbw %%mm4, %%mm0 \n\t"
2475 "punpckhbw %%mm4, %%mm1 \n\t"
2476 "punpcklbw %%mm5, %%mm2 \n\t"
2477 "punpckhbw %%mm5, %%mm3 \n\t"
2478 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2479 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2480 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2481 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2482 "add $16, %%"REG_a" \n\t"
2483 "cmp %3, %%"REG_a" \n\t"
2485 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2486 : "memory", "%"REG_a
// Columns beyond the last full group of 16 are finished in C.
2489 for (w= (width&(~15)); w < width; w++)
2491 dest[2*w+0] = src1[w];
2492 dest[2*w+1] = src2[w];
// Plain C variant: all columns.
2495 for (w=0; w < width; w++)
2497 dest[2*w+0] = src1[w];
2498 dest[2*w+1] = src2[w];
/*
 * Upscale two planes (src1 -> dst1, src2 -> dst2) by a factor of two in both
 * directions using sample repetition: output row y reads source row y>>1 and
 * every source byte is written twice.
 *
 * src1, src2        source planes
 * dst1, dst2        destination planes
 * width, height     w = width/2 and h = height/2 are the working dimensions
 * srcStride1, srcStride2, dstStride1, dstStride2
 *                   per-row byte strides of the respective planes
 *
 * The row loops themselves are on lines not visible here.
 */
2514 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2515 uint8_t *dst1, uint8_t *dst2,
2516 long width, long height,
2517 long srcStride1, long srcStride2,
2518 long dstStride1, long dstStride2)
2521 w=width/2; h=height/2;
// Operands of a prefetch on the second row of each source plane.
2526 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// First plane.
2529 const uint8_t* s1=src1+srcStride1*(y>>1);
2530 uint8_t* d=dst1+dstStride1*y;
// 32 source bytes per pass: unpacking a register with itself doubles every
// byte, giving 64 output bytes.
2536 PREFETCH" 32%1 \n\t"
2537 "movq %1, %%mm0 \n\t"
2538 "movq 8%1, %%mm2 \n\t"
2539 "movq 16%1, %%mm4 \n\t"
2540 "movq 24%1, %%mm6 \n\t"
2541 "movq %%mm0, %%mm1 \n\t"
2542 "movq %%mm2, %%mm3 \n\t"
2543 "movq %%mm4, %%mm5 \n\t"
2544 "movq %%mm6, %%mm7 \n\t"
2545 "punpcklbw %%mm0, %%mm0 \n\t"
2546 "punpckhbw %%mm1, %%mm1 \n\t"
2547 "punpcklbw %%mm2, %%mm2 \n\t"
2548 "punpckhbw %%mm3, %%mm3 \n\t"
2549 "punpcklbw %%mm4, %%mm4 \n\t"
2550 "punpckhbw %%mm5, %%mm5 \n\t"
2551 "punpcklbw %%mm6, %%mm6 \n\t"
2552 "punpckhbw %%mm7, %%mm7 \n\t"
2553 MOVNTQ" %%mm0, %0 \n\t"
2554 MOVNTQ" %%mm1, 8%0 \n\t"
2555 MOVNTQ" %%mm2, 16%0 \n\t"
2556 MOVNTQ" %%mm3, 24%0 \n\t"
2557 MOVNTQ" %%mm4, 32%0 \n\t"
2558 MOVNTQ" %%mm5, 40%0 \n\t"
2559 MOVNTQ" %%mm6, 48%0 \n\t"
2560 MOVNTQ" %%mm7, 56%0"
// Remaining columns (or all of them without the asm part) in C.
2566 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// Second plane: identical processing.
2569 const uint8_t* s2=src2+srcStride2*(y>>1);
2570 uint8_t* d=dst2+dstStride2*y;
2576 PREFETCH" 32%1 \n\t"
2577 "movq %1, %%mm0 \n\t"
2578 "movq 8%1, %%mm2 \n\t"
2579 "movq 16%1, %%mm4 \n\t"
2580 "movq 24%1, %%mm6 \n\t"
2581 "movq %%mm0, %%mm1 \n\t"
2582 "movq %%mm2, %%mm3 \n\t"
2583 "movq %%mm4, %%mm5 \n\t"
2584 "movq %%mm6, %%mm7 \n\t"
2585 "punpcklbw %%mm0, %%mm0 \n\t"
2586 "punpckhbw %%mm1, %%mm1 \n\t"
2587 "punpcklbw %%mm2, %%mm2 \n\t"
2588 "punpckhbw %%mm3, %%mm3 \n\t"
2589 "punpcklbw %%mm4, %%mm4 \n\t"
2590 "punpckhbw %%mm5, %%mm5 \n\t"
2591 "punpcklbw %%mm6, %%mm6 \n\t"
2592 "punpckhbw %%mm7, %%mm7 \n\t"
2593 MOVNTQ" %%mm0, %0 \n\t"
2594 MOVNTQ" %%mm1, 8%0 \n\t"
2595 MOVNTQ" %%mm2, 16%0 \n\t"
2596 MOVNTQ" %%mm3, 24%0 \n\t"
2597 MOVNTQ" %%mm4, 32%0 \n\t"
2598 MOVNTQ" %%mm5, 40%0 \n\t"
2599 MOVNTQ" %%mm6, 48%0 \n\t"
2600 MOVNTQ" %%mm7, 56%0"
2606 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Pack a luma plane (src1) and two 4x-subsampled chroma planes (src2 = U,
 * src3 = V) into Y-U-Y-V output. Output row y reads luma row y and chroma
 * row y>>2; horizontally each U/V sample is shared by four luma samples.
 *
 * src1, src2, src3  source planes (Y, U, V)
 * width, height     w = width/2 and h = height are the working dimensions
 * srcStride1, srcStride2, srcStride3, dstStride
 *                   per-row byte strides of the respective buffers
 *
 * The destination pointer parameter and the row loop are on lines that are
 * not visible here.
 */
2617 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2619 long width, long height,
2620 long srcStride1, long srcStride2,
2621 long srcStride3, long dstStride)
2624 w=width/2; h=height;
2626 const uint8_t* yp=src1+srcStride1*y;
2627 const uint8_t* up=src2+srcStride2*(y>>2);
2628 const uint8_t* vp=src3+srcStride3*(y>>2);
2629 uint8_t* d=dst+dstStride*y;
// %0 is the chroma index: it scales by 1 for U/V, by 4 for luma and by 8 for
// the output. Each pass reads 8 U, 8 V and 32 Y bytes and stores 64 bytes.
2635 PREFETCH" 32(%1, %0) \n\t"
2636 PREFETCH" 32(%2, %0) \n\t"
2637 PREFETCH" 32(%3, %0) \n\t"
2638 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2639 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2640 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2641 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2642 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2643 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2644 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2645 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2646 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2647 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2649 "movq %%mm1, %%mm6 \n\t"
2650 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2651 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2652 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2653 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2654 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2656 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2657 "movq 8(%1, %0, 4), %%mm0 \n\t"
2658 "movq %%mm0, %%mm3 \n\t"
2659 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2660 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2661 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2662 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2664 "movq %%mm4, %%mm6 \n\t"
2665 "movq 16(%1, %0, 4), %%mm0 \n\t"
2666 "movq %%mm0, %%mm3 \n\t"
2667 "punpcklbw %%mm5, %%mm4 \n\t"
2668 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2669 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2670 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2671 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2673 "punpckhbw %%mm5, %%mm6 \n\t"
2674 "movq 24(%1, %0, 4), %%mm0 \n\t"
2675 "movq %%mm0, %%mm3 \n\t"
2676 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2677 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2678 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2679 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2682 : "r"(yp), "r" (up), "r"(vp), "r"(d)
// C part: chroma column x covers luma columns 4*x .. 4*x+3 and fills eight
// output bytes; the luma samples go to the even byte positions.
2688 const long x2 = x<<2;
2691 d[8*x+2] = yp[x2+1];
2693 d[8*x+4] = yp[x2+2];
2695 d[8*x+6] = yp[x2+3];
/*
 * Point the converter function pointers at the implementations generated by
 * this template (the names are wrapped by RENAME). The assignments for
 * uyvytoyv12 and yvu9toyv12 are commented out, so those two pointers are not
 * set here.
 */
2708 static inline void RENAME(rgb2rgb_init)(void){
2709 rgb15to16 = RENAME(rgb15to16);
2710 rgb15to24 = RENAME(rgb15to24);
2711 rgb15to32 = RENAME(rgb15to32);
2712 rgb16to24 = RENAME(rgb16to24);
2713 rgb16to32 = RENAME(rgb16to32);
2714 rgb16to15 = RENAME(rgb16to15);
2715 rgb24to16 = RENAME(rgb24to16);
2716 rgb24to15 = RENAME(rgb24to15);
2717 rgb24to32 = RENAME(rgb24to32);
2718 rgb32to16 = RENAME(rgb32to16);
2719 rgb32to15 = RENAME(rgb32to15);
2720 rgb32to24 = RENAME(rgb32to24);
2721 rgb24tobgr15 = RENAME(rgb24tobgr15);
2722 rgb24tobgr16 = RENAME(rgb24tobgr16);
2723 rgb24tobgr24 = RENAME(rgb24tobgr24);
2724 rgb32tobgr32 = RENAME(rgb32tobgr32);
2725 rgb32tobgr16 = RENAME(rgb32tobgr16);
2726 rgb32tobgr15 = RENAME(rgb32tobgr15);
2727 yv12toyuy2 = RENAME(yv12toyuy2);
2728 yv12touyvy = RENAME(yv12touyvy);
2729 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2730 yuy2toyv12 = RENAME(yuy2toyv12);
2731 // uyvytoyv12 = RENAME(uyvytoyv12);
2732 // yvu9toyv12 = RENAME(yvu9toyv12);
2733 planar2x = RENAME(planar2x);
2734 rgb24toyv12 = RENAME(rgb24toyv12);
2735 interleaveBytes = RENAME(interleaveBytes);
2736 vu9_to_vu12 = RENAME(vu9_to_vu12);
2737 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);