2 * software RGB to RGB converter
3 * software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
31 #include <inttypes.h> /* for __WORDSIZE */
34 // #warning You have a misconfigured system and will probably lose performance!
35 #define __WORDSIZE MP_WORDSIZE
53 #define PREFETCH "prefetch"
54 #define PREFETCHW "prefetchw"
55 #define PAVGB "pavgusb"
56 #elif defined (HAVE_MMX2)
57 #define PREFETCH "prefetchnta"
58 #define PREFETCHW "prefetcht0"
65 #define PREFETCH " # nop"
66 #define PREFETCHW " # nop"
71 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
78 #define MOVNTQ "movntq"
79 #define SFENCE "sfence"
82 #define SFENCE " # nop"
85 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
88 const uint8_t *s = src;
91 const uint8_t *mm_end;
95 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
97 asm volatile("movq %0, %%mm7"::"m"(mask32):"memory");
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
114 MOVNTQ" %%mm0, %0 \n\t"
115 MOVNTQ" %%mm1, 8%0 \n\t"
116 MOVNTQ" %%mm2, 16%0 \n\t"
124 asm volatile(SFENCE:::"memory");
125 asm volatile(EMMS:::"memory");
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
145 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
148 const uint8_t *s = src;
151 const uint8_t *mm_end;
155 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
210 asm volatile(SFENCE:::"memory");
211 asm volatile(EMMS:::"memory");
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
232 original by Strepto/Astral
233 ported to gcc & bugfixed: A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32-bit C version, and the and&add trick by Michael Niedermayer
237 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
245 asm volatile(PREFETCH" %0"::"m"(*s));
246 asm volatile("movq %0, %%mm4"::"m"(mask15s));
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ" %%mm0, %0 \n\t"
268 asm volatile(SFENCE:::"memory");
269 asm volatile(EMMS:::"memory");
274 register unsigned x= *((const uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
281 register unsigned short x= *((const uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
286 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
294 asm volatile(PREFETCH" %0"::"m"(*s));
295 asm volatile("movq %0, %%mm7"::"m"(mask15rg));
296 asm volatile("movq %0, %%mm6"::"m"(mask15b));
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ" %%mm0, %0 \n\t"
322 asm volatile(SFENCE:::"memory");
323 asm volatile(EMMS:::"memory");
328 register uint32_t x= *((const uint32_t*)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
335 register uint16_t x= *((const uint16_t*)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 const uint8_t *s = src;
347 const uint8_t *mm_end;
349 uint16_t *d = (uint16_t *)dst;
353 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
389 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask),"m"(green_16mask));
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ" %%mm0, %0 \n\t"
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
430 asm volatile(SFENCE:::"memory");
431 asm volatile(EMMS:::"memory");
435 register int rgb = *(const uint32_t*)s; s += 4;
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
440 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 const uint8_t *s = src;
445 const uint8_t *mm_end;
447 uint16_t *d = (uint16_t *)dst;
450 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask),"m"(green_16mask));
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ" %%mm0, %0 \n\t"
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
491 asm volatile(SFENCE:::"memory");
492 asm volatile(EMMS:::"memory");
496 register int rgb = *(const uint32_t*)s; s += 4;
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
501 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 const uint8_t *s = src;
506 const uint8_t *mm_end;
508 uint16_t *d = (uint16_t *)dst;
512 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
520 PREFETCH" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ" %%mm0, (%0) \n\t"
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
548 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask),"m"(green_15mask));
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ" %%mm0, %0 \n\t"
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
589 asm volatile(SFENCE:::"memory");
590 asm volatile(EMMS:::"memory");
594 register int rgb = *(const uint32_t*)s; s += 4;
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
599 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 const uint8_t *s = src;
604 const uint8_t *mm_end;
606 uint16_t *d = (uint16_t *)dst;
609 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ" %%mm0, %0 \n\t"
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
650 asm volatile(SFENCE:::"memory");
651 asm volatile(EMMS:::"memory");
655 register int rgb = *(const uint32_t*)s; s += 4;
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
660 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
662 const uint8_t *s = src;
665 const uint8_t *mm_end;
667 uint16_t *d = (uint16_t *)dst;
670 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask),"m"(green_16mask));
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ" %%mm0, %0 \n\t"
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
711 asm volatile(SFENCE:::"memory");
712 asm volatile(EMMS:::"memory");
719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
723 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
725 const uint8_t *s = src;
728 const uint8_t *mm_end;
730 uint16_t *d = (uint16_t *)dst;
733 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask),"m"(green_16mask));
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ" %%mm0, %0 \n\t"
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
774 asm volatile(SFENCE:::"memory");
775 asm volatile(EMMS:::"memory");
782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
786 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
788 const uint8_t *s = src;
791 const uint8_t *mm_end;
793 uint16_t *d = (uint16_t *)dst;
796 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask),"m"(green_15mask));
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ" %%mm0, %0 \n\t"
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
837 asm volatile(SFENCE:::"memory");
838 asm volatile(EMMS:::"memory");
845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
849 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
851 const uint8_t *s = src;
854 const uint8_t *mm_end;
856 uint16_t *d = (uint16_t *)dst;
859 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask),"m"(green_15mask));
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ" %%mm0, %0 \n\t"
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
900 asm volatile(SFENCE:::"memory");
901 asm volatile(EMMS:::"memory");
908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
913 I use a less accurate approximation here by simply left-shifting the input
914 value and filling the low order bits with zeroes. This method improves PNG
915 compression but this scheme cannot reproduce white exactly, since it does
916 not generate an all-ones maximum value; the net effect is to darken the image slightly.
919 The better method should be "left bit replication":
929 | leftmost bits repeated to fill open bits
933 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
937 const uint16_t *mm_end;
940 const uint16_t *s = (const uint16_t*)src;
941 end = s + src_size/2;
943 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
949 "movq %1, %%mm0 \n\t"
950 "movq %1, %%mm1 \n\t"
951 "movq %1, %%mm2 \n\t"
952 "pand %2, %%mm0 \n\t"
953 "pand %3, %%mm1 \n\t"
954 "pand %4, %%mm2 \n\t"
955 "psllq $3, %%mm0 \n\t"
956 "psrlq $2, %%mm1 \n\t"
957 "psrlq $7, %%mm2 \n\t"
958 "movq %%mm0, %%mm3 \n\t"
959 "movq %%mm1, %%mm4 \n\t"
960 "movq %%mm2, %%mm5 \n\t"
961 "punpcklwd %5, %%mm0 \n\t"
962 "punpcklwd %5, %%mm1 \n\t"
963 "punpcklwd %5, %%mm2 \n\t"
964 "punpckhwd %5, %%mm3 \n\t"
965 "punpckhwd %5, %%mm4 \n\t"
966 "punpckhwd %5, %%mm5 \n\t"
967 "psllq $8, %%mm1 \n\t"
968 "psllq $16, %%mm2 \n\t"
969 "por %%mm1, %%mm0 \n\t"
970 "por %%mm2, %%mm0 \n\t"
971 "psllq $8, %%mm4 \n\t"
972 "psllq $16, %%mm5 \n\t"
973 "por %%mm4, %%mm3 \n\t"
974 "por %%mm5, %%mm3 \n\t"
976 "movq %%mm0, %%mm6 \n\t"
977 "movq %%mm3, %%mm7 \n\t"
979 "movq 8%1, %%mm0 \n\t"
980 "movq 8%1, %%mm1 \n\t"
981 "movq 8%1, %%mm2 \n\t"
982 "pand %2, %%mm0 \n\t"
983 "pand %3, %%mm1 \n\t"
984 "pand %4, %%mm2 \n\t"
985 "psllq $3, %%mm0 \n\t"
986 "psrlq $2, %%mm1 \n\t"
987 "psrlq $7, %%mm2 \n\t"
988 "movq %%mm0, %%mm3 \n\t"
989 "movq %%mm1, %%mm4 \n\t"
990 "movq %%mm2, %%mm5 \n\t"
991 "punpcklwd %5, %%mm0 \n\t"
992 "punpcklwd %5, %%mm1 \n\t"
993 "punpcklwd %5, %%mm2 \n\t"
994 "punpckhwd %5, %%mm3 \n\t"
995 "punpckhwd %5, %%mm4 \n\t"
996 "punpckhwd %5, %%mm5 \n\t"
997 "psllq $8, %%mm1 \n\t"
998 "psllq $16, %%mm2 \n\t"
999 "por %%mm1, %%mm0 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psllq $8, %%mm4 \n\t"
1002 "psllq $16, %%mm5 \n\t"
1003 "por %%mm4, %%mm3 \n\t"
1004 "por %%mm5, %%mm3 \n\t"
1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1009 /* borrowed 32 to 24 */
1011 "movq %%mm0, %%mm4 \n\t"
1012 "movq %%mm3, %%mm5 \n\t"
1013 "movq %%mm6, %%mm0 \n\t"
1014 "movq %%mm7, %%mm1 \n\t"
1016 "movq %%mm4, %%mm6 \n\t"
1017 "movq %%mm5, %%mm7 \n\t"
1018 "movq %%mm0, %%mm2 \n\t"
1019 "movq %%mm1, %%mm3 \n\t"
1021 "psrlq $8, %%mm2 \n\t"
1022 "psrlq $8, %%mm3 \n\t"
1023 "psrlq $8, %%mm6 \n\t"
1024 "psrlq $8, %%mm7 \n\t"
1025 "pand %2, %%mm0 \n\t"
1026 "pand %2, %%mm1 \n\t"
1027 "pand %2, %%mm4 \n\t"
1028 "pand %2, %%mm5 \n\t"
1029 "pand %3, %%mm2 \n\t"
1030 "pand %3, %%mm3 \n\t"
1031 "pand %3, %%mm6 \n\t"
1032 "pand %3, %%mm7 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "por %%mm3, %%mm1 \n\t"
1035 "por %%mm6, %%mm4 \n\t"
1036 "por %%mm7, %%mm5 \n\t"
1038 "movq %%mm1, %%mm2 \n\t"
1039 "movq %%mm4, %%mm3 \n\t"
1040 "psllq $48, %%mm2 \n\t"
1041 "psllq $32, %%mm3 \n\t"
1042 "pand %4, %%mm2 \n\t"
1043 "pand %5, %%mm3 \n\t"
1044 "por %%mm2, %%mm0 \n\t"
1045 "psrlq $16, %%mm1 \n\t"
1046 "psrlq $32, %%mm4 \n\t"
1047 "psllq $16, %%mm5 \n\t"
1048 "por %%mm3, %%mm1 \n\t"
1049 "pand %6, %%mm5 \n\t"
1050 "por %%mm5, %%mm4 \n\t"
1052 MOVNTQ" %%mm0, %0 \n\t"
1053 MOVNTQ" %%mm1, 8%0 \n\t"
1054 MOVNTQ" %%mm4, 16%0"
1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1062 asm volatile(SFENCE:::"memory");
1063 asm volatile(EMMS:::"memory");
1067 register uint16_t bgr;
1069 *d++ = (bgr&0x1F)<<3;
1070 *d++ = (bgr&0x3E0)>>2;
1071 *d++ = (bgr&0x7C00)>>7;
1075 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1077 const uint16_t *end;
1079 const uint16_t *mm_end;
1081 uint8_t *d = (uint8_t *)dst;
1082 const uint16_t *s = (const uint16_t *)src;
1083 end = s + src_size/2;
1085 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1090 PREFETCH" 32%1 \n\t"
1091 "movq %1, %%mm0 \n\t"
1092 "movq %1, %%mm1 \n\t"
1093 "movq %1, %%mm2 \n\t"
1094 "pand %2, %%mm0 \n\t"
1095 "pand %3, %%mm1 \n\t"
1096 "pand %4, %%mm2 \n\t"
1097 "psllq $3, %%mm0 \n\t"
1098 "psrlq $3, %%mm1 \n\t"
1099 "psrlq $8, %%mm2 \n\t"
1100 "movq %%mm0, %%mm3 \n\t"
1101 "movq %%mm1, %%mm4 \n\t"
1102 "movq %%mm2, %%mm5 \n\t"
1103 "punpcklwd %5, %%mm0 \n\t"
1104 "punpcklwd %5, %%mm1 \n\t"
1105 "punpcklwd %5, %%mm2 \n\t"
1106 "punpckhwd %5, %%mm3 \n\t"
1107 "punpckhwd %5, %%mm4 \n\t"
1108 "punpckhwd %5, %%mm5 \n\t"
1109 "psllq $8, %%mm1 \n\t"
1110 "psllq $16, %%mm2 \n\t"
1111 "por %%mm1, %%mm0 \n\t"
1112 "por %%mm2, %%mm0 \n\t"
1113 "psllq $8, %%mm4 \n\t"
1114 "psllq $16, %%mm5 \n\t"
1115 "por %%mm4, %%mm3 \n\t"
1116 "por %%mm5, %%mm3 \n\t"
1118 "movq %%mm0, %%mm6 \n\t"
1119 "movq %%mm3, %%mm7 \n\t"
1121 "movq 8%1, %%mm0 \n\t"
1122 "movq 8%1, %%mm1 \n\t"
1123 "movq 8%1, %%mm2 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %3, %%mm1 \n\t"
1126 "pand %4, %%mm2 \n\t"
1127 "psllq $3, %%mm0 \n\t"
1128 "psrlq $3, %%mm1 \n\t"
1129 "psrlq $8, %%mm2 \n\t"
1130 "movq %%mm0, %%mm3 \n\t"
1131 "movq %%mm1, %%mm4 \n\t"
1132 "movq %%mm2, %%mm5 \n\t"
1133 "punpcklwd %5, %%mm0 \n\t"
1134 "punpcklwd %5, %%mm1 \n\t"
1135 "punpcklwd %5, %%mm2 \n\t"
1136 "punpckhwd %5, %%mm3 \n\t"
1137 "punpckhwd %5, %%mm4 \n\t"
1138 "punpckhwd %5, %%mm5 \n\t"
1139 "psllq $8, %%mm1 \n\t"
1140 "psllq $16, %%mm2 \n\t"
1141 "por %%mm1, %%mm0 \n\t"
1142 "por %%mm2, %%mm0 \n\t"
1143 "psllq $8, %%mm4 \n\t"
1144 "psllq $16, %%mm5 \n\t"
1145 "por %%mm4, %%mm3 \n\t"
1146 "por %%mm5, %%mm3 \n\t"
1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1150 /* borrowed 32 to 24 */
1152 "movq %%mm0, %%mm4 \n\t"
1153 "movq %%mm3, %%mm5 \n\t"
1154 "movq %%mm6, %%mm0 \n\t"
1155 "movq %%mm7, %%mm1 \n\t"
1157 "movq %%mm4, %%mm6 \n\t"
1158 "movq %%mm5, %%mm7 \n\t"
1159 "movq %%mm0, %%mm2 \n\t"
1160 "movq %%mm1, %%mm3 \n\t"
1162 "psrlq $8, %%mm2 \n\t"
1163 "psrlq $8, %%mm3 \n\t"
1164 "psrlq $8, %%mm6 \n\t"
1165 "psrlq $8, %%mm7 \n\t"
1166 "pand %2, %%mm0 \n\t"
1167 "pand %2, %%mm1 \n\t"
1168 "pand %2, %%mm4 \n\t"
1169 "pand %2, %%mm5 \n\t"
1170 "pand %3, %%mm2 \n\t"
1171 "pand %3, %%mm3 \n\t"
1172 "pand %3, %%mm6 \n\t"
1173 "pand %3, %%mm7 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "por %%mm3, %%mm1 \n\t"
1176 "por %%mm6, %%mm4 \n\t"
1177 "por %%mm7, %%mm5 \n\t"
1179 "movq %%mm1, %%mm2 \n\t"
1180 "movq %%mm4, %%mm3 \n\t"
1181 "psllq $48, %%mm2 \n\t"
1182 "psllq $32, %%mm3 \n\t"
1183 "pand %4, %%mm2 \n\t"
1184 "pand %5, %%mm3 \n\t"
1185 "por %%mm2, %%mm0 \n\t"
1186 "psrlq $16, %%mm1 \n\t"
1187 "psrlq $32, %%mm4 \n\t"
1188 "psllq $16, %%mm5 \n\t"
1189 "por %%mm3, %%mm1 \n\t"
1190 "pand %6, %%mm5 \n\t"
1191 "por %%mm5, %%mm4 \n\t"
1193 MOVNTQ" %%mm0, %0 \n\t"
1194 MOVNTQ" %%mm1, 8%0 \n\t"
1195 MOVNTQ" %%mm4, 16%0"
1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1203 asm volatile(SFENCE:::"memory");
1204 asm volatile(EMMS:::"memory");
1208 register uint16_t bgr;
1210 *d++ = (bgr&0x1F)<<3;
1211 *d++ = (bgr&0x7E0)>>3;
1212 *d++ = (bgr&0xF800)>>8;
1216 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1218 const uint16_t *end;
1220 const uint16_t *mm_end;
1223 const uint16_t *s = (const uint16_t *)src;
1224 end = s + src_size/2;
1226 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1227 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1232 PREFETCH" 32%1 \n\t"
1233 "movq %1, %%mm0 \n\t"
1234 "movq %1, %%mm1 \n\t"
1235 "movq %1, %%mm2 \n\t"
1236 "pand %2, %%mm0 \n\t"
1237 "pand %3, %%mm1 \n\t"
1238 "pand %4, %%mm2 \n\t"
1239 "psllq $3, %%mm0 \n\t"
1240 "psrlq $2, %%mm1 \n\t"
1241 "psrlq $7, %%mm2 \n\t"
1242 "movq %%mm0, %%mm3 \n\t"
1243 "movq %%mm1, %%mm4 \n\t"
1244 "movq %%mm2, %%mm5 \n\t"
1245 "punpcklwd %%mm7, %%mm0 \n\t"
1246 "punpcklwd %%mm7, %%mm1 \n\t"
1247 "punpcklwd %%mm7, %%mm2 \n\t"
1248 "punpckhwd %%mm7, %%mm3 \n\t"
1249 "punpckhwd %%mm7, %%mm4 \n\t"
1250 "punpckhwd %%mm7, %%mm5 \n\t"
1251 "psllq $8, %%mm1 \n\t"
1252 "psllq $16, %%mm2 \n\t"
1253 "por %%mm1, %%mm0 \n\t"
1254 "por %%mm2, %%mm0 \n\t"
1255 "psllq $8, %%mm4 \n\t"
1256 "psllq $16, %%mm5 \n\t"
1257 "por %%mm4, %%mm3 \n\t"
1258 "por %%mm5, %%mm3 \n\t"
1259 MOVNTQ" %%mm0, %0 \n\t"
1260 MOVNTQ" %%mm3, 8%0 \n\t"
1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1267 asm volatile(SFENCE:::"memory");
1268 asm volatile(EMMS:::"memory");
1272 #if 0 //slightly slower on Athlon
1274 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1276 register uint16_t bgr;
1278 #ifdef WORDS_BIGENDIAN
1280 *d++ = (bgr&0x7C00)>>7;
1281 *d++ = (bgr&0x3E0)>>2;
1282 *d++ = (bgr&0x1F)<<3;
1284 *d++ = (bgr&0x1F)<<3;
1285 *d++ = (bgr&0x3E0)>>2;
1286 *d++ = (bgr&0x7C00)>>7;
1294 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1296 const uint16_t *end;
1298 const uint16_t *mm_end;
1301 const uint16_t *s = (const uint16_t*)src;
1302 end = s + src_size/2;
1304 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1305 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1310 PREFETCH" 32%1 \n\t"
1311 "movq %1, %%mm0 \n\t"
1312 "movq %1, %%mm1 \n\t"
1313 "movq %1, %%mm2 \n\t"
1314 "pand %2, %%mm0 \n\t"
1315 "pand %3, %%mm1 \n\t"
1316 "pand %4, %%mm2 \n\t"
1317 "psllq $3, %%mm0 \n\t"
1318 "psrlq $3, %%mm1 \n\t"
1319 "psrlq $8, %%mm2 \n\t"
1320 "movq %%mm0, %%mm3 \n\t"
1321 "movq %%mm1, %%mm4 \n\t"
1322 "movq %%mm2, %%mm5 \n\t"
1323 "punpcklwd %%mm7, %%mm0 \n\t"
1324 "punpcklwd %%mm7, %%mm1 \n\t"
1325 "punpcklwd %%mm7, %%mm2 \n\t"
1326 "punpckhwd %%mm7, %%mm3 \n\t"
1327 "punpckhwd %%mm7, %%mm4 \n\t"
1328 "punpckhwd %%mm7, %%mm5 \n\t"
1329 "psllq $8, %%mm1 \n\t"
1330 "psllq $16, %%mm2 \n\t"
1331 "por %%mm1, %%mm0 \n\t"
1332 "por %%mm2, %%mm0 \n\t"
1333 "psllq $8, %%mm4 \n\t"
1334 "psllq $16, %%mm5 \n\t"
1335 "por %%mm4, %%mm3 \n\t"
1336 "por %%mm5, %%mm3 \n\t"
1337 MOVNTQ" %%mm0, %0 \n\t"
1338 MOVNTQ" %%mm3, 8%0 \n\t"
1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1345 asm volatile(SFENCE:::"memory");
1346 asm volatile(EMMS:::"memory");
1350 register uint16_t bgr;
1352 #ifdef WORDS_BIGENDIAN
1354 *d++ = (bgr&0xF800)>>8;
1355 *d++ = (bgr&0x7E0)>>3;
1356 *d++ = (bgr&0x1F)<<3;
1358 *d++ = (bgr&0x1F)<<3;
1359 *d++ = (bgr&0x7E0)>>3;
1360 *d++ = (bgr&0xF800)>>8;
1366 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1368 long idx = 15 - src_size;
1369 const uint8_t *s = src-idx;
1370 uint8_t *d = dst-idx;
1375 PREFETCH" (%1, %0) \n\t"
1376 "movq %3, %%mm7 \n\t"
1377 "pxor %4, %%mm7 \n\t"
1378 "movq %%mm7, %%mm6 \n\t"
1379 "pxor %5, %%mm7 \n\t"
1382 PREFETCH" 32(%1, %0) \n\t"
1383 "movq (%1, %0), %%mm0 \n\t"
1384 "movq 8(%1, %0), %%mm1 \n\t"
1386 "pshufw $177, %%mm0, %%mm3 \n\t"
1387 "pshufw $177, %%mm1, %%mm5 \n\t"
1388 "pand %%mm7, %%mm0 \n\t"
1389 "pand %%mm6, %%mm3 \n\t"
1390 "pand %%mm7, %%mm1 \n\t"
1391 "pand %%mm6, %%mm5 \n\t"
1392 "por %%mm3, %%mm0 \n\t"
1393 "por %%mm5, %%mm1 \n\t"
1395 "movq %%mm0, %%mm2 \n\t"
1396 "movq %%mm1, %%mm4 \n\t"
1397 "pand %%mm7, %%mm0 \n\t"
1398 "pand %%mm6, %%mm2 \n\t"
1399 "pand %%mm7, %%mm1 \n\t"
1400 "pand %%mm6, %%mm4 \n\t"
1401 "movq %%mm2, %%mm3 \n\t"
1402 "movq %%mm4, %%mm5 \n\t"
1403 "pslld $16, %%mm2 \n\t"
1404 "psrld $16, %%mm3 \n\t"
1405 "pslld $16, %%mm4 \n\t"
1406 "psrld $16, %%mm5 \n\t"
1407 "por %%mm2, %%mm0 \n\t"
1408 "por %%mm4, %%mm1 \n\t"
1409 "por %%mm3, %%mm0 \n\t"
1410 "por %%mm5, %%mm1 \n\t"
1412 MOVNTQ" %%mm0, (%2, %0) \n\t"
1413 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1420 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1423 for (; idx<15; idx+=4) {
1424 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1426 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1430 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1434 long mmx_size= 23 - src_size;
1436 "test %%"REG_a", %%"REG_a" \n\t"
1438 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1439 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1440 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1443 PREFETCH" 32(%1, %%"REG_a") \n\t"
1444 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1445 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1446 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1447 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1448 "pand %%mm5, %%mm0 \n\t"
1449 "pand %%mm6, %%mm1 \n\t"
1450 "pand %%mm7, %%mm2 \n\t"
1451 "por %%mm0, %%mm1 \n\t"
1452 "por %%mm2, %%mm1 \n\t"
1453 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1454 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1455 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1456 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1457 "pand %%mm7, %%mm0 \n\t"
1458 "pand %%mm5, %%mm1 \n\t"
1459 "pand %%mm6, %%mm2 \n\t"
1460 "por %%mm0, %%mm1 \n\t"
1461 "por %%mm2, %%mm1 \n\t"
1462 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1463 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1464 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1465 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1466 "pand %%mm6, %%mm0 \n\t"
1467 "pand %%mm7, %%mm1 \n\t"
1468 "pand %%mm5, %%mm2 \n\t"
1469 "por %%mm0, %%mm1 \n\t"
1470 "por %%mm2, %%mm1 \n\t"
1471 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1472 "add $24, %%"REG_a" \n\t"
1476 : "r" (src-mmx_size), "r"(dst-mmx_size)
1479 asm volatile(SFENCE:::"memory");
1480 asm volatile(EMMS:::"memory");
1482 if (mmx_size==23) return; //finished, was multiple of 8
1486 src_size= 23-mmx_size;
1490 for (i=0; i<src_size; i+=3)
1494 dst[i + 1] = src[i + 1];
1495 dst[i + 2] = src[i + 0];
/*
 * Packs planar Y, U and V rows into interleaved Y U Y V bytes (the "YUYV"
 * lane comments below), one destination line per pass of the y loop.
 * chromWidth = width/2 is the number of U (and of V) samples consumed per line.
 */
1500 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1501 long width, long height,
1502 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1505 const long chromWidth= width>>1;
1506 for (y=0; y<height; y++)
1509 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
// MMX path: each pass reads 8 U, 8 V and 16 Y bytes and emits 32 packed bytes;
// REG_a counts chroma samples (Y is addressed at 2x, dst at 4x that index).
1511 "xor %%"REG_a", %%"REG_a" \n\t"
1514 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1515 PREFETCH" 32(%2, %%"REG_a") \n\t"
1516 PREFETCH" 32(%3, %%"REG_a") \n\t"
1517 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1518 "movq %%mm0, %%mm2 \n\t" // U(0)
1519 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1520 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1521 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1523 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1524 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1525 "movq %%mm3, %%mm4 \n\t" // Y(0)
1526 "movq %%mm5, %%mm6 \n\t" // Y(8)
1527 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1528 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1529 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1530 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1532 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1533 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1534 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1535 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1537 "add $8, %%"REG_a" \n\t"
1538 "cmp %4, %%"REG_a" \n\t"
1540 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
// Alpha/MVI path: the pointers set up below address two luma rows (ysrc and
// ysrc + lumStride) and two output rows (dst and dst + dstStride) at once.
1545 #if defined ARCH_ALPHA && defined HAVE_MVI
1546 #define pl2yuy2(n) \
1551 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1552 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1553 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1554 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1555 yuv1 = (u << 8) + (v << 24); \
1562 uint64_t *qdst = (uint64_t *) dst;
1563 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1564 const uint32_t *yc = (uint32_t *) ysrc;
1565 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1566 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1567 for (i = 0; i < chromWidth; i += 8){
1568 uint64_t y1, y2, yuv1, yuv2;
// NOTE(review): loads into $31, 64 bytes ahead of each stream; presumably prefetch hints, confirm against the Alpha ISA.
1571 asm("ldq $31,64(%0)" :: "r"(yc));
1572 asm("ldq $31,64(%0)" :: "r"(yc2));
1573 asm("ldq $31,64(%0)" :: "r"(uc));
1574 asm("ldq $31,64(%0)" :: "r"(vc));
// Generic 64-bit path: two chroma pairs (four luma samples) go into each 64-bit store.
1592 #elif __WORDSIZE >= 64
1594 uint64_t *ldst = (uint64_t *) dst;
1595 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1596 for (i = 0; i < chromWidth; i += 2){
1598 k = yc[0] + (uc[0] << 8) +
1599 (yc[1] << 16) + (vc[0] << 24);
1600 l = yc[2] + (uc[1] << 8) +
1601 (yc[3] << 16) + (vc[1] << 24);
1602 *ldst++ = k + (l << 32);
// 32-bit path: one chroma pair (two luma samples) is packed into each 32-bit store.
1609 int i, *idst = (int32_t *) dst;
1610 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1611 for (i = 0; i < chromWidth; i++){
1612 #ifdef WORDS_BIGENDIAN
// NOTE(review): the "<< 24" terms shift a promoted int; a byte >= 0x80 overflows a signed int there.
1613 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1614 (yc[1] << 8) + (vc[0] << 0);
1616 *idst++ = yc[0] + (uc[0] << 8) +
1617 (yc[1] << 16) + (vc[0] << 24);
// Step to the next chroma row only after the last luma row of each group of
// vertLumPerChroma rows; the mask test presumes vertLumPerChroma is a power of two.
1625 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1627 usrc += chromStride;
1628 vsrc += chromStride;
1641 * Height should be a multiple of 2 and width should be a multiple of 16.
1642 * (If this is a problem for anyone then tell me, and I will fix it.)
1644 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1645 long width, long height,
1646 long lumStride, long chromStride, long dstStride)
1648 //FIXME interpolate chroma
// Delegates to yuvPlanartoyuy2 with vertLumPerChroma = 2: each chroma row is reused for two luma rows.
1649 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Packs planar Y, U and V rows into interleaved U Y V Y bytes, one destination
 * line per pass of the y loop. Same layout of work as the YUY2 variant, but the
 * chroma bytes are the low operand of each interleave.
 */
1652 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1653 long width, long height,
1654 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1657 const long chromWidth= width>>1;
1658 for (y=0; y<height; y++)
1661 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
// MMX path: each pass reads 8 U, 8 V and 16 Y bytes and emits 32 packed bytes;
// REG_a counts chroma samples (Y is addressed at 2x, dst at 4x that index).
1663 "xor %%"REG_a", %%"REG_a" \n\t"
1666 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1667 PREFETCH" 32(%2, %%"REG_a") \n\t"
1668 PREFETCH" 32(%3, %%"REG_a") \n\t"
1669 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1670 "movq %%mm0, %%mm2 \n\t" // U(0)
1671 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1672 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1673 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1675 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1676 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1677 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1678 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1679 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1680 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1681 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1682 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1684 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1685 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1686 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1687 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1689 "add $8, %%"REG_a" \n\t"
1690 "cmp %4, %%"REG_a" \n\t"
1692 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1696 //FIXME adapt the Alpha ASM code from yv12->yuy2
// Generic 64-bit path: two chroma pairs (four luma samples) go into each 64-bit store.
1698 #if __WORDSIZE >= 64
1700 uint64_t *ldst = (uint64_t *) dst;
1701 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1702 for (i = 0; i < chromWidth; i += 2){
1704 k = uc[0] + (yc[0] << 8) +
1705 (vc[0] << 16) + (yc[1] << 24);
1706 l = uc[1] + (yc[2] << 8) +
1707 (vc[1] << 16) + (yc[3] << 24);
1708 *ldst++ = k + (l << 32);
// 32-bit path: one chroma pair (two luma samples) is packed into each 32-bit store.
1715 int i, *idst = (int32_t *) dst;
1716 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1717 for (i = 0; i < chromWidth; i++){
1718 #ifdef WORDS_BIGENDIAN
// NOTE(review): the "<< 24" terms shift a promoted int; a byte >= 0x80 overflows a signed int there.
1719 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1720 (vc[0] << 8) + (yc[1] << 0);
1722 *idst++ = uc[0] + (yc[0] << 8) +
1723 (vc[0] << 16) + (yc[1] << 24);
// Step to the next chroma row only after the last luma row of each group of
// vertLumPerChroma rows; the mask test presumes vertLumPerChroma is a power of two.
1731 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1733 usrc += chromStride;
1734 vsrc += chromStride;
1747 * Height should be a multiple of 2 and width should be a multiple of 16
1748 * (If this is a problem for anyone then tell me, and I will fix it.)
1750 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751 long width, long height,
1752 long lumStride, long chromStride, long dstStride)
1754 //FIXME interpolate chroma
// Delegates to yuvPlanartouyvy with vertLumPerChroma = 2: each chroma row is reused for two luma rows.
1755 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1759 * Width should be a multiple of 16.
1761 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1762 long width, long height,
1763 long lumStride, long chromStride, long dstStride)
// Delegates to yuvPlanartouyvy with vertLumPerChroma = 1: the chroma pointers advance on every line.
1765 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1769 * Width should be a multiple of 16.
1771 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1772 long width, long height,
1773 long lumStride, long chromStride, long dstStride)
// Delegates to yuvPlanartoyuy2 with vertLumPerChroma = 1: the chroma pointers advance on every line.
1775 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1779 * Height should be a multiple of 2 and width should be a multiple of 16.
1780 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Splits packed YUYV into planar Y, U and V. The y loop steps two source
 * lines at a time: the first code path of each pair extracts Y, U and V, the
 * second extracts Y only, and udst/vdst advance by chromStride once per pair.
 */
1782 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1783 long width, long height,
1784 long lumStride, long chromStride, long srcStride)
1787 const long chromWidth= width>>1;
1788 for (y=0; y<height; y+=2)
// First line of the pair (MMX): 32 source bytes per pass give 16 Y, 8 U and 8 V bytes.
1792 "xor %%"REG_a", %%"REG_a" \n\t"
1793 "pcmpeqw %%mm7, %%mm7 \n\t"
1794 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1797 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1798 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1799 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1800 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1801 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1802 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1803 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1804 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1805 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1806 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1807 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1809 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1811 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1812 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1813 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1814 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1815 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1816 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1817 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1818 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1819 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1820 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1822 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// Second split: separate the interleaved UV bytes into a V qword and a U qword.
1824 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1825 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1826 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1827 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1828 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1829 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1830 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1831 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1833 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1834 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1836 "add $8, %%"REG_a" \n\t"
1837 "cmp %4, %%"REG_a" \n\t"
1839 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1840 : "memory", "%"REG_a
// Second line of the pair (MMX): luma only. No reload of mm7 is visible in this
// block, so it appears to rely on the 00FF word mask built in the block above.
1847 "xor %%"REG_a", %%"REG_a" \n\t"
1850 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1851 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1852 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1853 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1854 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1855 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1856 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1857 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1858 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1859 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1860 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1862 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1863 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1865 "add $8, %%"REG_a" \n\t"
1866 "cmp %4, %%"REG_a" \n\t"
1869 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1870 : "memory", "%"REG_a
// Scalar equivalent, first line of the pair: bytes 0/2 of each 4-byte group are Y, byte 1 is U, byte 3 is V.
1874 for (i=0; i<chromWidth; i++)
1876 ydst[2*i+0] = src[4*i+0];
1877 udst[i] = src[4*i+1];
1878 ydst[2*i+1] = src[4*i+2];
1879 vdst[i] = src[4*i+3];
// Scalar equivalent, second line of the pair: chroma bytes are skipped.
1884 for (i=0; i<chromWidth; i++)
1886 ydst[2*i+0] = src[4*i+0];
1887 ydst[2*i+1] = src[4*i+2];
1890 udst += chromStride;
1891 vdst += chromStride;
1896 asm volatile( EMMS" \n\t"
// Only the luma copy below is visible; per the XXX note the U/V planes are not upscaled yet.
1902 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1903 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1904 long width, long height, long lumStride, long chromStride)
// NOTE(review): copies width*height contiguous bytes and does not use lumStride here,
// which is only right when the luma rows are tightly packed; confirm with callers.
1907 memcpy(ydst, ysrc, width*height);
1909 /* XXX: implement upscaling for U,V */
/*
 * Upscales one plane by two in both directions. Interior output samples are
 * 3:1 weighted blends of neighbouring source samples, (3*a + b) >> 2; the
 * outermost samples are copied or blended from the nearest available source.
 */
1912 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
// Horizontal-only interpolation of a single source row.
1919 for (x=0; x<srcWidth-1; x++){
1920 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1921 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1923 dst[2*srcWidth-1]= src[srcWidth-1];
// Each pass blends source row pairs (src, src + srcStride) into two output rows (dst, dst + dstStride).
1927 for (y=1; y<srcHeight; y++){
1928 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// SIMD covers the largest multiple of 16 columns; the pointers passed below are
// pre-advanced by mmxSize, so REG_a is presumably a negative index counting up (operand %4 is not visible).
1929 const long mmxSize= srcWidth&~15;
1931 "mov %4, %%"REG_a" \n\t"
1933 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1934 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1935 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1936 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1937 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1938 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// Averaging twice with the same operand approximates (3*a + b)/4; PAVGB rounds,
// so results may differ slightly from the C expression (presumably accepted).
1939 PAVGB" %%mm0, %%mm5 \n\t"
1940 PAVGB" %%mm0, %%mm3 \n\t"
1941 PAVGB" %%mm0, %%mm5 \n\t"
1942 PAVGB" %%mm0, %%mm3 \n\t"
1943 PAVGB" %%mm1, %%mm4 \n\t"
1944 PAVGB" %%mm1, %%mm2 \n\t"
1945 PAVGB" %%mm1, %%mm4 \n\t"
1946 PAVGB" %%mm1, %%mm2 \n\t"
// Interleave the left-neighbour and right-neighbour blends into consecutive output bytes.
1947 "movq %%mm5, %%mm7 \n\t"
1948 "movq %%mm4, %%mm6 \n\t"
1949 "punpcklbw %%mm3, %%mm5 \n\t"
1950 "punpckhbw %%mm3, %%mm7 \n\t"
1951 "punpcklbw %%mm2, %%mm4 \n\t"
1952 "punpckhbw %%mm2, %%mm6 \n\t"
1954 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1955 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1956 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1957 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1959 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1960 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1961 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1962 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1964 "add $8, %%"REG_a" \n\t"
1966 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1967 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// Alternative definition of mmxSize: with 1, the scalar loop below starts at column 0.
1973 const long mmxSize=1;
// Leftmost output column: vertical blend only.
1975 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1976 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
// Scalar columns: each output sample blends a source sample with its diagonal neighbour in the other row.
1978 for (x=mmxSize-1; x<srcWidth-1; x++){
1979 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1980 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1981 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1982 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
// Rightmost output column: vertical blend only.
1984 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1985 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// Horizontal-only interpolation again, same formula as at the top of the function.
1995 for (x=0; x<srcWidth-1; x++){
1996 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1997 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1999 dst[2*srcWidth-1]= src[srcWidth-1];
2001 for (x=0; x<srcWidth; x++){
2008 asm volatile( EMMS" \n\t"
2015 * Height should be a multiple of 2 and width should be a multiple of 16.
2016 * (If this is a problem for anyone then tell me, and I will fix it.)
2017 * Chrominance data is only taken from every second line, others are ignored.
2018 * FIXME: Write HQ version.
/*
 * Splits packed UYVY into planar Y, U and V. The y loop steps two source
 * lines at a time: the first code path of each pair extracts Y, U and V, the
 * second extracts Y only, and udst/vdst advance by chromStride once per pair.
 *
 * NOTE(review): both asm blocks hard-code the 32-bit register %%eax with
 * xorl/addl/cmpl, while yuy2toyv12 uses REG_a with unsuffixed mnemonics and
 * chromWidth is a long; the blocks should probably be aligned with that form.
 */
2020 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2021 long width, long height,
2022 long lumStride, long chromStride, long srcStride)
2025 const long chromWidth= width>>1;
2026 for (y=0; y<height; y+=2)
// First line of the pair (MMX): 32 source bytes per pass give 16 Y, 8 U and 8 V bytes.
2030 "xorl %%eax, %%eax \n\t"
2031 "pcmpeqw %%mm7, %%mm7 \n\t"
2032 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2035 PREFETCH" 64(%0, %%eax, 4) \n\t"
2036 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2037 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2038 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2039 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2040 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2041 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2042 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2043 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2044 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2045 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2047 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2049 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2050 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2051 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2052 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2053 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2054 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2055 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2056 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2057 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2058 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2060 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Second split: separate the interleaved UV bytes into a V qword and a U qword.
2062 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2063 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2064 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2065 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2066 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2067 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2068 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2069 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2071 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2072 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2074 "addl $8, %%eax \n\t"
2075 "cmpl %4, %%eax \n\t"
2077 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// Second line of the pair (MMX): luma only, taken from the high byte of each 16-bit word.
2085 "xorl %%eax, %%eax \n\t"
2088 PREFETCH" 64(%0, %%eax, 4) \n\t"
2089 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2090 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2091 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2092 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2093 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2094 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2095 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2096 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2097 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2098 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2100 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2101 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2103 "addl $8, %%eax \n\t"
2104 "cmpl %4, %%eax \n\t"
2107 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// Scalar equivalent, first line of the pair: byte 0 is U, byte 2 is V, bytes 1/3 are Y.
2112 for (i=0; i<chromWidth; i++)
2114 udst[i] = src[4*i+0];
2115 ydst[2*i+0] = src[4*i+1];
2116 vdst[i] = src[4*i+2];
2117 ydst[2*i+1] = src[4*i+3];
// Scalar equivalent, second line of the pair: chroma bytes are skipped.
2122 for (i=0; i<chromWidth; i++)
2124 ydst[2*i+0] = src[4*i+1];
2125 ydst[2*i+1] = src[4*i+3];
2128 udst += chromStride;
2129 vdst += chromStride;
2134 asm volatile( EMMS" \n\t"
2141 * Height should be a multiple of 2 and width should be a multiple of 2.
2142 * (If this is a problem for anyone then tell me, and I will fix it.)
2143 * Chrominance data is only taken from every second line,
2144 * others are ignored in the C version.
2145 * FIXME: Write HQ version.
/*
 * Converts packed 24-bit pixels (byte order B, G, R per the scalar loops at
 * the end) to planar Y, U and V, with chromWidth = width/2 chroma samples per
 * line pair. The asm loop stops at height-2; the scalar loop that follows
 * continues from the current y up to height.
 */
2147 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2148 long width, long height,
2149 long lumStride, long chromStride, long srcStride)
2152 const long chromWidth= width>>1;
2154 for (y=0; y<height-2; y+=2)
// Luma (MMX): REG_a starts at -width and counts up towards zero against
// end-of-line pointers; REG_d = 3*REG_a is the matching source byte offset.
// Each pass converts 8 pixels (24 source bytes) into 8 Y bytes.
2160 "mov %2, %%"REG_a" \n\t"
2161 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2162 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2163 "pxor %%mm7, %%mm7 \n\t"
2164 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2167 PREFETCH" 64(%0, %%"REG_d") \n\t"
2168 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2169 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2170 "punpcklbw %%mm7, %%mm0 \n\t"
2171 "punpcklbw %%mm7, %%mm1 \n\t"
2172 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2173 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2174 "punpcklbw %%mm7, %%mm2 \n\t"
2175 "punpcklbw %%mm7, %%mm3 \n\t"
2176 "pmaddwd %%mm6, %%mm0 \n\t"
2177 "pmaddwd %%mm6, %%mm1 \n\t"
2178 "pmaddwd %%mm6, %%mm2 \n\t"
2179 "pmaddwd %%mm6, %%mm3 \n\t"
2180 #ifndef FAST_BGR2YV12
2181 "psrad $8, %%mm0 \n\t"
2182 "psrad $8, %%mm1 \n\t"
2183 "psrad $8, %%mm2 \n\t"
2184 "psrad $8, %%mm3 \n\t"
2186 "packssdw %%mm1, %%mm0 \n\t"
2187 "packssdw %%mm3, %%mm2 \n\t"
2188 "pmaddwd %%mm5, %%mm0 \n\t"
2189 "pmaddwd %%mm5, %%mm2 \n\t"
2190 "packssdw %%mm2, %%mm0 \n\t"
2191 "psraw $7, %%mm0 \n\t"
2193 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2194 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2195 "punpcklbw %%mm7, %%mm4 \n\t"
2196 "punpcklbw %%mm7, %%mm1 \n\t"
2197 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2198 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2199 "punpcklbw %%mm7, %%mm2 \n\t"
2200 "punpcklbw %%mm7, %%mm3 \n\t"
2201 "pmaddwd %%mm6, %%mm4 \n\t"
2202 "pmaddwd %%mm6, %%mm1 \n\t"
2203 "pmaddwd %%mm6, %%mm2 \n\t"
2204 "pmaddwd %%mm6, %%mm3 \n\t"
2205 #ifndef FAST_BGR2YV12
2206 "psrad $8, %%mm4 \n\t"
2207 "psrad $8, %%mm1 \n\t"
2208 "psrad $8, %%mm2 \n\t"
2209 "psrad $8, %%mm3 \n\t"
2211 "packssdw %%mm1, %%mm4 \n\t"
2212 "packssdw %%mm3, %%mm2 \n\t"
2213 "pmaddwd %%mm5, %%mm4 \n\t"
2214 "pmaddwd %%mm5, %%mm2 \n\t"
2215 "add $24, %%"REG_d" \n\t"
2216 "packssdw %%mm2, %%mm4 \n\t"
2217 "psraw $7, %%mm4 \n\t"
2219 "packuswb %%mm4, %%mm0 \n\t"
2220 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2222 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2223 "add $8, %%"REG_a" \n\t"
2225 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2226 : "%"REG_a, "%"REG_d
// Chroma (MMX): REG_a starts at -chromWidth; REG_d = 6*REG_a is the source byte
// offset (two 3-byte pixels per chroma sample). %0 and %1 are two adjacent source rows.
2233 "mov %4, %%"REG_a" \n\t"
2234 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2235 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2236 "pxor %%mm7, %%mm7 \n\t"
2237 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2238 "add %%"REG_d", %%"REG_d" \n\t"
2241 PREFETCH" 64(%0, %%"REG_d") \n\t"
2242 PREFETCH" 64(%1, %%"REG_d") \n\t"
// PAVGB variant: average the two rows, then average with a copy shifted by one pixel (24 bits).
2243 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2244 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2245 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2246 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2247 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2248 PAVGB" %%mm1, %%mm0 \n\t"
2249 PAVGB" %%mm3, %%mm2 \n\t"
2250 "movq %%mm0, %%mm1 \n\t"
2251 "movq %%mm2, %%mm3 \n\t"
2252 "psrlq $24, %%mm0 \n\t"
2253 "psrlq $24, %%mm2 \n\t"
2254 PAVGB" %%mm1, %%mm0 \n\t"
2255 PAVGB" %%mm3, %%mm2 \n\t"
2256 "punpcklbw %%mm7, %%mm0 \n\t"
2257 "punpcklbw %%mm7, %%mm2 \n\t"
// Widening variant: expand to 16-bit lanes, sum the 2x2 pixel block with paddw, then divide by 4 (psrlw $2).
2259 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2260 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2261 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2262 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2263 "punpcklbw %%mm7, %%mm0 \n\t"
2264 "punpcklbw %%mm7, %%mm1 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "punpcklbw %%mm7, %%mm3 \n\t"
2267 "paddw %%mm1, %%mm0 \n\t"
2268 "paddw %%mm3, %%mm2 \n\t"
2269 "paddw %%mm2, %%mm0 \n\t"
2270 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2271 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2272 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2273 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2274 "punpcklbw %%mm7, %%mm4 \n\t"
2275 "punpcklbw %%mm7, %%mm1 \n\t"
2276 "punpcklbw %%mm7, %%mm2 \n\t"
2277 "punpcklbw %%mm7, %%mm3 \n\t"
2278 "paddw %%mm1, %%mm4 \n\t"
2279 "paddw %%mm3, %%mm2 \n\t"
2280 "paddw %%mm4, %%mm2 \n\t"
2281 "psrlw $2, %%mm0 \n\t"
2282 "psrlw $2, %%mm2 \n\t"
// mm1/mm3 take the V coefficients; the U coefficients stay in mm6.
2284 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2285 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2287 "pmaddwd %%mm0, %%mm1 \n\t"
2288 "pmaddwd %%mm2, %%mm3 \n\t"
2289 "pmaddwd %%mm6, %%mm0 \n\t"
2290 "pmaddwd %%mm6, %%mm2 \n\t"
2291 #ifndef FAST_BGR2YV12
2292 "psrad $8, %%mm0 \n\t"
2293 "psrad $8, %%mm1 \n\t"
2294 "psrad $8, %%mm2 \n\t"
2295 "psrad $8, %%mm3 \n\t"
2297 "packssdw %%mm2, %%mm0 \n\t"
2298 "packssdw %%mm3, %%mm1 \n\t"
2299 "pmaddwd %%mm5, %%mm0 \n\t"
2300 "pmaddwd %%mm5, %%mm1 \n\t"
2301 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2302 "psraw $7, %%mm0 \n\t"
// Same two variants again for the next two chroma samples (source offsets 12..23).
2304 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2305 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2306 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2307 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2308 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2309 PAVGB" %%mm1, %%mm4 \n\t"
2310 PAVGB" %%mm3, %%mm2 \n\t"
2311 "movq %%mm4, %%mm1 \n\t"
2312 "movq %%mm2, %%mm3 \n\t"
2313 "psrlq $24, %%mm4 \n\t"
2314 "psrlq $24, %%mm2 \n\t"
2315 PAVGB" %%mm1, %%mm4 \n\t"
2316 PAVGB" %%mm3, %%mm2 \n\t"
2317 "punpcklbw %%mm7, %%mm4 \n\t"
2318 "punpcklbw %%mm7, %%mm2 \n\t"
2320 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2321 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2322 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2323 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2324 "punpcklbw %%mm7, %%mm4 \n\t"
2325 "punpcklbw %%mm7, %%mm1 \n\t"
2326 "punpcklbw %%mm7, %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm3 \n\t"
2328 "paddw %%mm1, %%mm4 \n\t"
2329 "paddw %%mm3, %%mm2 \n\t"
2330 "paddw %%mm2, %%mm4 \n\t"
// mm5 is borrowed as a scratch register here and reloaded with ff_w1111 a few lines below.
2331 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2332 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2333 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2334 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2335 "punpcklbw %%mm7, %%mm5 \n\t"
2336 "punpcklbw %%mm7, %%mm1 \n\t"
2337 "punpcklbw %%mm7, %%mm2 \n\t"
2338 "punpcklbw %%mm7, %%mm3 \n\t"
2339 "paddw %%mm1, %%mm5 \n\t"
2340 "paddw %%mm3, %%mm2 \n\t"
2341 "paddw %%mm5, %%mm2 \n\t"
2342 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2343 "psrlw $2, %%mm4 \n\t"
2344 "psrlw $2, %%mm2 \n\t"
2346 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2347 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2349 "pmaddwd %%mm4, %%mm1 \n\t"
2350 "pmaddwd %%mm2, %%mm3 \n\t"
2351 "pmaddwd %%mm6, %%mm4 \n\t"
2352 "pmaddwd %%mm6, %%mm2 \n\t"
2353 #ifndef FAST_BGR2YV12
2354 "psrad $8, %%mm4 \n\t"
2355 "psrad $8, %%mm1 \n\t"
2356 "psrad $8, %%mm2 \n\t"
2357 "psrad $8, %%mm3 \n\t"
2359 "packssdw %%mm2, %%mm4 \n\t"
2360 "packssdw %%mm3, %%mm1 \n\t"
2361 "pmaddwd %%mm5, %%mm4 \n\t"
2362 "pmaddwd %%mm5, %%mm1 \n\t"
2363 "add $24, %%"REG_d" \n\t"
2364 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2365 "psraw $7, %%mm4 \n\t"
// Regroup the two halves as U0..U3 | V0..V3, narrow to bytes, add the offset,
// then store 4 U bytes through %2 and 4 V bytes through %3.
2367 "movq %%mm0, %%mm1 \n\t"
2368 "punpckldq %%mm4, %%mm0 \n\t"
2369 "punpckhdq %%mm4, %%mm1 \n\t"
2370 "packsswb %%mm1, %%mm0 \n\t"
2371 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2372 "movd %%mm0, (%2, %%"REG_a") \n\t"
2373 "punpckhdq %%mm0, %%mm0 \n\t"
2374 "movd %%mm0, (%3, %%"REG_a") \n\t"
2375 "add $4, %%"REG_a" \n\t"
2377 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2378 : "%"REG_a, "%"REG_d
2381 udst += chromStride;
2382 vdst += chromStride;
2386 asm volatile( EMMS" \n\t"
// Scalar path for the remaining line pairs.
2392 for (; y<height; y+=2)
// First line of the pair: Y, U and V are computed from the first pixel of each
// pixel pair (source bytes 6*i .. 6*i+2), and Y again for the second pixel.
2395 for (i=0; i<chromWidth; i++)
2397 unsigned int b = src[6*i+0];
2398 unsigned int g = src[6*i+1];
2399 unsigned int r = src[6*i+2];
2401 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2402 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2403 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2413 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second line of the pair: only Y is computed.
2419 for (i=0; i<chromWidth; i++)
2421 unsigned int b = src[6*i+0];
2422 unsigned int g = src[6*i+1];
2423 unsigned int r = src[6*i+2];
2425 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2433 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2436 udst += chromStride;
2437 vdst += chromStride;
/*
 * Interleaves two byte planes row by row: dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w] for every column w, repeated for height rows.
 * NOTE(review): unlike the neighbouring converters this one is not declared
 * static inline and takes non-const source pointers; confirm that is intended.
 */
2443 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2444 long width, long height, long src1Stride,
2445 long src2Stride, long dstStride){
2448 for (h=0; h < height; h++)
// XMM variant: 16 bytes from each source per pass, loop bound width-15.
// NOTE(review): movdqa and movntdq require 16-byte-aligned addresses; no alignment check is visible here.
2455 "xor %%"REG_a", %%"REG_a" \n\t"
2457 PREFETCH" 64(%1, %%"REG_a") \n\t"
2458 PREFETCH" 64(%2, %%"REG_a") \n\t"
2459 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2460 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2461 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2462 "punpcklbw %%xmm2, %%xmm0 \n\t"
2463 "punpckhbw %%xmm2, %%xmm1 \n\t"
2464 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2465 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2466 "add $16, %%"REG_a" \n\t"
2467 "cmp %3, %%"REG_a" \n\t"
2469 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2470 : "memory", "%"REG_a""
// MMX variant: also 16 bytes from each source per pass, as two 8-byte halves.
2474 "xor %%"REG_a", %%"REG_a" \n\t"
2476 PREFETCH" 64(%1, %%"REG_a") \n\t"
2477 PREFETCH" 64(%2, %%"REG_a") \n\t"
2478 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2479 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2480 "movq %%mm0, %%mm1 \n\t"
2481 "movq %%mm2, %%mm3 \n\t"
2482 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2483 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2484 "punpcklbw %%mm4, %%mm0 \n\t"
2485 "punpckhbw %%mm4, %%mm1 \n\t"
2486 "punpcklbw %%mm5, %%mm2 \n\t"
2487 "punpckhbw %%mm5, %%mm3 \n\t"
2488 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2489 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2490 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2491 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2492 "add $16, %%"REG_a" \n\t"
2493 "cmp %3, %%"REG_a" \n\t"
2495 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2496 : "memory", "%"REG_a
// Scalar tail for the columns past the last full group of 16.
2499 for (w= (width&(~15)); w < width; w++)
2501 dest[2*w+0] = src1[w];
2502 dest[2*w+1] = src2[w];
// Plain C version covering the whole row.
2505 for (w=0; w < width; w++)
2507 dest[2*w+0] = src1[w];
2508 dest[2*w+1] = src2[w];
/*
 * Doubles two planes (src1 -> dst1, src2 -> dst2) in both directions: every
 * source byte is written to two adjacent destination bytes, and destination
 * row y reads source row y>>1. w and h are half of width and height.
 */
2524 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2525 uint8_t *dst1, uint8_t *dst2,
2526 long width, long height,
2527 long srcStride1, long srcStride2,
2528 long dstStride1, long dstStride2)
2531 w=width/2; h=height/2;
2536 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// First plane.
2539 const uint8_t* s1=src1+srcStride1*(y>>1);
2540 uint8_t* d=dst1+dstStride1*y;
// MMX: 32 source bytes per pass; unpacking a register with itself duplicates each byte, giving 64 output bytes.
2546 PREFETCH" 32%1 \n\t"
2547 "movq %1, %%mm0 \n\t"
2548 "movq 8%1, %%mm2 \n\t"
2549 "movq 16%1, %%mm4 \n\t"
2550 "movq 24%1, %%mm6 \n\t"
2551 "movq %%mm0, %%mm1 \n\t"
2552 "movq %%mm2, %%mm3 \n\t"
2553 "movq %%mm4, %%mm5 \n\t"
2554 "movq %%mm6, %%mm7 \n\t"
2555 "punpcklbw %%mm0, %%mm0 \n\t"
2556 "punpckhbw %%mm1, %%mm1 \n\t"
2557 "punpcklbw %%mm2, %%mm2 \n\t"
2558 "punpckhbw %%mm3, %%mm3 \n\t"
2559 "punpcklbw %%mm4, %%mm4 \n\t"
2560 "punpckhbw %%mm5, %%mm5 \n\t"
2561 "punpcklbw %%mm6, %%mm6 \n\t"
2562 "punpckhbw %%mm7, %%mm7 \n\t"
2563 MOVNTQ" %%mm0, %0 \n\t"
2564 MOVNTQ" %%mm1, 8%0 \n\t"
2565 MOVNTQ" %%mm2, 16%0 \n\t"
2566 MOVNTQ" %%mm3, 24%0 \n\t"
2567 MOVNTQ" %%mm4, 32%0 \n\t"
2568 MOVNTQ" %%mm5, 40%0 \n\t"
2569 MOVNTQ" %%mm6, 48%0 \n\t"
2570 MOVNTQ" %%mm7, 56%0"
// Scalar remainder of the row.
2576 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// Second plane, same procedure.
2579 const uint8_t* s2=src2+srcStride2*(y>>1);
2580 uint8_t* d=dst2+dstStride2*y;
2586 PREFETCH" 32%1 \n\t"
2587 "movq %1, %%mm0 \n\t"
2588 "movq 8%1, %%mm2 \n\t"
2589 "movq 16%1, %%mm4 \n\t"
2590 "movq 24%1, %%mm6 \n\t"
2591 "movq %%mm0, %%mm1 \n\t"
2592 "movq %%mm2, %%mm3 \n\t"
2593 "movq %%mm4, %%mm5 \n\t"
2594 "movq %%mm6, %%mm7 \n\t"
2595 "punpcklbw %%mm0, %%mm0 \n\t"
2596 "punpckhbw %%mm1, %%mm1 \n\t"
2597 "punpcklbw %%mm2, %%mm2 \n\t"
2598 "punpckhbw %%mm3, %%mm3 \n\t"
2599 "punpcklbw %%mm4, %%mm4 \n\t"
2600 "punpckhbw %%mm5, %%mm5 \n\t"
2601 "punpcklbw %%mm6, %%mm6 \n\t"
2602 "punpckhbw %%mm7, %%mm7 \n\t"
2603 MOVNTQ" %%mm0, %0 \n\t"
2604 MOVNTQ" %%mm1, 8%0 \n\t"
2605 MOVNTQ" %%mm2, 16%0 \n\t"
2606 MOVNTQ" %%mm3, 24%0 \n\t"
2607 MOVNTQ" %%mm4, 32%0 \n\t"
2608 MOVNTQ" %%mm5, 40%0 \n\t"
2609 MOVNTQ" %%mm6, 48%0 \n\t"
2610 MOVNTQ" %%mm7, 56%0"
2616 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Packs a Y plane (src1) with U (src2) and V (src3) planes that are subsampled
 * by four in both directions into interleaved Y U Y V: chroma row y>>2 serves
 * destination row y, and each chroma sample x is paired with luma samples 4*x .. 4*x+3.
 */
2627 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2629 long width, long height,
2630 long srcStride1, long srcStride2,
2631 long srcStride3, long dstStride)
2634 w=width/2; h=height;
2636 const uint8_t* yp=src1+srcStride1*y;
2637 const uint8_t* up=src2+srcStride2*(y>>2);
2638 const uint8_t* vp=src3+srcStride3*(y>>2);
2639 uint8_t* d=dst+dstStride*y;
// MMX: 8 U, 8 V and 32 Y bytes per pass give 64 output bytes; %0 is the chroma index
// (Y is addressed at 4x, dst at 8x that index).
2645 PREFETCH" 32(%1, %0) \n\t"
2646 PREFETCH" 32(%2, %0) \n\t"
2647 PREFETCH" 32(%3, %0) \n\t"
2648 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2649 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2650 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2651 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2652 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2653 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2654 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2655 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2656 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2657 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2659 "movq %%mm1, %%mm6 \n\t"
2660 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2661 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2662 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2663 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2664 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2666 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2667 "movq 8(%1, %0, 4), %%mm0 \n\t"
2668 "movq %%mm0, %%mm3 \n\t"
2669 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2670 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2671 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2672 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2674 "movq %%mm4, %%mm6 \n\t"
2675 "movq 16(%1, %0, 4), %%mm0 \n\t"
2676 "movq %%mm0, %%mm3 \n\t"
2677 "punpcklbw %%mm5, %%mm4 \n\t"
2678 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2679 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2680 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2681 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2683 "punpckhbw %%mm5, %%mm6 \n\t"
2684 "movq 24(%1, %0, 4), %%mm0 \n\t"
2685 "movq %%mm0, %%mm3 \n\t"
2686 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2687 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2688 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2689 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2692 : "r"(yp), "r" (up), "r"(vp), "r"(d)
// Scalar remainder: x2 is the luma index matching chroma index x; the even
// offsets of each 8-byte output group take the four luma samples.
2698 const long x2 = x<<2;
2701 d[8*x+2] = yp[x2+1];
2703 d[8*x+4] = yp[x2+2];
2705 d[8*x+6] = yp[x2+3];
/*
 * Points the converter function pointers at this template instance's
 * RENAME()d implementations. The uyvytoyv12 and yvu9toyv12 assignments are
 * commented out, so those two pointers are left as they were.
 */
2718 static inline void RENAME(rgb2rgb_init)(void){
2719 rgb15to16 = RENAME(rgb15to16);
2720 rgb15tobgr24 = RENAME(rgb15tobgr24);
2721 rgb15to32 = RENAME(rgb15to32);
2722 rgb16tobgr24 = RENAME(rgb16tobgr24);
2723 rgb16to32 = RENAME(rgb16to32);
2724 rgb16to15 = RENAME(rgb16to15);
2725 rgb24tobgr16 = RENAME(rgb24tobgr16);
2726 rgb24tobgr15 = RENAME(rgb24tobgr15);
2727 rgb24tobgr32 = RENAME(rgb24tobgr32);
2728 rgb32to16 = RENAME(rgb32to16);
2729 rgb32to15 = RENAME(rgb32to15);
2730 rgb32tobgr24 = RENAME(rgb32tobgr24);
2731 rgb24to15 = RENAME(rgb24to15);
2732 rgb24to16 = RENAME(rgb24to16);
2733 rgb24tobgr24 = RENAME(rgb24tobgr24);
2734 rgb32tobgr32 = RENAME(rgb32tobgr32);
2735 rgb32tobgr16 = RENAME(rgb32tobgr16);
2736 rgb32tobgr15 = RENAME(rgb32tobgr15);
2737 yv12toyuy2 = RENAME(yv12toyuy2);
2738 yv12touyvy = RENAME(yv12touyvy);
2739 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2740 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2741 yuy2toyv12 = RENAME(yuy2toyv12);
2742 // uyvytoyv12 = RENAME(uyvytoyv12);
2743 // yvu9toyv12 = RENAME(yvu9toyv12);
2744 planar2x = RENAME(planar2x);
2745 rgb24toyv12 = RENAME(rgb24toyv12);
2746 interleaveBytes = RENAME(interleaveBytes);
2747 vu9_to_vu12 = RENAME(vu9_to_vu12);
2748 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);