2 * software RGB to RGB converter
3 * extended with a software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
31 #include <inttypes.h> /* for __WORDSIZE */
48 #define PREFETCH "prefetch"
49 #define PREFETCHW "prefetchw"
50 #define PAVGB "pavgusb"
51 #elif defined (HAVE_MMX2)
52 #define PREFETCH "prefetchnta"
53 #define PREFETCHW "prefetcht0"
60 #define PREFETCH " # nop"
61 #define PREFETCHW " # nop"
66 /* On K6, femms is faster than emms. On K7, femms is mapped directly to emms. */
73 #define MOVNTQ "movntq"
74 #define SFENCE "sfence"
77 #define SFENCE " # nop"
/*
 * rgb24tobgr32: expand packed 3-byte pixels into 4-byte pixels.
 * NOTE: only part of the body is visible in this excerpt (loop headers,
 * braces and the scalar tail are missing), so these notes cover the visible
 * lines only.
 *
 * Visible MMX path: 3-byte pixels are fetched in pairs from source byte
 * offsets up to 21 into mm0..mm3, every register is ANDed with mask32
 * (preloaded into mm7) and the results are written with MOVNTQ at
 * destination offsets 0, 8 and 16. SFENCE and EMMS follow the MMX code.
 * mask32 is defined elsewhere; presumably it clears the 4th byte of each
 * output pixel -- TODO confirm.
 */
80 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
83 const uint8_t *s = src;
86 const uint8_t *mm_end;
90 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
92 asm volatile("movq %0, %%mm7"::"m"(mask32):"memory");
98 "punpckldq 3%1, %%mm0 \n\t"
99 "movd 6%1, %%mm1 \n\t"
100 "punpckldq 9%1, %%mm1 \n\t"
101 "movd 12%1, %%mm2 \n\t"
102 "punpckldq 15%1, %%mm2 \n\t"
103 "movd 18%1, %%mm3 \n\t"
104 "punpckldq 21%1, %%mm3 \n\t"
105 "pand %%mm7, %%mm0 \n\t"
106 "pand %%mm7, %%mm1 \n\t"
107 "pand %%mm7, %%mm2 \n\t"
108 "pand %%mm7, %%mm3 \n\t"
109 MOVNTQ" %%mm0, %0 \n\t"
110 MOVNTQ" %%mm1, 8%0 \n\t"
111 MOVNTQ" %%mm2, 16%0 \n\t"
119 asm volatile(SFENCE:::"memory");
120 asm volatile(EMMS:::"memory");
124 #ifdef WORDS_BIGENDIAN
125 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * rgb32tobgr24: pack 4-byte pixels into 3-byte pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Visible MMX path: four quadwords (32 source bytes) are loaded per pass.
 * For each quadword the original is masked with mask24l and a copy shifted
 * right by 8 bits is masked with mask24h; ORing the two squeezes out one
 * byte per pixel. The four partial results are then spliced together with
 * 16/32/48-bit shifts and the masks mask24hh, mask24hhh and mask24hhhh
 * before being written with MOVNTQ. SFENCE and EMMS follow the MMX code.
 * The mask values are defined elsewhere and are not visible here.
 */
140 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
143 const uint8_t *s = src;
146 const uint8_t *mm_end;
150 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
156 "movq %1, %%mm0 \n\t"
157 "movq 8%1, %%mm1 \n\t"
158 "movq 16%1, %%mm4 \n\t"
159 "movq 24%1, %%mm5 \n\t"
160 "movq %%mm0, %%mm2 \n\t"
161 "movq %%mm1, %%mm3 \n\t"
162 "movq %%mm4, %%mm6 \n\t"
163 "movq %%mm5, %%mm7 \n\t"
164 "psrlq $8, %%mm2 \n\t"
165 "psrlq $8, %%mm3 \n\t"
166 "psrlq $8, %%mm6 \n\t"
167 "psrlq $8, %%mm7 \n\t"
168 "pand %2, %%mm0 \n\t"
169 "pand %2, %%mm1 \n\t"
170 "pand %2, %%mm4 \n\t"
171 "pand %2, %%mm5 \n\t"
172 "pand %3, %%mm2 \n\t"
173 "pand %3, %%mm3 \n\t"
174 "pand %3, %%mm6 \n\t"
175 "pand %3, %%mm7 \n\t"
176 "por %%mm2, %%mm0 \n\t"
177 "por %%mm3, %%mm1 \n\t"
178 "por %%mm6, %%mm4 \n\t"
179 "por %%mm7, %%mm5 \n\t"
181 "movq %%mm1, %%mm2 \n\t"
182 "movq %%mm4, %%mm3 \n\t"
183 "psllq $48, %%mm2 \n\t"
184 "psllq $32, %%mm3 \n\t"
185 "pand %4, %%mm2 \n\t"
186 "pand %5, %%mm3 \n\t"
187 "por %%mm2, %%mm0 \n\t"
188 "psrlq $16, %%mm1 \n\t"
189 "psrlq $32, %%mm4 \n\t"
190 "psllq $16, %%mm5 \n\t"
191 "por %%mm3, %%mm1 \n\t"
192 "pand %6, %%mm5 \n\t"
193 "por %%mm5, %%mm4 \n\t"
195 MOVNTQ" %%mm0, %0 \n\t"
196 MOVNTQ" %%mm1, 8%0 \n\t"
199 :"m"(*s),"m"(mask24l),
200 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
205 asm volatile(SFENCE:::"memory");
206 asm volatile(EMMS:::"memory");
210 #ifdef WORDS_BIGENDIAN
211 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
227 original by Strepto/Astral
228 ported to gcc & bugfixed: A'rpi
229 MMX2, 3DNOW optimization by Nick Kurshev
230 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: convert 15-bit pixels to 16-bit pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * The conversion uses an and-and-add trick: the value masked with 0x7FE0
 * (bits 5..14) is added to the value masked with 0x7FFF, which doubles
 * bits 5..14, i.e. moves them up one position to bits 6..15, while bits
 * 0..4 stay where they are and bit 5 of the result becomes 0.
 * The MMX path does the same per 16-bit lane with pand (mask15s, held in
 * mm4) followed by paddw against the untouched copy; mask15s is defined
 * elsewhere.
 */
232 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
234 register const uint8_t* s=src;
235 register uint8_t* d=dst;
236 register const uint8_t *end;
237 const uint8_t *mm_end;
240 asm volatile(PREFETCH" %0"::"m"(*s));
241 asm volatile("movq %0, %%mm4"::"m"(mask15s));
247 "movq %1, %%mm0 \n\t"
248 "movq 8%1, %%mm2 \n\t"
249 "movq %%mm0, %%mm1 \n\t"
250 "movq %%mm2, %%mm3 \n\t"
251 "pand %%mm4, %%mm0 \n\t"
252 "pand %%mm4, %%mm2 \n\t"
253 "paddw %%mm1, %%mm0 \n\t"
254 "paddw %%mm3, %%mm2 \n\t"
255 MOVNTQ" %%mm0, %0 \n\t"
263 asm volatile(SFENCE:::"memory");
264 asm volatile(EMMS:::"memory");
/* Scalar path: two pixels at a time through a 32-bit load and store. */
269 register unsigned x= *((const uint32_t *)s);
270 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* Scalar path: a single 16-bit pixel. */
276 register unsigned short x= *((const uint16_t *)s);
277 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * rgb16to15: convert 16-bit pixels to 15-bit pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Bits 6..15 of each 16-bit pixel are moved down one position to bits
 * 5..14 (shift right by 1, then mask with 0x7FE0) and ORed with the
 * unchanged bits 0..4 (mask 0x001F). Bit 5 of the input is dropped.
 * The MMX path does the same on whole quadwords with psrlq $1, mask15rg
 * (mm7) for the shifted copy and mask15b (mm6) for the unshifted copy;
 * both masks are defined elsewhere.
 */
281 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
283 register const uint8_t* s=src;
284 register uint8_t* d=dst;
285 register const uint8_t *end;
286 const uint8_t *mm_end;
289 asm volatile(PREFETCH" %0"::"m"(*s));
290 asm volatile("movq %0, %%mm7"::"m"(mask15rg));
291 asm volatile("movq %0, %%mm6"::"m"(mask15b));
297 "movq %1, %%mm0 \n\t"
298 "movq 8%1, %%mm2 \n\t"
299 "movq %%mm0, %%mm1 \n\t"
300 "movq %%mm2, %%mm3 \n\t"
301 "psrlq $1, %%mm0 \n\t"
302 "psrlq $1, %%mm2 \n\t"
303 "pand %%mm7, %%mm0 \n\t"
304 "pand %%mm7, %%mm2 \n\t"
305 "pand %%mm6, %%mm1 \n\t"
306 "pand %%mm6, %%mm3 \n\t"
307 "por %%mm1, %%mm0 \n\t"
308 "por %%mm3, %%mm2 \n\t"
309 MOVNTQ" %%mm0, %0 \n\t"
317 asm volatile(SFENCE:::"memory");
318 asm volatile(EMMS:::"memory");
/* Scalar path: two pixels at a time through a 32-bit load and store. */
323 register uint32_t x= *((const uint32_t*)s);
324 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* Scalar path: a single 16-bit pixel. */
330 register uint16_t x= *((const uint16_t*)s);
331 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * rgb32to16: pack 32-bit pixels into 16-bit pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Two MMX variants are visible. The one under #if 1 masks the pixels with
 * mask3216br and mask3216g, multiplies the first masked copy by mul3216
 * with pmaddwd and merges the two register halves with psrld $5 and
 * pslld $11. The other variant uses plain shifts by 3, 5 and 8 bits
 * together with blue_16mask, green_16mask and red_16mask.
 * Both read four source pixels (16 bytes) and store 8 bytes per pass.
 *
 * Scalar fallback (last two lines): bits 3..7 of the loaded 32-bit value
 * end up in bits 0..4, bits 10..15 in bits 5..10 and bits 19..23 in bits
 * 11..15 of the 16-bit output.
 */
337 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
339 const uint8_t *s = src;
342 const uint8_t *mm_end;
344 uint16_t *d = (uint16_t *)dst;
348 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
350 "movq %3, %%mm5 \n\t"
351 "movq %4, %%mm6 \n\t"
352 "movq %5, %%mm7 \n\t"
356 PREFETCH" 32(%1) \n\t"
357 "movd (%1), %%mm0 \n\t"
358 "movd 4(%1), %%mm3 \n\t"
359 "punpckldq 8(%1), %%mm0 \n\t"
360 "punpckldq 12(%1), %%mm3 \n\t"
361 "movq %%mm0, %%mm1 \n\t"
362 "movq %%mm3, %%mm4 \n\t"
363 "pand %%mm6, %%mm0 \n\t"
364 "pand %%mm6, %%mm3 \n\t"
365 "pmaddwd %%mm7, %%mm0 \n\t"
366 "pmaddwd %%mm7, %%mm3 \n\t"
367 "pand %%mm5, %%mm1 \n\t"
368 "pand %%mm5, %%mm4 \n\t"
369 "por %%mm1, %%mm0 \n\t"
370 "por %%mm4, %%mm3 \n\t"
371 "psrld $5, %%mm0 \n\t"
372 "pslld $11, %%mm3 \n\t"
373 "por %%mm3, %%mm0 \n\t"
374 MOVNTQ" %%mm0, (%0) \n\t"
381 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
384 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
386 "movq %0, %%mm7 \n\t"
387 "movq %1, %%mm6 \n\t"
388 ::"m"(red_16mask),"m"(green_16mask));
393 "movd %1, %%mm0 \n\t"
394 "movd 4%1, %%mm3 \n\t"
395 "punpckldq 8%1, %%mm0 \n\t"
396 "punpckldq 12%1, %%mm3 \n\t"
397 "movq %%mm0, %%mm1 \n\t"
398 "movq %%mm0, %%mm2 \n\t"
399 "movq %%mm3, %%mm4 \n\t"
400 "movq %%mm3, %%mm5 \n\t"
401 "psrlq $3, %%mm0 \n\t"
402 "psrlq $3, %%mm3 \n\t"
403 "pand %2, %%mm0 \n\t"
404 "pand %2, %%mm3 \n\t"
405 "psrlq $5, %%mm1 \n\t"
406 "psrlq $5, %%mm4 \n\t"
407 "pand %%mm6, %%mm1 \n\t"
408 "pand %%mm6, %%mm4 \n\t"
409 "psrlq $8, %%mm2 \n\t"
410 "psrlq $8, %%mm5 \n\t"
411 "pand %%mm7, %%mm2 \n\t"
412 "pand %%mm7, %%mm5 \n\t"
413 "por %%mm1, %%mm0 \n\t"
414 "por %%mm4, %%mm3 \n\t"
415 "por %%mm2, %%mm0 \n\t"
416 "por %%mm5, %%mm3 \n\t"
417 "psllq $16, %%mm3 \n\t"
418 "por %%mm3, %%mm0 \n\t"
419 MOVNTQ" %%mm0, %0 \n\t"
420 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
425 asm volatile(SFENCE:::"memory");
426 asm volatile(EMMS:::"memory");
/* Scalar fallback: one 32-bit pixel in, one 16-bit pixel out. */
430 register int rgb = *(const uint32_t*)s; s += 4;
431 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * rgb32tobgr16: pack 32-bit pixels into 16-bit pixels with the two outer
 * 5-bit fields exchanged relative to rgb32to16.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Visible MMX path: four source pixels (16 bytes) per pass; one copy is
 * shifted left by 8 and masked with red_16mask (mm7), one is shifted right
 * by 5 and masked with green_16mask (mm6), one is shifted right by 19 and
 * masked with blue_16mask; the results are ORed and 8 bytes are stored.
 *
 * Scalar fallback (last two lines): bits 3..7 of the loaded 32-bit value
 * end up in bits 11..15, bits 10..15 in bits 5..10 and bits 19..23 in bits
 * 0..4 of the 16-bit output.
 */
435 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
437 const uint8_t *s = src;
440 const uint8_t *mm_end;
442 uint16_t *d = (uint16_t *)dst;
445 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
447 "movq %0, %%mm7 \n\t"
448 "movq %1, %%mm6 \n\t"
449 ::"m"(red_16mask),"m"(green_16mask));
455 "movd %1, %%mm0 \n\t"
456 "movd 4%1, %%mm3 \n\t"
457 "punpckldq 8%1, %%mm0 \n\t"
458 "punpckldq 12%1, %%mm3 \n\t"
459 "movq %%mm0, %%mm1 \n\t"
460 "movq %%mm0, %%mm2 \n\t"
461 "movq %%mm3, %%mm4 \n\t"
462 "movq %%mm3, %%mm5 \n\t"
463 "psllq $8, %%mm0 \n\t"
464 "psllq $8, %%mm3 \n\t"
465 "pand %%mm7, %%mm0 \n\t"
466 "pand %%mm7, %%mm3 \n\t"
467 "psrlq $5, %%mm1 \n\t"
468 "psrlq $5, %%mm4 \n\t"
469 "pand %%mm6, %%mm1 \n\t"
470 "pand %%mm6, %%mm4 \n\t"
471 "psrlq $19, %%mm2 \n\t"
472 "psrlq $19, %%mm5 \n\t"
473 "pand %2, %%mm2 \n\t"
474 "pand %2, %%mm5 \n\t"
475 "por %%mm1, %%mm0 \n\t"
476 "por %%mm4, %%mm3 \n\t"
477 "por %%mm2, %%mm0 \n\t"
478 "por %%mm5, %%mm3 \n\t"
479 "psllq $16, %%mm3 \n\t"
480 "por %%mm3, %%mm0 \n\t"
481 MOVNTQ" %%mm0, %0 \n\t"
482 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
486 asm volatile(SFENCE:::"memory");
487 asm volatile(EMMS:::"memory");
/* Scalar fallback: one 32-bit pixel in, one 16-bit pixel out. */
491 register int rgb = *(const uint32_t*)s; s += 4;
492 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * rgb32to15: pack 32-bit pixels into 15-bit pixels (stored in 16 bits).
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Two MMX variants are visible. The one under #if 1 masks the pixels with
 * mask3216br and mask3215g, multiplies the first masked copy by mul3215
 * with pmaddwd and merges the two register halves with psrld $6 and
 * pslld $10. The other variant uses plain shifts by 3, 6 and 9 bits
 * together with blue_15mask, green_15mask and red_15mask.
 * Both read four source pixels (16 bytes) and store 8 bytes per pass.
 *
 * Scalar fallback (last two lines): bits 3..7 of the loaded 32-bit value
 * end up in bits 0..4, bits 11..15 in bits 5..9 and bits 19..23 in bits
 * 10..14 of the output.
 */
496 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
498 const uint8_t *s = src;
501 const uint8_t *mm_end;
503 uint16_t *d = (uint16_t *)dst;
507 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
509 "movq %3, %%mm5 \n\t"
510 "movq %4, %%mm6 \n\t"
511 "movq %5, %%mm7 \n\t"
515 PREFETCH" 32(%1) \n\t"
516 "movd (%1), %%mm0 \n\t"
517 "movd 4(%1), %%mm3 \n\t"
518 "punpckldq 8(%1), %%mm0 \n\t"
519 "punpckldq 12(%1), %%mm3 \n\t"
520 "movq %%mm0, %%mm1 \n\t"
521 "movq %%mm3, %%mm4 \n\t"
522 "pand %%mm6, %%mm0 \n\t"
523 "pand %%mm6, %%mm3 \n\t"
524 "pmaddwd %%mm7, %%mm0 \n\t"
525 "pmaddwd %%mm7, %%mm3 \n\t"
526 "pand %%mm5, %%mm1 \n\t"
527 "pand %%mm5, %%mm4 \n\t"
528 "por %%mm1, %%mm0 \n\t"
529 "por %%mm4, %%mm3 \n\t"
530 "psrld $6, %%mm0 \n\t"
531 "pslld $10, %%mm3 \n\t"
532 "por %%mm3, %%mm0 \n\t"
533 MOVNTQ" %%mm0, (%0) \n\t"
540 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
543 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
545 "movq %0, %%mm7 \n\t"
546 "movq %1, %%mm6 \n\t"
547 ::"m"(red_15mask),"m"(green_15mask));
552 "movd %1, %%mm0 \n\t"
553 "movd 4%1, %%mm3 \n\t"
554 "punpckldq 8%1, %%mm0 \n\t"
555 "punpckldq 12%1, %%mm3 \n\t"
556 "movq %%mm0, %%mm1 \n\t"
557 "movq %%mm0, %%mm2 \n\t"
558 "movq %%mm3, %%mm4 \n\t"
559 "movq %%mm3, %%mm5 \n\t"
560 "psrlq $3, %%mm0 \n\t"
561 "psrlq $3, %%mm3 \n\t"
562 "pand %2, %%mm0 \n\t"
563 "pand %2, %%mm3 \n\t"
564 "psrlq $6, %%mm1 \n\t"
565 "psrlq $6, %%mm4 \n\t"
566 "pand %%mm6, %%mm1 \n\t"
567 "pand %%mm6, %%mm4 \n\t"
568 "psrlq $9, %%mm2 \n\t"
569 "psrlq $9, %%mm5 \n\t"
570 "pand %%mm7, %%mm2 \n\t"
571 "pand %%mm7, %%mm5 \n\t"
572 "por %%mm1, %%mm0 \n\t"
573 "por %%mm4, %%mm3 \n\t"
574 "por %%mm2, %%mm0 \n\t"
575 "por %%mm5, %%mm3 \n\t"
576 "psllq $16, %%mm3 \n\t"
577 "por %%mm3, %%mm0 \n\t"
578 MOVNTQ" %%mm0, %0 \n\t"
579 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
584 asm volatile(SFENCE:::"memory");
585 asm volatile(EMMS:::"memory");
/* Scalar fallback: one 32-bit pixel in, one 15-bit pixel out. */
589 register int rgb = *(const uint32_t*)s; s += 4;
590 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * rgb32tobgr15: pack 32-bit pixels into 15-bit pixels with the two outer
 * 5-bit fields exchanged relative to rgb32to15.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * Visible MMX path: four source pixels (16 bytes) per pass; one copy is
 * shifted left by 7 and masked with red_15mask (mm7), one is shifted right
 * by 6 and masked with green_15mask (mm6), one is shifted right by 19 and
 * masked with blue_15mask; the results are ORed and 8 bytes are stored.
 *
 * Scalar fallback (last two lines): bits 3..7 of the loaded 32-bit value
 * end up in bits 10..14, bits 11..15 in bits 5..9 and bits 19..23 in bits
 * 0..4 of the output.
 */
594 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
596 const uint8_t *s = src;
599 const uint8_t *mm_end;
601 uint16_t *d = (uint16_t *)dst;
604 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
606 "movq %0, %%mm7 \n\t"
607 "movq %1, %%mm6 \n\t"
608 ::"m"(red_15mask),"m"(green_15mask));
614 "movd %1, %%mm0 \n\t"
615 "movd 4%1, %%mm3 \n\t"
616 "punpckldq 8%1, %%mm0 \n\t"
617 "punpckldq 12%1, %%mm3 \n\t"
618 "movq %%mm0, %%mm1 \n\t"
619 "movq %%mm0, %%mm2 \n\t"
620 "movq %%mm3, %%mm4 \n\t"
621 "movq %%mm3, %%mm5 \n\t"
622 "psllq $7, %%mm0 \n\t"
623 "psllq $7, %%mm3 \n\t"
624 "pand %%mm7, %%mm0 \n\t"
625 "pand %%mm7, %%mm3 \n\t"
626 "psrlq $6, %%mm1 \n\t"
627 "psrlq $6, %%mm4 \n\t"
628 "pand %%mm6, %%mm1 \n\t"
629 "pand %%mm6, %%mm4 \n\t"
630 "psrlq $19, %%mm2 \n\t"
631 "psrlq $19, %%mm5 \n\t"
632 "pand %2, %%mm2 \n\t"
633 "pand %2, %%mm5 \n\t"
634 "por %%mm1, %%mm0 \n\t"
635 "por %%mm4, %%mm3 \n\t"
636 "por %%mm2, %%mm0 \n\t"
637 "por %%mm5, %%mm3 \n\t"
638 "psllq $16, %%mm3 \n\t"
639 "por %%mm3, %%mm0 \n\t"
640 MOVNTQ" %%mm0, %0 \n\t"
641 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
645 asm volatile(SFENCE:::"memory");
646 asm volatile(EMMS:::"memory");
/* Scalar fallback: one 32-bit pixel in, one 15-bit pixel out. */
650 register int rgb = *(const uint32_t*)s; s += 4;
651 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * rgb24tobgr16: pack 3-byte pixels into 16-bit pixels.
 * NOTE: only part of the body is visible in this excerpt; in particular
 * the scalar loads of b, g and r are missing.
 *
 * Visible MMX path: four 3-byte pixels are fetched from source offsets
 * 0, 3, 6 and 9; copies are shifted right by 3, 5 and 8 bits and masked
 * with blue_16mask, green_16mask (mm6) and red_16mask (mm7), then ORed
 * and stored as 8 bytes.
 *
 * Scalar fallback (last line): b>>3 fills the low bits, bits 2..7 of g go
 * to bits 5..10 and bits 3..7 of r go to bits 11..15 of the output.
 */
655 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
657 const uint8_t *s = src;
660 const uint8_t *mm_end;
662 uint16_t *d = (uint16_t *)dst;
665 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
667 "movq %0, %%mm7 \n\t"
668 "movq %1, %%mm6 \n\t"
669 ::"m"(red_16mask),"m"(green_16mask));
675 "movd %1, %%mm0 \n\t"
676 "movd 3%1, %%mm3 \n\t"
677 "punpckldq 6%1, %%mm0 \n\t"
678 "punpckldq 9%1, %%mm3 \n\t"
679 "movq %%mm0, %%mm1 \n\t"
680 "movq %%mm0, %%mm2 \n\t"
681 "movq %%mm3, %%mm4 \n\t"
682 "movq %%mm3, %%mm5 \n\t"
683 "psrlq $3, %%mm0 \n\t"
684 "psrlq $3, %%mm3 \n\t"
685 "pand %2, %%mm0 \n\t"
686 "pand %2, %%mm3 \n\t"
687 "psrlq $5, %%mm1 \n\t"
688 "psrlq $5, %%mm4 \n\t"
689 "pand %%mm6, %%mm1 \n\t"
690 "pand %%mm6, %%mm4 \n\t"
691 "psrlq $8, %%mm2 \n\t"
692 "psrlq $8, %%mm5 \n\t"
693 "pand %%mm7, %%mm2 \n\t"
694 "pand %%mm7, %%mm5 \n\t"
695 "por %%mm1, %%mm0 \n\t"
696 "por %%mm4, %%mm3 \n\t"
697 "por %%mm2, %%mm0 \n\t"
698 "por %%mm5, %%mm3 \n\t"
699 "psllq $16, %%mm3 \n\t"
700 "por %%mm3, %%mm0 \n\t"
701 MOVNTQ" %%mm0, %0 \n\t"
702 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
706 asm volatile(SFENCE:::"memory");
707 asm volatile(EMMS:::"memory");
/* Scalar fallback: combine the three components into one 16-bit pixel. */
714 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to16: pack 3-byte pixels into 16-bit pixels.
 * NOTE: only part of the body is visible in this excerpt; in particular
 * the scalar loads of b, g and r (and therefore the order in which the
 * source bytes are assigned to them) are missing.
 *
 * Visible MMX path: four 3-byte pixels are fetched from source offsets
 * 0, 3, 6 and 9; one copy is shifted left by 8 and masked with red_16mask
 * (mm7), one is shifted right by 5 and masked with green_16mask (mm6),
 * one is shifted right by 19 and masked with blue_16mask; the results are
 * ORed and stored as 8 bytes.
 *
 * Scalar fallback (last line): b>>3 fills the low bits, bits 2..7 of g go
 * to bits 5..10 and bits 3..7 of r go to bits 11..15 of the output.
 */
718 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
720 const uint8_t *s = src;
723 const uint8_t *mm_end;
725 uint16_t *d = (uint16_t *)dst;
728 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
730 "movq %0, %%mm7 \n\t"
731 "movq %1, %%mm6 \n\t"
732 ::"m"(red_16mask),"m"(green_16mask));
738 "movd %1, %%mm0 \n\t"
739 "movd 3%1, %%mm3 \n\t"
740 "punpckldq 6%1, %%mm0 \n\t"
741 "punpckldq 9%1, %%mm3 \n\t"
742 "movq %%mm0, %%mm1 \n\t"
743 "movq %%mm0, %%mm2 \n\t"
744 "movq %%mm3, %%mm4 \n\t"
745 "movq %%mm3, %%mm5 \n\t"
746 "psllq $8, %%mm0 \n\t"
747 "psllq $8, %%mm3 \n\t"
748 "pand %%mm7, %%mm0 \n\t"
749 "pand %%mm7, %%mm3 \n\t"
750 "psrlq $5, %%mm1 \n\t"
751 "psrlq $5, %%mm4 \n\t"
752 "pand %%mm6, %%mm1 \n\t"
753 "pand %%mm6, %%mm4 \n\t"
754 "psrlq $19, %%mm2 \n\t"
755 "psrlq $19, %%mm5 \n\t"
756 "pand %2, %%mm2 \n\t"
757 "pand %2, %%mm5 \n\t"
758 "por %%mm1, %%mm0 \n\t"
759 "por %%mm4, %%mm3 \n\t"
760 "por %%mm2, %%mm0 \n\t"
761 "por %%mm5, %%mm3 \n\t"
762 "psllq $16, %%mm3 \n\t"
763 "por %%mm3, %%mm0 \n\t"
764 MOVNTQ" %%mm0, %0 \n\t"
765 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
769 asm volatile(SFENCE:::"memory");
770 asm volatile(EMMS:::"memory");
/* Scalar fallback: combine the three components into one 16-bit pixel. */
777 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24tobgr15: pack 3-byte pixels into 15-bit pixels (stored in 16 bits).
 * NOTE: only part of the body is visible in this excerpt; in particular
 * the scalar loads of b, g and r are missing.
 *
 * Visible MMX path: four 3-byte pixels are fetched from source offsets
 * 0, 3, 6 and 9; copies are shifted right by 3, 6 and 9 bits and masked
 * with blue_15mask, green_15mask (mm6) and red_15mask (mm7), then ORed
 * and stored as 8 bytes.
 *
 * Scalar fallback (last line): b>>3 fills the low bits, bits 3..7 of g go
 * to bits 5..9 and bits 3..7 of r go to bits 10..14 of the output.
 */
781 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
783 const uint8_t *s = src;
786 const uint8_t *mm_end;
788 uint16_t *d = (uint16_t *)dst;
791 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
793 "movq %0, %%mm7 \n\t"
794 "movq %1, %%mm6 \n\t"
795 ::"m"(red_15mask),"m"(green_15mask));
801 "movd %1, %%mm0 \n\t"
802 "movd 3%1, %%mm3 \n\t"
803 "punpckldq 6%1, %%mm0 \n\t"
804 "punpckldq 9%1, %%mm3 \n\t"
805 "movq %%mm0, %%mm1 \n\t"
806 "movq %%mm0, %%mm2 \n\t"
807 "movq %%mm3, %%mm4 \n\t"
808 "movq %%mm3, %%mm5 \n\t"
809 "psrlq $3, %%mm0 \n\t"
810 "psrlq $3, %%mm3 \n\t"
811 "pand %2, %%mm0 \n\t"
812 "pand %2, %%mm3 \n\t"
813 "psrlq $6, %%mm1 \n\t"
814 "psrlq $6, %%mm4 \n\t"
815 "pand %%mm6, %%mm1 \n\t"
816 "pand %%mm6, %%mm4 \n\t"
817 "psrlq $9, %%mm2 \n\t"
818 "psrlq $9, %%mm5 \n\t"
819 "pand %%mm7, %%mm2 \n\t"
820 "pand %%mm7, %%mm5 \n\t"
821 "por %%mm1, %%mm0 \n\t"
822 "por %%mm4, %%mm3 \n\t"
823 "por %%mm2, %%mm0 \n\t"
824 "por %%mm5, %%mm3 \n\t"
825 "psllq $16, %%mm3 \n\t"
826 "por %%mm3, %%mm0 \n\t"
827 MOVNTQ" %%mm0, %0 \n\t"
828 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
832 asm volatile(SFENCE:::"memory");
833 asm volatile(EMMS:::"memory");
/* Scalar fallback: combine the three components into one 15-bit pixel. */
840 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24to15: pack 3-byte pixels into 15-bit pixels (stored in 16 bits).
 * NOTE: only part of the body is visible in this excerpt; in particular
 * the scalar loads of b, g and r (and therefore the order in which the
 * source bytes are assigned to them) are missing.
 *
 * Visible MMX path: four 3-byte pixels are fetched from source offsets
 * 0, 3, 6 and 9; one copy is shifted left by 7 and masked with red_15mask
 * (mm7), one is shifted right by 6 and masked with green_15mask (mm6),
 * one is shifted right by 19 and masked with blue_15mask; the results are
 * ORed and stored as 8 bytes.
 *
 * Scalar fallback (last line): b>>3 fills the low bits, bits 3..7 of g go
 * to bits 5..9 and bits 3..7 of r go to bits 10..14 of the output.
 */
844 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
846 const uint8_t *s = src;
849 const uint8_t *mm_end;
851 uint16_t *d = (uint16_t *)dst;
854 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
856 "movq %0, %%mm7 \n\t"
857 "movq %1, %%mm6 \n\t"
858 ::"m"(red_15mask),"m"(green_15mask));
864 "movd %1, %%mm0 \n\t"
865 "movd 3%1, %%mm3 \n\t"
866 "punpckldq 6%1, %%mm0 \n\t"
867 "punpckldq 9%1, %%mm3 \n\t"
868 "movq %%mm0, %%mm1 \n\t"
869 "movq %%mm0, %%mm2 \n\t"
870 "movq %%mm3, %%mm4 \n\t"
871 "movq %%mm3, %%mm5 \n\t"
872 "psllq $7, %%mm0 \n\t"
873 "psllq $7, %%mm3 \n\t"
874 "pand %%mm7, %%mm0 \n\t"
875 "pand %%mm7, %%mm3 \n\t"
876 "psrlq $6, %%mm1 \n\t"
877 "psrlq $6, %%mm4 \n\t"
878 "pand %%mm6, %%mm1 \n\t"
879 "pand %%mm6, %%mm4 \n\t"
880 "psrlq $19, %%mm2 \n\t"
881 "psrlq $19, %%mm5 \n\t"
882 "pand %2, %%mm2 \n\t"
883 "pand %2, %%mm5 \n\t"
884 "por %%mm1, %%mm0 \n\t"
885 "por %%mm4, %%mm3 \n\t"
886 "por %%mm2, %%mm0 \n\t"
887 "por %%mm5, %%mm3 \n\t"
888 "psllq $16, %%mm3 \n\t"
889 "por %%mm3, %%mm0 \n\t"
890 MOVNTQ" %%mm0, %0 \n\t"
891 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
895 asm volatile(SFENCE:::"memory");
896 asm volatile(EMMS:::"memory");
/* Scalar fallback: combine the three components into one 15-bit pixel. */
903 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
908 I use a less accurate approximation here by simply left-shifting the input
909 value and filling the low order bits with zeroes. This method improves PNG
910 compression but this scheme cannot reproduce white exactly, since it does
911 not generate an all-ones maximum value; the net effect is to darken the
914 The better method should be "left bit replication":
924 | leftmost bits repeated to fill open bits
/*
 * rgb15tobgr24: expand 15-bit pixels into 3-byte pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * end is set to s + src_size/2, i.e. one past the last 16-bit source pixel.
 *
 * Visible MMX path, first asm statement: for each of two source quadwords
 * (offsets 0 and 8) the three fields are isolated with mask15b, mask15g
 * and mask15r, shifted (left 3, right 2, right 7), widened from words to
 * dwords with punpcklwd/punpckhwd against mmx_null and ORed together at
 * bit offsets 0, 8 and 16. The second asm statement (marked as borrowed
 * from the 32-to-24 code) packs those dwords with the mask24* constants
 * and writes 24 bytes with three MOVNTQ stores.
 *
 * Scalar fallback (last lines): three bytes are written per pixel; each
 * field is moved so that its bits sit at the top of the byte and the low
 * bits are left as zero.
 */
928 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
932 const uint16_t *mm_end;
935 const uint16_t *s = (const uint16_t*)src;
936 end = s + src_size/2;
938 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
944 "movq %1, %%mm0 \n\t"
945 "movq %1, %%mm1 \n\t"
946 "movq %1, %%mm2 \n\t"
947 "pand %2, %%mm0 \n\t"
948 "pand %3, %%mm1 \n\t"
949 "pand %4, %%mm2 \n\t"
950 "psllq $3, %%mm0 \n\t"
951 "psrlq $2, %%mm1 \n\t"
952 "psrlq $7, %%mm2 \n\t"
953 "movq %%mm0, %%mm3 \n\t"
954 "movq %%mm1, %%mm4 \n\t"
955 "movq %%mm2, %%mm5 \n\t"
956 "punpcklwd %5, %%mm0 \n\t"
957 "punpcklwd %5, %%mm1 \n\t"
958 "punpcklwd %5, %%mm2 \n\t"
959 "punpckhwd %5, %%mm3 \n\t"
960 "punpckhwd %5, %%mm4 \n\t"
961 "punpckhwd %5, %%mm5 \n\t"
962 "psllq $8, %%mm1 \n\t"
963 "psllq $16, %%mm2 \n\t"
964 "por %%mm1, %%mm0 \n\t"
965 "por %%mm2, %%mm0 \n\t"
966 "psllq $8, %%mm4 \n\t"
967 "psllq $16, %%mm5 \n\t"
968 "por %%mm4, %%mm3 \n\t"
969 "por %%mm5, %%mm3 \n\t"
971 "movq %%mm0, %%mm6 \n\t"
972 "movq %%mm3, %%mm7 \n\t"
974 "movq 8%1, %%mm0 \n\t"
975 "movq 8%1, %%mm1 \n\t"
976 "movq 8%1, %%mm2 \n\t"
977 "pand %2, %%mm0 \n\t"
978 "pand %3, %%mm1 \n\t"
979 "pand %4, %%mm2 \n\t"
980 "psllq $3, %%mm0 \n\t"
981 "psrlq $2, %%mm1 \n\t"
982 "psrlq $7, %%mm2 \n\t"
983 "movq %%mm0, %%mm3 \n\t"
984 "movq %%mm1, %%mm4 \n\t"
985 "movq %%mm2, %%mm5 \n\t"
986 "punpcklwd %5, %%mm0 \n\t"
987 "punpcklwd %5, %%mm1 \n\t"
988 "punpcklwd %5, %%mm2 \n\t"
989 "punpckhwd %5, %%mm3 \n\t"
990 "punpckhwd %5, %%mm4 \n\t"
991 "punpckhwd %5, %%mm5 \n\t"
992 "psllq $8, %%mm1 \n\t"
993 "psllq $16, %%mm2 \n\t"
994 "por %%mm1, %%mm0 \n\t"
995 "por %%mm2, %%mm0 \n\t"
996 "psllq $8, %%mm4 \n\t"
997 "psllq $16, %%mm5 \n\t"
998 "por %%mm4, %%mm3 \n\t"
999 "por %%mm5, %%mm3 \n\t"
1002 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1004 /* borrowed 32 to 24 */
1006 "movq %%mm0, %%mm4 \n\t"
1007 "movq %%mm3, %%mm5 \n\t"
1008 "movq %%mm6, %%mm0 \n\t"
1009 "movq %%mm7, %%mm1 \n\t"
1011 "movq %%mm4, %%mm6 \n\t"
1012 "movq %%mm5, %%mm7 \n\t"
1013 "movq %%mm0, %%mm2 \n\t"
1014 "movq %%mm1, %%mm3 \n\t"
1016 "psrlq $8, %%mm2 \n\t"
1017 "psrlq $8, %%mm3 \n\t"
1018 "psrlq $8, %%mm6 \n\t"
1019 "psrlq $8, %%mm7 \n\t"
1020 "pand %2, %%mm0 \n\t"
1021 "pand %2, %%mm1 \n\t"
1022 "pand %2, %%mm4 \n\t"
1023 "pand %2, %%mm5 \n\t"
1024 "pand %3, %%mm2 \n\t"
1025 "pand %3, %%mm3 \n\t"
1026 "pand %3, %%mm6 \n\t"
1027 "pand %3, %%mm7 \n\t"
1028 "por %%mm2, %%mm0 \n\t"
1029 "por %%mm3, %%mm1 \n\t"
1030 "por %%mm6, %%mm4 \n\t"
1031 "por %%mm7, %%mm5 \n\t"
1033 "movq %%mm1, %%mm2 \n\t"
1034 "movq %%mm4, %%mm3 \n\t"
1035 "psllq $48, %%mm2 \n\t"
1036 "psllq $32, %%mm3 \n\t"
1037 "pand %4, %%mm2 \n\t"
1038 "pand %5, %%mm3 \n\t"
1039 "por %%mm2, %%mm0 \n\t"
1040 "psrlq $16, %%mm1 \n\t"
1041 "psrlq $32, %%mm4 \n\t"
1042 "psllq $16, %%mm5 \n\t"
1043 "por %%mm3, %%mm1 \n\t"
1044 "pand %6, %%mm5 \n\t"
1045 "por %%mm5, %%mm4 \n\t"
1047 MOVNTQ" %%mm0, %0 \n\t"
1048 MOVNTQ" %%mm1, 8%0 \n\t"
1049 MOVNTQ" %%mm4, 16%0"
1052 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1057 asm volatile(SFENCE:::"memory");
1058 asm volatile(EMMS:::"memory");
/* Scalar fallback: bits 0..4, 5..9 and 10..14 become one output byte each. */
1062 register uint16_t bgr;
1064 *d++ = (bgr&0x1F)<<3;
1065 *d++ = (bgr&0x3E0)>>2;
1066 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16tobgr24: expand 16-bit pixels into 3-byte pixels.
 * NOTE: only part of the body is visible in this excerpt.
 *
 * end is set to s + src_size/2, i.e. one past the last 16-bit source pixel.
 *
 * Visible MMX path, first asm statement: for each of two source quadwords
 * (offsets 0 and 8) the three fields are isolated with mask16b, mask16g
 * and mask16r, shifted (left 3, right 3, right 8), widened from words to
 * dwords with punpcklwd/punpckhwd against mmx_null and ORed together at
 * bit offsets 0, 8 and 16. The second asm statement (marked as borrowed
 * from the 32-to-24 code) packs those dwords with the mask24* constants
 * and writes 24 bytes with three MOVNTQ stores.
 *
 * Scalar fallback (last lines): three bytes are written per pixel; each
 * field is moved so that its bits sit at the top of the byte and the low
 * bits are left as zero.
 */
1070 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1072 const uint16_t *end;
1074 const uint16_t *mm_end;
1076 uint8_t *d = (uint8_t *)dst;
1077 const uint16_t *s = (const uint16_t *)src;
1078 end = s + src_size/2;
1080 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1085 PREFETCH" 32%1 \n\t"
1086 "movq %1, %%mm0 \n\t"
1087 "movq %1, %%mm1 \n\t"
1088 "movq %1, %%mm2 \n\t"
1089 "pand %2, %%mm0 \n\t"
1090 "pand %3, %%mm1 \n\t"
1091 "pand %4, %%mm2 \n\t"
1092 "psllq $3, %%mm0 \n\t"
1093 "psrlq $3, %%mm1 \n\t"
1094 "psrlq $8, %%mm2 \n\t"
1095 "movq %%mm0, %%mm3 \n\t"
1096 "movq %%mm1, %%mm4 \n\t"
1097 "movq %%mm2, %%mm5 \n\t"
1098 "punpcklwd %5, %%mm0 \n\t"
1099 "punpcklwd %5, %%mm1 \n\t"
1100 "punpcklwd %5, %%mm2 \n\t"
1101 "punpckhwd %5, %%mm3 \n\t"
1102 "punpckhwd %5, %%mm4 \n\t"
1103 "punpckhwd %5, %%mm5 \n\t"
1104 "psllq $8, %%mm1 \n\t"
1105 "psllq $16, %%mm2 \n\t"
1106 "por %%mm1, %%mm0 \n\t"
1107 "por %%mm2, %%mm0 \n\t"
1108 "psllq $8, %%mm4 \n\t"
1109 "psllq $16, %%mm5 \n\t"
1110 "por %%mm4, %%mm3 \n\t"
1111 "por %%mm5, %%mm3 \n\t"
1113 "movq %%mm0, %%mm6 \n\t"
1114 "movq %%mm3, %%mm7 \n\t"
1116 "movq 8%1, %%mm0 \n\t"
1117 "movq 8%1, %%mm1 \n\t"
1118 "movq 8%1, %%mm2 \n\t"
1119 "pand %2, %%mm0 \n\t"
1120 "pand %3, %%mm1 \n\t"
1121 "pand %4, %%mm2 \n\t"
1122 "psllq $3, %%mm0 \n\t"
1123 "psrlq $3, %%mm1 \n\t"
1124 "psrlq $8, %%mm2 \n\t"
1125 "movq %%mm0, %%mm3 \n\t"
1126 "movq %%mm1, %%mm4 \n\t"
1127 "movq %%mm2, %%mm5 \n\t"
1128 "punpcklwd %5, %%mm0 \n\t"
1129 "punpcklwd %5, %%mm1 \n\t"
1130 "punpcklwd %5, %%mm2 \n\t"
1131 "punpckhwd %5, %%mm3 \n\t"
1132 "punpckhwd %5, %%mm4 \n\t"
1133 "punpckhwd %5, %%mm5 \n\t"
1134 "psllq $8, %%mm1 \n\t"
1135 "psllq $16, %%mm2 \n\t"
1136 "por %%mm1, %%mm0 \n\t"
1137 "por %%mm2, %%mm0 \n\t"
1138 "psllq $8, %%mm4 \n\t"
1139 "psllq $16, %%mm5 \n\t"
1140 "por %%mm4, %%mm3 \n\t"
1141 "por %%mm5, %%mm3 \n\t"
1143 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1145 /* borrowed 32 to 24 */
1147 "movq %%mm0, %%mm4 \n\t"
1148 "movq %%mm3, %%mm5 \n\t"
1149 "movq %%mm6, %%mm0 \n\t"
1150 "movq %%mm7, %%mm1 \n\t"
1152 "movq %%mm4, %%mm6 \n\t"
1153 "movq %%mm5, %%mm7 \n\t"
1154 "movq %%mm0, %%mm2 \n\t"
1155 "movq %%mm1, %%mm3 \n\t"
1157 "psrlq $8, %%mm2 \n\t"
1158 "psrlq $8, %%mm3 \n\t"
1159 "psrlq $8, %%mm6 \n\t"
1160 "psrlq $8, %%mm7 \n\t"
1161 "pand %2, %%mm0 \n\t"
1162 "pand %2, %%mm1 \n\t"
1163 "pand %2, %%mm4 \n\t"
1164 "pand %2, %%mm5 \n\t"
1165 "pand %3, %%mm2 \n\t"
1166 "pand %3, %%mm3 \n\t"
1167 "pand %3, %%mm6 \n\t"
1168 "pand %3, %%mm7 \n\t"
1169 "por %%mm2, %%mm0 \n\t"
1170 "por %%mm3, %%mm1 \n\t"
1171 "por %%mm6, %%mm4 \n\t"
1172 "por %%mm7, %%mm5 \n\t"
1174 "movq %%mm1, %%mm2 \n\t"
1175 "movq %%mm4, %%mm3 \n\t"
1176 "psllq $48, %%mm2 \n\t"
1177 "psllq $32, %%mm3 \n\t"
1178 "pand %4, %%mm2 \n\t"
1179 "pand %5, %%mm3 \n\t"
1180 "por %%mm2, %%mm0 \n\t"
1181 "psrlq $16, %%mm1 \n\t"
1182 "psrlq $32, %%mm4 \n\t"
1183 "psllq $16, %%mm5 \n\t"
1184 "por %%mm3, %%mm1 \n\t"
1185 "pand %6, %%mm5 \n\t"
1186 "por %%mm5, %%mm4 \n\t"
1188 MOVNTQ" %%mm0, %0 \n\t"
1189 MOVNTQ" %%mm1, 8%0 \n\t"
1190 MOVNTQ" %%mm4, 16%0"
1193 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1198 asm volatile(SFENCE:::"memory");
1199 asm volatile(EMMS:::"memory");
/* Scalar fallback: bits 0..4, 5..10 and 11..15 become one output byte each. */
1203 register uint16_t bgr;
1205 *d++ = (bgr&0x1F)<<3;
1206 *d++ = (bgr&0x7E0)>>3;
1207 *d++ = (bgr&0xF800)>>8;
/*
 * rgb15to32: expand 15-bit pixels into 4-byte pixels.
 * NOTE: only part of the body is visible in this excerpt; the write of the
 * 4th output byte, if any, is not among the visible scalar lines.
 *
 * end is set to s + src_size/2, i.e. one past the last 16-bit source pixel.
 *
 * Visible MMX path: mm7 is zeroed with pxor and used as the zero operand
 * for punpcklwd/punpckhwd. One source quadword is masked with mask15b,
 * mask15g and mask15r, the fields are shifted (left 3, right 2, right 7),
 * widened to dwords, ORed together at bit offsets 0, 8 and 16 and written
 * as 16 bytes with two MOVNTQ stores.
 *
 * Scalar fallback: three component bytes are written per pixel; under
 * WORDS_BIGENDIAN they are emitted in the reverse order. The #if 0 line
 * keeps a single-store alternative that its own comment calls slightly
 * slower on Athlon.
 */
1211 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1213 const uint16_t *end;
1215 const uint16_t *mm_end;
1218 const uint16_t *s = (const uint16_t *)src;
1219 end = s + src_size/2;
1221 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1222 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1227 PREFETCH" 32%1 \n\t"
1228 "movq %1, %%mm0 \n\t"
1229 "movq %1, %%mm1 \n\t"
1230 "movq %1, %%mm2 \n\t"
1231 "pand %2, %%mm0 \n\t"
1232 "pand %3, %%mm1 \n\t"
1233 "pand %4, %%mm2 \n\t"
1234 "psllq $3, %%mm0 \n\t"
1235 "psrlq $2, %%mm1 \n\t"
1236 "psrlq $7, %%mm2 \n\t"
1237 "movq %%mm0, %%mm3 \n\t"
1238 "movq %%mm1, %%mm4 \n\t"
1239 "movq %%mm2, %%mm5 \n\t"
1240 "punpcklwd %%mm7, %%mm0 \n\t"
1241 "punpcklwd %%mm7, %%mm1 \n\t"
1242 "punpcklwd %%mm7, %%mm2 \n\t"
1243 "punpckhwd %%mm7, %%mm3 \n\t"
1244 "punpckhwd %%mm7, %%mm4 \n\t"
1245 "punpckhwd %%mm7, %%mm5 \n\t"
1246 "psllq $8, %%mm1 \n\t"
1247 "psllq $16, %%mm2 \n\t"
1248 "por %%mm1, %%mm0 \n\t"
1249 "por %%mm2, %%mm0 \n\t"
1250 "psllq $8, %%mm4 \n\t"
1251 "psllq $16, %%mm5 \n\t"
1252 "por %%mm4, %%mm3 \n\t"
1253 "por %%mm5, %%mm3 \n\t"
1254 MOVNTQ" %%mm0, %0 \n\t"
1255 MOVNTQ" %%mm3, 8%0 \n\t"
1257 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1262 asm volatile(SFENCE:::"memory");
1263 asm volatile(EMMS:::"memory");
1267 #if 0 //slightly slower on Athlon
1269 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1271 register uint16_t bgr;
1273 #ifdef WORDS_BIGENDIAN
1275 *d++ = (bgr&0x7C00)>>7;
1276 *d++ = (bgr&0x3E0)>>2;
1277 *d++ = (bgr&0x1F)<<3;
1279 *d++ = (bgr&0x1F)<<3;
1280 *d++ = (bgr&0x3E0)>>2;
1281 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to32: expand 16-bit pixels into 4-byte pixels.
 * NOTE: only part of the body is visible in this excerpt; the write of the
 * 4th output byte, if any, is not among the visible scalar lines.
 *
 * end is set to s + src_size/2, i.e. one past the last 16-bit source pixel.
 *
 * Visible MMX path: mm7 is zeroed with pxor and used as the zero operand
 * for punpcklwd/punpckhwd. One source quadword is masked with mask16b,
 * mask16g and mask16r, the fields are shifted (left 3, right 3, right 8),
 * widened to dwords, ORed together at bit offsets 0, 8 and 16 and written
 * as 16 bytes with two MOVNTQ stores.
 *
 * Scalar fallback: three component bytes are written per pixel; under
 * WORDS_BIGENDIAN they are emitted in the reverse order.
 */
1289 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1291 const uint16_t *end;
1293 const uint16_t *mm_end;
1296 const uint16_t *s = (const uint16_t*)src;
1297 end = s + src_size/2;
1299 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1300 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1305 PREFETCH" 32%1 \n\t"
1306 "movq %1, %%mm0 \n\t"
1307 "movq %1, %%mm1 \n\t"
1308 "movq %1, %%mm2 \n\t"
1309 "pand %2, %%mm0 \n\t"
1310 "pand %3, %%mm1 \n\t"
1311 "pand %4, %%mm2 \n\t"
1312 "psllq $3, %%mm0 \n\t"
1313 "psrlq $3, %%mm1 \n\t"
1314 "psrlq $8, %%mm2 \n\t"
1315 "movq %%mm0, %%mm3 \n\t"
1316 "movq %%mm1, %%mm4 \n\t"
1317 "movq %%mm2, %%mm5 \n\t"
1318 "punpcklwd %%mm7, %%mm0 \n\t"
1319 "punpcklwd %%mm7, %%mm1 \n\t"
1320 "punpcklwd %%mm7, %%mm2 \n\t"
1321 "punpckhwd %%mm7, %%mm3 \n\t"
1322 "punpckhwd %%mm7, %%mm4 \n\t"
1323 "punpckhwd %%mm7, %%mm5 \n\t"
1324 "psllq $8, %%mm1 \n\t"
1325 "psllq $16, %%mm2 \n\t"
1326 "por %%mm1, %%mm0 \n\t"
1327 "por %%mm2, %%mm0 \n\t"
1328 "psllq $8, %%mm4 \n\t"
1329 "psllq $16, %%mm5 \n\t"
1330 "por %%mm4, %%mm3 \n\t"
1331 "por %%mm5, %%mm3 \n\t"
1332 MOVNTQ" %%mm0, %0 \n\t"
1333 MOVNTQ" %%mm3, 8%0 \n\t"
1335 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1340 asm volatile(SFENCE:::"memory");
1341 asm volatile(EMMS:::"memory");
1345 register uint16_t bgr;
1347 #ifdef WORDS_BIGENDIAN
1349 *d++ = (bgr&0xF800)>>8;
1350 *d++ = (bgr&0x7E0)>>3;
1351 *d++ = (bgr&0x1F)<<3;
1353 *d++ = (bgr&0x1F)<<3;
1354 *d++ = (bgr&0x7E0)>>3;
1355 *d++ = (bgr&0xF800)>>8;
/*
 * rgb32tobgr32: reorder the bytes inside every 32-bit pixel.
 * NOTE: only part of the body is visible in this excerpt; a statement
 * between the two visible lines of the scalar loop body is missing, so the
 * exact scalar arithmetic cannot be read off completely here.
 *
 * Indexing scheme: idx starts at 15 - src_size and both s and d are biased
 * by -idx, so s[idx] is src[0] and idx == 15 corresponds to the end of the
 * buffer. The scalar loop runs while idx < 15 in steps of 4 bytes.
 * The MMX loop addresses memory as (%1, %0) and (%2, %0) with s and d as
 * operands 1 and 2; operand 0 is presumably idx -- TODO confirm against
 * the full operand list, which is not visible.
 *
 * Visible MMX path: mm7 and mm6 are built from mask32b, mask32r and
 * mmx_one with pxor. One variant uses pshufw $177 to exchange the 16-bit
 * halves of each dword, the other uses pslld/psrld by 16; both blend the
 * moved and unmoved bytes with the mm6/mm7 masks and store 16 bytes.
 *
 * Visible scalar lines: g keeps the bytes selected by 0xff00ff00 and the
 * result is g plus v shifted right and left by 16 bits.
 */
1361 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1363 long idx = 15 - src_size;
1364 const uint8_t *s = src-idx;
1365 uint8_t *d = dst-idx;
1370 PREFETCH" (%1, %0) \n\t"
1371 "movq %3, %%mm7 \n\t"
1372 "pxor %4, %%mm7 \n\t"
1373 "movq %%mm7, %%mm6 \n\t"
1374 "pxor %5, %%mm7 \n\t"
1377 PREFETCH" 32(%1, %0) \n\t"
1378 "movq (%1, %0), %%mm0 \n\t"
1379 "movq 8(%1, %0), %%mm1 \n\t"
1381 "pshufw $177, %%mm0, %%mm3 \n\t"
1382 "pshufw $177, %%mm1, %%mm5 \n\t"
1383 "pand %%mm7, %%mm0 \n\t"
1384 "pand %%mm6, %%mm3 \n\t"
1385 "pand %%mm7, %%mm1 \n\t"
1386 "pand %%mm6, %%mm5 \n\t"
1387 "por %%mm3, %%mm0 \n\t"
1388 "por %%mm5, %%mm1 \n\t"
1390 "movq %%mm0, %%mm2 \n\t"
1391 "movq %%mm1, %%mm4 \n\t"
1392 "pand %%mm7, %%mm0 \n\t"
1393 "pand %%mm6, %%mm2 \n\t"
1394 "pand %%mm7, %%mm1 \n\t"
1395 "pand %%mm6, %%mm4 \n\t"
1396 "movq %%mm2, %%mm3 \n\t"
1397 "movq %%mm4, %%mm5 \n\t"
1398 "pslld $16, %%mm2 \n\t"
1399 "psrld $16, %%mm3 \n\t"
1400 "pslld $16, %%mm4 \n\t"
1401 "psrld $16, %%mm5 \n\t"
1402 "por %%mm2, %%mm0 \n\t"
1403 "por %%mm4, %%mm1 \n\t"
1404 "por %%mm3, %%mm0 \n\t"
1405 "por %%mm5, %%mm1 \n\t"
1407 MOVNTQ" %%mm0, (%2, %0) \n\t"
1408 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1415 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* Scalar tail: one 32-bit pixel per step until idx reaches 15. */
1418 for (; idx<15; idx+=4) {
1419 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1421 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * rgb24tobgr24: exchange the outer bytes of every 3-byte pixel (the asm
 * comments show BGR input turning into RGB output).
 * NOTE: only part of the body is visible in this excerpt; one assignment
 * of the scalar loop body is missing.
 *
 * Indexing scheme: mmx_size starts at 23 - src_size and the pointers
 * handed to the asm are src - mmx_size and dst - mmx_size, so the counter
 * in REG_a indexes both buffers. The MMX loop advances the counter by 24
 * bytes per pass and reads overlapping quadwords at offsets 0, 2, 6, 8,
 * 10, 14, 16 and 18, masking them with mask24r (mm5), mask24g (mm6) and
 * mask24b (mm7) and ORing three masked loads into each of the three output
 * quadwords.
 *
 * After the asm, mmx_size == 23 means nothing is left and the function
 * returns; otherwise src_size is recomputed as 23 - mmx_size and the
 * scalar loop walks the remaining pixels 3 bytes at a time. Its visible
 * lines copy byte 1 unchanged and write source byte 0 to destination
 * byte 2.
 */
1425 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1429 long mmx_size= 23 - src_size;
1431 "test %%"REG_a", %%"REG_a" \n\t"
1433 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1434 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1435 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1438 PREFETCH" 32(%1, %%"REG_a") \n\t"
1439 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1440 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1441 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1442 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1443 "pand %%mm5, %%mm0 \n\t"
1444 "pand %%mm6, %%mm1 \n\t"
1445 "pand %%mm7, %%mm2 \n\t"
1446 "por %%mm0, %%mm1 \n\t"
1447 "por %%mm2, %%mm1 \n\t"
1448 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1449 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1450 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1451 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1452 "pand %%mm7, %%mm0 \n\t"
1453 "pand %%mm5, %%mm1 \n\t"
1454 "pand %%mm6, %%mm2 \n\t"
1455 "por %%mm0, %%mm1 \n\t"
1456 "por %%mm2, %%mm1 \n\t"
1457 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1458 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1459 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1460 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1461 "pand %%mm6, %%mm0 \n\t"
1462 "pand %%mm7, %%mm1 \n\t"
1463 "pand %%mm5, %%mm2 \n\t"
1464 "por %%mm0, %%mm1 \n\t"
1465 "por %%mm2, %%mm1 \n\t"
1466 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1467 "add $24, %%"REG_a" \n\t"
1471 : "r" (src-mmx_size), "r"(dst-mmx_size)
1474 asm volatile(SFENCE:::"memory");
1475 asm volatile(EMMS:::"memory");
1477 if (mmx_size==23) return; //finished, was multiple of 8
1481 src_size= 23-mmx_size;
/* Scalar tail: one 3-byte pixel per step. */
1485 for (i=0; i<src_size; i+=3)
1489 dst[i + 1] = src[i + 1];
1490 dst[i + 2] = src[i + 0];
/*
 * Interleaves planar Y, U and V into packed YUY2, i.e. bytes Y0 U0 Y1 V0 per
 * pixel pair. One output row consumes chromWidth (= width/2) samples from
 * usrc and from vsrc. The chroma pointers advance by chromStride only after
 * every vertLumPerChroma-th row; that test uses vertLumPerChroma-1 as a bit
 * mask, so it acts as a modulo only for power-of-two values.
 */
1495 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1496 long width, long height,
1497 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1500 const long chromWidth= width>>1;
1501 for (y=0; y<height; y++)
/* MMX path: REG_a indexes chroma samples; each pass loads 8 U, 8 V and 16 Y bytes and stores 32 packed bytes. */
1504 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1506 "xor %%"REG_a", %%"REG_a" \n\t"
1509 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1510 PREFETCH" 32(%2, %%"REG_a") \n\t"
1511 PREFETCH" 32(%3, %%"REG_a") \n\t"
1512 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1513 "movq %%mm0, %%mm2 \n\t" // U(0)
1514 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1515 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1516 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1518 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1519 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1520 "movq %%mm3, %%mm4 \n\t" // Y(0)
1521 "movq %%mm5, %%mm6 \n\t" // Y(8)
1522 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1523 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1524 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1525 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1527 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1528 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1529 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1530 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1532 "add $8, %%"REG_a" \n\t"
1533 "cmp %4, %%"REG_a" \n\t"
1535 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Alpha/MVI path: yc2 and qdst2 point one stride below yc and qdst, so two rows are addressed per pass. */
1540 #if defined ARCH_ALPHA && defined HAVE_MVI
1541 #define pl2yuy2(n) \
1546 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1547 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1548 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1549 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1550 yuv1 = (u << 8) + (v << 24); \
1557 uint64_t *qdst = (uint64_t *) dst;
1558 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1559 const uint32_t *yc = (uint32_t *) ysrc;
1560 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1561 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1562 for (i = 0; i < chromWidth; i += 8){
1563 uint64_t y1, y2, yuv1, yuv2;
1566 asm("ldq $31,64(%0)" :: "r"(yc));
1567 asm("ldq $31,64(%0)" :: "r"(yc2));
1568 asm("ldq $31,64(%0)" :: "r"(uc));
1569 asm("ldq $31,64(%0)" :: "r"(vc));
/*
 * Generic 64-bit path: two pixel pairs are packed into one 64-bit store.
 * The byte that lands in bits 24..31 is shifted as unsigned: a promoted
 * uint8_t >= 0x80 shifted left by 24 overflows int, and the negative result
 * would sign-extend into the upper half of k and spoil k + (l << 32).
 */
1587 #elif __WORDSIZE >= 64
1589 uint64_t *ldst = (uint64_t *) dst;
1590 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1591 for (i = 0; i < chromWidth; i += 2){
1593 k = yc[0] + (uc[0] << 8) +
1594 (yc[1] << 16) + ((unsigned) vc[0] << 24);
1595 l = yc[2] + (uc[1] << 8) +
1596 (yc[3] << 16) + ((unsigned) vc[1] << 24);
1597 *ldst++ = k + (l << 32);
/* Generic 32-bit path: one pixel pair per store; the top byte is again shifted as unsigned to avoid signed overflow. */
1604 int i, *idst = (int32_t *) dst;
1605 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1606 for (i = 0; i < chromWidth; i++){
1607 #ifdef WORDS_BIGENDIAN
1608 *idst++ = ((unsigned) yc[0] << 24)+ (uc[0] << 16) +
1609 (yc[1] << 8) + (vc[0] << 0);
1611 *idst++ = yc[0] + (uc[0] << 8) +
1612 (yc[1] << 16) + ((unsigned) vc[0] << 24);
/* Move to the next chroma row only after the last luma row that shares the current one. */
1620 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1622 usrc += chromStride;
1623 vsrc += chromStride;
1636 * Height should be a multiple of 2 and width should be a multiple of 16.
1637 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 -> YUY2: forwards to yuvPlanartoyuy2 with vertLumPerChroma = 2, so each chroma row is shared by two luma rows. */
1639 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1640 long width, long height,
1641 long lumStride, long chromStride, long dstStride)
1643 //FIXME interpolate chroma
1644 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleaves planar Y, U and V into packed UYVY, i.e. bytes U0 Y0 V0 Y1 per
 * pixel pair. One output row consumes chromWidth (= width/2) samples from
 * usrc and from vsrc. The chroma pointers advance by chromStride only after
 * every vertLumPerChroma-th row; that test uses vertLumPerChroma-1 as a bit
 * mask, so it acts as a modulo only for power-of-two values.
 */
1647 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1648 long width, long height,
1649 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1652 const long chromWidth= width>>1;
1653 for (y=0; y<height; y++)
/* MMX path: REG_a indexes chroma samples; each pass loads 8 U, 8 V and 16 Y bytes and stores 32 packed bytes. */
1656 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1658 "xor %%"REG_a", %%"REG_a" \n\t"
1661 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1662 PREFETCH" 32(%2, %%"REG_a") \n\t"
1663 PREFETCH" 32(%3, %%"REG_a") \n\t"
1664 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1665 "movq %%mm0, %%mm2 \n\t" // U(0)
1666 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1667 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1668 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1670 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1671 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1672 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1673 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1674 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1675 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1676 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1677 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1679 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1680 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1681 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1682 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1684 "add $8, %%"REG_a" \n\t"
1685 "cmp %4, %%"REG_a" \n\t"
1687 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1691 //FIXME adapt the Alpha ASM code from yv12->yuy2
/*
 * Generic 64-bit path: two pixel pairs are packed into one 64-bit store.
 * The byte that lands in bits 24..31 is shifted as unsigned: a promoted
 * uint8_t >= 0x80 shifted left by 24 overflows int, and the negative result
 * would sign-extend into the upper half of k and spoil k + (l << 32).
 */
1693 #if __WORDSIZE >= 64
1695 uint64_t *ldst = (uint64_t *) dst;
1696 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1697 for (i = 0; i < chromWidth; i += 2){
1699 k = uc[0] + (yc[0] << 8) +
1700 (vc[0] << 16) + ((unsigned) yc[1] << 24);
1701 l = uc[1] + (yc[2] << 8) +
1702 (vc[1] << 16) + ((unsigned) yc[3] << 24);
1703 *ldst++ = k + (l << 32);
/* Generic 32-bit path: one pixel pair per store; the top byte is again shifted as unsigned to avoid signed overflow. */
1710 int i, *idst = (int32_t *) dst;
1711 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1712 for (i = 0; i < chromWidth; i++){
1713 #ifdef WORDS_BIGENDIAN
1714 *idst++ = ((unsigned) uc[0] << 24)+ (yc[0] << 16) +
1715 (vc[0] << 8) + (yc[1] << 0);
1717 *idst++ = uc[0] + (yc[0] << 8) +
1718 (vc[0] << 16) + ((unsigned) yc[1] << 24);
/* Move to the next chroma row only after the last luma row that shares the current one. */
1726 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1728 usrc += chromStride;
1729 vsrc += chromStride;
1742 * Height should be a multiple of 2 and width should be a multiple of 16.
1743 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 -> UYVY: forwards to yuvPlanartouyvy with vertLumPerChroma = 2, so each chroma row is shared by two luma rows. */
1745 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1746 long width, long height,
1747 long lumStride, long chromStride, long dstStride)
1749 //FIXME interpolate chroma
1750 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1754 * Width should be a multiple of 16.
/* Planar 4:2:2 -> UYVY: forwards to yuvPlanartouyvy with vertLumPerChroma = 1, so the chroma pointers advance on every row. */
1756 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1757 long width, long height,
1758 long lumStride, long chromStride, long dstStride)
1760 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1764 * Width should be a multiple of 16.
/* Planar 4:2:2 -> YUY2: forwards to yuvPlanartoyuy2 with vertLumPerChroma = 1, so the chroma pointers advance on every row. */
1766 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1767 long width, long height,
1768 long lumStride, long chromStride, long dstStride)
1770 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1774 * Height should be a multiple of 2 and width should be a multiple of 16.
1775 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Splits packed YUY2 (bytes Y0 U Y1 V) into separate Y, U and V planes.
 * Rows are handled in pairs (y += 2): the first pass of an iteration writes
 * Y, U and V, the second pass writes Y only, so chroma comes from every other
 * source row. chromWidth (= width/2) is the loop bound of both passes.
 */
1777 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1778 long width, long height,
1779 long lumStride, long chromStride, long srcStride)
1782 const long chromWidth= width>>1;
1783 for (y=0; y<height; y+=2)
/* First pass (MMX): 32 source bytes per iteration -> 16 Y, 8 U and 8 V bytes. mm7 holds 0x00FF in every word. */
1787 "xor %%"REG_a", %%"REG_a" \n\t"
1788 "pcmpeqw %%mm7, %%mm7 \n\t"
1789 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1792 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1793 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1794 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1795 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1796 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1797 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1798 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1799 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1800 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1801 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1802 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1804 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1806 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1807 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1808 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1809 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1810 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1811 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1812 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1813 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1814 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1815 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1817 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Separate the interleaved UV bytes collected above into one U and one V quadword. */
1819 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1820 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1821 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1822 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1823 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1824 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1825 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1826 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1828 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1829 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1831 "add $8, %%"REG_a" \n\t"
1832 "cmp %4, %%"REG_a" \n\t"
1834 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1835 : "memory", "%"REG_a
/*
 * Second pass (MMX): luma only; udst and vdst are passed as operands but no store targets them.
 * NOTE(review): mm7 is not reloaded in the lines shown here, so the mask presumably carries over from the first asm statement.
 */
1842 "xor %%"REG_a", %%"REG_a" \n\t"
1845 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1846 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1847 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1848 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1849 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1850 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1851 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1852 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1853 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1854 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1855 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1857 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1858 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1860 "add $8, %%"REG_a" \n\t"
1861 "cmp %4, %%"REG_a" \n\t"
1864 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1865 : "memory", "%"REG_a
/* C fallback, first pass: one pixel pair per step, keeping luma and chroma. */
1869 for (i=0; i<chromWidth; i++)
1871 ydst[2*i+0] = src[4*i+0];
1872 udst[i] = src[4*i+1];
1873 ydst[2*i+1] = src[4*i+2];
1874 vdst[i] = src[4*i+3];
/* C fallback, second pass: luma only. */
1879 for (i=0; i<chromWidth; i++)
1881 ydst[2*i+0] = src[4*i+0];
1882 ydst[2*i+1] = src[4*i+2];
/* One chroma row is produced per pair of source rows. */
1885 udst += chromStride;
1886 vdst += chromStride;
1891 asm volatile( EMMS" \n\t"
/*
 * Incomplete YVU9 -> YV12 conversion: only the luma plane is copied; chroma upscaling is still missing (see XXX).
 * NOTE(review): the copy is a single memcpy of width*height bytes and neither lumStride nor chromStride is used in
 * the lines shown, so it is only right when both luma planes are stored without row padding - confirm with callers.
 */
1897 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1898 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1899 long width, long height, long lumStride, long chromStride)
1902 memcpy(ydst, ysrc, width*height);
1904 /* XXX: implement upscaling for U,V */
/*
 * Doubles one plane in both directions. Interior output samples are 3:1 and
 * 1:3 blends, (3*a + b)>>2, of neighbouring source samples: the first and the
 * last output row blend horizontally only, all rows in between blend
 * diagonally between two adjacent source rows.
 */
1907 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* First output row: horizontal blend within the first source row; the last sample is copied unchanged. */
1914 for (x=0; x<srcWidth-1; x++){
1915 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1916 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1918 dst[2*srcWidth-1]= src[srcWidth-1];
/* Each pass below emits two output rows from the source rows at src and src + srcStride. */
1922 for (y=1; y<srcHeight; y++){
1923 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1924 const long mmxSize= srcWidth&~15;
/* Applying PAVGB twice with the same operand approximates the 3:1 weighting used by the C code. */
1926 "mov %4, %%"REG_a" \n\t"
1928 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1929 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1930 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1931 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1932 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1933 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1934 PAVGB" %%mm0, %%mm5 \n\t"
1935 PAVGB" %%mm0, %%mm3 \n\t"
1936 PAVGB" %%mm0, %%mm5 \n\t"
1937 PAVGB" %%mm0, %%mm3 \n\t"
1938 PAVGB" %%mm1, %%mm4 \n\t"
1939 PAVGB" %%mm1, %%mm2 \n\t"
1940 PAVGB" %%mm1, %%mm4 \n\t"
1941 PAVGB" %%mm1, %%mm2 \n\t"
1942 "movq %%mm5, %%mm7 \n\t"
1943 "movq %%mm4, %%mm6 \n\t"
1944 "punpcklbw %%mm3, %%mm5 \n\t"
1945 "punpckhbw %%mm3, %%mm7 \n\t"
1946 "punpcklbw %%mm2, %%mm4 \n\t"
1947 "punpckhbw %%mm2, %%mm6 \n\t"
/* NOTE(review): the MOVNTQ and the movq store groups look like alternatives; the preprocessor lines choosing between them are not among the lines shown. */
1949 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1950 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1951 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1952 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1954 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1955 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1956 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1957 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1959 "add $8, %%"REG_a" \n\t"
1961 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1962 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Without the SIMD path mmxSize is 1, so the scalar loop below starts at x = 0 and covers the whole row. */
1968 const long mmxSize=1;
1970 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1971 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar part: starts at mmxSize-1 and blends diagonally between the two source rows. */
1973 for (x=mmxSize-1; x<srcWidth-1; x++){
1974 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1975 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1976 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1977 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1979 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1980 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal blend only, same formulas as the first output row. */
1990 for (x=0; x<srcWidth-1; x++){
1991 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1992 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1994 dst[2*srcWidth-1]= src[srcWidth-1];
1996 for (x=0; x<srcWidth; x++){
2003 asm volatile( EMMS" \n\t"
2010 * Height should be a multiple of 2 and width should be a multiple of 16.
2011 * (If this is a problem for anyone then tell me, and I will fix it.)
2012 * Chrominance data is only taken from every second line, others are ignored.
2013 * FIXME: Write HQ version.
/*
 * Splits packed UYVY (bytes U Y0 V Y1) into separate Y, U and V planes.
 * Rows are handled in pairs (y += 2): the first pass of an iteration writes
 * Y, U and V, the second pass writes Y only, so chroma comes from every other
 * source row. chromWidth (= width/2) is the loop bound of both passes.
 *
 * The loop counter is REG_a, as in yuy2toyv12, rather than a hard-coded
 * %eax: the counter is used as an index next to pointer-sized base registers
 * and compared with a long, which a 32-bit register cannot do on x86-64.
 */
2015 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2016 long width, long height,
2017 long lumStride, long chromStride, long srcStride)
2020 const long chromWidth= width>>1;
2021 for (y=0; y<height; y+=2)
/* First pass (MMX): 32 source bytes per iteration -> 16 Y, 8 U and 8 V bytes. mm7 holds 0x00FF in every word. */
2025 "xor %%"REG_a", %%"REG_a" \n\t"
2026 "pcmpeqw %%mm7, %%mm7 \n\t"
2027 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2030 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2031 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2032 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2033 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2034 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2035 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2036 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2037 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2038 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2039 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2040 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2042 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2044 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2045 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2046 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2047 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2048 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2049 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2050 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2051 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2052 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2053 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2055 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Separate the interleaved UV bytes collected above into one U and one V quadword. */
2057 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2058 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2059 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2060 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2061 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2062 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2063 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2064 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2066 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2067 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2069 "add $8, %%"REG_a" \n\t"
2070 "cmp %4, %%"REG_a" \n\t"
2072 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Second pass (MMX): luma only; the Y bytes are the high byte of every 16-bit word. */
2080 "xor %%"REG_a", %%"REG_a" \n\t"
2083 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2084 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
2085 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
2086 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
2087 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
2088 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2089 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2090 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2091 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2092 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2093 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2095 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2096 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2098 "add $8, %%"REG_a" \n\t"
2099 "cmp %4, %%"REG_a" \n\t"
2102 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, first pass: one pixel pair per step, keeping luma and chroma. */
2107 for (i=0; i<chromWidth; i++)
2109 udst[i] = src[4*i+0];
2110 ydst[2*i+0] = src[4*i+1];
2111 vdst[i] = src[4*i+2];
2112 ydst[2*i+1] = src[4*i+3];
/* C fallback, second pass: luma only. */
2117 for (i=0; i<chromWidth; i++)
2119 ydst[2*i+0] = src[4*i+1];
2120 ydst[2*i+1] = src[4*i+3];
/* One chroma row is produced per pair of source rows. */
2123 udst += chromStride;
2124 vdst += chromStride;
2129 asm volatile( EMMS" \n\t"
2136 * Height should be a multiple of 2 and width should be a multiple of 2.
2137 * (If this is a problem for anyone then tell me, and I will fix it.)
2138 * Chrominance data is only taken from every second line,
2139 * others are ignored in the C version.
2140 * FIXME: Write HQ version.
/*
 * Converts packed 24-bit pixels to planar Y, U and V. The C code reads each
 * pixel as bytes B, G, R. Rows are consumed in pairs (y += 2) and
 * chromWidth = width/2 chroma samples are produced per pair.
 * The first loop stops two rows before height; the trailing C loop carries on
 * from wherever y stands at that point.
 */
2142 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2143 long width, long height,
2144 long lumStride, long chromStride, long srcStride)
2147 const long chromWidth= width>>1;
2149 for (y=0; y<height-2; y+=2)
/*
 * Luma pass (MMX): REG_a starts at -width and counts up, REG_d is the matching byte offset (3 * REG_a).
 * Eight pixels (24 source bytes) give eight Y bytes per iteration.
 */
2155 "mov %2, %%"REG_a" \n\t"
2156 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2157 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2158 "pxor %%mm7, %%mm7 \n\t"
2159 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2162 PREFETCH" 64(%0, %%"REG_d") \n\t"
2163 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2164 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2165 "punpcklbw %%mm7, %%mm0 \n\t"
2166 "punpcklbw %%mm7, %%mm1 \n\t"
2167 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2168 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "punpcklbw %%mm7, %%mm3 \n\t"
2171 "pmaddwd %%mm6, %%mm0 \n\t"
2172 "pmaddwd %%mm6, %%mm1 \n\t"
2173 "pmaddwd %%mm6, %%mm2 \n\t"
2174 "pmaddwd %%mm6, %%mm3 \n\t"
/* The intermediate >>8 is compiled only when FAST_BGR2YV12 is not defined. */
2175 #ifndef FAST_BGR2YV12
2176 "psrad $8, %%mm0 \n\t"
2177 "psrad $8, %%mm1 \n\t"
2178 "psrad $8, %%mm2 \n\t"
2179 "psrad $8, %%mm3 \n\t"
2181 "packssdw %%mm1, %%mm0 \n\t"
2182 "packssdw %%mm3, %%mm2 \n\t"
2183 "pmaddwd %%mm5, %%mm0 \n\t"
2184 "pmaddwd %%mm5, %%mm2 \n\t"
2185 "packssdw %%mm2, %%mm0 \n\t"
2186 "psraw $7, %%mm0 \n\t"
2188 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2189 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2190 "punpcklbw %%mm7, %%mm4 \n\t"
2191 "punpcklbw %%mm7, %%mm1 \n\t"
2192 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2193 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2194 "punpcklbw %%mm7, %%mm2 \n\t"
2195 "punpcklbw %%mm7, %%mm3 \n\t"
2196 "pmaddwd %%mm6, %%mm4 \n\t"
2197 "pmaddwd %%mm6, %%mm1 \n\t"
2198 "pmaddwd %%mm6, %%mm2 \n\t"
2199 "pmaddwd %%mm6, %%mm3 \n\t"
2200 #ifndef FAST_BGR2YV12
2201 "psrad $8, %%mm4 \n\t"
2202 "psrad $8, %%mm1 \n\t"
2203 "psrad $8, %%mm2 \n\t"
2204 "psrad $8, %%mm3 \n\t"
2206 "packssdw %%mm1, %%mm4 \n\t"
2207 "packssdw %%mm3, %%mm2 \n\t"
2208 "pmaddwd %%mm5, %%mm4 \n\t"
2209 "pmaddwd %%mm5, %%mm2 \n\t"
2210 "add $24, %%"REG_d" \n\t"
2211 "packssdw %%mm2, %%mm4 \n\t"
2212 "psraw $7, %%mm4 \n\t"
2214 "packuswb %%mm4, %%mm0 \n\t"
2215 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2217 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2218 "add $8, %%"REG_a" \n\t"
2220 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2221 : "%"REG_a, "%"REG_d
/*
 * Chroma pass (MMX): REG_a starts at -chromWidth, REG_d is the byte offset (6 * REG_a).
 * %0 and %1 address two adjacent source rows; every iteration emits four U and four V bytes.
 */
2228 "mov %4, %%"REG_a" \n\t"
2229 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2230 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2231 "pxor %%mm7, %%mm7 \n\t"
2232 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2233 "add %%"REG_d", %%"REG_d" \n\t"
2236 PREFETCH" 64(%0, %%"REG_d") \n\t"
2237 PREFETCH" 64(%1, %%"REG_d") \n\t"
/* With PAVGB available the two rows and the two neighbouring pixels are averaged bytewise; otherwise they are summed as words and shifted right by 2. */
2238 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2239 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2240 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2241 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2243 PAVGB" %%mm1, %%mm0 \n\t"
2244 PAVGB" %%mm3, %%mm2 \n\t"
2245 "movq %%mm0, %%mm1 \n\t"
2246 "movq %%mm2, %%mm3 \n\t"
2247 "psrlq $24, %%mm0 \n\t"
2248 "psrlq $24, %%mm2 \n\t"
2249 PAVGB" %%mm1, %%mm0 \n\t"
2250 PAVGB" %%mm3, %%mm2 \n\t"
2251 "punpcklbw %%mm7, %%mm0 \n\t"
2252 "punpcklbw %%mm7, %%mm2 \n\t"
2254 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2255 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2256 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2257 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2258 "punpcklbw %%mm7, %%mm0 \n\t"
2259 "punpcklbw %%mm7, %%mm1 \n\t"
2260 "punpcklbw %%mm7, %%mm2 \n\t"
2261 "punpcklbw %%mm7, %%mm3 \n\t"
2262 "paddw %%mm1, %%mm0 \n\t"
2263 "paddw %%mm3, %%mm2 \n\t"
2264 "paddw %%mm2, %%mm0 \n\t"
2265 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2266 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2267 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2268 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2269 "punpcklbw %%mm7, %%mm4 \n\t"
2270 "punpcklbw %%mm7, %%mm1 \n\t"
2271 "punpcklbw %%mm7, %%mm2 \n\t"
2272 "punpcklbw %%mm7, %%mm3 \n\t"
2273 "paddw %%mm1, %%mm4 \n\t"
2274 "paddw %%mm3, %%mm2 \n\t"
2275 "paddw %%mm4, %%mm2 \n\t"
2276 "psrlw $2, %%mm0 \n\t"
2277 "psrlw $2, %%mm2 \n\t"
/* mm6 holds the U coefficients; the V coefficients are loaded into mm1/mm3 so both products come from the same averaged pixels. */
2279 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2280 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2282 "pmaddwd %%mm0, %%mm1 \n\t"
2283 "pmaddwd %%mm2, %%mm3 \n\t"
2284 "pmaddwd %%mm6, %%mm0 \n\t"
2285 "pmaddwd %%mm6, %%mm2 \n\t"
2286 #ifndef FAST_BGR2YV12
2287 "psrad $8, %%mm0 \n\t"
2288 "psrad $8, %%mm1 \n\t"
2289 "psrad $8, %%mm2 \n\t"
2290 "psrad $8, %%mm3 \n\t"
2292 "packssdw %%mm2, %%mm0 \n\t"
2293 "packssdw %%mm3, %%mm1 \n\t"
2294 "pmaddwd %%mm5, %%mm0 \n\t"
2295 "pmaddwd %%mm5, %%mm1 \n\t"
2296 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2297 "psraw $7, %%mm0 \n\t"
2299 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2300 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2301 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2302 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2303 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2304 PAVGB" %%mm1, %%mm4 \n\t"
2305 PAVGB" %%mm3, %%mm2 \n\t"
2306 "movq %%mm4, %%mm1 \n\t"
2307 "movq %%mm2, %%mm3 \n\t"
2308 "psrlq $24, %%mm4 \n\t"
2309 "psrlq $24, %%mm2 \n\t"
2310 PAVGB" %%mm1, %%mm4 \n\t"
2311 PAVGB" %%mm3, %%mm2 \n\t"
2312 "punpcklbw %%mm7, %%mm4 \n\t"
2313 "punpcklbw %%mm7, %%mm2 \n\t"
2315 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2316 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2317 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2318 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2319 "punpcklbw %%mm7, %%mm4 \n\t"
2320 "punpcklbw %%mm7, %%mm1 \n\t"
2321 "punpcklbw %%mm7, %%mm2 \n\t"
2322 "punpcklbw %%mm7, %%mm3 \n\t"
2323 "paddw %%mm1, %%mm4 \n\t"
2324 "paddw %%mm3, %%mm2 \n\t"
2325 "paddw %%mm2, %%mm4 \n\t"
/* mm5 is used as scratch from here on and reloaded with ff_w1111 a few lines further down. */
2326 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2327 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2328 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2329 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2330 "punpcklbw %%mm7, %%mm5 \n\t"
2331 "punpcklbw %%mm7, %%mm1 \n\t"
2332 "punpcklbw %%mm7, %%mm2 \n\t"
2333 "punpcklbw %%mm7, %%mm3 \n\t"
2334 "paddw %%mm1, %%mm5 \n\t"
2335 "paddw %%mm3, %%mm2 \n\t"
2336 "paddw %%mm5, %%mm2 \n\t"
2337 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2338 "psrlw $2, %%mm4 \n\t"
2339 "psrlw $2, %%mm2 \n\t"
2341 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2342 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2344 "pmaddwd %%mm4, %%mm1 \n\t"
2345 "pmaddwd %%mm2, %%mm3 \n\t"
2346 "pmaddwd %%mm6, %%mm4 \n\t"
2347 "pmaddwd %%mm6, %%mm2 \n\t"
2348 #ifndef FAST_BGR2YV12
2349 "psrad $8, %%mm4 \n\t"
2350 "psrad $8, %%mm1 \n\t"
2351 "psrad $8, %%mm2 \n\t"
2352 "psrad $8, %%mm3 \n\t"
2354 "packssdw %%mm2, %%mm4 \n\t"
2355 "packssdw %%mm3, %%mm1 \n\t"
2356 "pmaddwd %%mm5, %%mm4 \n\t"
2357 "pmaddwd %%mm5, %%mm1 \n\t"
2358 "add $24, %%"REG_d" \n\t"
2359 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2360 "psraw $7, %%mm4 \n\t"
/* Regroup into four U and four V bytes, add the offset, then store the low dword to udst and the high dword to vdst. */
2362 "movq %%mm0, %%mm1 \n\t"
2363 "punpckldq %%mm4, %%mm0 \n\t"
2364 "punpckhdq %%mm4, %%mm1 \n\t"
2365 "packsswb %%mm1, %%mm0 \n\t"
2366 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2367 "movd %%mm0, (%2, %%"REG_a") \n\t"
2368 "punpckhdq %%mm0, %%mm0 \n\t"
2369 "movd %%mm0, (%3, %%"REG_a") \n\t"
2370 "add $4, %%"REG_a" \n\t"
2372 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2373 : "%"REG_a, "%"REG_d
2376 udst += chromStride;
2377 vdst += chromStride;
2381 asm volatile( EMMS" \n\t"
/* C code for the remaining rows: the first row of each pair yields Y, U and V, the second row only Y. */
2387 for (; y<height; y+=2)
2390 for (i=0; i<chromWidth; i++)
2392 unsigned int b = src[6*i+0];
2393 unsigned int g = src[6*i+1];
2394 unsigned int r = src[6*i+2];
2396 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2397 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2398 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2408 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2414 for (i=0; i<chromWidth; i++)
2416 unsigned int b = src[6*i+0];
2417 unsigned int g = src[6*i+1];
2418 unsigned int r = src[6*i+2];
2420 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2428 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2431 udst += chromStride;
2432 vdst += chromStride;
/*
 * Interleaves two byte planes row by row: dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w] for every w below width, for height rows.
 * The vector loops take 16 source bytes from each plane per iteration and the
 * scalar loop starting at width & ~15 finishes the row.
 * NOTE(review): the loop bound is width-15 and the compare sits at the end of
 * the loop body, so the vector loops look like they run once even when
 * width < 16 - confirm against the branch line, which is not shown here.
 */
2438 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2439 long width, long height, long src1Stride,
2440 long src2Stride, long dstStride){
2443 for (h=0; h < height; h++)
/* SSE2 variant: movdqa and movntdq require 16-byte aligned addresses. NOTE(review): nothing in the lines shown checks the alignment of src1, src2 or dest. */
2450 "xor %%"REG_a", %%"REG_a" \n\t"
2452 PREFETCH" 64(%1, %%"REG_a") \n\t"
2453 PREFETCH" 64(%2, %%"REG_a") \n\t"
2454 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2455 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2456 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2457 "punpcklbw %%xmm2, %%xmm0 \n\t"
2458 "punpckhbw %%xmm2, %%xmm1 \n\t"
2459 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2460 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2461 "add $16, %%"REG_a" \n\t"
2462 "cmp %3, %%"REG_a" \n\t"
2464 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465 : "memory", "%"REG_a""
/* MMX variant: the same 16 bytes per plane, handled as two quadwords each. */
2469 "xor %%"REG_a", %%"REG_a" \n\t"
2471 PREFETCH" 64(%1, %%"REG_a") \n\t"
2472 PREFETCH" 64(%2, %%"REG_a") \n\t"
2473 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2474 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2475 "movq %%mm0, %%mm1 \n\t"
2476 "movq %%mm2, %%mm3 \n\t"
2477 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2478 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2479 "punpcklbw %%mm4, %%mm0 \n\t"
2480 "punpckhbw %%mm4, %%mm1 \n\t"
2481 "punpcklbw %%mm5, %%mm2 \n\t"
2482 "punpckhbw %%mm5, %%mm3 \n\t"
2483 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2484 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2485 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2486 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2487 "add $16, %%"REG_a" \n\t"
2488 "cmp %3, %%"REG_a" \n\t"
2490 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2491 : "memory", "%"REG_a
/* Scalar tail after the vector loop: the bytes past the last full group of 16. */
2494 for (w= (width&(~15)); w < width; w++)
2496 dest[2*w+0] = src1[w];
2497 dest[2*w+1] = src2[w];
/* Plain C version of the whole row. */
2500 for (w=0; w < width; w++)
2502 dest[2*w+0] = src1[w];
2503 dest[2*w+1] = src2[w];
/*
 * Enlarges two planes (src1 -> dst1, src2 -> dst2) by sample replication:
 * every source byte is written twice in a row and source row y>>1 feeds
 * destination row y. w is width/2 source bytes per row and h is height/2.
 */
2519 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2520 uint8_t *dst1, uint8_t *dst2,
2521 long width, long height,
2522 long srcStride1, long srcStride2,
2523 long dstStride1, long dstStride2)
2526 w=width/2; h=height/2;
2531 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane. */
2534 const uint8_t* s1=src1+srcStride1*(y>>1);
2535 uint8_t* d=dst1+dstStride1*y;
/* MMX: 32 source bytes are duplicated into 64 destination bytes (punpck of a register with itself doubles each byte). */
2541 PREFETCH" 32%1 \n\t"
2542 "movq %1, %%mm0 \n\t"
2543 "movq 8%1, %%mm2 \n\t"
2544 "movq 16%1, %%mm4 \n\t"
2545 "movq 24%1, %%mm6 \n\t"
2546 "movq %%mm0, %%mm1 \n\t"
2547 "movq %%mm2, %%mm3 \n\t"
2548 "movq %%mm4, %%mm5 \n\t"
2549 "movq %%mm6, %%mm7 \n\t"
2550 "punpcklbw %%mm0, %%mm0 \n\t"
2551 "punpckhbw %%mm1, %%mm1 \n\t"
2552 "punpcklbw %%mm2, %%mm2 \n\t"
2553 "punpckhbw %%mm3, %%mm3 \n\t"
2554 "punpcklbw %%mm4, %%mm4 \n\t"
2555 "punpckhbw %%mm5, %%mm5 \n\t"
2556 "punpcklbw %%mm6, %%mm6 \n\t"
2557 "punpckhbw %%mm7, %%mm7 \n\t"
2558 MOVNTQ" %%mm0, %0 \n\t"
2559 MOVNTQ" %%mm1, 8%0 \n\t"
2560 MOVNTQ" %%mm2, 16%0 \n\t"
2561 MOVNTQ" %%mm3, 24%0 \n\t"
2562 MOVNTQ" %%mm4, 32%0 \n\t"
2563 MOVNTQ" %%mm5, 40%0 \n\t"
2564 MOVNTQ" %%mm6, 48%0 \n\t"
2565 MOVNTQ" %%mm7, 56%0"
/* Scalar remainder of the row, continuing from the current x. */
2571 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane, same procedure. */
2574 const uint8_t* s2=src2+srcStride2*(y>>1);
2575 uint8_t* d=dst2+dstStride2*y;
2581 PREFETCH" 32%1 \n\t"
2582 "movq %1, %%mm0 \n\t"
2583 "movq 8%1, %%mm2 \n\t"
2584 "movq 16%1, %%mm4 \n\t"
2585 "movq 24%1, %%mm6 \n\t"
2586 "movq %%mm0, %%mm1 \n\t"
2587 "movq %%mm2, %%mm3 \n\t"
2588 "movq %%mm4, %%mm5 \n\t"
2589 "movq %%mm6, %%mm7 \n\t"
2590 "punpcklbw %%mm0, %%mm0 \n\t"
2591 "punpckhbw %%mm1, %%mm1 \n\t"
2592 "punpcklbw %%mm2, %%mm2 \n\t"
2593 "punpckhbw %%mm3, %%mm3 \n\t"
2594 "punpcklbw %%mm4, %%mm4 \n\t"
2595 "punpckhbw %%mm5, %%mm5 \n\t"
2596 "punpcklbw %%mm6, %%mm6 \n\t"
2597 "punpckhbw %%mm7, %%mm7 \n\t"
2598 MOVNTQ" %%mm0, %0 \n\t"
2599 MOVNTQ" %%mm1, 8%0 \n\t"
2600 MOVNTQ" %%mm2, 16%0 \n\t"
2601 MOVNTQ" %%mm3, 24%0 \n\t"
2602 MOVNTQ" %%mm4, 32%0 \n\t"
2603 MOVNTQ" %%mm5, 40%0 \n\t"
2604 MOVNTQ" %%mm6, 48%0 \n\t"
2605 MOVNTQ" %%mm7, 56%0"
2611 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Packs a luma plane (src1) and two chroma planes (src2, src3) into dst with
 * luma in the even bytes and chroma in the odd bytes. Chroma rows are
 * selected with y>>2, so one chroma row serves four luma rows, and every
 * chroma sample is paired with four consecutive luma samples.
 */
2622 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2624 long width, long height,
2625 long srcStride1, long srcStride2,
2626 long srcStride3, long dstStride)
2629 w=width/2; h=height;
2631 const uint8_t* yp=src1+srcStride1*y;
2632 const uint8_t* up=src2+srcStride2*(y>>2);
2633 const uint8_t* vp=src3+srcStride3*(y>>2);
2634 uint8_t* d=dst+dstStride*y;
/* MMX: with %0 as the chroma index, one iteration combines 8 U, 8 V and 32 Y bytes into 64 output bytes. */
2640 PREFETCH" 32(%1, %0) \n\t"
2641 PREFETCH" 32(%2, %0) \n\t"
2642 PREFETCH" 32(%3, %0) \n\t"
2643 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2644 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2645 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2646 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2647 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2648 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2649 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2650 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2651 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2652 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2654 "movq %%mm1, %%mm6 \n\t"
2655 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2656 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2657 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2658 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2659 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2661 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2662 "movq 8(%1, %0, 4), %%mm0 \n\t"
2663 "movq %%mm0, %%mm3 \n\t"
2664 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2665 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2666 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2667 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2669 "movq %%mm4, %%mm6 \n\t"
2670 "movq 16(%1, %0, 4), %%mm0 \n\t"
2671 "movq %%mm0, %%mm3 \n\t"
2672 "punpcklbw %%mm5, %%mm4 \n\t"
2673 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2674 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2675 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2676 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2678 "punpckhbw %%mm5, %%mm6 \n\t"
2679 "movq 24(%1, %0, 4), %%mm0 \n\t"
2680 "movq %%mm0, %%mm3 \n\t"
2681 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2682 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2683 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2684 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2687 : "r"(yp), "r" (up), "r"(vp), "r"(d)
/* Scalar code: x2 = 4*x is the luma index that belongs to chroma index x; 8 output bytes are written per x. */
2693 const long x2 = x<<2;
2696 d[8*x+2] = yp[x2+1];
2698 d[8*x+4] = yp[x2+2];
2700 d[8*x+6] = yp[x2+3];
/*
 * Assigns this template instantiation's converters (RENAME(...)) to the
 * converter variables of the same base name, which are declared outside this
 * template. uyvytoyv12 and yvu9toyv12 are commented out and therefore left
 * untouched here.
 */
2713 static inline void RENAME(rgb2rgb_init)(void){
2714 rgb15to16 = RENAME(rgb15to16);
2715 rgb15tobgr24 = RENAME(rgb15tobgr24);
2716 rgb15to32 = RENAME(rgb15to32);
2717 rgb16tobgr24 = RENAME(rgb16tobgr24);
2718 rgb16to32 = RENAME(rgb16to32);
2719 rgb16to15 = RENAME(rgb16to15);
2720 rgb24tobgr16 = RENAME(rgb24tobgr16);
2721 rgb24tobgr15 = RENAME(rgb24tobgr15);
2722 rgb24tobgr32 = RENAME(rgb24tobgr32);
2723 rgb32to16 = RENAME(rgb32to16);
2724 rgb32to15 = RENAME(rgb32to15);
2725 rgb32tobgr24 = RENAME(rgb32tobgr24);
2726 rgb24to15 = RENAME(rgb24to15);
2727 rgb24to16 = RENAME(rgb24to16);
2728 rgb24tobgr24 = RENAME(rgb24tobgr24);
2729 rgb32tobgr32 = RENAME(rgb32tobgr32);
2730 rgb32tobgr16 = RENAME(rgb32tobgr16);
2731 rgb32tobgr15 = RENAME(rgb32tobgr15);
2732 yv12toyuy2 = RENAME(yv12toyuy2);
2733 yv12touyvy = RENAME(yv12touyvy);
2734 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2735 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2736 yuy2toyv12 = RENAME(yuy2toyv12);
2737 // uyvytoyv12 = RENAME(uyvytoyv12);
2738 // yvu9toyv12 = RENAME(yvu9toyv12);
2739 planar2x = RENAME(planar2x);
2740 rgb24toyv12 = RENAME(rgb24toyv12);
2741 interleaveBytes = RENAME(interleaveBytes);
2742 vu9_to_vu12 = RENAME(vu9_to_vu12);
2743 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);