2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
/* Per-CPU instruction-string selection for the inline asm below.
 * The surrounding #if/#elif ladder (not visible in this view) picks one
 * definition per build variant: 3DNow! ("prefetch"/"prefetchw"/"pavgusb"),
 * MMX2/SSE ("prefetchnta"/"prefetcht0", "movntq"/"sfence"), or inert
 * " # nop" strings for plain MMX.  NOTE(review): conditional structure
 * inferred from the duplicate defines — confirm against the full file. */
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
71 #define SFENCE " # nop"
/* rgb24tobgr32: expand packed 24bpp pixels to 32bpp, filling the extra
 * byte from mask32a (held in mm7 and OR-ed into every pixel).  Each
 * movd+punpckldq pair packs two 3-byte pixels into one MMX register;
 * results are streamed out with MOVNTQ (non-temporal when available).
 * NOTE(review): loop scaffolding and the C remainder loop are elided in
 * this view; comments describe only the visible instructions. */
74 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
77 const uint8_t *s = src;
80 const uint8_t *mm_end;
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t"
95 "movd 12%1, %%mm2 \n\t"
96 "punpckldq 15%1, %%mm2 \n\t"
97 "movd 18%1, %%mm3 \n\t"
98 "punpckldq 21%1, %%mm3 \n\t"
99 "por %%mm7, %%mm0 \n\t"
100 "por %%mm7, %%mm1 \n\t"
101 "por %%mm7, %%mm2 \n\t"
102 "por %%mm7, %%mm3 \n\t"
103 MOVNTQ" %%mm0, %0 \n\t"
104 MOVNTQ" %%mm1, 8%0 \n\t"
105 MOVNTQ" %%mm2, 16%0 \n\t"
/* Flush write-combining buffers / clear MMX state before returning to C. */
113 __asm__ volatile(SFENCE:::"memory");
114 __asm__ volatile(EMMS:::"memory");
118 #ifdef WORDS_BIGENDIAN
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/* rgb32tobgr24: pack 32bpp pixels down to 24bpp by dropping the filler
 * byte.  Eight 32-bit pixels (four movq loads) are masked with
 * mask24l/mask24h into 3-byte groups, then shifted and recombined via
 * mask24hh/mask24hhh/mask24hhhh into three contiguous output quadwords.
 * NOTE(review): interior loop lines are elided in this view. */
134 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
137 const uint8_t *s = src;
140 const uint8_t *mm_end;
144 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t"
158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t"
/* Splice the per-pixel 3-byte groups across quadword boundaries. */
175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t"
189 MOVNTQ" %%mm0, %0 \n\t"
190 MOVNTQ" %%mm1, 8%0 \n\t"
193 :"m"(*s),"m"(mask24l),
194 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
199 __asm__ volatile(SFENCE:::"memory");
200 __asm__ volatile(EMMS:::"memory");
204 #ifdef WORDS_BIGENDIAN
205 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
/* rgb15to16: widen RGB555 to RGB565 in place order.  The "and&add"
 * trick (visible in the C tail): (x&0x7FFF) + (x&0x7FE0) doubles the
 * red+green field, i.e. shifts R and G up one bit and leaves the new
 * green LSB zero.  MMX path does the same on 8 pixels per pass with
 * mask15s in mm4.  Interior loop lines are elided in this view. */
226 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
228 register const uint8_t* s=src;
229 register uint8_t* d=dst;
230 register const uint8_t *end;
231 const uint8_t *mm_end;
234 __asm__ volatile(PREFETCH" %0"::"m"(*s));
235 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ" %%mm0, %0 \n\t"
257 __asm__ volatile(SFENCE:::"memory");
258 __asm__ volatile(EMMS:::"memory");
/* C tail: 32-bit (two pixels) then 16-bit (single pixel) remainder. */
263 register unsigned x= *((const uint32_t *)s);
264 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
270 register unsigned short x= *((const uint16_t *)s);
271 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/* rgb16to15: narrow RGB565 to RGB555 by dropping the green LSB:
 * ((x>>1)&0x7FE0) | (x&0x001F) — shift R+G down one bit (mask15rg in
 * mm7), keep blue untouched (mask15b in mm6).  MMX path processes two
 * quadwords per pass; interior loop lines are elided in this view. */
275 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
277 register const uint8_t* s=src;
278 register uint8_t* d=dst;
279 register const uint8_t *end;
280 const uint8_t *mm_end;
283 __asm__ volatile(PREFETCH" %0"::"m"(*s));
284 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
285 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ" %%mm0, %0 \n\t"
311 __asm__ volatile(SFENCE:::"memory");
312 __asm__ volatile(EMMS:::"memory");
/* C tail: 32-bit (two pixels) then 16-bit (single pixel) remainder. */
317 register uint32_t x= *((const uint32_t*)s);
318 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
324 register uint16_t x= *((const uint16_t*)s);
325 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/* rgb32to16: pack 32bpp to RGB565.  Two alternative MMX paths:
 *  - #if 1 branch: pmaddwd-based (mul3216 + mask3216g/mask3216br),
 *    cheaper where multiplies are fast (see FIXME below);
 *  - fallback branch: classic shift+mask using red/green/blue_16mask.
 * C tail packs one pixel at a time.  Interior loop lines are elided. */
331 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
333 const uint8_t *s = src;
336 const uint8_t *mm_end;
338 uint16_t *d = (uint16_t *)dst;
342 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
344 "movq %3, %%mm5 \n\t"
345 "movq %4, %%mm6 \n\t"
346 "movq %5, %%mm7 \n\t"
350 PREFETCH" 32(%1) \n\t"
351 "movd (%1), %%mm0 \n\t"
352 "movd 4(%1), %%mm3 \n\t"
353 "punpckldq 8(%1), %%mm0 \n\t"
354 "punpckldq 12(%1), %%mm3 \n\t"
355 "movq %%mm0, %%mm1 \n\t"
356 "movq %%mm3, %%mm4 \n\t"
357 "pand %%mm6, %%mm0 \n\t"
358 "pand %%mm6, %%mm3 \n\t"
359 "pmaddwd %%mm7, %%mm0 \n\t"
360 "pmaddwd %%mm7, %%mm3 \n\t"
361 "pand %%mm5, %%mm1 \n\t"
362 "pand %%mm5, %%mm4 \n\t"
363 "por %%mm1, %%mm0 \n\t"
364 "por %%mm4, %%mm3 \n\t"
365 "psrld $5, %%mm0 \n\t"
366 "pslld $11, %%mm3 \n\t"
367 "por %%mm3, %%mm0 \n\t"
368 MOVNTQ" %%mm0, (%0) \n\t"
375 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* Fallback path: per-channel shift (>>3 blue, >>5 green, >>8 red) + mask. */
378 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
380 "movq %0, %%mm7 \n\t"
381 "movq %1, %%mm6 \n\t"
382 ::"m"(red_16mask),"m"(green_16mask));
387 "movd %1, %%mm0 \n\t"
388 "movd 4%1, %%mm3 \n\t"
389 "punpckldq 8%1, %%mm0 \n\t"
390 "punpckldq 12%1, %%mm3 \n\t"
391 "movq %%mm0, %%mm1 \n\t"
392 "movq %%mm0, %%mm2 \n\t"
393 "movq %%mm3, %%mm4 \n\t"
394 "movq %%mm3, %%mm5 \n\t"
395 "psrlq $3, %%mm0 \n\t"
396 "psrlq $3, %%mm3 \n\t"
397 "pand %2, %%mm0 \n\t"
398 "pand %2, %%mm3 \n\t"
399 "psrlq $5, %%mm1 \n\t"
400 "psrlq $5, %%mm4 \n\t"
401 "pand %%mm6, %%mm1 \n\t"
402 "pand %%mm6, %%mm4 \n\t"
403 "psrlq $8, %%mm2 \n\t"
404 "psrlq $8, %%mm5 \n\t"
405 "pand %%mm7, %%mm2 \n\t"
406 "pand %%mm7, %%mm5 \n\t"
407 "por %%mm1, %%mm0 \n\t"
408 "por %%mm4, %%mm3 \n\t"
409 "por %%mm2, %%mm0 \n\t"
410 "por %%mm5, %%mm3 \n\t"
411 "psllq $16, %%mm3 \n\t"
412 "por %%mm3, %%mm0 \n\t"
413 MOVNTQ" %%mm0, %0 \n\t"
414 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
419 __asm__ volatile(SFENCE:::"memory");
420 __asm__ volatile(EMMS:::"memory");
/* C tail: 565 = B>>3 | G[7:2]<<3 | R[7:3]<<8 (little-endian layout). */
424 register int rgb = *(const uint32_t*)s; s += 4;
425 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/* rgb32tobgr16: pack 32bpp to 16bpp 565 with red/blue swapped relative
 * to rgb32to16 — note the psllq $8 into red_16mask and psrlq $19 into
 * blue position, mirrored in the C tail ((rgb&0xF8)<<8 vs >>3).
 * Interior loop lines are elided in this view. */
429 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
431 const uint8_t *s = src;
434 const uint8_t *mm_end;
436 uint16_t *d = (uint16_t *)dst;
439 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
441 "movq %0, %%mm7 \n\t"
442 "movq %1, %%mm6 \n\t"
443 ::"m"(red_16mask),"m"(green_16mask));
449 "movd %1, %%mm0 \n\t"
450 "movd 4%1, %%mm3 \n\t"
451 "punpckldq 8%1, %%mm0 \n\t"
452 "punpckldq 12%1, %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
457 "psllq $8, %%mm0 \n\t"
458 "psllq $8, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $5, %%mm1 \n\t"
462 "psrlq $5, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
475 MOVNTQ" %%mm0, %0 \n\t"
476 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
480 __asm__ volatile(SFENCE:::"memory");
481 __asm__ volatile(EMMS:::"memory");
/* C tail, channel-swapped: low byte goes to the top 5 bits. */
485 register int rgb = *(const uint32_t*)s; s += 4;
486 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/* rgb32to15: pack 32bpp to RGB555 — same structure as rgb32to16 but
 * with 5-bit green: pmaddwd path uses mask3215g/mul3215 and shifts
 * $6/$10; fallback path uses red/green/blue_15mask with $3/$6/$9.
 * Interior loop lines are elided in this view. */
490 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
492 const uint8_t *s = src;
495 const uint8_t *mm_end;
497 uint16_t *d = (uint16_t *)dst;
501 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
503 "movq %3, %%mm5 \n\t"
504 "movq %4, %%mm6 \n\t"
505 "movq %5, %%mm7 \n\t"
509 PREFETCH" 32(%1) \n\t"
510 "movd (%1), %%mm0 \n\t"
511 "movd 4(%1), %%mm3 \n\t"
512 "punpckldq 8(%1), %%mm0 \n\t"
513 "punpckldq 12(%1), %%mm3 \n\t"
514 "movq %%mm0, %%mm1 \n\t"
515 "movq %%mm3, %%mm4 \n\t"
516 "pand %%mm6, %%mm0 \n\t"
517 "pand %%mm6, %%mm3 \n\t"
518 "pmaddwd %%mm7, %%mm0 \n\t"
519 "pmaddwd %%mm7, %%mm3 \n\t"
520 "pand %%mm5, %%mm1 \n\t"
521 "pand %%mm5, %%mm4 \n\t"
522 "por %%mm1, %%mm0 \n\t"
523 "por %%mm4, %%mm3 \n\t"
524 "psrld $6, %%mm0 \n\t"
525 "pslld $10, %%mm3 \n\t"
526 "por %%mm3, %%mm0 \n\t"
527 MOVNTQ" %%mm0, (%0) \n\t"
534 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* Fallback path: per-channel shift (>>3 blue, >>6 green, >>9 red) + mask. */
537 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
539 "movq %0, %%mm7 \n\t"
540 "movq %1, %%mm6 \n\t"
541 ::"m"(red_15mask),"m"(green_15mask));
546 "movd %1, %%mm0 \n\t"
547 "movd 4%1, %%mm3 \n\t"
548 "punpckldq 8%1, %%mm0 \n\t"
549 "punpckldq 12%1, %%mm3 \n\t"
550 "movq %%mm0, %%mm1 \n\t"
551 "movq %%mm0, %%mm2 \n\t"
552 "movq %%mm3, %%mm4 \n\t"
553 "movq %%mm3, %%mm5 \n\t"
554 "psrlq $3, %%mm0 \n\t"
555 "psrlq $3, %%mm3 \n\t"
556 "pand %2, %%mm0 \n\t"
557 "pand %2, %%mm3 \n\t"
558 "psrlq $6, %%mm1 \n\t"
559 "psrlq $6, %%mm4 \n\t"
560 "pand %%mm6, %%mm1 \n\t"
561 "pand %%mm6, %%mm4 \n\t"
562 "psrlq $9, %%mm2 \n\t"
563 "psrlq $9, %%mm5 \n\t"
564 "pand %%mm7, %%mm2 \n\t"
565 "pand %%mm7, %%mm5 \n\t"
566 "por %%mm1, %%mm0 \n\t"
567 "por %%mm4, %%mm3 \n\t"
568 "por %%mm2, %%mm0 \n\t"
569 "por %%mm5, %%mm3 \n\t"
570 "psllq $16, %%mm3 \n\t"
571 "por %%mm3, %%mm0 \n\t"
572 MOVNTQ" %%mm0, %0 \n\t"
573 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
578 __asm__ volatile(SFENCE:::"memory");
579 __asm__ volatile(EMMS:::"memory");
/* C tail: 555 = B>>3 | G[7:3]<<2 | R[7:3]<<7. */
583 register int rgb = *(const uint32_t*)s; s += 4;
584 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/* rgb32tobgr15: pack 32bpp to 15bpp 555 with red/blue swapped relative
 * to rgb32to15 — psllq $7 moves the low byte into the red field and
 * psrlq $19 moves the high byte into blue, as in the C tail.
 * Interior loop lines are elided in this view. */
588 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
590 const uint8_t *s = src;
593 const uint8_t *mm_end;
595 uint16_t *d = (uint16_t *)dst;
598 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
600 "movq %0, %%mm7 \n\t"
601 "movq %1, %%mm6 \n\t"
602 ::"m"(red_15mask),"m"(green_15mask));
608 "movd %1, %%mm0 \n\t"
609 "movd 4%1, %%mm3 \n\t"
610 "punpckldq 8%1, %%mm0 \n\t"
611 "punpckldq 12%1, %%mm3 \n\t"
612 "movq %%mm0, %%mm1 \n\t"
613 "movq %%mm0, %%mm2 \n\t"
614 "movq %%mm3, %%mm4 \n\t"
615 "movq %%mm3, %%mm5 \n\t"
616 "psllq $7, %%mm0 \n\t"
617 "psllq $7, %%mm3 \n\t"
618 "pand %%mm7, %%mm0 \n\t"
619 "pand %%mm7, %%mm3 \n\t"
620 "psrlq $6, %%mm1 \n\t"
621 "psrlq $6, %%mm4 \n\t"
622 "pand %%mm6, %%mm1 \n\t"
623 "pand %%mm6, %%mm4 \n\t"
624 "psrlq $19, %%mm2 \n\t"
625 "psrlq $19, %%mm5 \n\t"
626 "pand %2, %%mm2 \n\t"
627 "pand %2, %%mm5 \n\t"
628 "por %%mm1, %%mm0 \n\t"
629 "por %%mm4, %%mm3 \n\t"
630 "por %%mm2, %%mm0 \n\t"
631 "por %%mm5, %%mm3 \n\t"
632 "psllq $16, %%mm3 \n\t"
633 "por %%mm3, %%mm0 \n\t"
634 MOVNTQ" %%mm0, %0 \n\t"
635 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
639 __asm__ volatile(SFENCE:::"memory");
640 __asm__ volatile(EMMS:::"memory");
/* C tail, channel-swapped 555 pack. */
644 register int rgb = *(const uint32_t*)s; s += 4;
645 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/* rgb24tobgr16: pack 24bpp pixels to 16bpp 565.  Loads use 3-byte
 * strides (movd %1 / 3%1, punpckldq 6%1 / 9%1) to gather four packed
 * pixels, then the usual shift+mask (>>3/>>5/>>8) against
 * blue/green/red_16mask.  C tail unpacks per-channel bytes b,g,r
 * (declarations elided in this view). */
649 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
651 const uint8_t *s = src;
654 const uint8_t *mm_end;
656 uint16_t *d = (uint16_t *)dst;
659 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
661 "movq %0, %%mm7 \n\t"
662 "movq %1, %%mm6 \n\t"
663 ::"m"(red_16mask),"m"(green_16mask));
669 "movd %1, %%mm0 \n\t"
670 "movd 3%1, %%mm3 \n\t"
671 "punpckldq 6%1, %%mm0 \n\t"
672 "punpckldq 9%1, %%mm3 \n\t"
673 "movq %%mm0, %%mm1 \n\t"
674 "movq %%mm0, %%mm2 \n\t"
675 "movq %%mm3, %%mm4 \n\t"
676 "movq %%mm3, %%mm5 \n\t"
677 "psrlq $3, %%mm0 \n\t"
678 "psrlq $3, %%mm3 \n\t"
679 "pand %2, %%mm0 \n\t"
680 "pand %2, %%mm3 \n\t"
681 "psrlq $5, %%mm1 \n\t"
682 "psrlq $5, %%mm4 \n\t"
683 "pand %%mm6, %%mm1 \n\t"
684 "pand %%mm6, %%mm4 \n\t"
685 "psrlq $8, %%mm2 \n\t"
686 "psrlq $8, %%mm5 \n\t"
687 "pand %%mm7, %%mm2 \n\t"
688 "pand %%mm7, %%mm5 \n\t"
689 "por %%mm1, %%mm0 \n\t"
690 "por %%mm4, %%mm3 \n\t"
691 "por %%mm2, %%mm0 \n\t"
692 "por %%mm5, %%mm3 \n\t"
693 "psllq $16, %%mm3 \n\t"
694 "por %%mm3, %%mm0 \n\t"
695 MOVNTQ" %%mm0, %0 \n\t"
696 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
700 __asm__ volatile(SFENCE:::"memory");
701 __asm__ volatile(EMMS:::"memory");
708 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24to16: as rgb24tobgr16 but with red and blue swapped in the MMX
 * path (psllq $8 into red_16mask, psrlq $19 into blue_16mask); the C
 * tail formula is identical, so the channel byte order of b,g,r must
 * differ upstream — their assignments are elided in this view. */
712 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
714 const uint8_t *s = src;
717 const uint8_t *mm_end;
719 uint16_t *d = (uint16_t *)dst;
722 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
724 "movq %0, %%mm7 \n\t"
725 "movq %1, %%mm6 \n\t"
726 ::"m"(red_16mask),"m"(green_16mask));
732 "movd %1, %%mm0 \n\t"
733 "movd 3%1, %%mm3 \n\t"
734 "punpckldq 6%1, %%mm0 \n\t"
735 "punpckldq 9%1, %%mm3 \n\t"
736 "movq %%mm0, %%mm1 \n\t"
737 "movq %%mm0, %%mm2 \n\t"
738 "movq %%mm3, %%mm4 \n\t"
739 "movq %%mm3, %%mm5 \n\t"
740 "psllq $8, %%mm0 \n\t"
741 "psllq $8, %%mm3 \n\t"
742 "pand %%mm7, %%mm0 \n\t"
743 "pand %%mm7, %%mm3 \n\t"
744 "psrlq $5, %%mm1 \n\t"
745 "psrlq $5, %%mm4 \n\t"
746 "pand %%mm6, %%mm1 \n\t"
747 "pand %%mm6, %%mm4 \n\t"
748 "psrlq $19, %%mm2 \n\t"
749 "psrlq $19, %%mm5 \n\t"
750 "pand %2, %%mm2 \n\t"
751 "pand %2, %%mm5 \n\t"
752 "por %%mm1, %%mm0 \n\t"
753 "por %%mm4, %%mm3 \n\t"
754 "por %%mm2, %%mm0 \n\t"
755 "por %%mm5, %%mm3 \n\t"
756 "psllq $16, %%mm3 \n\t"
757 "por %%mm3, %%mm0 \n\t"
758 MOVNTQ" %%mm0, %0 \n\t"
759 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
763 __asm__ volatile(SFENCE:::"memory");
764 __asm__ volatile(EMMS:::"memory");
771 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24tobgr15: pack 24bpp pixels to 15bpp 555 (shift+mask >>3/>>6/>>9
 * against blue/green/red_15mask), pairing two pixels per quadword via
 * psllq $16.  C tail: 555 = B>>3 | G[7:3]<<2 | R[7:3]<<7 (b,g,r
 * assignments elided in this view). */
775 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
777 const uint8_t *s = src;
780 const uint8_t *mm_end;
782 uint16_t *d = (uint16_t *)dst;
785 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
787 "movq %0, %%mm7 \n\t"
788 "movq %1, %%mm6 \n\t"
789 ::"m"(red_15mask),"m"(green_15mask));
795 "movd %1, %%mm0 \n\t"
796 "movd 3%1, %%mm3 \n\t"
797 "punpckldq 6%1, %%mm0 \n\t"
798 "punpckldq 9%1, %%mm3 \n\t"
799 "movq %%mm0, %%mm1 \n\t"
800 "movq %%mm0, %%mm2 \n\t"
801 "movq %%mm3, %%mm4 \n\t"
802 "movq %%mm3, %%mm5 \n\t"
803 "psrlq $3, %%mm0 \n\t"
804 "psrlq $3, %%mm3 \n\t"
805 "pand %2, %%mm0 \n\t"
806 "pand %2, %%mm3 \n\t"
807 "psrlq $6, %%mm1 \n\t"
808 "psrlq $6, %%mm4 \n\t"
809 "pand %%mm6, %%mm1 \n\t"
810 "pand %%mm6, %%mm4 \n\t"
811 "psrlq $9, %%mm2 \n\t"
812 "psrlq $9, %%mm5 \n\t"
813 "pand %%mm7, %%mm2 \n\t"
814 "pand %%mm7, %%mm5 \n\t"
815 "por %%mm1, %%mm0 \n\t"
816 "por %%mm4, %%mm3 \n\t"
817 "por %%mm2, %%mm0 \n\t"
818 "por %%mm5, %%mm3 \n\t"
819 "psllq $16, %%mm3 \n\t"
820 "por %%mm3, %%mm0 \n\t"
821 MOVNTQ" %%mm0, %0 \n\t"
822 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
826 __asm__ volatile(SFENCE:::"memory");
827 __asm__ volatile(EMMS:::"memory");
834 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/* rgb24to15: as rgb24tobgr15 but red/blue swapped in the MMX path
 * (psllq $7 into red_15mask, psrlq $19 into blue_15mask); the C tail
 * formula matches rgb24tobgr15, so b,g,r must be read in the opposite
 * byte order upstream — their assignments are elided in this view. */
838 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
840 const uint8_t *s = src;
843 const uint8_t *mm_end;
845 uint16_t *d = (uint16_t *)dst;
848 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
850 "movq %0, %%mm7 \n\t"
851 "movq %1, %%mm6 \n\t"
852 ::"m"(red_15mask),"m"(green_15mask));
858 "movd %1, %%mm0 \n\t"
859 "movd 3%1, %%mm3 \n\t"
860 "punpckldq 6%1, %%mm0 \n\t"
861 "punpckldq 9%1, %%mm3 \n\t"
862 "movq %%mm0, %%mm1 \n\t"
863 "movq %%mm0, %%mm2 \n\t"
864 "movq %%mm3, %%mm4 \n\t"
865 "movq %%mm3, %%mm5 \n\t"
866 "psllq $7, %%mm0 \n\t"
867 "psllq $7, %%mm3 \n\t"
868 "pand %%mm7, %%mm0 \n\t"
869 "pand %%mm7, %%mm3 \n\t"
870 "psrlq $6, %%mm1 \n\t"
871 "psrlq $6, %%mm4 \n\t"
872 "pand %%mm6, %%mm1 \n\t"
873 "pand %%mm6, %%mm4 \n\t"
874 "psrlq $19, %%mm2 \n\t"
875 "psrlq $19, %%mm5 \n\t"
876 "pand %2, %%mm2 \n\t"
877 "pand %2, %%mm5 \n\t"
878 "por %%mm1, %%mm0 \n\t"
879 "por %%mm4, %%mm3 \n\t"
880 "por %%mm2, %%mm0 \n\t"
881 "por %%mm5, %%mm3 \n\t"
882 "psllq $16, %%mm3 \n\t"
883 "por %%mm3, %%mm0 \n\t"
884 MOVNTQ" %%mm0, %0 \n\t"
885 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
889 __asm__ volatile(SFENCE:::"memory");
890 __asm__ volatile(EMMS:::"memory");
897 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
902 I use less accurate approximation here by simply left-shifting the input
903 value and filling the low order bits with zeroes. This method improves PNG
904 compression but this scheme cannot reproduce white exactly, since it does
905 not generate an all-ones maximum value; the net effect is to darken the
908 The better method should be "left bit replication":
918 | leftmost bits repeated to fill open bits
/* rgb15tobgr24: expand RGB555 to 24bpp.  Two asm stages per 8 pixels:
 * first split each quadword into B/G/R word lanes (mask15b/g/r, shifts
 * $3/$2/$7), widen with punpcklwd/punpckhwd against mmx_null, and merge
 * into byte-interleaved form; then the "borrowed 32 to 24" stage packs
 * the results into three 24bpp quadwords with the mask24* family.
 * Per the note above, low bits are zero-filled (no bit replication).
 * Interior loop lines are elided in this view. */
922 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
926 const uint16_t *mm_end;
929 const uint16_t *s = (const uint16_t*)src;
930 end = s + src_size/2;
932 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
938 "movq %1, %%mm0 \n\t"
939 "movq %1, %%mm1 \n\t"
940 "movq %1, %%mm2 \n\t"
941 "pand %2, %%mm0 \n\t"
942 "pand %3, %%mm1 \n\t"
943 "pand %4, %%mm2 \n\t"
944 "psllq $3, %%mm0 \n\t"
945 "psrlq $2, %%mm1 \n\t"
946 "psrlq $7, %%mm2 \n\t"
947 "movq %%mm0, %%mm3 \n\t"
948 "movq %%mm1, %%mm4 \n\t"
949 "movq %%mm2, %%mm5 \n\t"
950 "punpcklwd %5, %%mm0 \n\t"
951 "punpcklwd %5, %%mm1 \n\t"
952 "punpcklwd %5, %%mm2 \n\t"
953 "punpckhwd %5, %%mm3 \n\t"
954 "punpckhwd %5, %%mm4 \n\t"
955 "punpckhwd %5, %%mm5 \n\t"
956 "psllq $8, %%mm1 \n\t"
957 "psllq $16, %%mm2 \n\t"
958 "por %%mm1, %%mm0 \n\t"
959 "por %%mm2, %%mm0 \n\t"
960 "psllq $8, %%mm4 \n\t"
961 "psllq $16, %%mm5 \n\t"
962 "por %%mm4, %%mm3 \n\t"
963 "por %%mm5, %%mm3 \n\t"
/* Stash first four expanded pixels; repeat for the next quadword. */
965 "movq %%mm0, %%mm6 \n\t"
966 "movq %%mm3, %%mm7 \n\t"
968 "movq 8%1, %%mm0 \n\t"
969 "movq 8%1, %%mm1 \n\t"
970 "movq 8%1, %%mm2 \n\t"
971 "pand %2, %%mm0 \n\t"
972 "pand %3, %%mm1 \n\t"
973 "pand %4, %%mm2 \n\t"
974 "psllq $3, %%mm0 \n\t"
975 "psrlq $2, %%mm1 \n\t"
976 "psrlq $7, %%mm2 \n\t"
977 "movq %%mm0, %%mm3 \n\t"
978 "movq %%mm1, %%mm4 \n\t"
979 "movq %%mm2, %%mm5 \n\t"
980 "punpcklwd %5, %%mm0 \n\t"
981 "punpcklwd %5, %%mm1 \n\t"
982 "punpcklwd %5, %%mm2 \n\t"
983 "punpckhwd %5, %%mm3 \n\t"
984 "punpckhwd %5, %%mm4 \n\t"
985 "punpckhwd %5, %%mm5 \n\t"
986 "psllq $8, %%mm1 \n\t"
987 "psllq $16, %%mm2 \n\t"
988 "por %%mm1, %%mm0 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "psllq $8, %%mm4 \n\t"
991 "psllq $16, %%mm5 \n\t"
992 "por %%mm4, %%mm3 \n\t"
993 "por %%mm5, %%mm3 \n\t"
996 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
998 /* borrowed 32 to 24 */
1000 "movq %%mm0, %%mm4 \n\t"
1001 "movq %%mm3, %%mm5 \n\t"
1002 "movq %%mm6, %%mm0 \n\t"
1003 "movq %%mm7, %%mm1 \n\t"
1005 "movq %%mm4, %%mm6 \n\t"
1006 "movq %%mm5, %%mm7 \n\t"
1007 "movq %%mm0, %%mm2 \n\t"
1008 "movq %%mm1, %%mm3 \n\t"
1010 "psrlq $8, %%mm2 \n\t"
1011 "psrlq $8, %%mm3 \n\t"
1012 "psrlq $8, %%mm6 \n\t"
1013 "psrlq $8, %%mm7 \n\t"
1014 "pand %2, %%mm0 \n\t"
1015 "pand %2, %%mm1 \n\t"
1016 "pand %2, %%mm4 \n\t"
1017 "pand %2, %%mm5 \n\t"
1018 "pand %3, %%mm2 \n\t"
1019 "pand %3, %%mm3 \n\t"
1020 "pand %3, %%mm6 \n\t"
1021 "pand %3, %%mm7 \n\t"
1022 "por %%mm2, %%mm0 \n\t"
1023 "por %%mm3, %%mm1 \n\t"
1024 "por %%mm6, %%mm4 \n\t"
1025 "por %%mm7, %%mm5 \n\t"
1027 "movq %%mm1, %%mm2 \n\t"
1028 "movq %%mm4, %%mm3 \n\t"
1029 "psllq $48, %%mm2 \n\t"
1030 "psllq $32, %%mm3 \n\t"
1031 "pand %4, %%mm2 \n\t"
1032 "pand %5, %%mm3 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psrlq $16, %%mm1 \n\t"
1035 "psrlq $32, %%mm4 \n\t"
1036 "psllq $16, %%mm5 \n\t"
1037 "por %%mm3, %%mm1 \n\t"
1038 "pand %6, %%mm5 \n\t"
1039 "por %%mm5, %%mm4 \n\t"
1041 MOVNTQ" %%mm0, %0 \n\t"
1042 MOVNTQ" %%mm1, 8%0 \n\t"
1043 MOVNTQ" %%mm4, 16%0"
1046 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1051 __asm__ volatile(SFENCE:::"memory");
1052 __asm__ volatile(EMMS:::"memory");
/* C tail: expand one 555 pixel to three bytes (low bits zero-filled). */
1056 register uint16_t bgr;
1058 *d++ = (bgr&0x1F)<<3;
1059 *d++ = (bgr&0x3E0)>>2;
1060 *d++ = (bgr&0x7C00)>>7;
/* rgb16tobgr24: expand RGB565 to 24bpp — same two-stage structure as
 * rgb15tobgr24 but with 565 masks (mask16b/g/r) and shifts $3/$3/$8
 * for the 6-bit green / 5-bit red positions.  Low bits of each output
 * byte are zero-filled (see the approximation note above rgb15tobgr24).
 * Interior loop lines are elided in this view. */
1064 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1066 const uint16_t *end;
1068 const uint16_t *mm_end;
1070 uint8_t *d = (uint8_t *)dst;
1071 const uint16_t *s = (const uint16_t *)src;
1072 end = s + src_size/2;
1074 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1079 PREFETCH" 32%1 \n\t"
1080 "movq %1, %%mm0 \n\t"
1081 "movq %1, %%mm1 \n\t"
1082 "movq %1, %%mm2 \n\t"
1083 "pand %2, %%mm0 \n\t"
1084 "pand %3, %%mm1 \n\t"
1085 "pand %4, %%mm2 \n\t"
1086 "psllq $3, %%mm0 \n\t"
1087 "psrlq $3, %%mm1 \n\t"
1088 "psrlq $8, %%mm2 \n\t"
1089 "movq %%mm0, %%mm3 \n\t"
1090 "movq %%mm1, %%mm4 \n\t"
1091 "movq %%mm2, %%mm5 \n\t"
1092 "punpcklwd %5, %%mm0 \n\t"
1093 "punpcklwd %5, %%mm1 \n\t"
1094 "punpcklwd %5, %%mm2 \n\t"
1095 "punpckhwd %5, %%mm3 \n\t"
1096 "punpckhwd %5, %%mm4 \n\t"
1097 "punpckhwd %5, %%mm5 \n\t"
1098 "psllq $8, %%mm1 \n\t"
1099 "psllq $16, %%mm2 \n\t"
1100 "por %%mm1, %%mm0 \n\t"
1101 "por %%mm2, %%mm0 \n\t"
1102 "psllq $8, %%mm4 \n\t"
1103 "psllq $16, %%mm5 \n\t"
1104 "por %%mm4, %%mm3 \n\t"
1105 "por %%mm5, %%mm3 \n\t"
/* Stash first four expanded pixels; repeat for the next quadword. */
1107 "movq %%mm0, %%mm6 \n\t"
1108 "movq %%mm3, %%mm7 \n\t"
1110 "movq 8%1, %%mm0 \n\t"
1111 "movq 8%1, %%mm1 \n\t"
1112 "movq 8%1, %%mm2 \n\t"
1113 "pand %2, %%mm0 \n\t"
1114 "pand %3, %%mm1 \n\t"
1115 "pand %4, %%mm2 \n\t"
1116 "psllq $3, %%mm0 \n\t"
1117 "psrlq $3, %%mm1 \n\t"
1118 "psrlq $8, %%mm2 \n\t"
1119 "movq %%mm0, %%mm3 \n\t"
1120 "movq %%mm1, %%mm4 \n\t"
1121 "movq %%mm2, %%mm5 \n\t"
1122 "punpcklwd %5, %%mm0 \n\t"
1123 "punpcklwd %5, %%mm1 \n\t"
1124 "punpcklwd %5, %%mm2 \n\t"
1125 "punpckhwd %5, %%mm3 \n\t"
1126 "punpckhwd %5, %%mm4 \n\t"
1127 "punpckhwd %5, %%mm5 \n\t"
1128 "psllq $8, %%mm1 \n\t"
1129 "psllq $16, %%mm2 \n\t"
1130 "por %%mm1, %%mm0 \n\t"
1131 "por %%mm2, %%mm0 \n\t"
1132 "psllq $8, %%mm4 \n\t"
1133 "psllq $16, %%mm5 \n\t"
1134 "por %%mm4, %%mm3 \n\t"
1135 "por %%mm5, %%mm3 \n\t"
1137 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1139 /* borrowed 32 to 24 */
1141 "movq %%mm0, %%mm4 \n\t"
1142 "movq %%mm3, %%mm5 \n\t"
1143 "movq %%mm6, %%mm0 \n\t"
1144 "movq %%mm7, %%mm1 \n\t"
1146 "movq %%mm4, %%mm6 \n\t"
1147 "movq %%mm5, %%mm7 \n\t"
1148 "movq %%mm0, %%mm2 \n\t"
1149 "movq %%mm1, %%mm3 \n\t"
1151 "psrlq $8, %%mm2 \n\t"
1152 "psrlq $8, %%mm3 \n\t"
1153 "psrlq $8, %%mm6 \n\t"
1154 "psrlq $8, %%mm7 \n\t"
1155 "pand %2, %%mm0 \n\t"
1156 "pand %2, %%mm1 \n\t"
1157 "pand %2, %%mm4 \n\t"
1158 "pand %2, %%mm5 \n\t"
1159 "pand %3, %%mm2 \n\t"
1160 "pand %3, %%mm3 \n\t"
1161 "pand %3, %%mm6 \n\t"
1162 "pand %3, %%mm7 \n\t"
1163 "por %%mm2, %%mm0 \n\t"
1164 "por %%mm3, %%mm1 \n\t"
1165 "por %%mm6, %%mm4 \n\t"
1166 "por %%mm7, %%mm5 \n\t"
1168 "movq %%mm1, %%mm2 \n\t"
1169 "movq %%mm4, %%mm3 \n\t"
1170 "psllq $48, %%mm2 \n\t"
1171 "psllq $32, %%mm3 \n\t"
1172 "pand %4, %%mm2 \n\t"
1173 "pand %5, %%mm3 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "psrlq $16, %%mm1 \n\t"
1176 "psrlq $32, %%mm4 \n\t"
1177 "psllq $16, %%mm5 \n\t"
1178 "por %%mm3, %%mm1 \n\t"
1179 "pand %6, %%mm5 \n\t"
1180 "por %%mm5, %%mm4 \n\t"
1182 MOVNTQ" %%mm0, %0 \n\t"
1183 MOVNTQ" %%mm1, 8%0 \n\t"
1184 MOVNTQ" %%mm4, 16%0"
1187 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1192 __asm__ volatile(SFENCE:::"memory");
1193 __asm__ volatile(EMMS:::"memory");
/* C tail: expand one 565 pixel to three bytes (low bits zero-filled). */
1197 register uint16_t bgr;
1199 *d++ = (bgr&0x1F)<<3;
1200 *d++ = (bgr&0x7E0)>>3;
1201 *d++ = (bgr&0xF800)>>8;
/* PACK_RGB32: shared asm tail for the 15/16 -> 32bpp converters below.
 * Takes four pixels spread as words across mm0/mm1/mm2 (layout shown
 * next), packs to bytes, interleaves into B,G,R,FF order, and stores
 * two quadwords via MOVNTQ into operand %0 / 8%0. */
1206 * mm0 = 00 B3 00 B2 00 B1 00 B0
1207 * mm1 = 00 G3 00 G2 00 G1 00 G0
1208 * mm2 = 00 R3 00 R2 00 R1 00 R0
1209 * mm6 = FF FF FF FF FF FF FF FF
1210 * mm7 = 00 00 00 00 00 00 00 00
1212 #define PACK_RGB32 \
1213 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1214 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1215 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1216 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1217 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1218 "movq %%mm0, %%mm3 \n\t" \
1219 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1220 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1221 MOVNTQ" %%mm0, %0 \n\t" \
1222 MOVNTQ" %%mm3, 8%0 \n\t" \
/* rgb15to32: expand RGB555 to 32bpp.  mm7 is zeroed (unpack filler),
 * mm6 set to all-ones (alpha FF) as PACK_RGB32 expects; the asm body
 * splits B/G/R into word lanes with mask15b/g/r and shifts $3/$2/$7,
 * then (elided here) invokes PACK_RGB32.  C tail writes 3 bytes plus
 * an implicit 4th handled outside this view, byte order depending on
 * WORDS_BIGENDIAN. */
1224 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1226 const uint16_t *end;
1228 const uint16_t *mm_end;
1231 const uint16_t *s = (const uint16_t *)src;
1232 end = s + src_size/2;
1234 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1235 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1236 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1241 PREFETCH" 32%1 \n\t"
1242 "movq %1, %%mm0 \n\t"
1243 "movq %1, %%mm1 \n\t"
1244 "movq %1, %%mm2 \n\t"
1245 "pand %2, %%mm0 \n\t"
1246 "pand %3, %%mm1 \n\t"
1247 "pand %4, %%mm2 \n\t"
1248 "psllq $3, %%mm0 \n\t"
1249 "psrlq $2, %%mm1 \n\t"
1250 "psrlq $7, %%mm2 \n\t"
1253 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1258 __asm__ volatile(SFENCE:::"memory");
1259 __asm__ volatile(EMMS:::"memory");
1263 #if 0 //slightly slower on Athlon
1265 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1267 register uint16_t bgr;
1269 #ifdef WORDS_BIGENDIAN
1271 *d++ = (bgr&0x7C00)>>7;
1272 *d++ = (bgr&0x3E0)>>2;
1273 *d++ = (bgr&0x1F)<<3;
1275 *d++ = (bgr&0x1F)<<3;
1276 *d++ = (bgr&0x3E0)>>2;
1277 *d++ = (bgr&0x7C00)>>7;
/* rgb16to32: expand RGB565 to 32bpp — same structure as rgb15to32 but
 * with 565 masks (mask16b/g/r) and shifts $3/$3/$8; PACK_RGB32 use and
 * the 4th output byte are elided in this view.  C tail byte order
 * depends on WORDS_BIGENDIAN. */
1285 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1287 const uint16_t *end;
1289 const uint16_t *mm_end;
1292 const uint16_t *s = (const uint16_t*)src;
1293 end = s + src_size/2;
1295 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1296 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1297 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1302 PREFETCH" 32%1 \n\t"
1303 "movq %1, %%mm0 \n\t"
1304 "movq %1, %%mm1 \n\t"
1305 "movq %1, %%mm2 \n\t"
1306 "pand %2, %%mm0 \n\t"
1307 "pand %3, %%mm1 \n\t"
1308 "pand %4, %%mm2 \n\t"
1309 "psllq $3, %%mm0 \n\t"
1310 "psrlq $3, %%mm1 \n\t"
1311 "psrlq $8, %%mm2 \n\t"
1314 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1319 __asm__ volatile(SFENCE:::"memory");
1320 __asm__ volatile(EMMS:::"memory");
1324 register uint16_t bgr;
1326 #ifdef WORDS_BIGENDIAN
1328 *d++ = (bgr&0xF800)>>8;
1329 *d++ = (bgr&0x7E0)>>3;
1330 *d++ = (bgr&0x1F)<<3;
1332 *d++ = (bgr&0x1F)<<3;
1333 *d++ = (bgr&0x7E0)>>3;
1334 *d++ = (bgr&0xF800)>>8;
/* rgb32tobgr32: swap R and B channels of 32bpp pixels in place order.
 * idx counts up from a negative offset so the loop ends at idx==0
 * (src/dst are pre-biased by -idx).  Two asm variants: an MMX2 path
 * using pshufw $177 (swap bytes within each 16-bit pair per dword)
 * and a shift-based path using mask32b/mask32r/mmx_one-derived masks
 * in mm6/mm7.  The C tail swaps via (v>>16) + g + (v<<16), keeping
 * green/alpha (g = v & 0xff00ff00) fixed.  NOTE(review): variant
 * selection (#if) and loop scaffolding are elided in this view. */
1340 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1347 idx = 15 - src_size;
1348 const uint8_t *s = src-idx;
1349 uint8_t *d = dst-idx;
1354 PREFETCH" (%1, %0) \n\t"
1355 "movq %3, %%mm7 \n\t"
1356 "pxor %4, %%mm7 \n\t"
1357 "movq %%mm7, %%mm6 \n\t"
1358 "pxor %5, %%mm7 \n\t"
1361 PREFETCH" 32(%1, %0) \n\t"
1362 "movq (%1, %0), %%mm0 \n\t"
1363 "movq 8(%1, %0), %%mm1 \n\t"
1365 "pshufw $177, %%mm0, %%mm3 \n\t"
1366 "pshufw $177, %%mm1, %%mm5 \n\t"
1367 "pand %%mm7, %%mm0 \n\t"
1368 "pand %%mm6, %%mm3 \n\t"
1369 "pand %%mm7, %%mm1 \n\t"
1370 "pand %%mm6, %%mm5 \n\t"
1371 "por %%mm3, %%mm0 \n\t"
1372 "por %%mm5, %%mm1 \n\t"
/* Non-pshufw variant: isolate R/B, slide them past each other by 16. */
1374 "movq %%mm0, %%mm2 \n\t"
1375 "movq %%mm1, %%mm4 \n\t"
1376 "pand %%mm7, %%mm0 \n\t"
1377 "pand %%mm6, %%mm2 \n\t"
1378 "pand %%mm7, %%mm1 \n\t"
1379 "pand %%mm6, %%mm4 \n\t"
1380 "movq %%mm2, %%mm3 \n\t"
1381 "movq %%mm4, %%mm5 \n\t"
1382 "pslld $16, %%mm2 \n\t"
1383 "psrld $16, %%mm3 \n\t"
1384 "pslld $16, %%mm4 \n\t"
1385 "psrld $16, %%mm5 \n\t"
1386 "por %%mm2, %%mm0 \n\t"
1387 "por %%mm4, %%mm1 \n\t"
1388 "por %%mm3, %%mm0 \n\t"
1389 "por %%mm5, %%mm1 \n\t"
1391 MOVNTQ" %%mm0, (%2, %0) \n\t"
1392 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1399 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* C tail: one dword (pixel) per iteration until idx reaches 15. */
1402 for (; idx<15; idx+=4) {
1403 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1405 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/* rgb24tobgr24: swap R and B channels of packed 24bpp pixels.  The asm
 * loop handles 8 pixels (24 bytes) per iteration: three overlapping
 * movq loads at byte offsets 0/2, 6/8/10, 14/16/18 are masked with
 * mask24r/mask24g/mask24b (MANGLE'd globals) so each output quadword
 * gets B and R from swapped source positions and G in place.  REG_a
 * counts up from -mmx_size to 0; the C tail (below) re-runs the last
 * 0..23 bytes swapping dst[i] <- src[i+2] / src[i+0].
 * NOTE(review): some scaffolding lines are elided in this view. */
1409 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1413 x86_reg mmx_size= 23 - src_size;
1415 "test %%"REG_a", %%"REG_a" \n\t"
1417 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1418 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1419 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1422 PREFETCH" 32(%1, %%"REG_a") \n\t"
1423 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1424 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1425 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1426 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1427 "pand %%mm5, %%mm0 \n\t"
1428 "pand %%mm6, %%mm1 \n\t"
1429 "pand %%mm7, %%mm2 \n\t"
1430 "por %%mm0, %%mm1 \n\t"
1431 "por %%mm2, %%mm1 \n\t"
1432 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1433 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1434 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1435 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1436 "pand %%mm7, %%mm0 \n\t"
1437 "pand %%mm5, %%mm1 \n\t"
1438 "pand %%mm6, %%mm2 \n\t"
1439 "por %%mm0, %%mm1 \n\t"
1440 "por %%mm2, %%mm1 \n\t"
1441 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1442 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1443 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1444 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1445 "pand %%mm6, %%mm0 \n\t"
1446 "pand %%mm7, %%mm1 \n\t"
1447 "pand %%mm5, %%mm2 \n\t"
1448 "por %%mm0, %%mm1 \n\t"
1449 "por %%mm2, %%mm1 \n\t"
1450 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1451 "add $24, %%"REG_a" \n\t"
1455 : "r" (src-mmx_size), "r"(dst-mmx_size)
1458 __asm__ volatile(SFENCE:::"memory");
1459 __asm__ volatile(EMMS:::"memory");
1461 if (mmx_size==23) return; //finished, was multiple of 8
/* C tail: redo the trailing partial group with plain byte swaps. */
1465 src_size= 23-mmx_size;
1469 for (i=0; i<src_size; i+=3)
1473 dst[i + 1] = src[i + 1];
1474 dst[i + 2] = src[i + 0];
/**
 * Interleave planar YUV into packed YUY2 (byte order: Y0 U0 Y1 V0 ...).
 *
 * @param ysrc  luma plane, advanced by lumStride per output line
 * @param usrc  U plane, advanced by chromStride per chroma line
 * @param vsrc  V plane, advanced by chromStride per chroma line
 * @param dst   packed YUY2 output, advanced by dstStride per line
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 *        (2 for 4:2:0 input, 1 for 4:2:2). Must be a power of two: the
 *        chroma-advance test below masks with (vertLumPerChroma-1).
 *
 * Two horizontally adjacent luma samples share one U and one V sample,
 * so chromWidth = width/2 chroma samples are consumed per line.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
        x86_reg cw = chromWidth;
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* MMX path: per iteration, interleave 8 U/V pairs with 16 luma
         * samples and emit 32 output bytes via non-temporal stores
         * (MOVNTQ bypasses the cache; SFENCE/EMMS happen after the loop). */
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        PREFETCH" 32(%2, %%"REG_a") \n\t"
        PREFETCH" 32(%3, %%"REG_a") \n\t"
        "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
        "movq %%mm0, %%mm2 \n\t" // U(0)
        "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
        "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
        "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
        "movq %%mm3, %%mm4 \n\t" // Y(0)
        "movq %%mm5, %%mm6 \n\t" // Y(8)
        "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
        "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
        "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
        "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
        MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (cw)
/* Alpha/MVI path: unpkbw/unpkbl spread bytes into 16-bit lanes so two
 * rows can be assembled with plain integer shifts and adds. */
#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n) \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    uint64_t *qdst = (uint64_t *) dst;
    uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
    const uint32_t *yc = (uint32_t *) ysrc;
    const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
    const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
    for (i = 0; i < chromWidth; i += 8){
        uint64_t y1, y2, yuv1, yuv2;
        /* ldq into $31 (the zero register) is an Alpha prefetch idiom. */
        __asm__("ldq $31,64(%0)" :: "r"(yc));
        __asm__("ldq $31,64(%0)" :: "r"(yc2));
        __asm__("ldq $31,64(%0)" :: "r"(uc));
        __asm__("ldq $31,64(%0)" :: "r"(vc));
/* Generic 64-bit path: build one 8-byte YUYV group per iteration. */
#elif HAVE_FAST_64BIT
    uint64_t *ldst = (uint64_t *) dst;
    const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
    for (i = 0; i < chromWidth; i += 2){
        k = yc[0] + (uc[0] << 8) +
            (yc[1] << 16) + (vc[0] << 24);
        l = yc[2] + (uc[1] << 8) +
            (yc[3] << 16) + (vc[1] << 24);
        *ldst++ = k + (l << 32);
    /* Portable 32-bit fallback: one YUYV group per 32-bit store, with the
     * byte order flipped on big-endian hosts. */
    int i, *idst = (int32_t *) dst;
    const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
    for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
        *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                  (yc[1] << 8) + (vc[0] << 0);
        *idst++ = yc[0] + (uc[0] << 8) +
                  (yc[1] << 16) + (vc[0] << 24);
    /* Advance chroma only once per vertLumPerChroma luma lines. */
    if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        usrc += chromStride;
        vsrc += chromStride;
    /* Flush the MMX state and the non-temporal write buffers. */
    __asm__( EMMS" \n\t"
1621 * Height should be a multiple of 2 and width should be a multiple of 16.
1622 * (If this is a problem for anyone then tell me, and I will fix it.)
/**
 * Pack 4:2:0 planar (YV12-style plane pointers) into YUY2.
 * Thin wrapper: vertLumPerChroma = 2 means each chroma line is reused for
 * two luma lines; chroma is repeated, not interpolated (see FIXME).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/**
 * Interleave planar YUV into packed UYVY (byte order: U0 Y0 V0 Y1 ...).
 * Mirror of yuvPlanartoyuy2 with the punpck operand roles swapped so the
 * chroma bytes land first in each 16-bit pair.
 *
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 *        (2 for 4:2:0, 1 for 4:2:2); must be a power of two.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
        x86_reg cw = chromWidth;
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        PREFETCH" 32(%2, %%"REG_a") \n\t"
        PREFETCH" 32(%3, %%"REG_a") \n\t"
        "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
        "movq %%mm0, %%mm2 \n\t" // U(0)
        "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
        "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
        "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
        "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
        "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
        "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
        "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
        "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
        "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
        MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (cw)
//FIXME adapt the Alpha ASM code from yv12->yuy2
    /* Generic 64-bit path: one 8-byte UYVY group per iteration. */
    uint64_t *ldst = (uint64_t *) dst;
    const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
    for (i = 0; i < chromWidth; i += 2){
        k = uc[0] + (yc[0] << 8) +
            (vc[0] << 16) + (yc[1] << 24);
        l = uc[1] + (yc[2] << 8) +
            (vc[1] << 16) + (yc[3] << 24);
        *ldst++ = k + (l << 32);
    /* Portable 32-bit fallback, endian-aware byte placement. */
    int i, *idst = (int32_t *) dst;
    const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
    for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                  (vc[0] << 8) + (yc[1] << 0);
        *idst++ = uc[0] + (yc[0] << 8) +
                  (vc[0] << 16) + (yc[1] << 24);
    /* Advance chroma only once per vertLumPerChroma luma lines. */
    if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        usrc += chromStride;
        vsrc += chromStride;
    /* Flush the MMX state and the non-temporal write buffers. */
    __asm__( EMMS" \n\t"
1728 * Height should be a multiple of 2 and width should be a multiple of 16
1729 * (If this is a problem for anyone then tell me, and I will fix it.)
/**
 * Pack 4:2:0 planar (YV12-style plane pointers) into UYVY.
 * Thin wrapper: vertLumPerChroma = 2 reuses each chroma line for two luma
 * lines; chroma is repeated, not interpolated (see FIXME).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1740 * Width should be a multiple of 16.
/**
 * Pack 4:2:2 planar into UYVY.
 * vertLumPerChroma = 1: every luma line has its own chroma line.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1750 * Width should be a multiple of 16.
/**
 * Pack 4:2:2 planar into YUY2.
 * vertLumPerChroma = 1: every luma line has its own chroma line.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1760 * Height should be a multiple of 2 and width should be a multiple of 16.
1761 * (If this is a problem for anyone then tell me, and I will fix it.)
/**
 * Deinterleave packed YUY2 (Y0 U0 Y1 V0 ...) into 4:2:0 planes.
 * Processes two source lines per outer iteration: the even line yields
 * luma + chroma, the odd line yields luma only (its chroma is dropped,
 * not averaged — vertical chroma subsampling by decimation).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
    const long chromWidth= width>>1;
    for (y=0; y<height; y+=2)
        x86_reg cw = chromWidth;
        /* Pass 1 (even line): split 16 YUYV pairs into 16 Y bytes plus
         * 8 U and 8 V bytes.  mm7 = 0x00FF00FF... masks the luma bytes;
         * psrlw $8 isolates the chroma bytes. */
        "xor %%"REG_a", %%"REG_a" \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
        "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
        "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
        MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
        "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
        "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
        "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
        "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
        "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
        "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
        "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
        /* Split the interleaved UV vectors into separate U and V planes. */
        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
        MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (cw)
        : "memory", "%"REG_a
        /* Pass 2 (odd line): extract luma only; chroma of this line is
         * discarded.  mm7 still holds the 0x00FF mask from pass 1. */
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
        "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
        "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
        "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
        "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
        "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
        "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (cw)
        : "memory", "%"REG_a
        /* C fallback, even line: luma + chroma. */
        for (i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        /* C fallback, odd line: luma only. */
        for (i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        udst += chromStride;
        vdst += chromStride;
    /* Flush the MMX state and the non-temporal write buffers. */
    __asm__ volatile( EMMS" \n\t"
/**
 * YVU9 (4:1:0) to YV12 (4:2:0).
 * Only the luma plane is handled here (plain copy); chroma upscaling
 * from 4x4 to 2x2 subsampling is not implemented yet (see XXX).
 * NOTE(review): assumes the luma plane is contiguous (stride == width),
 * since a single memcpy of width*height bytes is used — confirm callers.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
    /* Y Plane: straight copy. */
    memcpy(ydst, ysrc, width*height);
    /* XXX: implement upscaling for U,V */
/**
 * Upscale one plane by 2x in both directions using bilinear-style
 * (3a+b)/4 weighting between neighbouring samples.
 * First/last rows and columns are handled separately (edge replication /
 * horizontal-only filtering); interior rows blend two source rows.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
    /* First output row: horizontal 3:1 interpolation only. */
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    dst[2*srcWidth-1]= src[srcWidth-1];
    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        /* Two chained PAVGB with the same operand approximate the 3:1
         * weight: avg(a, avg(a,b)) ~= (3a+b)/4 (rounding differs slightly
         * from the C path).  Produces two output rows per iteration. */
        "mov %4, %%"REG_a" \n\t"
        "movq (%0, %%"REG_a"), %%mm0 \n\t"
        "movq (%1, %%"REG_a"), %%mm1 \n\t"
        "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
        "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
        "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
        PAVGB" %%mm0, %%mm5 \n\t"
        PAVGB" %%mm0, %%mm3 \n\t"
        PAVGB" %%mm0, %%mm5 \n\t"
        PAVGB" %%mm0, %%mm3 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"
        "punpckhbw %%mm3, %%mm7 \n\t"
        "punpcklbw %%mm2, %%mm4 \n\t"
        "punpckhbw %%mm2, %%mm6 \n\t"
        MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
        /* Non-MOVNTQ variant of the four stores above. */
        "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
        "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
        "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
        "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
        "add $8, %%"REG_a" \n\t"
        /* Negative-index trick: pointers are pre-advanced by mmxSize and
         * REG_a counts up from -mmxSize to 0. */
        :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
        /* Pure-C build: start scalar loop from column 0 (mmxSize-1 == 0). */
        const long mmxSize=1;
        /* First column of the two output rows: vertical blend only. */
        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
        /* Scalar tail: diagonal 3:1 blends for the remaining columns. */
        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
    /* Last output row: horizontal interpolation of the last source row. */
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    dst[2*srcWidth-1]= src[srcWidth-1];
    /* Duplicate row (second of the final output pair). */
    for (x=0; x<srcWidth; x++){
    __asm__ volatile( EMMS" \n\t"
1997 * Height should be a multiple of 2 and width should be a multiple of 16.
1998 * (If this is a problem for anyone then tell me, and I will fix it.)
1999 * Chrominance data is only taken from every second line, others are ignored.
2000 * FIXME: Write HQ version.
/**
 * Deinterleave packed UYVY (U0 Y0 V0 Y1 ...) into 4:2:0 planes.
 * Mirror of yuy2toyv12 with the pand/psrlw roles swapped, because in
 * UYVY the chroma bytes occupy the low byte of each 16-bit pair.
 * Even lines yield luma + chroma; odd lines yield luma only.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
    const long chromWidth= width>>1;
    for (y=0; y<height; y+=2)
        x86_reg cw = chromWidth;
        /* Pass 1 (even line): mm7 = 0x00FF00FF... masks the chroma bytes;
         * psrlw $8 isolates the luma bytes. */
        "xor %%"REG_a", %%"REG_a" \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
        "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
        "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
        "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
        "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
        "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
        "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
        MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
        "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
        "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
        "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
        "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
        "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
        "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
        "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
        /* Split the interleaved UV vectors into separate U and V planes. */
        "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
        "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
        "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
        "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
        "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
        "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
        "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
        "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
        MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (cw)
        : "memory", "%"REG_a
        /* Pass 2 (odd line): extract luma only (high byte of each pair). */
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
        "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
        "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
        "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
        "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
        "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
        "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
        "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
        "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
        MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (cw)
        : "memory", "%"REG_a
        /* C fallback, even line: luma + chroma. */
        for (i=0; i<chromWidth; i++)
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        /* C fallback, odd line: luma only. */
        for (i=0; i<chromWidth; i++)
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        udst += chromStride;
        vdst += chromStride;
    /* Flush the MMX state and the non-temporal write buffers. */
    __asm__ volatile( EMMS" \n\t"
2124 * Height should be a multiple of 2 and width should be a multiple of 2.
2125 * (If this is a problem for anyone then tell me, and I will fix it.)
2126 * Chrominance data is only taken from every second line,
2127 * others are ignored in the C version.
2128 * FIXME: Write HQ version.
/**
 * Convert packed BGR24 to YV12 (4:2:0 planar YUV).
 * Luma is computed per pixel; chroma is computed from a 2x2 box average
 * of two pixels on two adjacent lines (MMX path) or sampled/averaged per
 * the per-path code below.  Coefficient tables ff_bgr2YCoeff /
 * ff_bgr2UCoeff / ff_bgr2VCoeff and offsets ff_bgr2YOffset /
 * ff_bgr2UVOffset are defined elsewhere in the library.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
    const long chromWidth= width>>1;
    x86_reg cw = chromWidth;
    /* MMX path stops 2 lines early; the scalar loop below finishes. */
    for (y=0; y<height-2; y+=2)
        /* --- Luma pass: 8 Y samples per iteration.  REG_a counts up from
         * -width to 0; REG_d = 3*REG_a indexes the 3-byte BGR pixels. */
        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
        PREFETCH" 64(%0, %%"REG_d") \n\t"
        /* Pixels 0-3: widen bytes to words, dot with the BGR->Y coeffs. */
        "movd (%0, %%"REG_d"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        /* Extra precision: drop 8 LSBs before the horizontal add. */
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"
        /* Pixels 4-7: same computation into mm4. */
        "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_d" \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"
        /* Pack 8 Y words to bytes, add the +16 luma offset, store. */
        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
        MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        /* Negative-index trick: operands pre-advanced by width. */
        : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
        : "%"REG_a, "%"REG_d
        /* --- Chroma pass: 4 U and 4 V samples per iteration from 2x2
         * pixel blocks of two source lines (%0 = line y, %1 = line y+1). */
        "mov %4, %%"REG_a" \n\t"
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
        "add %%"REG_d", %%"REG_d" \n\t"
        PREFETCH" 64(%0, %%"REG_d") \n\t"
        PREFETCH" 64(%1, %%"REG_d") \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
        /* Fast 2x2 average via PAVGB: vertical avg, then avg with the
         * horizontally shifted (by one pixel, 3 bytes) copy. */
        "movq (%0, %%"REG_d"), %%mm0 \n\t"
        "movq (%1, %%"REG_d"), %%mm1 \n\t"
        "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
        "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm0 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        /* Exact 2x2 sum path (no pavg available): add 4 pixels, >>2. */
        "movd (%0, %%"REG_d"), %%mm0 \n\t"
        "movd (%1, %%"REG_d"), %%mm1 \n\t"
        "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm2 \n\t"
        /* Dot the averaged BGR with U (mm6) and V coefficient vectors. */
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
        "psraw $7, %%mm0 \n\t"
        /* Second pair of 2x2 blocks (output samples 2 and 3). */
#if HAVE_MMX2 || HAVE_AMD3DNOW
        "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
        "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
        "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm4, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm4 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm4 \n\t"
        "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
        "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm5 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        /* mm5 was clobbered above; reload the ff_w1111 constant. */
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm2 \n\t"
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
        "pmaddwd %%mm4, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "add $24, %%"REG_d" \n\t"
        "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
        "psraw $7, %%mm4 \n\t"
        /* Regroup U0-U3 / V0-V3, add the +128 chroma offset, and store
         * 4 bytes to the U plane (%2) and 4 to the V plane (%3). */
        "movq %%mm0, %%mm1 \n\t"
        "punpckldq %%mm4, %%mm0 \n\t"
        "punpckhdq %%mm4, %%mm1 \n\t"
        "packsswb %%mm1, %%mm0 \n\t"
        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
        "movd %%mm0, (%2, %%"REG_a") \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        : : "r" (src+cw*6), "r" (src+srcStride+cw*6), "r" (udst+cw), "r" (vdst+cw), "g" (-cw)
        : "%"REG_a, "%"REG_d
        udst += chromStride;
        vdst += chromStride;
    /* Flush the MMX state and the non-temporal write buffers. */
    __asm__ volatile( EMMS" \n\t"
    /* Scalar tail: the remaining (bottom) lines, plus the whole image in
     * pure-C builds.  Chroma here is taken from single pixels of the even
     * line only (cheaper than the 2x2 average of the MMX path). */
    for (; y<height; y+=2)
        for (i=0; i<chromWidth; i++)
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            /* Second pixel of the pair: luma only. */
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
        /* Odd line: luma only, chroma of this line is ignored. */
        for (i=0; i<chromWidth; i++)
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
        udst += chromStride;
        vdst += chromStride;
/**
 * Byte-interleave two planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line with independent strides.
 * SIMD paths process 16 input bytes per plane per iteration; the scalar
 * loop finishes the last (width & 15) columns.
 */
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride){
    for (h=0; h < height; h++)
        /* SSE2 path.  NOTE(review): movdqa/movntdq require 16-byte-aligned
         * addresses — presumably an alignment check guards entry to this
         * path; confirm against the (elided) selection condition. */
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 64(%1, %%"REG_a") \n\t"
        PREFETCH" 64(%2, %%"REG_a") \n\t"
        "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
        "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
        "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
        "punpcklbw %%xmm2, %%xmm0 \n\t"
        "punpckhbw %%xmm2, %%xmm1 \n\t"
        "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
        "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
        "add $16, %%"REG_a" \n\t"
        "cmp %3, %%"REG_a" \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a""
        /* MMX path: same interleave with four 8-byte registers. */
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 64(%1, %%"REG_a") \n\t"
        PREFETCH" 64(%2, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
        "punpcklbw %%mm4, %%mm0 \n\t"
        "punpckhbw %%mm4, %%mm1 \n\t"
        "punpcklbw %%mm5, %%mm2 \n\t"
        "punpckhbw %%mm5, %%mm3 \n\t"
        MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
        "add $16, %%"REG_a" \n\t"
        "cmp %3, %%"REG_a" \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a
        /* Scalar tail after a SIMD pass: remaining width & 15 columns. */
        for (w= (width&(~15)); w < width; w++)
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        /* Pure-C line. */
        for (w=0; w < width; w++)
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
/**
 * Upsample two chroma planes from 4:1:0 to 4:2:0 layout by pixel
 * duplication: each source sample is written twice horizontally, and
 * each source line is reused for two output lines (srcStride1*(y>>1)).
 * The MMX path widens 32 input bytes to 64 output bytes per iteration
 * via self-unpack (punpcklbw reg,reg duplicates each byte).
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
    w=width/2; h=height/2;
    /* Warm the cache for both source planes. */
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    /* Plane 1: each output line y reads source line y>>1. */
    const uint8_t* s1=src1+srcStride1*(y>>1);
    uint8_t* d=dst1+dstStride1*y;
    PREFETCH" 32%1 \n\t"
    "movq %1, %%mm0 \n\t"
    "movq 8%1, %%mm2 \n\t"
    "movq 16%1, %%mm4 \n\t"
    "movq 24%1, %%mm6 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "movq %%mm4, %%mm5 \n\t"
    "movq %%mm6, %%mm7 \n\t"
    /* punpck with itself: low half duplicates bytes 0-3, high half
     * (on the copy) duplicates bytes 4-7. */
    "punpcklbw %%mm0, %%mm0 \n\t"
    "punpckhbw %%mm1, %%mm1 \n\t"
    "punpcklbw %%mm2, %%mm2 \n\t"
    "punpckhbw %%mm3, %%mm3 \n\t"
    "punpcklbw %%mm4, %%mm4 \n\t"
    "punpckhbw %%mm5, %%mm5 \n\t"
    "punpcklbw %%mm6, %%mm6 \n\t"
    "punpckhbw %%mm7, %%mm7 \n\t"
    MOVNTQ" %%mm0, %0 \n\t"
    MOVNTQ" %%mm1, 8%0 \n\t"
    MOVNTQ" %%mm2, 16%0 \n\t"
    MOVNTQ" %%mm3, 24%0 \n\t"
    MOVNTQ" %%mm4, 32%0 \n\t"
    MOVNTQ" %%mm5, 40%0 \n\t"
    MOVNTQ" %%mm6, 48%0 \n\t"
    MOVNTQ" %%mm7, 56%0"
    /* Scalar tail: duplicate remaining samples. */
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    /* Plane 2: identical procedure. */
    const uint8_t* s2=src2+srcStride2*(y>>1);
    uint8_t* d=dst2+dstStride2*y;
    PREFETCH" 32%1 \n\t"
    "movq %1, %%mm0 \n\t"
    "movq 8%1, %%mm2 \n\t"
    "movq 16%1, %%mm4 \n\t"
    "movq 24%1, %%mm6 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "movq %%mm4, %%mm5 \n\t"
    "movq %%mm6, %%mm7 \n\t"
    "punpcklbw %%mm0, %%mm0 \n\t"
    "punpckhbw %%mm1, %%mm1 \n\t"
    "punpcklbw %%mm2, %%mm2 \n\t"
    "punpckhbw %%mm3, %%mm3 \n\t"
    "punpcklbw %%mm4, %%mm4 \n\t"
    "punpckhbw %%mm5, %%mm5 \n\t"
    "punpcklbw %%mm6, %%mm6 \n\t"
    "punpckhbw %%mm7, %%mm7 \n\t"
    MOVNTQ" %%mm0, %0 \n\t"
    MOVNTQ" %%mm1, 8%0 \n\t"
    MOVNTQ" %%mm2, 16%0 \n\t"
    MOVNTQ" %%mm3, 24%0 \n\t"
    MOVNTQ" %%mm4, 32%0 \n\t"
    MOVNTQ" %%mm5, 40%0 \n\t"
    MOVNTQ" %%mm6, 48%0 \n\t"
    MOVNTQ" %%mm7, 56%0"
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/**
 * Pack 4:1:0 planar (YVU9-style: chroma subsampled 4x4) into YUY2.
 * Each chroma line is reused for four luma lines (srcStride*(y>>2)),
 * and each chroma sample covers four horizontal luma samples, hence the
 * self-unpack (punpcklbw reg,reg) that duplicates every U/V byte before
 * interleaving with luma.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
    w=width/2; h=height;
    const uint8_t* yp=src1+srcStride1*y;
    const uint8_t* up=src2+srcStride2*(y>>2);
    const uint8_t* vp=src3+srcStride3*(y>>2);
    uint8_t* d=dst+dstStride*y;
    /* One iteration consumes 32 luma + 8 U + 8 V bytes and emits 64
     * output bytes (8 MOVNTQ stores). */
    PREFETCH" 32(%1, %0) \n\t"
    PREFETCH" 32(%2, %0) \n\t"
    PREFETCH" 32(%3, %0) \n\t"
    "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
    "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
    "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
    "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
    "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
    "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
    "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
    "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
    "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
    "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
    "movq %%mm1, %%mm6 \n\t"
    "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
    "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
    "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
    MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
    MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
    "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
    "movq 8(%1, %0, 4), %%mm0 \n\t"
    "movq %%mm0, %%mm3 \n\t"
    "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
    "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
    MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
    MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
    "movq %%mm4, %%mm6 \n\t"
    "movq 16(%1, %0, 4), %%mm0 \n\t"
    "movq %%mm0, %%mm3 \n\t"
    "punpcklbw %%mm5, %%mm4 \n\t"
    "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
    "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
    MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
    MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
    "punpckhbw %%mm5, %%mm6 \n\t"
    "movq 24(%1, %0, 4), %%mm0 \n\t"
    "movq %%mm0, %%mm3 \n\t"
    "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
    "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
    MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
    MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
    : "r"(yp), "r" (up), "r"(vp), "r"(d)
    /* Scalar tail: x2 = 4*x indexes luma; each U/V sample is reused for
     * four consecutive luma samples. */
    const long x2 = x<<2;
    d[8*x+2] = yp[x2+1];
    d[8*x+4] = yp[x2+2];
    d[8*x+6] = yp[x2+3];
2704 static inline void RENAME(rgb2rgb_init)(void){
2705 rgb15to16 = RENAME(rgb15to16);
2706 rgb15tobgr24 = RENAME(rgb15tobgr24);
2707 rgb15to32 = RENAME(rgb15to32);
2708 rgb16tobgr24 = RENAME(rgb16tobgr24);
2709 rgb16to32 = RENAME(rgb16to32);
2710 rgb16to15 = RENAME(rgb16to15);
2711 rgb24tobgr16 = RENAME(rgb24tobgr16);
2712 rgb24tobgr15 = RENAME(rgb24tobgr15);
2713 rgb24tobgr32 = RENAME(rgb24tobgr32);
2714 rgb32to16 = RENAME(rgb32to16);
2715 rgb32to15 = RENAME(rgb32to15);
2716 rgb32tobgr24 = RENAME(rgb32tobgr24);
2717 rgb24to15 = RENAME(rgb24to15);
2718 rgb24to16 = RENAME(rgb24to16);
2719 rgb24tobgr24 = RENAME(rgb24tobgr24);
2720 rgb32tobgr32 = RENAME(rgb32tobgr32);
2721 rgb32tobgr16 = RENAME(rgb32tobgr16);
2722 rgb32tobgr15 = RENAME(rgb32tobgr15);
2723 yv12toyuy2 = RENAME(yv12toyuy2);
2724 yv12touyvy = RENAME(yv12touyvy);
2725 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2726 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2727 yuy2toyv12 = RENAME(yuy2toyv12);
2728 // uyvytoyv12 = RENAME(uyvytoyv12);
2729 // yvu9toyv12 = RENAME(yvu9toyv12);
2730 planar2x = RENAME(planar2x);
2731 rgb24toyv12 = RENAME(rgb24toyv12);
2732 interleaveBytes = RENAME(interleaveBytes);
2733 vu9_to_vu12 = RENAME(vu9_to_vu12);
2734 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);