2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* Instruction-name macros used to build the inline asm below.  The #if
 * branches selecting between them are not visible in this excerpt; the
 * " # nop" variants turn the instruction into an asm comment on CPUs
 * that lack it, so the surrounding asm strings stay valid. */
43 #define PREFETCH "prefetch"
44 #define PAVGB "pavgusb"
46 #define PREFETCH "prefetchnta"
49 #define PREFETCH " # nop"
53 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
64 #define SFENCE " # nop"
/*
 * Convert packed 24-bit RGB to 32-bit BGR (src_size bytes of input).
 * Four 3-byte pixels are gathered per register with overlapping
 * movd/punpckldq loads, then mask32a is OR'd in — presumably to set the
 * filler/alpha byte of each output pixel (TODO confirm mask value; the
 * mask constant is defined outside this excerpt).  Results are written
 * with non-temporal MOVNTQ stores; the loop control and scalar tail are
 * not visible in this excerpt.
 */
67 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
70 const uint8_t *s = src;
73 const uint8_t *mm_end;
77 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
79 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
84 "punpckldq 3%1, %%mm0 \n\t"
85 "movd 6%1, %%mm1 \n\t"
86 "punpckldq 9%1, %%mm1 \n\t"
87 "movd 12%1, %%mm2 \n\t"
88 "punpckldq 15%1, %%mm2 \n\t"
89 "movd 18%1, %%mm3 \n\t"
90 "punpckldq 21%1, %%mm3 \n\t"
/* OR the mask (mm7) into every pair of pixels */
91 "por %%mm7, %%mm0 \n\t"
92 "por %%mm7, %%mm1 \n\t"
93 "por %%mm7, %%mm2 \n\t"
94 "por %%mm7, %%mm3 \n\t"
95 MOVNTQ" %%mm0, %0 \n\t"
96 MOVNTQ" %%mm1, 8%0 \n\t"
97 MOVNTQ" %%mm2, 16%0 \n\t"
/* drain write-combining buffers after non-temporal stores, then leave MMX state */
105 __asm__ volatile(SFENCE:::"memory");
106 __asm__ volatile(EMMS:::"memory");
110 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * Pack four registers' worth of 32-bit pixels (caller prepares the pairs
 * mm0/mm2, mm1/mm3, mm4/mm6, mm5/mm7) down to packed 24-bit BGR:
 * mask24l/mask24h keep the low/high byte lanes of each pixel, then the
 * mask24hh/mask24hhh/mask24hhhh merges stitch the 24-bit triplets across
 * quadword boundaries.  Two MOVNTQ stores are visible here; the tail of
 * the macro (remaining store) falls outside this excerpt.
 */
125 #define STORE_BGR24_MMX \
126 "psrlq $8, %%mm2 \n\t" \
127 "psrlq $8, %%mm3 \n\t" \
128 "psrlq $8, %%mm6 \n\t" \
129 "psrlq $8, %%mm7 \n\t" \
130 "pand "MANGLE(mask24l)", %%mm0\n\t" \
131 "pand "MANGLE(mask24l)", %%mm1\n\t" \
132 "pand "MANGLE(mask24l)", %%mm4\n\t" \
133 "pand "MANGLE(mask24l)", %%mm5\n\t" \
134 "pand "MANGLE(mask24h)", %%mm2\n\t" \
135 "pand "MANGLE(mask24h)", %%mm3\n\t" \
136 "pand "MANGLE(mask24h)", %%mm6\n\t" \
137 "pand "MANGLE(mask24h)", %%mm7\n\t" \
138 "por %%mm2, %%mm0 \n\t" \
139 "por %%mm3, %%mm1 \n\t" \
140 "por %%mm6, %%mm4 \n\t" \
141 "por %%mm7, %%mm5 \n\t" \
143 "movq %%mm1, %%mm2 \n\t" \
144 "movq %%mm4, %%mm3 \n\t" \
145 "psllq $48, %%mm2 \n\t" \
146 "psllq $32, %%mm3 \n\t" \
147 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
148 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
149 "por %%mm2, %%mm0 \n\t" \
150 "psrlq $16, %%mm1 \n\t" \
151 "psrlq $32, %%mm4 \n\t" \
152 "psllq $16, %%mm5 \n\t" \
153 "por %%mm3, %%mm1 \n\t" \
154 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
155 "por %%mm5, %%mm4 \n\t" \
157 MOVNTQ" %%mm0, %0 \n\t" \
158 MOVNTQ" %%mm1, 8%0 \n\t" \
/*
 * Convert 32-bit RGB to packed 24-bit BGR.  Loads 32 bytes (8 pixels)
 * per iteration, duplicates each register pair, and relies on
 * STORE_BGR24_MMX (invocation outside this excerpt) to drop the
 * alpha/filler byte and store the packed result.  Loop control and the
 * scalar tail are not visible here.
 */
162 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
165 const uint8_t *s = src;
168 const uint8_t *mm_end;
172 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
177 "movq %1, %%mm0 \n\t"
178 "movq 8%1, %%mm1 \n\t"
179 "movq 16%1, %%mm4 \n\t"
180 "movq 24%1, %%mm5 \n\t"
/* duplicate each source register — STORE_BGR24_MMX consumes the pairs */
181 "movq %%mm0, %%mm2 \n\t"
182 "movq %%mm1, %%mm3 \n\t"
183 "movq %%mm4, %%mm6 \n\t"
184 "movq %%mm5, %%mm7 \n\t"
192 __asm__ volatile(SFENCE:::"memory");
193 __asm__ volatile(EMMS:::"memory");
197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
213 original by Strepto/Astral
214 ported to gcc & bugfixed: A'rpi
215 MMX2, 3DNOW optimization by Nick Kurshev
216 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * RGB15 (xRRRRRGG GGGBBBBB) -> RGB16: the red and green fields move up
 * one bit.  Implemented with the and+add trick: (x & 0x7FFF) keeps all
 * fields, (x & 0x7FE0) selects R+G, and adding the two shifts R+G left
 * by one without disturbing blue — see the 32-bit/16-bit C tails below.
 * The MMX path does the same on 8 pixels at a time with mm4 = mask15s.
 */
218 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
220 register const uint8_t* s=src;
221 register uint8_t* d=dst;
222 register const uint8_t *end;
223 const uint8_t *mm_end;
226 __asm__ volatile(PREFETCH" %0"::"m"(*s));
227 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
232 "movq %1, %%mm0 \n\t"
233 "movq 8%1, %%mm2 \n\t"
234 "movq %%mm0, %%mm1 \n\t"
235 "movq %%mm2, %%mm3 \n\t"
236 "pand %%mm4, %%mm0 \n\t"
237 "pand %%mm4, %%mm2 \n\t"
238 "paddw %%mm1, %%mm0 \n\t"
239 "paddw %%mm3, %%mm2 \n\t"
240 MOVNTQ" %%mm0, %0 \n\t"
248 __asm__ volatile(SFENCE:::"memory");
249 __asm__ volatile(EMMS:::"memory")\u003b
/* scalar tails: 32 bits (two pixels) at a time, then one pixel */
253 register unsigned x= *((const uint32_t *)s);
254 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
259 register unsigned short x= *((const uint16_t *)s);
260 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * RGB16 -> RGB15: red and green move down one bit while blue stays put.
 * Scalar form: ((x>>1) & 0x7FE0) | (x & 0x001F).  The MMX path applies
 * the same masks (mm7 = mask15rg on the shifted value, mm6 = mask15b on
 * the original) to 8 pixels at a time.  Loop control and store of the
 * second quadword are outside this excerpt.
 */
264 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
266 register const uint8_t* s=src;
267 register uint8_t* d=dst;
268 register const uint8_t *end;
269 const uint8_t *mm_end;
272 __asm__ volatile(PREFETCH" %0"::"m"(*s));
273 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
274 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
279 "movq %1, %%mm0 \n\t"
280 "movq 8%1, %%mm2 \n\t"
281 "movq %%mm0, %%mm1 \n\t"
282 "movq %%mm2, %%mm3 \n\t"
283 "psrlq $1, %%mm0 \n\t"
284 "psrlq $1, %%mm2 \n\t"
285 "pand %%mm7, %%mm0 \n\t"
286 "pand %%mm7, %%mm2 \n\t"
287 "pand %%mm6, %%mm1 \n\t"
288 "pand %%mm6, %%mm3 \n\t"
289 "por %%mm1, %%mm0 \n\t"
290 "por %%mm3, %%mm2 \n\t"
291 MOVNTQ" %%mm0, %0 \n\t"
299 __asm__ volatile(SFENCE:::"memory");
300 __asm__ volatile(EMMS:::"memory");
/* scalar tails: two pixels at a time, then one */
304 register uint32_t x= *((const uint32_t*)s);
305 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
310 register uint16_t x= *((const uint16_t*)s);
311 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * 32-bit RGBA -> RGB565.  Two MMX code paths exist:
 *  - "#if 1" path: isolates B|R with mask3216br, multiplies by mul3216
 *    (pmaddwd combines the two field shifts into one op), merges the
 *    green field (mask3216g) and shifts into 565 position;
 *  - alternate path: classic shift+mask per field (blue >>3, green >>5,
 *    red >>8) then OR the fields together, two pixels packed per psllq.
 * The scalar tail at the bottom gives the reference formula.
 * Loop/#if structure is partially outside this excerpt.
 */
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
317 const uint8_t *s = src;
320 const uint8_t *mm_end;
322 uint16_t *d = (uint16_t *)dst;
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
334 PREFETCH" 32(%1) \n\t"
335 "movd (%1), %%mm0 \n\t"
336 "movd 4(%1), %%mm3 \n\t"
337 "punpckldq 8(%1), %%mm0 \n\t"
338 "punpckldq 12(%1), %%mm3 \n\t"
339 "movq %%mm0, %%mm1 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "pand %%mm6, %%mm0 \n\t"
342 "pand %%mm6, %%mm3 \n\t"
343 "pmaddwd %%mm7, %%mm0 \n\t"
344 "pmaddwd %%mm7, %%mm3 \n\t"
345 "pand %%mm5, %%mm1 \n\t"
346 "pand %%mm5, %%mm4 \n\t"
347 "por %%mm1, %%mm0 \n\t"
348 "por %%mm4, %%mm3 \n\t"
349 "psrld $5, %%mm0 \n\t"
350 "pslld $11, %%mm3 \n\t"
351 "por %%mm3, %%mm0 \n\t"
352 MOVNTQ" %%mm0, (%0) \n\t"
359 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
362 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
364 "movq %0, %%mm7 \n\t"
365 "movq %1, %%mm6 \n\t"
366 ::"m"(red_16mask),"m"(green_16mask));
370 "movd %1, %%mm0 \n\t"
371 "movd 4%1, %%mm3 \n\t"
372 "punpckldq 8%1, %%mm0 \n\t"
373 "punpckldq 12%1, %%mm3 \n\t"
374 "movq %%mm0, %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "movq %%mm3, %%mm4 \n\t"
377 "movq %%mm3, %%mm5 \n\t"
378 "psrlq $3, %%mm0 \n\t"
379 "psrlq $3, %%mm3 \n\t"
380 "pand %2, %%mm0 \n\t"
381 "pand %2, %%mm3 \n\t"
382 "psrlq $5, %%mm1 \n\t"
383 "psrlq $5, %%mm4 \n\t"
384 "pand %%mm6, %%mm1 \n\t"
385 "pand %%mm6, %%mm4 \n\t"
386 "psrlq $8, %%mm2 \n\t"
387 "psrlq $8, %%mm5 \n\t"
388 "pand %%mm7, %%mm2 \n\t"
389 "pand %%mm7, %%mm5 \n\t"
390 "por %%mm1, %%mm0 \n\t"
391 "por %%mm4, %%mm3 \n\t"
392 "por %%mm2, %%mm0 \n\t"
393 "por %%mm5, %%mm3 \n\t"
394 "psllq $16, %%mm3 \n\t"
395 "por %%mm3, %%mm0 \n\t"
396 MOVNTQ" %%mm0, %0 \n\t"
397 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
402 __asm__ volatile(SFENCE:::"memory");
403 __asm__ volatile(EMMS:::"memory");
/* scalar tail: B -> bits 0-4, G -> bits 5-10, R -> bits 11-15 */
406 register int rgb = *(const uint32_t*)s; s += 4;
407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * 32-bit RGBA -> BGR565 (channel-swapped 16-bit).  Same shift+mask
 * structure as rgb32to16, but blue is shifted UP into the red slot
 * (psllq $8 + red_16mask) and red DOWN into the blue slot (psrlq $19),
 * producing the swapped layout — compare the scalar tail.  Loop control
 * is outside this excerpt.
 */
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
413 const uint8_t *s = src;
416 const uint8_t *mm_end;
418 uint16_t *d = (uint16_t *)dst;
421 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
423 "movq %0, %%mm7 \n\t"
424 "movq %1, %%mm6 \n\t"
425 ::"m"(red_16mask),"m"(green_16mask));
430 "movd %1, %%mm0 \n\t"
431 "movd 4%1, %%mm3 \n\t"
432 "punpckldq 8%1, %%mm0 \n\t"
433 "punpckldq 12%1, %%mm3 \n\t"
434 "movq %%mm0, %%mm1 \n\t"
435 "movq %%mm0, %%mm2 \n\t"
436 "movq %%mm3, %%mm4 \n\t"
437 "movq %%mm3, %%mm5 \n\t"
438 "psllq $8, %%mm0 \n\t"
439 "psllq $8, %%mm3 \n\t"
440 "pand %%mm7, %%mm0 \n\t"
441 "pand %%mm7, %%mm3 \n\t"
442 "psrlq $5, %%mm1 \n\t"
443 "psrlq $5, %%mm4 \n\t"
444 "pand %%mm6, %%mm1 \n\t"
445 "pand %%mm6, %%mm4 \n\t"
446 "psrlq $19, %%mm2 \n\t"
447 "psrlq $19, %%mm5 \n\t"
448 "pand %2, %%mm2 \n\t"
449 "pand %2, %%mm5 \n\t"
450 "por %%mm1, %%mm0 \n\t"
451 "por %%mm4, %%mm3 \n\t"
452 "por %%mm2, %%mm0 \n\t"
453 "por %%mm5, %%mm3 \n\t"
454 "psllq $16, %%mm3 \n\t"
455 "por %%mm3, %%mm0 \n\t"
456 MOVNTQ" %%mm0, %0 \n\t"
457 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461 __asm__ volatile(SFENCE:::"memory");
462 __asm__ volatile(EMMS:::"memory");
/* scalar tail: B -> bits 11-15, G -> bits 5-10, R -> bits 0-4 */
465 register int rgb = *(const uint32_t*)s; s += 4;
466 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * 32-bit RGBA -> RGB555.  Mirrors rgb32to16 with 5-bit green: the "#if 1"
 * path uses pmaddwd with mul3215/mask3215g and final shifts of 6/10
 * instead of 5/11; the alternate path shifts green by 6 and red by 9.
 * Scalar tail gives the reference formula.  Loop/#if structure is
 * partially outside this excerpt.
 */
470 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
472 const uint8_t *s = src;
475 const uint8_t *mm_end;
477 uint16_t *d = (uint16_t *)dst;
481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
483 "movq %3, %%mm5 \n\t"
484 "movq %4, %%mm6 \n\t"
485 "movq %5, %%mm7 \n\t"
489 PREFETCH" 32(%1) \n\t"
490 "movd (%1), %%mm0 \n\t"
491 "movd 4(%1), %%mm3 \n\t"
492 "punpckldq 8(%1), %%mm0 \n\t"
493 "punpckldq 12(%1), %%mm3 \n\t"
494 "movq %%mm0, %%mm1 \n\t"
495 "movq %%mm3, %%mm4 \n\t"
496 "pand %%mm6, %%mm0 \n\t"
497 "pand %%mm6, %%mm3 \n\t"
498 "pmaddwd %%mm7, %%mm0 \n\t"
499 "pmaddwd %%mm7, %%mm3 \n\t"
500 "pand %%mm5, %%mm1 \n\t"
501 "pand %%mm5, %%mm4 \n\t"
502 "por %%mm1, %%mm0 \n\t"
503 "por %%mm4, %%mm3 \n\t"
504 "psrld $6, %%mm0 \n\t"
505 "pslld $10, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ" %%mm0, (%0) \n\t"
514 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
517 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
519 "movq %0, %%mm7 \n\t"
520 "movq %1, %%mm6 \n\t"
521 ::"m"(red_15mask),"m"(green_15mask));
525 "movd %1, %%mm0 \n\t"
526 "movd 4%1, %%mm3 \n\t"
527 "punpckldq 8%1, %%mm0 \n\t"
528 "punpckldq 12%1, %%mm3 \n\t"
529 "movq %%mm0, %%mm1 \n\t"
530 "movq %%mm0, %%mm2 \n\t"
531 "movq %%mm3, %%mm4 \n\t"
532 "movq %%mm3, %%mm5 \n\t"
533 "psrlq $3, %%mm0 \n\t"
534 "psrlq $3, %%mm3 \n\t"
535 "pand %2, %%mm0 \n\t"
536 "pand %2, %%mm3 \n\t"
537 "psrlq $6, %%mm1 \n\t"
538 "psrlq $6, %%mm4 \n\t"
539 "pand %%mm6, %%mm1 \n\t"
540 "pand %%mm6, %%mm4 \n\t"
541 "psrlq $9, %%mm2 \n\t"
542 "psrlq $9, %%mm5 \n\t"
543 "pand %%mm7, %%mm2 \n\t"
544 "pand %%mm7, %%mm5 \n\t"
545 "por %%mm1, %%mm0 \n\t"
546 "por %%mm4, %%mm3 \n\t"
547 "por %%mm2, %%mm0 \n\t"
548 "por %%mm5, %%mm3 \n\t"
549 "psllq $16, %%mm3 \n\t"
550 "por %%mm3, %%mm0 \n\t"
551 MOVNTQ" %%mm0, %0 \n\t"
552 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557 __asm__ volatile(SFENCE:::"memory");
558 __asm__ volatile(EMMS:::"memory");
/* scalar tail: B -> bits 0-4, G -> bits 5-9, R -> bits 10-14 */
561 register int rgb = *(const uint32_t*)s; s += 4;
562 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * 32-bit RGBA -> BGR555 (channel-swapped 15-bit): blue shifted up into
 * the red slot (psllq $7 + red_15mask), red shifted down into the blue
 * slot (psrlq $19), green kept in place — see the scalar tail for the
 * exact layout.  Loop control is outside this excerpt.
 */
566 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
568 const uint8_t *s = src;
571 const uint8_t *mm_end;
573 uint16_t *d = (uint16_t *)dst;
576 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
578 "movq %0, %%mm7 \n\t"
579 "movq %1, %%mm6 \n\t"
580 ::"m"(red_15mask),"m"(green_15mask));
585 "movd %1, %%mm0 \n\t"
586 "movd 4%1, %%mm3 \n\t"
587 "punpckldq 8%1, %%mm0 \n\t"
588 "punpckldq 12%1, %%mm3 \n\t"
589 "movq %%mm0, %%mm1 \n\t"
590 "movq %%mm0, %%mm2 \n\t"
591 "movq %%mm3, %%mm4 \n\t"
592 "movq %%mm3, %%mm5 \n\t"
593 "psllq $7, %%mm0 \n\t"
594 "psllq $7, %%mm3 \n\t"
595 "pand %%mm7, %%mm0 \n\t"
596 "pand %%mm7, %%mm3 \n\t"
597 "psrlq $6, %%mm1 \n\t"
598 "psrlq $6, %%mm4 \n\t"
599 "pand %%mm6, %%mm1 \n\t"
600 "pand %%mm6, %%mm4 \n\t"
601 "psrlq $19, %%mm2 \n\t"
602 "psrlq $19, %%mm5 \n\t"
603 "pand %2, %%mm2 \n\t"
604 "pand %2, %%mm5 \n\t"
605 "por %%mm1, %%mm0 \n\t"
606 "por %%mm4, %%mm3 \n\t"
607 "por %%mm2, %%mm0 \n\t"
608 "por %%mm5, %%mm3 \n\t"
609 "psllq $16, %%mm3 \n\t"
610 "por %%mm3, %%mm0 \n\t"
611 MOVNTQ" %%mm0, %0 \n\t"
612 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
616 __asm__ volatile(SFENCE:::"memory");
617 __asm__ volatile(EMMS:::"memory");
/* scalar tail: B -> bits 10-14, G -> bits 5-9, R -> bits 0-4 */
620 register int rgb = *(const uint32_t*)s; s += 4;
621 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * Packed 24-bit input -> 565 output.  Same per-field shift+mask scheme
 * as the 32-bit converters, but pixels are gathered from 3-byte strides
 * (movd/punpckldq at offsets 0/3/6/9).  Scalar tail (bottom) shows the
 * output layout; the b/g/r byte loads for it are outside this excerpt.
 */
625 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
627 const uint8_t *s = src;
630 const uint8_t *mm_end;
632 uint16_t *d = (uint16_t *)dst;
635 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
637 "movq %0, %%mm7 \n\t"
638 "movq %1, %%mm6 \n\t"
639 ::"m"(red_16mask),"m"(green_16mask));
644 "movd %1, %%mm0 \n\t"
645 "movd 3%1, %%mm3 \n\t"
646 "punpckldq 6%1, %%mm0 \n\t"
647 "punpckldq 9%1, %%mm3 \n\t"
648 "movq %%mm0, %%mm1 \n\t"
649 "movq %%mm0, %%mm2 \n\t"
650 "movq %%mm3, %%mm4 \n\t"
651 "movq %%mm3, %%mm5 \n\t"
652 "psrlq $3, %%mm0 \n\t"
653 "psrlq $3, %%mm3 \n\t"
654 "pand %2, %%mm0 \n\t"
655 "pand %2, %%mm3 \n\t"
656 "psrlq $5, %%mm1 \n\t"
657 "psrlq $5, %%mm4 \n\t"
658 "pand %%mm6, %%mm1 \n\t"
659 "pand %%mm6, %%mm4 \n\t"
660 "psrlq $8, %%mm2 \n\t"
661 "psrlq $8, %%mm5 \n\t"
662 "pand %%mm7, %%mm2 \n\t"
663 "pand %%mm7, %%mm5 \n\t"
664 "por %%mm1, %%mm0 \n\t"
665 "por %%mm4, %%mm3 \n\t"
666 "por %%mm2, %%mm0 \n\t"
667 "por %%mm5, %%mm3 \n\t"
668 "psllq $16, %%mm3 \n\t"
669 "por %%mm3, %%mm0 \n\t"
670 MOVNTQ" %%mm0, %0 \n\t"
671 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
675 __asm__ volatile(SFENCE:::"memory");
676 __asm__ volatile(EMMS:::"memory");
682 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit input -> 565 output, channel order swapped relative to
 * rgb24tobgr16: here the first byte is shifted UP (psllq $8 into the
 * red_16mask position) and the third byte DOWN (psrlq $19 into the
 * blue position).  Scalar tail shows the output layout; its b/g/r byte
 * loads are outside this excerpt.
 */
686 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
688 const uint8_t *s = src;
691 const uint8_t *mm_end;
693 uint16_t *d = (uint16_t *)dst;
696 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
698 "movq %0, %%mm7 \n\t"
699 "movq %1, %%mm6 \n\t"
700 ::"m"(red_16mask),"m"(green_16mask));
705 "movd %1, %%mm0 \n\t"
706 "movd 3%1, %%mm3 \n\t"
707 "punpckldq 6%1, %%mm0 \n\t"
708 "punpckldq 9%1, %%mm3 \n\t"
709 "movq %%mm0, %%mm1 \n\t"
710 "movq %%mm0, %%mm2 \n\t"
711 "movq %%mm3, %%mm4 \n\t"
712 "movq %%mm3, %%mm5 \n\t"
713 "psllq $8, %%mm0 \n\t"
714 "psllq $8, %%mm3 \n\t"
715 "pand %%mm7, %%mm0 \n\t"
716 "pand %%mm7, %%mm3 \n\t"
717 "psrlq $5, %%mm1 \n\t"
718 "psrlq $5, %%mm4 \n\t"
719 "pand %%mm6, %%mm1 \n\t"
720 "pand %%mm6, %%mm4 \n\t"
721 "psrlq $19, %%mm2 \n\t"
722 "psrlq $19, %%mm5 \n\t"
723 "pand %2, %%mm2 \n\t"
724 "pand %2, %%mm5 \n\t"
725 "por %%mm1, %%mm0 \n\t"
726 "por %%mm4, %%mm3 \n\t"
727 "por %%mm2, %%mm0 \n\t"
728 "por %%mm5, %%mm3 \n\t"
729 "psllq $16, %%mm3 \n\t"
730 "por %%mm3, %%mm0 \n\t"
731 MOVNTQ" %%mm0, %0 \n\t"
732 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
736 __asm__ volatile(SFENCE:::"memory");
737 __asm__ volatile(EMMS:::"memory");
743 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit input -> 555 output: identical structure to
 * rgb24tobgr16 but with the 5-bit-green shift amounts (6 and 9) and the
 * *_15mask constants.  Scalar tail shows the output layout; its b/g/r
 * byte loads are outside this excerpt.
 */
747 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
749 const uint8_t *s = src;
752 const uint8_t *mm_end;
754 uint16_t *d = (uint16_t *)dst;
757 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
759 "movq %0, %%mm7 \n\t"
760 "movq %1, %%mm6 \n\t"
761 ::"m"(red_15mask),"m"(green_15mask));
766 "movd %1, %%mm0 \n\t"
767 "movd 3%1, %%mm3 \n\t"
768 "punpckldq 6%1, %%mm0 \n\t"
769 "punpckldq 9%1, %%mm3 \n\t"
770 "movq %%mm0, %%mm1 \n\t"
771 "movq %%mm0, %%mm2 \n\t"
772 "movq %%mm3, %%mm4 \n\t"
773 "movq %%mm3, %%mm5 \n\t"
774 "psrlq $3, %%mm0 \n\t"
775 "psrlq $3, %%mm3 \n\t"
776 "pand %2, %%mm0 \n\t"
777 "pand %2, %%mm3 \n\t"
778 "psrlq $6, %%mm1 \n\t"
779 "psrlq $6, %%mm4 \n\t"
780 "pand %%mm6, %%mm1 \n\t"
781 "pand %%mm6, %%mm4 \n\t"
782 "psrlq $9, %%mm2 \n\t"
783 "psrlq $9, %%mm5 \n\t"
784 "pand %%mm7, %%mm2 \n\t"
785 "pand %%mm7, %%mm5 \n\t"
786 "por %%mm1, %%mm0 \n\t"
787 "por %%mm4, %%mm3 \n\t"
788 "por %%mm2, %%mm0 \n\t"
789 "por %%mm5, %%mm3 \n\t"
790 "psllq $16, %%mm3 \n\t"
791 "por %%mm3, %%mm0 \n\t"
792 MOVNTQ" %%mm0, %0 \n\t"
793 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
797 __asm__ volatile(SFENCE:::"memory");
798 __asm__ volatile(EMMS:::"memory");
804 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * Packed 24-bit input -> 555 output, channel order swapped relative to
 * rgb24tobgr15 (first byte up via psllq $7 into red_15mask, third byte
 * down via psrlq $19 into blue).  Scalar tail shows the output layout;
 * its b/g/r byte loads are outside this excerpt.
 */
808 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
810 const uint8_t *s = src;
813 const uint8_t *mm_end;
815 uint16_t *d = (uint16_t *)dst;
818 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
820 "movq %0, %%mm7 \n\t"
821 "movq %1, %%mm6 \n\t"
822 ::"m"(red_15mask),"m"(green_15mask));
827 "movd %1, %%mm0 \n\t"
828 "movd 3%1, %%mm3 \n\t"
829 "punpckldq 6%1, %%mm0 \n\t"
830 "punpckldq 9%1, %%mm3 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "movq %%mm0, %%mm2 \n\t"
833 "movq %%mm3, %%mm4 \n\t"
834 "movq %%mm3, %%mm5 \n\t"
835 "psllq $7, %%mm0 \n\t"
836 "psllq $7, %%mm3 \n\t"
837 "pand %%mm7, %%mm0 \n\t"
838 "pand %%mm7, %%mm3 \n\t"
839 "psrlq $6, %%mm1 \n\t"
840 "psrlq $6, %%mm4 \n\t"
841 "pand %%mm6, %%mm1 \n\t"
842 "pand %%mm6, %%mm4 \n\t"
843 "psrlq $19, %%mm2 \n\t"
844 "psrlq $19, %%mm5 \n\t"
845 "pand %2, %%mm2 \n\t"
846 "pand %2, %%mm5 \n\t"
847 "por %%mm1, %%mm0 \n\t"
848 "por %%mm4, %%mm3 \n\t"
849 "por %%mm2, %%mm0 \n\t"
850 "por %%mm5, %%mm3 \n\t"
851 "psllq $16, %%mm3 \n\t"
852 "por %%mm3, %%mm0 \n\t"
853 MOVNTQ" %%mm0, %0 \n\t"
854 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
858 __asm__ volatile(SFENCE:::"memory");
859 __asm__ volatile(EMMS:::"memory");
865 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
870 I use less accurate approximation here by simply left-shifting the input
871 value and filling the low order bits with zeroes. This method improves PNG
872 compression but this scheme cannot reproduce white exactly, since it does
873 not generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | leftmost bits repeated to fill open bits
/*
 * RGB555 -> packed 24-bit BGR.  Per the note above, each 5-bit field is
 * expanded by a plain left shift (low bits zero-filled), not by bit
 * replication.  The asm isolates B/G/R with mask15b/mask15g/mask15r,
 * scales them into byte position (<<3, >>2, >>7), widens words to
 * dwords with punpck*wd against mmx_null, and recombines; a second
 * 8-byte batch is processed into mm6/mm7 and then repacked via the
 * "borrowed 32 to 24" sequence (STORE_BGR24_MMX invocation is outside
 * this excerpt).  Scalar tail handles the remainder one pixel at a time.
 */
890 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
894 const uint16_t *mm_end;
897 const uint16_t *s = (const uint16_t*)src;
898 end = s + src_size/2;
900 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
905 "movq %1, %%mm0 \n\t"
906 "movq %1, %%mm1 \n\t"
907 "movq %1, %%mm2 \n\t"
908 "pand %2, %%mm0 \n\t"
909 "pand %3, %%mm1 \n\t"
910 "pand %4, %%mm2 \n\t"
911 "psllq $3, %%mm0 \n\t"
912 "psrlq $2, %%mm1 \n\t"
913 "psrlq $7, %%mm2 \n\t"
914 "movq %%mm0, %%mm3 \n\t"
915 "movq %%mm1, %%mm4 \n\t"
916 "movq %%mm2, %%mm5 \n\t"
917 "punpcklwd %5, %%mm0 \n\t"
918 "punpcklwd %5, %%mm1 \n\t"
919 "punpcklwd %5, %%mm2 \n\t"
920 "punpckhwd %5, %%mm3 \n\t"
921 "punpckhwd %5, %%mm4 \n\t"
922 "punpckhwd %5, %%mm5 \n\t"
923 "psllq $8, %%mm1 \n\t"
924 "psllq $16, %%mm2 \n\t"
925 "por %%mm1, %%mm0 \n\t"
926 "por %%mm2, %%mm0 \n\t"
927 "psllq $8, %%mm4 \n\t"
928 "psllq $16, %%mm5 \n\t"
929 "por %%mm4, %%mm3 \n\t"
930 "por %%mm5, %%mm3 \n\t"
932 "movq %%mm0, %%mm6 \n\t"
933 "movq %%mm3, %%mm7 \n\t"
935 "movq 8%1, %%mm0 \n\t"
936 "movq 8%1, %%mm1 \n\t"
937 "movq 8%1, %%mm2 \n\t"
938 "pand %2, %%mm0 \n\t"
939 "pand %3, %%mm1 \n\t"
940 "pand %4, %%mm2 \n\t"
941 "psllq $3, %%mm0 \n\t"
942 "psrlq $2, %%mm1 \n\t"
943 "psrlq $7, %%mm2 \n\t"
944 "movq %%mm0, %%mm3 \n\t"
945 "movq %%mm1, %%mm4 \n\t"
946 "movq %%mm2, %%mm5 \n\t"
947 "punpcklwd %5, %%mm0 \n\t"
948 "punpcklwd %5, %%mm1 \n\t"
949 "punpcklwd %5, %%mm2 \n\t"
950 "punpckhwd %5, %%mm3 \n\t"
951 "punpckhwd %5, %%mm4 \n\t"
952 "punpckhwd %5, %%mm5 \n\t"
953 "psllq $8, %%mm1 \n\t"
954 "psllq $16, %%mm2 \n\t"
955 "por %%mm1, %%mm0 \n\t"
956 "por %%mm2, %%mm0 \n\t"
957 "psllq $8, %%mm4 \n\t"
958 "psllq $16, %%mm5 \n\t"
959 "por %%mm4, %%mm3 \n\t"
960 "por %%mm5, %%mm3 \n\t"
963 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965 /* borrowed 32 to 24 */
967 "movq %%mm0, %%mm4 \n\t"
968 "movq %%mm3, %%mm5 \n\t"
969 "movq %%mm6, %%mm0 \n\t"
970 "movq %%mm7, %%mm1 \n\t"
972 "movq %%mm4, %%mm6 \n\t"
973 "movq %%mm5, %%mm7 \n\t"
974 "movq %%mm0, %%mm2 \n\t"
975 "movq %%mm1, %%mm3 \n\t"
985 __asm__ volatile(SFENCE:::"memory");
986 __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand each 5-bit field with a left shift */
989 register uint16_t bgr;
991 *d++ = (bgr&0x1F)<<3;
992 *d++ = (bgr&0x3E0)>>2;
993 *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> packed 24-bit BGR.  Same structure as rgb15tobgr24, with
 * the 6-bit-green masks (mask16b/mask16g/mask16r) and shift amounts
 * (>>3 for green, >>8 for red).  Two 8-byte batches are expanded to
 * byte-per-channel dwords and repacked via the "borrowed 32 to 24"
 * sequence (STORE_BGR24_MMX invocation outside this excerpt).  Scalar
 * tail expands each field with a plain left shift (low bits zeroed).
 */
997 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1001 const uint16_t *mm_end;
1003 uint8_t *d = (uint8_t *)dst;
1004 const uint16_t *s = (const uint16_t *)src;
1005 end = s + src_size/2;
1007 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1009 while (s < mm_end) {
1011 PREFETCH" 32%1 \n\t"
1012 "movq %1, %%mm0 \n\t"
1013 "movq %1, %%mm1 \n\t"
1014 "movq %1, %%mm2 \n\t"
1015 "pand %2, %%mm0 \n\t"
1016 "pand %3, %%mm1 \n\t"
1017 "pand %4, %%mm2 \n\t"
1018 "psllq $3, %%mm0 \n\t"
1019 "psrlq $3, %%mm1 \n\t"
1020 "psrlq $8, %%mm2 \n\t"
1021 "movq %%mm0, %%mm3 \n\t"
1022 "movq %%mm1, %%mm4 \n\t"
1023 "movq %%mm2, %%mm5 \n\t"
1024 "punpcklwd %5, %%mm0 \n\t"
1025 "punpcklwd %5, %%mm1 \n\t"
1026 "punpcklwd %5, %%mm2 \n\t"
1027 "punpckhwd %5, %%mm3 \n\t"
1028 "punpckhwd %5, %%mm4 \n\t"
1029 "punpckhwd %5, %%mm5 \n\t"
1030 "psllq $8, %%mm1 \n\t"
1031 "psllq $16, %%mm2 \n\t"
1032 "por %%mm1, %%mm0 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psllq $8, %%mm4 \n\t"
1035 "psllq $16, %%mm5 \n\t"
1036 "por %%mm4, %%mm3 \n\t"
1037 "por %%mm5, %%mm3 \n\t"
1039 "movq %%mm0, %%mm6 \n\t"
1040 "movq %%mm3, %%mm7 \n\t"
1042 "movq 8%1, %%mm0 \n\t"
1043 "movq 8%1, %%mm1 \n\t"
1044 "movq 8%1, %%mm2 \n\t"
1045 "pand %2, %%mm0 \n\t"
1046 "pand %3, %%mm1 \n\t"
1047 "pand %4, %%mm2 \n\t"
1048 "psllq $3, %%mm0 \n\t"
1049 "psrlq $3, %%mm1 \n\t"
1050 "psrlq $8, %%mm2 \n\t"
1051 "movq %%mm0, %%mm3 \n\t"
1052 "movq %%mm1, %%mm4 \n\t"
1053 "movq %%mm2, %%mm5 \n\t"
1054 "punpcklwd %5, %%mm0 \n\t"
1055 "punpcklwd %5, %%mm1 \n\t"
1056 "punpcklwd %5, %%mm2 \n\t"
1057 "punpckhwd %5, %%mm3 \n\t"
1058 "punpckhwd %5, %%mm4 \n\t"
1059 "punpckhwd %5, %%mm5 \n\t"
1060 "psllq $8, %%mm1 \n\t"
1061 "psllq $16, %%mm2 \n\t"
1062 "por %%mm1, %%mm0 \n\t"
1063 "por %%mm2, %%mm0 \n\t"
1064 "psllq $8, %%mm4 \n\t"
1065 "psllq $16, %%mm5 \n\t"
1066 "por %%mm4, %%mm3 \n\t"
1067 "por %%mm5, %%mm3 \n\t"
1069 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1071 /* borrowed 32 to 24 */
1073 "movq %%mm0, %%mm4 \n\t"
1074 "movq %%mm3, %%mm5 \n\t"
1075 "movq %%mm6, %%mm0 \n\t"
1076 "movq %%mm7, %%mm1 \n\t"
1078 "movq %%mm4, %%mm6 \n\t"
1079 "movq %%mm5, %%mm7 \n\t"
1080 "movq %%mm0, %%mm2 \n\t"
1081 "movq %%mm1, %%mm3 \n\t"
1091 __asm__ volatile(SFENCE:::"memory");
1092 __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand 5-6-5 fields with left shifts */
1095 register uint16_t bgr;
1097 *d++ = (bgr&0x1F)<<3;
1098 *d++ = (bgr&0x7E0)>>3;
1099 *d++ = (bgr&0xF800)>>8;
/*
 * Pack four planar byte-per-channel pixels (layout documented below)
 * into four 32-bit pixels with an all-ones filler/alpha byte (from mm6)
 * and store 16 bytes with MOVNTQ.  Used by the 15/16 -> 32 converters.
 */
1104 * mm0 = 00 B3 00 B2 00 B1 00 B0
1105 * mm1 = 00 G3 00 G2 00 G1 00 G0
1106 * mm2 = 00 R3 00 R2 00 R1 00 R0
1107 * mm6 = FF FF FF FF FF FF FF FF
1108 * mm7 = 00 00 00 00 00 00 00 00
1110 #define PACK_RGB32 \
1111 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1112 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1113 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1114 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1115 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1116 "movq %%mm0, %%mm3 \n\t" \
1117 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1118 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1119 MOVNTQ" %%mm0, %0 \n\t" \
1120 MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * RGB555 -> 32-bit output.  mm7 is zeroed and mm6 set to all-ones as
 * required by PACK_RGB32 (invocation outside this excerpt); each 5-bit
 * field is isolated and left-shift-expanded to a byte.  The scalar tail
 * has both byte orders — the #if/#else lines selecting between them
 * (presumably big- vs little-endian) are not visible here; confirm
 * against the full file.
 */
1122 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1124 const uint16_t *end;
1126 const uint16_t *mm_end;
1129 const uint16_t *s = (const uint16_t *)src;
1130 end = s + src_size/2;
1132 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1133 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1134 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1136 while (s < mm_end) {
1138 PREFETCH" 32%1 \n\t"
1139 "movq %1, %%mm0 \n\t"
1140 "movq %1, %%mm1 \n\t"
1141 "movq %1, %%mm2 \n\t"
1142 "pand %2, %%mm0 \n\t"
1143 "pand %3, %%mm1 \n\t"
1144 "pand %4, %%mm2 \n\t"
1145 "psllq $3, %%mm0 \n\t"
1146 "psrlq $2, %%mm1 \n\t"
1147 "psrlq $7, %%mm2 \n\t"
1150 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1155 __asm__ volatile(SFENCE:::"memory");
1156 __asm__ volatile(EMMS:::"memory");
1159 register uint16_t bgr;
1163 *d++ = (bgr&0x7C00)>>7;
1164 *d++ = (bgr&0x3E0)>>2;
1165 *d++ = (bgr&0x1F)<<3;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x3E0)>>2;
1169 *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> 32-bit output.  Same scheme as rgb15to32 with the 6-bit
 * green masks and shifts (>>3 green, >>8 red); PACK_RGB32 invocation
 * and the loop tail are outside this excerpt.  The scalar tail again
 * has both byte orders — the selecting #if lines are not visible here.
 */
1175 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1177 const uint16_t *end;
1179 const uint16_t *mm_end;
1182 const uint16_t *s = (const uint16_t*)src;
1183 end = s + src_size/2;
1185 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1186 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1187 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1189 while (s < mm_end) {
1191 PREFETCH" 32%1 \n\t"
1192 "movq %1, %%mm0 \n\t"
1193 "movq %1, %%mm1 \n\t"
1194 "movq %1, %%mm2 \n\t"
1195 "pand %2, %%mm0 \n\t"
1196 "pand %3, %%mm1 \n\t"
1197 "pand %4, %%mm2 \n\t"
1198 "psllq $3, %%mm0 \n\t"
1199 "psrlq $3, %%mm1 \n\t"
1200 "psrlq $8, %%mm2 \n\t"
1203 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1208 __asm__ volatile(SFENCE:::"memory");
1209 __asm__ volatile(EMMS:::"memory");
1212 register uint16_t bgr;
1216 *d++ = (bgr&0xF800)>>8;
1217 *d++ = (bgr&0x7E0)>>3;
1218 *d++ = (bgr&0x1F)<<3;
1220 *d++ = (bgr&0x1F)<<3;
1221 *d++ = (bgr&0x7E0)>>3;
1222 *d++ = (bgr&0xF800)>>8;
/*
 * Swap R and B in 32-bit pixels (RGBA <-> BGRA).  idx runs from
 * 15 - src_size up toward 15 so the pointers s/d can be pre-biased and
 * indexed with a single register.  mm7/mm6 are built from mask32b,
 * mask32r and mmx_one via pxor; one path uses MMX2 pshufw to swap the
 * 16-bit halves, the other does it with shift/mask pairs (the #if
 * selecting between them is outside this excerpt).  The scalar tail
 * swaps via (v>>16) + g + (v<<16), relying on the uint32_t truncation
 * to drop the bits shifted out.
 */
1228 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1230 x86_reg idx = 15 - src_size;
1231 const uint8_t *s = src-idx;
1232 uint8_t *d = dst-idx;
1237 PREFETCH" (%1, %0) \n\t"
1238 "movq %3, %%mm7 \n\t"
1239 "pxor %4, %%mm7 \n\t"
1240 "movq %%mm7, %%mm6 \n\t"
1241 "pxor %5, %%mm7 \n\t"
1244 PREFETCH" 32(%1, %0) \n\t"
1245 "movq (%1, %0), %%mm0 \n\t"
1246 "movq 8(%1, %0), %%mm1 \n\t"
1248 "pshufw $177, %%mm0, %%mm3 \n\t"
1249 "pshufw $177, %%mm1, %%mm5 \n\t"
1250 "pand %%mm7, %%mm0 \n\t"
1251 "pand %%mm6, %%mm3 \n\t"
1252 "pand %%mm7, %%mm1 \n\t"
1253 "pand %%mm6, %%mm5 \n\t"
1254 "por %%mm3, %%mm0 \n\t"
1255 "por %%mm5, %%mm1 \n\t"
1257 "movq %%mm0, %%mm2 \n\t"
1258 "movq %%mm1, %%mm4 \n\t"
1259 "pand %%mm7, %%mm0 \n\t"
1260 "pand %%mm6, %%mm2 \n\t"
1261 "pand %%mm7, %%mm1 \n\t"
1262 "pand %%mm6, %%mm4 \n\t"
1263 "movq %%mm2, %%mm3 \n\t"
1264 "movq %%mm4, %%mm5 \n\t"
1265 "pslld $16, %%mm2 \n\t"
1266 "psrld $16, %%mm3 \n\t"
1267 "pslld $16, %%mm4 \n\t"
1268 "psrld $16, %%mm5 \n\t"
1269 "por %%mm2, %%mm0 \n\t"
1270 "por %%mm4, %%mm1 \n\t"
1271 "por %%mm3, %%mm0 \n\t"
1272 "por %%mm5, %%mm1 \n\t"
1274 MOVNTQ" %%mm0, (%2, %0) \n\t"
1275 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1282 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1285 for (; idx<15; idx+=4) {
1286 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1288 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * Swap R and B in packed 24-bit pixels.  mmx_size counts up from
 * 23 - src_size toward 23 in REG_a (pointers pre-biased by -mmx_size),
 * processing 24 bytes (8 pixels) per iteration: each output quadword is
 * assembled from three overlapping loads masked with mask24r/g/b so
 * every channel lands in its swapped position.  If src_size was a
 * multiple of 8 pixels (mmx_size==23) the asm loop did everything;
 * otherwise the scalar tail re-copies the last partial group with
 * bytes 0 and 2 exchanged.
 */
1292 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1296 x86_reg mmx_size= 23 - src_size;
1298 "test %%"REG_a", %%"REG_a" \n\t"
1300 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1301 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1302 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1305 PREFETCH" 32(%1, %%"REG_a") \n\t"
1306 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1307 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1308 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1309 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1310 "pand %%mm5, %%mm0 \n\t"
1311 "pand %%mm6, %%mm1 \n\t"
1312 "pand %%mm7, %%mm2 \n\t"
1313 "por %%mm0, %%mm1 \n\t"
1314 "por %%mm2, %%mm1 \n\t"
1315 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1316 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1317 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1318 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1319 "pand %%mm7, %%mm0 \n\t"
1320 "pand %%mm5, %%mm1 \n\t"
1321 "pand %%mm6, %%mm2 \n\t"
1322 "por %%mm0, %%mm1 \n\t"
1323 "por %%mm2, %%mm1 \n\t"
1324 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1325 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1326 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1327 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1328 "pand %%mm6, %%mm0 \n\t"
1329 "pand %%mm7, %%mm1 \n\t"
1330 "pand %%mm5, %%mm2 \n\t"
1331 "por %%mm0, %%mm1 \n\t"
1332 "por %%mm2, %%mm1 \n\t"
1333 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1334 "add $24, %%"REG_a" \n\t"
1338 : "r" (src-mmx_size), "r"(dst-mmx_size)
1341 __asm__ volatile(SFENCE:::"memory");
1342 __asm__ volatile(EMMS:::"memory");
1344 if (mmx_size==23) return; //finished, was multiple of 8
1348 src_size= 23-mmx_size;
1352 for (i=0; i<src_size; i+=3) {
1355 dst[i + 1] = src[i + 1];
1356 dst[i + 2] = src[i + 0];
/*
 * Interleave planar Y/U/V into packed YUY2.  One luma line is emitted
 * per iteration; the chroma pointers advance only every
 * vertLumPerChroma-th line (vertical subsampling factor — the &-mask at
 * the bottom requires it to be a power of two).  Paths, selected by
 * #if lines outside this excerpt: x86 MMX (punpcklbw/punpckhbw
 * interleaving, non-temporal stores), Alpha MVI (unpkbw/unpkbl),
 * generic 64-bit, and generic 32-bit; the 32-bit path has both byte
 * orders — presumably big- vs little-endian, confirm against the full
 * file.  chromWidth = width>>1, so width is assumed even.
 */
1361 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362 long width, long height,
1363 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1366 const x86_reg chromWidth= width>>1;
1367 for (y=0; y<height; y++) {
1369 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1371 "xor %%"REG_a", %%"REG_a" \n\t"
1374 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1375 PREFETCH" 32(%2, %%"REG_a") \n\t"
1376 PREFETCH" 32(%3, %%"REG_a") \n\t"
1377 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1378 "movq %%mm0, %%mm2 \n\t" // U(0)
1379 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1380 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1381 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1383 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1384 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1385 "movq %%mm3, %%mm4 \n\t" // Y(0)
1386 "movq %%mm5, %%mm6 \n\t" // Y(8)
1387 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1388 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1389 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1390 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1392 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1393 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1394 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1395 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1397 "add $8, %%"REG_a" \n\t"
1398 "cmp %4, %%"REG_a" \n\t"
1400 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1405 #if ARCH_ALPHA && HAVE_MVI
1406 #define pl2yuy2(n) \
1411 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1412 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1413 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1414 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1415 yuv1 = (u << 8) + (v << 24); \
1422 uint64_t *qdst = (uint64_t *) dst;
1423 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424 const uint32_t *yc = (uint32_t *) ysrc;
1425 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427 for (i = 0; i < chromWidth; i += 8) {
1428 uint64_t y1, y2, yuv1, yuv2;
1431 __asm__("ldq $31,64(%0)" :: "r"(yc));
1432 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433 __asm__("ldq $31,64(%0)" :: "r"(uc));
1434 __asm__("ldq $31,64(%0)" :: "r"(vc));
1452 #elif HAVE_FAST_64BIT
1454 uint64_t *ldst = (uint64_t *) dst;
1455 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456 for (i = 0; i < chromWidth; i += 2) {
1458 k = yc[0] + (uc[0] << 8) +
1459 (yc[1] << 16) + (vc[0] << 24);
1460 l = yc[2] + (uc[1] << 8) +
1461 (yc[3] << 16) + (vc[1] << 24);
1462 *ldst++ = k + (l << 32);
1469 int i, *idst = (int32_t *) dst;
1470 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471 for (i = 0; i < chromWidth; i++) {
1473 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474 (yc[1] << 8) + (vc[0] << 0);
1476 *idst++ = yc[0] + (uc[0] << 8) +
1477 (yc[1] << 16) + (vc[0] << 24);
/* advance chroma only once per vertLumPerChroma luma lines */
1485 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486 usrc += chromStride;
1487 vsrc += chromStride;
1500 * Height should be a multiple of 2 and width should be a multiple of 16.
1501 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 (planar 4:2:0) -> packed YUY2 wrapper.
 * vertLumPerChroma=2: the worker reuses each chroma line for two luma lines. */
1503 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504 long width, long height,
1505 long lumStride, long chromStride, long dstStride)
1507 //FIXME interpolate chroma
1508 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/* Planar YUV -> packed UYVY.  The MMX loop interleaves 8 U and 8 V bytes into
 * UVUV words, then byte-unpacks them against 16 Y samples to emit 16 UYVY
 * pixels per iteration (non-temporal stores).  vertLumPerChroma = number of
 * luma lines sharing one chroma line (2 for 4:2:0 input, 1 for 4:2:2). */
1511 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512 long width, long height,
1513 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1516 const x86_reg chromWidth= width>>1;
1517 for (y=0; y<height; y++) {
1519 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1521 "xor %%"REG_a", %%"REG_a" \n\t"
1524 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1525 PREFETCH" 32(%2, %%"REG_a") \n\t"
1526 PREFETCH" 32(%3, %%"REG_a") \n\t"
1527 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1528 "movq %%mm0, %%mm2 \n\t" // U(0)
1529 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1530 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1531 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1533 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1534 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1535 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1536 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1537 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1538 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1539 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1540 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1542 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1543 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1544 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1545 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1547 "add $8, %%"REG_a" \n\t"
1548 "cmp %4, %%"REG_a" \n\t"
1550 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1554 //FIXME adapt the Alpha ASM code from yv12->yuy2
/* 64-bit scalar fallback: two UYVY pixel pairs packed per 64-bit store. */
1558 uint64_t *ldst = (uint64_t *) dst;
1559 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560 for (i = 0; i < chromWidth; i += 2) {
1562 k = uc[0] + (yc[0] << 8) +
1563 (vc[0] << 16) + (yc[1] << 24);
1564 l = uc[1] + (yc[2] << 8) +
1565 (vc[1] << 16) + (yc[3] << 24);
1566 *ldst++ = k + (l << 32);
/* 32-bit scalar fallback: one UYVY pixel pair per store; the two variants
 * below handle big- and little-endian byte orders respectively. */
1573 int i, *idst = (int32_t *) dst;
1574 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575 for (i = 0; i < chromWidth; i++) {
1577 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578 (vc[0] << 8) + (yc[1] << 0);
1580 *idst++ = uc[0] + (yc[0] << 8) +
1581 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma pointers once every vertLumPerChroma luma lines
 * (mask test requires vertLumPerChroma to be a power of two). */
1589 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590 usrc += chromStride;
1591 vsrc += chromStride;
1604 * Height should be a multiple of 2 and width should be a multiple of 16
1605 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 (planar 4:2:0) -> packed UYVY wrapper; see yuvPlanartouyvy. */
1607 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608 long width, long height,
1609 long lumStride, long chromStride, long dstStride)
1611 //FIXME interpolate chroma
1612 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1616 * Width should be a multiple of 16.
/* Planar 4:2:2 -> packed UYVY: each luma line has its own chroma line (1). */
1618 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619 long width, long height,
1620 long lumStride, long chromStride, long dstStride)
1622 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1626 * Width should be a multiple of 16.
/* Planar 4:2:2 -> packed YUY2: each luma line has its own chroma line (1). */
1628 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629 long width, long height,
1630 long lumStride, long chromStride, long dstStride)
1632 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1636 * Height should be a multiple of 2 and width should be a multiple of 16.
1637 * (If this is a problem for anyone then tell me, and I will fix it.)
/* Packed YUY2 -> planar YV12.  Rows are processed in pairs: the even row is
 * split into Y plus deinterleaved U/V planes; the odd row contributes luma
 * only (its chroma samples are discarded, not averaged). */
1639 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640 long width, long height,
1641 long lumStride, long chromStride, long srcStride)
1644 const x86_reg chromWidth= width>>1;
1645 for (y=0; y<height; y+=2) {
1648 "xor %%"REG_a", %%"REG_a" \n\t"
1649 "pcmpeqw %%mm7, %%mm7 \n\t"
1650 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1653 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1654 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1655 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1656 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1657 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1658 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1659 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1660 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1661 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1662 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1663 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1665 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1667 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1668 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1669 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1670 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1671 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1672 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1673 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1674 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1675 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1676 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1678 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Split the interleaved UVUV words into separate U and V planes. */
1680 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1681 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1682 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1683 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1684 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1685 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1686 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1687 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1689 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1690 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1692 "add $8, %%"REG_a" \n\t"
1693 "cmp %4, %%"REG_a" \n\t"
1695 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696 : "memory", "%"REG_a
/* Second (odd) line of the pair: extract luma only (mm7 mask reused). */
1703 "xor %%"REG_a", %%"REG_a" \n\t"
1706 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1707 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1708 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1709 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1710 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1711 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1712 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1713 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1714 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1715 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1716 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1718 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1719 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1721 "add $8, %%"REG_a" \n\t"
1722 "cmp %4, %%"REG_a" \n\t"
1725 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726 : "memory", "%"REG_a
/* C fallback, even line: demultiplex Y/U/V from YUYV quads. */
1730 for (i=0; i<chromWidth; i++) {
1731 ydst[2*i+0] = src[4*i+0];
1732 udst[i] = src[4*i+1];
1733 ydst[2*i+1] = src[4*i+2];
1734 vdst[i] = src[4*i+3];
/* C fallback, odd line: luma only. */
1739 for (i=0; i<chromWidth; i++) {
1740 ydst[2*i+0] = src[4*i+0];
1741 ydst[2*i+1] = src[4*i+2];
1744 udst += chromStride;
1745 vdst += chromStride;
/* Leave the MMX/FPU state clean for subsequent floating-point code. */
1750 __asm__ volatile(EMMS" \n\t"
/* YVU9 (chroma subsampled 4x4) -> YV12: only the luma plane is copied;
 * chroma upscaling is unimplemented (see XXX below).
 * NOTE(review): the flat memcpy assumes lumStride == width — confirm that
 * callers guarantee a packed luma plane. */
1756 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1757 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1758 long width, long height, long lumStride, long chromStride)
1761 memcpy(ydst, ysrc, width*height);
1763 /* XXX: implement upscaling for U,V */
/* Upscale one plane 2x horizontally and vertically using a 3:1 / 1:3
 * bilinear kernel; first/last rows and edge columns get reduced kernels. */
1766 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* First output row: horizontal interpolation only. */
1773 for (x=0; x<srcWidth-1; x++) {
1774 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1775 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1777 dst[2*srcWidth-1]= src[srcWidth-1];
1781 for (y=1; y<srcHeight; y++) {
1782 #if HAVE_MMX2 || HAVE_AMD3DNOW
1783 const x86_reg mmxSize= srcWidth&~15;
1785 "mov %4, %%"REG_a" \n\t"
1787 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1788 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1789 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1790 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1791 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1792 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* Applying PAVGB twice against the same register approximates a 3:1 blend:
 * avg(a, avg(a, b)) == (3a + b)/4 up to rounding. */
1793 PAVGB" %%mm0, %%mm5 \n\t"
1794 PAVGB" %%mm0, %%mm3 \n\t"
1795 PAVGB" %%mm0, %%mm5 \n\t"
1796 PAVGB" %%mm0, %%mm3 \n\t"
1797 PAVGB" %%mm1, %%mm4 \n\t"
1798 PAVGB" %%mm1, %%mm2 \n\t"
1799 PAVGB" %%mm1, %%mm4 \n\t"
1800 PAVGB" %%mm1, %%mm2 \n\t"
1801 "movq %%mm5, %%mm7 \n\t"
1802 "movq %%mm4, %%mm6 \n\t"
1803 "punpcklbw %%mm3, %%mm5 \n\t"
1804 "punpckhbw %%mm3, %%mm7 \n\t"
1805 "punpcklbw %%mm2, %%mm4 \n\t"
1806 "punpckhbw %%mm2, %%mm6 \n\t"
1808 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1809 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1810 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1811 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1813 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1814 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1815 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1816 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1818 "add $8, %%"REG_a" \n\t"
1820 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1821 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1827 const x86_reg mmxSize=1;
/* Left edge: vertical-only interpolation for output column 0 of both rows. */
1829 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1830 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar tail covering the columns the MMX loop did not process. */
1832 for (x=mmxSize-1; x<srcWidth-1; x++) {
1833 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1834 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1835 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1836 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1838 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1839 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation of the final source row. */
1849 for (x=0; x<srcWidth-1; x++) {
1850 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1851 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1853 dst[2*srcWidth-1]= src[srcWidth-1];
1855 for (x=0; x<srcWidth; x++) {
/* Leave the MMX/FPU state clean for subsequent floating-point code. */
1862 __asm__ volatile(EMMS" \n\t"
1869 * Height should be a multiple of 2 and width should be a multiple of 16.
1870 * (If this is a problem for anyone then tell me, and I will fix it.)
1871 * Chrominance data is only taken from every second line, others are ignored.
1872 * FIXME: Write HQ version.
/* Packed UYVY -> planar YV12; mirror of yuy2toyv12 with the mask/shift roles
 * swapped because in UYVY the chroma bytes are the low bytes of each word. */
1874 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1875 long width, long height,
1876 long lumStride, long chromStride, long srcStride)
1879 const x86_reg chromWidth= width>>1;
1880 for (y=0; y<height; y+=2) {
1883 "xor %%"REG_a", %%"REG_a" \n\t"
1884 "pcmpeqw %%mm7, %%mm7 \n\t"
1885 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1888 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1889 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1890 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1891 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1892 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1893 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1894 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1895 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1896 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1897 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1898 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1900 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1902 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1903 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1904 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1905 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1906 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1907 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1908 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1909 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1910 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1911 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1913 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Split the interleaved UVUV words into separate U and V planes. */
1915 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1916 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1917 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1918 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1919 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1920 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1921 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1922 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1924 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1925 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1927 "add $8, %%"REG_a" \n\t"
1928 "cmp %4, %%"REG_a" \n\t"
1930 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1931 : "memory", "%"REG_a
/* Second (odd) line of the pair: extract luma only (high byte of each word). */
1938 "xor %%"REG_a", %%"REG_a" \n\t"
1941 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1942 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1943 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1944 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
1945 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
1946 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1947 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1948 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1949 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1950 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1951 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1953 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1954 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1956 "add $8, %%"REG_a" \n\t"
1957 "cmp %4, %%"REG_a" \n\t"
1960 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1961 : "memory", "%"REG_a
/* C fallback, even line: demultiplex U/Y/V from UYVY quads. */
1965 for (i=0; i<chromWidth; i++) {
1966 udst[i] = src[4*i+0];
1967 ydst[2*i+0] = src[4*i+1];
1968 vdst[i] = src[4*i+2];
1969 ydst[2*i+1] = src[4*i+3];
/* C fallback, odd line: luma only. */
1974 for (i=0; i<chromWidth; i++) {
1975 ydst[2*i+0] = src[4*i+1];
1976 ydst[2*i+1] = src[4*i+3];
1979 udst += chromStride;
1980 vdst += chromStride;
/* Leave the MMX/FPU state clean for subsequent floating-point code. */
1985 __asm__ volatile(EMMS" \n\t"
1992 * Height should be a multiple of 2 and width should be a multiple of 2.
1993 * (If this is a problem for anyone then tell me, and I will fix it.)
1994 * Chrominance data is only taken from every second line,
1995 * others are ignored in the C version.
1996 * FIXME: Write HQ version.
/* 24-bit RGB (BGR byte order, per the ff_bgr2* coefficient tables) -> planar
 * YV12.  The MMX path computes Y for both lines of each row pair, then U/V
 * from the PAVGB-averaged 2x2 RGB blocks; the C tail handles the last rows. */
1998 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1999 long width, long height,
2000 long lumStride, long chromStride, long srcStride)
2003 const x86_reg chromWidth= width>>1;
2005 for (y=0; y<height-2; y+=2) {
/* Luma pass: run once per line of the pair (i = 0, 1). */
2007 for (i=0; i<2; i++) {
2009 "mov %2, %%"REG_a" \n\t"
2010 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2011 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2012 "pxor %%mm7, %%mm7 \n\t"
2013 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2016 PREFETCH" 64(%0, %%"REG_d") \n\t"
2017 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2018 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2019 "punpcklbw %%mm7, %%mm0 \n\t"
2020 "punpcklbw %%mm7, %%mm1 \n\t"
2021 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2022 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2023 "punpcklbw %%mm7, %%mm2 \n\t"
2024 "punpcklbw %%mm7, %%mm3 \n\t"
2025 "pmaddwd %%mm6, %%mm0 \n\t"
2026 "pmaddwd %%mm6, %%mm1 \n\t"
2027 "pmaddwd %%mm6, %%mm2 \n\t"
2028 "pmaddwd %%mm6, %%mm3 \n\t"
2029 #ifndef FAST_BGR2YV12
2030 "psrad $8, %%mm0 \n\t"
2031 "psrad $8, %%mm1 \n\t"
2032 "psrad $8, %%mm2 \n\t"
2033 "psrad $8, %%mm3 \n\t"
2035 "packssdw %%mm1, %%mm0 \n\t"
2036 "packssdw %%mm3, %%mm2 \n\t"
2037 "pmaddwd %%mm5, %%mm0 \n\t"
2038 "pmaddwd %%mm5, %%mm2 \n\t"
2039 "packssdw %%mm2, %%mm0 \n\t"
2040 "psraw $7, %%mm0 \n\t"
2042 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2043 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2044 "punpcklbw %%mm7, %%mm4 \n\t"
2045 "punpcklbw %%mm7, %%mm1 \n\t"
2046 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2047 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2048 "punpcklbw %%mm7, %%mm2 \n\t"
2049 "punpcklbw %%mm7, %%mm3 \n\t"
2050 "pmaddwd %%mm6, %%mm4 \n\t"
2051 "pmaddwd %%mm6, %%mm1 \n\t"
2052 "pmaddwd %%mm6, %%mm2 \n\t"
2053 "pmaddwd %%mm6, %%mm3 \n\t"
2054 #ifndef FAST_BGR2YV12
2055 "psrad $8, %%mm4 \n\t"
2056 "psrad $8, %%mm1 \n\t"
2057 "psrad $8, %%mm2 \n\t"
2058 "psrad $8, %%mm3 \n\t"
2060 "packssdw %%mm1, %%mm4 \n\t"
2061 "packssdw %%mm3, %%mm2 \n\t"
2062 "pmaddwd %%mm5, %%mm4 \n\t"
2063 "pmaddwd %%mm5, %%mm2 \n\t"
2064 "add $24, %%"REG_d" \n\t"
2065 "packssdw %%mm2, %%mm4 \n\t"
2066 "psraw $7, %%mm4 \n\t"
2068 "packuswb %%mm4, %%mm0 \n\t"
2069 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2071 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2072 "add $8, %%"REG_a" \n\t"
2074 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2075 : "%"REG_a, "%"REG_d
/* Chroma pass: consumes both source lines of the pair (%0 and %1). */
2082 "mov %4, %%"REG_a" \n\t"
2083 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2084 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2085 "pxor %%mm7, %%mm7 \n\t"
2086 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2087 "add %%"REG_d", %%"REG_d" \n\t"
2090 PREFETCH" 64(%0, %%"REG_d") \n\t"
2091 PREFETCH" 64(%1, %%"REG_d") \n\t"
2092 #if HAVE_MMX2 || HAVE_AMD3DNOW
/* 2x2 RGB averaging via PAVGB (vertical, then horizontal by the 24-bit
 * shifted copy). */
2093 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2094 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2095 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2096 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2097 PAVGB" %%mm1, %%mm0 \n\t"
2098 PAVGB" %%mm3, %%mm2 \n\t"
2099 "movq %%mm0, %%mm1 \n\t"
2100 "movq %%mm2, %%mm3 \n\t"
2101 "psrlq $24, %%mm0 \n\t"
2102 "psrlq $24, %%mm2 \n\t"
2103 PAVGB" %%mm1, %%mm0 \n\t"
2104 PAVGB" %%mm3, %%mm2 \n\t"
2105 "punpcklbw %%mm7, %%mm0 \n\t"
2106 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX fallback: 2x2 averaging by word-wise adds and a >>2. */
2108 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2109 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2110 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2111 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2112 "punpcklbw %%mm7, %%mm0 \n\t"
2113 "punpcklbw %%mm7, %%mm1 \n\t"
2114 "punpcklbw %%mm7, %%mm2 \n\t"
2115 "punpcklbw %%mm7, %%mm3 \n\t"
2116 "paddw %%mm1, %%mm0 \n\t"
2117 "paddw %%mm3, %%mm2 \n\t"
2118 "paddw %%mm2, %%mm0 \n\t"
2119 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2120 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2121 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2122 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2123 "punpcklbw %%mm7, %%mm4 \n\t"
2124 "punpcklbw %%mm7, %%mm1 \n\t"
2125 "punpcklbw %%mm7, %%mm2 \n\t"
2126 "punpcklbw %%mm7, %%mm3 \n\t"
2127 "paddw %%mm1, %%mm4 \n\t"
2128 "paddw %%mm3, %%mm2 \n\t"
2129 "paddw %%mm4, %%mm2 \n\t"
2130 "psrlw $2, %%mm0 \n\t"
2131 "psrlw $2, %%mm2 \n\t"
2133 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2134 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
/* mm1/mm3 accumulate V (VCoeff), mm0/mm2 accumulate U (UCoeff in mm6). */
2136 "pmaddwd %%mm0, %%mm1 \n\t"
2137 "pmaddwd %%mm2, %%mm3 \n\t"
2138 "pmaddwd %%mm6, %%mm0 \n\t"
2139 "pmaddwd %%mm6, %%mm2 \n\t"
2140 #ifndef FAST_BGR2YV12
2141 "psrad $8, %%mm0 \n\t"
2142 "psrad $8, %%mm1 \n\t"
2143 "psrad $8, %%mm2 \n\t"
2144 "psrad $8, %%mm3 \n\t"
2146 "packssdw %%mm2, %%mm0 \n\t"
2147 "packssdw %%mm3, %%mm1 \n\t"
2148 "pmaddwd %%mm5, %%mm0 \n\t"
2149 "pmaddwd %%mm5, %%mm1 \n\t"
2150 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2151 "psraw $7, %%mm0 \n\t"
2153 #if HAVE_MMX2 || HAVE_AMD3DNOW
2154 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2155 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2156 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2157 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2158 PAVGB" %%mm1, %%mm4 \n\t"
2159 PAVGB" %%mm3, %%mm2 \n\t"
2160 "movq %%mm4, %%mm1 \n\t"
2161 "movq %%mm2, %%mm3 \n\t"
2162 "psrlq $24, %%mm4 \n\t"
2163 "psrlq $24, %%mm2 \n\t"
2164 PAVGB" %%mm1, %%mm4 \n\t"
2165 PAVGB" %%mm3, %%mm2 \n\t"
2166 "punpcklbw %%mm7, %%mm4 \n\t"
2167 "punpcklbw %%mm7, %%mm2 \n\t"
2169 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2170 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2171 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2172 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2173 "punpcklbw %%mm7, %%mm4 \n\t"
2174 "punpcklbw %%mm7, %%mm1 \n\t"
2175 "punpcklbw %%mm7, %%mm2 \n\t"
2176 "punpcklbw %%mm7, %%mm3 \n\t"
2177 "paddw %%mm1, %%mm4 \n\t"
2178 "paddw %%mm3, %%mm2 \n\t"
2179 "paddw %%mm2, %%mm4 \n\t"
2180 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2181 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2182 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2183 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2184 "punpcklbw %%mm7, %%mm5 \n\t"
2185 "punpcklbw %%mm7, %%mm1 \n\t"
2186 "punpcklbw %%mm7, %%mm2 \n\t"
2187 "punpcklbw %%mm7, %%mm3 \n\t"
2188 "paddw %%mm1, %%mm5 \n\t"
2189 "paddw %%mm3, %%mm2 \n\t"
2190 "paddw %%mm5, %%mm2 \n\t"
2191 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2192 "psrlw $2, %%mm4 \n\t"
2193 "psrlw $2, %%mm2 \n\t"
2195 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2196 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2198 "pmaddwd %%mm4, %%mm1 \n\t"
2199 "pmaddwd %%mm2, %%mm3 \n\t"
2200 "pmaddwd %%mm6, %%mm4 \n\t"
2201 "pmaddwd %%mm6, %%mm2 \n\t"
2202 #ifndef FAST_BGR2YV12
2203 "psrad $8, %%mm4 \n\t"
2204 "psrad $8, %%mm1 \n\t"
2205 "psrad $8, %%mm2 \n\t"
2206 "psrad $8, %%mm3 \n\t"
2208 "packssdw %%mm2, %%mm4 \n\t"
2209 "packssdw %%mm3, %%mm1 \n\t"
2210 "pmaddwd %%mm5, %%mm4 \n\t"
2211 "pmaddwd %%mm5, %%mm1 \n\t"
2212 "add $24, %%"REG_d" \n\t"
2213 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2214 "psraw $7, %%mm4 \n\t"
/* Merge the two U/V quads, bias them, and store 4 U and 4 V bytes. */
2216 "movq %%mm0, %%mm1 \n\t"
2217 "punpckldq %%mm4, %%mm0 \n\t"
2218 "punpckhdq %%mm4, %%mm1 \n\t"
2219 "packsswb %%mm1, %%mm0 \n\t"
2220 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2221 "movd %%mm0, (%2, %%"REG_a") \n\t"
2222 "punpckhdq %%mm0, %%mm0 \n\t"
2223 "movd %%mm0, (%3, %%"REG_a") \n\t"
2224 "add $4, %%"REG_a" \n\t"
2226 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2227 : "%"REG_a, "%"REG_d
2230 udst += chromStride;
2231 vdst += chromStride;
/* Leave the MMX/FPU state clean for subsequent floating-point code. */
2235 __asm__ volatile(EMMS" \n\t"
/* C tail: the remaining (bottom) row pairs, chroma from the first line only. */
2241 for (; y<height; y+=2) {
2243 for (i=0; i<chromWidth; i++) {
2244 unsigned int b = src[6*i+0];
2245 unsigned int g = src[6*i+1];
2246 unsigned int r = src[6*i+2];
2248 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2249 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2250 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2260 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only. */
2266 for (i=0; i<chromWidth; i++) {
2267 unsigned int b = src[6*i+0];
2268 unsigned int g = src[6*i+1];
2269 unsigned int r = src[6*i+2];
2271 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2279 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2282 udst += chromStride;
2283 vdst += chromStride;
/* Interleave two byte planes row by row: dest[2i] = src1[i], dest[2i+1] =
 * src2[i].  SSE2 path does 16 source bytes per iteration, MMX path likewise
 * via two quadwords; the scalar tail finishes widths not a multiple of 16. */
2289 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2290 long width, long height, long src1Stride,
2291 long src2Stride, long dstStride)
2295 for (h=0; h < height; h++) {
2301 "xor %%"REG_a", %%"REG_a" \n\t"
2303 PREFETCH" 64(%1, %%"REG_a") \n\t"
2304 PREFETCH" 64(%2, %%"REG_a") \n\t"
/* xmm0 and xmm1 are two copies of src1; punpckl/hbw with src2 yields the
 * low and high interleaved halves. */
2305 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2306 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2307 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2308 "punpcklbw %%xmm2, %%xmm0 \n\t"
2309 "punpckhbw %%xmm2, %%xmm1 \n\t"
2310 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2311 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2312 "add $16, %%"REG_a" \n\t"
2313 "cmp %3, %%"REG_a" \n\t"
2315 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2316 : "memory", "%"REG_a""
2320 "xor %%"REG_a", %%"REG_a" \n\t"
2322 PREFETCH" 64(%1, %%"REG_a") \n\t"
2323 PREFETCH" 64(%2, %%"REG_a") \n\t"
2324 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2325 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2326 "movq %%mm0, %%mm1 \n\t"
2327 "movq %%mm2, %%mm3 \n\t"
2328 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2329 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2330 "punpcklbw %%mm4, %%mm0 \n\t"
2331 "punpckhbw %%mm4, %%mm1 \n\t"
2332 "punpcklbw %%mm5, %%mm2 \n\t"
2333 "punpckhbw %%mm5, %%mm3 \n\t"
2334 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2335 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2336 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2337 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2338 "add $16, %%"REG_a" \n\t"
2339 "cmp %3, %%"REG_a" \n\t"
2341 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2342 : "memory", "%"REG_a
/* Scalar tail: bytes past the last full 16-byte group. */
2345 for (w= (width&(~15)); w < width; w++) {
2346 dest[2*w+0] = src1[w];
2347 dest[2*w+1] = src2[w];
/* Pure C fallback: whole row. */
2350 for (w=0; w < width; w++) {
2351 dest[2*w+0] = src1[w];
2352 dest[2*w+1] = src2[w];
/* Upscale two chroma planes 2x in each direction: every source byte is
 * duplicated horizontally (punpckl/hbw with itself) and every source line is
 * used for two destination lines (srcStride*(y>>1)).
 * NOTE(review): uses the old "8%1"-style memory-operand offset syntax, which
 * newer binutils/clang reject — modernize if this template is still built. */
2368 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2369 uint8_t *dst1, uint8_t *dst2,
2370 long width, long height,
2371 long srcStride1, long srcStride2,
2372 long dstStride1, long dstStride2)
2376 w=width/2; h=height/2;
2381 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2384 const uint8_t* s1=src1+srcStride1*(y>>1);
2385 uint8_t* d=dst1+dstStride1*y;
/* First plane: 32 source bytes -> 64 destination bytes per iteration. */
2388 for (;x<w-31;x+=32) {
2390 PREFETCH" 32%1 \n\t"
2391 "movq %1, %%mm0 \n\t"
2392 "movq 8%1, %%mm2 \n\t"
2393 "movq 16%1, %%mm4 \n\t"
2394 "movq 24%1, %%mm6 \n\t"
2395 "movq %%mm0, %%mm1 \n\t"
2396 "movq %%mm2, %%mm3 \n\t"
2397 "movq %%mm4, %%mm5 \n\t"
2398 "movq %%mm6, %%mm7 \n\t"
2399 "punpcklbw %%mm0, %%mm0 \n\t"
2400 "punpckhbw %%mm1, %%mm1 \n\t"
2401 "punpcklbw %%mm2, %%mm2 \n\t"
2402 "punpckhbw %%mm3, %%mm3 \n\t"
2403 "punpcklbw %%mm4, %%mm4 \n\t"
2404 "punpckhbw %%mm5, %%mm5 \n\t"
2405 "punpcklbw %%mm6, %%mm6 \n\t"
2406 "punpckhbw %%mm7, %%mm7 \n\t"
2407 MOVNTQ" %%mm0, %0 \n\t"
2408 MOVNTQ" %%mm1, 8%0 \n\t"
2409 MOVNTQ" %%mm2, 16%0 \n\t"
2410 MOVNTQ" %%mm3, 24%0 \n\t"
2411 MOVNTQ" %%mm4, 32%0 \n\t"
2412 MOVNTQ" %%mm5, 40%0 \n\t"
2413 MOVNTQ" %%mm6, 48%0 \n\t"
2414 MOVNTQ" %%mm7, 56%0"
2420 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical processing. */
2423 const uint8_t* s2=src2+srcStride2*(y>>1);
2424 uint8_t* d=dst2+dstStride2*y;
2427 for (;x<w-31;x+=32) {
2429 PREFETCH" 32%1 \n\t"
2430 "movq %1, %%mm0 \n\t"
2431 "movq 8%1, %%mm2 \n\t"
2432 "movq 16%1, %%mm4 \n\t"
2433 "movq 24%1, %%mm6 \n\t"
2434 "movq %%mm0, %%mm1 \n\t"
2435 "movq %%mm2, %%mm3 \n\t"
2436 "movq %%mm4, %%mm5 \n\t"
2437 "movq %%mm6, %%mm7 \n\t"
2438 "punpcklbw %%mm0, %%mm0 \n\t"
2439 "punpckhbw %%mm1, %%mm1 \n\t"
2440 "punpcklbw %%mm2, %%mm2 \n\t"
2441 "punpckhbw %%mm3, %%mm3 \n\t"
2442 "punpcklbw %%mm4, %%mm4 \n\t"
2443 "punpckhbw %%mm5, %%mm5 \n\t"
2444 "punpcklbw %%mm6, %%mm6 \n\t"
2445 "punpckhbw %%mm7, %%mm7 \n\t"
2446 MOVNTQ" %%mm0, %0 \n\t"
2447 MOVNTQ" %%mm1, 8%0 \n\t"
2448 MOVNTQ" %%mm2, 16%0 \n\t"
2449 MOVNTQ" %%mm3, 24%0 \n\t"
2450 MOVNTQ" %%mm4, 32%0 \n\t"
2451 MOVNTQ" %%mm5, 40%0 \n\t"
2452 MOVNTQ" %%mm6, 48%0 \n\t"
2453 MOVNTQ" %%mm7, 56%0"
2459 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/* YVU9 (planar, chroma subsampled 4x in both directions) -> packed YUY2.
 * Each chroma sample is replicated across 4 output pixels horizontally; the
 * same chroma line serves 4 luma lines (srcStride*(y>>2)).  One MMX iteration
 * emits 32 YUYV pixels (64 bytes) from 32 Y, 8 U and 8 V bytes. */
2470 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2472 long width, long height,
2473 long srcStride1, long srcStride2,
2474 long srcStride3, long dstStride)
2478 w=width/2; h=height;
2480 const uint8_t* yp=src1+srcStride1*y;
2481 const uint8_t* up=src2+srcStride2*(y>>2);
2482 const uint8_t* vp=src3+srcStride3*(y>>2);
2483 uint8_t* d=dst+dstStride*y;
2488 PREFETCH" 32(%1, %0) \n\t"
2489 PREFETCH" 32(%2, %0) \n\t"
2490 PREFETCH" 32(%3, %0) \n\t"
2491 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2492 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2493 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2494 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2495 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2496 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2497 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2498 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2499 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2500 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2502 "movq %%mm1, %%mm6 \n\t"
2503 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2504 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2505 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2506 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2507 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2509 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2510 "movq 8(%1, %0, 4), %%mm0 \n\t"
2511 "movq %%mm0, %%mm3 \n\t"
2512 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2513 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2514 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2515 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2517 "movq %%mm4, %%mm6 \n\t"
2518 "movq 16(%1, %0, 4), %%mm0 \n\t"
2519 "movq %%mm0, %%mm3 \n\t"
2520 "punpcklbw %%mm5, %%mm4 \n\t"
2521 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2522 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2523 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2524 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2526 "punpckhbw %%mm5, %%mm6 \n\t"
2527 "movq 24(%1, %0, 4), %%mm0 \n\t"
2528 "movq %%mm0, %%mm3 \n\t"
2529 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2530 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2531 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2532 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2535 : "r"(yp), "r" (up), "r"(vp), "r"(d)
/* Scalar tail: one chroma pair drives 4 output pixels (8 bytes). */
2540 const long x2 = x<<2;
2543 d[8*x+2] = yp[x2+1];
2545 d[8*x+4] = yp[x2+2];
2547 d[8*x+6] = yp[x2+3];
/* Copy every second byte (even offsets) of src into dst.
 * NOTE(review): the scalar tail indexes with `count` directly — presumably
 * count runs from negative up to 0 with the pointers pre-advanced in the
 * elided setup code; confirm against the full file. */
2560 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2570 "pcmpeqw %%mm7, %%mm7 \n\t"
2571 "psrlw $8, %%mm7 \n\t"
2573 "movq -30(%1, %0, 2), %%mm0 \n\t"
2574 "movq -22(%1, %0, 2), %%mm1 \n\t"
2575 "movq -14(%1, %0, 2), %%mm2 \n\t"
2576 "movq -6(%1, %0, 2), %%mm3 \n\t"
2577 "pand %%mm7, %%mm0 \n\t"
2578 "pand %%mm7, %%mm1 \n\t"
2579 "pand %%mm7, %%mm2 \n\t"
2580 "pand %%mm7, %%mm3 \n\t"
2581 "packuswb %%mm1, %%mm0 \n\t"
2582 "packuswb %%mm3, %%mm2 \n\t"
2583 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2584 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2588 : "r"(src), "r"(dst)
2594 dst[count]= src[2*count];
/* Deinterleave a 4-byte-period stream: bytes at offsets 4k+0 go to dst0,
 * bytes at 4k+2 go to dst1 (e.g. U/V from packed UYVY). */
2599 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2609 "pcmpeqw %%mm7, %%mm7 \n\t"
2610 "psrlw $8, %%mm7 \n\t"
2612 "movq -28(%1, %0, 4), %%mm0 \n\t"
2613 "movq -20(%1, %0, 4), %%mm1 \n\t"
2614 "movq -12(%1, %0, 4), %%mm2 \n\t"
2615 "movq -4(%1, %0, 4), %%mm3 \n\t"
2616 "pand %%mm7, %%mm0 \n\t"
2617 "pand %%mm7, %%mm1 \n\t"
2618 "pand %%mm7, %%mm2 \n\t"
2619 "pand %%mm7, %%mm3 \n\t"
2620 "packuswb %%mm1, %%mm0 \n\t"
2621 "packuswb %%mm3, %%mm2 \n\t"
2622 "movq %%mm0, %%mm1 \n\t"
2623 "movq %%mm2, %%mm3 \n\t"
2624 "psrlw $8, %%mm0 \n\t"
2625 "psrlw $8, %%mm2 \n\t"
2626 "pand %%mm7, %%mm1 \n\t"
2627 "pand %%mm7, %%mm3 \n\t"
2628 "packuswb %%mm2, %%mm0 \n\t"
2629 "packuswb %%mm3, %%mm1 \n\t"
2630 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2631 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2635 : "r"(src), "r"(dst0), "r"(dst1)
2641 dst0[count]= src[4*count+0];
2642 dst1[count]= src[4*count+2];
/* Like extract_even2, but averages two source rows (PAVGB in the MMX path,
 * (a+b)>>1 in the scalar tail) before deinterleaving into dst0/dst1. */
2647 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2658 "pcmpeqw %%mm7, %%mm7 \n\t"
2659 "psrlw $8, %%mm7 \n\t"
2661 "movq -28(%1, %0, 4), %%mm0 \n\t"
2662 "movq -20(%1, %0, 4), %%mm1 \n\t"
2663 "movq -12(%1, %0, 4), %%mm2 \n\t"
2664 "movq -4(%1, %0, 4), %%mm3 \n\t"
2665 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2666 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2667 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2668 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2669 "pand %%mm7, %%mm0 \n\t"
2670 "pand %%mm7, %%mm1 \n\t"
2671 "pand %%mm7, %%mm2 \n\t"
2672 "pand %%mm7, %%mm3 \n\t"
2673 "packuswb %%mm1, %%mm0 \n\t"
2674 "packuswb %%mm3, %%mm2 \n\t"
2675 "movq %%mm0, %%mm1 \n\t"
2676 "movq %%mm2, %%mm3 \n\t"
2677 "psrlw $8, %%mm0 \n\t"
2678 "psrlw $8, %%mm2 \n\t"
2679 "pand %%mm7, %%mm1 \n\t"
2680 "pand %%mm7, %%mm3 \n\t"
2681 "packuswb %%mm2, %%mm0 \n\t"
2682 "packuswb %%mm3, %%mm1 \n\t"
2683 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2684 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2688 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2694 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2695 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/* Deinterleave the odd bytes of a 4-byte-period stream into dst0/dst1 (the
 * MMX path extracts them via psrlw $8).
 * NOTE(review): the scalar tail reads even offsets (4k+0 / 4k+2) — presumably
 * src is advanced by one byte in elided setup code so these are the odd
 * bytes; confirm against the full file. */
2700 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2710 "pcmpeqw %%mm7, %%mm7 \n\t"
2711 "psrlw $8, %%mm7 \n\t"
2713 "movq -28(%1, %0, 4), %%mm0 \n\t"
2714 "movq -20(%1, %0, 4), %%mm1 \n\t"
2715 "movq -12(%1, %0, 4), %%mm2 \n\t"
2716 "movq -4(%1, %0, 4), %%mm3 \n\t"
2717 "psrlw $8, %%mm0 \n\t"
2718 "psrlw $8, %%mm1 \n\t"
2719 "psrlw $8, %%mm2 \n\t"
2720 "psrlw $8, %%mm3 \n\t"
2721 "packuswb %%mm1, %%mm0 \n\t"
2722 "packuswb %%mm3, %%mm2 \n\t"
2723 "movq %%mm0, %%mm1 \n\t"
2724 "movq %%mm2, %%mm3 \n\t"
2725 "psrlw $8, %%mm0 \n\t"
2726 "psrlw $8, %%mm2 \n\t"
2727 "pand %%mm7, %%mm1 \n\t"
2728 "pand %%mm7, %%mm3 \n\t"
2729 "packuswb %%mm2, %%mm0 \n\t"
2730 "packuswb %%mm3, %%mm1 \n\t"
2731 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2732 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2736 : "r"(src), "r"(dst0), "r"(dst1)
2743 dst0[count]= src[4*count+0];
2744 dst1[count]= src[4*count+2];
2749 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
/*
 * Like extract_odd2, but first byte-wise averages two packed 4:2:2 rows
 * (src0, src1) with PAVGB before de-interleaving the two chroma streams into
 * dst0 and dst1.  The visible caller yuyvtoyuv420 uses this to build the
 * vertically subsampled U/V planes from two adjacent YUYV rows.
 * NOTE(review): the prologue (pointer biasing, loop setup, asm header) is
 * elided from this chunk; the scalar tail's +0/+2 offsets imply src0/src1 are
 * advanced by one byte there — confirm against the full source.
 */
/* mm7 = 0x00FF per 16-bit lane (even-byte mask for stage 2) */
2760 "pcmpeqw %%mm7, %%mm7 \n\t"
2761 "psrlw $8, %%mm7 \n\t"
/* load 32 bytes of row 0; %0 (count) is presumably a negative index */
2763 "movq -28(%1, %0, 4), %%mm0 \n\t"
2764 "movq -20(%1, %0, 4), %%mm1 \n\t"
2765 "movq -12(%1, %0, 4), %%mm2 \n\t"
2766 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* rounding byte-wise average with the matching 32 bytes of row 1 */
2767 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2768 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2769 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2770 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
/* stage 1: keep the high (odd-position) byte of every word */
2771 "psrlw $8, %%mm0 \n\t"
2772 "psrlw $8, %%mm1 \n\t"
2773 "psrlw $8, %%mm2 \n\t"
2774 "psrlw $8, %%mm3 \n\t"
/* pack: mm0/mm2 = selected bytes of the first/second 16 averaged bytes */
2775 "packuswb %%mm1, %%mm0 \n\t"
2776 "packuswb %%mm3, %%mm2 \n\t"
/* stage 2: split into the two interleaved components
 * (pand path -> mm1/mm3, psrlw path -> mm0/mm2) */
2777 "movq %%mm0, %%mm1 \n\t"
2778 "movq %%mm2, %%mm3 \n\t"
2779 "psrlw $8, %%mm0 \n\t"
2780 "psrlw $8, %%mm2 \n\t"
2781 "pand %%mm7, %%mm1 \n\t"
2782 "pand %%mm7, %%mm3 \n\t"
2783 "packuswb %%mm2, %%mm0 \n\t"
2784 "packuswb %%mm3, %%mm1 \n\t"
/* non-temporal stores: mm0 -> dst1 (%4), mm1 -> dst0 (%3); -7 bias matches
 * the count adjustment in the elided loop setup — TODO confirm */
2785 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2786 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
/* input operands; %0 (count) is declared on an elided line of the asm statement */
2790 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
/* scalar tail.  NOTE(review): (a+b)>>1 truncates while PAVGB rounds up, so
 * the tail can differ by 1 from the SIMD path for odd sums — pre-existing. */
2798 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2799 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/*
 * Convert packed YUYV 4:2:2 to planar YUV 4:2:0.
 * ydst/udst/vdst: output planes; src: packed input; sizes in pixels.
 * NOTE(review): interior lines (row-parity handling for the chroma call,
 * per-row pointer/stride advancement using lumStride/chromStride/srcStride,
 * and the trailing emms/sfence) are elided from this chunk.
 */
2804 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2805 long width, long height,
2806 long lumStride, long chromStride, long srcStride)
/* ceil(width/2): chroma samples per row, rounding up for odd widths */
2809 const long chromWidth= -((-width)>>1);
2811 for (y=0; y<height; y++) {
/* luma sits at even byte positions of YUYV */
2812 RENAME(extract_even)(src, ydst, width);
/* 4:2:0 chroma: average this row with the previous one (odd byte positions);
 * presumably guarded by a row-parity test on an elided line — confirm */
2814 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
/*
 * Convert packed YUYV 4:2:2 to planar YUV 4:2:2 (no vertical chroma
 * subsampling, so every row contributes a chroma line).
 * NOTE(review): per-row pointer/stride advancement and the trailing
 * emms/sfence are on lines elided from this chunk.
 */
2831 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2832 long width, long height,
2833 long lumStride, long chromStride, long srcStride)
/* ceil(width/2): chroma samples per row, rounding up for odd widths */
2836 const long chromWidth= -((-width)>>1);
2838 for (y=0; y<height; y++) {
/* luma sits at even byte positions of YUYV */
2839 RENAME(extract_even)(src, ydst, width);
/* chroma (U/V) sits at odd byte positions */
2840 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
/*
 * Convert packed UYVY 4:2:2 to planar YUV 4:2:0.  UYVY stores chroma first
 * (U Y V Y), so luma is read from src+1 and chroma from even byte positions.
 * NOTE(review): interior lines (row-parity handling for the chroma call,
 * per-row pointer/stride advancement, trailing emms/sfence) are elided from
 * this chunk.
 */
2856 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2857 long width, long height,
2858 long lumStride, long chromStride, long srcStride)
/* ceil(width/2): chroma samples per row, rounding up for odd widths */
2861 const long chromWidth= -((-width)>>1);
2863 for (y=0; y<height; y++) {
/* luma sits at odd byte positions of UYVY, hence src+1 */
2864 RENAME(extract_even)(src+1, ydst, width);
/* 4:2:0 chroma: average this row with the previous one (even byte positions);
 * presumably guarded by a row-parity test on an elided line — confirm */
2866 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
/*
 * Convert packed UYVY 4:2:2 to planar YUV 4:2:2 (every row contributes a
 * chroma line).  Luma is at odd byte positions, chroma at even ones.
 * NOTE(review): per-row pointer/stride advancement and the trailing
 * emms/sfence are on lines elided from this chunk.
 */
2883 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2884 long width, long height,
2885 long lumStride, long chromStride, long srcStride)
/* ceil(width/2): chroma samples per row, rounding up for odd widths */
2888 const long chromWidth= -((-width)>>1);
2890 for (y=0; y<height; y++) {
/* luma sits at odd byte positions of UYVY, hence src+1 */
2891 RENAME(extract_even)(src+1, ydst, width);
/* chroma (U/V) sits at even byte positions */
2892 RENAME(extract_even2)(src, udst, vdst, chromWidth);
/*
 * Install this template instantiation's (CPU-feature-specific, via RENAME)
 * implementations into the global rgb2rgb function pointers, overriding the
 * plain-C defaults.  NOTE(review): the end of this function lies past the
 * visible chunk.
 */
2908 static inline void RENAME(rgb2rgb_init)(void)
/* packed RGB/BGR <-> RGB/BGR depth and byte-order converters */
2910 rgb15to16 = RENAME(rgb15to16);
2911 rgb15tobgr24 = RENAME(rgb15tobgr24);
2912 rgb15to32 = RENAME(rgb15to32);
2913 rgb16tobgr24 = RENAME(rgb16tobgr24);
2914 rgb16to32 = RENAME(rgb16to32);
2915 rgb16to15 = RENAME(rgb16to15);
2916 rgb24tobgr16 = RENAME(rgb24tobgr16);
2917 rgb24tobgr15 = RENAME(rgb24tobgr15);
2918 rgb24tobgr32 = RENAME(rgb24tobgr32);
2919 rgb32to16 = RENAME(rgb32to16);
2920 rgb32to15 = RENAME(rgb32to15);
2921 rgb32tobgr24 = RENAME(rgb32tobgr24);
2922 rgb24to15 = RENAME(rgb24to15);
2923 rgb24to16 = RENAME(rgb24to16);
2924 rgb24tobgr24 = RENAME(rgb24tobgr24);
2925 rgb32tobgr32 = RENAME(rgb32tobgr32);
2926 rgb32tobgr16 = RENAME(rgb32tobgr16);
2927 rgb32tobgr15 = RENAME(rgb32tobgr15);
/* planar YUV <-> packed YUY2/UYVY converters */
2928 yv12toyuy2 = RENAME(yv12toyuy2);
2929 yv12touyvy = RENAME(yv12touyvy);
2930 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2931 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2932 yuy2toyv12 = RENAME(yuy2toyv12);
/* yvu9toyv12 deliberately left at its default implementation */
2933 // yvu9toyv12 = RENAME(yvu9toyv12);
2934 planar2x = RENAME(planar2x);
2935 rgb24toyv12 = RENAME(rgb24toyv12);
2936 interleaveBytes = RENAME(interleaveBytes);
2937 vu9_to_vu12 = RENAME(vu9_to_vu12);
2938 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
/* packed 4:2:2 -> planar 4:2:0 / 4:2:2 converters defined above */
2940 uyvytoyuv420 = RENAME(uyvytoyuv420);
2941 uyvytoyuv422 = RENAME(uyvytoyuv422);
2942 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2943 yuyvtoyuv422 = RENAME(yuyvtoyuv422);