2 * software RGB to RGB converter
3 * plus a software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Per-CPU-variant instruction selection for this template:
 * - AMD 3DNow!: "prefetch" and "pavgusb"
 * - MMX2:       "prefetchnta", non-temporal "movntq" and "sfence"
 * - otherwise:  no-op strings so the asm templates stay valid.
 * NOTE(review): matching #endif lines are not visible here — confirm against
 * the full file.
 */
36 #if COMPILE_TEMPLATE_SSE2
42 #if COMPILE_TEMPLATE_AMD3DNOW
43 #define PREFETCH "prefetch"
44 #define PAVGB "pavgusb"
45 #elif COMPILE_TEMPLATE_MMX2
46 #define PREFETCH "prefetchnta"
49 #define PREFETCH " # nop"
52 #if COMPILE_TEMPLATE_AMD3DNOW
53 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
59 #if COMPILE_TEMPLATE_MMX2
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
64 #define SFENCE " # nop"
/*
 * Pack 24-bit RGB into 32-bit pixels, ORing in the alpha/filler byte from
 * mask32a (held in mm7). The MMX loop loads 8 pixels (24 bytes) per
 * iteration via unaligned movd/punpckldq at byte offsets 0,3,...,21.
 * Falls back to a plain C loop when COMPILE_TEMPLATE_MMX is 0.
 */
67 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
70     const uint8_t *s = src;
72 #if COMPILE_TEMPLATE_MMX
73     const uint8_t *mm_end;
76 #if COMPILE_TEMPLATE_MMX
77     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
79     __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
84 "punpckldq 3%1, %%mm0 \n\t"
85 "movd 6%1, %%mm1 \n\t"
86 "punpckldq 9%1, %%mm1 \n\t"
87 "movd 12%1, %%mm2 \n\t"
88 "punpckldq 15%1, %%mm2 \n\t"
89 "movd 18%1, %%mm3 \n\t"
90 "punpckldq 21%1, %%mm3 \n\t"
/* OR the alpha byte (mask32a) into every pixel */
91 "por %%mm7, %%mm0 \n\t"
92 "por %%mm7, %%mm1 \n\t"
93 "por %%mm7, %%mm2 \n\t"
94 "por %%mm7, %%mm3 \n\t"
95 MOVNTQ" %%mm0, %0 \n\t"
96 MOVNTQ" %%mm1, 8%0 \n\t"
97 MOVNTQ" %%mm2, 16%0 \n\t"
/* flush non-temporal stores and leave MMX state */
105     __asm__ volatile(SFENCE:::"memory");
106     __asm__ volatile(EMMS:::"memory");
/*
 * Repack four 64-bit pixel groups (mm0..mm7) from 32-bit-per-pixel form
 * into packed 24-bit and store 24 bytes with two (visible) MOVNTQs:
 * strip the high byte of each dword (mask24l/mask24h after psrlq $8),
 * then shift/mask neighbouring quadwords together via mask24hh..mask24hhhh.
 * NOTE(review): continuation lines of this macro are elided here — confirm
 * the trailing stores against the full file.
 */
125 #define STORE_BGR24_MMX \
126 "psrlq $8, %%mm2 \n\t" \
127 "psrlq $8, %%mm3 \n\t" \
128 "psrlq $8, %%mm6 \n\t" \
129 "psrlq $8, %%mm7 \n\t" \
130 "pand "MANGLE(mask24l)", %%mm0\n\t" \
131 "pand "MANGLE(mask24l)", %%mm1\n\t" \
132 "pand "MANGLE(mask24l)", %%mm4\n\t" \
133 "pand "MANGLE(mask24l)", %%mm5\n\t" \
134 "pand "MANGLE(mask24h)", %%mm2\n\t" \
135 "pand "MANGLE(mask24h)", %%mm3\n\t" \
136 "pand "MANGLE(mask24h)", %%mm6\n\t" \
137 "pand "MANGLE(mask24h)", %%mm7\n\t" \
138 "por %%mm2, %%mm0 \n\t" \
139 "por %%mm3, %%mm1 \n\t" \
140 "por %%mm6, %%mm4 \n\t" \
141 "por %%mm7, %%mm5 \n\t" \
143 "movq %%mm1, %%mm2 \n\t" \
144 "movq %%mm4, %%mm3 \n\t" \
145 "psllq $48, %%mm2 \n\t" \
146 "psllq $32, %%mm3 \n\t" \
147 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
148 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
149 "por %%mm2, %%mm0 \n\t" \
150 "psrlq $16, %%mm1 \n\t" \
151 "psrlq $32, %%mm4 \n\t" \
152 "psllq $16, %%mm5 \n\t" \
153 "por %%mm3, %%mm1 \n\t" \
154 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
155 "por %%mm5, %%mm4 \n\t" \
157 MOVNTQ" %%mm0, %0 \n\t" \
158 MOVNTQ" %%mm1, 8%0 \n\t" \
/*
 * Drop the alpha byte: 32-bit pixels -> packed 24-bit, byte order swapped.
 * MMX loop loads 8 pixels (32 bytes) per iteration and repacks them
 * (presumably via STORE_BGR24_MMX — the repack lines are elided here).
 * Plain C fallback when COMPILE_TEMPLATE_MMX is 0.
 */
162 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
165     const uint8_t *s = src;
167 #if COMPILE_TEMPLATE_MMX
168     const uint8_t *mm_end;
171 #if COMPILE_TEMPLATE_MMX
172     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
177 "movq %1, %%mm0 \n\t"
178 "movq 8%1, %%mm1 \n\t"
179 "movq 16%1, %%mm4 \n\t"
180 "movq 24%1, %%mm5 \n\t"
181 "movq %%mm0, %%mm2 \n\t"
182 "movq %%mm1, %%mm3 \n\t"
183 "movq %%mm4, %%mm6 \n\t"
184 "movq %%mm5, %%mm7 \n\t"
192     __asm__ volatile(SFENCE:::"memory");
193     __asm__ volatile(EMMS:::"memory");
197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
213 original by Strepto/Astral
214 ported to gcc & bugfixed: A'rpi
215 MMX2, 3DNOW optimization by Nick Kurshev
216 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * RGB15 (xRRRRRGG GGGBBBBB) -> RGB16: duplicate the R/G field once via
 * the mask15s "and & add" trick so green gains its 6th bit.
 * MMX path: 16 bytes per iteration; scalar tails handle 4- and 2-byte
 * remainders with the same and+add on 0x7FFF7FFF / 0x7FE07FE0.
 */
218 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
220     register const uint8_t* s=src;
221     register uint8_t* d=dst;
222     register const uint8_t *end;
223     const uint8_t *mm_end;
225 #if COMPILE_TEMPLATE_MMX
226     __asm__ volatile(PREFETCH" %0"::"m"(*s));
227     __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
232 "movq %1, %%mm0 \n\t"
233 "movq 8%1, %%mm2 \n\t"
234 "movq %%mm0, %%mm1 \n\t"
235 "movq %%mm2, %%mm3 \n\t"
236 "pand %%mm4, %%mm0 \n\t"
237 "pand %%mm4, %%mm2 \n\t"
238 "paddw %%mm1, %%mm0 \n\t"
239 "paddw %%mm3, %%mm2 \n\t"
240 MOVNTQ" %%mm0, %0 \n\t"
248     __asm__ volatile(SFENCE:::"memory");
249     __asm__ volatile(EMMS:::"memory");
/* scalar fallback: two pixels at a time, then one */
253         register unsigned x= *((const uint32_t *)s);
254         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
259         register unsigned short x= *((const uint16_t *)s);
260         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * RGB16 -> RGB15: shift red+green right one bit (mask15rg) and keep blue
 * (mask15b) unchanged, dropping green's least significant bit.
 * MMX path processes 16 bytes per iteration; scalar tails mirror the
 * same ((x>>1)&0x7FE07FE0) | (x&0x001F001F) formula.
 */
264 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
266     register const uint8_t* s=src;
267     register uint8_t* d=dst;
268     register const uint8_t *end;
269     const uint8_t *mm_end;
271 #if COMPILE_TEMPLATE_MMX
272     __asm__ volatile(PREFETCH" %0"::"m"(*s));
273     __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
274     __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
279 "movq %1, %%mm0 \n\t"
280 "movq 8%1, %%mm2 \n\t"
281 "movq %%mm0, %%mm1 \n\t"
282 "movq %%mm2, %%mm3 \n\t"
283 "psrlq $1, %%mm0 \n\t"
284 "psrlq $1, %%mm2 \n\t"
285 "pand %%mm7, %%mm0 \n\t"
286 "pand %%mm7, %%mm2 \n\t"
287 "pand %%mm6, %%mm1 \n\t"
288 "pand %%mm6, %%mm3 \n\t"
289 "por %%mm1, %%mm0 \n\t"
290 "por %%mm3, %%mm2 \n\t"
291 MOVNTQ" %%mm0, %0 \n\t"
299     __asm__ volatile(SFENCE:::"memory");
300     __asm__ volatile(EMMS:::"memory");
/* scalar fallback: two pixels at a time, then one */
304         register uint32_t x= *((const uint32_t*)s);
305         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
310         register uint16_t x= *((const uint16_t*)s);
311         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * 32-bit RGB -> RGB565. Two MMX variants:
 *  1) pmaddwd trick: mask out B+R (mask3216br) and multiply-accumulate
 *     with mul3216 so shift+pack happens in one pmaddwd, OR in green
 *     (mask3216g), then psrld/pslld align the two halves.
 *  2) classic shift+mask variant using red_16mask/green_16mask/blue_16mask.
 * Scalar tail packs one pixel: 5 bits B, 6 bits G, 5 bits R.
 */
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
317     const uint8_t *s = src;
319 #if COMPILE_TEMPLATE_MMX
320     const uint8_t *mm_end;
322     uint16_t *d = (uint16_t *)dst;
324 #if COMPILE_TEMPLATE_MMX
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
334 PREFETCH" 32(%1) \n\t"
335 "movd (%1), %%mm0 \n\t"
336 "movd 4(%1), %%mm3 \n\t"
337 "punpckldq 8(%1), %%mm0 \n\t"
338 "punpckldq 12(%1), %%mm3 \n\t"
339 "movq %%mm0, %%mm1 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "pand %%mm6, %%mm0 \n\t"
342 "pand %%mm6, %%mm3 \n\t"
/* combine shift and pack of B and R in a single multiply-add */
343 "pmaddwd %%mm7, %%mm0 \n\t"
344 "pmaddwd %%mm7, %%mm3 \n\t"
345 "pand %%mm5, %%mm1 \n\t"
346 "pand %%mm5, %%mm4 \n\t"
347 "por %%mm1, %%mm0 \n\t"
348 "por %%mm4, %%mm3 \n\t"
349 "psrld $5, %%mm0 \n\t"
350 "pslld $11, %%mm3 \n\t"
351 "por %%mm3, %%mm0 \n\t"
352 MOVNTQ" %%mm0, (%0) \n\t"
359 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* alternative variant: explicit shift+mask per channel */
362     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
364 "movq %0, %%mm7 \n\t"
365 "movq %1, %%mm6 \n\t"
366 ::"m"(red_16mask),"m"(green_16mask));
370 "movd %1, %%mm0 \n\t"
371 "movd 4%1, %%mm3 \n\t"
372 "punpckldq 8%1, %%mm0 \n\t"
373 "punpckldq 12%1, %%mm3 \n\t"
374 "movq %%mm0, %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "movq %%mm3, %%mm4 \n\t"
377 "movq %%mm3, %%mm5 \n\t"
378 "psrlq $3, %%mm0 \n\t"
379 "psrlq $3, %%mm3 \n\t"
380 "pand %2, %%mm0 \n\t"
381 "pand %2, %%mm3 \n\t"
382 "psrlq $5, %%mm1 \n\t"
383 "psrlq $5, %%mm4 \n\t"
384 "pand %%mm6, %%mm1 \n\t"
385 "pand %%mm6, %%mm4 \n\t"
386 "psrlq $8, %%mm2 \n\t"
387 "psrlq $8, %%mm5 \n\t"
388 "pand %%mm7, %%mm2 \n\t"
389 "pand %%mm7, %%mm5 \n\t"
390 "por %%mm1, %%mm0 \n\t"
391 "por %%mm4, %%mm3 \n\t"
392 "por %%mm2, %%mm0 \n\t"
393 "por %%mm5, %%mm3 \n\t"
394 "psllq $16, %%mm3 \n\t"
395 "por %%mm3, %%mm0 \n\t"
396 MOVNTQ" %%mm0, %0 \n\t"
397 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
402     __asm__ volatile(SFENCE:::"memory");
403     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one pixel to 5-6-5 */
406         register int rgb = *(const uint32_t*)s; s += 4;
407         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * 32-bit RGB -> BGR565 (R and B swapped relative to rgb32to16): blue is
 * shifted left into the red field, red shifted right into the blue field.
 * Scalar tail: ((rgb&0xF8)<<8) | green | ((rgb&0xF80000)>>19).
 */
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
413     const uint8_t *s = src;
415 #if COMPILE_TEMPLATE_MMX
416     const uint8_t *mm_end;
418     uint16_t *d = (uint16_t *)dst;
420 #if COMPILE_TEMPLATE_MMX
421     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
423 "movq %0, %%mm7 \n\t"
424 "movq %1, %%mm6 \n\t"
425 ::"m"(red_16mask),"m"(green_16mask));
430 "movd %1, %%mm0 \n\t"
431 "movd 4%1, %%mm3 \n\t"
432 "punpckldq 8%1, %%mm0 \n\t"
433 "punpckldq 12%1, %%mm3 \n\t"
434 "movq %%mm0, %%mm1 \n\t"
435 "movq %%mm0, %%mm2 \n\t"
436 "movq %%mm3, %%mm4 \n\t"
437 "movq %%mm3, %%mm5 \n\t"
438 "psllq $8, %%mm0 \n\t"
439 "psllq $8, %%mm3 \n\t"
440 "pand %%mm7, %%mm0 \n\t"
441 "pand %%mm7, %%mm3 \n\t"
442 "psrlq $5, %%mm1 \n\t"
443 "psrlq $5, %%mm4 \n\t"
444 "pand %%mm6, %%mm1 \n\t"
445 "pand %%mm6, %%mm4 \n\t"
446 "psrlq $19, %%mm2 \n\t"
447 "psrlq $19, %%mm5 \n\t"
448 "pand %2, %%mm2 \n\t"
449 "pand %2, %%mm5 \n\t"
450 "por %%mm1, %%mm0 \n\t"
451 "por %%mm4, %%mm3 \n\t"
452 "por %%mm2, %%mm0 \n\t"
453 "por %%mm5, %%mm3 \n\t"
454 "psllq $16, %%mm3 \n\t"
455 "por %%mm3, %%mm0 \n\t"
456 MOVNTQ" %%mm0, %0 \n\t"
457 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461     __asm__ volatile(SFENCE:::"memory");
462     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one pixel, channels swapped */
465         register int rgb = *(const uint32_t*)s; s += 4;
466         *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * 32-bit RGB -> RGB555 (5-5-5). Same two-variant structure as rgb32to16:
 * a pmaddwd (mul3215/mask3215g) variant and a shift+mask variant with
 * red_15mask/green_15mask/blue_15mask; shifts differ to yield 5-bit green.
 */
470 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
472     const uint8_t *s = src;
474 #if COMPILE_TEMPLATE_MMX
475     const uint8_t *mm_end;
477     uint16_t *d = (uint16_t *)dst;
479 #if COMPILE_TEMPLATE_MMX
481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
483 "movq %3, %%mm5 \n\t"
484 "movq %4, %%mm6 \n\t"
485 "movq %5, %%mm7 \n\t"
489 PREFETCH" 32(%1) \n\t"
490 "movd (%1), %%mm0 \n\t"
491 "movd 4(%1), %%mm3 \n\t"
492 "punpckldq 8(%1), %%mm0 \n\t"
493 "punpckldq 12(%1), %%mm3 \n\t"
494 "movq %%mm0, %%mm1 \n\t"
495 "movq %%mm3, %%mm4 \n\t"
496 "pand %%mm6, %%mm0 \n\t"
497 "pand %%mm6, %%mm3 \n\t"
498 "pmaddwd %%mm7, %%mm0 \n\t"
499 "pmaddwd %%mm7, %%mm3 \n\t"
500 "pand %%mm5, %%mm1 \n\t"
501 "pand %%mm5, %%mm4 \n\t"
502 "por %%mm1, %%mm0 \n\t"
503 "por %%mm4, %%mm3 \n\t"
504 "psrld $6, %%mm0 \n\t"
505 "pslld $10, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ" %%mm0, (%0) \n\t"
514 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* alternative variant: explicit shift+mask per channel */
517     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
519 "movq %0, %%mm7 \n\t"
520 "movq %1, %%mm6 \n\t"
521 ::"m"(red_15mask),"m"(green_15mask));
525 "movd %1, %%mm0 \n\t"
526 "movd 4%1, %%mm3 \n\t"
527 "punpckldq 8%1, %%mm0 \n\t"
528 "punpckldq 12%1, %%mm3 \n\t"
529 "movq %%mm0, %%mm1 \n\t"
530 "movq %%mm0, %%mm2 \n\t"
531 "movq %%mm3, %%mm4 \n\t"
532 "movq %%mm3, %%mm5 \n\t"
533 "psrlq $3, %%mm0 \n\t"
534 "psrlq $3, %%mm3 \n\t"
535 "pand %2, %%mm0 \n\t"
536 "pand %2, %%mm3 \n\t"
537 "psrlq $6, %%mm1 \n\t"
538 "psrlq $6, %%mm4 \n\t"
539 "pand %%mm6, %%mm1 \n\t"
540 "pand %%mm6, %%mm4 \n\t"
541 "psrlq $9, %%mm2 \n\t"
542 "psrlq $9, %%mm5 \n\t"
543 "pand %%mm7, %%mm2 \n\t"
544 "pand %%mm7, %%mm5 \n\t"
545 "por %%mm1, %%mm0 \n\t"
546 "por %%mm4, %%mm3 \n\t"
547 "por %%mm2, %%mm0 \n\t"
548 "por %%mm5, %%mm3 \n\t"
549 "psllq $16, %%mm3 \n\t"
550 "por %%mm3, %%mm0 \n\t"
551 MOVNTQ" %%mm0, %0 \n\t"
552 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557     __asm__ volatile(SFENCE:::"memory");
558     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one pixel to 5-5-5 */
561         register int rgb = *(const uint32_t*)s; s += 4;
562         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * 32-bit RGB -> BGR555: like rgb32to15 but with R and B fields swapped
 * (blue shifted left by 7 into the red field, red shifted right by 19).
 */
566 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
568     const uint8_t *s = src;
570 #if COMPILE_TEMPLATE_MMX
571     const uint8_t *mm_end;
573     uint16_t *d = (uint16_t *)dst;
575 #if COMPILE_TEMPLATE_MMX
576     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
578 "movq %0, %%mm7 \n\t"
579 "movq %1, %%mm6 \n\t"
580 ::"m"(red_15mask),"m"(green_15mask));
585 "movd %1, %%mm0 \n\t"
586 "movd 4%1, %%mm3 \n\t"
587 "punpckldq 8%1, %%mm0 \n\t"
588 "punpckldq 12%1, %%mm3 \n\t"
589 "movq %%mm0, %%mm1 \n\t"
590 "movq %%mm0, %%mm2 \n\t"
591 "movq %%mm3, %%mm4 \n\t"
592 "movq %%mm3, %%mm5 \n\t"
593 "psllq $7, %%mm0 \n\t"
594 "psllq $7, %%mm3 \n\t"
595 "pand %%mm7, %%mm0 \n\t"
596 "pand %%mm7, %%mm3 \n\t"
597 "psrlq $6, %%mm1 \n\t"
598 "psrlq $6, %%mm4 \n\t"
599 "pand %%mm6, %%mm1 \n\t"
600 "pand %%mm6, %%mm4 \n\t"
601 "psrlq $19, %%mm2 \n\t"
602 "psrlq $19, %%mm5 \n\t"
603 "pand %2, %%mm2 \n\t"
604 "pand %2, %%mm5 \n\t"
605 "por %%mm1, %%mm0 \n\t"
606 "por %%mm4, %%mm3 \n\t"
607 "por %%mm2, %%mm0 \n\t"
608 "por %%mm5, %%mm3 \n\t"
609 "psllq $16, %%mm3 \n\t"
610 "por %%mm3, %%mm0 \n\t"
611 MOVNTQ" %%mm0, %0 \n\t"
612 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
616     __asm__ volatile(SFENCE:::"memory");
617     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one pixel, channels swapped */
620         register int rgb = *(const uint32_t*)s; s += 4;
621         *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * Packed 24-bit RGB -> BGR565. MMX loop loads 4 pixels (12 bytes) per
 * iteration via movd/punpckldq at offsets 0,3,6,9, then shifts+masks
 * each channel into the 5-6-5 layout. Scalar tail assembles one pixel
 * from b, g, r bytes (their loads are elided in this view).
 */
625 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
627     const uint8_t *s = src;
629 #if COMPILE_TEMPLATE_MMX
630     const uint8_t *mm_end;
632     uint16_t *d = (uint16_t *)dst;
634 #if COMPILE_TEMPLATE_MMX
635     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
637 "movq %0, %%mm7 \n\t"
638 "movq %1, %%mm6 \n\t"
639 ::"m"(red_16mask),"m"(green_16mask));
644 "movd %1, %%mm0 \n\t"
645 "movd 3%1, %%mm3 \n\t"
646 "punpckldq 6%1, %%mm0 \n\t"
647 "punpckldq 9%1, %%mm3 \n\t"
648 "movq %%mm0, %%mm1 \n\t"
649 "movq %%mm0, %%mm2 \n\t"
650 "movq %%mm3, %%mm4 \n\t"
651 "movq %%mm3, %%mm5 \n\t"
652 "psrlq $3, %%mm0 \n\t"
653 "psrlq $3, %%mm3 \n\t"
654 "pand %2, %%mm0 \n\t"
655 "pand %2, %%mm3 \n\t"
656 "psrlq $5, %%mm1 \n\t"
657 "psrlq $5, %%mm4 \n\t"
658 "pand %%mm6, %%mm1 \n\t"
659 "pand %%mm6, %%mm4 \n\t"
660 "psrlq $8, %%mm2 \n\t"
661 "psrlq $8, %%mm5 \n\t"
662 "pand %%mm7, %%mm2 \n\t"
663 "pand %%mm7, %%mm5 \n\t"
664 "por %%mm1, %%mm0 \n\t"
665 "por %%mm4, %%mm3 \n\t"
666 "por %%mm2, %%mm0 \n\t"
667 "por %%mm5, %%mm3 \n\t"
668 "psllq $16, %%mm3 \n\t"
669 "por %%mm3, %%mm0 \n\t"
670 MOVNTQ" %%mm0, %0 \n\t"
671 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
675     __asm__ volatile(SFENCE:::"memory");
676     __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one b,g,r triple into 5-6-5 */
682         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit RGB -> RGB565 (opposite R/B placement to rgb24tobgr16:
 * here blue is shifted left by 8 into the red mask, red right by 19).
 */
686 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
688     const uint8_t *s = src;
690 #if COMPILE_TEMPLATE_MMX
691     const uint8_t *mm_end;
693     uint16_t *d = (uint16_t *)dst;
695 #if COMPILE_TEMPLATE_MMX
696     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
698 "movq %0, %%mm7 \n\t"
699 "movq %1, %%mm6 \n\t"
700 ::"m"(red_16mask),"m"(green_16mask));
705 "movd %1, %%mm0 \n\t"
706 "movd 3%1, %%mm3 \n\t"
707 "punpckldq 6%1, %%mm0 \n\t"
708 "punpckldq 9%1, %%mm3 \n\t"
709 "movq %%mm0, %%mm1 \n\t"
710 "movq %%mm0, %%mm2 \n\t"
711 "movq %%mm3, %%mm4 \n\t"
712 "movq %%mm3, %%mm5 \n\t"
713 "psllq $8, %%mm0 \n\t"
714 "psllq $8, %%mm3 \n\t"
715 "pand %%mm7, %%mm0 \n\t"
716 "pand %%mm7, %%mm3 \n\t"
717 "psrlq $5, %%mm1 \n\t"
718 "psrlq $5, %%mm4 \n\t"
719 "pand %%mm6, %%mm1 \n\t"
720 "pand %%mm6, %%mm4 \n\t"
721 "psrlq $19, %%mm2 \n\t"
722 "psrlq $19, %%mm5 \n\t"
723 "pand %2, %%mm2 \n\t"
724 "pand %2, %%mm5 \n\t"
725 "por %%mm1, %%mm0 \n\t"
726 "por %%mm4, %%mm3 \n\t"
727 "por %%mm2, %%mm0 \n\t"
728 "por %%mm5, %%mm3 \n\t"
729 "psllq $16, %%mm3 \n\t"
730 "por %%mm3, %%mm0 \n\t"
731 MOVNTQ" %%mm0, %0 \n\t"
732 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
736     __asm__ volatile(SFENCE:::"memory");
737     __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one b,g,r triple into 5-6-5 */
743         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit RGB -> BGR555: shift+mask each channel into 5-5-5 with
 * red_15mask/green_15mask/blue_15mask; 4 pixels per MMX iteration.
 */
747 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
749     const uint8_t *s = src;
751 #if COMPILE_TEMPLATE_MMX
752     const uint8_t *mm_end;
754     uint16_t *d = (uint16_t *)dst;
756 #if COMPILE_TEMPLATE_MMX
757     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
759 "movq %0, %%mm7 \n\t"
760 "movq %1, %%mm6 \n\t"
761 ::"m"(red_15mask),"m"(green_15mask));
766 "movd %1, %%mm0 \n\t"
767 "movd 3%1, %%mm3 \n\t"
768 "punpckldq 6%1, %%mm0 \n\t"
769 "punpckldq 9%1, %%mm3 \n\t"
770 "movq %%mm0, %%mm1 \n\t"
771 "movq %%mm0, %%mm2 \n\t"
772 "movq %%mm3, %%mm4 \n\t"
773 "movq %%mm3, %%mm5 \n\t"
774 "psrlq $3, %%mm0 \n\t"
775 "psrlq $3, %%mm3 \n\t"
776 "pand %2, %%mm0 \n\t"
777 "pand %2, %%mm3 \n\t"
778 "psrlq $6, %%mm1 \n\t"
779 "psrlq $6, %%mm4 \n\t"
780 "pand %%mm6, %%mm1 \n\t"
781 "pand %%mm6, %%mm4 \n\t"
782 "psrlq $9, %%mm2 \n\t"
783 "psrlq $9, %%mm5 \n\t"
784 "pand %%mm7, %%mm2 \n\t"
785 "pand %%mm7, %%mm5 \n\t"
786 "por %%mm1, %%mm0 \n\t"
787 "por %%mm4, %%mm3 \n\t"
788 "por %%mm2, %%mm0 \n\t"
789 "por %%mm5, %%mm3 \n\t"
790 "psllq $16, %%mm3 \n\t"
791 "por %%mm3, %%mm0 \n\t"
792 MOVNTQ" %%mm0, %0 \n\t"
793 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
797     __asm__ volatile(SFENCE:::"memory");
798     __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one b,g,r triple into 5-5-5 */
804         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * Packed 24-bit RGB -> RGB555 (R/B swapped vs rgb24tobgr15: blue shifted
 * left by 7 into the red mask, red right by 19 into the blue mask).
 */
808 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
810     const uint8_t *s = src;
812 #if COMPILE_TEMPLATE_MMX
813     const uint8_t *mm_end;
815     uint16_t *d = (uint16_t *)dst;
817 #if COMPILE_TEMPLATE_MMX
818     __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
820 "movq %0, %%mm7 \n\t"
821 "movq %1, %%mm6 \n\t"
822 ::"m"(red_15mask),"m"(green_15mask));
827 "movd %1, %%mm0 \n\t"
828 "movd 3%1, %%mm3 \n\t"
829 "punpckldq 6%1, %%mm0 \n\t"
830 "punpckldq 9%1, %%mm3 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "movq %%mm0, %%mm2 \n\t"
833 "movq %%mm3, %%mm4 \n\t"
834 "movq %%mm3, %%mm5 \n\t"
835 "psllq $7, %%mm0 \n\t"
836 "psllq $7, %%mm3 \n\t"
837 "pand %%mm7, %%mm0 \n\t"
838 "pand %%mm7, %%mm3 \n\t"
839 "psrlq $6, %%mm1 \n\t"
840 "psrlq $6, %%mm4 \n\t"
841 "pand %%mm6, %%mm1 \n\t"
842 "pand %%mm6, %%mm4 \n\t"
843 "psrlq $19, %%mm2 \n\t"
844 "psrlq $19, %%mm5 \n\t"
845 "pand %2, %%mm2 \n\t"
846 "pand %2, %%mm5 \n\t"
847 "por %%mm1, %%mm0 \n\t"
848 "por %%mm4, %%mm3 \n\t"
849 "por %%mm2, %%mm0 \n\t"
850 "por %%mm5, %%mm3 \n\t"
851 "psllq $16, %%mm3 \n\t"
852 "por %%mm3, %%mm0 \n\t"
853 MOVNTQ" %%mm0, %0 \n\t"
854 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
858     __asm__ volatile(SFENCE:::"memory");
859     __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one b,g,r triple into 5-5-5 */
865         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
870 I use less accurate approximation here by simply left-shifting the input
871 value and filling the low order bits with zeroes. This method improves PNG
872 compression but this scheme cannot reproduce white exactly, since it does
873 not generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | leftmost bits repeated to fill open bits
/*
 * RGB555 -> packed 24-bit BGR. MMX loop expands two quadwords (8 input
 * pixels) per iteration: isolate B/G/R with mask15b/mask15g/mask15r,
 * align with psllq $3 / psrlq $2 / psrlq $7, widen words to dwords with
 * punpcklwd/punpckhwd against mmx_null, then OR the channels together;
 * the 32->24 repack below reuses the rgb32tobgr24 shuffle. The scalar
 * tail simply left-fills low bits with zeroes (see comment above about
 * the less accurate approximation).
 */
890 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
893 #if COMPILE_TEMPLATE_MMX
894     const uint16_t *mm_end;
897     const uint16_t *s = (const uint16_t*)src;
898     end = s + src_size/2;
899 #if COMPILE_TEMPLATE_MMX
900     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
905 "movq %1, %%mm0 \n\t"
906 "movq %1, %%mm1 \n\t"
907 "movq %1, %%mm2 \n\t"
908 "pand %2, %%mm0 \n\t"
909 "pand %3, %%mm1 \n\t"
910 "pand %4, %%mm2 \n\t"
911 "psllq $3, %%mm0 \n\t"
912 "psrlq $2, %%mm1 \n\t"
913 "psrlq $7, %%mm2 \n\t"
914 "movq %%mm0, %%mm3 \n\t"
915 "movq %%mm1, %%mm4 \n\t"
916 "movq %%mm2, %%mm5 \n\t"
917 "punpcklwd %5, %%mm0 \n\t"
918 "punpcklwd %5, %%mm1 \n\t"
919 "punpcklwd %5, %%mm2 \n\t"
920 "punpckhwd %5, %%mm3 \n\t"
921 "punpckhwd %5, %%mm4 \n\t"
922 "punpckhwd %5, %%mm5 \n\t"
923 "psllq $8, %%mm1 \n\t"
924 "psllq $16, %%mm2 \n\t"
925 "por %%mm1, %%mm0 \n\t"
926 "por %%mm2, %%mm0 \n\t"
927 "psllq $8, %%mm4 \n\t"
928 "psllq $16, %%mm5 \n\t"
929 "por %%mm4, %%mm3 \n\t"
930 "por %%mm5, %%mm3 \n\t"
932 "movq %%mm0, %%mm6 \n\t"
933 "movq %%mm3, %%mm7 \n\t"
/* second quadword (next 4 pixels), same expansion */
935 "movq 8%1, %%mm0 \n\t"
936 "movq 8%1, %%mm1 \n\t"
937 "movq 8%1, %%mm2 \n\t"
938 "pand %2, %%mm0 \n\t"
939 "pand %3, %%mm1 \n\t"
940 "pand %4, %%mm2 \n\t"
941 "psllq $3, %%mm0 \n\t"
942 "psrlq $2, %%mm1 \n\t"
943 "psrlq $7, %%mm2 \n\t"
944 "movq %%mm0, %%mm3 \n\t"
945 "movq %%mm1, %%mm4 \n\t"
946 "movq %%mm2, %%mm5 \n\t"
947 "punpcklwd %5, %%mm0 \n\t"
948 "punpcklwd %5, %%mm1 \n\t"
949 "punpcklwd %5, %%mm2 \n\t"
950 "punpckhwd %5, %%mm3 \n\t"
951 "punpckhwd %5, %%mm4 \n\t"
952 "punpckhwd %5, %%mm5 \n\t"
953 "psllq $8, %%mm1 \n\t"
954 "psllq $16, %%mm2 \n\t"
955 "por %%mm1, %%mm0 \n\t"
956 "por %%mm2, %%mm0 \n\t"
957 "psllq $8, %%mm4 \n\t"
958 "psllq $16, %%mm5 \n\t"
959 "por %%mm4, %%mm3 \n\t"
960 "por %%mm5, %%mm3 \n\t"
963 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965 /* borrowed 32 to 24 */
967 "movq %%mm0, %%mm4 \n\t"
968 "movq %%mm3, %%mm5 \n\t"
969 "movq %%mm6, %%mm0 \n\t"
970 "movq %%mm7, %%mm1 \n\t"
972 "movq %%mm4, %%mm6 \n\t"
973 "movq %%mm5, %%mm7 \n\t"
974 "movq %%mm0, %%mm2 \n\t"
975 "movq %%mm1, %%mm3 \n\t"
985     __asm__ volatile(SFENCE:::"memory");
986     __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand one 5-5-5 pixel to three bytes */
989         register uint16_t bgr;
991         *d++ = (bgr&0x1F)<<3;
992         *d++ = (bgr&0x3E0)>>2;
993         *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> packed 24-bit BGR. Identical structure to rgb15tobgr24 but
 * with 5-6-5 masks (mask16b/mask16g/mask16r) and shift amounts
 * (psllq $3 / psrlq $3 / psrlq $8) matching the 6-bit green field.
 */
997 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1000 #if COMPILE_TEMPLATE_MMX
1001     const uint16_t *mm_end;
1003     uint8_t *d = (uint8_t *)dst;
1004     const uint16_t *s = (const uint16_t *)src;
1005     end = s + src_size/2;
1006 #if COMPILE_TEMPLATE_MMX
1007     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1009     while (s < mm_end) {
1011 PREFETCH" 32%1 \n\t"
1012 "movq %1, %%mm0 \n\t"
1013 "movq %1, %%mm1 \n\t"
1014 "movq %1, %%mm2 \n\t"
1015 "pand %2, %%mm0 \n\t"
1016 "pand %3, %%mm1 \n\t"
1017 "pand %4, %%mm2 \n\t"
1018 "psllq $3, %%mm0 \n\t"
1019 "psrlq $3, %%mm1 \n\t"
1020 "psrlq $8, %%mm2 \n\t"
1021 "movq %%mm0, %%mm3 \n\t"
1022 "movq %%mm1, %%mm4 \n\t"
1023 "movq %%mm2, %%mm5 \n\t"
1024 "punpcklwd %5, %%mm0 \n\t"
1025 "punpcklwd %5, %%mm1 \n\t"
1026 "punpcklwd %5, %%mm2 \n\t"
1027 "punpckhwd %5, %%mm3 \n\t"
1028 "punpckhwd %5, %%mm4 \n\t"
1029 "punpckhwd %5, %%mm5 \n\t"
1030 "psllq $8, %%mm1 \n\t"
1031 "psllq $16, %%mm2 \n\t"
1032 "por %%mm1, %%mm0 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psllq $8, %%mm4 \n\t"
1035 "psllq $16, %%mm5 \n\t"
1036 "por %%mm4, %%mm3 \n\t"
1037 "por %%mm5, %%mm3 \n\t"
1039 "movq %%mm0, %%mm6 \n\t"
1040 "movq %%mm3, %%mm7 \n\t"
/* second quadword (next 4 pixels), same expansion */
1042 "movq 8%1, %%mm0 \n\t"
1043 "movq 8%1, %%mm1 \n\t"
1044 "movq 8%1, %%mm2 \n\t"
1045 "pand %2, %%mm0 \n\t"
1046 "pand %3, %%mm1 \n\t"
1047 "pand %4, %%mm2 \n\t"
1048 "psllq $3, %%mm0 \n\t"
1049 "psrlq $3, %%mm1 \n\t"
1050 "psrlq $8, %%mm2 \n\t"
1051 "movq %%mm0, %%mm3 \n\t"
1052 "movq %%mm1, %%mm4 \n\t"
1053 "movq %%mm2, %%mm5 \n\t"
1054 "punpcklwd %5, %%mm0 \n\t"
1055 "punpcklwd %5, %%mm1 \n\t"
1056 "punpcklwd %5, %%mm2 \n\t"
1057 "punpckhwd %5, %%mm3 \n\t"
1058 "punpckhwd %5, %%mm4 \n\t"
1059 "punpckhwd %5, %%mm5 \n\t"
1060 "psllq $8, %%mm1 \n\t"
1061 "psllq $16, %%mm2 \n\t"
1062 "por %%mm1, %%mm0 \n\t"
1063 "por %%mm2, %%mm0 \n\t"
1064 "psllq $8, %%mm4 \n\t"
1065 "psllq $16, %%mm5 \n\t"
1066 "por %%mm4, %%mm3 \n\t"
1067 "por %%mm5, %%mm3 \n\t"
1069 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1071 /* borrowed 32 to 24 */
1073 "movq %%mm0, %%mm4 \n\t"
1074 "movq %%mm3, %%mm5 \n\t"
1075 "movq %%mm6, %%mm0 \n\t"
1076 "movq %%mm7, %%mm1 \n\t"
1078 "movq %%mm4, %%mm6 \n\t"
1079 "movq %%mm5, %%mm7 \n\t"
1080 "movq %%mm0, %%mm2 \n\t"
1081 "movq %%mm1, %%mm3 \n\t"
1091     __asm__ volatile(SFENCE:::"memory");
1092     __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand one 5-6-5 pixel to three bytes */
1095         register uint16_t bgr;
1097         *d++ = (bgr&0x1F)<<3;
1098         *d++ = (bgr&0x7E0)>>3;
1099         *d++ = (bgr&0xF800)>>8;
/*
 * PACK_RGB32: interleave four widened B/G/R word lanes (mm0/mm1/mm2, with
 * mm7 = zero and mm6 = all-ones for the alpha byte) into four 32-bit
 * pixels and store 16 bytes with two MOVNTQs. Expected register layout:
 */
1104  * mm0 = 00 B3 00 B2 00 B1 00 B0
1105  * mm1 = 00 G3 00 G2 00 G1 00 G0
1106  * mm2 = 00 R3 00 R2 00 R1 00 R0
1107  * mm6 = FF FF FF FF FF FF FF FF
1108  * mm7 = 00 00 00 00 00 00 00 00
1110 #define PACK_RGB32 \
1111 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1112 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1113 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1114 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1115 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1116 "movq %%mm0, %%mm3 \n\t" \
1117 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1118 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1119 MOVNTQ" %%mm0, %0 \n\t" \
1120 MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * RGB555 -> 32-bit. MMX loop expands channels with mask15b/mask15g/mask15r
 * then (presumably) emits pixels via PACK_RGB32 — mm7 zeroed and mm6 set
 * to all-ones match that macro's expected layout. Scalar tail writes the
 * bytes in endian-dependent order (both orders visible below).
 */
1122 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1124     const uint16_t *end;
1125 #if COMPILE_TEMPLATE_MMX
1126     const uint16_t *mm_end;
1129     const uint16_t *s = (const uint16_t *)src;
1130     end = s + src_size/2;
1131 #if COMPILE_TEMPLATE_MMX
1132     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1133     __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1134     __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1136     while (s < mm_end) {
1138 PREFETCH" 32%1 \n\t"
1139 "movq %1, %%mm0 \n\t"
1140 "movq %1, %%mm1 \n\t"
1141 "movq %1, %%mm2 \n\t"
1142 "pand %2, %%mm0 \n\t"
1143 "pand %3, %%mm1 \n\t"
1144 "pand %4, %%mm2 \n\t"
1145 "psllq $3, %%mm0 \n\t"
1146 "psrlq $2, %%mm1 \n\t"
1147 "psrlq $7, %%mm2 \n\t"
1150 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1155     __asm__ volatile(SFENCE:::"memory");
1156     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one 5-5-5 pixel to 4 bytes (order depends on endianness) */
1159         register uint16_t bgr;
1163         *d++ = (bgr&0x7C00)>>7;
1164         *d++ = (bgr&0x3E0)>>2;
1165         *d++ = (bgr&0x1F)<<3;
1167         *d++ = (bgr&0x1F)<<3;
1168         *d++ = (bgr&0x3E0)>>2;
1169         *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> 32-bit. Same structure as rgb15to32 with 5-6-5 masks
 * (mask16b/mask16g/mask16r) and shifts psllq $3 / psrlq $3 / psrlq $8.
 */
1175 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1177     const uint16_t *end;
1178 #if COMPILE_TEMPLATE_MMX
1179     const uint16_t *mm_end;
1182     const uint16_t *s = (const uint16_t*)src;
1183     end = s + src_size/2;
1184 #if COMPILE_TEMPLATE_MMX
1185     __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1186     __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1187     __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1189     while (s < mm_end) {
1191 PREFETCH" 32%1 \n\t"
1192 "movq %1, %%mm0 \n\t"
1193 "movq %1, %%mm1 \n\t"
1194 "movq %1, %%mm2 \n\t"
1195 "pand %2, %%mm0 \n\t"
1196 "pand %3, %%mm1 \n\t"
1197 "pand %4, %%mm2 \n\t"
1198 "psllq $3, %%mm0 \n\t"
1199 "psrlq $3, %%mm1 \n\t"
1200 "psrlq $8, %%mm2 \n\t"
1203 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1208     __asm__ volatile(SFENCE:::"memory");
1209     __asm__ volatile(EMMS:::"memory");
/* scalar tail: one 5-6-5 pixel to 4 bytes (order depends on endianness) */
1212         register uint16_t bgr;
1216         *d++ = (bgr&0xF800)>>8;
1217         *d++ = (bgr&0x7E0)>>3;
1218         *d++ = (bgr&0x1F)<<3;
1220         *d++ = (bgr&0x1F)<<3;
1221         *d++ = (bgr&0x7E0)>>3;
1222         *d++ = (bgr&0xF800)>>8;
/*
 * Swap bytes 0 and 2 of every 32-bit word (e.g. RGBA <-> BGRA).
 * Uses the negative-index trick: idx runs from -(src_size-15) up to 15 so
 * the loop bound is a constant. MMX2 path uses pshufw $177 plus masking;
 * the plain-MMX path shifts 16-bit halves with pslld/psrld. The C tail
 * rewrites one word as (v>>16) + (v & 0xff00ff00) + (v<<16).
 */
1228 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1230     x86_reg idx = 15 - src_size;
1231     const uint8_t *s = src-idx;
1232     uint8_t *d = dst-idx;
1233 #if COMPILE_TEMPLATE_MMX
1237 PREFETCH" (%1, %0) \n\t"
/* mm7 = mask selecting the bytes kept in place; mm6 = its complement */
1238 "movq %3, %%mm7 \n\t"
1239 "pxor %4, %%mm7 \n\t"
1240 "movq %%mm7, %%mm6 \n\t"
1241 "pxor %5, %%mm7 \n\t"
1244 PREFETCH" 32(%1, %0) \n\t"
1245 "movq (%1, %0), %%mm0 \n\t"
1246 "movq 8(%1, %0), %%mm1 \n\t"
1247 # if COMPILE_TEMPLATE_MMX2
1248 "pshufw $177, %%mm0, %%mm3 \n\t"
1249 "pshufw $177, %%mm1, %%mm5 \n\t"
1250 "pand %%mm7, %%mm0 \n\t"
1251 "pand %%mm6, %%mm3 \n\t"
1252 "pand %%mm7, %%mm1 \n\t"
1253 "pand %%mm6, %%mm5 \n\t"
1254 "por %%mm3, %%mm0 \n\t"
1255 "por %%mm5, %%mm1 \n\t"
/* plain-MMX alternative: shift halves into place */
1257 "movq %%mm0, %%mm2 \n\t"
1258 "movq %%mm1, %%mm4 \n\t"
1259 "pand %%mm7, %%mm0 \n\t"
1260 "pand %%mm6, %%mm2 \n\t"
1261 "pand %%mm7, %%mm1 \n\t"
1262 "pand %%mm6, %%mm4 \n\t"
1263 "movq %%mm2, %%mm3 \n\t"
1264 "movq %%mm4, %%mm5 \n\t"
1265 "pslld $16, %%mm2 \n\t"
1266 "psrld $16, %%mm3 \n\t"
1267 "pslld $16, %%mm4 \n\t"
1268 "psrld $16, %%mm5 \n\t"
1269 "por %%mm2, %%mm0 \n\t"
1270 "por %%mm4, %%mm1 \n\t"
1271 "por %%mm3, %%mm0 \n\t"
1272 "por %%mm5, %%mm1 \n\t"
1274 MOVNTQ" %%mm0, (%2, %0) \n\t"
1275 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1282 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* C tail: swap bytes 0 and 2 of each remaining 32-bit word */
1285     for (; idx<15; idx+=4) {
1286         register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1288         *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * Swap R and B in packed 24-bit pixels. MMX loop handles 24 bytes
 * (8 pixels) per iteration using mask24r/mask24g/mask24b and overlapping
 * loads at offsets 0,2,6,8,10,14,16,18; uses the same negative-index
 * trick as shuffle_bytes_2103 (mmx_size = 23 - src_size, REG_a counts
 * up to 0). The C tail swaps bytes 0 and 2 of each remaining triple.
 */
1292 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1295 #if COMPILE_TEMPLATE_MMX
1296     x86_reg mmx_size= 23 - src_size;
1298 "test %%"REG_a", %%"REG_a" \n\t"
1300 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1301 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1302 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1305 PREFETCH" 32(%1, %%"REG_a") \n\t"
1306 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1307 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1308 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1309 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1310 "pand %%mm5, %%mm0 \n\t"
1311 "pand %%mm6, %%mm1 \n\t"
1312 "pand %%mm7, %%mm2 \n\t"
1313 "por %%mm0, %%mm1 \n\t"
1314 "por %%mm2, %%mm1 \n\t"
1315 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1316 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1317 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1318 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1319 "pand %%mm7, %%mm0 \n\t"
1320 "pand %%mm5, %%mm1 \n\t"
1321 "pand %%mm6, %%mm2 \n\t"
1322 "por %%mm0, %%mm1 \n\t"
1323 "por %%mm2, %%mm1 \n\t"
1324 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1325 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1326 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1327 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1328 "pand %%mm6, %%mm0 \n\t"
1329 "pand %%mm7, %%mm1 \n\t"
1330 "pand %%mm5, %%mm2 \n\t"
1331 "por %%mm0, %%mm1 \n\t"
1332 "por %%mm2, %%mm1 \n\t"
1333 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1334 "add $24, %%"REG_a" \n\t"
1338 : "r" (src-mmx_size), "r"(dst-mmx_size)
1341     __asm__ volatile(SFENCE:::"memory");
1342     __asm__ volatile(EMMS:::"memory");
1344     if (mmx_size==23) return; //finished, was multiple of 8
/* C tail: swap R and B of each remaining pixel */
1348     src_size= 23-mmx_size;
1352     for (i=0; i<src_size; i+=3) {
1355         dst[i + 1] = src[i + 1];
1356         dst[i + 2] = src[i + 0];
/*
 * Interleave planar YUV (Y plane + subsampled U/V planes) into packed
 * YUYV. Per output line: MMX path punpcks U/V into UVUV words then
 * merges with 16 Y bytes per iteration; Alpha/MVI path uses unpkbw/unpkbl;
 * 64-bit C path packs two pixels per uint64_t; 32-bit C path packs one
 * pixel per int32_t with a big/little-endian variant each.
 * Chroma pointers advance only every vertLumPerChroma luma lines
 * (e.g. 2 for 4:2:0). Width is assumed even (chromWidth = width>>1).
 */
1361 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362                                            long width, long height,
1363                                            long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1366     const x86_reg chromWidth= width>>1;
1367     for (y=0; y<height; y++) {
1368 #if COMPILE_TEMPLATE_MMX
1369 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1371 "xor %%"REG_a", %%"REG_a" \n\t"
1374 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1375 PREFETCH" 32(%2, %%"REG_a") \n\t"
1376 PREFETCH" 32(%3, %%"REG_a") \n\t"
1377 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1378 "movq %%mm0, %%mm2 \n\t" // U(0)
1379 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1380 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1381 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1383 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1384 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1385 "movq %%mm3, %%mm4 \n\t" // Y(0)
1386 "movq %%mm5, %%mm6 \n\t" // Y(8)
1387 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1388 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1389 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1390 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1392 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1393 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1394 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1395 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1397 "add $8, %%"REG_a" \n\t"
1398 "cmp %4, %%"REG_a" \n\t"
1400 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1405 #if ARCH_ALPHA && HAVE_MVI
1406 #define pl2yuy2(n) \
1411     __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1412     __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1413     __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1414     __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1415     yuv1 = (u << 8) + (v << 24); \
1422         uint64_t *qdst = (uint64_t *) dst;
1423         uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424         const uint32_t *yc = (uint32_t *) ysrc;
1425         const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426         const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427         for (i = 0; i < chromWidth; i += 8) {
1428             uint64_t y1, y2, yuv1, yuv2;
/* ldq $31 = prefetch-style dummy loads on Alpha */
1431             __asm__("ldq $31,64(%0)" :: "r"(yc));
1432             __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433             __asm__("ldq $31,64(%0)" :: "r"(uc));
1434             __asm__("ldq $31,64(%0)" :: "r"(vc));
1452 #elif HAVE_FAST_64BIT
1454         uint64_t *ldst = (uint64_t *) dst;
1455         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456         for (i = 0; i < chromWidth; i += 2) {
1458             k = yc[0] + (uc[0] << 8) +
1459                 (yc[1] << 16) + (vc[0] << 24);
1460             l = yc[2] + (uc[1] << 8) +
1461                 (yc[3] << 16) + (vc[1] << 24);
1462             *ldst++ = k + (l << 32);
1469         int i, *idst = (int32_t *) dst;
1470         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471         for (i = 0; i < chromWidth; i++) {
1473             *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474                       (yc[1] << 8) + (vc[0] << 0);
1476             *idst++ = yc[0] + (uc[0] << 8) +
1477                       (yc[1] << 16) + (vc[0] << 24);
/* advance chroma rows only every vertLumPerChroma luma rows */
1485         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486             usrc += chromStride;
1487             vsrc += chromStride;
1492 #if COMPILE_TEMPLATE_MMX
1500 * Height should be a multiple of 2 and width should be a multiple of 16.
1501 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * YV12 -> YUY2 wrapper: one chroma line per two luma lines
 * (vertLumPerChroma = 2), nearest-neighbour chroma (no interpolation).
 */
1503 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504 long width, long height,
1505 long lumStride, long chromStride, long dstStride)
1507 //FIXME interpolate chroma
1508 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Pack planar YUV into interleaved UYVY (byte order U0 Y0 V0 Y1 ...).
 * vertLumPerChroma = number of luma lines sharing one chroma line
 * (power of two; 2 for YV12 input, 1 for YUV422P input).
 * Same structure as yuvPlanartoyuy2 but with chroma in the even bytes.
 */
1511 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512 long width, long height,
1513 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1516 const x86_reg chromWidth= width>>1;
1517 for (y=0; y<height; y++) {
1518 #if COMPILE_TEMPLATE_MMX
1519 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1521 "xor %%"REG_a", %%"REG_a" \n\t"
1524 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1525 PREFETCH" 32(%2, %%"REG_a") \n\t"
1526 PREFETCH" 32(%3, %%"REG_a") \n\t"
1527 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1528 "movq %%mm0, %%mm2 \n\t" // U(0)
1529 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1530 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1531 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1533 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1534 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
/* NOTE(review): unlike the YUY2 variant, here the UVUV registers are the
 * punpck destinations so chroma lands in the even (first) byte positions. */
1535 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1536 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1537 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1538 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1539 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1540 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1542 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1543 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1544 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1545 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1547 "add $8, %%"REG_a" \n\t"
1548 "cmp %4, %%"REG_a" \n\t"
1550 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1554 //FIXME adapt the Alpha ASM code from yv12->yuy2
/* Generic 64-bit C path: two packed UYVY dwords per qword store. */
1558 uint64_t *ldst = (uint64_t *) dst;
1559 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560 for (i = 0; i < chromWidth; i += 2) {
1562 k = uc[0] + (yc[0] << 8) +
1563 (vc[0] << 16) + (yc[1] << 24);
1564 l = uc[1] + (yc[2] << 8) +
1565 (vc[1] << 16) + (yc[3] << 24);
1566 *ldst++ = k + (l << 32);
/* 32-bit C path: big-endian packing first, little-endian second. */
1573 int i, *idst = (int32_t *) dst;
1574 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575 for (i = 0; i < chromWidth; i++) {
1577 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578 (vc[0] << 8) + (yc[1] << 0);
1580 *idst++ = uc[0] + (yc[0] << 8) +
1581 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma pointers only once every vertLumPerChroma luma lines. */
1589 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590 usrc += chromStride;
1591 vsrc += chromStride;
1596 #if COMPILE_TEMPLATE_MMX
1604 * Height should be a multiple of 2 and width should be a multiple of 16
1605 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * YV12 -> UYVY wrapper: one chroma line per two luma lines
 * (vertLumPerChroma = 2), nearest-neighbour chroma (no interpolation).
 */
1607 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608 long width, long height,
1609 long lumStride, long chromStride, long dstStride)
1611 //FIXME interpolate chroma
1612 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1616 * Width should be a multiple of 16.
/*
 * YUV422P -> UYVY wrapper: chroma has full vertical resolution,
 * so one chroma line per luma line (vertLumPerChroma = 1).
 */
1618 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619 long width, long height,
1620 long lumStride, long chromStride, long dstStride)
1622 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1626 * Width should be a multiple of 16.
/*
 * YUV422P -> YUY2 wrapper: chroma has full vertical resolution,
 * so one chroma line per luma line (vertLumPerChroma = 1).
 */
1628 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629 long width, long height,
1630 long lumStride, long chromStride, long dstStride)
1632 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1636 * Height should be a multiple of 2 and width should be a multiple of 16.
1637 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * Unpack interleaved YUY2 into planar YV12.
 * Processes two source lines per loop iteration: the even line yields
 * both luma and chroma, the odd line contributes luma only (chroma of
 * odd lines is discarded — see the file-level comment above).
 */
1639 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640 long width, long height,
1641 long lumStride, long chromStride, long srcStride)
1644 const x86_reg chromWidth= width>>1;
1645 for (y=0; y<height; y+=2) {
1646 #if COMPILE_TEMPLATE_MMX
/* First asm: split 16 YUYV pixels into 16 Y bytes plus 8 U and 8 V bytes.
 * mm7 = 0x00FF...00FF mask selects the even (Y) bytes; psrlw $8 selects
 * the odd (chroma) bytes. */
1648 "xor %%"REG_a", %%"REG_a" \n\t"
1649 "pcmpeqw %%mm7, %%mm7 \n\t"
1650 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1653 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1654 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1655 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1656 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1657 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1658 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1659 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1660 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1661 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1662 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1663 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1665 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1667 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1668 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1669 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1670 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1671 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1672 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1673 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1674 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1675 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1676 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1678 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Separate the interleaved UVUV pairs into the U and V planes. */
1680 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1681 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1682 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1683 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1684 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1685 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1686 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1687 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1689 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1690 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1692 "add $8, %%"REG_a" \n\t"
1693 "cmp %4, %%"REG_a" \n\t"
1695 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696 : "memory", "%"REG_a
/* Second asm: odd source line — extract the Y bytes only (mm7 mask is
 * still live from the first asm). */
1703 "xor %%"REG_a", %%"REG_a" \n\t"
1706 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1707 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1708 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1709 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1710 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1711 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1712 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1713 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1714 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1715 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1716 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1718 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1719 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1721 "add $8, %%"REG_a" \n\t"
1722 "cmp %4, %%"REG_a" \n\t"
1725 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726 : "memory", "%"REG_a
/* C fallback: even line — de-interleave Y, U and V. */
1730 for (i=0; i<chromWidth; i++) {
1731 ydst[2*i+0] = src[4*i+0];
1732 udst[i] = src[4*i+1];
1733 ydst[2*i+1] = src[4*i+2];
1734 vdst[i] = src[4*i+3];
/* C fallback: odd line — luma only. */
1739 for (i=0; i<chromWidth; i++) {
1740 ydst[2*i+0] = src[4*i+0];
1741 ydst[2*i+1] = src[4*i+2];
1744 udst += chromStride;
1745 vdst += chromStride;
1749 #if COMPILE_TEMPLATE_MMX
1750 __asm__ volatile(EMMS" \n\t"
/*
 * Upscale one plane by 2x in both directions using bilinear-style
 * 3:1 / 1:3 weighted interpolation ((3*a + b) >> 2).
 * Top and bottom rows and the last column are handled separately;
 * the MMX2/3DNOW path uses PAVGB twice to approximate the 3:1 blend.
 */
1756 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* First output row: horizontal interpolation only. */
1763 for (x=0; x<srcWidth-1; x++) {
1764 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1765 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1767 dst[2*srcWidth-1]= src[srcWidth-1];
1771 for (y=1; y<srcHeight; y++) {
1772 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1773 const x86_reg mmxSize= srcWidth&~15;
/* Prologue: replicate the first byte into the "left neighbour" slot so
 * the x-1 access at the row start reads a valid value. */
1775 "mov %4, %%"REG_a" \n\t"
1776 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1777 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1778 "movq %%mm4, %%mm2 \n\t"
1779 "psllq $8, %%mm4 \n\t"
1780 "pand %%mm0, %%mm2 \n\t"
1781 "por %%mm2, %%mm4 \n\t"
1782 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1783 "movq %%mm5, %%mm3 \n\t"
1784 "psllq $8, %%mm5 \n\t"
1785 "pand %%mm0, %%mm3 \n\t"
1786 "por %%mm3, %%mm5 \n\t"
/* Main loop: mm0/mm1 = current rows, mm2/mm3 = right-shifted rows;
 * two chained PAVGBs give the (3*a + b + 2) >> 2 style weighting. */
1788 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1789 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1790 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1791 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1792 PAVGB" %%mm0, %%mm5 \n\t"
1793 PAVGB" %%mm0, %%mm3 \n\t"
1794 PAVGB" %%mm0, %%mm5 \n\t"
1795 PAVGB" %%mm0, %%mm3 \n\t"
1796 PAVGB" %%mm1, %%mm4 \n\t"
1797 PAVGB" %%mm1, %%mm2 \n\t"
1798 PAVGB" %%mm1, %%mm4 \n\t"
1799 PAVGB" %%mm1, %%mm2 \n\t"
1800 "movq %%mm5, %%mm7 \n\t"
1801 "movq %%mm4, %%mm6 \n\t"
1802 "punpcklbw %%mm3, %%mm5 \n\t"
1803 "punpckhbw %%mm3, %%mm7 \n\t"
1804 "punpcklbw %%mm2, %%mm4 \n\t"
1805 "punpckhbw %%mm2, %%mm6 \n\t"
1807 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1808 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1809 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1810 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1812 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1813 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1814 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1815 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1817 "add $8, %%"REG_a" \n\t"
1818 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1819 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1821 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1822 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1827 const x86_reg mmxSize=1;
/* First column (plain C): vertical interpolation only. */
1829 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1830 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* C tail / non-MMX body: diagonal 3:1 blends for the remaining columns. */
1833 for (x=mmxSize-1; x<srcWidth-1; x++) {
1834 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1835 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1836 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1837 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1839 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1840 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation only. */
1850 for (x=0; x<srcWidth-1; x++) {
1851 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1852 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1854 dst[2*srcWidth-1]= src[srcWidth-1];
1856 for (x=0; x<srcWidth; x++) {
1862 #if COMPILE_TEMPLATE_MMX
1863 __asm__ volatile(EMMS" \n\t"
1870 * Height should be a multiple of 2 and width should be a multiple of 16.
1871 * (If this is a problem for anyone then tell me, and I will fix it.)
1872 * Chrominance data is only taken from every second line, others are ignored.
1873 * FIXME: Write HQ version.
/*
 * Unpack interleaved UYVY into planar YV12.
 * Mirror image of yuy2toyv12: here chroma sits in the even bytes
 * (selected with the mm7 mask) and luma in the odd bytes (psrlw $8).
 * Two source lines per iteration; odd-line chroma is discarded.
 */
1875 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1876 long width, long height,
1877 long lumStride, long chromStride, long srcStride)
1880 const x86_reg chromWidth= width>>1;
1881 for (y=0; y<height; y+=2) {
1882 #if COMPILE_TEMPLATE_MMX
1884 "xor %%"REG_a", %%"REG_a" \n\t"
1885 "pcmpeqw %%mm7, %%mm7 \n\t"
1886 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1889 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1890 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1891 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1892 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1893 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1894 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1895 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1896 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1897 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1898 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1899 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1901 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1903 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1904 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1905 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1906 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1907 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1908 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1909 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1910 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1911 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1912 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1914 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Separate the interleaved UVUV pairs into the U and V planes. */
1916 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1917 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1918 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1919 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1920 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1921 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1922 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1923 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1925 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1926 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1928 "add $8, %%"REG_a" \n\t"
1929 "cmp %4, %%"REG_a" \n\t"
1931 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1932 : "memory", "%"REG_a
/* Second asm: odd source line — Y only. NOTE(review): the "YUYV" register
 * comments below are stale copies from the YUY2 variant; the input is
 * UYVY, which is why luma is taken with psrlw $8 (odd bytes) here. */
1939 "xor %%"REG_a", %%"REG_a" \n\t"
1942 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1943 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1944 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1945 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1946 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1947 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1948 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1949 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1950 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1951 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1952 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1954 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1955 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1957 "add $8, %%"REG_a" \n\t"
1958 "cmp %4, %%"REG_a" \n\t"
1961 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1962 : "memory", "%"REG_a
/* C fallback: even line — U Y V Y byte order. */
1966 for (i=0; i<chromWidth; i++) {
1967 udst[i] = src[4*i+0];
1968 ydst[2*i+0] = src[4*i+1];
1969 vdst[i] = src[4*i+2];
1970 ydst[2*i+1] = src[4*i+3];
/* C fallback: odd line — luma only. */
1975 for (i=0; i<chromWidth; i++) {
1976 ydst[2*i+0] = src[4*i+1];
1977 ydst[2*i+1] = src[4*i+3];
1980 udst += chromStride;
1981 vdst += chromStride;
1985 #if COMPILE_TEMPLATE_MMX
1986 __asm__ volatile(EMMS" \n\t"
1993 * Height should be a multiple of 2 and width should be a multiple of 2.
1994 * (If this is a problem for anyone then tell me, and I will fix it.)
1995 * Chrominance data is only taken from every second line,
1996 * others are ignored in the C version.
1997 * FIXME: Write HQ version.
/*
 * Convert packed 24-bit RGB to planar YV12.
 * NOTE(review): despite the name, the C fallback reads the bytes as
 * B,G,R (src[6*i+0] is b), i.e. BGR24 memory order — the MMX path uses
 * the matching ff_bgr2* coefficient tables.
 * MMX path: luma is computed per line (two lines per outer iteration);
 * chroma is computed from 2x2 pixel blocks averaged with PAVGB.
 * Defining FAST_BGR2YV12 skips the intermediate psrad $8 rounding steps,
 * trading precision for speed.
 */
1999 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2000 long width, long height,
2001 long lumStride, long chromStride, long srcStride)
2004 const x86_reg chromWidth= width>>1;
2005 #if COMPILE_TEMPLATE_MMX
2006 for (y=0; y<height-2; y+=2) {
/* Luma pass: run twice (once per source line). Each loop iteration
 * converts 8 pixels (24 source bytes) into 8 Y bytes. */
2008 for (i=0; i<2; i++) {
2010 "mov %2, %%"REG_a" \n\t"
2011 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2012 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2013 "pxor %%mm7, %%mm7 \n\t"
2014 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2017 PREFETCH" 64(%0, %%"REG_d") \n\t"
2018 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2019 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2020 "punpcklbw %%mm7, %%mm0 \n\t"
2021 "punpcklbw %%mm7, %%mm1 \n\t"
2022 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2023 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2024 "punpcklbw %%mm7, %%mm2 \n\t"
2025 "punpcklbw %%mm7, %%mm3 \n\t"
2026 "pmaddwd %%mm6, %%mm0 \n\t"
2027 "pmaddwd %%mm6, %%mm1 \n\t"
2028 "pmaddwd %%mm6, %%mm2 \n\t"
2029 "pmaddwd %%mm6, %%mm3 \n\t"
2030 #ifndef FAST_BGR2YV12
2031 "psrad $8, %%mm0 \n\t"
2032 "psrad $8, %%mm1 \n\t"
2033 "psrad $8, %%mm2 \n\t"
2034 "psrad $8, %%mm3 \n\t"
2036 "packssdw %%mm1, %%mm0 \n\t"
2037 "packssdw %%mm3, %%mm2 \n\t"
2038 "pmaddwd %%mm5, %%mm0 \n\t"
2039 "pmaddwd %%mm5, %%mm2 \n\t"
2040 "packssdw %%mm2, %%mm0 \n\t"
2041 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels of this 8-pixel batch. */
2043 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2044 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2045 "punpcklbw %%mm7, %%mm4 \n\t"
2046 "punpcklbw %%mm7, %%mm1 \n\t"
2047 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2048 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2049 "punpcklbw %%mm7, %%mm2 \n\t"
2050 "punpcklbw %%mm7, %%mm3 \n\t"
2051 "pmaddwd %%mm6, %%mm4 \n\t"
2052 "pmaddwd %%mm6, %%mm1 \n\t"
2053 "pmaddwd %%mm6, %%mm2 \n\t"
2054 "pmaddwd %%mm6, %%mm3 \n\t"
2055 #ifndef FAST_BGR2YV12
2056 "psrad $8, %%mm4 \n\t"
2057 "psrad $8, %%mm1 \n\t"
2058 "psrad $8, %%mm2 \n\t"
2059 "psrad $8, %%mm3 \n\t"
2061 "packssdw %%mm1, %%mm4 \n\t"
2062 "packssdw %%mm3, %%mm2 \n\t"
2063 "pmaddwd %%mm5, %%mm4 \n\t"
2064 "pmaddwd %%mm5, %%mm2 \n\t"
2065 "add $24, %%"REG_d" \n\t"
2066 "packssdw %%mm2, %%mm4 \n\t"
2067 "psraw $7, %%mm4 \n\t"
2069 "packuswb %%mm4, %%mm0 \n\t"
2070 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2072 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2073 "add $8, %%"REG_a" \n\t"
/* Negative-index trick: pointers are pre-advanced to the row end and
 * REG_a counts up from -width to 0. */
2075 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2076 : "%"REG_a, "%"REG_d
/* Chroma pass: %0/%1 are the two source rows; PAVGB plus the shifted
 * horizontal add averages each 2x2 block before the U/V dot products. */
2083 "mov %4, %%"REG_a" \n\t"
2084 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2085 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2086 "pxor %%mm7, %%mm7 \n\t"
2087 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2088 "add %%"REG_d", %%"REG_d" \n\t"
2091 PREFETCH" 64(%0, %%"REG_d") \n\t"
2092 PREFETCH" 64(%1, %%"REG_d") \n\t"
2093 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2094 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2095 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2096 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2097 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2098 PAVGB" %%mm1, %%mm0 \n\t"
2099 PAVGB" %%mm3, %%mm2 \n\t"
2100 "movq %%mm0, %%mm1 \n\t"
2101 "movq %%mm2, %%mm3 \n\t"
2102 "psrlq $24, %%mm0 \n\t"
2103 "psrlq $24, %%mm2 \n\t"
2104 PAVGB" %%mm1, %%mm0 \n\t"
2105 PAVGB" %%mm3, %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm0 \n\t"
2107 "punpcklbw %%mm7, %%mm2 \n\t"
2109 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2110 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2111 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2112 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2113 "punpcklbw %%mm7, %%mm0 \n\t"
2114 "punpcklbw %%mm7, %%mm1 \n\t"
2115 "punpcklbw %%mm7, %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm3 \n\t"
2117 "paddw %%mm1, %%mm0 \n\t"
2118 "paddw %%mm3, %%mm2 \n\t"
2119 "paddw %%mm2, %%mm0 \n\t"
2120 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2121 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2122 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2123 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm4 \n\t"
2125 "punpcklbw %%mm7, %%mm1 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "punpcklbw %%mm7, %%mm3 \n\t"
2128 "paddw %%mm1, %%mm4 \n\t"
2129 "paddw %%mm3, %%mm2 \n\t"
2130 "paddw %%mm4, %%mm2 \n\t"
2131 "psrlw $2, %%mm0 \n\t"
2132 "psrlw $2, %%mm2 \n\t"
2134 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2135 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2137 "pmaddwd %%mm0, %%mm1 \n\t"
2138 "pmaddwd %%mm2, %%mm3 \n\t"
2139 "pmaddwd %%mm6, %%mm0 \n\t"
2140 "pmaddwd %%mm6, %%mm2 \n\t"
2141 #ifndef FAST_BGR2YV12
2142 "psrad $8, %%mm0 \n\t"
2143 "psrad $8, %%mm1 \n\t"
2144 "psrad $8, %%mm2 \n\t"
2145 "psrad $8, %%mm3 \n\t"
2147 "packssdw %%mm2, %%mm0 \n\t"
2148 "packssdw %%mm3, %%mm1 \n\t"
2149 "pmaddwd %%mm5, %%mm0 \n\t"
2150 "pmaddwd %%mm5, %%mm1 \n\t"
2151 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2152 "psraw $7, %%mm0 \n\t"
2154 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2155 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2156 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2157 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2158 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2159 PAVGB" %%mm1, %%mm4 \n\t"
2160 PAVGB" %%mm3, %%mm2 \n\t"
2161 "movq %%mm4, %%mm1 \n\t"
2162 "movq %%mm2, %%mm3 \n\t"
2163 "psrlq $24, %%mm4 \n\t"
2164 "psrlq $24, %%mm2 \n\t"
2165 PAVGB" %%mm1, %%mm4 \n\t"
2166 PAVGB" %%mm3, %%mm2 \n\t"
2167 "punpcklbw %%mm7, %%mm4 \n\t"
2168 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2171 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2172 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2173 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2174 "punpcklbw %%mm7, %%mm4 \n\t"
2175 "punpcklbw %%mm7, %%mm1 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 "punpcklbw %%mm7, %%mm3 \n\t"
2178 "paddw %%mm1, %%mm4 \n\t"
2179 "paddw %%mm3, %%mm2 \n\t"
2180 "paddw %%mm2, %%mm4 \n\t"
2181 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2182 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2183 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2184 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2185 "punpcklbw %%mm7, %%mm5 \n\t"
2186 "punpcklbw %%mm7, %%mm1 \n\t"
2187 "punpcklbw %%mm7, %%mm2 \n\t"
2188 "punpcklbw %%mm7, %%mm3 \n\t"
2189 "paddw %%mm1, %%mm5 \n\t"
2190 "paddw %%mm3, %%mm2 \n\t"
2191 "paddw %%mm5, %%mm2 \n\t"
2192 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2193 "psrlw $2, %%mm4 \n\t"
2194 "psrlw $2, %%mm2 \n\t"
2196 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2197 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2199 "pmaddwd %%mm4, %%mm1 \n\t"
2200 "pmaddwd %%mm2, %%mm3 \n\t"
2201 "pmaddwd %%mm6, %%mm4 \n\t"
2202 "pmaddwd %%mm6, %%mm2 \n\t"
2203 #ifndef FAST_BGR2YV12
2204 "psrad $8, %%mm4 \n\t"
2205 "psrad $8, %%mm1 \n\t"
2206 "psrad $8, %%mm2 \n\t"
2207 "psrad $8, %%mm3 \n\t"
2209 "packssdw %%mm2, %%mm4 \n\t"
2210 "packssdw %%mm3, %%mm1 \n\t"
2211 "pmaddwd %%mm5, %%mm4 \n\t"
2212 "pmaddwd %%mm5, %%mm1 \n\t"
2213 "add $24, %%"REG_d" \n\t"
2214 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2215 "psraw $7, %%mm4 \n\t"
/* Merge the two U/V quads, bias, and store 4 U and 4 V bytes. */
2217 "movq %%mm0, %%mm1 \n\t"
2218 "punpckldq %%mm4, %%mm0 \n\t"
2219 "punpckhdq %%mm4, %%mm1 \n\t"
2220 "packsswb %%mm1, %%mm0 \n\t"
2221 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2222 "movd %%mm0, (%2, %%"REG_a") \n\t"
2223 "punpckhdq %%mm0, %%mm0 \n\t"
2224 "movd %%mm0, (%3, %%"REG_a") \n\t"
2225 "add $4, %%"REG_a" \n\t"
2227 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2228 : "%"REG_a, "%"REG_d
2231 udst += chromStride;
2232 vdst += chromStride;
2236 __asm__ volatile(EMMS" \n\t"
/* C fallback (also handles the last rows left over by the MMX loop).
 * Chroma is taken from the even line only here — no 2x2 averaging. */
2242 for (; y<height; y+=2) {
2244 for (i=0; i<chromWidth; i++) {
2245 unsigned int b = src[6*i+0];
2246 unsigned int g = src[6*i+1];
2247 unsigned int r = src[6*i+2];
2249 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2250 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2251 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2261 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only. */
2270 for (i=0; i<chromWidth; i++) {
2271 unsigned int b = src[6*i+0];
2272 unsigned int g = src[6*i+1];
2273 unsigned int r = src[6*i+2];
2275 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2283 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2286 udst += chromStride;
2287 vdst += chromStride;
/*
 * Byte-interleave two planes row by row:
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w].
 * SSE2 path handles 16 source bytes per iteration, MMX path likewise;
 * the scalar tail covers width % 16.
 * NOTE(review): the SSE2 path uses movdqa, which requires 16-byte
 * aligned operands — callers presumably guarantee src/dst alignment;
 * confirm at the call sites.
 */
2293 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2294 long width, long height, long src1Stride,
2295 long src2Stride, long dstStride)
2299 for (h=0; h < height; h++) {
2302 #if COMPILE_TEMPLATE_MMX
2303 #if COMPILE_TEMPLATE_SSE2
2305 "xor %%"REG_a", %%"REG_a" \n\t"
2307 PREFETCH" 64(%1, %%"REG_a") \n\t"
2308 PREFETCH" 64(%2, %%"REG_a") \n\t"
/* The same 16 bytes are loaded into xmm0 and xmm1; one copy supplies
 * the low-half interleave, the other the high half. */
2309 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2310 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2311 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2312 "punpcklbw %%xmm2, %%xmm0 \n\t"
2313 "punpckhbw %%xmm2, %%xmm1 \n\t"
2314 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2315 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2316 "add $16, %%"REG_a" \n\t"
2317 "cmp %3, %%"REG_a" \n\t"
2319 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2320 : "memory", "%"REG_a""
2324 "xor %%"REG_a", %%"REG_a" \n\t"
2326 PREFETCH" 64(%1, %%"REG_a") \n\t"
2327 PREFETCH" 64(%2, %%"REG_a") \n\t"
2328 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2329 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2330 "movq %%mm0, %%mm1 \n\t"
2331 "movq %%mm2, %%mm3 \n\t"
2332 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2333 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2334 "punpcklbw %%mm4, %%mm0 \n\t"
2335 "punpckhbw %%mm4, %%mm1 \n\t"
2336 "punpcklbw %%mm5, %%mm2 \n\t"
2337 "punpckhbw %%mm5, %%mm3 \n\t"
2338 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2339 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2340 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2341 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2342 "add $16, %%"REG_a" \n\t"
2343 "cmp %3, %%"REG_a" \n\t"
2345 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2346 : "memory", "%"REG_a
/* Scalar tail for the last width % 16 bytes of the MMX/SSE2 row. */
2349 for (w= (width&(~15)); w < width; w++) {
2350 dest[2*w+0] = src1[w];
2351 dest[2*w+1] = src2[w];
/* Pure C row for builds without MMX. */
2354 for (w=0; w < width; w++) {
2355 dest[2*w+0] = src1[w];
2356 dest[2*w+1] = src2[w];
2363 #if COMPILE_TEMPLATE_MMX
/*
 * Upsample two quarter-size chroma planes by pixel duplication:
 * each source byte is written twice horizontally (punpcklbw with
 * itself / d[2*x] = d[2*x+1] = s[x]) and each source line feeds two
 * destination lines (srcStride * (y>>1)).
 */
2372 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2373 uint8_t *dst1, uint8_t *dst2,
2374 long width, long height,
2375 long srcStride1, long srcStride2,
2376 long dstStride1, long dstStride2)
2380 w=width/2; h=height/2;
2381 #if COMPILE_TEMPLATE_MMX
2385 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane. */
2388 const uint8_t* s1=src1+srcStride1*(y>>1);
2389 uint8_t* d=dst1+dstStride1*y;
2391 #if COMPILE_TEMPLATE_MMX
/* 32 source bytes -> 64 destination bytes per iteration. */
2392 for (;x<w-31;x+=32) {
2394 PREFETCH" 32%1 \n\t"
2395 "movq %1, %%mm0 \n\t"
2396 "movq 8%1, %%mm2 \n\t"
2397 "movq 16%1, %%mm4 \n\t"
2398 "movq 24%1, %%mm6 \n\t"
2399 "movq %%mm0, %%mm1 \n\t"
2400 "movq %%mm2, %%mm3 \n\t"
2401 "movq %%mm4, %%mm5 \n\t"
2402 "movq %%mm6, %%mm7 \n\t"
2403 "punpcklbw %%mm0, %%mm0 \n\t"
2404 "punpckhbw %%mm1, %%mm1 \n\t"
2405 "punpcklbw %%mm2, %%mm2 \n\t"
2406 "punpckhbw %%mm3, %%mm3 \n\t"
2407 "punpcklbw %%mm4, %%mm4 \n\t"
2408 "punpckhbw %%mm5, %%mm5 \n\t"
2409 "punpcklbw %%mm6, %%mm6 \n\t"
2410 "punpckhbw %%mm7, %%mm7 \n\t"
2411 MOVNTQ" %%mm0, %0 \n\t"
2412 MOVNTQ" %%mm1, 8%0 \n\t"
2413 MOVNTQ" %%mm2, 16%0 \n\t"
2414 MOVNTQ" %%mm3, 24%0 \n\t"
2415 MOVNTQ" %%mm4, 32%0 \n\t"
2416 MOVNTQ" %%mm5, 40%0 \n\t"
2417 MOVNTQ" %%mm6, 48%0 \n\t"
2418 MOVNTQ" %%mm7, 56%0"
/* Scalar tail. */
2424 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane — identical processing. */
2427 const uint8_t* s2=src2+srcStride2*(y>>1);
2428 uint8_t* d=dst2+dstStride2*y;
2430 #if COMPILE_TEMPLATE_MMX
2431 for (;x<w-31;x+=32) {
2433 PREFETCH" 32%1 \n\t"
2434 "movq %1, %%mm0 \n\t"
2435 "movq 8%1, %%mm2 \n\t"
2436 "movq 16%1, %%mm4 \n\t"
2437 "movq 24%1, %%mm6 \n\t"
2438 "movq %%mm0, %%mm1 \n\t"
2439 "movq %%mm2, %%mm3 \n\t"
2440 "movq %%mm4, %%mm5 \n\t"
2441 "movq %%mm6, %%mm7 \n\t"
2442 "punpcklbw %%mm0, %%mm0 \n\t"
2443 "punpckhbw %%mm1, %%mm1 \n\t"
2444 "punpcklbw %%mm2, %%mm2 \n\t"
2445 "punpckhbw %%mm3, %%mm3 \n\t"
2446 "punpcklbw %%mm4, %%mm4 \n\t"
2447 "punpckhbw %%mm5, %%mm5 \n\t"
2448 "punpcklbw %%mm6, %%mm6 \n\t"
2449 "punpckhbw %%mm7, %%mm7 \n\t"
2450 MOVNTQ" %%mm0, %0 \n\t"
2451 MOVNTQ" %%mm1, 8%0 \n\t"
2452 MOVNTQ" %%mm2, 16%0 \n\t"
2453 MOVNTQ" %%mm3, 24%0 \n\t"
2454 MOVNTQ" %%mm4, 32%0 \n\t"
2455 MOVNTQ" %%mm5, 40%0 \n\t"
2456 MOVNTQ" %%mm6, 48%0 \n\t"
2457 MOVNTQ" %%mm7, 56%0"
2463 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2465 #if COMPILE_TEMPLATE_MMX
/*
 * Pack planar YUV with 4x vertically subsampled chroma (chroma line
 * selected with y>>2) into interleaved YUY2. Each chroma sample is
 * duplicated so it pairs with two consecutive luma samples.
 * Processes 32 luma + 8 U + 8 V bytes into 64 output bytes per
 * asm iteration.
 */
2474 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2476 long width, long height,
2477 long srcStride1, long srcStride2,
2478 long srcStride3, long dstStride)
2482 w=width/2; h=height;
2484 const uint8_t* yp=src1+srcStride1*y;
2485 const uint8_t* up=src2+srcStride2*(y>>2);
2486 const uint8_t* vp=src3+srcStride3*(y>>2);
2487 uint8_t* d=dst+dstStride*y;
2489 #if COMPILE_TEMPLATE_MMX
2492 PREFETCH" 32(%1, %0) \n\t"
2493 PREFETCH" 32(%2, %0) \n\t"
2494 PREFETCH" 32(%3, %0) \n\t"
2495 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2496 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2497 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2498 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2499 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2500 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
/* Duplicate each chroma byte so one sample serves two luma pixels. */
2501 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2502 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2503 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2504 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2506 "movq %%mm1, %%mm6 \n\t"
2507 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2508 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2509 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2510 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2511 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2513 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2514 "movq 8(%1, %0, 4), %%mm0 \n\t"
2515 "movq %%mm0, %%mm3 \n\t"
2516 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2517 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2518 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2519 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2521 "movq %%mm4, %%mm6 \n\t"
2522 "movq 16(%1, %0, 4), %%mm0 \n\t"
2523 "movq %%mm0, %%mm3 \n\t"
2524 "punpcklbw %%mm5, %%mm4 \n\t"
2525 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2526 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2527 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2528 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2530 "punpckhbw %%mm5, %%mm6 \n\t"
2531 "movq 24(%1, %0, 4), %%mm0 \n\t"
2532 "movq %%mm0, %%mm3 \n\t"
2533 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2534 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2535 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2536 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2539 : "r"(yp), "r" (up), "r"(vp), "r"(d)
/* C tail: 4 luma pixels + 1 U + 1 V sample -> 8 output bytes. */
2544 const long x2 = x<<2;
2547 d[8*x+2] = yp[x2+1];
2549 d[8*x+4] = yp[x2+2];
2551 d[8*x+6] = yp[x2+3];
2555 #if COMPILE_TEMPLATE_MMX
/*
 * Copy every second byte of src into dst: dst[i] = src[2*i].
 * The MMX path processes 16 output bytes per iteration using the
 * negative-index idiom (a counter running up to 0 against pointers
 * pre-advanced past the end); the unaligned -15/-7 store offsets
 * compensate for that bias.
 */
2564 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2570 #if COMPILE_TEMPLATE_MMX
2574 "pcmpeqw %%mm7, %%mm7 \n\t"
2575 "psrlw $8, %%mm7 \n\t"
2577 "movq -30(%1, %0, 2), %%mm0 \n\t"
2578 "movq -22(%1, %0, 2), %%mm1 \n\t"
2579 "movq -14(%1, %0, 2), %%mm2 \n\t"
2580 "movq -6(%1, %0, 2), %%mm3 \n\t"
2581 "pand %%mm7, %%mm0 \n\t"
2582 "pand %%mm7, %%mm1 \n\t"
2583 "pand %%mm7, %%mm2 \n\t"
2584 "pand %%mm7, %%mm3 \n\t"
2585 "packuswb %%mm1, %%mm0 \n\t"
2586 "packuswb %%mm3, %%mm2 \n\t"
2587 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2588 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2592 : "r"(src), "r"(dst)
/* Scalar tail / fallback. */
2598 dst[count]= src[2*count];
/*
 * From 4-byte groups of src, split the two even bytes into two planes:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2].
 * MMX path: mask out the even bytes, pack, then separate the resulting
 * byte pairs with a second mask/shift/pack round (8 outputs per plane
 * per iteration, negative-index idiom).
 */
2603 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2609 #if COMPILE_TEMPLATE_MMX
2613 "pcmpeqw %%mm7, %%mm7 \n\t"
2614 "psrlw $8, %%mm7 \n\t"
2616 "movq -28(%1, %0, 4), %%mm0 \n\t"
2617 "movq -20(%1, %0, 4), %%mm1 \n\t"
2618 "movq -12(%1, %0, 4), %%mm2 \n\t"
2619 "movq -4(%1, %0, 4), %%mm3 \n\t"
2620 "pand %%mm7, %%mm0 \n\t"
2621 "pand %%mm7, %%mm1 \n\t"
2622 "pand %%mm7, %%mm2 \n\t"
2623 "pand %%mm7, %%mm3 \n\t"
2624 "packuswb %%mm1, %%mm0 \n\t"
2625 "packuswb %%mm3, %%mm2 \n\t"
2626 "movq %%mm0, %%mm1 \n\t"
2627 "movq %%mm2, %%mm3 \n\t"
2628 "psrlw $8, %%mm0 \n\t"
2629 "psrlw $8, %%mm2 \n\t"
2630 "pand %%mm7, %%mm1 \n\t"
2631 "pand %%mm7, %%mm3 \n\t"
2632 "packuswb %%mm2, %%mm0 \n\t"
2633 "packuswb %%mm3, %%mm1 \n\t"
2634 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2635 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2639 : "r"(src), "r"(dst0), "r"(dst1)
/* Scalar tail / fallback. */
2645 dst0[count]= src[4*count+0];
2646 dst1[count]= src[4*count+2];
2651 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
/* Like extract_even2, but each extracted byte is the average of the two
 * source rows src0/src1 (vertical chroma averaging for 4:2:2 -> 4:2:0):
 * dst0[i] = avg(src0[4*i+0], src1[4*i+0]), dst1[i] = avg(..+2, ..+2).
 * NOTE(review): PAVGB (pavgusb on 3DNow, see file-top defines) rounds up,
 * (a+b+1)>>1, while the scalar fallback below truncates, (a+b)>>1 -- a
 * 1-LSB discrepancy between the SIMD and C paths.
 * NOTE(review): partial extraction -- asm open/close, loop label and
 * scalar-loop header are missing between the visible lines. */
2662 "pcmpeqw %%mm7, %%mm7 \n\t"
2663 "psrlw $8, %%mm7 \n\t"
/* Load 32 bytes of row src0 ... */
2665 "movq -28(%1, %0, 4), %%mm0 \n\t"
2666 "movq -20(%1, %0, 4), %%mm1 \n\t"
2667 "movq -12(%1, %0, 4), %%mm2 \n\t"
2668 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* ... and average byte-wise with the matching 32 bytes of row src1. */
2669 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2670 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2671 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2672 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
/* From here on identical to extract_even2: mask even bytes, pack, split
 * the (+0, +2) pairs into the two destination planes. */
2673 "pand %%mm7, %%mm0 \n\t"
2674 "pand %%mm7, %%mm1 \n\t"
2675 "pand %%mm7, %%mm2 \n\t"
2676 "pand %%mm7, %%mm3 \n\t"
2677 "packuswb %%mm1, %%mm0 \n\t"
2678 "packuswb %%mm3, %%mm2 \n\t"
2679 "movq %%mm0, %%mm1 \n\t"
2680 "movq %%mm2, %%mm3 \n\t"
2681 "psrlw $8, %%mm0 \n\t"
2682 "psrlw $8, %%mm2 \n\t"
2683 "pand %%mm7, %%mm1 \n\t"
2684 "pand %%mm7, %%mm3 \n\t"
2685 "packuswb %%mm2, %%mm0 \n\t"
2686 "packuswb %%mm3, %%mm1 \n\t"
/* %4 = dst1 gets the +2 bytes, %3 = dst0 the +0 bytes. */
2687 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2688 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2692 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
/* Scalar fallback/tail (truncating average -- see note above). */
2698 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2699 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2704 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
/* De-interleave the odd bytes of 4-byte groups into two planes (e.g. U and
 * V of a packed YUYV stream). The SIMD path selects odd bytes with the
 * initial psrlw $8 below. NOTE(review): the scalar fallback at the bottom
 * reads offsets +0/+2, which only matches the odd-byte SIMD path if the
 * original advances src by one before the scalar loop -- that line is among
 * the ones missing from this partial extraction; confirm against the
 * complete file. */
2710 #if COMPILE_TEMPLATE_MMX
/* 0x00FF word mask in %%mm7. */
2714 "pcmpeqw %%mm7, %%mm7 \n\t"
2715 "psrlw $8, %%mm7 \n\t"
/* Load 32 source bytes = 8 four-byte groups. */
2717 "movq -28(%1, %0, 4), %%mm0 \n\t"
2718 "movq -20(%1, %0, 4), %%mm1 \n\t"
2719 "movq -12(%1, %0, 4), %%mm2 \n\t"
2720 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* Shift the odd (high) byte of each word down -- selects offsets +1/+3. */
2721 "psrlw $8, %%mm0 \n\t"
2722 "psrlw $8, %%mm1 \n\t"
2723 "psrlw $8, %%mm2 \n\t"
2724 "psrlw $8, %%mm3 \n\t"
/* Pack, then split the resulting byte pairs into the two planes, exactly
 * as in extract_even2. */
2725 "packuswb %%mm1, %%mm0 \n\t"
2726 "packuswb %%mm3, %%mm2 \n\t"
2727 "movq %%mm0, %%mm1 \n\t"
2728 "movq %%mm2, %%mm3 \n\t"
2729 "psrlw $8, %%mm0 \n\t"
2730 "psrlw $8, %%mm2 \n\t"
2731 "pand %%mm7, %%mm1 \n\t"
2732 "pand %%mm7, %%mm3 \n\t"
2733 "packuswb %%mm2, %%mm0 \n\t"
2734 "packuswb %%mm3, %%mm1 \n\t"
/* %3 = dst1 gets the later byte of each pair, %2 = dst0 the earlier one. */
2735 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2736 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2740 : "r"(src), "r"(dst0), "r"(dst1)
/* Scalar fallback/tail (relative to the presumably pre-incremented src). */
2747 dst0[count]= src[4*count+0];
2748 dst1[count]= src[4*count+2];
2753 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
/* Like extract_odd2, but averaging the two source rows src0/src1 before
 * extracting (vertical chroma averaging of odd-positioned samples, e.g.
 * U/V of YUYV when converting 4:2:2 -> 4:2:0).
 * NOTE(review): PAVGB rounds up while the scalar fallback truncates -- a
 * 1-LSB SIMD/C discrepancy, same as extract_even2avg.
 * NOTE(review): partial extraction -- asm open/close, loop label, the
 * scalar-loop header and the presumed src0++/src1++ adjustment matching the
 * +0/+2 scalar offsets are missing; confirm against the complete file. */
2764 "pcmpeqw %%mm7, %%mm7 \n\t"
2765 "psrlw $8, %%mm7 \n\t"
/* Load 32 bytes of row src0 and average with the same span of row src1. */
2767 "movq -28(%1, %0, 4), %%mm0 \n\t"
2768 "movq -20(%1, %0, 4), %%mm1 \n\t"
2769 "movq -12(%1, %0, 4), %%mm2 \n\t"
2770 "movq -4(%1, %0, 4), %%mm3 \n\t"
2771 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2772 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2773 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2774 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
/* Select the odd (high) byte of each word ... */
2775 "psrlw $8, %%mm0 \n\t"
2776 "psrlw $8, %%mm1 \n\t"
2777 "psrlw $8, %%mm2 \n\t"
2778 "psrlw $8, %%mm3 \n\t"
/* ... then pack and split the byte pairs into the two planes, as in the
 * other extract_* helpers above. */
2779 "packuswb %%mm1, %%mm0 \n\t"
2780 "packuswb %%mm3, %%mm2 \n\t"
2781 "movq %%mm0, %%mm1 \n\t"
2782 "movq %%mm2, %%mm3 \n\t"
2783 "psrlw $8, %%mm0 \n\t"
2784 "psrlw $8, %%mm2 \n\t"
2785 "pand %%mm7, %%mm1 \n\t"
2786 "pand %%mm7, %%mm3 \n\t"
2787 "packuswb %%mm2, %%mm0 \n\t"
2788 "packuswb %%mm3, %%mm1 \n\t"
/* %4 = dst1, %3 = dst0. */
2789 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2790 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2794 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
/* Scalar fallback/tail (truncating average -- see note above). */
2802 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2803 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2808 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2809 long width, long height,
2810 long lumStride, long chromStride, long srcStride)
/* Convert packed YUYV 4:2:2 to planar YUV 4:2:0: Y from the even bytes of
 * each row, U/V from the odd bytes averaged over two adjacent rows.
 * chromWidth = ceil(width/2) via the -((-w)>>1) idiom. */
2813 const long chromWidth= -((-width)>>1);
2815 for (y=0; y<height; y++) {
2816 RENAME(extract_even)(src, ydst, width);
/* NOTE(review): partial extraction -- the missing line(s) before this call
 * presumably gate the chroma work to every other row (4:2:0 vertical
 * subsampling) and the missing lines after it advance the ydst/udst/vdst/
 * src pointers by their strides; confirm against the complete file. */
2818 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
/* MMX epilogue region (EMMS/SFENCE lines not visible in this extraction). */
2826 #if COMPILE_TEMPLATE_MMX
2835 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2836 long width, long height,
2837 long lumStride, long chromStride, long srcStride)
/* Convert packed YUYV 4:2:2 to planar 4:2:2: Y from the even bytes of each
 * row, U/V from the odd bytes of every row (no vertical averaging).
 * chromWidth = ceil(width/2). */
2840 const long chromWidth= -((-width)>>1);
2842 for (y=0; y<height; y++) {
2843 RENAME(extract_even)(src, ydst, width);
2844 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
/* NOTE(review): partial extraction -- the per-row pointer advances and the
 * closing brace are missing between these lines; confirm against the
 * complete file. */
/* MMX epilogue region (EMMS/SFENCE lines not visible in this extraction). */
2851 #if COMPILE_TEMPLATE_MMX
2860 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2861 long width, long height,
2862 long lumStride, long chromStride, long srcStride)
/* Convert packed UYVY 4:2:2 to planar YUV 4:2:0. In UYVY the Y samples sit
 * at odd byte positions (hence src+1 with extract_even) and U/V at even
 * positions (hence extract_even2avg, averaging two rows for the vertical
 * chroma subsampling). chromWidth = ceil(width/2). */
2865 const long chromWidth= -((-width)>>1);
2867 for (y=0; y<height; y++) {
2868 RENAME(extract_even)(src+1, ydst, width);
/* NOTE(review): partial extraction -- missing line(s) before this call
 * presumably gate chroma to every other row, and missing lines after it
 * advance the row pointers; confirm against the complete file. */
2870 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
/* MMX epilogue region (EMMS/SFENCE lines not visible in this extraction). */
2878 #if COMPILE_TEMPLATE_MMX
2887 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2888 long width, long height,
2889 long lumStride, long chromStride, long srcStride)
/* Convert packed UYVY 4:2:2 to planar 4:2:2: Y from the odd byte positions
 * (src+1 with extract_even), U/V from the even positions of every row (no
 * vertical averaging). chromWidth = ceil(width/2). */
2892 const long chromWidth= -((-width)>>1);
2894 for (y=0; y<height; y++) {
2895 RENAME(extract_even)(src+1, ydst, width);
2896 RENAME(extract_even2)(src, udst, vdst, chromWidth);
/* NOTE(review): partial extraction -- the per-row pointer advances and the
 * closing brace are missing between these lines; confirm against the
 * complete file. */
/* MMX epilogue region (EMMS/SFENCE lines not visible in this extraction). */
2903 #if COMPILE_TEMPLATE_MMX
2912 static inline void RENAME(rgb2rgb_init)(void)
2914 rgb15to16 = RENAME(rgb15to16);
2915 rgb15tobgr24 = RENAME(rgb15tobgr24);
2916 rgb15to32 = RENAME(rgb15to32);
2917 rgb16tobgr24 = RENAME(rgb16tobgr24);
2918 rgb16to32 = RENAME(rgb16to32);
2919 rgb16to15 = RENAME(rgb16to15);
2920 rgb24tobgr16 = RENAME(rgb24tobgr16);
2921 rgb24tobgr15 = RENAME(rgb24tobgr15);
2922 rgb24tobgr32 = RENAME(rgb24tobgr32);
2923 rgb32to16 = RENAME(rgb32to16);
2924 rgb32to15 = RENAME(rgb32to15);
2925 rgb32tobgr24 = RENAME(rgb32tobgr24);
2926 rgb24to15 = RENAME(rgb24to15);
2927 rgb24to16 = RENAME(rgb24to16);
2928 rgb24tobgr24 = RENAME(rgb24tobgr24);
2929 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2930 rgb32tobgr16 = RENAME(rgb32tobgr16);
2931 rgb32tobgr15 = RENAME(rgb32tobgr15);
2932 yv12toyuy2 = RENAME(yv12toyuy2);
2933 yv12touyvy = RENAME(yv12touyvy);
2934 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2935 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2936 yuy2toyv12 = RENAME(yuy2toyv12);
2937 planar2x = RENAME(planar2x);
2938 rgb24toyv12 = RENAME(rgb24toyv12);
2939 interleaveBytes = RENAME(interleaveBytes);
2940 vu9_to_vu12 = RENAME(vu9_to_vu12);
2941 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2943 uyvytoyuv420 = RENAME(uyvytoyuv420);
2944 uyvytoyuv422 = RENAME(uyvytoyuv422);
2945 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2946 yuyvtoyuv422 = RENAME(yuyvtoyuv422);