2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of Libav.
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Per-template instruction selection: choose the prefetch/stream
 * mnemonics for the 3DNow!, MMX2 or plain-MMX build of this template.
 */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
/* plain MMX: no prefetch instruction available, expand to an asm comment */
#define PREFETCH " # nop"
#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
/* without MMX2 there is no non-temporal store or store fence */
#define SFENCE " # nop"
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Convert packed 24-bit pixels to 32-bit pixels (one filler byte added
 * per pixel).  movd+punpckldq gather two 3-byte pixels per MMX register
 * (byte offsets 0,3,6,...,21), %mm7 (loaded from mask32a) is OR-ed in to
 * supply the fourth byte, and the results are streamed out with MOVNTQ.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* %mm7 holds the constant filler byte pattern for every output pixel */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
        "punpckldq 3%1, %%mm0 \n\t"
        "movd 6%1, %%mm1 \n\t"
        "punpckldq 9%1, %%mm1 \n\t"
        "movd 12%1, %%mm2 \n\t"
        "punpckldq 15%1, %%mm2 \n\t"
        "movd 18%1, %%mm3 \n\t"
        "punpckldq 21%1, %%mm3 \n\t"
        /* insert the filler byte into all eight pixels */
        "por %%mm7, %%mm0 \n\t"
        "por %%mm7, %%mm1 \n\t"
        "por %%mm7, %%mm2 \n\t"
        "por %%mm7, %%mm3 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        MOVNTQ" %%mm1, 8%0 \n\t"
        MOVNTQ" %%mm2, 16%0 \n\t"
    /* flush non-temporal stores, then leave MMX state */
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
/*
 * Repack four MMX registers holding 32-bit pixels (pairs mm0/mm2,
 * mm1/mm3, mm4/mm6, mm5/mm7) into a continuous stream of 24-bit pixels:
 * the mask24l/mask24h/mask24hh/... constants select the surviving bytes
 * and the shift/por sequence splices them across quadword boundaries
 * before the MOVNTQ stores.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm1, 8%0 \n\t" \
/*
 * Convert 32-bit pixels to 24-bit pixels with R/B swapped.  Loads four
 * quadwords (eight pixels), duplicates each into a second register, and
 * then (via code elsewhere, e.g. STORE_BGR24_MMX) drops the filler byte
 * while repacking to 3 bytes per pixel.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq %1, %%mm0 \n\t"
        "movq 8%1, %%mm1 \n\t"
        "movq 16%1, %%mm4 \n\t"
        "movq 24%1, %%mm5 \n\t"
        /* keep a copy of every register: low/high byte lanes are masked separately */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
184 original by Strepto/Astral
185 ported to gcc & bugfixed: A'rpi
186 MMX2, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
/*
 * Convert RGB15 (1:5:5:5) to RGB16 (5:6:5).  Uses the and&add trick:
 * adding the red+green field (x & 0x7FE07FE0) onto the 15 significant
 * bits shifts red and green up by one position while blue (bits 0-4)
 * stays in place; the new green LSB is left as zero.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* %mm4 = mask15s, the red+green field mask used by the add below */
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
        "movq %1, %%mm0 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        /* and&add: x + (x & mask15s) shifts R and G left by one bit */
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail, two pixels at a time */
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        /* scalar tail, single trailing pixel */
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * Convert RGB16 (5:6:5) to RGB15 (1:5:5:5): red and green are shifted
 * down by one bit (dropping green's LSB), blue is kept unchanged, i.e.
 * ((x>>1) & 0x7FE0) | (x & 0x001F) per 16-bit pixel.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    /* %mm7 = mask15rg (red+green after shift), %mm6 = mask15b (blue) */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
        "movq %1, %%mm0 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $1, %%mm0 \n\t"
        "psrlq $1, %%mm2 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm3, %%mm2 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail, two pixels at a time */
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        /* scalar tail, single trailing pixel */
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * Convert 32-bit pixels to RGB16 (5:6:5).  Two MMX code paths:
 *  - the pmaddwd path multiplies the masked blue/red words by mul3216 so
 *    that one multiply-add places both fields, then merges green
 *    (mask3216g) and finishes with psrld/pslld;
 *  - the plain path extracts each field with shifts of 3/5/8 bits and
 *    the red_16mask/green_16mask/blue_16mask constants.
 * A scalar loop handles the remaining pixels.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
        /* %mm5 = mask3216g, %mm6 = mask3216br, %mm7 = mul3216 */
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        /* position blue and red with a single multiply-add per pixel */
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask for the shift-based path */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd %1, %%mm0 \n\t"
        "movd 4%1, %%mm3 \n\t"
        "punpckldq 8%1, %%mm0 \n\t"
        "punpckldq 12%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* blue: >>3, green: >>5, red: >>8, then mask and merge */
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "psrlq $8, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: pack one 32-bit pixel to 5:6:5 */
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * Convert 32-bit pixels to 16-bit 5:6:5 with the R/B channels swapped
 * relative to rgb32to16: the low input byte is shifted up to the red
 * field (<<8 after masking) and the high byte down to blue (>>19).
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd %1, %%mm0 \n\t"
        "movd 4%1, %%mm3 \n\t"
        "punpckldq 8%1, %%mm0 \n\t"
        "punpckldq 12%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* low byte -> red position, middle byte -> green, high byte -> blue */
        "psllq $8, %%mm0 \n\t"
        "psllq $8, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: same packing with R/B swapped */
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * Convert 32-bit pixels to RGB15 (1:5:5:5).  Mirrors rgb32to16 but with
 * 5-bit green: the pmaddwd path uses mul3215/mask3215g and final shifts
 * of 6/10, the plain path uses field shifts of 3/6/9 with the
 * red_15mask/green_15mask/blue_15mask constants.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
        /* %mm5 = mask3215g, %mm6 = mask3216br, %mm7 = mul3215 */
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        /* position blue and red with one multiply-add per pixel */
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask for the shift-based path */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd %1, %%mm0 \n\t"
        "movd 4%1, %%mm3 \n\t"
        "punpckldq 8%1, %%mm0 \n\t"
        "punpckldq 12%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* blue: >>3, green: >>6, red: >>9, then mask and merge */
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $9, %%mm2 \n\t"
        "psrlq $9, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: pack one 32-bit pixel to 1:5:5:5 */
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * Convert 32-bit pixels to 15-bit 1:5:5:5 with R/B swapped relative to
 * rgb32to15: low input byte goes to the red field (<<7 after masking),
 * high byte down to blue (>>19).
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd %1, %%mm0 \n\t"
        "movd 4%1, %%mm3 \n\t"
        "punpckldq 8%1, %%mm0 \n\t"
        "punpckldq 12%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* low byte -> red position, middle byte -> green, high byte -> blue */
        "psllq $7, %%mm0 \n\t"
        "psllq $7, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: same packing with R/B swapped */
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5 (with the first input
 * byte landing in the low/blue field).  movd+punpckldq gather two
 * 3-byte pixels per register (offsets 0,3,6,9), then the 3/5/8-bit
 * shift-and-mask sequence builds two 16-bit pixels per register half.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd %1, %%mm0 \n\t"
        "movd 3%1, %%mm3 \n\t"
        "punpckldq 6%1, %%mm0 \n\t"
        "punpckldq 9%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* first byte: >>3, second: >>5, third: >>8, then mask and merge */
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "psrlq $8, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);  /* scalar tail */
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5 with the opposite channel
 * order to rgb24tobgr16: the first input byte is shifted up into the red
 * field (<<8 after masking) and the third byte down to blue (>>19).
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd %1, %%mm0 \n\t"
        "movd 3%1, %%mm3 \n\t"
        "punpckldq 6%1, %%mm0 \n\t"
        "punpckldq 9%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* first byte -> red position, second -> green, third -> blue */
        "psllq $8, %%mm0 \n\t"
        "psllq $8, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);  /* scalar tail */
/*
 * Convert packed 24-bit pixels to 15-bit 1:5:5:5 (first input byte in
 * the low/blue field).  Same structure as rgb24tobgr16 but with field
 * shifts 3/6/9 and the 15-bit masks.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd %1, %%mm0 \n\t"
        "movd 3%1, %%mm3 \n\t"
        "punpckldq 6%1, %%mm0 \n\t"
        "punpckldq 9%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* first byte: >>3, second: >>6, third: >>9, then mask and merge */
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $9, %%mm2 \n\t"
        "psrlq $9, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);  /* scalar tail */
/*
 * Convert packed 24-bit pixels to 15-bit 1:5:5:5 with the opposite
 * channel order to rgb24tobgr15: first input byte shifted up into the
 * red field (<<7 after masking), third byte down to blue (>>19).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint8_t *s = src;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        /* %mm7 = red mask, %mm6 = green mask */
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd %1, %%mm0 \n\t"
        "movd 3%1, %%mm3 \n\t"
        "punpckldq 6%1, %%mm0 \n\t"
        "punpckldq 9%1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        /* first byte -> red position, second -> green, third -> blue */
        "psllq $7, %%mm0 \n\t"
        "psllq $7, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);  /* scalar tail */
805 I use less accurate approximation here by simply left-shifting the input
806 value and filling the low order bits with zeroes. This method improves PNG
807 compression but this scheme cannot reproduce white exactly, since it does
808 not generate an all-ones maximum value; the net effect is to darken the
811 The better method should be "left bit replication":
821 | leftmost bits repeated to fill open bits
/*
 * Expand RGB15 (1:5:5:5) to 24-bit pixels.  Each quadword of input is
 * split into three registers (mask15b/mask15g/mask15r), the fields are
 * shifted to 8-bit positions (<<3, >>2, >>7 — low bits left zero, see
 * the gamma note above), widened to one byte per channel via
 * punpcklwd/punpckhwd against mmx_null, and recombined; a second block
 * handles the next 4 pixels, after which the "borrowed 32 to 24"
 * shuffle repacks everything to 3 bytes per pixel.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        /* isolate blue/green/red fields of 4 pixels */
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $2, %%mm1 \n\t"
        "psrlq $7, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        /* widen each 16-bit field to 32 bits (low/high pixel pairs) */
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        /* park the first 4 expanded pixels in mm6/mm7 */
        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm3, %%mm7 \n\t"
        /* same expansion for the next quadword (pixels 4..7) */
        "movq 8%1, %%mm0 \n\t"
        "movq 8%1, %%mm1 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $2, %%mm1 \n\t"
        "psrlq $7, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
    /* borrowed 32 to 24 */
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "movq %%mm7, %%mm1 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: expand one 1:5:5:5 pixel to 3 bytes */
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;    /* blue  */
        *d++ = (bgr&0x3E0)>>2;   /* green */
        *d++ = (bgr&0x7C00)>>7;  /* red   */
/*
 * Expand RGB16 (5:6:5) to 24-bit pixels.  Identical structure to
 * rgb15tobgr24 but with the 5:6:5 masks and field shifts <<3, >>3, >>8
 * (low bits are left zero rather than replicated).
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        /* isolate blue/green/red fields of 4 pixels */
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $3, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        /* widen each 16-bit field to 32 bits (low/high pixel pairs) */
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        /* park the first 4 expanded pixels in mm6/mm7 */
        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm3, %%mm7 \n\t"
        /* same expansion for the next quadword (pixels 4..7) */
        "movq 8%1, %%mm0 \n\t"
        "movq 8%1, %%mm1 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $3, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
    /* borrowed 32 to 24 */
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "movq %%mm7, %%mm1 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: expand one 5:6:5 pixel to 3 bytes */
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;    /* blue  */
        *d++ = (bgr&0x7E0)>>3;   /* green */
        *d++ = (bgr&0xF800)>>8;  /* red   */
/*
 * Pack four pixels held as separated 16-bit channels into four 32-bit
 * pixels and stream them out.  Register contract on entry:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * Expand RGB15 (1:5:5:5) to 32-bit pixels.  The fields are isolated with
 * mask15b/mask15g/mask15r and shifted to 8-bit positions (<<3, >>2,
 * >>7); mm7 is zero and mm6 all-ones, matching the PACK_RGB32 register
 * contract that produces the 4-byte pixels.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint16_t *end;
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0 and mm6 = all ones: constants required by PACK_RGB32 */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    while (s < mm_end) {
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: expand one 1:5:5:5 pixel */
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;    /* blue  */
        *d++ = (bgr&0x3E0)>>2;   /* green */
        *d++ = (bgr&0x7C00)>>7;  /* red   */
/*
 * Expand RGB16 (5:6:5) to 32-bit pixels.  Same structure as rgb15to32
 * but with the 5:6:5 masks and field shifts <<3, >>3, >>8.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
    const uint16_t *end;
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0 and mm6 = all ones: constants required by PACK_RGB32 */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    while (s < mm_end) {
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        /* scalar tail: expand one 5:6:5 pixel */
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;    /* blue  */
        *d++ = (bgr&0x7E0)>>3;   /* green */
        *d++ = (bgr&0xF800)>>8;  /* red   */
/*
 * Swap bytes 0 and 2 of every 32-bit pixel (channel order 2,1,0,3 —
 * e.g. BGRA <-> RGBA).  idx counts up from 15-src_size toward 15 so the
 * loop bound is a constant; s/d are rebased accordingly.  The MMX2 path
 * uses pshufw ($177 = 0b10110001, swapping bytes within each 32-bit
 * half) plus masking; the plain-MMX path rebuilds the swap from
 * pslld/psrld $16 on the masked red/blue lanes.
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    PREFETCH" (%1, %0) \n\t"
    /* build the red/blue lane masks in mm7/mm6 from mask32b/mask32r/mmx_one */
    "movq %3, %%mm7 \n\t"
    "pxor %4, %%mm7 \n\t"
    "movq %%mm7, %%mm6 \n\t"
    "pxor %5, %%mm7 \n\t"
    PREFETCH" 32(%1, %0) \n\t"
    "movq (%1, %0), %%mm0 \n\t"
    "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMX2
    /* pshufw $177: swap the two outer bytes within each dword */
    "pshufw $177, %%mm0, %%mm3 \n\t"
    "pshufw $177, %%mm1, %%mm5 \n\t"
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm6, %%mm3 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm6, %%mm5 \n\t"
    "por %%mm3, %%mm0 \n\t"
    "por %%mm5, %%mm1 \n\t"
    /* plain MMX: shift the masked R/B lanes across each dword and merge */
    "movq %%mm0, %%mm2 \n\t"
    "movq %%mm1, %%mm4 \n\t"
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm6, %%mm2 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm6, %%mm4 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "movq %%mm4, %%mm5 \n\t"
    "pslld $16, %%mm2 \n\t"
    "psrld $16, %%mm3 \n\t"
    "pslld $16, %%mm4 \n\t"
    "psrld $16, %%mm5 \n\t"
    "por %%mm2, %%mm0 \n\t"
    "por %%mm4, %%mm1 \n\t"
    "por %%mm3, %%mm0 \n\t"
    "por %%mm5, %%mm1 \n\t"
    MOVNTQ" %%mm0, (%2, %0) \n\t"
    MOVNTQ" %%mm1, 8(%2, %0) \n\t"
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    /* scalar tail: g keeps bytes 1 and 3, the shifts swap bytes 0 and 2.
     * NOTE(review): this relies on v having its byte-1/byte-3 lanes
     * masked off before the shifted values are added to g (so no carry
     * can spill into the kept lanes) — confirm against the full source. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * Swap the first and third byte of every 24-bit pixel (RGB24 <-> BGR24)
 * in place order.  The asm loop processes 24 bytes (8 pixels) per
 * iteration: three overlapping quadword loads per 8-byte output, each
 * masked with mask24r/mask24g/mask24b and OR-ed so every output byte
 * comes from the correctly swapped source lane.  REG_a counts up from
 * mmx_size-23 (negative) to 0; a scalar loop finishes the remainder.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
    x86_reg mmx_size= 23 - src_size;
    "test %%"REG_a", %%"REG_a" \n\t"
    "movq "MANGLE(mask24r)", %%mm5 \n\t"
    "movq "MANGLE(mask24g)", %%mm6 \n\t"
    "movq "MANGLE(mask24b)", %%mm7 \n\t"
    PREFETCH" 32(%1, %%"REG_a") \n\t"
    "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
    "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
    "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
    "psllq $16, %%mm0 \n\t" // 00 BGR BGR
    "pand %%mm5, %%mm0 \n\t"
    "pand %%mm6, %%mm1 \n\t"
    "pand %%mm7, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
    MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
    "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
    "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm5, %%mm1 \n\t"
    "pand %%mm6, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
    MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
    "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
    "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
    "pand %%mm6, %%mm0 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm5, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
    "add $24, %%"REG_a" \n\t"
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    if (mmx_size==23) return; //finished, was multiple of 8
    src_size= 23-mmx_size;
    /* scalar tail: copy with first and third byte exchanged */
    for (i=0; i<src_size; i+=3) {
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
/*
 * Interleave planar Y, U and V into packed YUYV (yuy2).  Per output
 * quadword group: U and V bytes are interleaved with punpcklbw/hbw into
 * UVUV patterns, then merged byte-wise with 16 luma bytes to produce
 * Y U Y V ordering, streamed out 32 bytes at a time.  The chroma planes
 * advance only every vertLumPerChroma-th line (vertical subsampling);
 * vertLumPerChroma must therefore be a power of two for the mask test.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
            "xor %%"REG_a", %%"REG_a" \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        /* advance the chroma planes only once per vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
/*
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
/* YV12 (4:2:0 planar) -> packed YUYV: chroma advances every 2nd line. */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave planar Y, U and V into packed UYVY.  Same scheme as
 * yuvPlanartoyuy2 but the punpck operands are reversed so that the
 * chroma byte comes first in each pair (U Y V Y ordering).
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
            "xor %%"REG_a", %%"REG_a" \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        /* advance the chroma planes only once per vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
1381 * Height should be a multiple of 2 and width should be a multiple of 16
1382 * (If this is a problem for anyone then tell me, and I will fix it.)
/* Convert YV12 (planar 4:2:0) to packed UYVY.
 * vertLumPerChroma = 2: each chroma line serves two luma lines (no
 * chroma interpolation, see FIXME). */
1384 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1385 int width, int height,
1386 int lumStride, int chromStride, int dstStride)
1388 //FIXME interpolate chroma
1389 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1393 * Width should be a multiple of 16.
/* Convert planar 4:2:2 YUV to packed UYVY.
 * vertLumPerChroma = 1: one chroma line per luma line. */
1395 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1396 int width, int height,
1397 int lumStride, int chromStride, int dstStride)
1399 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1403 * Width should be a multiple of 16.
/* Convert planar 4:2:2 YUV to packed YUY2.
 * vertLumPerChroma = 1: one chroma line per luma line. */
1405 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1406 int width, int height,
1407 int lumStride, int chromStride, int dstStride)
1409 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1413 * Height should be a multiple of 2 and width should be a multiple of 16.
1414 * (If this is a problem for anyone then tell me, and I will fix it.)
/* Convert packed YUY2 (Y U Y V byte order) to planar YV12 (4:2:0).
 * Lines are processed in pairs: the first asm loop splits one line into a
 * luma row plus packed UV words, then separates those into the U and V
 * planes; the second asm loop extracts luma only from the other line.
 * NOTE(review): chroma thus appears to come from one line of each pair
 * only — confirm against the full source (loop-control lines not shown). */
1416 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1417 int width, int height,
1418 int lumStride, int chromStride, int srcStride)
1421 const x86_reg chromWidth= width>>1;
1422 for (y=0; y<height; y+=2) {
1424 "xor %%"REG_a", %%"REG_a" \n\t"
/* mm7 = 0x00FF00FF... mask selecting the low (luma) byte of each word */
1425 "pcmpeqw %%mm7, %%mm7 \n\t"
1426 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1429 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1430 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1431 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1432 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1433 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1434 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1435 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1436 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1437 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1438 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1439 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1441 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1443 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1444 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1445 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1446 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1447 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1448 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1449 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1450 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1451 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1452 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1454 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* split packed UVUV words into separate U and V plane bytes */
1456 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1457 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1458 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1459 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1460 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1461 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1462 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1463 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1465 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1466 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1468 "add $8, %%"REG_a" \n\t"
1469 "cmp %4, %%"REG_a" \n\t"
1471 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1472 : "memory", "%"REG_a
/* second line of the pair: luma only, chroma bytes are discarded */
1479 "xor %%"REG_a", %%"REG_a" \n\t"
1482 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1483 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1484 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1485 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1486 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1487 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1488 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1489 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1490 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1491 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1492 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1494 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1495 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1497 "add $8, %%"REG_a" \n\t"
1498 "cmp %4, %%"REG_a" \n\t"
1501 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1502 : "memory", "%"REG_a
1504 udst += chromStride;
1505 vdst += chromStride;
/* flush the MMX state (and fence the non-temporal stores) */
1509 __asm__ volatile(EMMS" \n\t"
1513 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1515 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
/* Upscale one plane to 2x width and 2x height with bilinear-style
 * (3*a + b) >> 2 weighting (see the C tail loops). The first and last
 * output rows are produced in plain C; interior row pairs use PAVGB twice
 * to approximate the 3:1 blend. The asm prologue builds the "previous
 * pixel" registers mm4/mm5 by shifting in a duplicated edge byte
 * (mmx_ff mask) so column 0 has a left neighbor. */
1516 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1523 for (x=0; x<srcWidth-1; x++) {
1524 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1525 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1527 dst[2*srcWidth-1]= src[srcWidth-1];
1531 for (y=1; y<srcHeight; y++) {
/* mmxSize: widest multiple of 16 handled by the asm loop; tail in C below */
1532 const x86_reg mmxSize= srcWidth&~15;
1534 "mov %4, %%"REG_a" \n\t"
1535 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1536 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1537 "movq %%mm4, %%mm2 \n\t"
1538 "psllq $8, %%mm4 \n\t"
1539 "pand %%mm0, %%mm2 \n\t"
1540 "por %%mm2, %%mm4 \n\t"
1541 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1542 "movq %%mm5, %%mm3 \n\t"
1543 "psllq $8, %%mm5 \n\t"
1544 "pand %%mm0, %%mm3 \n\t"
1545 "por %%mm3, %%mm5 \n\t"
1547 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1548 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1549 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1550 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
/* PAVGB applied twice against the same register gives ~(3*a+b)/4 */
1551 PAVGB" %%mm0, %%mm5 \n\t"
1552 PAVGB" %%mm0, %%mm3 \n\t"
1553 PAVGB" %%mm0, %%mm5 \n\t"
1554 PAVGB" %%mm0, %%mm3 \n\t"
1555 PAVGB" %%mm1, %%mm4 \n\t"
1556 PAVGB" %%mm1, %%mm2 \n\t"
1557 PAVGB" %%mm1, %%mm4 \n\t"
1558 PAVGB" %%mm1, %%mm2 \n\t"
1559 "movq %%mm5, %%mm7 \n\t"
1560 "movq %%mm4, %%mm6 \n\t"
1561 "punpcklbw %%mm3, %%mm5 \n\t"
1562 "punpckhbw %%mm3, %%mm7 \n\t"
1563 "punpcklbw %%mm2, %%mm4 \n\t"
1564 "punpckhbw %%mm2, %%mm6 \n\t"
1565 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1566 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1567 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1568 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1569 "add $8, %%"REG_a" \n\t"
1570 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1571 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* operands are biased by mmxSize so the loop counter runs up from -mmxSize */
1573 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1574 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* scalar tail: exact (3*a+b)>>2 blend for the pixels the asm did not cover */
1579 for (x=mmxSize-1; x<srcWidth-1; x++) {
1580 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1581 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1582 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1583 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1585 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1586 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* last output row: horizontal-only interpolation, as for the first row */
1595 for (x=0; x<srcWidth-1; x++) {
1596 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1597 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1599 dst[2*srcWidth-1]= src[srcWidth-1];
1601 __asm__ volatile(EMMS" \n\t"
1605 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
1607 #if !COMPILE_TEMPLATE_AMD3DNOW
1609 * Height should be a multiple of 2 and width should be a multiple of 16.
1610 * (If this is a problem for anyone then tell me, and I will fix it.)
1611 * Chrominance data is only taken from every second line, others are ignored.
1612 * FIXME: Write HQ version.
/* Convert packed UYVY (U Y V Y byte order) to planar YV12 (4:2:0).
 * Mirror image of yuy2toyv12: here chroma sits in the LOW byte of each
 * word (pand mm7) and luma in the high byte (psrlw $8). First asm loop
 * emits luma + both chroma planes for one line of the pair; second loop
 * extracts luma only from the other line. */
1614 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1615 int width, int height,
1616 int lumStride, int chromStride, int srcStride)
1619 const x86_reg chromWidth= width>>1;
1620 for (y=0; y<height; y+=2) {
1622 "xor %%"REG_a", %%"REG_a" \n\t"
/* mm7 = 0x00FF00FF... mask; for UYVY the low byte of a word is chroma */
1623 "pcmpeqw %%mm7, %%mm7 \n\t"
1624 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1627 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1628 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1629 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1630 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1631 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1632 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1633 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1634 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1635 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1636 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1637 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1639 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1641 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1642 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1643 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1644 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1645 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1646 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1647 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1648 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1649 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1650 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1652 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* separate the packed UVUV words into the U and V planes */
1654 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1655 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1656 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1657 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1658 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1659 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1660 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1661 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1663 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1664 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1666 "add $8, %%"REG_a" \n\t"
1667 "cmp %4, %%"REG_a" \n\t"
1669 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1670 : "memory", "%"REG_a
/* second line of the pair: luma only */
1677 "xor %%"REG_a", %%"REG_a" \n\t"
1680 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1681 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1682 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1683 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1684 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1685 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1686 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1687 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1688 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1689 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1690 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1692 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1693 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1695 "add $8, %%"REG_a" \n\t"
1696 "cmp %4, %%"REG_a" \n\t"
1699 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1700 : "memory", "%"REG_a
1702 udst += chromStride;
1703 vdst += chromStride;
1707 __asm__ volatile(EMMS" \n\t"
1711 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1714 * Height should be a multiple of 2 and width should be a multiple of 2.
1715 * (If this is a problem for anyone then tell me, and I will fix it.)
1716 * Chrominance data is only taken from every second line,
1717 * others are ignored in the C version.
1718 * FIXME: Write HQ version.
/* Convert 24-bit RGB (BGR byte order, per the ff_bgr2* coefficient tables)
 * to planar YV12. Two passes per line pair:
 *   - luma pass (run for both lines, i = 0..1): 8 pixels per iteration,
 *     pmaddwd against ff_bgr2YCoeff, then ff_bgr2YOffset is added;
 *   - chroma pass: 2x2 pixel blocks of the two lines are averaged
 *     (PAVGB on MMX2/3DNow, explicit paddw/psrlw $2 otherwise) and dotted
 *     with ff_bgr2UCoeff / ff_bgr2VCoeff, offset by ff_bgr2UVOffset.
 * FAST_BGR2YV12 skips the psrad $8 normalization steps for speed.
 * The last rows (height-y) are finished by the C fallback at the bottom. */
1720 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1721 int width, int height,
1722 int lumStride, int chromStride, int srcStride)
1725 const x86_reg chromWidth= width>>1;
1726 for (y=0; y<height-2; y+=2) {
/* luma pass: once per source line of the pair */
1728 for (i=0; i<2; i++) {
1730 "mov %2, %%"REG_a" \n\t"
1731 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1732 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1733 "pxor %%mm7, %%mm7 \n\t"
/* REG_d = 3*REG_a: byte offset of the current pixel in 24bpp data */
1734 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1737 PREFETCH" 64(%0, %%"REG_d") \n\t"
1738 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1739 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1740 "punpcklbw %%mm7, %%mm0 \n\t"
1741 "punpcklbw %%mm7, %%mm1 \n\t"
1742 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1743 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1744 "punpcklbw %%mm7, %%mm2 \n\t"
1745 "punpcklbw %%mm7, %%mm3 \n\t"
1746 "pmaddwd %%mm6, %%mm0 \n\t"
1747 "pmaddwd %%mm6, %%mm1 \n\t"
1748 "pmaddwd %%mm6, %%mm2 \n\t"
1749 "pmaddwd %%mm6, %%mm3 \n\t"
1750 #ifndef FAST_BGR2YV12
1751 "psrad $8, %%mm0 \n\t"
1752 "psrad $8, %%mm1 \n\t"
1753 "psrad $8, %%mm2 \n\t"
1754 "psrad $8, %%mm3 \n\t"
1756 "packssdw %%mm1, %%mm0 \n\t"
1757 "packssdw %%mm3, %%mm2 \n\t"
1758 "pmaddwd %%mm5, %%mm0 \n\t"
1759 "pmaddwd %%mm5, %%mm2 \n\t"
1760 "packssdw %%mm2, %%mm0 \n\t"
1761 "psraw $7, %%mm0 \n\t"
/* second group of 4 pixels of the 8-pixel batch */
1763 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1764 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1765 "punpcklbw %%mm7, %%mm4 \n\t"
1766 "punpcklbw %%mm7, %%mm1 \n\t"
1767 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1768 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
1770 "punpcklbw %%mm7, %%mm3 \n\t"
1771 "pmaddwd %%mm6, %%mm4 \n\t"
1772 "pmaddwd %%mm6, %%mm1 \n\t"
1773 "pmaddwd %%mm6, %%mm2 \n\t"
1774 "pmaddwd %%mm6, %%mm3 \n\t"
1775 #ifndef FAST_BGR2YV12
1776 "psrad $8, %%mm4 \n\t"
1777 "psrad $8, %%mm1 \n\t"
1778 "psrad $8, %%mm2 \n\t"
1779 "psrad $8, %%mm3 \n\t"
1781 "packssdw %%mm1, %%mm4 \n\t"
1782 "packssdw %%mm3, %%mm2 \n\t"
1783 "pmaddwd %%mm5, %%mm4 \n\t"
1784 "pmaddwd %%mm5, %%mm2 \n\t"
1785 "add $24, %%"REG_d" \n\t"
1786 "packssdw %%mm2, %%mm4 \n\t"
1787 "psraw $7, %%mm4 \n\t"
1789 "packuswb %%mm4, %%mm0 \n\t"
1790 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1792 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1793 "add $8, %%"REG_a" \n\t"
/* counter runs from -width to 0; pointers are pre-biased by +width */
1795 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1796 : "%"REG_a, "%"REG_d
/* chroma pass: U and V from the average of 2x2 pixel blocks */
1803 "mov %4, %%"REG_a" \n\t"
1804 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1805 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1806 "pxor %%mm7, %%mm7 \n\t"
1807 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1808 "add %%"REG_d", %%"REG_d" \n\t"
1811 PREFETCH" 64(%0, %%"REG_d") \n\t"
1812 PREFETCH" 64(%1, %%"REG_d") \n\t"
1813 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
/* fast path: PAVGB averages the two lines, then adjacent pixels */
1814 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1815 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1816 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1817 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1818 PAVGB" %%mm1, %%mm0 \n\t"
1819 PAVGB" %%mm3, %%mm2 \n\t"
1820 "movq %%mm0, %%mm1 \n\t"
1821 "movq %%mm2, %%mm3 \n\t"
1822 "psrlq $24, %%mm0 \n\t"
1823 "psrlq $24, %%mm2 \n\t"
1824 PAVGB" %%mm1, %%mm0 \n\t"
1825 PAVGB" %%mm3, %%mm2 \n\t"
1826 "punpcklbw %%mm7, %%mm0 \n\t"
1827 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX path: widen to words, sum the 4 samples, >>2 */
1829 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1830 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1831 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1832 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1833 "punpcklbw %%mm7, %%mm0 \n\t"
1834 "punpcklbw %%mm7, %%mm1 \n\t"
1835 "punpcklbw %%mm7, %%mm2 \n\t"
1836 "punpcklbw %%mm7, %%mm3 \n\t"
1837 "paddw %%mm1, %%mm0 \n\t"
1838 "paddw %%mm3, %%mm2 \n\t"
1839 "paddw %%mm2, %%mm0 \n\t"
1840 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1841 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1842 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1843 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1844 "punpcklbw %%mm7, %%mm4 \n\t"
1845 "punpcklbw %%mm7, %%mm1 \n\t"
1846 "punpcklbw %%mm7, %%mm2 \n\t"
1847 "punpcklbw %%mm7, %%mm3 \n\t"
1848 "paddw %%mm1, %%mm4 \n\t"
1849 "paddw %%mm3, %%mm2 \n\t"
1850 "paddw %%mm4, %%mm2 \n\t"
1851 "psrlw $2, %%mm0 \n\t"
1852 "psrlw $2, %%mm2 \n\t"
/* V via ff_bgr2VCoeff (mm1/mm3), U via ff_bgr2UCoeff (mm6) */
1854 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1855 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1857 "pmaddwd %%mm0, %%mm1 \n\t"
1858 "pmaddwd %%mm2, %%mm3 \n\t"
1859 "pmaddwd %%mm6, %%mm0 \n\t"
1860 "pmaddwd %%mm6, %%mm2 \n\t"
1861 #ifndef FAST_BGR2YV12
1862 "psrad $8, %%mm0 \n\t"
1863 "psrad $8, %%mm1 \n\t"
1864 "psrad $8, %%mm2 \n\t"
1865 "psrad $8, %%mm3 \n\t"
1867 "packssdw %%mm2, %%mm0 \n\t"
1868 "packssdw %%mm3, %%mm1 \n\t"
1869 "pmaddwd %%mm5, %%mm0 \n\t"
1870 "pmaddwd %%mm5, %%mm1 \n\t"
1871 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1872 "psraw $7, %%mm0 \n\t"
/* same computation for the next two 2x2 blocks */
1874 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1875 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1876 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1877 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1878 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1879 PAVGB" %%mm1, %%mm4 \n\t"
1880 PAVGB" %%mm3, %%mm2 \n\t"
1881 "movq %%mm4, %%mm1 \n\t"
1882 "movq %%mm2, %%mm3 \n\t"
1883 "psrlq $24, %%mm4 \n\t"
1884 "psrlq $24, %%mm2 \n\t"
1885 PAVGB" %%mm1, %%mm4 \n\t"
1886 PAVGB" %%mm3, %%mm2 \n\t"
1887 "punpcklbw %%mm7, %%mm4 \n\t"
1888 "punpcklbw %%mm7, %%mm2 \n\t"
1890 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1891 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1892 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1893 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1894 "punpcklbw %%mm7, %%mm4 \n\t"
1895 "punpcklbw %%mm7, %%mm1 \n\t"
1896 "punpcklbw %%mm7, %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "paddw %%mm1, %%mm4 \n\t"
1899 "paddw %%mm3, %%mm2 \n\t"
1900 "paddw %%mm2, %%mm4 \n\t"
1901 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1902 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1903 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1904 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1905 "punpcklbw %%mm7, %%mm5 \n\t"
1906 "punpcklbw %%mm7, %%mm1 \n\t"
1907 "punpcklbw %%mm7, %%mm2 \n\t"
1908 "punpcklbw %%mm7, %%mm3 \n\t"
1909 "paddw %%mm1, %%mm5 \n\t"
1910 "paddw %%mm3, %%mm2 \n\t"
1911 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above; reload the ff_w1111 constant */
1912 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1913 "psrlw $2, %%mm4 \n\t"
1914 "psrlw $2, %%mm2 \n\t"
1916 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1917 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1919 "pmaddwd %%mm4, %%mm1 \n\t"
1920 "pmaddwd %%mm2, %%mm3 \n\t"
1921 "pmaddwd %%mm6, %%mm4 \n\t"
1922 "pmaddwd %%mm6, %%mm2 \n\t"
1923 #ifndef FAST_BGR2YV12
1924 "psrad $8, %%mm4 \n\t"
1925 "psrad $8, %%mm1 \n\t"
1926 "psrad $8, %%mm2 \n\t"
1927 "psrad $8, %%mm3 \n\t"
1929 "packssdw %%mm2, %%mm4 \n\t"
1930 "packssdw %%mm3, %%mm1 \n\t"
1931 "pmaddwd %%mm5, %%mm4 \n\t"
1932 "pmaddwd %%mm5, %%mm1 \n\t"
1933 "add $24, %%"REG_d" \n\t"
1934 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1935 "psraw $7, %%mm4 \n\t"
/* merge the four U and four V results, offset, store 4 bytes per plane */
1937 "movq %%mm0, %%mm1 \n\t"
1938 "punpckldq %%mm4, %%mm0 \n\t"
1939 "punpckhdq %%mm4, %%mm1 \n\t"
1940 "packsswb %%mm1, %%mm0 \n\t"
1941 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1942 "movd %%mm0, (%2, %%"REG_a") \n\t"
1943 "punpckhdq %%mm0, %%mm0 \n\t"
1944 "movd %%mm0, (%3, %%"REG_a") \n\t"
1945 "add $4, %%"REG_a" \n\t"
1947 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1948 : "%"REG_a, "%"REG_d
1951 udst += chromStride;
1952 vdst += chromStride;
1956 __asm__ volatile(EMMS" \n\t"
/* remaining (height - y) rows are converted by the plain-C implementation */
1960 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1962 #endif /* !COMPILE_TEMPLATE_SSE2 */
1964 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Interleave two byte planes line by line:
 *   dest[2*i] = src1[i], dest[2*i+1] = src2[i]  (see the C tail loop).
 * SSE2 path handles 16 input bytes per plane per iteration with aligned
 * loads and non-temporal stores; the MMX path does the same 16 bytes with
 * four movq/punpck pairs. Bytes past the last multiple of 16 are done in C. */
1965 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1966 int width, int height, int src1Stride,
1967 int src2Stride, int dstStride)
1971 for (h=0; h < height; h++) {
1974 #if COMPILE_TEMPLATE_SSE2
1976 "xor %%"REG_a", %%"REG_a" \n\t"
1978 PREFETCH" 64(%1, %%"REG_a") \n\t"
1979 PREFETCH" 64(%2, %%"REG_a") \n\t"
/* xmm0 and xmm1 both load the same src1 qword; punpckl/h then split it */
1980 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1981 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1982 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1983 "punpcklbw %%xmm2, %%xmm0 \n\t"
1984 "punpckhbw %%xmm2, %%xmm1 \n\t"
1985 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1986 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1987 "add $16, %%"REG_a" \n\t"
1988 "cmp %3, %%"REG_a" \n\t"
1990 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1991 : "memory", "%"REG_a""
1995 "xor %%"REG_a", %%"REG_a" \n\t"
1997 PREFETCH" 64(%1, %%"REG_a") \n\t"
1998 PREFETCH" 64(%2, %%"REG_a") \n\t"
1999 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2000 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2001 "movq %%mm0, %%mm1 \n\t"
2002 "movq %%mm2, %%mm3 \n\t"
2003 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2004 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2005 "punpcklbw %%mm4, %%mm0 \n\t"
2006 "punpckhbw %%mm4, %%mm1 \n\t"
2007 "punpcklbw %%mm5, %%mm2 \n\t"
2008 "punpckhbw %%mm5, %%mm3 \n\t"
2009 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2010 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2011 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2012 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2013 "add $16, %%"REG_a" \n\t"
2014 "cmp %3, %%"REG_a" \n\t"
2016 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2017 : "memory", "%"REG_a
/* scalar tail for the last width & 15 bytes */
2020 for (w= (width&(~15)); w < width; w++) {
2021 dest[2*w+0] = src1[w];
2022 dest[2*w+1] = src2[w];
2034 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2036 #if !COMPILE_TEMPLATE_SSE2
2037 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Double both chroma planes horizontally and vertically:
 *   d[2*x] = d[2*x+1] = s[x]  (see the C tail loops), with each source
 * line (index y>>1) written to two destination lines. The MMX loops
 * duplicate 32 source bytes into 64 output bytes per iteration via
 * punpcklbw/punpckhbw of a register with itself. */
2038 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2039 uint8_t *dst1, uint8_t *dst2,
2040 int width, int height,
2041 int srcStride1, int srcStride2,
2042 int dstStride1, int dstStride2)
2046 w=width/2; h=height/2;
2050 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* first plane */
2052 const uint8_t* s1=src1+srcStride1*(y>>1);
2053 uint8_t* d=dst1+dstStride1*y;
2055 for (;x<w-31;x+=32) {
2057 PREFETCH" 32%1 \n\t"
2058 "movq %1, %%mm0 \n\t"
2059 "movq 8%1, %%mm2 \n\t"
2060 "movq 16%1, %%mm4 \n\t"
2061 "movq 24%1, %%mm6 \n\t"
2062 "movq %%mm0, %%mm1 \n\t"
2063 "movq %%mm2, %%mm3 \n\t"
2064 "movq %%mm4, %%mm5 \n\t"
2065 "movq %%mm6, %%mm7 \n\t"
/* unpacking a register with itself duplicates every byte */
2066 "punpcklbw %%mm0, %%mm0 \n\t"
2067 "punpckhbw %%mm1, %%mm1 \n\t"
2068 "punpcklbw %%mm2, %%mm2 \n\t"
2069 "punpckhbw %%mm3, %%mm3 \n\t"
2070 "punpcklbw %%mm4, %%mm4 \n\t"
2071 "punpckhbw %%mm5, %%mm5 \n\t"
2072 "punpcklbw %%mm6, %%mm6 \n\t"
2073 "punpckhbw %%mm7, %%mm7 \n\t"
2074 MOVNTQ" %%mm0, %0 \n\t"
2075 MOVNTQ" %%mm1, 8%0 \n\t"
2076 MOVNTQ" %%mm2, 16%0 \n\t"
2077 MOVNTQ" %%mm3, 24%0 \n\t"
2078 MOVNTQ" %%mm4, 32%0 \n\t"
2079 MOVNTQ" %%mm5, 40%0 \n\t"
2080 MOVNTQ" %%mm6, 48%0 \n\t"
2081 MOVNTQ" %%mm7, 56%0"
2086 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second plane: identical processing */
2089 const uint8_t* s2=src2+srcStride2*(y>>1);
2090 uint8_t* d=dst2+dstStride2*y;
2092 for (;x<w-31;x+=32) {
2094 PREFETCH" 32%1 \n\t"
2095 "movq %1, %%mm0 \n\t"
2096 "movq 8%1, %%mm2 \n\t"
2097 "movq 16%1, %%mm4 \n\t"
2098 "movq 24%1, %%mm6 \n\t"
2099 "movq %%mm0, %%mm1 \n\t"
2100 "movq %%mm2, %%mm3 \n\t"
2101 "movq %%mm4, %%mm5 \n\t"
2102 "movq %%mm6, %%mm7 \n\t"
2103 "punpcklbw %%mm0, %%mm0 \n\t"
2104 "punpckhbw %%mm1, %%mm1 \n\t"
2105 "punpcklbw %%mm2, %%mm2 \n\t"
2106 "punpckhbw %%mm3, %%mm3 \n\t"
2107 "punpcklbw %%mm4, %%mm4 \n\t"
2108 "punpckhbw %%mm5, %%mm5 \n\t"
2109 "punpcklbw %%mm6, %%mm6 \n\t"
2110 "punpckhbw %%mm7, %%mm7 \n\t"
2111 MOVNTQ" %%mm0, %0 \n\t"
2112 MOVNTQ" %%mm1, 8%0 \n\t"
2113 MOVNTQ" %%mm2, 16%0 \n\t"
2114 MOVNTQ" %%mm3, 24%0 \n\t"
2115 MOVNTQ" %%mm4, 32%0 \n\t"
2116 MOVNTQ" %%mm5, 40%0 \n\t"
2117 MOVNTQ" %%mm6, 48%0 \n\t"
2118 MOVNTQ" %%mm7, 56%0"
2123 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/* Pack YVU9 (4:1:0) planes into YUY2: each U/V sample covers four luma
 * pixels horizontally (d[8*x + 1,3,5,7] pattern, see the C tail) and the
 * chroma planes are read at line y>>2. The asm processes 8 chroma samples
 * (32 luma pixels -> 64 output bytes) per iteration, duplicating each
 * chroma byte with punpck self-unpacks before interleaving with luma. */
2132 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2134 int width, int height,
2135 int srcStride1, int srcStride2,
2136 int srcStride3, int dstStride)
2140 w=width/2; h=height;
2142 const uint8_t* yp=src1+srcStride1*y;
2143 const uint8_t* up=src2+srcStride2*(y>>2);
2144 const uint8_t* vp=src3+srcStride3*(y>>2);
2145 uint8_t* d=dst+dstStride*y;
2149 PREFETCH" 32(%1, %0) \n\t"
2150 PREFETCH" 32(%2, %0) \n\t"
2151 PREFETCH" 32(%3, %0) \n\t"
2152 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2153 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2154 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2155 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2156 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2157 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2158 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2159 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2160 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2161 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2163 "movq %%mm1, %%mm6 \n\t"
2164 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2165 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2166 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2167 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2168 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2170 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2171 "movq 8(%1, %0, 4), %%mm0 \n\t"
2172 "movq %%mm0, %%mm3 \n\t"
2173 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2174 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2175 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2176 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2178 "movq %%mm4, %%mm6 \n\t"
2179 "movq 16(%1, %0, 4), %%mm0 \n\t"
2180 "movq %%mm0, %%mm3 \n\t"
2181 "punpcklbw %%mm5, %%mm4 \n\t"
2182 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2183 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2184 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2185 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2187 "punpckhbw %%mm5, %%mm6 \n\t"
2188 "movq 24(%1, %0, 4), %%mm0 \n\t"
2189 "movq %%mm0, %%mm3 \n\t"
2190 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2191 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2192 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2193 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2196 : "r"(yp), "r" (up), "r"(vp), "r"(d)
/* scalar tail: x indexes chroma, x2 = 4*x indexes luma */
2200 const int x2 = x<<2;
2203 d[8*x+2] = yp[x2+1];
2205 d[8*x+4] = yp[x2+2];
2207 d[8*x+6] = yp[x2+3];
2217 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Copy every even-positioned byte: dst[i] = src[2*i] (see scalar tail).
 * The MMX loop masks with 0x00FF (mm7) and packs, handling 32 source
 * bytes per iteration. Negative offsets (-30, -22, ...) indicate the
 * loop counter runs upward from a negative value against pre-biased
 * pointers; the setup/branch lines are outside this excerpt. */
2219 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2228 "pcmpeqw %%mm7, %%mm7 \n\t"
2229 "psrlw $8, %%mm7 \n\t"
2231 "movq -30(%1, %0, 2), %%mm0 \n\t"
2232 "movq -22(%1, %0, 2), %%mm1 \n\t"
2233 "movq -14(%1, %0, 2), %%mm2 \n\t"
2234 "movq -6(%1, %0, 2), %%mm3 \n\t"
2235 "pand %%mm7, %%mm0 \n\t"
2236 "pand %%mm7, %%mm1 \n\t"
2237 "pand %%mm7, %%mm2 \n\t"
2238 "pand %%mm7, %%mm3 \n\t"
2239 "packuswb %%mm1, %%mm0 \n\t"
2240 "packuswb %%mm3, %%mm2 \n\t"
2241 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2242 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2246 : "r"(src), "r"(dst)
2251 dst[count]= src[2*count];
2256 #if !COMPILE_TEMPLATE_AMD3DNOW
/* De-interleave the even bytes of 4-byte groups into two planes:
 *   dst0[i] = src[4*i], dst1[i] = src[4*i+2]  (see scalar tail).
 * First mask/pack keeps bytes 0 and 2 of each group, the second
 * mask/shift pass separates them into the dst0 and dst1 streams. */
2257 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2266 "pcmpeqw %%mm7, %%mm7 \n\t"
2267 "psrlw $8, %%mm7 \n\t"
2269 "movq -28(%1, %0, 4), %%mm0 \n\t"
2270 "movq -20(%1, %0, 4), %%mm1 \n\t"
2271 "movq -12(%1, %0, 4), %%mm2 \n\t"
2272 "movq -4(%1, %0, 4), %%mm3 \n\t"
2273 "pand %%mm7, %%mm0 \n\t"
2274 "pand %%mm7, %%mm1 \n\t"
2275 "pand %%mm7, %%mm2 \n\t"
2276 "pand %%mm7, %%mm3 \n\t"
2277 "packuswb %%mm1, %%mm0 \n\t"
2278 "packuswb %%mm3, %%mm2 \n\t"
2279 "movq %%mm0, %%mm1 \n\t"
2280 "movq %%mm2, %%mm3 \n\t"
2281 "psrlw $8, %%mm0 \n\t"
2282 "psrlw $8, %%mm2 \n\t"
2283 "pand %%mm7, %%mm1 \n\t"
2284 "pand %%mm7, %%mm3 \n\t"
2285 "packuswb %%mm2, %%mm0 \n\t"
2286 "packuswb %%mm3, %%mm1 \n\t"
2287 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2288 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2292 : "r"(src), "r"(dst0), "r"(dst1)
2297 dst0[count]= src[4*count+0];
2298 dst1[count]= src[4*count+2];
2302 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Like extract_even2, but averaging two source lines first:
 *   dst0[i] = avg(src0[4*i],   src1[4*i])
 *   dst1[i] = avg(src0[4*i+2], src1[4*i+2])
 * NOTE(review): the MMX path uses PAVGB (rounds the average up) while the
 * C tail uses (a+b)>>1 (truncates) — results may differ by 1 LSB between
 * the vector body and the scalar tail. */
2304 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2315 "pcmpeqw %%mm7, %%mm7 \n\t"
2316 "psrlw $8, %%mm7 \n\t"
2318 "movq -28(%1, %0, 4), %%mm0 \n\t"
2319 "movq -20(%1, %0, 4), %%mm1 \n\t"
2320 "movq -12(%1, %0, 4), %%mm2 \n\t"
2321 "movq -4(%1, %0, 4), %%mm3 \n\t"
2322 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2323 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2324 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2325 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2326 "pand %%mm7, %%mm0 \n\t"
2327 "pand %%mm7, %%mm1 \n\t"
2328 "pand %%mm7, %%mm2 \n\t"
2329 "pand %%mm7, %%mm3 \n\t"
2330 "packuswb %%mm1, %%mm0 \n\t"
2331 "packuswb %%mm3, %%mm2 \n\t"
2332 "movq %%mm0, %%mm1 \n\t"
2333 "movq %%mm2, %%mm3 \n\t"
2334 "psrlw $8, %%mm0 \n\t"
2335 "psrlw $8, %%mm2 \n\t"
2336 "pand %%mm7, %%mm1 \n\t"
2337 "pand %%mm7, %%mm3 \n\t"
2338 "packuswb %%mm2, %%mm0 \n\t"
2339 "packuswb %%mm3, %%mm1 \n\t"
2340 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2341 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2345 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2351 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2352 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2357 #if !COMPILE_TEMPLATE_AMD3DNOW
/* De-interleave the odd bytes of 4-byte groups into two planes (the asm
 * begins with psrlw $8 to select odd positions).
 * NOTE(review): the scalar tail indexes src[4*i+0]/[4*i+2] — presumably a
 * src pointer adjustment (not visible in this excerpt) realigns it to the
 * odd bytes; confirm against the full source. */
2358 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2367 "pcmpeqw %%mm7, %%mm7 \n\t"
2368 "psrlw $8, %%mm7 \n\t"
2370 "movq -28(%1, %0, 4), %%mm0 \n\t"
2371 "movq -20(%1, %0, 4), %%mm1 \n\t"
2372 "movq -12(%1, %0, 4), %%mm2 \n\t"
2373 "movq -4(%1, %0, 4), %%mm3 \n\t"
2374 "psrlw $8, %%mm0 \n\t"
2375 "psrlw $8, %%mm1 \n\t"
2376 "psrlw $8, %%mm2 \n\t"
2377 "psrlw $8, %%mm3 \n\t"
2378 "packuswb %%mm1, %%mm0 \n\t"
2379 "packuswb %%mm3, %%mm2 \n\t"
2380 "movq %%mm0, %%mm1 \n\t"
2381 "movq %%mm2, %%mm3 \n\t"
2382 "psrlw $8, %%mm0 \n\t"
2383 "psrlw $8, %%mm2 \n\t"
2384 "pand %%mm7, %%mm1 \n\t"
2385 "pand %%mm7, %%mm3 \n\t"
2386 "packuswb %%mm2, %%mm0 \n\t"
2387 "packuswb %%mm3, %%mm1 \n\t"
2388 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2389 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2393 : "r"(src), "r"(dst0), "r"(dst1)
2399 dst0[count]= src[4*count+0];
2400 dst1[count]= src[4*count+2];
2404 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Like extract_odd2 but averaging two source lines first (PAVGB).
 * NOTE(review): as with extract_odd2, the scalar tail's [4*i+0]/[4*i+2]
 * indexing presumably relies on a pointer adjustment not visible here;
 * also PAVGB rounds up while the C tail's >>1 truncates. */
2406 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2417 "pcmpeqw %%mm7, %%mm7 \n\t"
2418 "psrlw $8, %%mm7 \n\t"
2420 "movq -28(%1, %0, 4), %%mm0 \n\t"
2421 "movq -20(%1, %0, 4), %%mm1 \n\t"
2422 "movq -12(%1, %0, 4), %%mm2 \n\t"
2423 "movq -4(%1, %0, 4), %%mm3 \n\t"
2424 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2425 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2426 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2427 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2428 "psrlw $8, %%mm0 \n\t"
2429 "psrlw $8, %%mm1 \n\t"
2430 "psrlw $8, %%mm2 \n\t"
2431 "psrlw $8, %%mm3 \n\t"
2432 "packuswb %%mm1, %%mm0 \n\t"
2433 "packuswb %%mm3, %%mm2 \n\t"
2434 "movq %%mm0, %%mm1 \n\t"
2435 "movq %%mm2, %%mm3 \n\t"
2436 "psrlw $8, %%mm0 \n\t"
2437 "psrlw $8, %%mm2 \n\t"
2438 "pand %%mm7, %%mm1 \n\t"
2439 "pand %%mm7, %%mm3 \n\t"
2440 "packuswb %%mm2, %%mm0 \n\t"
2441 "packuswb %%mm3, %%mm1 \n\t"
2442 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2443 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2447 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2455 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2456 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/* Packed YUYV -> planar 4:2:0: luma from the even bytes of each line;
 * chroma from the odd bytes, averaged with the previous source line
 * (src-srcStride). chromWidth = -((-width)>>1) rounds width/2 upward.
 * NOTE(review): for 4:2:0 the chroma call presumably runs only on
 * alternate lines — the guard line is not visible in this excerpt. */
2461 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2462 int width, int height,
2463 int lumStride, int chromStride, int srcStride)
2466 const int chromWidth= -((-width)>>1);
2468 for (y=0; y<height; y++) {
2469 RENAME(extract_even)(src, ydst, width);
2471 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2486 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Packed YUYV -> planar 4:2:2: luma from the even bytes, U/V from the odd
 * bytes of every line. chromWidth rounds width/2 upward. */
2487 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2488 int width, int height,
2489 int lumStride, int chromStride, int srcStride)
2492 const int chromWidth= -((-width)>>1);
2494 for (y=0; y<height; y++) {
2495 RENAME(extract_even)(src, ydst, width);
2496 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2509 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Packed UYVY -> planar 4:2:0: luma from the odd bytes (src+1); chroma
 * from the even bytes, averaged with the previous line (src-srcStride).
 * NOTE(review): for 4:2:0 the chroma call presumably runs only on
 * alternate lines — the guard line is not visible in this excerpt. */
2511 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2512 int width, int height,
2513 int lumStride, int chromStride, int srcStride)
2516 const int chromWidth= -((-width)>>1);
2518 for (y=0; y<height; y++) {
2519 RENAME(extract_even)(src+1, ydst, width);
2521 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2536 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Packed UYVY -> planar 4:2:2: luma from the odd bytes (src+1), U/V from
 * the even bytes of every line. chromWidth rounds width/2 upward. */
2537 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2538 int width, int height,
2539 int lumStride, int chromStride, int srcStride)
2542 const int chromWidth= -((-width)>>1);
2544 for (y=0; y<height; y++) {
2545 RENAME(extract_even)(src+1, ydst, width);
2546 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2559 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2560 #endif /* !COMPILE_TEMPLATE_SSE2 */
2562 static inline void RENAME(rgb2rgb_init)(void)
2564 #if !COMPILE_TEMPLATE_SSE2
2565 #if !COMPILE_TEMPLATE_AMD3DNOW
2566 rgb15to16 = RENAME(rgb15to16);
2567 rgb15tobgr24 = RENAME(rgb15tobgr24);
2568 rgb15to32 = RENAME(rgb15to32);
2569 rgb16tobgr24 = RENAME(rgb16tobgr24);
2570 rgb16to32 = RENAME(rgb16to32);
2571 rgb16to15 = RENAME(rgb16to15);
2572 rgb24tobgr16 = RENAME(rgb24tobgr16);
2573 rgb24tobgr15 = RENAME(rgb24tobgr15);
2574 rgb24tobgr32 = RENAME(rgb24tobgr32);
2575 rgb32to16 = RENAME(rgb32to16);
2576 rgb32to15 = RENAME(rgb32to15);
2577 rgb32tobgr24 = RENAME(rgb32tobgr24);
2578 rgb24to15 = RENAME(rgb24to15);
2579 rgb24to16 = RENAME(rgb24to16);
2580 rgb24tobgr24 = RENAME(rgb24tobgr24);
2581 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2582 rgb32tobgr16 = RENAME(rgb32tobgr16);
2583 rgb32tobgr15 = RENAME(rgb32tobgr15);
2584 yv12toyuy2 = RENAME(yv12toyuy2);
2585 yv12touyvy = RENAME(yv12touyvy);
2586 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2587 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2588 yuy2toyv12 = RENAME(yuy2toyv12);
2589 vu9_to_vu12 = RENAME(vu9_to_vu12);
2590 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2591 uyvytoyuv422 = RENAME(uyvytoyuv422);
2592 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2593 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2596 planar2x = RENAME(planar2x);
2597 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
2598 rgb24toyv12 = RENAME(rgb24toyv12);
2600 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2601 uyvytoyuv420 = RENAME(uyvytoyuv420);
2602 #endif /* !COMPILE_TEMPLATE_SSE2 */
2604 #if !COMPILE_TEMPLATE_AMD3DNOW
2605 interleaveBytes = RENAME(interleaveBytes);
2606 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */