2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of Libav.
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
29 #include "libavutil/attributes.h"
37 #if COMPILE_TEMPLATE_AMD3DNOW
38 #define PREFETCH "prefetch"
39 #define PAVGB "pavgusb"
40 #elif COMPILE_TEMPLATE_MMXEXT
41 #define PREFETCH "prefetchnta"
44 #define PREFETCH " # nop"
47 #if COMPILE_TEMPLATE_AMD3DNOW
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #if COMPILE_TEMPLATE_MMXEXT
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
59 #define SFENCE " # nop"
62 #if !COMPILE_TEMPLATE_SSE2
64 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Convert packed 24-bit RGB to 32-bit with alpha filled in, using MMX.
 * Four 3-byte pixels are loaded per 32-byte store group (movd + punpckldq
 * at byte offsets 0/3, 6/9, 12/15, 18/21), the alpha mask preloaded into
 * mm7 (mask32a) is OR-ed in, and four quadwords are written with MOVNTQ
 * (non-temporal store under MMXEXT). SFENCE/EMMS afterwards flush the
 * write-combining buffers and restore x87 state.
 * NOTE(review): this is a line-sampled extract — loop header, pointer
 * advance and scalar tail are not visible here; asm left byte-identical.
 */
66 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
69 const uint8_t *s = src;
71 const uint8_t *mm_end;
73 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
75 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
78 PREFETCH" 32(%1) \n\t"
79 "movd (%1), %%mm0 \n\t"
80 "punpckldq 3(%1), %%mm0 \n\t"
81 "movd 6(%1), %%mm1 \n\t"
82 "punpckldq 9(%1), %%mm1 \n\t"
83 "movd 12(%1), %%mm2 \n\t"
84 "punpckldq 15(%1), %%mm2 \n\t"
85 "movd 18(%1), %%mm3 \n\t"
86 "punpckldq 21(%1), %%mm3 \n\t"
87 "por %%mm7, %%mm0 \n\t"
88 "por %%mm7, %%mm1 \n\t"
89 "por %%mm7, %%mm2 \n\t"
90 "por %%mm7, %%mm3 \n\t"
91 MOVNTQ" %%mm0, (%0) \n\t"
92 MOVNTQ" %%mm1, 8(%0) \n\t"
93 MOVNTQ" %%mm2, 16(%0) \n\t"
94 MOVNTQ" %%mm3, 24(%0)"
100 __asm__ volatile(SFENCE:::"memory");
101 __asm__ volatile(EMMS:::"memory");
/*
 * Macro: repack four quadwords of 32-bit pixels (mm0/mm1/mm4/mm5 with
 * their shifted copies in mm2/mm3/mm6/mm7) into three quadwords of
 * packed 24-bit pixels and store them with MOVNTQ. The mask24l/h/hh/
 * hhh/hhhh constants select the byte lanes that survive after each
 * shift so the alpha bytes are squeezed out.
 * NOTE(review): clobbers mm0-mm7; callers must load the eight input
 * registers first (see rgb32tobgr24 / rgb15tobgr24 below).
 */
110 #define STORE_BGR24_MMX \
111 "psrlq $8, %%mm2 \n\t" \
112 "psrlq $8, %%mm3 \n\t" \
113 "psrlq $8, %%mm6 \n\t" \
114 "psrlq $8, %%mm7 \n\t" \
115 "pand "MANGLE(mask24l)", %%mm0\n\t" \
116 "pand "MANGLE(mask24l)", %%mm1\n\t" \
117 "pand "MANGLE(mask24l)", %%mm4\n\t" \
118 "pand "MANGLE(mask24l)", %%mm5\n\t" \
119 "pand "MANGLE(mask24h)", %%mm2\n\t" \
120 "pand "MANGLE(mask24h)", %%mm3\n\t" \
121 "pand "MANGLE(mask24h)", %%mm6\n\t" \
122 "pand "MANGLE(mask24h)", %%mm7\n\t" \
123 "por %%mm2, %%mm0 \n\t" \
124 "por %%mm3, %%mm1 \n\t" \
125 "por %%mm6, %%mm4 \n\t" \
126 "por %%mm7, %%mm5 \n\t" \
128 "movq %%mm1, %%mm2 \n\t" \
129 "movq %%mm4, %%mm3 \n\t" \
130 "psllq $48, %%mm2 \n\t" \
131 "psllq $32, %%mm3 \n\t" \
132 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
133 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
134 "por %%mm2, %%mm0 \n\t" \
135 "psrlq $16, %%mm1 \n\t" \
136 "psrlq $32, %%mm4 \n\t" \
137 "psllq $16, %%mm5 \n\t" \
138 "por %%mm3, %%mm1 \n\t" \
139 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
140 "por %%mm5, %%mm4 \n\t" \
142 MOVNTQ" %%mm0, (%0) \n\t" \
143 MOVNTQ" %%mm1, 8(%0) \n\t" \
144 MOVNTQ" %%mm4, 16(%0)"
/*
 * Convert packed 32-bit pixels to 24-bit (dropping the alpha byte) with
 * MMX: 32 input bytes are loaded into mm0/mm1/mm4/mm5, duplicated into
 * mm2/mm3/mm6/mm7, then repacked by STORE_BGR24_MMX (see macro above).
 * NOTE(review): partial extract — the loop structure, the invocation of
 * STORE_BGR24_MMX and the scalar tail loop are outside the visible lines.
 */
147 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
150 const uint8_t *s = src;
152 const uint8_t *mm_end;
154 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
158 PREFETCH" 32(%1) \n\t"
159 "movq (%1), %%mm0 \n\t"
160 "movq 8(%1), %%mm1 \n\t"
161 "movq 16(%1), %%mm4 \n\t"
162 "movq 24(%1), %%mm5 \n\t"
163 "movq %%mm0, %%mm2 \n\t"
164 "movq %%mm1, %%mm3 \n\t"
165 "movq %%mm4, %%mm6 \n\t"
166 "movq %%mm5, %%mm7 \n\t"
173 __asm__ volatile(SFENCE:::"memory");
174 __asm__ volatile(EMMS:::"memory");
184 original by Strepto/Astral
185 ported to gcc & bugfixed: A'rpi
186 MMXEXT, 3DNOW optimization by Nick Kurshev
187 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * RGB555 -> RGB565: each 15-bit pixel's G+R field (masked by mask15s)
 * is added to itself, i.e. shifted left by one, widening green to 6 bits
 * while blue stays in place — the classic "and & add" trick credited in
 * the comment above. MMX path handles 16 bytes per iteration; the
 * visible scalar fallbacks do the same with 32-bit and 16-bit loads.
 * NOTE(review): partial extract — loop bounds and pointer advances for
 * the scalar paths are not visible here.
 */
189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
191 register const uint8_t* s=src;
192 register uint8_t* d=dst;
193 register const uint8_t *end;
194 const uint8_t *mm_end;
196 __asm__ volatile(PREFETCH" %0"::"m"(*s));
197 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
201 PREFETCH" 32(%1) \n\t"
202 "movq (%1), %%mm0 \n\t"
203 "movq 8(%1), %%mm2 \n\t"
204 "movq %%mm0, %%mm1 \n\t"
205 "movq %%mm2, %%mm3 \n\t"
206 "pand %%mm4, %%mm0 \n\t"
207 "pand %%mm4, %%mm2 \n\t"
208 "paddw %%mm1, %%mm0 \n\t"
209 "paddw %%mm3, %%mm2 \n\t"
210 MOVNTQ" %%mm0, (%0) \n\t"
211 MOVNTQ" %%mm2, 8(%0)"
217 __asm__ volatile(SFENCE:::"memory");
218 __asm__ volatile(EMMS:::"memory");
/* Scalar fallback: two pixels at a time via a 32-bit load... */
221 register unsigned x= *((const uint32_t *)s);
222 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* ...then at most one leftover pixel via a 16-bit load. */
227 register unsigned short x= *((const uint16_t *)s);
228 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * RGB565 -> RGB555: shift red+green right by one (mask15rg keeps the
 * surviving R/G bits) and OR back the untouched 5-bit blue field
 * (mask15b). MMX path processes 16 bytes per iteration; scalar
 * fallbacks below mirror it for 2 pixels and then 1 pixel.
 * NOTE(review): partial extract — loop headers and pointer advances
 * are not visible; code left byte-identical.
 */
234 register const uint8_t* s=src;
235 register uint8_t* d=dst;
236 register const uint8_t *end;
237 const uint8_t *mm_end;
239 __asm__ volatile(PREFETCH" %0"::"m"(*s));
240 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
241 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
245 PREFETCH" 32(%1) \n\t"
246 "movq (%1), %%mm0 \n\t"
247 "movq 8(%1), %%mm2 \n\t"
248 "movq %%mm0, %%mm1 \n\t"
249 "movq %%mm2, %%mm3 \n\t"
250 "psrlq $1, %%mm0 \n\t"
251 "psrlq $1, %%mm2 \n\t"
252 "pand %%mm7, %%mm0 \n\t"
253 "pand %%mm7, %%mm2 \n\t"
254 "pand %%mm6, %%mm1 \n\t"
255 "pand %%mm6, %%mm3 \n\t"
256 "por %%mm1, %%mm0 \n\t"
257 "por %%mm3, %%mm2 \n\t"
258 MOVNTQ" %%mm0, (%0) \n\t"
259 MOVNTQ" %%mm2, 8(%0)"
265 __asm__ volatile(SFENCE:::"memory");
266 __asm__ volatile(EMMS:::"memory");
/* Scalar fallback: two pixels per 32-bit word, then one leftover. */
269 register uint32_t x= *((const uint32_t*)s);
270 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
275 register uint16_t x= *((const uint16_t*)s);
276 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * 32-bit pixels -> RGB565 using the pmaddwd trick: mm6 (mask3216br)
 * isolates blue/red, pmaddwd with mul3216 scales and merges them in one
 * multiply-add, mm5 (mask3216g) keeps green, and psrld/pslld align the
 * three fields before the final OR. Four pixels per iteration.
 * NOTE(review): partial extract — the asm statement's open/close, output
 * operands and loop lines are not all visible; left byte-identical.
 */
280 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
282 const uint8_t *s = src;
284 const uint8_t *mm_end;
285 uint16_t *d = (uint16_t *)dst;
289 "movq %3, %%mm5 \n\t"
290 "movq %4, %%mm6 \n\t"
291 "movq %5, %%mm7 \n\t"
295 PREFETCH" 32(%1) \n\t"
296 "movd (%1), %%mm0 \n\t"
297 "movd 4(%1), %%mm3 \n\t"
298 "punpckldq 8(%1), %%mm0 \n\t"
299 "punpckldq 12(%1), %%mm3 \n\t"
300 "movq %%mm0, %%mm1 \n\t"
301 "movq %%mm3, %%mm4 \n\t"
302 "pand %%mm6, %%mm0 \n\t"
303 "pand %%mm6, %%mm3 \n\t"
304 "pmaddwd %%mm7, %%mm0 \n\t"
305 "pmaddwd %%mm7, %%mm3 \n\t"
306 "pand %%mm5, %%mm1 \n\t"
307 "pand %%mm5, %%mm4 \n\t"
308 "por %%mm1, %%mm0 \n\t"
309 "por %%mm4, %%mm3 \n\t"
310 "psrld $5, %%mm0 \n\t"
311 "pslld $11, %%mm3 \n\t"
312 "por %%mm3, %%mm0 \n\t"
313 MOVNTQ" %%mm0, (%0) \n\t"
320 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
322 __asm__ volatile(SFENCE:::"memory");
323 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: one pixel at a time, 8-8-8 -> 5-6-5. */
325 register int rgb = *(const uint32_t*)s; s += 4;
326 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * 32-bit pixels -> 16-bit with R/B swapped (BGR565): for each half of
 * four pixels, three copies are shifted to line red (psllq $8 + mm7 =
 * red_16mask), green (psrlq $5 + mm6 = green_16mask) and blue (psrlq
 * $19 + blue_16mask operand %2) into the 5-6-5 layout, then OR-merged
 * and the two 32-bit halves combined via psllq $16.
 * NOTE(review): partial extract — loop lines not visible.
 */
330 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
332 const uint8_t *s = src;
334 const uint8_t *mm_end;
335 uint16_t *d = (uint16_t *)dst;
337 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
339 "movq %0, %%mm7 \n\t"
340 "movq %1, %%mm6 \n\t"
341 ::"m"(red_16mask),"m"(green_16mask));
345 PREFETCH" 32(%1) \n\t"
346 "movd (%1), %%mm0 \n\t"
347 "movd 4(%1), %%mm3 \n\t"
348 "punpckldq 8(%1), %%mm0 \n\t"
349 "punpckldq 12(%1), %%mm3 \n\t"
350 "movq %%mm0, %%mm1 \n\t"
351 "movq %%mm0, %%mm2 \n\t"
352 "movq %%mm3, %%mm4 \n\t"
353 "movq %%mm3, %%mm5 \n\t"
354 "psllq $8, %%mm0 \n\t"
355 "psllq $8, %%mm3 \n\t"
356 "pand %%mm7, %%mm0 \n\t"
357 "pand %%mm7, %%mm3 \n\t"
358 "psrlq $5, %%mm1 \n\t"
359 "psrlq $5, %%mm4 \n\t"
360 "pand %%mm6, %%mm1 \n\t"
361 "pand %%mm6, %%mm4 \n\t"
362 "psrlq $19, %%mm2 \n\t"
363 "psrlq $19, %%mm5 \n\t"
364 "pand %2, %%mm2 \n\t"
365 "pand %2, %%mm5 \n\t"
366 "por %%mm1, %%mm0 \n\t"
367 "por %%mm4, %%mm3 \n\t"
368 "por %%mm2, %%mm0 \n\t"
369 "por %%mm5, %%mm3 \n\t"
370 "psllq $16, %%mm3 \n\t"
371 "por %%mm3, %%mm0 \n\t"
372 MOVNTQ" %%mm0, (%0) \n\t"
373 :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
377 __asm__ volatile(SFENCE:::"memory");
378 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: swap R/B while packing 8-8-8 into 5-6-5. */
380 register int rgb = *(const uint32_t*)s; s += 4;
381 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * 32-bit pixels -> RGB555 via the same pmaddwd scheme as rgb32to16
 * above, but with mask3215g/mul3215 and shift counts 6/10 so green
 * gets only 5 bits. Four pixels per iteration; scalar tail below.
 * NOTE(review): partial extract — loop lines not visible.
 */
385 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
387 const uint8_t *s = src;
389 const uint8_t *mm_end;
390 uint16_t *d = (uint16_t *)dst;
394 "movq %3, %%mm5 \n\t"
395 "movq %4, %%mm6 \n\t"
396 "movq %5, %%mm7 \n\t"
400 PREFETCH" 32(%1) \n\t"
401 "movd (%1), %%mm0 \n\t"
402 "movd 4(%1), %%mm3 \n\t"
403 "punpckldq 8(%1), %%mm0 \n\t"
404 "punpckldq 12(%1), %%mm3 \n\t"
405 "movq %%mm0, %%mm1 \n\t"
406 "movq %%mm3, %%mm4 \n\t"
407 "pand %%mm6, %%mm0 \n\t"
408 "pand %%mm6, %%mm3 \n\t"
409 "pmaddwd %%mm7, %%mm0 \n\t"
410 "pmaddwd %%mm7, %%mm3 \n\t"
411 "pand %%mm5, %%mm1 \n\t"
412 "pand %%mm5, %%mm4 \n\t"
413 "por %%mm1, %%mm0 \n\t"
414 "por %%mm4, %%mm3 \n\t"
415 "psrld $6, %%mm0 \n\t"
416 "pslld $10, %%mm3 \n\t"
417 "por %%mm3, %%mm0 \n\t"
418 MOVNTQ" %%mm0, (%0) \n\t"
425 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
427 __asm__ volatile(SFENCE:::"memory");
428 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: 8-8-8 -> 5-5-5. */
430 register int rgb = *(const uint32_t*)s; s += 4;
431 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * 32-bit pixels -> 15-bit with R/B swapped (BGR555). Identical shape to
 * rgb32tobgr16 but with red_15mask/green_15mask/blue_15mask and shift
 * counts 7/6/19 to build the 5-5-5 layout.
 * NOTE(review): partial extract — loop lines not visible.
 */
435 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
437 const uint8_t *s = src;
439 const uint8_t *mm_end;
440 uint16_t *d = (uint16_t *)dst;
442 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
444 "movq %0, %%mm7 \n\t"
445 "movq %1, %%mm6 \n\t"
446 ::"m"(red_15mask),"m"(green_15mask));
450 PREFETCH" 32(%1) \n\t"
451 "movd (%1), %%mm0 \n\t"
452 "movd 4(%1), %%mm3 \n\t"
453 "punpckldq 8(%1), %%mm0 \n\t"
454 "punpckldq 12(%1), %%mm3 \n\t"
455 "movq %%mm0, %%mm1 \n\t"
456 "movq %%mm0, %%mm2 \n\t"
457 "movq %%mm3, %%mm4 \n\t"
458 "movq %%mm3, %%mm5 \n\t"
459 "psllq $7, %%mm0 \n\t"
460 "psllq $7, %%mm3 \n\t"
461 "pand %%mm7, %%mm0 \n\t"
462 "pand %%mm7, %%mm3 \n\t"
463 "psrlq $6, %%mm1 \n\t"
464 "psrlq $6, %%mm4 \n\t"
465 "pand %%mm6, %%mm1 \n\t"
466 "pand %%mm6, %%mm4 \n\t"
467 "psrlq $19, %%mm2 \n\t"
468 "psrlq $19, %%mm5 \n\t"
469 "pand %2, %%mm2 \n\t"
470 "pand %2, %%mm5 \n\t"
471 "por %%mm1, %%mm0 \n\t"
472 "por %%mm4, %%mm3 \n\t"
473 "por %%mm2, %%mm0 \n\t"
474 "por %%mm5, %%mm3 \n\t"
475 "psllq $16, %%mm3 \n\t"
476 "por %%mm3, %%mm0 \n\t"
477 MOVNTQ" %%mm0, (%0) \n\t"
478 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
482 __asm__ volatile(SFENCE:::"memory");
483 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: swap R/B while packing into 5-5-5. */
485 register int rgb = *(const uint32_t*)s; s += 4;
486 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * Packed 24-bit -> 16-bit (565) with R/B swap: four 3-byte pixels are
 * gathered with movd/punpckldq at offsets 0/3/6/9, then three shifted
 * copies are masked into blue (psrlq $3, %2), green (psrlq $5, mm6) and
 * red (psrlq $8, mm7) positions and OR-merged.
 * NOTE(review): partial extract — loop lines and the scalar tail's b/g/r
 * byte loads are not visible; left byte-identical.
 */
490 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
492 const uint8_t *s = src;
494 const uint8_t *mm_end;
495 uint16_t *d = (uint16_t *)dst;
497 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
499 "movq %0, %%mm7 \n\t"
500 "movq %1, %%mm6 \n\t"
501 ::"m"(red_16mask),"m"(green_16mask));
505 PREFETCH" 32(%1) \n\t"
506 "movd (%1), %%mm0 \n\t"
507 "movd 3(%1), %%mm3 \n\t"
508 "punpckldq 6(%1), %%mm0 \n\t"
509 "punpckldq 9(%1), %%mm3 \n\t"
510 "movq %%mm0, %%mm1 \n\t"
511 "movq %%mm0, %%mm2 \n\t"
512 "movq %%mm3, %%mm4 \n\t"
513 "movq %%mm3, %%mm5 \n\t"
514 "psrlq $3, %%mm0 \n\t"
515 "psrlq $3, %%mm3 \n\t"
516 "pand %2, %%mm0 \n\t"
517 "pand %2, %%mm3 \n\t"
518 "psrlq $5, %%mm1 \n\t"
519 "psrlq $5, %%mm4 \n\t"
520 "pand %%mm6, %%mm1 \n\t"
521 "pand %%mm6, %%mm4 \n\t"
522 "psrlq $8, %%mm2 \n\t"
523 "psrlq $8, %%mm5 \n\t"
524 "pand %%mm7, %%mm2 \n\t"
525 "pand %%mm7, %%mm5 \n\t"
526 "por %%mm1, %%mm0 \n\t"
527 "por %%mm4, %%mm3 \n\t"
528 "por %%mm2, %%mm0 \n\t"
529 "por %%mm5, %%mm3 \n\t"
530 "psllq $16, %%mm3 \n\t"
531 "por %%mm3, %%mm0 \n\t"
532 MOVNTQ" %%mm0, (%0) \n\t"
533 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
537 __asm__ volatile(SFENCE:::"memory");
538 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b, g, r presumably loaded per-byte just above — TODO confirm. */
543 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit -> 16-bit (565), same channel order as input: same
 * gather as rgb24tobgr16 but red is moved with psllq $8 + red_16mask
 * and blue with psrlq $19 + blue_16mask, i.e. the mirror-image shifts.
 * NOTE(review): partial extract — loop lines and per-byte tail loads
 * are not visible; left byte-identical.
 */
547 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
549 const uint8_t *s = src;
551 const uint8_t *mm_end;
552 uint16_t *d = (uint16_t *)dst;
554 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
556 "movq %0, %%mm7 \n\t"
557 "movq %1, %%mm6 \n\t"
558 ::"m"(red_16mask),"m"(green_16mask));
562 PREFETCH" 32(%1) \n\t"
563 "movd (%1), %%mm0 \n\t"
564 "movd 3(%1), %%mm3 \n\t"
565 "punpckldq 6(%1), %%mm0 \n\t"
566 "punpckldq 9(%1), %%mm3 \n\t"
567 "movq %%mm0, %%mm1 \n\t"
568 "movq %%mm0, %%mm2 \n\t"
569 "movq %%mm3, %%mm4 \n\t"
570 "movq %%mm3, %%mm5 \n\t"
571 "psllq $8, %%mm0 \n\t"
572 "psllq $8, %%mm3 \n\t"
573 "pand %%mm7, %%mm0 \n\t"
574 "pand %%mm7, %%mm3 \n\t"
575 "psrlq $5, %%mm1 \n\t"
576 "psrlq $5, %%mm4 \n\t"
577 "pand %%mm6, %%mm1 \n\t"
578 "pand %%mm6, %%mm4 \n\t"
579 "psrlq $19, %%mm2 \n\t"
580 "psrlq $19, %%mm5 \n\t"
581 "pand %2, %%mm2 \n\t"
582 "pand %2, %%mm5 \n\t"
583 "por %%mm1, %%mm0 \n\t"
584 "por %%mm4, %%mm3 \n\t"
585 "por %%mm2, %%mm0 \n\t"
586 "por %%mm5, %%mm3 \n\t"
587 "psllq $16, %%mm3 \n\t"
588 "por %%mm3, %%mm0 \n\t"
589 MOVNTQ" %%mm0, (%0) \n\t"
590 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
594 __asm__ volatile(SFENCE:::"memory");
595 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b, g, r presumably loaded per-byte just above — TODO confirm. */
600 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Packed 24-bit -> 15-bit (555) with R/B swap: like rgb24tobgr16 but
 * with 15-bit masks and shift counts 3/6/9 so green and red each get
 * 5 bits. NOTE(review): partial extract — loop lines and per-byte tail
 * loads are not visible; left byte-identical.
 */
604 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
606 const uint8_t *s = src;
608 const uint8_t *mm_end;
609 uint16_t *d = (uint16_t *)dst;
611 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
613 "movq %0, %%mm7 \n\t"
614 "movq %1, %%mm6 \n\t"
615 ::"m"(red_15mask),"m"(green_15mask));
619 PREFETCH" 32(%1) \n\t"
620 "movd (%1), %%mm0 \n\t"
621 "movd 3(%1), %%mm3 \n\t"
622 "punpckldq 6(%1), %%mm0 \n\t"
623 "punpckldq 9(%1), %%mm3 \n\t"
624 "movq %%mm0, %%mm1 \n\t"
625 "movq %%mm0, %%mm2 \n\t"
626 "movq %%mm3, %%mm4 \n\t"
627 "movq %%mm3, %%mm5 \n\t"
628 "psrlq $3, %%mm0 \n\t"
629 "psrlq $3, %%mm3 \n\t"
630 "pand %2, %%mm0 \n\t"
631 "pand %2, %%mm3 \n\t"
632 "psrlq $6, %%mm1 \n\t"
633 "psrlq $6, %%mm4 \n\t"
634 "pand %%mm6, %%mm1 \n\t"
635 "pand %%mm6, %%mm4 \n\t"
636 "psrlq $9, %%mm2 \n\t"
637 "psrlq $9, %%mm5 \n\t"
638 "pand %%mm7, %%mm2 \n\t"
639 "pand %%mm7, %%mm5 \n\t"
640 "por %%mm1, %%mm0 \n\t"
641 "por %%mm4, %%mm3 \n\t"
642 "por %%mm2, %%mm0 \n\t"
643 "por %%mm5, %%mm3 \n\t"
644 "psllq $16, %%mm3 \n\t"
645 "por %%mm3, %%mm0 \n\t"
646 MOVNTQ" %%mm0, (%0) \n\t"
647 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
651 __asm__ volatile(SFENCE:::"memory");
652 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b, g, r presumably loaded per-byte just above — TODO confirm. */
657 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * Packed 24-bit -> 15-bit (555), same channel order as input: mirror of
 * rgb24tobgr15 with red via psllq $7 + red_15mask and blue via
 * psrlq $19 + blue_15mask.
 * NOTE(review): partial extract — loop lines and per-byte tail loads
 * are not visible; left byte-identical.
 */
661 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
663 const uint8_t *s = src;
665 const uint8_t *mm_end;
666 uint16_t *d = (uint16_t *)dst;
668 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
670 "movq %0, %%mm7 \n\t"
671 "movq %1, %%mm6 \n\t"
672 ::"m"(red_15mask),"m"(green_15mask));
676 PREFETCH" 32(%1) \n\t"
677 "movd (%1), %%mm0 \n\t"
678 "movd 3(%1), %%mm3 \n\t"
679 "punpckldq 6(%1), %%mm0 \n\t"
680 "punpckldq 9(%1), %%mm3 \n\t"
681 "movq %%mm0, %%mm1 \n\t"
682 "movq %%mm0, %%mm2 \n\t"
683 "movq %%mm3, %%mm4 \n\t"
684 "movq %%mm3, %%mm5 \n\t"
685 "psllq $7, %%mm0 \n\t"
686 "psllq $7, %%mm3 \n\t"
687 "pand %%mm7, %%mm0 \n\t"
688 "pand %%mm7, %%mm3 \n\t"
689 "psrlq $6, %%mm1 \n\t"
690 "psrlq $6, %%mm4 \n\t"
691 "pand %%mm6, %%mm1 \n\t"
692 "pand %%mm6, %%mm4 \n\t"
693 "psrlq $19, %%mm2 \n\t"
694 "psrlq $19, %%mm5 \n\t"
695 "pand %2, %%mm2 \n\t"
696 "pand %2, %%mm5 \n\t"
697 "por %%mm1, %%mm0 \n\t"
698 "por %%mm4, %%mm3 \n\t"
699 "por %%mm2, %%mm0 \n\t"
700 "por %%mm5, %%mm3 \n\t"
701 "psllq $16, %%mm3 \n\t"
702 "por %%mm3, %%mm0 \n\t"
703 MOVNTQ" %%mm0, (%0) \n\t"
704 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
708 __asm__ volatile(SFENCE:::"memory");
709 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b, g, r presumably loaded per-byte just above — TODO confirm. */
714 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
719 I use less accurate approximation here by simply left-shifting the input
720 value and filling the low order bits with zeroes. This method improves PNG
721 compression but this scheme cannot reproduce white exactly, since it does
722 not generate an all-ones maximum value; the net effect is to darken the
725 The better method should be "left bit replication":
735 | leftmost bits repeated to fill open bits
/*
 * RGB555 -> packed 24-bit BGR: per 8-pixel group the three 5-bit fields
 * are isolated with mask15b/mask15g/mask15r, shifted into byte position
 * (left-shift fill, per the accuracy note above), expanded to 16 bits
 * against mmx_null via punpcklwd/punpckhwd, and recombined; the second
 * half of the asm repeats for pixels 4..7 and the "borrowed 32 to 24"
 * shuffle feeds STORE_BGR24_MMX-style packing (register moves visible
 * at the bottom).
 * NOTE(review): heavily sampled extract — the asm statement boundaries
 * and scalar-tail loop header are not visible; left byte-identical.
 */
739 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
742 const uint16_t *mm_end;
744 const uint16_t *s = (const uint16_t*)src;
745 end = s + src_size/2;
746 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
750 PREFETCH" 32(%1) \n\t"
751 "movq (%1), %%mm0 \n\t"
752 "movq (%1), %%mm1 \n\t"
753 "movq (%1), %%mm2 \n\t"
754 "pand %2, %%mm0 \n\t"
755 "pand %3, %%mm1 \n\t"
756 "pand %4, %%mm2 \n\t"
757 "psllq $3, %%mm0 \n\t"
758 "psrlq $2, %%mm1 \n\t"
759 "psrlq $7, %%mm2 \n\t"
760 "movq %%mm0, %%mm3 \n\t"
761 "movq %%mm1, %%mm4 \n\t"
762 "movq %%mm2, %%mm5 \n\t"
763 "punpcklwd %5, %%mm0 \n\t"
764 "punpcklwd %5, %%mm1 \n\t"
765 "punpcklwd %5, %%mm2 \n\t"
766 "punpckhwd %5, %%mm3 \n\t"
767 "punpckhwd %5, %%mm4 \n\t"
768 "punpckhwd %5, %%mm5 \n\t"
769 "psllq $8, %%mm1 \n\t"
770 "psllq $16, %%mm2 \n\t"
771 "por %%mm1, %%mm0 \n\t"
772 "por %%mm2, %%mm0 \n\t"
773 "psllq $8, %%mm4 \n\t"
774 "psllq $16, %%mm5 \n\t"
775 "por %%mm4, %%mm3 \n\t"
776 "por %%mm5, %%mm3 \n\t"
778 "movq %%mm0, %%mm6 \n\t"
779 "movq %%mm3, %%mm7 \n\t"
781 "movq 8(%1), %%mm0 \n\t"
782 "movq 8(%1), %%mm1 \n\t"
783 "movq 8(%1), %%mm2 \n\t"
784 "pand %2, %%mm0 \n\t"
785 "pand %3, %%mm1 \n\t"
786 "pand %4, %%mm2 \n\t"
787 "psllq $3, %%mm0 \n\t"
788 "psrlq $2, %%mm1 \n\t"
789 "psrlq $7, %%mm2 \n\t"
790 "movq %%mm0, %%mm3 \n\t"
791 "movq %%mm1, %%mm4 \n\t"
792 "movq %%mm2, %%mm5 \n\t"
793 "punpcklwd %5, %%mm0 \n\t"
794 "punpcklwd %5, %%mm1 \n\t"
795 "punpcklwd %5, %%mm2 \n\t"
796 "punpckhwd %5, %%mm3 \n\t"
797 "punpckhwd %5, %%mm4 \n\t"
798 "punpckhwd %5, %%mm5 \n\t"
799 "psllq $8, %%mm1 \n\t"
800 "psllq $16, %%mm2 \n\t"
801 "por %%mm1, %%mm0 \n\t"
802 "por %%mm2, %%mm0 \n\t"
803 "psllq $8, %%mm4 \n\t"
804 "psllq $16, %%mm5 \n\t"
805 "por %%mm4, %%mm3 \n\t"
806 "por %%mm5, %%mm3 \n\t"
809 :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
811 /* borrowed 32 to 24 */
813 "movq %%mm0, %%mm4 \n\t"
814 "movq %%mm3, %%mm5 \n\t"
815 "movq %%mm6, %%mm0 \n\t"
816 "movq %%mm7, %%mm1 \n\t"
818 "movq %%mm4, %%mm6 \n\t"
819 "movq %%mm5, %%mm7 \n\t"
820 "movq %%mm0, %%mm2 \n\t"
821 "movq %%mm1, %%mm3 \n\t"
830 __asm__ volatile(SFENCE:::"memory");
831 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand each 5-bit field to a byte by left shift. */
833 register uint16_t bgr;
835 *d++ = (bgr&0x1F)<<3;
836 *d++ = (bgr&0x3E0)>>2;
837 *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> packed 24-bit BGR: same structure as rgb15tobgr24 above but
 * with mask16b/mask16g/mask16r and green/red shift counts 3/8 to match
 * the 6-bit green field. Scalar tail expands 5-6-5 fields to bytes.
 * NOTE(review): heavily sampled extract — asm statement boundaries and
 * scalar-tail loop header are not visible; left byte-identical.
 */
841 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
844 const uint16_t *mm_end;
845 uint8_t *d = (uint8_t *)dst;
846 const uint16_t *s = (const uint16_t *)src;
847 end = s + src_size/2;
848 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
852 PREFETCH" 32(%1) \n\t"
853 "movq (%1), %%mm0 \n\t"
854 "movq (%1), %%mm1 \n\t"
855 "movq (%1), %%mm2 \n\t"
856 "pand %2, %%mm0 \n\t"
857 "pand %3, %%mm1 \n\t"
858 "pand %4, %%mm2 \n\t"
859 "psllq $3, %%mm0 \n\t"
860 "psrlq $3, %%mm1 \n\t"
861 "psrlq $8, %%mm2 \n\t"
862 "movq %%mm0, %%mm3 \n\t"
863 "movq %%mm1, %%mm4 \n\t"
864 "movq %%mm2, %%mm5 \n\t"
865 "punpcklwd %5, %%mm0 \n\t"
866 "punpcklwd %5, %%mm1 \n\t"
867 "punpcklwd %5, %%mm2 \n\t"
868 "punpckhwd %5, %%mm3 \n\t"
869 "punpckhwd %5, %%mm4 \n\t"
870 "punpckhwd %5, %%mm5 \n\t"
871 "psllq $8, %%mm1 \n\t"
872 "psllq $16, %%mm2 \n\t"
873 "por %%mm1, %%mm0 \n\t"
874 "por %%mm2, %%mm0 \n\t"
875 "psllq $8, %%mm4 \n\t"
876 "psllq $16, %%mm5 \n\t"
877 "por %%mm4, %%mm3 \n\t"
878 "por %%mm5, %%mm3 \n\t"
880 "movq %%mm0, %%mm6 \n\t"
881 "movq %%mm3, %%mm7 \n\t"
883 "movq 8(%1), %%mm0 \n\t"
884 "movq 8(%1), %%mm1 \n\t"
885 "movq 8(%1), %%mm2 \n\t"
886 "pand %2, %%mm0 \n\t"
887 "pand %3, %%mm1 \n\t"
888 "pand %4, %%mm2 \n\t"
889 "psllq $3, %%mm0 \n\t"
890 "psrlq $3, %%mm1 \n\t"
891 "psrlq $8, %%mm2 \n\t"
892 "movq %%mm0, %%mm3 \n\t"
893 "movq %%mm1, %%mm4 \n\t"
894 "movq %%mm2, %%mm5 \n\t"
895 "punpcklwd %5, %%mm0 \n\t"
896 "punpcklwd %5, %%mm1 \n\t"
897 "punpcklwd %5, %%mm2 \n\t"
898 "punpckhwd %5, %%mm3 \n\t"
899 "punpckhwd %5, %%mm4 \n\t"
900 "punpckhwd %5, %%mm5 \n\t"
901 "psllq $8, %%mm1 \n\t"
902 "psllq $16, %%mm2 \n\t"
903 "por %%mm1, %%mm0 \n\t"
904 "por %%mm2, %%mm0 \n\t"
905 "psllq $8, %%mm4 \n\t"
906 "psllq $16, %%mm5 \n\t"
907 "por %%mm4, %%mm3 \n\t"
908 "por %%mm5, %%mm3 \n\t"
910 :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
912 /* borrowed 32 to 24 */
914 "movq %%mm0, %%mm4 \n\t"
915 "movq %%mm3, %%mm5 \n\t"
916 "movq %%mm6, %%mm0 \n\t"
917 "movq %%mm7, %%mm1 \n\t"
919 "movq %%mm4, %%mm6 \n\t"
920 "movq %%mm5, %%mm7 \n\t"
921 "movq %%mm0, %%mm2 \n\t"
922 "movq %%mm1, %%mm3 \n\t"
931 __asm__ volatile(SFENCE:::"memory");
932 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand 5-6-5 fields to full bytes by left shift. */
934 register uint16_t bgr;
936 *d++ = (bgr&0x1F)<<3;
937 *d++ = (bgr&0x7E0)>>3;
938 *d++ = (bgr&0xF800)>>8;
/*
 * NOTE(review): fragment of a pack-to-32-bit helper macro (its #define
 * line is not visible in this extract — presumably PACK_RGB32; confirm
 * against the full file). Given B/G/R in the word lanes of mm0/mm1/mm2,
 * all-ones in mm6 (alpha) and zero in mm7, it interleaves them into two
 * quadwords of xRGB pixels and stores with MOVNTQ.
 */
943 * mm0 = 00 B3 00 B2 00 B1 00 B0
944 * mm1 = 00 G3 00 G2 00 G1 00 G0
945 * mm2 = 00 R3 00 R2 00 R1 00 R0
946 * mm6 = FF FF FF FF FF FF FF FF
947 * mm7 = 00 00 00 00 00 00 00 00
950 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
951 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
952 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
953 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
954 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
955 "movq %%mm0, %%mm3 \n\t" \
956 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
957 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
958 MOVNTQ" %%mm0, (%0) \n\t" \
959 MOVNTQ" %%mm3, 8(%0) \n\t" \
/*
 * RGB555 -> 32-bit: mm7 is zeroed and mm6 set to all-ones (alpha), the
 * B/G/R 5-bit fields are isolated with mask15b/mask15g/mask15r and
 * shifted to byte position; the pack-to-32 macro above then interleaves
 * and stores them.
 * NOTE(review): partial extract — loop header, macro invocation and
 * scalar-tail loop are not visible; left byte-identical.
 */
961 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
964 const uint16_t *mm_end;
966 const uint16_t *s = (const uint16_t *)src;
967 end = s + src_size/2;
968 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
969 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
970 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
974 PREFETCH" 32(%1) \n\t"
975 "movq (%1), %%mm0 \n\t"
976 "movq (%1), %%mm1 \n\t"
977 "movq (%1), %%mm2 \n\t"
978 "pand %2, %%mm0 \n\t"
979 "pand %3, %%mm1 \n\t"
980 "pand %4, %%mm2 \n\t"
981 "psllq $3, %%mm0 \n\t"
982 "psrlq $2, %%mm1 \n\t"
983 "psrlq $7, %%mm2 \n\t"
985 ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
990 __asm__ volatile(SFENCE:::"memory");
991 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand each 5-bit field to a byte by left shift. */
993 register uint16_t bgr;
995 *d++ = (bgr&0x1F)<<3;
996 *d++ = (bgr&0x3E0)>>2;
997 *d++ = (bgr&0x7C00)>>7;
/*
 * RGB565 -> 32-bit: identical structure to rgb15to32 but with 565 masks
 * and green/red shifts of 3/8 for the 6-bit green field.
 * NOTE(review): partial extract — loop body tail, macro invocation and
 * scalar-tail loop header are not visible; left byte-identical.
 */
1002 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
1004 const uint16_t *end;
1005 const uint16_t *mm_end;
1007 const uint16_t *s = (const uint16_t*)src;
1008 end = s + src_size/2;
1009 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1010 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1011 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1013 while (s < mm_end) {
1015 PREFETCH" 32(%1) \n\t"
1016 "movq (%1), %%mm0 \n\t"
1017 "movq (%1), %%mm1 \n\t"
1018 "movq (%1), %%mm2 \n\t"
1019 "pand %2, %%mm0 \n\t"
1020 "pand %3, %%mm1 \n\t"
1021 "pand %4, %%mm2 \n\t"
1022 "psllq $3, %%mm0 \n\t"
1023 "psrlq $3, %%mm1 \n\t"
1024 "psrlq $8, %%mm2 \n\t"
1026 ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1031 __asm__ volatile(SFENCE:::"memory");
1032 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand 5-6-5 fields to full bytes. */
1034 register uint16_t bgr;
1036 *d++ = (bgr&0x1F)<<3;
1037 *d++ = (bgr&0x7E0)>>3;
1038 *d++ = (bgr&0xF800)>>8;
/*
 * Byte shuffle 2,1,0,3 per 32-bit pixel (swaps R and B, keeps G and A).
 * The negative-index trick (idx = 15 - src_size, pointers biased by
 * -idx) lets one register count up to zero. MMXEXT path uses pshufw
 * ($177 = 0b10110001 swaps adjacent words) plus masking; the plain-MMX
 * path emulates it with pslld/psrld by 16 and mask32b/mask32r. The C
 * tail at the bottom does the same swap 4 bytes at a time.
 * NOTE(review): partial extract — #else/#endif of the pshufw branch and
 * the loop label lines are not visible; left byte-identical.
 */
1043 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1045 x86_reg idx = 15 - src_size;
1046 const uint8_t *s = src-idx;
1047 uint8_t *d = dst-idx;
1051 PREFETCH" (%1, %0) \n\t"
1052 "movq %3, %%mm7 \n\t"
1053 "pxor %4, %%mm7 \n\t"
1054 "movq %%mm7, %%mm6 \n\t"
1055 "pxor %5, %%mm7 \n\t"
1058 PREFETCH" 32(%1, %0) \n\t"
1059 "movq (%1, %0), %%mm0 \n\t"
1060 "movq 8(%1, %0), %%mm1 \n\t"
1061 # if COMPILE_TEMPLATE_MMXEXT
1062 "pshufw $177, %%mm0, %%mm3 \n\t"
1063 "pshufw $177, %%mm1, %%mm5 \n\t"
1064 "pand %%mm7, %%mm0 \n\t"
1065 "pand %%mm6, %%mm3 \n\t"
1066 "pand %%mm7, %%mm1 \n\t"
1067 "pand %%mm6, %%mm5 \n\t"
1068 "por %%mm3, %%mm0 \n\t"
1069 "por %%mm5, %%mm1 \n\t"
1071 "movq %%mm0, %%mm2 \n\t"
1072 "movq %%mm1, %%mm4 \n\t"
1073 "pand %%mm7, %%mm0 \n\t"
1074 "pand %%mm6, %%mm2 \n\t"
1075 "pand %%mm7, %%mm1 \n\t"
1076 "pand %%mm6, %%mm4 \n\t"
1077 "movq %%mm2, %%mm3 \n\t"
1078 "movq %%mm4, %%mm5 \n\t"
1079 "pslld $16, %%mm2 \n\t"
1080 "psrld $16, %%mm3 \n\t"
1081 "pslld $16, %%mm4 \n\t"
1082 "psrld $16, %%mm5 \n\t"
1083 "por %%mm2, %%mm0 \n\t"
1084 "por %%mm4, %%mm1 \n\t"
1085 "por %%mm3, %%mm0 \n\t"
1086 "por %%mm5, %%mm1 \n\t"
1088 MOVNTQ" %%mm0, (%2, %0) \n\t"
1089 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1096 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* C tail: swap the low and high bytes of each dword, keep the middle. */
1098 for (; idx<15; idx+=4) {
1099 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1101 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * In-order 24-bit R<->B swap: processes 24 bytes (8 pixels) per
 * iteration entirely inside one asm loop (REG_a counts up from the
 * biased negative index, as in shuffle_bytes_2103). mask24r/g/b select
 * which channel survives from each of three overlapping loads at
 * byte offsets 0/2, 6/8/10, 14/16/18 so the channels come out swapped.
 * Falls through to a C loop for the tail; mmx_size==23 means src_size
 * was a multiple of the asm granularity and nothing remains.
 * NOTE(review): partial extract — jump labels, output operands and tail
 * setup lines are only partly visible; left byte-identical.
 */
1105 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1108 x86_reg mmx_size= 23 - src_size;
1110 "test %%"REG_a", %%"REG_a" \n\t"
1112 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1113 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1114 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1117 PREFETCH" 32(%1, %%"REG_a") \n\t"
1118 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1119 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1120 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1121 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1122 "pand %%mm5, %%mm0 \n\t"
1123 "pand %%mm6, %%mm1 \n\t"
1124 "pand %%mm7, %%mm2 \n\t"
1125 "por %%mm0, %%mm1 \n\t"
1126 "por %%mm2, %%mm1 \n\t"
1127 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1128 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1129 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1130 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1131 "pand %%mm7, %%mm0 \n\t"
1132 "pand %%mm5, %%mm1 \n\t"
1133 "pand %%mm6, %%mm2 \n\t"
1134 "por %%mm0, %%mm1 \n\t"
1135 "por %%mm2, %%mm1 \n\t"
1136 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1137 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1138 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1139 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1140 "pand %%mm6, %%mm0 \n\t"
1141 "pand %%mm7, %%mm1 \n\t"
1142 "pand %%mm5, %%mm2 \n\t"
1143 "por %%mm0, %%mm1 \n\t"
1144 "por %%mm2, %%mm1 \n\t"
1145 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1146 "add $24, %%"REG_a" \n\t"
1150 : "r" (src-mmx_size), "r"(dst-mmx_size)
1153 __asm__ volatile(SFENCE:::"memory");
1154 __asm__ volatile(EMMS:::"memory");
1156 if (mmx_size==23) return; //finished, was multiple of 8
/* Scalar tail: copy with byte 0 and byte 2 of each pixel exchanged. */
1160 src_size= 23-mmx_size;
1163 for (i=0; i<src_size; i+=3) {
1166 dst[i + 1] = src[i + 1];
1167 dst[i + 2] = src[i + 0];
/*
 * Interleave planar Y/U/V into packed YUY2 (Y0 U0 Y1 V0 ...), one output
 * row per input luma row. Per 8 chroma samples: U and V are interleaved
 * with punpcklbw/punpckhbw, then merged under 16 luma bytes and stored
 * as 32 output bytes with MOVNTQ. The chroma pointers advance only every
 * vertLumPerChroma rows (2 for 4:2:0 callers, 1 for 4:2:2).
 * NOTE(review): partial extract — asm open/jump labels, ysrc/dst stride
 * advances and the trailing SFENCE/EMMS are not visible here.
 */
1172 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1173 int width, int height,
1174 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1177 const x86_reg chromWidth= width>>1;
1178 for (y=0; y<height; y++) {
1179 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1181 "xor %%"REG_a", %%"REG_a" \n\t"
1184 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1185 PREFETCH" 32(%2, %%"REG_a") \n\t"
1186 PREFETCH" 32(%3, %%"REG_a") \n\t"
1187 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1188 "movq %%mm0, %%mm2 \n\t" // U(0)
1189 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1190 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1191 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1193 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1194 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1195 "movq %%mm3, %%mm4 \n\t" // Y(0)
1196 "movq %%mm5, %%mm6 \n\t" // Y(8)
1197 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1198 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1199 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1200 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1202 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1203 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1204 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1205 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1207 "add $8, %%"REG_a" \n\t"
1208 "cmp %4, %%"REG_a" \n\t"
1210 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Advance chroma only once per vertLumPerChroma luma rows (power of 2). */
1213 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1214 usrc += chromStride;
1215 vsrc += chromStride;
1226 * Height should be a multiple of 2 and width should be a multiple of 16.
1227 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * YV12 (4:2:0 planar) -> YUY2: thin wrapper that forwards to
 * yuvPlanartoyuy2 with vertLumPerChroma = 2 (chroma reused for two
 * luma rows — no vertical interpolation, per the FIXME).
 */
1229 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1230 int width, int height,
1231 int lumStride, int chromStride, int dstStride)
1233 //FIXME interpolate chroma
1234 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave planar Y/U/V into packed UYVY (U0 Y0 V0 Y1 ...). Same
 * structure as yuvPlanartoyuy2 above, but the punpck operands are
 * reversed so the chroma bytes land first in each output pair.
 * NOTE(review): partial extract — asm open/jump labels, stride advances
 * and trailing SFENCE/EMMS are not visible here.
 */
1237 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1238 int width, int height,
1239 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1242 const x86_reg chromWidth= width>>1;
1243 for (y=0; y<height; y++) {
1244 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1246 "xor %%"REG_a", %%"REG_a" \n\t"
1249 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1250 PREFETCH" 32(%2, %%"REG_a") \n\t"
1251 PREFETCH" 32(%3, %%"REG_a") \n\t"
1252 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1253 "movq %%mm0, %%mm2 \n\t" // U(0)
1254 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1255 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1256 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1258 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1259 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1260 "movq %%mm0, %%mm4 \n\t" // Y(0)
1261 "movq %%mm2, %%mm6 \n\t" // Y(8)
1262 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1263 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1264 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1265 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1267 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1268 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1269 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1270 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1272 "add $8, %%"REG_a" \n\t"
1273 "cmp %4, %%"REG_a" \n\t"
1275 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Advance chroma only once per vertLumPerChroma luma rows (power of 2). */
1278 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1279 usrc += chromStride;
1280 vsrc += chromStride;
1291 * Height should be a multiple of 2 and width should be a multiple of 16
1292 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * YV12 (4:2:0 planar) -> UYVY: wrapper forwarding to yuvPlanartouyvy
 * with vertLumPerChroma = 2 (chroma row reused for two luma rows).
 */
1294 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1295 int width, int height,
1296 int lumStride, int chromStride, int dstStride)
1298 //FIXME interpolate chroma
1299 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1303 * Width should be a multiple of 16.
/*
 * 4:2:2 planar -> UYVY: wrapper forwarding to yuvPlanartouyvy with
 * vertLumPerChroma = 1 (one chroma row per luma row).
 */
1305 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1306 int width, int height,
1307 int lumStride, int chromStride, int dstStride)
1309 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1313 * Width should be a multiple of 16.
/**
 * Convert planar 4:2:2 YUV to packed YUY2.
 * Delegates to yuvPlanartoyuy2 with vertLumPerChroma = 1 (one chroma line
 * per luma line).
 * Width should be a multiple of 16 (see comment above).
 */
1315 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1316 int width, int height,
1317 int lumStride, int chromStride, int dstStride)
1319 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1323 * Height should be a multiple of 2 and width should be a multiple of 16.
1324 * (If this is a problem for anyone then tell me, and I will fix it.)
/**
 * Convert packed YUY2 (Y0 U0 Y1 V0 ...) to planar YV12.
 * Lines are processed in pairs (y += 2): the first line of each pair yields
 * both luma and chroma (mm7 = 0x00FF mask separates the byte lanes), the
 * second line yields luma only; udst/vdst advance once per pair, giving the
 * 4:2:0 vertical subsampling.  MOVNTQ does non-temporal stores.
 * NOTE(review): extraction dropped interior lines (braces, `__asm__ volatile(`
 * openers, loop labels and `jb 1b` branches); stray leading numbers are
 * extraction artifacts.
 */
1326 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1327 int width, int height,
1328 int lumStride, int chromStride, int srcStride)
1331 const x86_reg chromWidth= width>>1;
1332 for (y=0; y<height; y+=2) {
// --- first line of the pair: extract Y plus the U/V samples ---
1334 "xor %%"REG_a", %%"REG_a" \n\t"
1335 "pcmpeqw %%mm7, %%mm7 \n\t"
1336 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1339 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1340 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1341 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1342 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1343 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1344 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1345 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1346 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1347 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1348 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1349 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1351 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1353 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1354 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1355 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1356 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1357 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1358 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1359 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1360 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1361 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1362 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1364 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// split the interleaved UV words into separate U and V planes
1366 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1367 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1368 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1369 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1370 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1371 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1372 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1373 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1375 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1376 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1378 "add $8, %%"REG_a" \n\t"
1379 "cmp %4, %%"REG_a" \n\t"
1381 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1382 : "memory", "%"REG_a
// --- second line of the pair: luma only, chroma of this line is discarded ---
1389 "xor %%"REG_a", %%"REG_a" \n\t"
1392 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1393 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1394 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1395 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1396 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1397 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1398 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1399 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1400 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1401 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1402 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1404 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1405 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1407 "add $8, %%"REG_a" \n\t"
1408 "cmp %4, %%"REG_a" \n\t"
1411 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1412 : "memory", "%"REG_a
// chroma destinations advance once per *pair* of luma lines (4:2:0)
1414 udst += chromStride;
1415 vdst += chromStride;
// leave MMX state so following FPU code works
1419 __asm__ volatile(EMMS" \n\t"
1423 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1425 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
/**
 * Upscale one 8-bit plane by 2x in both directions using 1/4–3/4 linear
 * interpolation ((3*a + b) >> 2 weights, done with paired PAVGB in the MMX
 * path).  The first/last rows and the scalar tail past mmxSize are handled
 * in C; mmxSize is srcWidth rounded down to a multiple of 16.
 * NOTE(review): extraction dropped interior lines (braces, row-copy loops,
 * asm loop labels); stray leading numbers are extraction artifacts.
 */
1426 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
// first output row: horizontal-only interpolation of the first source row
1433 for (x=0; x<srcWidth-1; x++) {
1434 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1435 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1437 dst[2*srcWidth-1]= src[srcWidth-1];
1441 for (y=1; y<srcHeight; y++) {
1442 const x86_reg mmxSize= srcWidth&~15;
// prime mm4/mm5 with the left-edge pixels duplicated into the low byte
1444 "mov %4, %%"REG_a" \n\t"
1445 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1446 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1447 "movq %%mm4, %%mm2 \n\t"
1448 "psllq $8, %%mm4 \n\t"
1449 "pand %%mm0, %%mm2 \n\t"
1450 "por %%mm2, %%mm4 \n\t"
1451 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1452 "movq %%mm5, %%mm3 \n\t"
1453 "psllq $8, %%mm5 \n\t"
1454 "pand %%mm0, %%mm3 \n\t"
1455 "por %%mm3, %%mm5 \n\t"
1457 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1458 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1459 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1460 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
// two chained PAVGB = (3*a + b + rounding) / 4 weighting
1461 PAVGB" %%mm0, %%mm5 \n\t"
1462 PAVGB" %%mm0, %%mm3 \n\t"
1463 PAVGB" %%mm0, %%mm5 \n\t"
1464 PAVGB" %%mm0, %%mm3 \n\t"
1465 PAVGB" %%mm1, %%mm4 \n\t"
1466 PAVGB" %%mm1, %%mm2 \n\t"
1467 PAVGB" %%mm1, %%mm4 \n\t"
1468 PAVGB" %%mm1, %%mm2 \n\t"
1469 "movq %%mm5, %%mm7 \n\t"
1470 "movq %%mm4, %%mm6 \n\t"
1471 "punpcklbw %%mm3, %%mm5 \n\t"
1472 "punpckhbw %%mm3, %%mm7 \n\t"
1473 "punpcklbw %%mm2, %%mm4 \n\t"
1474 "punpckhbw %%mm2, %%mm6 \n\t"
1475 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1476 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1477 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1478 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1479 "add $8, %%"REG_a" \n\t"
1480 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1481 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// pointers pre-offset by mmxSize so REG_a can count up from -mmxSize
1483 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1484 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// scalar tail for the rightmost pixels not covered by the MMX loop
1489 for (x=mmxSize-1; x<srcWidth-1; x++) {
1490 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1491 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1492 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1493 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1495 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1496 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// last output row: horizontal-only interpolation of the last source row
1505 for (x=0; x<srcWidth-1; x++) {
1506 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1507 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1509 dst[2*srcWidth-1]= src[srcWidth-1];
1511 __asm__ volatile(EMMS" \n\t"
1515 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
1517 #if !COMPILE_TEMPLATE_AMD3DNOW
1519 * Height should be a multiple of 2 and width should be a multiple of 16.
1520 * (If this is a problem for anyone then tell me, and I will fix it.)
1521 * Chrominance data is only taken from every second line, others are ignored.
1522 * FIXME: Write HQ version.
/**
 * Convert packed UYVY (U0 Y0 V0 Y1 ...) to planar YV12.
 * Same structure as yuy2toyv12 above, but with the byte lanes swapped:
 * chroma sits in the even bytes (selected by the 0x00FF mask in mm7) and
 * luma in the odd bytes (extracted with psrlw $8).  Chroma is taken from
 * every second line only (see doc comment above).
 * NOTE(review): extraction dropped interior lines (braces, asm openers,
 * loop labels/branches); stray leading numbers are extraction artifacts.
 */
1524 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1525 int width, int height,
1526 int lumStride, int chromStride, int srcStride)
1529 const x86_reg chromWidth= width>>1;
1530 for (y=0; y<height; y+=2) {
// --- first line of the pair: extract Y plus U/V ---
1532 "xor %%"REG_a", %%"REG_a" \n\t"
1533 "pcmpeqw %%mm7, %%mm7 \n\t"
1534 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1537 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1538 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1539 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1540 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1541 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1542 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1543 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1544 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1545 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1546 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1547 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1549 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1551 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1552 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1553 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1554 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1555 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1556 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1557 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1558 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1559 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1560 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1562 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
// split interleaved UV words into the separate U and V planes
1564 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1565 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1566 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1567 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1568 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1569 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1570 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1571 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1573 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1574 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1576 "add $8, %%"REG_a" \n\t"
1577 "cmp %4, %%"REG_a" \n\t"
1579 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1580 : "memory", "%"REG_a
// --- second line of the pair: luma only (odd bytes), chroma discarded ---
1587 "xor %%"REG_a", %%"REG_a" \n\t"
1590 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1591 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1592 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1593 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1594 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1595 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1596 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1597 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1598 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1599 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1600 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1602 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1603 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1605 "add $8, %%"REG_a" \n\t"
1606 "cmp %4, %%"REG_a" \n\t"
1609 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1610 : "memory", "%"REG_a
// chroma destinations advance once per pair of luma lines (4:2:0)
1612 udst += chromStride;
1613 vdst += chromStride;
1617 __asm__ volatile(EMMS" \n\t"
1621 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1624 * Height should be a multiple of 2 and width should be a multiple of 2.
1625 * (If this is a problem for anyone then tell me, and I will fix it.)
1626 * Chrominance data is only taken from every second line,
1627 * others are ignored in the C version.
1628 * FIXME: Write HQ version.
/**
 * Convert packed 24-bit RGB/BGR (per the ff_bgr2* coefficient tables) to
 * planar YV12.  Per line pair: the luma pass runs twice (i = 0, 1) computing
 * 8 Y values per iteration via pmaddwd with ff_bgr2YCoeff; the chroma pass
 * averages a 2x2 pixel block (PAVGB on MMXEXT/3DNow, explicit adds + shift
 * otherwise) before applying ff_bgr2UCoeff/ff_bgr2VCoeff.  Any lines past
 * the height handled here are finished by the C fallback (rgb24toyv12_c).
 * NOTE(review): extraction dropped interior lines (braces, asm openers,
 * loop labels, pointer-stride advances); stray leading numbers are
 * extraction artifacts.
 */
1630 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1631 int width, int height,
1632 int lumStride, int chromStride, int srcStride)
1635 const x86_reg chromWidth= width>>1;
1636 for (y=0; y<height-2; y+=2) {
// --- luma pass: run for both lines of the pair ---
1638 for (i=0; i<2; i++) {
1640 "mov %2, %%"REG_a" \n\t"
1641 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1642 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
// REG_d = 3 * REG_a: byte offset into the 3-bytes-per-pixel source
1644 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1647 PREFETCH" 64(%0, %%"REG_d") \n\t"
1648 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1649 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1653 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660 #ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
// second group of 4 pixels of this iteration
1673 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1674 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1678 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685 #ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "add $24, %%"REG_d" \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
// combine 8 luma values, add the Y offset, store non-temporally
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1702 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1703 "add $8, %%"REG_a" \n\t"
// negative-index trick: pointers pre-offset by width, REG_a counts up from -width
1705 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1706 : "%"REG_a, "%"REG_d
// --- chroma pass: one U and one V value per 2x2 source block ---
1713 "mov %4, %%"REG_a" \n\t"
1714 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1715 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1716 "pxor %%mm7, %%mm7 \n\t"
// REG_d = 6 * REG_a: two 3-byte pixels per chroma sample
1717 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1718 "add %%"REG_d", %%"REG_d" \n\t"
1721 PREFETCH" 64(%0, %%"REG_d") \n\t"
1722 PREFETCH" 64(%1, %%"REG_d") \n\t"
1723 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
// fast path: average the 2x2 block with PAVGB
1724 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1725 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1726 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1727 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1728 PAVGB" %%mm1, %%mm0 \n\t"
1729 PAVGB" %%mm3, %%mm2 \n\t"
1730 "movq %%mm0, %%mm1 \n\t"
1731 "movq %%mm2, %%mm3 \n\t"
1732 "psrlq $24, %%mm0 \n\t"
1733 "psrlq $24, %%mm2 \n\t"
1734 PAVGB" %%mm1, %%mm0 \n\t"
1735 PAVGB" %%mm3, %%mm2 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm2 \n\t"
// plain-MMX path: widen to words, add the four pixels, shift by 2
1739 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1740 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1741 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1742 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1743 "punpcklbw %%mm7, %%mm0 \n\t"
1744 "punpcklbw %%mm7, %%mm1 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1746 "punpcklbw %%mm7, %%mm3 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm3, %%mm2 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1750 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1751 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1752 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1753 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1754 "punpcklbw %%mm7, %%mm4 \n\t"
1755 "punpcklbw %%mm7, %%mm1 \n\t"
1756 "punpcklbw %%mm7, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm3 \n\t"
1758 "paddw %%mm1, %%mm4 \n\t"
1759 "paddw %%mm3, %%mm2 \n\t"
1760 "paddw %%mm4, %%mm2 \n\t"
1761 "psrlw $2, %%mm0 \n\t"
1762 "psrlw $2, %%mm2 \n\t"
// apply U (mm6) and V (ff_bgr2VCoeff) coefficients to the averaged pixels
1764 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1765 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1767 "pmaddwd %%mm0, %%mm1 \n\t"
1768 "pmaddwd %%mm2, %%mm3 \n\t"
1769 "pmaddwd %%mm6, %%mm0 \n\t"
1770 "pmaddwd %%mm6, %%mm2 \n\t"
1771 #ifndef FAST_BGR2YV12
1772 "psrad $8, %%mm0 \n\t"
1773 "psrad $8, %%mm1 \n\t"
1774 "psrad $8, %%mm2 \n\t"
1775 "psrad $8, %%mm3 \n\t"
1777 "packssdw %%mm2, %%mm0 \n\t"
1778 "packssdw %%mm3, %%mm1 \n\t"
1779 "pmaddwd %%mm5, %%mm0 \n\t"
1780 "pmaddwd %%mm5, %%mm1 \n\t"
1781 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1782 "psraw $7, %%mm0 \n\t"
1784 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
// second pair of chroma samples, PAVGB fast path
1785 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1786 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1787 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1788 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1789 PAVGB" %%mm1, %%mm4 \n\t"
1790 PAVGB" %%mm3, %%mm2 \n\t"
1791 "movq %%mm4, %%mm1 \n\t"
1792 "movq %%mm2, %%mm3 \n\t"
1793 "psrlq $24, %%mm4 \n\t"
1794 "psrlq $24, %%mm2 \n\t"
1795 PAVGB" %%mm1, %%mm4 \n\t"
1796 PAVGB" %%mm3, %%mm2 \n\t"
1797 "punpcklbw %%mm7, %%mm4 \n\t"
1798 "punpcklbw %%mm7, %%mm2 \n\t"
// second pair of chroma samples, plain-MMX path
1800 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1801 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1802 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1803 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1804 "punpcklbw %%mm7, %%mm4 \n\t"
1805 "punpcklbw %%mm7, %%mm1 \n\t"
1806 "punpcklbw %%mm7, %%mm2 \n\t"
1807 "punpcklbw %%mm7, %%mm3 \n\t"
1808 "paddw %%mm1, %%mm4 \n\t"
1809 "paddw %%mm3, %%mm2 \n\t"
1810 "paddw %%mm2, %%mm4 \n\t"
1811 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1812 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1813 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1814 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1815 "punpcklbw %%mm7, %%mm5 \n\t"
1816 "punpcklbw %%mm7, %%mm1 \n\t"
1817 "punpcklbw %%mm7, %%mm2 \n\t"
1818 "punpcklbw %%mm7, %%mm3 \n\t"
1819 "paddw %%mm1, %%mm5 \n\t"
1820 "paddw %%mm3, %%mm2 \n\t"
1821 "paddw %%mm5, %%mm2 \n\t"
1822 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1823 "psrlw $2, %%mm4 \n\t"
1824 "psrlw $2, %%mm2 \n\t"
1826 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1827 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1829 "pmaddwd %%mm4, %%mm1 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "pmaddwd %%mm6, %%mm4 \n\t"
1832 "pmaddwd %%mm6, %%mm2 \n\t"
1833 #ifndef FAST_BGR2YV12
1834 "psrad $8, %%mm4 \n\t"
1835 "psrad $8, %%mm1 \n\t"
1836 "psrad $8, %%mm2 \n\t"
1837 "psrad $8, %%mm3 \n\t"
1839 "packssdw %%mm2, %%mm4 \n\t"
1840 "packssdw %%mm3, %%mm1 \n\t"
1841 "pmaddwd %%mm5, %%mm4 \n\t"
1842 "pmaddwd %%mm5, %%mm1 \n\t"
1843 "add $24, %%"REG_d" \n\t"
1844 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1845 "psraw $7, %%mm4 \n\t"
// regroup U and V samples, add UV offset, store 4 bytes to each plane
1847 "movq %%mm0, %%mm1 \n\t"
1848 "punpckldq %%mm4, %%mm0 \n\t"
1849 "punpckhdq %%mm4, %%mm1 \n\t"
1850 "packsswb %%mm1, %%mm0 \n\t"
1851 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1852 "movd %%mm0, (%2, %%"REG_a") \n\t"
1853 "punpckhdq %%mm0, %%mm0 \n\t"
1854 "movd %%mm0, (%3, %%"REG_a") \n\t"
1855 "add $4, %%"REG_a" \n\t"
1857 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1858 : "%"REG_a, "%"REG_d
1861 udst += chromStride;
1862 vdst += chromStride;
1866 __asm__ volatile(EMMS" \n\t"
// C fallback finishes the last line(s) not handled by the MMX loop
1870 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1872 #endif /* !COMPILE_TEMPLATE_SSE2 */
1874 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
/**
 * Byte-interleave two planes into one: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * row by row.  The SIMD loop (SSE2 movdqa/movntdq or MMX punpck/MOVNTQ)
 * covers width rounded down to a multiple of 16; the scalar loop finishes
 * the remaining bytes.
 * NOTE(review): extraction dropped interior lines (braces, loop labels,
 * per-row pointer advances); stray leading numbers are extraction artifacts.
 * NOTE(review): the SSE2 path loads %1 twice into xmm0/xmm1 and never loads
 * a second 16-byte group — presumably intentional given punpckl/hbw of the
 * same source; confirm against the upstream file before touching.
 */
1875 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1876 int width, int height, int src1Stride,
1877 int src2Stride, int dstStride)
1881 for (h=0; h < height; h++) {
1884 #if COMPILE_TEMPLATE_SSE2
1886 "xor %%"REG_a", %%"REG_a" \n\t"
1888 PREFETCH" 64(%1, %%"REG_a") \n\t"
1889 PREFETCH" 64(%2, %%"REG_a") \n\t"
1890 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1891 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1892 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1893 "punpcklbw %%xmm2, %%xmm0 \n\t"
1894 "punpckhbw %%xmm2, %%xmm1 \n\t"
1895 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1896 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1897 "add $16, %%"REG_a" \n\t"
1898 "cmp %3, %%"REG_a" \n\t"
1900 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1901 : "memory", "%"REG_a""
// MMX fallback: same interleave, 16 bytes per iteration via two 8-byte groups
1905 "xor %%"REG_a", %%"REG_a" \n\t"
1907 PREFETCH" 64(%1, %%"REG_a") \n\t"
1908 PREFETCH" 64(%2, %%"REG_a") \n\t"
1909 "movq (%1, %%"REG_a"), %%mm0 \n\t"
1910 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1911 "movq %%mm0, %%mm1 \n\t"
1912 "movq %%mm2, %%mm3 \n\t"
1913 "movq (%2, %%"REG_a"), %%mm4 \n\t"
1914 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1915 "punpcklbw %%mm4, %%mm0 \n\t"
1916 "punpckhbw %%mm4, %%mm1 \n\t"
1917 "punpcklbw %%mm5, %%mm2 \n\t"
1918 "punpckhbw %%mm5, %%mm3 \n\t"
1919 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1920 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1921 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1922 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1923 "add $16, %%"REG_a" \n\t"
1924 "cmp %3, %%"REG_a" \n\t"
1926 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1927 : "memory", "%"REG_a
// scalar tail for width % 16 bytes
1930 for (w= (width&(~15)); w < width; w++) {
1931 dest[2*w+0] = src1[w];
1932 dest[2*w+1] = src2[w];
1946 #if !COMPILE_TEMPLATE_SSE2
1947 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Upsample two quarter-resolution chroma planes to half resolution:
 * each source byte is duplicated horizontally (punpcklbw/punpckhbw with
 * itself) and each source line is reused for two output lines
 * (srcStride*(y>>1)).  The two planes are processed by two identical loops;
 * a scalar loop handles the tail past the last multiple of 32.
 * NOTE(review): extraction dropped interior lines (braces, the y loops'
 * headers, x initialisation); stray leading numbers are extraction artifacts.
 */
1948 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1949 uint8_t *dst1, uint8_t *dst2,
1950 int width, int height,
1951 int srcStride1, int srcStride2,
1952 int dstStride1, int dstStride2)
1956 w=width/2; h=height/2;
1960 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// --- first plane ---
1962 const uint8_t* s1=src1+srcStride1*(y>>1);
1963 uint8_t* d=dst1+dstStride1*y;
1965 for (;x<w-31;x+=32) {
1967 PREFETCH" 32(%1,%2) \n\t"
1968 "movq (%1,%2), %%mm0 \n\t"
1969 "movq 8(%1,%2), %%mm2 \n\t"
1970 "movq 16(%1,%2), %%mm4 \n\t"
1971 "movq 24(%1,%2), %%mm6 \n\t"
1972 "movq %%mm0, %%mm1 \n\t"
1973 "movq %%mm2, %%mm3 \n\t"
1974 "movq %%mm4, %%mm5 \n\t"
1975 "movq %%mm6, %%mm7 \n\t"
// unpacking a register with itself duplicates every byte
1976 "punpcklbw %%mm0, %%mm0 \n\t"
1977 "punpckhbw %%mm1, %%mm1 \n\t"
1978 "punpcklbw %%mm2, %%mm2 \n\t"
1979 "punpckhbw %%mm3, %%mm3 \n\t"
1980 "punpcklbw %%mm4, %%mm4 \n\t"
1981 "punpckhbw %%mm5, %%mm5 \n\t"
1982 "punpcklbw %%mm6, %%mm6 \n\t"
1983 "punpckhbw %%mm7, %%mm7 \n\t"
1984 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1985 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1986 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1987 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1988 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1989 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1990 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1991 MOVNTQ" %%mm7, 56(%0,%2,2)"
1992 :: "r"(d), "r"(s1), "r"(x)
1995 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// --- second plane: identical processing with src2/dst2 ---
1998 const uint8_t* s2=src2+srcStride2*(y>>1);
1999 uint8_t* d=dst2+dstStride2*y;
2001 for (;x<w-31;x+=32) {
2003 PREFETCH" 32(%1,%2) \n\t"
2004 "movq (%1,%2), %%mm0 \n\t"
2005 "movq 8(%1,%2), %%mm2 \n\t"
2006 "movq 16(%1,%2), %%mm4 \n\t"
2007 "movq 24(%1,%2), %%mm6 \n\t"
2008 "movq %%mm0, %%mm1 \n\t"
2009 "movq %%mm2, %%mm3 \n\t"
2010 "movq %%mm4, %%mm5 \n\t"
2011 "movq %%mm6, %%mm7 \n\t"
2012 "punpcklbw %%mm0, %%mm0 \n\t"
2013 "punpckhbw %%mm1, %%mm1 \n\t"
2014 "punpcklbw %%mm2, %%mm2 \n\t"
2015 "punpckhbw %%mm3, %%mm3 \n\t"
2016 "punpcklbw %%mm4, %%mm4 \n\t"
2017 "punpckhbw %%mm5, %%mm5 \n\t"
2018 "punpcklbw %%mm6, %%mm6 \n\t"
2019 "punpckhbw %%mm7, %%mm7 \n\t"
2020 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2021 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2022 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2023 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2024 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2025 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2026 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2027 MOVNTQ" %%mm7, 56(%0,%2,2)"
2028 :: "r"(d), "r"(s2), "r"(x)
2031 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/**
 * Pack planar YUV with 1/4-resolution chroma (one U/V line per 4 luma lines,
 * per the y>>2 indexing) into YUY2.  Per iteration: 32 luma bytes plus 8 U
 * and 8 V bytes are expanded — each chroma byte is duplicated (punpcklbw
 * with self) and interleaved with luma into 64 output bytes.  A scalar
 * loop (partially elided here) handles the tail.
 * NOTE(review): extraction dropped interior lines (braces, loop headers,
 * half of the scalar tail); stray leading numbers are extraction artifacts.
 */
2040 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2042 int width, int height,
2043 int srcStride1, int srcStride2,
2044 int srcStride3, int dstStride)
2048 w=width/2; h=height;
2050 const uint8_t* yp=src1+srcStride1*y;
2051 const uint8_t* up=src2+srcStride2*(y>>2);
2052 const uint8_t* vp=src3+srcStride3*(y>>2);
2053 uint8_t* d=dst+dstStride*y;
2057 PREFETCH" 32(%1, %0) \n\t"
2058 PREFETCH" 32(%2, %0) \n\t"
2059 PREFETCH" 32(%3, %0) \n\t"
2060 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2061 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2062 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2063 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2064 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2065 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2066 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2067 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2068 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2069 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2071 "movq %%mm1, %%mm6 \n\t"
2072 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2073 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2074 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2075 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2076 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2078 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2079 "movq 8(%1, %0, 4), %%mm0 \n\t"
2080 "movq %%mm0, %%mm3 \n\t"
2081 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2082 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2083 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2084 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2086 "movq %%mm4, %%mm6 \n\t"
2087 "movq 16(%1, %0, 4), %%mm0 \n\t"
2088 "movq %%mm0, %%mm3 \n\t"
2089 "punpcklbw %%mm5, %%mm4 \n\t"
2090 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2091 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2092 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2093 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2095 "punpckhbw %%mm5, %%mm6 \n\t"
2096 "movq 24(%1, %0, 4), %%mm0 \n\t"
2097 "movq %%mm0, %%mm3 \n\t"
2098 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2099 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2100 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2101 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2104 : "r"(yp), "r" (up), "r"(vp), "r"(d)
// scalar tail: write the luma bytes of the packed output (chroma lines elided)
2108 const int x2 = x<<2;
2111 d[8*x+2] = yp[x2+1];
2113 d[8*x+4] = yp[x2+2];
2115 d[8*x+6] = yp[x2+3];
2125 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/**
 * Copy every second byte from src to dst: dst[i] = src[2*i], for `count`
 * output bytes.  The MMX loop masks the even byte lanes with mm7 (0x00FF)
 * and packs them; the negative offsets (-30, -22, ...) pair with the
 * negative-index loop counter (setup elided by extraction).  A scalar tail
 * finishes the last bytes.
 * NOTE(review): extraction dropped braces, the count adjustment, loop label
 * and tail-loop header; stray leading numbers are extraction artifacts.
 */
2127 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2136 "pcmpeqw %%mm7, %%mm7 \n\t"
2137 "psrlw $8, %%mm7 \n\t"
2139 "movq -30(%1, %0, 2), %%mm0 \n\t"
2140 "movq -22(%1, %0, 2), %%mm1 \n\t"
2141 "movq -14(%1, %0, 2), %%mm2 \n\t"
2142 "movq -6(%1, %0, 2), %%mm3 \n\t"
2143 "pand %%mm7, %%mm0 \n\t"
2144 "pand %%mm7, %%mm1 \n\t"
2145 "pand %%mm7, %%mm2 \n\t"
2146 "pand %%mm7, %%mm3 \n\t"
2147 "packuswb %%mm1, %%mm0 \n\t"
2148 "packuswb %%mm3, %%mm2 \n\t"
2149 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2150 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2154 : "r"(src), "r"(dst)
2159 dst[count]= src[2*count];
2164 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * De-interleave the even bytes of a 4-byte-grouped stream into two planes:
 * dst0[i] = src[4*i], dst1[i] = src[4*i+2] (e.g. Y and V/U extraction from
 * packed 4:2:2).  First pass masks even lanes, second pass splits the two
 * resulting byte positions via shift + mask.  Scalar tail at the end.
 * NOTE(review): extraction dropped braces, count setup, loop labels and the
 * tail-loop header; stray leading numbers are extraction artifacts.
 */
2165 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2174 "pcmpeqw %%mm7, %%mm7 \n\t"
2175 "psrlw $8, %%mm7 \n\t"
2177 "movq -28(%1, %0, 4), %%mm0 \n\t"
2178 "movq -20(%1, %0, 4), %%mm1 \n\t"
2179 "movq -12(%1, %0, 4), %%mm2 \n\t"
2180 "movq -4(%1, %0, 4), %%mm3 \n\t"
2181 "pand %%mm7, %%mm0 \n\t"
2182 "pand %%mm7, %%mm1 \n\t"
2183 "pand %%mm7, %%mm2 \n\t"
2184 "pand %%mm7, %%mm3 \n\t"
2185 "packuswb %%mm1, %%mm0 \n\t"
2186 "packuswb %%mm3, %%mm2 \n\t"
2187 "movq %%mm0, %%mm1 \n\t"
2188 "movq %%mm2, %%mm3 \n\t"
2189 "psrlw $8, %%mm0 \n\t"
2190 "psrlw $8, %%mm2 \n\t"
2191 "pand %%mm7, %%mm1 \n\t"
2192 "pand %%mm7, %%mm3 \n\t"
2193 "packuswb %%mm2, %%mm0 \n\t"
2194 "packuswb %%mm3, %%mm1 \n\t"
2195 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2196 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2200 : "r"(src), "r"(dst0), "r"(dst1)
2205 dst0[count]= src[4*count+0];
2206 dst1[count]= src[4*count+2];
2210 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/**
 * Like extract_even2, but averages two source lines first:
 * dst0[i] = avg(src0[4i], src1[4i]), dst1[i] = avg(src0[4i+2], src1[4i+2]).
 * The MMX path uses PAVGB (rounding average); the scalar tail uses a plain
 * (a+b)>>1 — presumably the off-by-one rounding difference was accepted.
 * NOTE(review): extraction dropped braces, count setup, loop labels and the
 * tail-loop header; stray leading numbers are extraction artifacts.
 */
2212 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2223 "pcmpeqw %%mm7, %%mm7 \n\t"
2224 "psrlw $8, %%mm7 \n\t"
2226 "movq -28(%1, %0, 4), %%mm0 \n\t"
2227 "movq -20(%1, %0, 4), %%mm1 \n\t"
2228 "movq -12(%1, %0, 4), %%mm2 \n\t"
2229 "movq -4(%1, %0, 4), %%mm3 \n\t"
2230 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2231 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2232 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2233 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2234 "pand %%mm7, %%mm0 \n\t"
2235 "pand %%mm7, %%mm1 \n\t"
2236 "pand %%mm7, %%mm2 \n\t"
2237 "pand %%mm7, %%mm3 \n\t"
2238 "packuswb %%mm1, %%mm0 \n\t"
2239 "packuswb %%mm3, %%mm2 \n\t"
2240 "movq %%mm0, %%mm1 \n\t"
2241 "movq %%mm2, %%mm3 \n\t"
2242 "psrlw $8, %%mm0 \n\t"
2243 "psrlw $8, %%mm2 \n\t"
2244 "pand %%mm7, %%mm1 \n\t"
2245 "pand %%mm7, %%mm3 \n\t"
2246 "packuswb %%mm2, %%mm0 \n\t"
2247 "packuswb %%mm3, %%mm1 \n\t"
2248 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2249 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2253 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2259 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2260 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2265 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * De-interleave the odd bytes of a 4-byte-grouped stream into two planes;
 * the initial psrlw $8 selects the odd lanes (vs. the pand in extract_even2).
 * NOTE(review): the scalar tail reads src[4*count+0]/src[4*count+2] — the
 * same (even) indices as extract_even2 — while the MMX path extracts odd
 * lanes; confirm against the upstream file whether the tail should read
 * +1/+3 (possible elided `src++` before the tail).
 * Extraction dropped braces, count setup, loop labels and the tail header;
 * stray leading numbers are extraction artifacts.
 */
2266 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2275 "pcmpeqw %%mm7, %%mm7 \n\t"
2276 "psrlw $8, %%mm7 \n\t"
2278 "movq -28(%1, %0, 4), %%mm0 \n\t"
2279 "movq -20(%1, %0, 4), %%mm1 \n\t"
2280 "movq -12(%1, %0, 4), %%mm2 \n\t"
2281 "movq -4(%1, %0, 4), %%mm3 \n\t"
2282 "psrlw $8, %%mm0 \n\t"
2283 "psrlw $8, %%mm1 \n\t"
2284 "psrlw $8, %%mm2 \n\t"
2285 "psrlw $8, %%mm3 \n\t"
2286 "packuswb %%mm1, %%mm0 \n\t"
2287 "packuswb %%mm3, %%mm2 \n\t"
2288 "movq %%mm0, %%mm1 \n\t"
2289 "movq %%mm2, %%mm3 \n\t"
2290 "psrlw $8, %%mm0 \n\t"
2291 "psrlw $8, %%mm2 \n\t"
2292 "pand %%mm7, %%mm1 \n\t"
2293 "pand %%mm7, %%mm3 \n\t"
2294 "packuswb %%mm2, %%mm0 \n\t"
2295 "packuswb %%mm3, %%mm1 \n\t"
2296 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2297 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2301 : "r"(src), "r"(dst0), "r"(dst1)
2307 dst0[count]= src[4*count+0];
2308 dst1[count]= src[4*count+2];
2312 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/**
 * Like extract_odd2, but averages two source lines first with PAVGB before
 * extracting the odd byte lanes into dst0/dst1.
 * NOTE(review): as in extract_odd2, the scalar tail reads even indices
 * (+0/+2) while the MMX path selects odd lanes — presumably an elided
 * `src0++/src1++` precedes the tail; confirm against the upstream file.
 * Extraction dropped braces, count setup, loop labels and the tail header;
 * stray leading numbers are extraction artifacts.
 */
2314 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2325 "pcmpeqw %%mm7, %%mm7 \n\t"
2326 "psrlw $8, %%mm7 \n\t"
2328 "movq -28(%1, %0, 4), %%mm0 \n\t"
2329 "movq -20(%1, %0, 4), %%mm1 \n\t"
2330 "movq -12(%1, %0, 4), %%mm2 \n\t"
2331 "movq -4(%1, %0, 4), %%mm3 \n\t"
2332 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2333 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2334 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2335 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2336 "psrlw $8, %%mm0 \n\t"
2337 "psrlw $8, %%mm1 \n\t"
2338 "psrlw $8, %%mm2 \n\t"
2339 "psrlw $8, %%mm3 \n\t"
2340 "packuswb %%mm1, %%mm0 \n\t"
2341 "packuswb %%mm3, %%mm2 \n\t"
2342 "movq %%mm0, %%mm1 \n\t"
2343 "movq %%mm2, %%mm3 \n\t"
2344 "psrlw $8, %%mm0 \n\t"
2345 "psrlw $8, %%mm2 \n\t"
2346 "pand %%mm7, %%mm1 \n\t"
2347 "pand %%mm7, %%mm3 \n\t"
2348 "packuswb %%mm2, %%mm0 \n\t"
2349 "packuswb %%mm3, %%mm1 \n\t"
2350 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2351 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2355 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2363 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2364 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/**
 * Convert packed YUYV to planar 4:2:0: per line, luma = even bytes
 * (extract_even); chroma comes from extract_odd2avg averaging the current
 * line with the previous one (src-srcStride).  chromWidth = ceil(width/2)
 * via the -((-width)>>1) round-up idiom.
 * NOTE(review): extraction dropped braces, the per-line pointer/stride
 * advances and the chroma-line condition; stray leading numbers are
 * extraction artifacts.
 */
2369 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2370 int width, int height,
2371 int lumStride, int chromStride, int srcStride)
2374 const int chromWidth= -((-width)>>1);
2376 for (y=0; y<height; y++) {
2377 RENAME(extract_even)(src, ydst, width);
2379 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2394 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Convert packed YUYV to planar 4:2:2: per line, luma = even bytes
 * (extract_even), chroma = extract_odd2 on the same line (no vertical
 * subsampling).  chromWidth rounds width/2 up.
 * NOTE(review): extraction dropped braces and the per-line pointer/stride
 * advances; stray leading numbers are extraction artifacts.
 */
2395 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2396 int width, int height,
2397 int lumStride, int chromStride, int srcStride)
2400 const int chromWidth= -((-width)>>1);
2402 for (y=0; y<height; y++) {
2403 RENAME(extract_even)(src, ydst, width);
2404 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2417 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/**
 * Convert packed UYVY to planar 4:2:0: luma sits in the odd bytes, hence
 * extract_even(src+1, ...); chroma = extract_even2avg averaging the current
 * line with the previous one (src-srcStride).
 * NOTE(review): extraction dropped braces, per-line advances and the
 * chroma-line condition; stray leading numbers are extraction artifacts.
 */
2419 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2420 int width, int height,
2421 int lumStride, int chromStride, int srcStride)
2424 const int chromWidth= -((-width)>>1);
2426 for (y=0; y<height; y++) {
2427 RENAME(extract_even)(src+1, ydst, width);
2429 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2444 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Convert packed UYVY to planar 4:2:2: luma from the odd bytes
 * (extract_even on src+1), chroma = extract_even2 on the same line
 * (no vertical subsampling).
 * NOTE(review): extraction dropped braces and the per-line pointer/stride
 * advances; stray leading numbers are extraction artifacts.
 */
2445 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2446 int width, int height,
2447 int lumStride, int chromStride, int srcStride)
2450 const int chromWidth= -((-width)>>1);
2452 for (y=0; y<height; y++) {
2453 RENAME(extract_even)(src+1, ydst, width);
2454 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2467 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2468 #endif /* !COMPILE_TEMPLATE_SSE2 */
2470 static av_cold void RENAME(rgb2rgb_init)(void)
2472 #if !COMPILE_TEMPLATE_SSE2
2473 #if !COMPILE_TEMPLATE_AMD3DNOW
2474 rgb15to16 = RENAME(rgb15to16);
2475 rgb15tobgr24 = RENAME(rgb15tobgr24);
2476 rgb15to32 = RENAME(rgb15to32);
2477 rgb16tobgr24 = RENAME(rgb16tobgr24);
2478 rgb16to32 = RENAME(rgb16to32);
2479 rgb16to15 = RENAME(rgb16to15);
2480 rgb24tobgr16 = RENAME(rgb24tobgr16);
2481 rgb24tobgr15 = RENAME(rgb24tobgr15);
2482 rgb24tobgr32 = RENAME(rgb24tobgr32);
2483 rgb32to16 = RENAME(rgb32to16);
2484 rgb32to15 = RENAME(rgb32to15);
2485 rgb32tobgr24 = RENAME(rgb32tobgr24);
2486 rgb24to15 = RENAME(rgb24to15);
2487 rgb24to16 = RENAME(rgb24to16);
2488 rgb24tobgr24 = RENAME(rgb24tobgr24);
2489 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2490 rgb32tobgr16 = RENAME(rgb32tobgr16);
2491 rgb32tobgr15 = RENAME(rgb32tobgr15);
2492 yv12toyuy2 = RENAME(yv12toyuy2);
2493 yv12touyvy = RENAME(yv12touyvy);
2494 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2495 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2496 yuy2toyv12 = RENAME(yuy2toyv12);
2497 vu9_to_vu12 = RENAME(vu9_to_vu12);
2498 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2499 uyvytoyuv422 = RENAME(uyvytoyuv422);
2500 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2501 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2503 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2504 planar2x = RENAME(planar2x);
2505 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2506 rgb24toyv12 = RENAME(rgb24toyv12);
2508 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2509 uyvytoyuv420 = RENAME(uyvytoyuv420);
2510 #endif /* !COMPILE_TEMPLATE_SSE2 */
2512 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2513 interleaveBytes = RENAME(interleaveBytes);
2514 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */