2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of Libav.
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* Select CPU-specific instruction spellings for this template instantiation.
 * Each template build (3DNOW / MMXEXT / plain MMX) gets its own PREFETCH,
 * MOVNTQ and SFENCE so the same asm bodies compile for every target. */
35 #if COMPILE_TEMPLATE_AMD3DNOW
36 #define PREFETCH "prefetch"
/* pavgusb is the 3DNOW packed-byte-average instruction. */
37 #define PAVGB "pavgusb"
38 #elif COMPILE_TEMPLATE_MMXEXT
39 #define PREFETCH "prefetchnta"
/* Plain MMX has no prefetch instruction: emit an asm comment instead. */
42 #define PREFETCH " # nop"
45 #if COMPILE_TEMPLATE_AMD3DNOW
46 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
52 #if COMPILE_TEMPLATE_MMXEXT
/* Non-temporal store + store fence are available from MMXEXT on. */
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
/* With plain movq stores no fence is needed: emit an asm comment. */
57 #define SFENCE " # nop"
60 #if !COMPILE_TEMPLATE_SSE2
62 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Expand packed 24-bit pixels to 32-bit: four 3-byte pixels are gathered
 * at byte offsets 0/3, 6/9, 12/15, 18/21 and OR-ed with the constant
 * filler byte from mask32a (kept in mm7) before being streamed out as
 * four 4-byte pixels per register.
 */
64 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
67 const uint8_t *s = src;
69 const uint8_t *mm_end;
/* Warm the cache for the first source bytes. */
71 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = filler/alpha mask, OR-ed into every output pixel below. */
73 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
76 PREFETCH" 32(%1) \n\t"
/* Two 3-byte source pixels per MMX register, one per 32-bit lane. */
77 "movd (%1), %%mm0 \n\t"
78 "punpckldq 3(%1), %%mm0 \n\t"
79 "movd 6(%1), %%mm1 \n\t"
80 "punpckldq 9(%1), %%mm1 \n\t"
81 "movd 12(%1), %%mm2 \n\t"
82 "punpckldq 15(%1), %%mm2 \n\t"
83 "movd 18(%1), %%mm3 \n\t"
84 "punpckldq 21(%1), %%mm3 \n\t"
/* Force the filler byte of each 32-bit lane. */
85 "por %%mm7, %%mm0 \n\t"
86 "por %%mm7, %%mm1 \n\t"
87 "por %%mm7, %%mm2 \n\t"
88 "por %%mm7, %%mm3 \n\t"
89 MOVNTQ" %%mm0, (%0) \n\t"
90 MOVNTQ" %%mm1, 8(%0) \n\t"
91 MOVNTQ" %%mm2, 16(%0) \n\t"
92 MOVNTQ" %%mm3, 24(%0)"
/* Flush the non-temporal stores, then reset the FPU/MMX state. */
98 __asm__ volatile(SFENCE:::"memory");
99 __asm__ volatile(EMMS:::"memory");
/*
 * STORE_BGR24_MMX: repack four MMX registers holding 32-bit pixels
 * (mm0/mm1/mm4/mm5 plus the shifted copies mm2/mm3/mm6/mm7) into three
 * quadwords of packed 24-bit pixels and stream them to (%0) via MOVNTQ.
 * The mask24l/mask24h/mask24hh/mask24hhh/mask24hhhh constants drop the
 * filler byte and splice neighbouring pixels across register borders.
 */
108 #define STORE_BGR24_MMX \
109 "psrlq $8, %%mm2 \n\t" \
110 "psrlq $8, %%mm3 \n\t" \
111 "psrlq $8, %%mm6 \n\t" \
112 "psrlq $8, %%mm7 \n\t" \
113 "pand "MANGLE(mask24l)", %%mm0\n\t" \
114 "pand "MANGLE(mask24l)", %%mm1\n\t" \
115 "pand "MANGLE(mask24l)", %%mm4\n\t" \
116 "pand "MANGLE(mask24l)", %%mm5\n\t" \
117 "pand "MANGLE(mask24h)", %%mm2\n\t" \
118 "pand "MANGLE(mask24h)", %%mm3\n\t" \
119 "pand "MANGLE(mask24h)", %%mm6\n\t" \
120 "pand "MANGLE(mask24h)", %%mm7\n\t" \
121 "por %%mm2, %%mm0 \n\t" \
122 "por %%mm3, %%mm1 \n\t" \
123 "por %%mm6, %%mm4 \n\t" \
124 "por %%mm7, %%mm5 \n\t" \
126 "movq %%mm1, %%mm2 \n\t" \
127 "movq %%mm4, %%mm3 \n\t" \
128 "psllq $48, %%mm2 \n\t" \
129 "psllq $32, %%mm3 \n\t" \
130 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
131 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
132 "por %%mm2, %%mm0 \n\t" \
133 "psrlq $16, %%mm1 \n\t" \
134 "psrlq $32, %%mm4 \n\t" \
135 "psllq $16, %%mm5 \n\t" \
136 "por %%mm3, %%mm1 \n\t" \
137 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
138 "por %%mm5, %%mm4 \n\t" \
140 MOVNTQ" %%mm0, (%0) \n\t" \
141 MOVNTQ" %%mm1, 8(%0) \n\t" \
142 MOVNTQ" %%mm4, 16(%0)"
/*
 * Pack 32-bit pixels down to 24-bit: eight source pixels are loaded as
 * four quadwords, each duplicated into a second register so the
 * repacking step has a low copy and a (later byte-shifted) high copy
 * to mask and merge.
 */
145 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
148 const uint8_t *s = src;
150 const uint8_t *mm_end;
152 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
156 PREFETCH" 32(%1) \n\t"
/* Load 8 pixels (32 bytes) and duplicate each quadword. */
157 "movq (%1), %%mm0 \n\t"
158 "movq 8(%1), %%mm1 \n\t"
159 "movq 16(%1), %%mm4 \n\t"
160 "movq 24(%1), %%mm5 \n\t"
161 "movq %%mm0, %%mm2 \n\t"
162 "movq %%mm1, %%mm3 \n\t"
163 "movq %%mm4, %%mm6 \n\t"
164 "movq %%mm5, %%mm7 \n\t"
/* Flush the non-temporal stores, then reset the FPU/MMX state. */
171 __asm__ volatile(SFENCE:::"memory");
172 __asm__ volatile(EMMS:::"memory");
182 original by Strepto/Astral
183 ported to gcc & bugfixed: A'rpi
184 MMX2, 3DNOW optimization by Nick Kurshev
185 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * RGB555 -> RGB565: blue stays in place while the green and red fields
 * move up one bit.  Uses the and+add trick: adding (x & mask15s) to x
 * shifts exactly the masked upper bits left by one.
 */
187 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
189 register const uint8_t* s=src;
190 register uint8_t* d=dst;
191 register const uint8_t *end;
192 const uint8_t *mm_end;
194 __asm__ volatile(PREFETCH" %0"::"m"(*s));
/* mm4 = mask15s, selects the bits that must shift up by one. */
195 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
199 PREFETCH" 32(%1) \n\t"
200 "movq (%1), %%mm0 \n\t"
201 "movq 8(%1), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t"
204 "pand %%mm4, %%mm0 \n\t"
205 "pand %%mm4, %%mm2 \n\t"
/* x + (x & mask) == masked bits doubled, i.e. shifted left by one. */
206 "paddw %%mm1, %%mm0 \n\t"
207 "paddw %%mm3, %%mm2 \n\t"
208 MOVNTQ" %%mm0, (%0) \n\t"
209 MOVNTQ" %%mm2, 8(%0)"
215 __asm__ volatile(SFENCE:::"memory");
216 __asm__ volatile(EMMS:::"memory");
/* Scalar tails: same and+add trick on 32-bit pairs, then single pixels. */
219 register unsigned x= *((const uint32_t *)s);
220 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
225 register unsigned short x= *((const uint16_t *)s);
226 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * RGB565 -> RGB555: the red and green fields move down one bit
 * (discarding the green LSB) while blue stays in place.
 */
230 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
232 register const uint8_t* s=src;
233 register uint8_t* d=dst;
234 register const uint8_t *end;
235 const uint8_t *mm_end;
237 __asm__ volatile(PREFETCH" %0"::"m"(*s));
/* mm7 = red+green mask after the >>1; mm6 = blue mask (kept as-is). */
238 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
239 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
243 PREFETCH" 32(%1) \n\t"
244 "movq (%1), %%mm0 \n\t"
245 "movq 8(%1), %%mm2 \n\t"
246 "movq %%mm0, %%mm1 \n\t"
247 "movq %%mm2, %%mm3 \n\t"
/* Shift R+G down one bit, then recombine with the untouched blue. */
248 "psrlq $1, %%mm0 \n\t"
249 "psrlq $1, %%mm2 \n\t"
250 "pand %%mm7, %%mm0 \n\t"
251 "pand %%mm7, %%mm2 \n\t"
252 "pand %%mm6, %%mm1 \n\t"
253 "pand %%mm6, %%mm3 \n\t"
254 "por %%mm1, %%mm0 \n\t"
255 "por %%mm3, %%mm2 \n\t"
256 MOVNTQ" %%mm0, (%0) \n\t"
257 MOVNTQ" %%mm2, 8(%0)"
263 __asm__ volatile(SFENCE:::"memory");
264 __asm__ volatile(EMMS:::"memory");
/* Scalar tails: two pixels at a time, then single pixels. */
267 register uint32_t x= *((const uint32_t*)s);
268 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
273 register uint16_t x= *((const uint16_t*)s);
274 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * Pack 32-bit pixels into RGB565.  The MMX path isolates the blue/red
 * bytes (mask3216br) and scales them into position with one pmaddwd by
 * mul3216, while green is masked separately (mask3216g); the final
 * psrld/pslld pair aligns the two 16-bit results per dword.
 */
278 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
280 const uint8_t *s = src;
282 const uint8_t *mm_end;
283 uint16_t *d = (uint16_t *)dst;
/* mm5 = green mask, mm6 = blue/red mask, mm7 = pmaddwd multipliers. */
287 "movq %3, %%mm5 \n\t"
288 "movq %4, %%mm6 \n\t"
289 "movq %5, %%mm7 \n\t"
293 PREFETCH" 32(%1) \n\t"
294 "movd (%1), %%mm0 \n\t"
295 "movd 4(%1), %%mm3 \n\t"
296 "punpckldq 8(%1), %%mm0 \n\t"
297 "punpckldq 12(%1), %%mm3 \n\t"
298 "movq %%mm0, %%mm1 \n\t"
299 "movq %%mm3, %%mm4 \n\t"
300 "pand %%mm6, %%mm0 \n\t"
301 "pand %%mm6, %%mm3 \n\t"
/* One multiply-add merges the scaled B and R fields of each pixel. */
302 "pmaddwd %%mm7, %%mm0 \n\t"
303 "pmaddwd %%mm7, %%mm3 \n\t"
304 "pand %%mm5, %%mm1 \n\t"
305 "pand %%mm5, %%mm4 \n\t"
306 "por %%mm1, %%mm0 \n\t"
307 "por %%mm4, %%mm3 \n\t"
308 "psrld $5, %%mm0 \n\t"
309 "pslld $11, %%mm3 \n\t"
310 "por %%mm3, %%mm0 \n\t"
311 MOVNTQ" %%mm0, (%0) \n\t"
318 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
320 __asm__ volatile(SFENCE:::"memory");
321 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: pack 8:8:8 into 5:6:5 by shifts. */
323 register int rgb = *(const uint32_t*)s; s += 4;
324 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * Pack 32-bit pixels into 16-bit 5:6:5 with red and blue swapped
 * relative to rgb32to16.  Each channel is shifted into place and
 * masked with red_16mask / green_16mask / blue_16mask, then the two
 * 16-bit results per dword are merged.
 */
328 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
330 const uint8_t *s = src;
332 const uint8_t *mm_end;
333 uint16_t *d = (uint16_t *)dst;
335 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
337 "movq %0, %%mm7 \n\t"
338 "movq %1, %%mm6 \n\t"
339 ::"m"(red_16mask),"m"(green_16mask));
343 PREFETCH" 32(%1) \n\t"
344 "movd (%1), %%mm0 \n\t"
345 "movd 4(%1), %%mm3 \n\t"
346 "punpckldq 8(%1), %%mm0 \n\t"
347 "punpckldq 12(%1), %%mm3 \n\t"
/* Three copies per pixel pair: one per output channel. */
348 "movq %%mm0, %%mm1 \n\t"
349 "movq %%mm0, %%mm2 \n\t"
350 "movq %%mm3, %%mm4 \n\t"
351 "movq %%mm3, %%mm5 \n\t"
352 "psllq $8, %%mm0 \n\t"
353 "psllq $8, %%mm3 \n\t"
354 "pand %%mm7, %%mm0 \n\t"
355 "pand %%mm7, %%mm3 \n\t"
356 "psrlq $5, %%mm1 \n\t"
357 "psrlq $5, %%mm4 \n\t"
358 "pand %%mm6, %%mm1 \n\t"
359 "pand %%mm6, %%mm4 \n\t"
360 "psrlq $19, %%mm2 \n\t"
361 "psrlq $19, %%mm5 \n\t"
362 "pand %2, %%mm2 \n\t"
363 "pand %2, %%mm5 \n\t"
364 "por %%mm1, %%mm0 \n\t"
365 "por %%mm4, %%mm3 \n\t"
366 "por %%mm2, %%mm0 \n\t"
367 "por %%mm5, %%mm3 \n\t"
/* Pack the two 16-bit pixels of the high pair above the low pair. */
368 "psllq $16, %%mm3 \n\t"
369 "por %%mm3, %%mm0 \n\t"
370 MOVNTQ" %%mm0, (%0) \n\t"
371 :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
375 __asm__ volatile(SFENCE:::"memory");
376 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: note the swapped shifts vs. rgb32to16 (R<->B). */
378 register int rgb = *(const uint32_t*)s; s += 4;
379 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * Pack 32-bit pixels into RGB555 (1:5:5:5).  Same pmaddwd scheme as
 * rgb32to16 but with the 15-bit constants (mask3215g, mul3215) and
 * shift counts 6/10 instead of 5/11.
 */
383 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
385 const uint8_t *s = src;
387 const uint8_t *mm_end;
388 uint16_t *d = (uint16_t *)dst;
/* mm5 = green mask, mm6 = blue/red mask, mm7 = pmaddwd multipliers. */
392 "movq %3, %%mm5 \n\t"
393 "movq %4, %%mm6 \n\t"
394 "movq %5, %%mm7 \n\t"
398 PREFETCH" 32(%1) \n\t"
399 "movd (%1), %%mm0 \n\t"
400 "movd 4(%1), %%mm3 \n\t"
401 "punpckldq 8(%1), %%mm0 \n\t"
402 "punpckldq 12(%1), %%mm3 \n\t"
403 "movq %%mm0, %%mm1 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "pand %%mm6, %%mm0 \n\t"
406 "pand %%mm6, %%mm3 \n\t"
407 "pmaddwd %%mm7, %%mm0 \n\t"
408 "pmaddwd %%mm7, %%mm3 \n\t"
409 "pand %%mm5, %%mm1 \n\t"
410 "pand %%mm5, %%mm4 \n\t"
411 "por %%mm1, %%mm0 \n\t"
412 "por %%mm4, %%mm3 \n\t"
413 "psrld $6, %%mm0 \n\t"
414 "pslld $10, %%mm3 \n\t"
415 "por %%mm3, %%mm0 \n\t"
416 MOVNTQ" %%mm0, (%0) \n\t"
423 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
425 __asm__ volatile(SFENCE:::"memory");
426 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: pack 8:8:8 into 5:5:5 by shifts. */
428 register int rgb = *(const uint32_t*)s; s += 4;
429 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * Pack 32-bit pixels into 15-bit 5:5:5 with red and blue swapped,
 * using the 15-bit field masks (red_15mask/green_15mask/blue_15mask)
 * and shift counts 7/6/19.
 */
433 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
435 const uint8_t *s = src;
437 const uint8_t *mm_end;
438 uint16_t *d = (uint16_t *)dst;
440 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
442 "movq %0, %%mm7 \n\t"
443 "movq %1, %%mm6 \n\t"
444 ::"m"(red_15mask),"m"(green_15mask));
448 PREFETCH" 32(%1) \n\t"
449 "movd (%1), %%mm0 \n\t"
450 "movd 4(%1), %%mm3 \n\t"
451 "punpckldq 8(%1), %%mm0 \n\t"
452 "punpckldq 12(%1), %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
457 "psllq $7, %%mm0 \n\t"
458 "psllq $7, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $6, %%mm1 \n\t"
462 "psrlq $6, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
/* Pack the two 16-bit pixels of the high pair above the low pair. */
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
475 MOVNTQ" %%mm0, (%0) \n\t"
476 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
480 __asm__ volatile(SFENCE:::"memory");
481 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: note the swapped shifts (R<->B) vs. rgb32to15. */
483 register int rgb = *(const uint32_t*)s; s += 4;
484 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * Pack 24-bit pixels into 16-bit 5:6:5 with red and blue swapped.
 * Source pixels are 3 bytes apart, hence the movd/punpckldq gathers at
 * offsets 0/3 and 6/9.
 */
488 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
490 const uint8_t *s = src;
492 const uint8_t *mm_end;
493 uint16_t *d = (uint16_t *)dst;
495 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
497 "movq %0, %%mm7 \n\t"
498 "movq %1, %%mm6 \n\t"
499 ::"m"(red_16mask),"m"(green_16mask));
503 PREFETCH" 32(%1) \n\t"
/* Gather four 3-byte pixels into two registers. */
504 "movd (%1), %%mm0 \n\t"
505 "movd 3(%1), %%mm3 \n\t"
506 "punpckldq 6(%1), %%mm0 \n\t"
507 "punpckldq 9(%1), %%mm3 \n\t"
508 "movq %%mm0, %%mm1 \n\t"
509 "movq %%mm0, %%mm2 \n\t"
510 "movq %%mm3, %%mm4 \n\t"
511 "movq %%mm3, %%mm5 \n\t"
512 "psrlq $3, %%mm0 \n\t"
513 "psrlq $3, %%mm3 \n\t"
514 "pand %2, %%mm0 \n\t"
515 "pand %2, %%mm3 \n\t"
516 "psrlq $5, %%mm1 \n\t"
517 "psrlq $5, %%mm4 \n\t"
518 "pand %%mm6, %%mm1 \n\t"
519 "pand %%mm6, %%mm4 \n\t"
520 "psrlq $8, %%mm2 \n\t"
521 "psrlq $8, %%mm5 \n\t"
522 "pand %%mm7, %%mm2 \n\t"
523 "pand %%mm7, %%mm5 \n\t"
524 "por %%mm1, %%mm0 \n\t"
525 "por %%mm4, %%mm3 \n\t"
526 "por %%mm2, %%mm0 \n\t"
527 "por %%mm5, %%mm3 \n\t"
528 "psllq $16, %%mm3 \n\t"
529 "por %%mm3, %%mm0 \n\t"
530 MOVNTQ" %%mm0, (%0) \n\t"
531 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
535 __asm__ volatile(SFENCE:::"memory");
536 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b/g/r are the per-pixel source bytes (loads elided). */
541 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Pack 24-bit pixels into 16-bit 5:6:5 (no channel swap).  Same gather
 * pattern as rgb24tobgr16 but with the psllq $8 / psrlq $19 shift set
 * that keeps red and blue in their original order.
 */
545 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
547 const uint8_t *s = src;
549 const uint8_t *mm_end;
550 uint16_t *d = (uint16_t *)dst;
552 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
554 "movq %0, %%mm7 \n\t"
555 "movq %1, %%mm6 \n\t"
556 ::"m"(red_16mask),"m"(green_16mask));
560 PREFETCH" 32(%1) \n\t"
561 "movd (%1), %%mm0 \n\t"
562 "movd 3(%1), %%mm3 \n\t"
563 "punpckldq 6(%1), %%mm0 \n\t"
564 "punpckldq 9(%1), %%mm3 \n\t"
565 "movq %%mm0, %%mm1 \n\t"
566 "movq %%mm0, %%mm2 \n\t"
567 "movq %%mm3, %%mm4 \n\t"
568 "movq %%mm3, %%mm5 \n\t"
569 "psllq $8, %%mm0 \n\t"
570 "psllq $8, %%mm3 \n\t"
571 "pand %%mm7, %%mm0 \n\t"
572 "pand %%mm7, %%mm3 \n\t"
573 "psrlq $5, %%mm1 \n\t"
574 "psrlq $5, %%mm4 \n\t"
575 "pand %%mm6, %%mm1 \n\t"
576 "pand %%mm6, %%mm4 \n\t"
577 "psrlq $19, %%mm2 \n\t"
578 "psrlq $19, %%mm5 \n\t"
579 "pand %2, %%mm2 \n\t"
580 "pand %2, %%mm5 \n\t"
581 "por %%mm1, %%mm0 \n\t"
582 "por %%mm4, %%mm3 \n\t"
583 "por %%mm2, %%mm0 \n\t"
584 "por %%mm5, %%mm3 \n\t"
585 "psllq $16, %%mm3 \n\t"
586 "por %%mm3, %%mm0 \n\t"
587 MOVNTQ" %%mm0, (%0) \n\t"
588 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
592 __asm__ volatile(SFENCE:::"memory");
593 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b/g/r are the per-pixel source bytes (loads elided). */
598 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Pack 24-bit pixels into 15-bit 5:5:5 with red and blue swapped,
 * using the 15-bit field masks and shift counts 3/6/9.
 */
602 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
604 const uint8_t *s = src;
606 const uint8_t *mm_end;
607 uint16_t *d = (uint16_t *)dst;
609 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
617 PREFETCH" 32(%1) \n\t"
618 "movd (%1), %%mm0 \n\t"
619 "movd 3(%1), %%mm3 \n\t"
620 "punpckldq 6(%1), %%mm0 \n\t"
621 "punpckldq 9(%1), %%mm3 \n\t"
622 "movq %%mm0, %%mm1 \n\t"
623 "movq %%mm0, %%mm2 \n\t"
624 "movq %%mm3, %%mm4 \n\t"
625 "movq %%mm3, %%mm5 \n\t"
626 "psrlq $3, %%mm0 \n\t"
627 "psrlq $3, %%mm3 \n\t"
628 "pand %2, %%mm0 \n\t"
629 "pand %2, %%mm3 \n\t"
630 "psrlq $6, %%mm1 \n\t"
631 "psrlq $6, %%mm4 \n\t"
632 "pand %%mm6, %%mm1 \n\t"
633 "pand %%mm6, %%mm4 \n\t"
634 "psrlq $9, %%mm2 \n\t"
635 "psrlq $9, %%mm5 \n\t"
636 "pand %%mm7, %%mm2 \n\t"
637 "pand %%mm7, %%mm5 \n\t"
638 "por %%mm1, %%mm0 \n\t"
639 "por %%mm4, %%mm3 \n\t"
640 "por %%mm2, %%mm0 \n\t"
641 "por %%mm5, %%mm3 \n\t"
642 "psllq $16, %%mm3 \n\t"
643 "por %%mm3, %%mm0 \n\t"
644 MOVNTQ" %%mm0, (%0) \n\t"
645 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
649 __asm__ volatile(SFENCE:::"memory");
650 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b/g/r are the per-pixel source bytes (loads elided). */
655 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * Pack 24-bit pixels into 15-bit 5:5:5 (no channel swap), using the
 * 15-bit field masks and shift counts 7/6/19.
 */
659 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
661 const uint8_t *s = src;
663 const uint8_t *mm_end;
664 uint16_t *d = (uint16_t *)dst;
666 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 = red field mask, mm6 = green field mask (blue comes via %2). */
668 "movq %0, %%mm7 \n\t"
669 "movq %1, %%mm6 \n\t"
670 ::"m"(red_15mask),"m"(green_15mask));
674 PREFETCH" 32(%1) \n\t"
675 "movd (%1), %%mm0 \n\t"
676 "movd 3(%1), %%mm3 \n\t"
677 "punpckldq 6(%1), %%mm0 \n\t"
678 "punpckldq 9(%1), %%mm3 \n\t"
679 "movq %%mm0, %%mm1 \n\t"
680 "movq %%mm0, %%mm2 \n\t"
681 "movq %%mm3, %%mm4 \n\t"
682 "movq %%mm3, %%mm5 \n\t"
683 "psllq $7, %%mm0 \n\t"
684 "psllq $7, %%mm3 \n\t"
685 "pand %%mm7, %%mm0 \n\t"
686 "pand %%mm7, %%mm3 \n\t"
687 "psrlq $6, %%mm1 \n\t"
688 "psrlq $6, %%mm4 \n\t"
689 "pand %%mm6, %%mm1 \n\t"
690 "pand %%mm6, %%mm4 \n\t"
691 "psrlq $19, %%mm2 \n\t"
692 "psrlq $19, %%mm5 \n\t"
693 "pand %2, %%mm2 \n\t"
694 "pand %2, %%mm5 \n\t"
695 "por %%mm1, %%mm0 \n\t"
696 "por %%mm4, %%mm3 \n\t"
697 "por %%mm2, %%mm0 \n\t"
698 "por %%mm5, %%mm3 \n\t"
699 "psllq $16, %%mm3 \n\t"
700 "por %%mm3, %%mm0 \n\t"
701 MOVNTQ" %%mm0, (%0) \n\t"
702 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
706 __asm__ volatile(SFENCE:::"memory");
707 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: b/g/r are the per-pixel source bytes (loads elided). */
712 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
717 I use less accurate approximation here by simply left-shifting the input
718 value and filling the low order bits with zeroes. This method improves PNG
719 compression but this scheme cannot reproduce white exactly, since it does
720 not generate an all-ones maximum value; the net effect is to darken the
723 The better method should be "left bit replication":
733 | leftmost bits repeated to fill open bits
/*
 * Expand 15-bit 5:5:5 pixels to packed 24-bit output, swapping R and B.
 * Each field is isolated with mask15b/mask15g/mask15r, shifted into an
 * 8-bit position (low bits zero-filled, see the comment block above),
 * spread to 16-bit lanes with punpck{l,h}wd against mmx_null, and
 * merged byte-interleaved.  Two quadwords of source are processed per
 * iteration; mm6/mm7 park the first half while the second is built.
 */
737 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
740 const uint16_t *mm_end;
742 const uint16_t *s = (const uint16_t*)src;
743 end = s + src_size/2;
744 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
748 PREFETCH" 32(%1) \n\t"
/* Three copies of 4 source pixels: one per channel. */
749 "movq (%1), %%mm0 \n\t"
750 "movq (%1), %%mm1 \n\t"
751 "movq (%1), %%mm2 \n\t"
752 "pand %2, %%mm0 \n\t"
753 "pand %3, %%mm1 \n\t"
754 "pand %4, %%mm2 \n\t"
/* Scale each 5-bit field up to the top of an 8-bit byte. */
755 "psllq $3, %%mm0 \n\t"
756 "psrlq $2, %%mm1 \n\t"
757 "psrlq $7, %%mm2 \n\t"
758 "movq %%mm0, %%mm3 \n\t"
759 "movq %%mm1, %%mm4 \n\t"
760 "movq %%mm2, %%mm5 \n\t"
/* Widen to one 16-bit lane per pixel (mmx_null supplies zeros). */
761 "punpcklwd %5, %%mm0 \n\t"
762 "punpcklwd %5, %%mm1 \n\t"
763 "punpcklwd %5, %%mm2 \n\t"
764 "punpckhwd %5, %%mm3 \n\t"
765 "punpckhwd %5, %%mm4 \n\t"
766 "punpckhwd %5, %%mm5 \n\t"
767 "psllq $8, %%mm1 \n\t"
768 "psllq $16, %%mm2 \n\t"
769 "por %%mm1, %%mm0 \n\t"
770 "por %%mm2, %%mm0 \n\t"
771 "psllq $8, %%mm4 \n\t"
772 "psllq $16, %%mm5 \n\t"
773 "por %%mm4, %%mm3 \n\t"
774 "por %%mm5, %%mm3 \n\t"
/* Park the first four expanded pixels in mm6/mm7. */
776 "movq %%mm0, %%mm6 \n\t"
777 "movq %%mm3, %%mm7 \n\t"
/* Repeat for the next four source pixels at offset 8. */
779 "movq 8(%1), %%mm0 \n\t"
780 "movq 8(%1), %%mm1 \n\t"
781 "movq 8(%1), %%mm2 \n\t"
782 "pand %2, %%mm0 \n\t"
783 "pand %3, %%mm1 \n\t"
784 "pand %4, %%mm2 \n\t"
785 "psllq $3, %%mm0 \n\t"
786 "psrlq $2, %%mm1 \n\t"
787 "psrlq $7, %%mm2 \n\t"
788 "movq %%mm0, %%mm3 \n\t"
789 "movq %%mm1, %%mm4 \n\t"
790 "movq %%mm2, %%mm5 \n\t"
791 "punpcklwd %5, %%mm0 \n\t"
792 "punpcklwd %5, %%mm1 \n\t"
793 "punpcklwd %5, %%mm2 \n\t"
794 "punpckhwd %5, %%mm3 \n\t"
795 "punpckhwd %5, %%mm4 \n\t"
796 "punpckhwd %5, %%mm5 \n\t"
797 "psllq $8, %%mm1 \n\t"
798 "psllq $16, %%mm2 \n\t"
799 "por %%mm1, %%mm0 \n\t"
800 "por %%mm2, %%mm0 \n\t"
801 "psllq $8, %%mm4 \n\t"
802 "psllq $16, %%mm5 \n\t"
803 "por %%mm4, %%mm3 \n\t"
804 "por %%mm5, %%mm3 \n\t"
807 :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
809 /* borrowed 32 to 24 */
/* Shuffle the eight 32-bit pixels into the register layout the 32->24
   repacking sequence expects. */
811 "movq %%mm0, %%mm4 \n\t"
812 "movq %%mm3, %%mm5 \n\t"
813 "movq %%mm6, %%mm0 \n\t"
814 "movq %%mm7, %%mm1 \n\t"
816 "movq %%mm4, %%mm6 \n\t"
817 "movq %%mm5, %%mm7 \n\t"
818 "movq %%mm0, %%mm2 \n\t"
819 "movq %%mm1, %%mm3 \n\t"
828 __asm__ volatile(SFENCE:::"memory");
829 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:5:5 pixel to three bytes, low bits zero. */
831 register uint16_t bgr;
833 *d++ = (bgr&0x1F)<<3;
834 *d++ = (bgr&0x3E0)>>2;
835 *d++ = (bgr&0x7C00)>>7;
/*
 * Expand 16-bit 5:6:5 pixels to packed 24-bit output, swapping R and B.
 * Same structure as rgb15tobgr24 but with the 565 masks
 * (mask16b/mask16g/mask16r) and shift counts 3/3/8 to account for the
 * 6-bit green field.
 */
839 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
842 const uint16_t *mm_end;
843 uint8_t *d = (uint8_t *)dst;
844 const uint16_t *s = (const uint16_t *)src;
845 end = s + src_size/2;
846 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
850 PREFETCH" 32(%1) \n\t"
/* Three copies of 4 source pixels: one per channel. */
851 "movq (%1), %%mm0 \n\t"
852 "movq (%1), %%mm1 \n\t"
853 "movq (%1), %%mm2 \n\t"
854 "pand %2, %%mm0 \n\t"
855 "pand %3, %%mm1 \n\t"
856 "pand %4, %%mm2 \n\t"
857 "psllq $3, %%mm0 \n\t"
858 "psrlq $3, %%mm1 \n\t"
859 "psrlq $8, %%mm2 \n\t"
860 "movq %%mm0, %%mm3 \n\t"
861 "movq %%mm1, %%mm4 \n\t"
862 "movq %%mm2, %%mm5 \n\t"
/* Widen to one 16-bit lane per pixel (mmx_null supplies zeros). */
863 "punpcklwd %5, %%mm0 \n\t"
864 "punpcklwd %5, %%mm1 \n\t"
865 "punpcklwd %5, %%mm2 \n\t"
866 "punpckhwd %5, %%mm3 \n\t"
867 "punpckhwd %5, %%mm4 \n\t"
868 "punpckhwd %5, %%mm5 \n\t"
869 "psllq $8, %%mm1 \n\t"
870 "psllq $16, %%mm2 \n\t"
871 "por %%mm1, %%mm0 \n\t"
872 "por %%mm2, %%mm0 \n\t"
873 "psllq $8, %%mm4 \n\t"
874 "psllq $16, %%mm5 \n\t"
875 "por %%mm4, %%mm3 \n\t"
876 "por %%mm5, %%mm3 \n\t"
/* Park the first four expanded pixels in mm6/mm7. */
878 "movq %%mm0, %%mm6 \n\t"
879 "movq %%mm3, %%mm7 \n\t"
/* Repeat for the next four source pixels at offset 8. */
881 "movq 8(%1), %%mm0 \n\t"
882 "movq 8(%1), %%mm1 \n\t"
883 "movq 8(%1), %%mm2 \n\t"
884 "pand %2, %%mm0 \n\t"
885 "pand %3, %%mm1 \n\t"
886 "pand %4, %%mm2 \n\t"
887 "psllq $3, %%mm0 \n\t"
888 "psrlq $3, %%mm1 \n\t"
889 "psrlq $8, %%mm2 \n\t"
890 "movq %%mm0, %%mm3 \n\t"
891 "movq %%mm1, %%mm4 \n\t"
892 "movq %%mm2, %%mm5 \n\t"
893 "punpcklwd %5, %%mm0 \n\t"
894 "punpcklwd %5, %%mm1 \n\t"
895 "punpcklwd %5, %%mm2 \n\t"
896 "punpckhwd %5, %%mm3 \n\t"
897 "punpckhwd %5, %%mm4 \n\t"
898 "punpckhwd %5, %%mm5 \n\t"
899 "psllq $8, %%mm1 \n\t"
900 "psllq $16, %%mm2 \n\t"
901 "por %%mm1, %%mm0 \n\t"
902 "por %%mm2, %%mm0 \n\t"
903 "psllq $8, %%mm4 \n\t"
904 "psllq $16, %%mm5 \n\t"
905 "por %%mm4, %%mm3 \n\t"
906 "por %%mm5, %%mm3 \n\t"
908 :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
910 /* borrowed 32 to 24 */
/* Shuffle the eight 32-bit pixels into the register layout the 32->24
   repacking sequence expects. */
912 "movq %%mm0, %%mm4 \n\t"
913 "movq %%mm3, %%mm5 \n\t"
914 "movq %%mm6, %%mm0 \n\t"
915 "movq %%mm7, %%mm1 \n\t"
917 "movq %%mm4, %%mm6 \n\t"
918 "movq %%mm5, %%mm7 \n\t"
919 "movq %%mm0, %%mm2 \n\t"
920 "movq %%mm1, %%mm3 \n\t"
929 __asm__ volatile(SFENCE:::"memory");
930 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:6:5 pixel to three bytes, low bits zero. */
932 register uint16_t bgr;
934 *d++ = (bgr&0x1F)<<3;
935 *d++ = (bgr&0x7E0)>>3;
936 *d++ = (bgr&0xF800)>>8;
941 * mm0 = 00 B3 00 B2 00 B1 00 B0
942 * mm1 = 00 G3 00 G2 00 G1 00 G0
943 * mm2 = 00 R3 00 R2 00 R1 00 R0
944 * mm6 = FF FF FF FF FF FF FF FF
945 * mm7 = 00 00 00 00 00 00 00 00
948 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
949 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
950 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
951 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
952 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
953 "movq %%mm0, %%mm3 \n\t" \
954 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
955 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
956 MOVNTQ" %%mm0, (%0) \n\t" \
957 MOVNTQ" %%mm3, 8(%0) \n\t" \
/*
 * Expand 15-bit 5:5:5 pixels to 32-bit output.  mm7 is zeroed and mm6
 * set to all-ones (alpha fill) for the pack/unpack sequence; each
 * channel is isolated with mask15b/mask15g/mask15r and shifted into an
 * 8-bit position.
 */
959 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
962 const uint16_t *mm_end;
964 const uint16_t *s = (const uint16_t *)src;
965 end = s + src_size/2;
966 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = zeros, mm6 = all ones (used as the alpha byte). */
967 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
968 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
972 PREFETCH" 32(%1) \n\t"
/* Three copies of 4 source pixels: one per channel. */
973 "movq (%1), %%mm0 \n\t"
974 "movq (%1), %%mm1 \n\t"
975 "movq (%1), %%mm2 \n\t"
976 "pand %2, %%mm0 \n\t"
977 "pand %3, %%mm1 \n\t"
978 "pand %4, %%mm2 \n\t"
979 "psllq $3, %%mm0 \n\t"
980 "psrlq $2, %%mm1 \n\t"
981 "psrlq $7, %%mm2 \n\t"
983 ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
988 __asm__ volatile(SFENCE:::"memory");
989 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:5:5 pixel, low bits zero-filled. */
991 register uint16_t bgr;
993 *d++ = (bgr&0x1F)<<3;
994 *d++ = (bgr&0x3E0)>>2;
995 *d++ = (bgr&0x7C00)>>7;
/*
 * Expand 16-bit 5:6:5 pixels to 32-bit output.  Same structure as
 * rgb15to32 but with the 565 masks and the 3/3/8 shift set for the
 * 6-bit green field.
 */
1000 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
1002 const uint16_t *end;
1003 const uint16_t *mm_end;
1005 const uint16_t *s = (const uint16_t*)src;
1006 end = s + src_size/2;
1007 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = zeros, mm6 = all ones (used as the alpha byte). */
1008 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1009 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1011 while (s < mm_end) {
1013 PREFETCH" 32(%1) \n\t"
/* Three copies of 4 source pixels: one per channel. */
1014 "movq (%1), %%mm0 \n\t"
1015 "movq (%1), %%mm1 \n\t"
1016 "movq (%1), %%mm2 \n\t"
1017 "pand %2, %%mm0 \n\t"
1018 "pand %3, %%mm1 \n\t"
1019 "pand %4, %%mm2 \n\t"
1020 "psllq $3, %%mm0 \n\t"
1021 "psrlq $3, %%mm1 \n\t"
1022 "psrlq $8, %%mm2 \n\t"
1024 ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1029 __asm__ volatile(SFENCE:::"memory");
1030 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:6:5 pixel, low bits zero-filled. */
1032 register uint16_t bgr;
1034 *d++ = (bgr&0x1F)<<3;
1035 *d++ = (bgr&0x7E0)>>3;
1036 *d++ = (bgr&0xF800)>>8;
/*
 * Byte-shuffle each 32-bit word to order 2,1,0,3 — i.e. swap bytes 0
 * and 2 while leaving bytes 1 and 3 in place (B<->R swap for 32-bit
 * pixels).  idx starts negative and counts up to 0 so one register
 * serves as both index and loop counter.
 */
1041 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1043 x86_reg idx = 15 - src_size;
1044 const uint8_t *s = src-idx;
1045 uint8_t *d = dst-idx;
1049 PREFETCH" (%1, %0) \n\t"
/* Build mm7 (kept bytes) and mm6 (swapped bytes) from mask32b/mask32r. */
1050 "movq %3, %%mm7 \n\t"
1051 "pxor %4, %%mm7 \n\t"
1052 "movq %%mm7, %%mm6 \n\t"
1053 "pxor %5, %%mm7 \n\t"
1056 PREFETCH" 32(%1, %0) \n\t"
1057 "movq (%1, %0), %%mm0 \n\t"
1058 "movq 8(%1, %0), %%mm1 \n\t"
1059 # if COMPILE_TEMPLATE_MMXEXT
/* pshufw $177 (0b10110001) swaps the 16-bit words inside each dword;
   masking then merges the swapped bytes with the kept ones. */
1060 "pshufw $177, %%mm0, %%mm3 \n\t"
1061 "pshufw $177, %%mm1, %%mm5 \n\t"
1062 "pand %%mm7, %%mm0 \n\t"
1063 "pand %%mm6, %%mm3 \n\t"
1064 "pand %%mm7, %%mm1 \n\t"
1065 "pand %%mm6, %%mm5 \n\t"
1066 "por %%mm3, %%mm0 \n\t"
1067 "por %%mm5, %%mm1 \n\t"
/* Plain-MMX fallback: move the swap bytes with explicit dword shifts. */
1069 "movq %%mm0, %%mm2 \n\t"
1070 "movq %%mm1, %%mm4 \n\t"
1071 "pand %%mm7, %%mm0 \n\t"
1072 "pand %%mm6, %%mm2 \n\t"
1073 "pand %%mm7, %%mm1 \n\t"
1074 "pand %%mm6, %%mm4 \n\t"
1075 "movq %%mm2, %%mm3 \n\t"
1076 "movq %%mm4, %%mm5 \n\t"
1077 "pslld $16, %%mm2 \n\t"
1078 "psrld $16, %%mm3 \n\t"
1079 "pslld $16, %%mm4 \n\t"
1080 "psrld $16, %%mm5 \n\t"
1081 "por %%mm2, %%mm0 \n\t"
1082 "por %%mm4, %%mm1 \n\t"
1083 "por %%mm3, %%mm0 \n\t"
1084 "por %%mm5, %%mm1 \n\t"
1086 MOVNTQ" %%mm0, (%2, %0) \n\t"
1087 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1094 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
/* Scalar tail: keep bytes 1/3 (g), swap bytes 0 and 2 via shifts. */
1096 for (; idx<15; idx+=4) {
1097 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1099 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * Swap R and B inside packed 24-bit pixels (RGB24 <-> BGR24).  The MMX
 * loop handles 8 pixels (24 bytes) per iteration using the rotating
 * mask24r/mask24g/mask24b pattern: because 3-byte pixels straddle
 * quadword borders, each of the three output quadwords combines
 * differently-masked loads at staggered offsets.  REG_a runs from a
 * negative offset up to 0, doubling as index and loop counter.
 */
1103 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1106 x86_reg mmx_size= 23 - src_size;
1108 "test %%"REG_a", %%"REG_a" \n\t"
1110 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1111 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1112 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1115 PREFETCH" 32(%1, %%"REG_a") \n\t"
1116 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1117 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1118 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1119 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1120 "pand %%mm5, %%mm0 \n\t"
1121 "pand %%mm6, %%mm1 \n\t"
1122 "pand %%mm7, %%mm2 \n\t"
1123 "por %%mm0, %%mm1 \n\t"
1124 "por %%mm2, %%mm1 \n\t"
1125 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1126 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1127 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1128 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1129 "pand %%mm7, %%mm0 \n\t"
1130 "pand %%mm5, %%mm1 \n\t"
1131 "pand %%mm6, %%mm2 \n\t"
1132 "por %%mm0, %%mm1 \n\t"
1133 "por %%mm2, %%mm1 \n\t"
1134 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1135 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1136 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1137 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1138 "pand %%mm6, %%mm0 \n\t"
1139 "pand %%mm7, %%mm1 \n\t"
1140 "pand %%mm5, %%mm2 \n\t"
1141 "por %%mm0, %%mm1 \n\t"
1142 "por %%mm2, %%mm1 \n\t"
1143 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1144 "add $24, %%"REG_a" \n\t"
1148 : "r" (src-mmx_size), "r"(dst-mmx_size)
1151 __asm__ volatile(SFENCE:::"memory");
1152 __asm__ volatile(EMMS:::"memory");
1154 if (mmx_size==23) return; //finished, was multiple of 8
/* Scalar tail: swap bytes 0 and 2 of each remaining 3-byte pixel. */
1158 src_size= 23-mmx_size;
1161 for (i=0; i<src_size; i+=3) {
1164 dst[i + 1] = src[i + 1];
1165 dst[i + 2] = src[i + 0];
/*
 * Interleave planar Y/U/V into packed YUYV (yuy2).  Per output line,
 * 16 luma and 8 chroma samples are merged per iteration: U and V are
 * byte-interleaved first (UVUV...), then interleaved below Y.
 * vertLumPerChroma gives the number of luma lines per chroma line
 * (2 for yv12 input, 1 for 4:2:2 planar) and gates the chroma-pointer
 * advance at the bottom.
 */
1170 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1171 int width, int height,
1172 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1175 const x86_reg chromWidth= width>>1;
1176 for (y=0; y<height; y++) {
1177 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1179 "xor %%"REG_a", %%"REG_a" \n\t"
1182 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1183 PREFETCH" 32(%2, %%"REG_a") \n\t"
1184 PREFETCH" 32(%3, %%"REG_a") \n\t"
/* Interleave 8 U with 8 V bytes -> two UVUV registers. */
1185 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1186 "movq %%mm0, %%mm2 \n\t" // U(0)
1187 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1188 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1189 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
/* Interleave 16 Y bytes below the chroma -> four YUYV registers. */
1191 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1192 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1193 "movq %%mm3, %%mm4 \n\t" // Y(0)
1194 "movq %%mm5, %%mm6 \n\t" // Y(8)
1195 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1196 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1197 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1198 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1200 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1201 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1202 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1203 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1205 "add $8, %%"REG_a" \n\t"
1206 "cmp %4, %%"REG_a" \n\t"
1208 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Advance chroma pointers once every vertLumPerChroma luma lines
   (vertLumPerChroma must be a power of two for this mask test). */
1211 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1212 usrc += chromStride;
1213 vsrc += chromStride;
1224 * Height should be a multiple of 2 and width should be a multiple of 16.
1225 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * yv12 (4:2:0 planar) -> yuy2: thin wrapper that forwards to
 * yuvPlanartoyuy2 with vertLumPerChroma = 2 (one chroma line per two
 * luma lines).  Chroma is replicated, not interpolated (see FIXME).
 */
1227 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1228 int width, int height,
1229 int lumStride, int chromStride, int dstStride)
1231 //FIXME interpolate chroma
1232 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave planar Y/U/V into packed UYVY.  Identical structure to
 * yuvPlanartoyuy2, but the punpck operands are reversed so the chroma
 * byte lands first in each output pair (U Y V Y ... instead of
 * Y U Y V ...).
 */
1235 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1236 int width, int height,
1237 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1240 const x86_reg chromWidth= width>>1;
1241 for (y=0; y<height; y++) {
1242 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1244 "xor %%"REG_a", %%"REG_a" \n\t"
1247 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1248 PREFETCH" 32(%2, %%"REG_a") \n\t"
1249 PREFETCH" 32(%3, %%"REG_a") \n\t"
/* Interleave 8 U with 8 V bytes -> two UVUV registers. */
1250 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1251 "movq %%mm0, %%mm2 \n\t" // U(0)
1252 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1253 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1254 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
/* Interleave Y above the chroma: chroma byte comes first (UYVY). */
1256 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1257 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1258 "movq %%mm0, %%mm4 \n\t" // Y(0)
1259 "movq %%mm2, %%mm6 \n\t" // Y(8)
1260 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1261 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1262 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1263 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1265 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1266 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1267 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1268 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1270 "add $8, %%"REG_a" \n\t"
1271 "cmp %4, %%"REG_a" \n\t"
1273 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Advance chroma pointers once every vertLumPerChroma luma lines. */
1276 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1277 usrc += chromStride;
1278 vsrc += chromStride;
1289 * Height should be a multiple of 2 and width should be a multiple of 16
1290 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * yv12 (4:2:0 planar) -> uyvy: thin wrapper that forwards to
 * yuvPlanartouyvy with vertLumPerChroma = 2.  Chroma is replicated,
 * not interpolated (see FIXME).
 */
1292 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1293 int width, int height,
1294 int lumStride, int chromStride, int dstStride)
1296 //FIXME interpolate chroma
1297 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1301 * Width should be a multiple of 16.
/*
 * 4:2:2 planar -> uyvy: wrapper with vertLumPerChroma = 1 (one chroma
 * line per luma line, so no vertical subsampling).
 */
1303 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1304 int width, int height,
1305 int lumStride, int chromStride, int dstStride)
1307 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1311 * Width should be a multiple of 16.
/*
 * 4:2:2 planar -> yuy2: wrapper with vertLumPerChroma = 1 (one chroma
 * line per luma line, so no vertical subsampling).
 */
1313 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1314 int width, int height,
1315 int lumStride, int chromStride, int dstStride)
1317 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1321 * Height should be a multiple of 2 and width should be a multiple of 16.
1322 * (If this is a problem for anyone then tell me, and I will fix it.)
/* Convert packed YUY2 (Y U Y V byte order) to planar YV12 (4:2:0).
 * Two source rows are consumed per outer-loop iteration: the first row
 * contributes both luma and chroma, the second row contributes luma only
 * (its chroma samples are discarded — simple 2:1 vertical subsampling).
 * mm7 is built as the 0x00FF byte mask used to isolate the low (Y) byte
 * of each 16-bit YU/YV pair; psrlw $8 isolates the high (chroma) byte. */
1324 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1325 int width, int height,
1326 int lumStride, int chromStride, int srcStride)
1329 const x86_reg chromWidth= width>>1;
1330 for (y=0; y<height; y+=2) {
/* --- even row: extract Y to ydst, split interleaved UV into udst/vdst --- */
1332 "xor %%"REG_a", %%"REG_a" \n\t"
1333 "pcmpeqw %%mm7, %%mm7 \n\t"
1334 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1337 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1338 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1339 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1340 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1341 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1342 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1343 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1344 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1345 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1346 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1347 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1349 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1351 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1352 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1353 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1354 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1355 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1356 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1357 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1358 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1359 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1360 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1362 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* de-interleave UVUV... into a V-only and a U-only quadword */
1364 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1365 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1366 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1367 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1368 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1369 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1370 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1371 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1373 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1374 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1376 "add $8, %%"REG_a" \n\t"
1377 "cmp %4, %%"REG_a" \n\t"
1379 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1380 : "memory", "%"REG_a
/* --- odd row: luma only; chroma bytes of this row are masked off --- */
1387 "xor %%"REG_a", %%"REG_a" \n\t"
1390 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1391 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1392 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1393 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1394 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1395 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1396 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1397 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1398 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1399 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1400 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1402 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1403 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1405 "add $8, %%"REG_a" \n\t"
1406 "cmp %4, %%"REG_a" \n\t"
1409 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1410 : "memory", "%"REG_a
1412 udst += chromStride;
1413 vdst += chromStride;
/* EMMS restores the x87 FP state after MMX use (required before any FP code) */
1417 __asm__ volatile(EMMS" \n\t"
1421 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1423 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
/* Upscale one 8-bit plane by 2x horizontally and 2x vertically.
 * Interpolation uses 3:1 / 1:3 weighted blends ((3*a + b) >> 2).
 * Layout of the work:
 *   - first output row: C loop below (horizontal-only interpolation),
 *   - middle rows: MMX/3DNow! loop producing two output rows per source
 *     row pair; the chained PAVGB pairs approximate the 3:1 blend
 *     (two rounds of pavg give roughly (3*a + b + 2) / 4),
 *   - right edge and last row: scalar C fix-ups after the asm. */
1424 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* first row: horizontal 2x interpolation only */
1431 for (x=0; x<srcWidth-1; x++) {
1432 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1433 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1435 dst[2*srcWidth-1]= src[srcWidth-1];
1439 for (y=1; y<srcHeight; y++) {
1440 const x86_reg mmxSize= srcWidth&~15;
/* prime mm4/mm5 with the left-edge pixel replicated into the low byte */
1442 "mov %4, %%"REG_a" \n\t"
1443 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1444 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1445 "movq %%mm4, %%mm2 \n\t"
1446 "psllq $8, %%mm4 \n\t"
1447 "pand %%mm0, %%mm2 \n\t"
1448 "por %%mm2, %%mm4 \n\t"
1449 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1450 "movq %%mm5, %%mm3 \n\t"
1451 "psllq $8, %%mm5 \n\t"
1452 "pand %%mm0, %%mm3 \n\t"
1453 "por %%mm3, %%mm5 \n\t"
1455 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1456 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1457 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1458 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
/* two chained PAVGB per operand pair: weighted blend towards mm0/mm1 */
1459 PAVGB" %%mm0, %%mm5 \n\t"
1460 PAVGB" %%mm0, %%mm3 \n\t"
1461 PAVGB" %%mm0, %%mm5 \n\t"
1462 PAVGB" %%mm0, %%mm3 \n\t"
1463 PAVGB" %%mm1, %%mm4 \n\t"
1464 PAVGB" %%mm1, %%mm2 \n\t"
1465 PAVGB" %%mm1, %%mm4 \n\t"
1466 PAVGB" %%mm1, %%mm2 \n\t"
/* interleave the blended samples into 2x-wide output rows */
1467 "movq %%mm5, %%mm7 \n\t"
1468 "movq %%mm4, %%mm6 \n\t"
1469 "punpcklbw %%mm3, %%mm5 \n\t"
1470 "punpckhbw %%mm3, %%mm7 \n\t"
1471 "punpcklbw %%mm2, %%mm4 \n\t"
1472 "punpckhbw %%mm2, %%mm6 \n\t"
1473 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1474 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1475 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1476 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1477 "add $8, %%"REG_a" \n\t"
1478 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1479 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* operands indexed from the row ends; REG_a counts up from -mmxSize */
1481 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1482 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* scalar fix-up for the pixels the MMX loop did not cover */
1487 for (x=mmxSize-1; x<srcWidth-1; x++) {
1488 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1489 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1490 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1491 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1493 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1494 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* last row: horizontal-only interpolation, same as the first row */
1503 for (x=0; x<srcWidth-1; x++) {
1504 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1505 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1507 dst[2*srcWidth-1]= src[srcWidth-1];
1509 __asm__ volatile(EMMS" \n\t"
1513 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
1515 #if !COMPILE_TEMPLATE_AMD3DNOW
1517 * Height should be a multiple of 2 and width should be a multiple of 16.
1518 * (If this is a problem for anyone then tell me, and I will fix it.)
1519 * Chrominance data is only taken from every second line, others are ignored.
1520 * FIXME: Write HQ version.
/* Convert packed UYVY (U Y V Y byte order) to planar YV12 (4:2:0).
 * Mirror image of yuy2toyv12: because the byte order is swapped, the roles
 * of pand (low byte) and psrlw $8 (high byte) are exchanged — here pand
 * isolates chroma and psrlw isolates luma.  Two rows per outer iteration:
 * first row yields luma + chroma, second row yields luma only. */
1522 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1523 int width, int height,
1524 int lumStride, int chromStride, int srcStride)
1527 const x86_reg chromWidth= width>>1;
1528 for (y=0; y<height; y+=2) {
/* --- even row: extract Y to ydst and split UV into udst/vdst --- */
1530 "xor %%"REG_a", %%"REG_a" \n\t"
1531 "pcmpeqw %%mm7, %%mm7 \n\t"
1532 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1535 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1536 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1537 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1538 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1539 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1540 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1541 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1542 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1543 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1544 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1545 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1547 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1549 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1550 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1551 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1552 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1553 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1554 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1555 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1556 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1557 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1558 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1560 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* de-interleave UVUV... into separate U and V quadwords */
1562 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1563 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1564 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1565 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1566 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1567 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1568 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1569 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1571 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1572 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1574 "add $8, %%"REG_a" \n\t"
1575 "cmp %4, %%"REG_a" \n\t"
1577 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1578 : "memory", "%"REG_a
/* --- odd row: luma only (high byte of each 16-bit pair) --- */
1585 "xor %%"REG_a", %%"REG_a" \n\t"
1588 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1589 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1590 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1591 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1592 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1593 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1594 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1595 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1596 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1597 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1598 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1600 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1601 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1603 "add $8, %%"REG_a" \n\t"
1604 "cmp %4, %%"REG_a" \n\t"
1607 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1608 : "memory", "%"REG_a
1610 udst += chromStride;
1611 vdst += chromStride;
/* leave MMX state before returning to FP-using code */
1615 __asm__ volatile(EMMS" \n\t"
1619 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1622 * Height should be a multiple of 2 and width should be a multiple of 2.
1623 * (If this is a problem for anyone then tell me, and I will fix it.)
1624 * Chrominance data is only taken from every second line,
1625 * others are ignored in the C version.
1626 * FIXME: Write HQ version.
/* Convert packed 24-bit BGR to planar YV12 (4:2:0).
 * Per pair of source rows:
 *   - luma pass (run twice, i = 0..1): 8 pixels per iteration, each pixel's
 *     B,G,R words multiplied by ff_bgr2YCoeff via pmaddwd, summed with
 *     ff_w1111, scaled (psraw $7) and offset by ff_bgr2YOffset,
 *   - chroma pass: pixel pairs from the two rows are averaged (PAVGB on
 *     MMXEXT/3DNow!, add+shift otherwise), then U and V are computed with
 *     ff_bgr2UCoeff / ff_bgr2VCoeff and offset by ff_bgr2UVOffset.
 * The loop stops at height-2; the remaining rows are finished by the C
 * fallback rgb24toyv12_c at the end. */
1628 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1629 int width, int height,
1630 int lumStride, int chromStride, int srcStride)
1633 const x86_reg chromWidth= width>>1;
1634 for (y=0; y<height-2; y+=2) {
1636 for (i=0; i<2; i++) {
/* --- luma pass: REG_a counts up from -width; REG_d = 3*REG_a (byte offset) --- */
1638 "mov %2, %%"REG_a" \n\t"
1639 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1640 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1641 "pxor %%mm7, %%mm7 \n\t"
1642 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1645 PREFETCH" 64(%0, %%"REG_d") \n\t"
/* pixels 0..3: widen bytes to words, dot-product with the Y coefficients */
1646 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1647 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1648 "punpcklbw %%mm7, %%mm0 \n\t"
1649 "punpcklbw %%mm7, %%mm1 \n\t"
1650 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1651 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1652 "punpcklbw %%mm7, %%mm2 \n\t"
1653 "punpcklbw %%mm7, %%mm3 \n\t"
1654 "pmaddwd %%mm6, %%mm0 \n\t"
1655 "pmaddwd %%mm6, %%mm1 \n\t"
1656 "pmaddwd %%mm6, %%mm2 \n\t"
1657 "pmaddwd %%mm6, %%mm3 \n\t"
1658 #ifndef FAST_BGR2YV12
1659 "psrad $8, %%mm0 \n\t"
1660 "psrad $8, %%mm1 \n\t"
1661 "psrad $8, %%mm2 \n\t"
1662 "psrad $8, %%mm3 \n\t"
1664 "packssdw %%mm1, %%mm0 \n\t"
1665 "packssdw %%mm3, %%mm2 \n\t"
1666 "pmaddwd %%mm5, %%mm0 \n\t"
1667 "pmaddwd %%mm5, %%mm2 \n\t"
1668 "packssdw %%mm2, %%mm0 \n\t"
1669 "psraw $7, %%mm0 \n\t"
/* pixels 4..7: same computation in mm4 */
1671 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1672 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1673 "punpcklbw %%mm7, %%mm4 \n\t"
1674 "punpcklbw %%mm7, %%mm1 \n\t"
1675 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1676 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1677 "punpcklbw %%mm7, %%mm2 \n\t"
1678 "punpcklbw %%mm7, %%mm3 \n\t"
1679 "pmaddwd %%mm6, %%mm4 \n\t"
1680 "pmaddwd %%mm6, %%mm1 \n\t"
1681 "pmaddwd %%mm6, %%mm2 \n\t"
1682 "pmaddwd %%mm6, %%mm3 \n\t"
1683 #ifndef FAST_BGR2YV12
1684 "psrad $8, %%mm4 \n\t"
1685 "psrad $8, %%mm1 \n\t"
1686 "psrad $8, %%mm2 \n\t"
1687 "psrad $8, %%mm3 \n\t"
1689 "packssdw %%mm1, %%mm4 \n\t"
1690 "packssdw %%mm3, %%mm2 \n\t"
1691 "pmaddwd %%mm5, %%mm4 \n\t"
1692 "pmaddwd %%mm5, %%mm2 \n\t"
1693 "add $24, %%"REG_d" \n\t"
1694 "packssdw %%mm2, %%mm4 \n\t"
1695 "psraw $7, %%mm4 \n\t"
/* pack 8 luma bytes, add the luma offset, store non-temporally */
1697 "packuswb %%mm4, %%mm0 \n\t"
1698 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1700 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1701 "add $8, %%"REG_a" \n\t"
1703 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1704 : "%"REG_a, "%"REG_d
/* --- chroma pass: %0/%1 are the two source rows; REG_d = 6*REG_a --- */
1711 "mov %4, %%"REG_a" \n\t"
1712 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1713 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1714 "pxor %%mm7, %%mm7 \n\t"
1715 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1716 "add %%"REG_d", %%"REG_d" \n\t"
1719 PREFETCH" 64(%0, %%"REG_d") \n\t"
1720 PREFETCH" 64(%1, %%"REG_d") \n\t"
1721 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
/* fast 2x2 average of the two rows with PAVGB (pixels 0..1) */
1722 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1723 "movq (%1, %%"REG_d"), %%mm1 \n\t"
1724 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1725 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1726 PAVGB" %%mm1, %%mm0 \n\t"
1727 PAVGB" %%mm3, %%mm2 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "movq %%mm2, %%mm3 \n\t"
1730 "psrlq $24, %%mm0 \n\t"
1731 "psrlq $24, %%mm2 \n\t"
1732 PAVGB" %%mm1, %%mm0 \n\t"
1733 PAVGB" %%mm3, %%mm2 \n\t"
1734 "punpcklbw %%mm7, %%mm0 \n\t"
1735 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX path: sum the four contributing pixels, divide by 4 below */
1737 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1738 "movd (%1, %%"REG_d"), %%mm1 \n\t"
1739 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1740 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1741 "punpcklbw %%mm7, %%mm0 \n\t"
1742 "punpcklbw %%mm7, %%mm1 \n\t"
1743 "punpcklbw %%mm7, %%mm2 \n\t"
1744 "punpcklbw %%mm7, %%mm3 \n\t"
1745 "paddw %%mm1, %%mm0 \n\t"
1746 "paddw %%mm3, %%mm2 \n\t"
1747 "paddw %%mm2, %%mm0 \n\t"
1748 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1749 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1750 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1751 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1752 "punpcklbw %%mm7, %%mm4 \n\t"
1753 "punpcklbw %%mm7, %%mm1 \n\t"
1754 "punpcklbw %%mm7, %%mm2 \n\t"
1755 "punpcklbw %%mm7, %%mm3 \n\t"
1756 "paddw %%mm1, %%mm4 \n\t"
1757 "paddw %%mm3, %%mm2 \n\t"
1758 "paddw %%mm4, %%mm2 \n\t"
1759 "psrlw $2, %%mm0 \n\t"
1760 "psrlw $2, %%mm2 \n\t"
/* U/V dot products for the first two chroma samples */
1762 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1763 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1765 "pmaddwd %%mm0, %%mm1 \n\t"
1766 "pmaddwd %%mm2, %%mm3 \n\t"
1767 "pmaddwd %%mm6, %%mm0 \n\t"
1768 "pmaddwd %%mm6, %%mm2 \n\t"
1769 #ifndef FAST_BGR2YV12
1770 "psrad $8, %%mm0 \n\t"
1771 "psrad $8, %%mm1 \n\t"
1772 "psrad $8, %%mm2 \n\t"
1773 "psrad $8, %%mm3 \n\t"
1775 "packssdw %%mm2, %%mm0 \n\t"
1776 "packssdw %%mm3, %%mm1 \n\t"
1777 "pmaddwd %%mm5, %%mm0 \n\t"
1778 "pmaddwd %%mm5, %%mm1 \n\t"
1779 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1780 "psraw $7, %%mm0 \n\t"
1782 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
/* second pair of chroma samples (pixels 2..3), PAVGB path */
1783 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1784 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1785 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1786 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1787 PAVGB" %%mm1, %%mm4 \n\t"
1788 PAVGB" %%mm3, %%mm2 \n\t"
1789 "movq %%mm4, %%mm1 \n\t"
1790 "movq %%mm2, %%mm3 \n\t"
1791 "psrlq $24, %%mm4 \n\t"
1792 "psrlq $24, %%mm2 \n\t"
1793 PAVGB" %%mm1, %%mm4 \n\t"
1794 PAVGB" %%mm3, %%mm2 \n\t"
1795 "punpcklbw %%mm7, %%mm4 \n\t"
1796 "punpcklbw %%mm7, %%mm2 \n\t"
/* second pair of chroma samples, plain-MMX sum path */
1798 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1799 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1800 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1801 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1802 "punpcklbw %%mm7, %%mm4 \n\t"
1803 "punpcklbw %%mm7, %%mm1 \n\t"
1804 "punpcklbw %%mm7, %%mm2 \n\t"
1805 "punpcklbw %%mm7, %%mm3 \n\t"
1806 "paddw %%mm1, %%mm4 \n\t"
1807 "paddw %%mm3, %%mm2 \n\t"
1808 "paddw %%mm2, %%mm4 \n\t"
1809 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1810 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1811 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1812 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1813 "punpcklbw %%mm7, %%mm5 \n\t"
1814 "punpcklbw %%mm7, %%mm1 \n\t"
1815 "punpcklbw %%mm7, %%mm2 \n\t"
1816 "punpcklbw %%mm7, %%mm3 \n\t"
1817 "paddw %%mm1, %%mm5 \n\t"
1818 "paddw %%mm3, %%mm2 \n\t"
1819 "paddw %%mm5, %%mm2 \n\t"
1820 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1821 "psrlw $2, %%mm4 \n\t"
1822 "psrlw $2, %%mm2 \n\t"
1824 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1825 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1827 "pmaddwd %%mm4, %%mm1 \n\t"
1828 "pmaddwd %%mm2, %%mm3 \n\t"
1829 "pmaddwd %%mm6, %%mm4 \n\t"
1830 "pmaddwd %%mm6, %%mm2 \n\t"
1831 #ifndef FAST_BGR2YV12
1832 "psrad $8, %%mm4 \n\t"
1833 "psrad $8, %%mm1 \n\t"
1834 "psrad $8, %%mm2 \n\t"
1835 "psrad $8, %%mm3 \n\t"
1837 "packssdw %%mm2, %%mm4 \n\t"
1838 "packssdw %%mm3, %%mm1 \n\t"
1839 "pmaddwd %%mm5, %%mm4 \n\t"
1840 "pmaddwd %%mm5, %%mm1 \n\t"
1841 "add $24, %%"REG_d" \n\t"
1842 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1843 "psraw $7, %%mm4 \n\t"
/* merge U0..U3 / V0..V3, add the chroma offset, store 4 bytes each */
1845 "movq %%mm0, %%mm1 \n\t"
1846 "punpckldq %%mm4, %%mm0 \n\t"
1847 "punpckhdq %%mm4, %%mm1 \n\t"
1848 "packsswb %%mm1, %%mm0 \n\t"
1849 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1850 "movd %%mm0, (%2, %%"REG_a") \n\t"
1851 "punpckhdq %%mm0, %%mm0 \n\t"
1852 "movd %%mm0, (%3, %%"REG_a") \n\t"
1853 "add $4, %%"REG_a" \n\t"
1855 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1856 : "%"REG_a, "%"REG_d
1859 udst += chromStride;
1860 vdst += chromStride;
1864 __asm__ volatile(EMMS" \n\t"
/* finish the bottom rows (y..height-1) with the portable C implementation */
1868 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1870 #endif /* !COMPILE_TEMPLATE_SSE2 */
1872 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Interleave two byte planes into one: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * row by row.  SSE2 builds use 16-byte loads/stores, other builds use the
 * MMX punpcklbw/punpckhbw path; the scalar loop finishes the last width%16
 * bytes of each row.
 * NOTE(review): the SSE2 path uses movdqa/movntdq, which require 16-byte
 * alignment of src1, src2 and dest (and of the strides) — confirm callers
 * guarantee this. */
1873 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1874 int width, int height, int src1Stride,
1875 int src2Stride, int dstStride)
1879 for (h=0; h < height; h++) {
1882 #if COMPILE_TEMPLATE_SSE2
1884 "xor %%"REG_a", %%"REG_a" \n\t"
1886 PREFETCH" 64(%1, %%"REG_a") \n\t"
1887 PREFETCH" 64(%2, %%"REG_a") \n\t"
1888 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1889 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1890 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1891 "punpcklbw %%xmm2, %%xmm0 \n\t"
1892 "punpckhbw %%xmm2, %%xmm1 \n\t"
1893 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1894 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1895 "add $16, %%"REG_a" \n\t"
1896 "cmp %3, %%"REG_a" \n\t"
1898 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1899 : "memory", "%"REG_a""
/* MMX path: 16 source bytes per plane per iteration, interleaved in pairs */
1903 "xor %%"REG_a", %%"REG_a" \n\t"
1905 PREFETCH" 64(%1, %%"REG_a") \n\t"
1906 PREFETCH" 64(%2, %%"REG_a") \n\t"
1907 "movq (%1, %%"REG_a"), %%mm0 \n\t"
1908 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1909 "movq %%mm0, %%mm1 \n\t"
1910 "movq %%mm2, %%mm3 \n\t"
1911 "movq (%2, %%"REG_a"), %%mm4 \n\t"
1912 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1913 "punpcklbw %%mm4, %%mm0 \n\t"
1914 "punpckhbw %%mm4, %%mm1 \n\t"
1915 "punpcklbw %%mm5, %%mm2 \n\t"
1916 "punpckhbw %%mm5, %%mm3 \n\t"
1917 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1918 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1919 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1920 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1921 "add $16, %%"REG_a" \n\t"
1922 "cmp %3, %%"REG_a" \n\t"
1924 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1925 : "memory", "%"REG_a
/* scalar tail for the remaining width%16 bytes of the row */
1928 for (w= (width&(~15)); w < width; w++) {
1929 dest[2*w+0] = src1[w];
1930 dest[2*w+1] = src2[w];
1942 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1944 #if !COMPILE_TEMPLATE_SSE2
1945 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Upsample two chroma planes 2x in both directions by pixel duplication:
 * each source byte is written twice horizontally (punpcklbw/punpckhbw of a
 * register with itself) and each source row is used for two destination rows
 * (srcStride1*(y>>1)).  The MMX loop handles 32 source bytes per iteration;
 * a scalar loop duplicates the remaining bytes of each row. */
1946 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1947 uint8_t *dst1, uint8_t *dst2,
1948 int width, int height,
1949 int srcStride1, int srcStride2,
1950 int dstStride1, int dstStride2)
1954 w=width/2; h=height/2;
1958 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* first plane: duplicate src1 into dst1 */
1960 const uint8_t* s1=src1+srcStride1*(y>>1);
1961 uint8_t* d=dst1+dstStride1*y;
1963 for (;x<w-31;x+=32) {
1965 PREFETCH" 32(%1,%2) \n\t"
1966 "movq (%1,%2), %%mm0 \n\t"
1967 "movq 8(%1,%2), %%mm2 \n\t"
1968 "movq 16(%1,%2), %%mm4 \n\t"
1969 "movq 24(%1,%2), %%mm6 \n\t"
1970 "movq %%mm0, %%mm1 \n\t"
1971 "movq %%mm2, %%mm3 \n\t"
1972 "movq %%mm4, %%mm5 \n\t"
1973 "movq %%mm6, %%mm7 \n\t"
/* unpack each register with itself -> every byte doubled */
1974 "punpcklbw %%mm0, %%mm0 \n\t"
1975 "punpckhbw %%mm1, %%mm1 \n\t"
1976 "punpcklbw %%mm2, %%mm2 \n\t"
1977 "punpckhbw %%mm3, %%mm3 \n\t"
1978 "punpcklbw %%mm4, %%mm4 \n\t"
1979 "punpckhbw %%mm5, %%mm5 \n\t"
1980 "punpcklbw %%mm6, %%mm6 \n\t"
1981 "punpckhbw %%mm7, %%mm7 \n\t"
1982 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1983 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1984 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1985 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1986 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1987 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1988 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1989 MOVNTQ" %%mm7, 56(%0,%2,2)"
1990 :: "r"(d), "r"(s1), "r"(x)
1993 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second plane: identical processing for src2 into dst2 */
1996 const uint8_t* s2=src2+srcStride2*(y>>1);
1997 uint8_t* d=dst2+dstStride2*y;
1999 for (;x<w-31;x+=32) {
2001 PREFETCH" 32(%1,%2) \n\t"
2002 "movq (%1,%2), %%mm0 \n\t"
2003 "movq 8(%1,%2), %%mm2 \n\t"
2004 "movq 16(%1,%2), %%mm4 \n\t"
2005 "movq 24(%1,%2), %%mm6 \n\t"
2006 "movq %%mm0, %%mm1 \n\t"
2007 "movq %%mm2, %%mm3 \n\t"
2008 "movq %%mm4, %%mm5 \n\t"
2009 "movq %%mm6, %%mm7 \n\t"
2010 "punpcklbw %%mm0, %%mm0 \n\t"
2011 "punpckhbw %%mm1, %%mm1 \n\t"
2012 "punpcklbw %%mm2, %%mm2 \n\t"
2013 "punpckhbw %%mm3, %%mm3 \n\t"
2014 "punpcklbw %%mm4, %%mm4 \n\t"
2015 "punpckhbw %%mm5, %%mm5 \n\t"
2016 "punpcklbw %%mm6, %%mm6 \n\t"
2017 "punpckhbw %%mm7, %%mm7 \n\t"
2018 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2019 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2020 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2021 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2022 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2023 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2024 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2025 MOVNTQ" %%mm7, 56(%0,%2,2)"
2026 :: "r"(d), "r"(s2), "r"(x)
2029 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/* Convert planar YUV with 4x4 chroma subsampling (YUV410/YVU9 layout: each
 * chroma sample covers a 4x4 luma block — note srcStride2*(y>>2)) to packed
 * YUY2.  Per iteration the MMX loop interleaves 32 luma bytes with the
 * duplicated U/V samples into 64 output bytes; a scalar loop (partially
 * visible below) handles the remaining pixels of each row. */
2038 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2040 int width, int height,
2041 int srcStride1, int srcStride2,
2042 int srcStride3, int dstStride)
2046 w=width/2; h=height;
2048 const uint8_t* yp=src1+srcStride1*y;
2049 const uint8_t* up=src2+srcStride2*(y>>2);
2050 const uint8_t* vp=src3+srcStride3*(y>>2);
2051 uint8_t* d=dst+dstStride*y;
2055 PREFETCH" 32(%1, %0) \n\t"
2056 PREFETCH" 32(%2, %0) \n\t"
2057 PREFETCH" 32(%3, %0) \n\t"
2058 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2059 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2060 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2061 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2062 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2063 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
/* duplicate each chroma byte so one U/V pair spans 4 luma samples */
2064 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2065 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2066 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2067 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2069 "movq %%mm1, %%mm6 \n\t"
2070 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2071 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2072 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2073 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2074 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2076 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2077 "movq 8(%1, %0, 4), %%mm0 \n\t"
2078 "movq %%mm0, %%mm3 \n\t"
2079 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2080 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2081 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2082 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2084 "movq %%mm4, %%mm6 \n\t"
2085 "movq 16(%1, %0, 4), %%mm0 \n\t"
2086 "movq %%mm0, %%mm3 \n\t"
2087 "punpcklbw %%mm5, %%mm4 \n\t"
2088 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2089 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2090 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2091 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2093 "punpckhbw %%mm5, %%mm6 \n\t"
2094 "movq 24(%1, %0, 4), %%mm0 \n\t"
2095 "movq %%mm0, %%mm3 \n\t"
2096 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2097 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2098 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2099 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2102 : "r"(yp), "r" (up), "r"(vp), "r"(d)
/* scalar tail: write the remaining YUYV groups one byte at a time */
2106 const int x2 = x<<2;
2109 d[8*x+2] = yp[x2+1];
2111 d[8*x+4] = yp[x2+2];
2113 d[8*x+6] = yp[x2+3];
2123 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Copy every second byte of src to dst: dst[i] = src[2*i] for i in [0,count).
 * The MMX loop processes 16 output bytes per iteration using a negative
 * index that counts up toward zero (hence the -30/-22/... displacements);
 * mm7 = 0x00FF word mask keeps the even-offset byte of each word.
 * The scalar statement at the end is the per-byte tail loop body. */
2125 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2134 "pcmpeqw %%mm7, %%mm7 \n\t"
2135 "psrlw $8, %%mm7 \n\t"
2137 "movq -30(%1, %0, 2), %%mm0 \n\t"
2138 "movq -22(%1, %0, 2), %%mm1 \n\t"
2139 "movq -14(%1, %0, 2), %%mm2 \n\t"
2140 "movq -6(%1, %0, 2), %%mm3 \n\t"
2141 "pand %%mm7, %%mm0 \n\t"
2142 "pand %%mm7, %%mm1 \n\t"
2143 "pand %%mm7, %%mm2 \n\t"
2144 "pand %%mm7, %%mm3 \n\t"
2145 "packuswb %%mm1, %%mm0 \n\t"
2146 "packuswb %%mm3, %%mm2 \n\t"
2147 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2148 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2152 : "r"(src), "r"(dst)
2157 dst[count]= src[2*count];
2162 #if !COMPILE_TEMPLATE_AMD3DNOW
/* From 4-byte groups, extract bytes 0 and 2 into two planes:
 * dst0[i] = src[4*i], dst1[i] = src[4*i+2] (e.g. the U and V of packed
 * UYVY, or Y pairs of other packings — see callers).
 * Same negative-index-counting-up scheme as extract_even; the first
 * pand/packuswb stage keeps even bytes, the second stage separates the
 * two interleaved streams into dst1 (%3) and dst0 (%2). */
2163 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2172 "pcmpeqw %%mm7, %%mm7 \n\t"
2173 "psrlw $8, %%mm7 \n\t"
2175 "movq -28(%1, %0, 4), %%mm0 \n\t"
2176 "movq -20(%1, %0, 4), %%mm1 \n\t"
2177 "movq -12(%1, %0, 4), %%mm2 \n\t"
2178 "movq -4(%1, %0, 4), %%mm3 \n\t"
2179 "pand %%mm7, %%mm0 \n\t"
2180 "pand %%mm7, %%mm1 \n\t"
2181 "pand %%mm7, %%mm2 \n\t"
2182 "pand %%mm7, %%mm3 \n\t"
2183 "packuswb %%mm1, %%mm0 \n\t"
2184 "packuswb %%mm3, %%mm2 \n\t"
2185 "movq %%mm0, %%mm1 \n\t"
2186 "movq %%mm2, %%mm3 \n\t"
2187 "psrlw $8, %%mm0 \n\t"
2188 "psrlw $8, %%mm2 \n\t"
2189 "pand %%mm7, %%mm1 \n\t"
2190 "pand %%mm7, %%mm3 \n\t"
2191 "packuswb %%mm2, %%mm0 \n\t"
2192 "packuswb %%mm3, %%mm1 \n\t"
2193 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2194 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2198 : "r"(src), "r"(dst0), "r"(dst1)
/* scalar tail, one output byte per plane per iteration */
2203 dst0[count]= src[4*count+0];
2204 dst1[count]= src[4*count+2];
2208 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Like extract_even2, but averages two source rows first:
 * dst0[i] = avg(src0[4*i],   src1[4*i]),
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2]).
 * The MMX path averages with PAVGB (rounds up); the scalar tail uses
 * (a+b)>>1 (rounds down) — minor rounding difference between paths. */
2210 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2221 "pcmpeqw %%mm7, %%mm7 \n\t"
2222 "psrlw $8, %%mm7 \n\t"
2224 "movq -28(%1, %0, 4), %%mm0 \n\t"
2225 "movq -20(%1, %0, 4), %%mm1 \n\t"
2226 "movq -12(%1, %0, 4), %%mm2 \n\t"
2227 "movq -4(%1, %0, 4), %%mm3 \n\t"
2228 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2229 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2230 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2231 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2232 "pand %%mm7, %%mm0 \n\t"
2233 "pand %%mm7, %%mm1 \n\t"
2234 "pand %%mm7, %%mm2 \n\t"
2235 "pand %%mm7, %%mm3 \n\t"
2236 "packuswb %%mm1, %%mm0 \n\t"
2237 "packuswb %%mm3, %%mm2 \n\t"
2238 "movq %%mm0, %%mm1 \n\t"
2239 "movq %%mm2, %%mm3 \n\t"
2240 "psrlw $8, %%mm0 \n\t"
2241 "psrlw $8, %%mm2 \n\t"
2242 "pand %%mm7, %%mm1 \n\t"
2243 "pand %%mm7, %%mm3 \n\t"
2244 "packuswb %%mm2, %%mm0 \n\t"
2245 "packuswb %%mm3, %%mm1 \n\t"
2246 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2247 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2251 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2257 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2258 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2263 #if !COMPILE_TEMPLATE_AMD3DNOW
/* From 4-byte groups, extract the odd-offset bytes (1 and 3) into two
 * planes; the first psrlw $8 stage selects the odd bytes, the second stage
 * separates the two interleaved streams into dst1 (%3) and dst0 (%2).
 * NOTE(review): the scalar tail below reads offsets +0/+2 while the MMX
 * path keeps the odd bytes — lines between are elided here; confirm the
 * tail against the full file (a dst0++/dst1++ or src adjustment likely
 * sits in the omitted lines). */
2264 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2273 "pcmpeqw %%mm7, %%mm7 \n\t"
2274 "psrlw $8, %%mm7 \n\t"
2276 "movq -28(%1, %0, 4), %%mm0 \n\t"
2277 "movq -20(%1, %0, 4), %%mm1 \n\t"
2278 "movq -12(%1, %0, 4), %%mm2 \n\t"
2279 "movq -4(%1, %0, 4), %%mm3 \n\t"
2280 "psrlw $8, %%mm0 \n\t"
2281 "psrlw $8, %%mm1 \n\t"
2282 "psrlw $8, %%mm2 \n\t"
2283 "psrlw $8, %%mm3 \n\t"
2284 "packuswb %%mm1, %%mm0 \n\t"
2285 "packuswb %%mm3, %%mm2 \n\t"
2286 "movq %%mm0, %%mm1 \n\t"
2287 "movq %%mm2, %%mm3 \n\t"
2288 "psrlw $8, %%mm0 \n\t"
2289 "psrlw $8, %%mm2 \n\t"
2290 "pand %%mm7, %%mm1 \n\t"
2291 "pand %%mm7, %%mm3 \n\t"
2292 "packuswb %%mm2, %%mm0 \n\t"
2293 "packuswb %%mm3, %%mm1 \n\t"
2294 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2295 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2299 : "r"(src), "r"(dst0), "r"(dst1)
2305 dst0[count]= src[4*count+0];
2306 dst1[count]= src[4*count+2];
2310 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Like extract_odd2, but first averages the corresponding bytes of two
 * source rows (PAVGB in the MMX path, (a+b)>>1 in the scalar tail —
 * different rounding), then splits the odd-offset bytes into dst0/dst1.
 * NOTE(review): as in extract_odd2, the scalar tail reads offsets +0/+2;
 * lines are elided between the asm and the tail — verify against the
 * full file. */
2312 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2323 "pcmpeqw %%mm7, %%mm7 \n\t"
2324 "psrlw $8, %%mm7 \n\t"
2326 "movq -28(%1, %0, 4), %%mm0 \n\t"
2327 "movq -20(%1, %0, 4), %%mm1 \n\t"
2328 "movq -12(%1, %0, 4), %%mm2 \n\t"
2329 "movq -4(%1, %0, 4), %%mm3 \n\t"
2330 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2331 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2332 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2333 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2334 "psrlw $8, %%mm0 \n\t"
2335 "psrlw $8, %%mm1 \n\t"
2336 "psrlw $8, %%mm2 \n\t"
2337 "psrlw $8, %%mm3 \n\t"
2338 "packuswb %%mm1, %%mm0 \n\t"
2339 "packuswb %%mm3, %%mm2 \n\t"
2340 "movq %%mm0, %%mm1 \n\t"
2341 "movq %%mm2, %%mm3 \n\t"
2342 "psrlw $8, %%mm0 \n\t"
2343 "psrlw $8, %%mm2 \n\t"
2344 "pand %%mm7, %%mm1 \n\t"
2345 "pand %%mm7, %%mm3 \n\t"
2346 "packuswb %%mm2, %%mm0 \n\t"
2347 "packuswb %%mm3, %%mm1 \n\t"
2348 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2349 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2353 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2361 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2362 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/* Convert packed YUYV to planar 4:2:0.  Per row: even bytes (luma) go to
 * ydst via extract_even; chroma is taken from the odd bytes, vertically
 * averaged with the previous row (extract_odd2avg on src-srcStride and src).
 * chromWidth = ceil(width/2).
 * NOTE(review): the chroma call is expected to run only on every second
 * row (guard lines are elided in this view) — confirm against the full file. */
2367 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2368 int width, int height,
2369 int lumStride, int chromStride, int srcStride)
2372 const int chromWidth= -((-width)>>1);
2374 for (y=0; y<height; y++) {
2375 RENAME(extract_even)(src, ydst, width);
2377 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2392 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Convert packed YUYV to planar 4:2:2.  Per row: even bytes (luma) to ydst,
 * odd bytes (chroma) split into udst/vdst via extract_odd2.  No vertical
 * averaging — every row keeps its own chroma.  chromWidth = ceil(width/2). */
2393 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2394 int width, int height,
2395 int lumStride, int chromStride, int srcStride)
2398 const int chromWidth= -((-width)>>1);
2400 for (y=0; y<height; y++) {
2401 RENAME(extract_even)(src, ydst, width);
2402 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2415 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
/* Convert packed UYVY to planar 4:2:0.  Luma sits at odd byte offsets here,
 * hence extract_even(src+1, ...); chroma comes from the even bytes,
 * vertically averaged with the previous row (extract_even2avg).
 * chromWidth = ceil(width/2).
 * NOTE(review): the chroma call is expected to run only on every second
 * row (guard lines are elided in this view) — confirm against the full file. */
2417 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2418 int width, int height,
2419 int lumStride, int chromStride, int srcStride)
2422 const int chromWidth= -((-width)>>1);
2424 for (y=0; y<height; y++) {
2425 RENAME(extract_even)(src+1, ydst, width);
2427 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2442 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Convert packed UYVY to planar 4:2:2.  Luma from odd byte offsets
 * (extract_even on src+1); chroma from even bytes split into udst/vdst
 * via extract_even2.  Every row keeps its own chroma.
 * chromWidth = ceil(width/2). */
2443 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2444 int width, int height,
2445 int lumStride, int chromStride, int srcStride)
2448 const int chromWidth= -((-width)>>1);
2450 for (y=0; y<height; y++) {
2451 RENAME(extract_even)(src+1, ydst, width);
2452 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2465 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2466 #endif /* !COMPILE_TEMPLATE_SSE2 */
2468 static inline void RENAME(rgb2rgb_init)(void)
2470 #if !COMPILE_TEMPLATE_SSE2
2471 #if !COMPILE_TEMPLATE_AMD3DNOW
2472 rgb15to16 = RENAME(rgb15to16);
2473 rgb15tobgr24 = RENAME(rgb15tobgr24);
2474 rgb15to32 = RENAME(rgb15to32);
2475 rgb16tobgr24 = RENAME(rgb16tobgr24);
2476 rgb16to32 = RENAME(rgb16to32);
2477 rgb16to15 = RENAME(rgb16to15);
2478 rgb24tobgr16 = RENAME(rgb24tobgr16);
2479 rgb24tobgr15 = RENAME(rgb24tobgr15);
2480 rgb24tobgr32 = RENAME(rgb24tobgr32);
2481 rgb32to16 = RENAME(rgb32to16);
2482 rgb32to15 = RENAME(rgb32to15);
2483 rgb32tobgr24 = RENAME(rgb32tobgr24);
2484 rgb24to15 = RENAME(rgb24to15);
2485 rgb24to16 = RENAME(rgb24to16);
2486 rgb24tobgr24 = RENAME(rgb24tobgr24);
2487 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2488 rgb32tobgr16 = RENAME(rgb32tobgr16);
2489 rgb32tobgr15 = RENAME(rgb32tobgr15);
2490 yv12toyuy2 = RENAME(yv12toyuy2);
2491 yv12touyvy = RENAME(yv12touyvy);
2492 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2493 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2494 yuy2toyv12 = RENAME(yuy2toyv12);
2495 vu9_to_vu12 = RENAME(vu9_to_vu12);
2496 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2497 uyvytoyuv422 = RENAME(uyvytoyuv422);
2498 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2499 #endif /* !COMPILE_TEMPLATE_SSE2 */
2501 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2502 planar2x = RENAME(planar2x);
2503 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2504 rgb24toyv12 = RENAME(rgb24toyv12);
2506 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2507 uyvytoyuv420 = RENAME(uyvytoyuv420);
2508 #endif /* COMPILE_TEMPLATE_SSE2 */
2510 #if !COMPILE_TEMPLATE_AMD3DNOW
2511 interleaveBytes = RENAME(interleaveBytes);
2512 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */