git.sesse.net Git - ffmpeg/blob - postproc/swscale_template.c

   1 /*
   2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17 */
  18
  /*
   * Drop any earlier definitions of the helper macros so that the
   * feature-specific versions below can be defined afresh.
   * NOTE(review): presumably needed because this template is compiled more
   * than once with different HAVE_* settings -- confirm against the includer.
   */
  19 #undef MOVNTQ
  20 #undef PAVGB
  21 #undef PREFETCH
  22 #undef PREFETCHW
  23 #undef EMMS
  24 #undef SFENCE
  25
  /*
   * EMMS: mnemonic string of the instruction that clears the MMX state.
   * "femms" is selected when HAVE_3DNOW is defined, plain "emms" otherwise.
   */
  26 #ifdef HAVE_3DNOW
  27 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
  28 #define EMMS     "femms"
  29 #else
  30 #define EMMS     "emms"
  31 #endif
  32
  /*
   * PREFETCH / PREFETCHW: mnemonic strings for the read and write prefetch
   * hints.  HAVE_3DNOW selects prefetch/prefetchw, HAVE_MMX2 selects
   * prefetchnta/prefetcht0, and every other build gets "/nop".
   * NOTE(review): the leading '/' presumably makes the assembler treat the
   * rest of the line (including any operand text the user appends) as a
   * comment -- confirm for the target assembler.
   */
  33 #ifdef HAVE_3DNOW
  34 #define PREFETCH  "prefetch"
  35 #define PREFETCHW "prefetchw"
  36 #elif defined ( HAVE_MMX2 )
  37 #define PREFETCH "prefetchnta"
  38 #define PREFETCHW "prefetcht0"
  39 #else
  40 #define PREFETCH "/nop"
  41 #define PREFETCHW "/nop"
  42 #endif
  43
  /*
   * SFENCE: mnemonic string for the store fence.  Only emitted as a real
   * "sfence" when HAVE_MMX2 is defined; otherwise it expands to "/nop"
   * (same placeholder as used for PREFETCH in non-MMX2/3DNow builds).
   */
  44 #ifdef HAVE_MMX2
  45 #define SFENCE "sfence"
  46 #else
  47 #define SFENCE "/nop"
  48 #endif
  49
  /*
   * PAVGB(a,b): emits one packed byte-average instruction with the two
   * arguments stringified as its operands ("<insn> a, b \n\t").
   * HAVE_MMX2 uses pavgb, HAVE_3DNOW uses pavgusb.
   * Note there is no #else branch: when neither feature macro is defined
   * PAVGB stays undefined, so it must only be used in MMX2/3DNow code paths.
   */
  50 #ifdef HAVE_MMX2
  51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  52 #elif defined (HAVE_3DNOW)
  53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  54 #endif
  55
  /*
   * MOVNTQ(a,b): emits a 64-bit store of MMX register a to memory operand b,
   * with both arguments stringified into the instruction text.
   * With HAVE_MMX2 the non-temporal movntq is used; otherwise it degrades to
   * an ordinary movq, so callers can use MOVNTQ unconditionally.
   */
  56 #ifdef HAVE_MMX2
  57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  58 #else
  59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  60 #endif
  61
  62
  /*
   * YSCALEYUV2YV12X(x): asm text that sums several filtered source lines and
   * writes one row of 8-bit samples, 8 output bytes per outer pass.
   *
   * x  - byte displacement pasted in front of every source-line address.
   * %0 - tap counter loaded into edx and incremented until it reaches zero
   *      (presumably -filterSize; confirm against the caller).
   * %1 - base of a table of 4-byte source-line pointers, indexed by edx.
   * %2 - base of a table of 8-byte coefficient entries, indexed by edx.
   * %3 - destination base, indexed by eax in bytes.
   * %4 - bound compared (unsigned, jb) against eax to end the outer loop.
   *
   * For every tap two quadwords (8 16-bit samples) are multiplied with the
   * coefficient via pmulhw and accumulated in mm3/mm4; the sums are shifted
   * right by 3, packed to bytes with unsigned saturation and stored.
   *
   * Label "1:" serves both the inner (tap) loop and the outer (pixel) loop:
   * before the final "jb 1b" the accumulators and edx are re-initialised, and
   * the pxor/movl in between do not change the flags set by cmpl.
   *
   * Registers written: eax, edx, esi, mm0, mm2, mm3, mm4, mm5.
   */
  63 #define YSCALEYUV2YV12X(x) \
  64                         "xorl %%eax, %%eax              \n\t"\
  65                         "pxor %%mm3, %%mm3              \n\t"\
  66                         "pxor %%mm4, %%mm4              \n\t"\
  67                         "movl %0, %%edx                 \n\t"\
  68                         ".balign 16                     \n\t" /* FIXME Unroll? */\
  69                         "1:                             \n\t"\
  70                         "movl (%1, %%edx, 4), %%esi     \n\t"\
  71                         "movq (%2, %%edx, 8), %%mm0     \n\t" /* filterCoeff */\
  72                         "movq " #x "(%%esi, %%eax, 2), %%mm2    \n\t" /* srcData */\
  73                         "movq 8+" #x "(%%esi, %%eax, 2), %%mm5  \n\t" /* srcData */\
  74                         "pmulhw %%mm0, %%mm2            \n\t"\
  75                         "pmulhw %%mm0, %%mm5            \n\t"\
  76                         "paddw %%mm2, %%mm3             \n\t"\
  77                         "paddw %%mm5, %%mm4             \n\t"\
  78                         "addl $1, %%edx                 \n\t"\
  79                         " jnz 1b                        \n\t"\
  80                         "psraw $3, %%mm3                \n\t"\
  81                         "psraw $3, %%mm4                \n\t"\
  82                         "packuswb %%mm4, %%mm3          \n\t"\
  83                         MOVNTQ(%%mm3, (%3, %%eax))\
  84                         "addl $8, %%eax                 \n\t"\
  85                         "cmpl %4, %%eax                 \n\t"\
  86                         "pxor %%mm3, %%mm3              \n\t"\
  87                         "pxor %%mm4, %%mm4              \n\t"\
  88                         "movl %0, %%edx                 \n\t"\
  89                         "jb 1b                          \n\t"
  90
  /*
   * YSCALEYUV2YV121: asm text that converts one line of 16-bit samples to
   * 8-bit samples without any filtering, 8 samples per pass.
   *
   * %0 - source base, addressed as (%0 + eax*2).
   * %1 - destination base, addressed as (%1 + eax).
   * %2 - initial value of the index register eax.
   *
   * Each pass shifts 8 words right by 7 (arithmetic), packs them to bytes
   * with unsigned saturation and stores them with MOVNTQ.  The loop repeats
   * while "addl $8" produces no carry.
   * NOTE(review): this implies eax counts up from a negative value and that
   * %0/%1 point at the ends of their buffers -- confirm against the caller.
   *
   * Registers written: eax, mm0, mm1.
   */
  91 #define YSCALEYUV2YV121 \
  92                         "movl %2, %%eax                 \n\t"\
  93                         ".balign 16                     \n\t" /* FIXME Unroll? */\
  94                         "1:                             \n\t"\
  95                         "movq (%0, %%eax, 2), %%mm0     \n\t"\
  96                         "movq 8(%0, %%eax, 2), %%mm1    \n\t"\
  97                         "psraw $7, %%mm0                \n\t"\
  98                         "psraw $7, %%mm1                \n\t"\
  99                         "packuswb %%mm1, %%mm0          \n\t"\
 100                         MOVNTQ(%%mm0, (%1, %%eax))\
 101                         "addl $8, %%eax                 \n\t"\
 102                         "jnc 1b                         \n\t"
 103
 104 /*
 105                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 106                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 107                            "r" (dest), "m" (dstW),
 108                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 109                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 110 */
 /*
  * YSCALEYUV2RGBX: asm text that vertically filters chroma and luma with
  * multi-tap filters and converts the result to B/G/R bytes for 8 pixels.
  *
  * Operands used here (see the operand list in the comment preceding this
  * macro): %0 -lumFilterSize, %1 -chrFilterSize, %2 luma coefficient table,
  * %3 chroma coefficient table, %6 luma source-line pointer table,
  * %7 chroma source-line pointer table.  %4 and %5 are not referenced.
  *
  * Structure of one pass of loop "1:" (eax = pixel index, starts at 0):
  *   - first "2:" loop: accumulate U in mm3 and V in mm4 over all chroma
  *     taps; V is read 4096 bytes after U in the same source line.
  *   - second "2:" loop: accumulate 8 luma samples in mm1 (Y1) and mm7 (Y2)
  *     over all luma taps.
  *   - subtract the offsets, multiply by the colour coefficients, duplicate
  *     every chroma term with punpck{l,h}wd so it is shared by two adjacent
  *     pixels, add luma and pack with unsigned saturation.
  *
  * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * The macro opens label "1:" but never jumps back to it.
  * NOTE(review): presumably it is meant to be followed by one of the
  * WRITEBGR macros, which contain the closing "jb 1b" -- confirm at the
  * call sites.
  *
  * Registers written: eax, ebx, ecx, edx, esi, mm0-mm7.
  * Constants reached through MANGLE() are defined outside this chunk.
  */
 111 #define YSCALEYUV2RGBX \
 112                 "xorl %%eax, %%eax              \n\t"\
 113                 ".balign 16                     \n\t"\
 114                 "1:                             \n\t"\
 115                 "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
 116                 "movl %3, %%ebx                 \n\t" /* chrMmxFilter+chrFilterSize */\
 117                 "movl %7, %%ecx                 \n\t" /* chrSrc+chrFilterSize */\
 118                 "pxor %%mm3, %%mm3              \n\t"\
 119                 "pxor %%mm4, %%mm4              \n\t"\
 120                 "2:                             \n\t"\
 121                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
 122                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
 123                 "movq (%%esi, %%eax), %%mm2     \n\t" /* UsrcData */\
 124                 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
 125                 "pmulhw %%mm0, %%mm2            \n\t"\
 126                 "pmulhw %%mm0, %%mm5            \n\t"\
 127                 "paddw %%mm2, %%mm3             \n\t"\
 128                 "paddw %%mm5, %%mm4             \n\t"\
 129                 "addl $1, %%edx                 \n\t"\
 130                 " jnz 2b                        \n\t"\
 131 \
 132                 "movl %0, %%edx                 \n\t" /* -lumFilterSize */\
 133                 "movl %2, %%ebx                 \n\t" /* lumMmxFilter+lumFilterSize */\
 134                 "movl %6, %%ecx                 \n\t" /* lumSrc+lumFilterSize */\
 135                 "pxor %%mm1, %%mm1              \n\t"\
 136                 "pxor %%mm7, %%mm7              \n\t"\
 137                 "2:                             \n\t"\
 138                 "movl (%%ecx, %%edx, 4), %%esi  \n\t"\
 139                 "movq (%%ebx, %%edx, 8), %%mm0  \n\t" /* filterCoeff */\
 140                 "movq (%%esi, %%eax, 2), %%mm2  \n\t" /* Y1srcData */\
 141                 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
 142                 "pmulhw %%mm0, %%mm2            \n\t"\
 143                 "pmulhw %%mm0, %%mm5            \n\t"\
 144                 "paddw %%mm2, %%mm1             \n\t"\
 145                 "paddw %%mm5, %%mm7             \n\t"\
 146                 "addl $1, %%edx                 \n\t"\
 147                 " jnz 2b                        \n\t"\
 148 \
 149                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
 150                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
 151                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 152                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
 153                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
 154                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
 155         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
 156                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
 157                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
 158                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
 159                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
 160                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
 161                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
 162         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
 163                 "paddw %%mm3, %%mm4             \n\t"\
 164                 "movq %%mm2, %%mm0              \n\t"\
 165                 "movq %%mm5, %%mm6              \n\t"\
 166                 "movq %%mm4, %%mm3              \n\t"\
 167                 "punpcklwd %%mm2, %%mm2         \n\t"\
 168                 "punpcklwd %%mm5, %%mm5         \n\t"\
 169                 "punpcklwd %%mm4, %%mm4         \n\t"\
 170                 "paddw %%mm1, %%mm2             \n\t"\
 171                 "paddw %%mm1, %%mm5             \n\t"\
 172                 "paddw %%mm1, %%mm4             \n\t"\
 173                 "punpckhwd %%mm0, %%mm0         \n\t"\
 174                 "punpckhwd %%mm6, %%mm6         \n\t"\
 175                 "punpckhwd %%mm3, %%mm3         \n\t"\
 176                 "paddw %%mm7, %%mm0             \n\t"\
 177                 "paddw %%mm7, %%mm6             \n\t"\
 178                 "paddw %%mm7, %%mm3             \n\t"\
 179                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
 180                 "packuswb %%mm0, %%mm2          \n\t"\
 181                 "packuswb %%mm6, %%mm5          \n\t"\
 182                 "packuswb %%mm3, %%mm4          \n\t"\
 183                 "pxor %%mm7, %%mm7              \n\t"
 184
 /*
  * FULL_YSCALEYUV2RGB: asm text that blends two luma lines and two chroma
  * lines and converts the result to B/G/R, 4 pixels per pass with one
  * chroma sample per pixel (luma and chroma are both indexed with eax*2).
  *
  * %0 buf0, %1 buf1     - the two luma lines (16-bit samples).
  * %2 uvbuf0, %3 uvbuf1 - the two chroma lines; V is read 4096 bytes
  *                        (2048 samples) after U.
  * %6 yalpha1, %7 uvalpha1 - blend weights, broadcast to all four words of
  *                        mm6 and mm5 before the loop.
  *
  * The blend is computed as ((line0 - line1) * alpha >> 16) + (line1 >> 4).
  *
  * On exit from the visible text: mm3 = B, mm1 = G, mm0 = R, each packed
  * with itself so both register halves hold the same 4 bytes; mm7 = 0.
  * The macro opens label "1:" and leaves eax untouched after loading; the
  * index increment and the backward jump must come from the text that
  * follows it.
  *
  * Registers written: eax, mm0-mm7.
  * Constants reached through MANGLE() are defined outside this chunk.
  */
 185 #define FULL_YSCALEYUV2RGB \
 186                 "pxor %%mm7, %%mm7              \n\t"\
 187                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
 188                 "punpcklwd %%mm6, %%mm6         \n\t"\
 189                 "punpcklwd %%mm6, %%mm6         \n\t"\
 190                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
 191                 "punpcklwd %%mm5, %%mm5         \n\t"\
 192                 "punpcklwd %%mm5, %%mm5         \n\t"\
 193                 "xorl %%eax, %%eax              \n\t"\
 194                 ".balign 16                     \n\t"\
 195                 "1:                             \n\t"\
 196                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
 197                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
 198                 "movq (%2, %%eax,2), %%mm2      \n\t" /* uvbuf0[eax]*/\
 199                 "movq (%3, %%eax,2), %%mm3      \n\t" /* uvbuf1[eax]*/\
 200                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
 201                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
 202                 "pmulhw %%mm6, %%mm0            \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
 203                 "pmulhw %%mm5, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
 204                 "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
 205                 "movq 4096(%2, %%eax,2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
 206                 "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
 207                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
 208                 "movq 4096(%3, %%eax,2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
 209                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
 210                 "psubw %%mm0, %%mm4             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
 211                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
 212                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
 213                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
 214 \
 215 \
 216                 "pmulhw %%mm5, %%mm4            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
 217                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 218                 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
 219                 "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
 220                 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
 221                 "paddw %%mm4, %%mm0             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
 222                 "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
 223 \
 224 \
 225                 "movq %%mm0, %%mm4              \n\t" /* (V-128)8*/\
 226                 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
 227                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
 228                 "paddw %%mm1, %%mm3             \n\t" /* B*/\
 229                 "paddw %%mm1, %%mm0             \n\t" /* R*/\
 230                 "packuswb %%mm3, %%mm3          \n\t"\
 231 \
 232                 "packuswb %%mm0, %%mm0          \n\t"\
 233                 "paddw %%mm4, %%mm2             \n\t"\
 234                 "paddw %%mm2, %%mm1             \n\t" /* G*/\
 235 \
 236                 "packuswb %%mm1, %%mm1          \n\t"
 237
 /*
  * YSCALEYUV2RGB: asm text that blends two luma lines and two chroma lines
  * and converts the result to B/G/R for 8 pixels per pass; each chroma
  * sample is shared by two adjacent pixels (chroma is indexed with eax,
  * luma with eax*2).
  *
  * %0 buf0, %1 buf1     - the two luma lines (16-bit samples).
  * %2 uvbuf0, %3 uvbuf1 - the two chroma lines; V is read 4096 bytes after U.
  * %6 yalpha1, %7 uvalpha1 - blend weights; they are broadcast to four words
  *                        and spilled to the asm_yalpha1 / asm_uvalpha1
  *                        memory locations because all MMX registers are
  *                        needed inside the loop.
  *
  * The blend is computed as ((line0 - line1) * alpha >> 16) + (line1 >> 4).
  *
  * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * The macro opens label "1:" but never jumps back to it.
  * NOTE(review): presumably it is meant to be followed by one of the
  * WRITEBGR macros, which contain the closing "jb 1b" -- confirm at the
  * call sites.
  *
  * Registers written: eax, mm0-mm7.  Also writes asm_yalpha1/asm_uvalpha1.
  * Constants reached through MANGLE() are defined outside this chunk.
  */
 238 #define YSCALEYUV2RGB \
 239                 "movd %6, %%mm6                 \n\t" /*yalpha1*/\
 240                 "punpcklwd %%mm6, %%mm6         \n\t"\
 241                 "punpcklwd %%mm6, %%mm6         \n\t"\
 242                 "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
 243                 "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
 244                 "punpcklwd %%mm5, %%mm5         \n\t"\
 245                 "punpcklwd %%mm5, %%mm5         \n\t"\
 246                 "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
 247                 "xorl %%eax, %%eax              \n\t"\
 248                 ".balign 16                     \n\t"\
 249                 "1:                             \n\t"\
 250                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
 251                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
 252                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
 253                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
 254                 "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
 255                 "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
 256                 "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
 257                 "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
 258                 "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
 259                 "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
 260                 "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
 261                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
 262                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
 263                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
 264                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
 265                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 266                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
 267                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
 268                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
 269         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
 270                 "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
 271                 "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
 272                 "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax+4]*/\
 273                 "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax+4]*/\
 274                 "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
 275                 "psubw %%mm7, %%mm6             \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
 276                 "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
 277                 "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
 278                 "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
 279                 "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
 280                 "paddw %%mm0, %%mm1             \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
 281                 "paddw %%mm6, %%mm7             \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
 282                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
 283                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
 284                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
 285                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
 286                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
 287                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
 288         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
 289                 "paddw %%mm3, %%mm4             \n\t"\
 290                 "movq %%mm2, %%mm0              \n\t"\
 291                 "movq %%mm5, %%mm6              \n\t"\
 292                 "movq %%mm4, %%mm3              \n\t"\
 293                 "punpcklwd %%mm2, %%mm2         \n\t"\
 294                 "punpcklwd %%mm5, %%mm5         \n\t"\
 295                 "punpcklwd %%mm4, %%mm4         \n\t"\
 296                 "paddw %%mm1, %%mm2             \n\t"\
 297                 "paddw %%mm1, %%mm5             \n\t"\
 298                 "paddw %%mm1, %%mm4             \n\t"\
 299                 "punpckhwd %%mm0, %%mm0         \n\t"\
 300                 "punpckhwd %%mm6, %%mm6         \n\t"\
 301                 "punpckhwd %%mm3, %%mm3         \n\t"\
 302                 "paddw %%mm7, %%mm0             \n\t"\
 303                 "paddw %%mm7, %%mm6             \n\t"\
 304                 "paddw %%mm7, %%mm3             \n\t"\
 305                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
 306                 "packuswb %%mm0, %%mm2          \n\t"\
 307                 "packuswb %%mm6, %%mm5          \n\t"\
 308                 "packuswb %%mm3, %%mm4          \n\t"\
 309                 "pxor %%mm7, %%mm7              \n\t"
 310
 /*
  * YSCALEYUV2RGB1: single-line variant of the YUV->RGB conversion.  Only one
  * luma line (%0, buf0) and one chroma line (%2, uvbuf0) are read; there is
  * no blending with a second line.  8 pixels are produced per pass, with
  * each chroma sample shared by two adjacent pixels.
  *
  * Samples are shifted right by 4 before the offsets are subtracted and the
  * colour coefficients applied.  V is read 4096 bytes after U.
  *
  * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * The macro opens label "1:" but never jumps back to it.
  * NOTE(review): presumably it is meant to be followed by one of the
  * WRITEBGR macros, which contain the closing "jb 1b" -- confirm at the
  * call sites.
  *
  * Registers written: eax, mm0-mm7.
  * Constants reached through MANGLE() are defined outside this chunk.
  */
 311 #define YSCALEYUV2RGB1 \
 312                 "xorl %%eax, %%eax              \n\t"\
 313                 ".balign 16                     \n\t"\
 314                 "1:                             \n\t"\
 315                 "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
 316                 "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
 317                 "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
 318                 "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
 319                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
 320                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
 321                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 322                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
 323                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
 324                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
 325         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
 326                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
 327                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
 328                 "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
 329                 "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
 330                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
 331                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
 332                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
 333                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
 334                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
 335                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
 336         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
 337                 "paddw %%mm3, %%mm4             \n\t"\
 338                 "movq %%mm2, %%mm0              \n\t"\
 339                 "movq %%mm5, %%mm6              \n\t"\
 340                 "movq %%mm4, %%mm3              \n\t"\
 341                 "punpcklwd %%mm2, %%mm2         \n\t"\
 342                 "punpcklwd %%mm5, %%mm5         \n\t"\
 343                 "punpcklwd %%mm4, %%mm4         \n\t"\
 344                 "paddw %%mm1, %%mm2             \n\t"\
 345                 "paddw %%mm1, %%mm5             \n\t"\
 346                 "paddw %%mm1, %%mm4             \n\t"\
 347                 "punpckhwd %%mm0, %%mm0         \n\t"\
 348                 "punpckhwd %%mm6, %%mm6         \n\t"\
 349                 "punpckhwd %%mm3, %%mm3         \n\t"\
 350                 "paddw %%mm7, %%mm0             \n\t"\
 351                 "paddw %%mm7, %%mm6             \n\t"\
 352                 "paddw %%mm7, %%mm3             \n\t"\
 353                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
 354                 "packuswb %%mm0, %%mm2          \n\t"\
 355                 "packuswb %%mm6, %%mm5          \n\t"\
 356                 "packuswb %%mm3, %%mm4          \n\t"\
 357                 "pxor %%mm7, %%mm7              \n\t"
 358
 359 // do vertical chrominance interpolation
 /*
  * YSCALEYUV2RGB1b: like the single-line conversion, but chroma is taken as
  * the average of two lines: uvbuf0 (%2) and uvbuf1 (%3) are added and the
  * sum is shifted right by 5 (i.e. average, then the usual >>4).  Luma is
  * read from buf0 (%0) only.  8 pixels are produced per pass, with each
  * chroma sample shared by two adjacent pixels.
  *
  * The chroma shift is a logical psrlw on the 16-bit sum; the existing
  * FIXME notes that the addition might overflow.
  *
  * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * The macro opens label "1:" but never jumps back to it.
  * NOTE(review): presumably it is meant to be followed by one of the
  * WRITEBGR macros, which contain the closing "jb 1b" -- confirm at the
  * call sites.
  *
  * Registers written: eax, mm0-mm7.
  * Constants reached through MANGLE() are defined outside this chunk.
  */
 360 #define YSCALEYUV2RGB1b \
 361                 "xorl %%eax, %%eax              \n\t"\
 362                 ".balign 16                     \n\t"\
 363                 "1:                             \n\t"\
 364                 "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
 365                 "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
 366                 "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
 367                 "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
 368                 "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
 369                 "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
 370                 "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
 371                 "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
 372                 "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
 373                 "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
 374                 "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 375                 "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
 376                 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
 377                 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
 378         /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
 379                 "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
 380                 "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4]*/\
 381                 "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
 382                 "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
 383                 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
 384                 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
 385                 "psubw "MANGLE(w80)", %%mm1     \n\t" /* 8(Y-16)*/\
 386                 "psubw "MANGLE(w80)", %%mm7     \n\t" /* 8(Y-16)*/\
 387                 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
 388                 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
 389         /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
 390                 "paddw %%mm3, %%mm4             \n\t"\
 391                 "movq %%mm2, %%mm0              \n\t"\
 392                 "movq %%mm5, %%mm6              \n\t"\
 393                 "movq %%mm4, %%mm3              \n\t"\
 394                 "punpcklwd %%mm2, %%mm2         \n\t"\
 395                 "punpcklwd %%mm5, %%mm5         \n\t"\
 396                 "punpcklwd %%mm4, %%mm4         \n\t"\
 397                 "paddw %%mm1, %%mm2             \n\t"\
 398                 "paddw %%mm1, %%mm5             \n\t"\
 399                 "paddw %%mm1, %%mm4             \n\t"\
 400                 "punpckhwd %%mm0, %%mm0         \n\t"\
 401                 "punpckhwd %%mm6, %%mm6         \n\t"\
 402                 "punpckhwd %%mm3, %%mm3         \n\t"\
 403                 "paddw %%mm7, %%mm0             \n\t"\
 404                 "paddw %%mm7, %%mm6             \n\t"\
 405                 "paddw %%mm7, %%mm3             \n\t"\
 406                 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
 407                 "packuswb %%mm0, %%mm2          \n\t"\
 408                 "packuswb %%mm6, %%mm5          \n\t"\
 409                 "packuswb %%mm3, %%mm4          \n\t"\
 410                 "pxor %%mm7, %%mm7              \n\t"
 411
 /*
  * WRITEBGR32: asm text that stores 8 pixels as 32-bit values and closes
  * the enclosing pixel loop.
  *
  * Inputs: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * %4 - destination base; the four quadwords are written to
  *      (%4 + eax*4) + 0/8/16/24, i.e. 4 bytes per pixel.
  * %5 - bound compared (unsigned, jb) against eax after "addl $8".
  *
  * The byte interleave yields B, G, R, 0 in ascending memory order for each
  * pixel (the inline comments show the bytes most-significant first).
  * The trailing "jb 1b" jumps to a label "1:" that is not defined here; it
  * must have been opened by the asm text placed in front of this macro.
  *
  * Registers written: eax, mm0, mm1, mm2, mm3, mm5, mm6.
  */
 412 #define WRITEBGR32 \
 413                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
 414                         "movq %%mm2, %%mm1              \n\t" /* B */\
 415                         "movq %%mm5, %%mm6              \n\t" /* R */\
 416                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
 417                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
 418                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
 419                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
 420                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
 421                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
 422                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
 423                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
 424                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
 425                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
 426 \
 427                         MOVNTQ(%%mm0, (%4, %%eax, 4))\
 428                         MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
 429                         MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
 430                         MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
 431 \
 432                         "addl $8, %%eax                 \n\t"\
 433                         "cmpl %5, %%eax                 \n\t"\
 434                         " jb 1b                         \n\t"
 435
 /*
  * WRITEBGR16: asm text that stores 8 pixels as 16-bit values and closes
  * the enclosing pixel loop.
  *
  * Inputs: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * %4 - destination base; two quadwords are written to (%4 + eax*2) + 0/8,
  *      i.e. 2 bytes per pixel.
  * %5 - bound compared (unsigned, jb) against eax after "addl $8".
  *
  * Per pixel: B is masked with bF8 and shifted right by 3 into the low bits,
  * R is masked with bF8 and becomes the high byte of the word, and G is
  * masked with bFC, widened to a word and shifted left by 3 before being
  * OR-ed in.
  * NOTE(review): bF8/bFC are defined outside this chunk; presumably they are
  * per-byte 0xF8 / 0xFC masks, which would give a 5-6-5 layout -- confirm.
  * The trailing "jb 1b" jumps to a label "1:" that is not defined here.
  *
  * Registers written: eax, mm1, mm2, mm3, mm4, mm5.
  */
 436 #define WRITEBGR16 \
 437                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
 438                         "pand "MANGLE(bFC)", %%mm4      \n\t" /* G */\
 439                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
 440                         "psrlq $3, %%mm2                \n\t"\
 441 \
 442                         "movq %%mm2, %%mm1              \n\t"\
 443                         "movq %%mm4, %%mm3              \n\t"\
 444 \
 445                         "punpcklbw %%mm7, %%mm3         \n\t"\
 446                         "punpcklbw %%mm5, %%mm2         \n\t"\
 447                         "punpckhbw %%mm7, %%mm4         \n\t"\
 448                         "punpckhbw %%mm5, %%mm1         \n\t"\
 449 \
 450                         "psllq $3, %%mm3                \n\t"\
 451                         "psllq $3, %%mm4                \n\t"\
 452 \
 453                         "por %%mm3, %%mm2               \n\t"\
 454                         "por %%mm4, %%mm1               \n\t"\
 455 \
 456                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
 457                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
 458 \
 459                         "addl $8, %%eax                 \n\t"\
 460                         "cmpl %5, %%eax                 \n\t"\
 461                         " jb 1b                         \n\t"
 462
 /*
  * WRITEBGR15: asm text that stores 8 pixels as 16-bit values using the same
  * mask for all three channels, and closes the enclosing pixel loop.
  *
  * Inputs: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * %4 - destination base; two quadwords are written to (%4 + eax*2) + 0/8,
  *      i.e. 2 bytes per pixel.
  * %5 - bound compared (unsigned, jb) against eax after "addl $8".
  *
  * Differences from WRITEBGR16: G is masked with bF8 instead of bFC and
  * shifted left by 2 instead of 3, and R is additionally shifted right by 1
  * before it becomes the high byte of the word.
  * NOTE(review): bF8 is defined outside this chunk; presumably a per-byte
  * 0xF8 mask, which would give a 5-5-5 layout with the top bit clear --
  * confirm.
  * The trailing "jb 1b" jumps to a label "1:" that is not defined here.
  *
  * Registers written: eax, mm1, mm2, mm3, mm4, mm5.
  */
 463 #define WRITEBGR15 \
 464                         "pand "MANGLE(bF8)", %%mm2      \n\t" /* B */\
 465                         "pand "MANGLE(bF8)", %%mm4      \n\t" /* G */\
 466                         "pand "MANGLE(bF8)", %%mm5      \n\t" /* R */\
 467                         "psrlq $3, %%mm2                \n\t"\
 468                         "psrlq $1, %%mm5                \n\t"\
 469 \
 470                         "movq %%mm2, %%mm1              \n\t"\
 471                         "movq %%mm4, %%mm3              \n\t"\
 472 \
 473                         "punpcklbw %%mm7, %%mm3         \n\t"\
 474                         "punpcklbw %%mm5, %%mm2         \n\t"\
 475                         "punpckhbw %%mm7, %%mm4         \n\t"\
 476                         "punpckhbw %%mm5, %%mm1         \n\t"\
 477 \
 478                         "psllq $2, %%mm3                \n\t"\
 479                         "psllq $2, %%mm4                \n\t"\
 480 \
 481                         "por %%mm3, %%mm2               \n\t"\
 482                         "por %%mm4, %%mm1               \n\t"\
 483 \
 484                         MOVNTQ(%%mm2, (%4, %%eax, 2))\
 485                         MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
 486 \
 487                         "addl $8, %%eax                 \n\t"\
 488                         "cmpl %5, %%eax                 \n\t"\
 489                         " jb 1b                         \n\t"
 490
 /*
  * WRITEBGR24OLD: asm text that stores 8 pixels as packed 24-bit values
  * (24 bytes in three quadwords) and closes the enclosing pixel loop.
  *
  * Inputs: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
  * %5 - bound compared (unsigned, jb) against eax after "addl $8".
  *
  * The first part builds four quadwords of 0RGB dwords exactly as the
  * 32-bit writer does; the second part squeezes out the zero bytes with
  * shifts and the bm00000111 / bm11111000 / bm00001111 masks (defined
  * outside this chunk) so that the pixels become contiguous.
  *
  * Unlike the other writers, the destination is NOT taken from %4: the data
  * is stored through ebx, which is advanced by 24 per pass.
  * NOTE(review): ebx must therefore hold the destination pointer before the
  * loop starts -- confirm that every user sets it up.
  * The trailing "jb 1b" jumps to a label "1:" that is not defined here.
  *
  * Registers written: eax, ebx, mm0-mm6.
  */
 491 #define WRITEBGR24OLD \
 492                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
 493                         "movq %%mm2, %%mm1              \n\t" /* B */\
 494                         "movq %%mm5, %%mm6              \n\t" /* R */\
 495                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
 496                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
 497                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
 498                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
 499                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
 500                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
 501                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
 502                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
 503                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
 504                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
 505 \
 506                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
 507                         "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
 508                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
 509                         "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
 510                         "por %%mm4, %%mm0               \n\t" /* 00RGBRGB 0 */\
 511                         "movq %%mm2, %%mm4              \n\t" /* 0RGB0RGB 1 */\
 512                         "psllq $48, %%mm2               \n\t" /* GB000000 1 */\
 513                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
 514 \
 515                         "movq %%mm4, %%mm2              \n\t" /* 0RGB0RGB 1 */\
 516                         "psrld $16, %%mm4               \n\t" /* 000R000R 1 */\
 517                         "psrlq $24, %%mm2               \n\t" /* 0000RGB0 1.5 */\
 518                         "por %%mm4, %%mm2               \n\t" /* 000RRGBR 1 */\
 519                         "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
 520                         "movq %%mm1, %%mm4              \n\t" /* 0RGB0RGB 2 */\
 521                         "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
 522                         "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
 523                         "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
 524                         "por %%mm4, %%mm1               \n\t" /* 00RGBRGB 2 */\
 525                         "movq %%mm1, %%mm4              \n\t" /* 00RGBRGB 2 */\
 526                         "psllq $32, %%mm1               \n\t" /* BRGB0000 2 */\
 527                         "por %%mm1, %%mm2               \n\t" /* BRGBRGBR 1 */\
 528 \
 529                         "psrlq $32, %%mm4               \n\t" /* 000000RG 2.5 */\
 530                         "movq %%mm3, %%mm5              \n\t" /* 0RGB0RGB 3 */\
 531                         "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
 532                         "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
 533                         "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
 534                         "por %%mm5, %%mm3               \n\t" /* 00RGBRGB 3 */\
 535                         "psllq $16, %%mm3               \n\t" /* RGBRGB00 3 */\
 536                         "por %%mm4, %%mm3               \n\t" /* RGBRGBRG 2.5 */\
 537 \
 538                         MOVNTQ(%%mm0, (%%ebx))\
 539                         MOVNTQ(%%mm2, 8(%%ebx))\
 540                         MOVNTQ(%%mm3, 16(%%ebx))\
 541                         "addl $24, %%ebx                \n\t"\
 542 \
 543                         "addl $8, %%eax                 \n\t"\
 544                         "cmpl %5, %%eax                 \n\t"\
 545                         " jb 1b                         \n\t"
 546
 547 #define WRITEBGR24MMX \
 548                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0
                      *
                      * 24 bit writer built only from movq/punpck/shift/por (no pshufw).
                      * Interleaves 8 B, G and R bytes into 24 output bytes, stores them
                      * as three qwords at (ebx), 8(ebx) and 16(ebx), then advances ebx
                      * by 24 and eax by 8 and jumps back to local label 1 while eax is
                      * below asm operand 5 (dstW at the call sites in this section).
                      * Label 1 itself is not defined in this macro.
                      *
                      * The byte diagrams in the trailing comments are written most
                      * significant byte first; the digit after each diagram is the
                      * index of the 2-pixel group (0..3) held in that register.
                      */\
 549                         "movq %%mm2, %%mm1              \n\t" /* B */\
 550                         "movq %%mm5, %%mm6              \n\t" /* R */\
 551                         "punpcklbw %%mm4, %%mm2         \n\t" /* GBGBGBGB 0 */\
 552                         "punpcklbw %%mm7, %%mm5         \n\t" /* 0R0R0R0R 0 */\
 553                         "punpckhbw %%mm4, %%mm1         \n\t" /* GBGBGBGB 2 */\
 554                         "punpckhbw %%mm7, %%mm6         \n\t" /* 0R0R0R0R 2 */\
 555                         "movq %%mm2, %%mm0              \n\t" /* GBGBGBGB 0 */\
 556                         "movq %%mm1, %%mm3              \n\t" /* GBGBGBGB 2 */\
 557                         "punpcklwd %%mm5, %%mm0         \n\t" /* 0RGB0RGB 0 */\
 558                         "punpckhwd %%mm5, %%mm2         \n\t" /* 0RGB0RGB 1 */\
 559                         "punpcklwd %%mm6, %%mm1         \n\t" /* 0RGB0RGB 2 */\
 560                         "punpckhwd %%mm6, %%mm3         \n\t" /* 0RGB0RGB 3 */\
 561 /* keep an unshifted copy of every 2-pixel group (this overwrites G, R and the zero register) */\
 562                         "movq %%mm0, %%mm4              \n\t" /* 0RGB0RGB 0 */\
 563                         "movq %%mm2, %%mm6              \n\t" /* 0RGB0RGB 1 */\
 564                         "movq %%mm1, %%mm5              \n\t" /* 0RGB0RGB 2 */\
 565                         "movq %%mm3, %%mm7              \n\t" /* 0RGB0RGB 3 */\
 566 /* move the low pixel of each group into the top three bytes */\
 567                         "psllq $40, %%mm0               \n\t" /* RGB00000 0 */\
 568                         "psllq $40, %%mm2               \n\t" /* RGB00000 1 */\
 569                         "psllq $40, %%mm1               \n\t" /* RGB00000 2 */\
 570                         "psllq $40, %%mm3               \n\t" /* RGB00000 3 */\
 571 /* high dword of the shifted copy + high dword of the original: both pixels now adjacent */\
 572                         "punpckhdq %%mm4, %%mm0         \n\t" /* 0RGBRGB0 0 */\
 573                         "punpckhdq %%mm6, %%mm2         \n\t" /* 0RGBRGB0 1 */\
 574                         "punpckhdq %%mm5, %%mm1         \n\t" /* 0RGBRGB0 2 */\
 575                         "punpckhdq %%mm7, %%mm3         \n\t" /* 0RGBRGB0 3 */\
 576 /* output qword 0: group 0 plus the two low bytes of group 1 */\
 577                         "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
 578                         "movq %%mm2, %%mm6              \n\t" /* 0RGBRGB0 1 */\
 579                         "psllq $40, %%mm2               \n\t" /* GB000000 1 */\
 580                         "por %%mm2, %%mm0               \n\t" /* GBRGBRGB 0 */\
 581                         MOVNTQ(%%mm0, (%%ebx))\
 582 /* output qword 1: rest of group 1 plus the four low bytes of group 2 */\
 583                         "psrlq $24, %%mm6               \n\t" /* 0000RGBR 1 */\
 584                         "movq %%mm1, %%mm5              \n\t" /* 0RGBRGB0 2 */\
 585                         "psllq $24, %%mm1               \n\t" /* BRGB0000 2 */\
 586                         "por %%mm1, %%mm6               \n\t" /* BRGBRGBR 1 */\
 587                         MOVNTQ(%%mm6, 8(%%ebx))\
 588 /* output qword 2: rest of group 2 plus all of group 3 */\
 589                         "psrlq $40, %%mm5               \n\t" /* 000000RG 2 */\
 590                         "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
 591                         "por %%mm3, %%mm5               \n\t" /* RGBRGBRG 2 */\
 592                         MOVNTQ(%%mm5, 16(%%ebx))\
 593 \
 594                         "addl $24, %%ebx                \n\t"\
 595 \
 596                         "addl $8, %%eax                 \n\t"\
 597                         "cmpl %5, %%eax                 \n\t"\
 598                         " jb 1b                         \n\t"
 599
 600 #define WRITEBGR24MMX2 \
 601                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0
                      *
                      * pshufw-based 24 bit writer. Like WRITEBGR24MMX it turns 8 B, G
                      * and R bytes into three qwords stored at (ebx), 8(ebx) and
                      * 16(ebx), advances ebx by 24 and eax by 8, and jumps back to
                      * local label 1 while eax is below asm operand 5.
                      *
                      * Side effects on the inputs: mm0 and mm7 are loaded with the mask
                      * constants M24A and M24C (so mm7 is no longer zero afterwards),
                      * and the G register mm4 is shifted right by 8 in place before the
                      * second qword is built. M24B is used directly from memory.
                      * The mask values themselves are defined outside this section.
                      *
                      * Byte diagrams in the trailing comments are written most
                      * significant byte first.
                      */\
 602                         "movq "MANGLE(M24A)", %%mm0     \n\t"\
 603                         "movq "MANGLE(M24C)", %%mm7     \n\t"\
 604                         "pshufw $0x50, %%mm2, %%mm1     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
 605                         "pshufw $0x50, %%mm4, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
 606                         "pshufw $0x00, %%mm5, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
 607 \
 608                         "pand %%mm0, %%mm1              \n\t" /*    B2        B1       B0 */\
 609                         "pand %%mm0, %%mm3              \n\t" /*    G2        G1       G0 */\
 610                         "pand %%mm7, %%mm6              \n\t" /*       R1        R0       */\
 611 \
 612                         "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
 613                         "por %%mm1, %%mm6               \n\t"\
 614                         "por %%mm3, %%mm6               \n\t"\
 615                         MOVNTQ(%%mm6, (%%ebx))\
 616 /* second qword; G is shifted in place so its bytes line up with the masks */\
 617                         "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
 618                         "pshufw $0xA5, %%mm2, %%mm1     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
 619                         "pshufw $0x55, %%mm4, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
 620                         "pshufw $0xA5, %%mm5, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
 621 \
 622                         "pand "MANGLE(M24B)", %%mm1     \n\t" /* B5       B4        B3    */\
 623                         "pand %%mm7, %%mm3              \n\t" /*       G4        G3       */\
 624                         "pand %%mm0, %%mm6              \n\t" /*    R4        R3       R2 */\
 625 \
 626                         "por %%mm1, %%mm3               \n\t" /* B5    G4 B4     G3 B3    */\
 627                         "por %%mm3, %%mm6               \n\t"\
 628                         MOVNTQ(%%mm6, 8(%%ebx))\
 629 /* third qword; mm4 still holds the shifted G from above */\
 630                         "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
 631                         "pshufw $0xFA, %%mm4, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
 632                         "pshufw $0xFA, %%mm5, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
 633 \
 634                         "pand %%mm7, %%mm1              \n\t" /*       B7        B6       */\
 635                         "pand %%mm0, %%mm3              \n\t" /*    G7        G6       G5 */\
 636                         "pand "MANGLE(M24B)", %%mm6     \n\t" /* R7       R6        R5    */\
 637 \
 638                         "por %%mm1, %%mm3               \n\t"\
 639                         "por %%mm3, %%mm6               \n\t"\
 640                         MOVNTQ(%%mm6, 16(%%ebx))\
 641 \
 642                         "addl $24, %%ebx                \n\t"\
 643 \
 644                         "addl $8, %%eax                 \n\t"\
 645                         "cmpl %5, %%eax                 \n\t"\
 646                         " jb 1b                         \n\t"
 647
 648 #ifdef HAVE_MMX2 /* WRITEBGR24 = the 24 bit writer matching the target CPU */
 649 #undef WRITEBGR24
 650 #define WRITEBGR24 WRITEBGR24MMX2 /* pshufw-based variant */
 651 #else /* !HAVE_MMX2 */
 652 #undef WRITEBGR24
 653 #define WRITEBGR24 WRITEBGR24MMX /* shift/unpack-only variant */
 654 #endif /* HAVE_MMX2 */
 655
     /**
      * Produces one line of planar YUV output from several filtered source lines.
      *
      * Luma always goes to dest. Chroma is produced only when uDest is non-NULL,
      * in which case both uDest and vDest are written.
      *
      * With HAVE_MMX the work is done by three runs of the YSCALEYUV2YV12X asm
      * macro. Each run is handed the negated filter size, the source pointer
      * array and the MMX coefficient table (both advanced past their last used
      * entry), the destination, and the width (dstW>>1 for chroma, dstW for
      * luma). lumFilter and chrFilter are not used on this path.
      *
      * Without HAVE_MMX everything is forwarded to yuv2yuvXinC(), which does not
      * receive lumMmxFilter / chrMmxFilter.
      *
      * NOTE(review): the macro argument (0 for U, 4096 for V) looks like the byte
      * offset of the V samples inside each chroma line (4096 bytes = 2048
      * int16_t, matching the "+ 2048" used in yuv2yuv1) - confirm against the
      * macro definition, which is outside this section.
      */
 656 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 657                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 658                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
 659                                     int16_t * lumMmxFilter, int16_t * chrMmxFilter)
 660 {
 661 #ifdef HAVE_MMX
 662         if(uDest != NULL) // chroma is optional: both planes are skipped when uDest is NULL
 663         {
 664                 asm volatile(
 665                                 YSCALEYUV2YV12X(0) // U plane
 666                                 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
 667                                 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
 668                                 : "%eax", "%edx", "%esi"
 669                         );
 670
 671                 asm volatile(
 672                                 YSCALEYUV2YV12X(4096) // V plane, same sources and coefficients as U
 673                                 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
 674                                 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
 675                                 : "%eax", "%edx", "%esi"
 676                         );
 677         }
 678
 679         asm volatile(
 680                         YSCALEYUV2YV12X(0) // luma plane, full width
 681                         :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
 682                            "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
 683                         : "%eax", "%edx", "%esi"
 684                 );
 685 #else /* !HAVE_MMX */
 686 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
 687             chrFilter, chrSrc, chrFilterSize,
 688             dest, uDest, vDest, dstW);
 689 #endif /* HAVE_MMX */
 690 }
 691
 692 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
 693                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
 694 {
 695 #ifdef HAVE_MMX
 696         if(uDest != NULL)
 697         {
 698                 asm volatile(
 699                                 YSCALEYUV2YV121
 700                                 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
 701                                 "g" (-(dstW>>1))
 702                                 : "%eax"
 703                         );
 704
 705                 asm volatile(
 706                                 YSCALEYUV2YV121
 707                                 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
 708                                 "g" (-(dstW>>1))
 709                                 : "%eax"
 710                         );
 711         }
 712
 713         asm volatile(
 714                 YSCALEYUV2YV121
 715                 :: "r" (lumSrc + dstW), "r" (dest + dstW),
 716                 "g" (-dstW)
 717                 : "%eax"
 718         );
 719 #else
 720         //FIXME Optimize (just quickly writen not opti..)
 721         //FIXME replace MINMAX with LUTs
 722         int i;
 723         for(i=0; i<dstW; i++)
 724         {
 725                 int val= lumSrc[i]>>7;
 726
 727                 dest[i]= MIN(MAX(val>>19, 0), 255);
 728         }
 729
 730         if(uDest != NULL)
 731                 for(i=0; i<(dstW>>1); i++)
 732                 {
 733                         int u=chrSrc[i]>>7;
 734                         int v=chrSrc[i + 2048]>>7;
 735
 736                         uDest[i]= MIN(MAX(u>>19, 0), 255);
 737                         vDest[i]= MIN(MAX(v>>19, 0), 255);
 738                 }
 739 #endif
 740 }
 741
 742
 743 /**
 744  * vertical scale YV12 to RGB
      *
      * Writes one output line to dest in the packed format selected by dstFormat.
      *
      * With HAVE_MMX only IMGFMT_BGR32, IMGFMT_BGR24, IMGFMT_BGR15 and
      * IMGFMT_BGR16 are handled; any other dstFormat falls through the if/else
      * chain and nothing is written. Every handled format runs the
      * YSCALEYUV2RGBX asm macro followed by the matching WRITEBGR* macro, with
      * the same operand list:
      *   %0 -lumFilterSize   %1 -chrFilterSize
      *   %2 lumMmxFilter+lumFilterSize*4   %3 chrMmxFilter+chrFilterSize*4
      *   %4 dest   %5 dstW
      *   %6 lumSrc+lumFilterSize   %7 chrSrc+chrFilterSize
      * lumFilter and chrFilter are not used on this path.
      * When DITHER1XBPP is defined, the 15 and 16 bit paths add per-channel
      * dither constants with unsigned saturation before packing.
      *
      * Without HAVE_MMX everything is forwarded to yuv2rgbXinC(), which does not
      * receive lumMmxFilter / chrMmxFilter.
      *
      * The SWS_FULL_UV_IPOL branch is commented out; this function has no flags
      * parameter.
 745  */
 746 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 747                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 748                             uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
 749 {
 750 /*      if(flags&SWS_FULL_UV_IPOL)
 751         {
 752 //FIXME
 753         }//FULL_UV_IPOL
 754         else*/
 755         {
 756 #ifdef HAVE_MMX
 757                 if(dstFormat == IMGFMT_BGR32) //FIXME untested
 758                 {
 759                         asm volatile(
 760                                 YSCALEYUV2RGBX
 761                                 WRITEBGR32
 762
 763                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 764                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 765                            "r" (dest), "m" (dstW),
 766                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 767                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 768                         );
 769                 }
 770                 else if(dstFormat == IMGFMT_BGR24) //FIXME untested
 771                 {
 772                         asm volatile(
 773                                 YSCALEYUV2RGBX
 774                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t" //FIXME optimize (ebx = 3*eax)
 775                                 "addl %4, %%ebx                 \n\t" // ebx = dest + 3*eax, the pointer WRITEBGR24 stores through
 776                                 WRITEBGR24
 777
 778                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 779                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 780                            "r" (dest), "m" (dstW),
 781                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 782                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 783                         );
 784                 }
 785                 else if(dstFormat==IMGFMT_BGR15)
 786                 {
 787                         asm volatile(
 788                                 YSCALEYUV2RGBX
 789                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 790 #ifdef DITHER1XBPP
 791                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
 792                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
 793                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
 794 #endif
 795
 796                                 WRITEBGR15
 797
 798                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 799                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 800                            "r" (dest), "m" (dstW),
 801                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 802                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 803                         );
 804                 }
 805                 else if(dstFormat==IMGFMT_BGR16)
 806                 {
 807                         asm volatile(
 808                                 YSCALEYUV2RGBX
 809                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 810 #ifdef DITHER1XBPP
 811                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
 812                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" // green uses the 6 bit dither constant here
 813                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
 814 #endif
 815
 816                                 WRITEBGR16
 817
 818                         :: "m" (-lumFilterSize), "m" (-chrFilterSize),
 819                            "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
 820                            "r" (dest), "m" (dstW),
 821                            "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
 822                         : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 823                         );
 824                 }
 825 #else /* !HAVE_MMX */
 826 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
 827             chrFilter, chrSrc, chrFilterSize,
 828             dest, dstW, dstFormat);
 829
 830 #endif /* HAVE_MMX */
 831         } //!FULL_UV_IPOL
 832 }
 833
 834
 835 /**
 836  * vertical bilinear scale YV12 to RGB
 837  */
 838 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
 839                             uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
 840 {
 841         int yalpha1=yalpha^4095;
 842         int uvalpha1=uvalpha^4095;
 843
 844         if(flags&SWS_FULL_CHR_H_INT)
 845         {
 846
 847 #ifdef HAVE_MMX
 848                 if(dstFormat==IMGFMT_BGR32)
 849                 {
 850                         asm volatile(
 851
 852
 853 FULL_YSCALEYUV2RGB
 854                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
 855                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
 856
 857                         "movq %%mm3, %%mm1              \n\t"
 858                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
 859                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
 860
 861                         MOVNTQ(%%mm3, (%4, %%eax, 4))
 862                         MOVNTQ(%%mm1, 8(%4, %%eax, 4))
 863
 864                         "addl $4, %%eax                 \n\t"
 865                         "cmpl %5, %%eax                 \n\t"
 866                         " jb 1b                         \n\t"
 867
 868
 869                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
 870                         "m" (yalpha1), "m" (uvalpha1)
 871                         : "%eax"
 872                         );
 873                 }
 874                 else if(dstFormat==IMGFMT_BGR24)
 875                 {
 876                         asm volatile(
 877
 878 FULL_YSCALEYUV2RGB
 879
 880                                                                 // lsb ... msb
 881                         "punpcklbw %%mm1, %%mm3         \n\t" // BGBGBGBG
 882                         "punpcklbw %%mm7, %%mm0         \n\t" // R0R0R0R0
 883
 884                         "movq %%mm3, %%mm1              \n\t"
 885                         "punpcklwd %%mm0, %%mm3         \n\t" // BGR0BGR0
 886                         "punpckhwd %%mm0, %%mm1         \n\t" // BGR0BGR0
 887
 888                         "movq %%mm3, %%mm2              \n\t" // BGR0BGR0
 889                         "psrlq $8, %%mm3                \n\t" // GR0BGR00
 890                         "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
 891                         "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
 892                         "por %%mm2, %%mm3               \n\t" // BGRBGR00
 893                         "movq %%mm1, %%mm2              \n\t"
 894                         "psllq $48, %%mm1               \n\t" // 000000BG
 895                         "por %%mm1, %%mm3               \n\t" // BGRBGRBG
 896
 897                         "movq %%mm2, %%mm1              \n\t" // BGR0BGR0
 898                         "psrld $16, %%mm2               \n\t" // R000R000
 899                         "psrlq $24, %%mm1               \n\t" // 0BGR0000
 900                         "por %%mm2, %%mm1               \n\t" // RBGRR000
 901
 902                         "movl %4, %%ebx                 \n\t"
 903                         "addl %%eax, %%ebx              \n\t"
 904
 905 #ifdef HAVE_MMX2
 906                         //FIXME Alignment
 907                         "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
 908                         "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
 909 #else
 910                         "movd %%mm3, (%%ebx, %%eax, 2)  \n\t"
 911                         "psrlq $32, %%mm3               \n\t"
 912                         "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
 913                         "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
 914 #endif
 915                         "addl $4, %%eax                 \n\t"
 916                         "cmpl %5, %%eax                 \n\t"
 917                         " jb 1b                         \n\t"
 918
 919                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
 920                         "m" (yalpha1), "m" (uvalpha1)
 921                         : "%eax", "%ebx"
 922                         );
 923                 }
 924                 else if(dstFormat==IMGFMT_BGR15)
 925                 {
 926                         asm volatile(
 927
 928 FULL_YSCALEYUV2RGB
 929 #ifdef DITHER1XBPP
 930                         "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
 931                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
 932                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
 933 #endif
 934                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
 935                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
 936                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
 937
 938                         "psrlw $3, %%mm3                \n\t"
 939                         "psllw $2, %%mm1                \n\t"
 940                         "psllw $7, %%mm0                \n\t"
 941                         "pand "MANGLE(g15Mask)", %%mm1  \n\t"
 942                         "pand "MANGLE(r15Mask)", %%mm0  \n\t"
 943
 944                         "por %%mm3, %%mm1               \n\t"
 945                         "por %%mm1, %%mm0               \n\t"
 946
 947                         MOVNTQ(%%mm0, (%4, %%eax, 2))
 948
 949                         "addl $4, %%eax                 \n\t"
 950                         "cmpl %5, %%eax                 \n\t"
 951                         " jb 1b                         \n\t"
 952
 953                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
 954                         "m" (yalpha1), "m" (uvalpha1)
 955                         : "%eax"
 956                         );
 957                 }
 958                 else if(dstFormat==IMGFMT_BGR16)
 959                 {
 960                         asm volatile(
 961
 962 FULL_YSCALEYUV2RGB
 963 #ifdef DITHER1XBPP
 964                         "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
 965                         "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
 966                         "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
 967 #endif
 968                         "punpcklbw %%mm7, %%mm1         \n\t" // 0G0G0G0G
 969                         "punpcklbw %%mm7, %%mm3         \n\t" // 0B0B0B0B
 970                         "punpcklbw %%mm7, %%mm0         \n\t" // 0R0R0R0R
 971
 972                         "psrlw $3, %%mm3                \n\t"
 973                         "psllw $3, %%mm1                \n\t"
 974                         "psllw $8, %%mm0                \n\t"
 975                         "pand "MANGLE(g16Mask)", %%mm1  \n\t"
 976                         "pand "MANGLE(r16Mask)", %%mm0  \n\t"
 977
 978                         "por %%mm3, %%mm1               \n\t"
 979                         "por %%mm1, %%mm0               \n\t"
 980
 981                         MOVNTQ(%%mm0, (%4, %%eax, 2))
 982
 983                         "addl $4, %%eax                 \n\t"
 984                         "cmpl %5, %%eax                 \n\t"
 985                         " jb 1b                         \n\t"
 986
 987                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
 988                         "m" (yalpha1), "m" (uvalpha1)
 989                         : "%eax"
 990                         );
 991                 }
 992 #else
 993                 if(dstFormat==IMGFMT_BGR32)
 994                 {
 995                         int i;
 996 #ifdef WORDS_BIGENDIAN
 997                         dest++;
 998 #endif
 999                         for(i=0;i<dstW;i++){
1000                                 // vertical linear interpolation && yuv2rgb in a single step:
1001                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1002                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1003                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1004                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1005                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1006                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1007                                 dest+= 4;
1008                         }
1009                 }
1010                 else if(dstFormat==IMGFMT_BGR24)
1011                 {
1012                         int i;
1013                         for(i=0;i<dstW;i++){
1014                                 // vertical linear interpolation && yuv2rgb in a single step:
1015                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1016                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1017                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1018                                 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1019                                 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1020                                 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1021                                 dest+= 3;
1022                         }
1023                 }
1024                 else if(dstFormat==IMGFMT_BGR16)
1025                 {
1026                         int i;
1027                         for(i=0;i<dstW;i++){
1028                                 // vertical linear interpolation && yuv2rgb in a single step:
1029                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1030                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1031                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1032
1033                                 ((uint16_t*)dest)[i] =
1034                                         clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1035                                         clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1036                                         clip_table16r[(Y + yuvtab_3343[V]) >>13];
1037                         }
1038                 }
1039                 else if(dstFormat==IMGFMT_BGR15)
1040                 {
1041                         int i;
1042                         for(i=0;i<dstW;i++){
1043                                 // vertical linear interpolation && yuv2rgb in a single step:
1044                                 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1045                                 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1046                                 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1047
1048                                 ((uint16_t*)dest)[i] =
1049                                         clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1050                                         clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1051                                         clip_table15r[(Y + yuvtab_3343[V]) >>13];
1052                         }
1053                 }
1054 #endif
1055         }//FULL_UV_IPOL
1056         else
1057         {
1058 #ifdef HAVE_MMX
1059                 if(dstFormat==IMGFMT_BGR32)
1060                 {
1061                         asm volatile(
1062                                 YSCALEYUV2RGB
1063                                 WRITEBGR32
1064
1065                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1066                         "m" (yalpha1), "m" (uvalpha1)
1067                         : "%eax"
1068                         );
1069                 }
1070                 else if(dstFormat==IMGFMT_BGR24)
1071                 {
1072                         asm volatile(
1073                                 "movl %4, %%ebx                 \n\t"
1074                                 YSCALEYUV2RGB
1075                                 WRITEBGR24
1076
1077                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1078                         "m" (yalpha1), "m" (uvalpha1)
1079                         : "%eax", "%ebx"
1080                         );
1081                 }
1082                 else if(dstFormat==IMGFMT_BGR15)
1083                 {
1084                         asm volatile(
1085                                 YSCALEYUV2RGB
1086                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1087 #ifdef DITHER1XBPP
1088                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1089                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1090                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1091 #endif
1092
1093                                 WRITEBGR15
1094
1095                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1096                         "m" (yalpha1), "m" (uvalpha1)
1097                         : "%eax"
1098                         );
1099                 }
1100                 else if(dstFormat==IMGFMT_BGR16)
1101                 {
1102                         asm volatile(
1103                                 YSCALEYUV2RGB
1104                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1105 #ifdef DITHER1XBPP
1106                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1107                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1108                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1109 #endif
1110
1111                                 WRITEBGR16
1112
1113                         :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1114                         "m" (yalpha1), "m" (uvalpha1)
1115                         : "%eax"
1116                         );
1117                 }
1118 #else
1119                 if(dstFormat==IMGFMT_BGR32)
1120                 {
1121                         int i;
1122 #ifdef WORDS_BIGENDIAN
1123                         dest++;
1124 #endif
1125                         for(i=0; i<dstW-1; i+=2){
1126                                 // vertical linear interpolation && yuv2rgb in a single step:
1127                                 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1128                                 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1129                                 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1130                                 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1131
1132                                 int Cb= yuvtab_40cf[U];
1133                                 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1134                                 int Cr= yuvtab_3343[V];
1135
1136                                 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1137                                 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1138                                 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1139
1140                                 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1141                                 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1142                                 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1143                         }
1144                 }
1145                 else if(dstFormat==IMGFMT_BGR24)
1146                 {
1147                         int i;
1148                         for(i=0; i<dstW-1; i+=2){
1149                                 // vertical linear interpolation && yuv2rgb in a single step:
1150                                 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1151                                 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1152                                 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1153                                 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1154
1155                                 int Cb= yuvtab_40cf[U];
1156                                 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1157                                 int Cr= yuvtab_3343[V];
1158
1159                                 dest[0]=clip_table[((Y1 + Cb) >>13)];
1160                                 dest[1]=clip_table[((Y1 + Cg) >>13)];
1161                                 dest[2]=clip_table[((Y1 + Cr) >>13)];
1162
1163                                 dest[3]=clip_table[((Y2 + Cb) >>13)];
1164                                 dest[4]=clip_table[((Y2 + Cg) >>13)];
1165                                 dest[5]=clip_table[((Y2 + Cr) >>13)];
1166                                 dest+=6;
1167                         }
1168                 }
1169                 else if(dstFormat==IMGFMT_BGR16)
1170                 {
1171                         int i;
1172 #ifdef DITHER1XBPP
1173                         static int ditherb1=1<<14;
1174                         static int ditherg1=1<<13;
1175                         static int ditherr1=2<<14;
1176                         static int ditherb2=3<<14;
1177                         static int ditherg2=3<<13;
1178                         static int ditherr2=0<<14;
1179
1180                         ditherb1 ^= (1^2)<<14;
1181                         ditherg1 ^= (1^2)<<13;
1182                         ditherr1 ^= (1^2)<<14;
1183                         ditherb2 ^= (3^0)<<14;
1184                         ditherg2 ^= (3^0)<<13;
1185                         ditherr2 ^= (3^0)<<14;
1186 #else
1187                         const int ditherb1=0;
1188                         const int ditherg1=0;
1189                         const int ditherr1=0;
1190                         const int ditherb2=0;
1191                         const int ditherg2=0;
1192                         const int ditherr2=0;
1193 #endif
1194                         for(i=0; i<dstW-1; i+=2){
1195                                 // vertical linear interpolation && yuv2rgb in a single step:
1196                                 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1197                                 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1198                                 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1199                                 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1200
1201                                 int Cb= yuvtab_40cf[U];
1202                                 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1203                                 int Cr= yuvtab_3343[V];
1204
1205                                 ((uint16_t*)dest)[i] =
1206                                         clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1207                                         clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1208                                         clip_table16r[(Y1 + Cr + ditherr1) >>13];
1209
1210                                 ((uint16_t*)dest)[i+1] =
1211                                         clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1212                                         clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1213                                         clip_table16r[(Y2 + Cr + ditherr2) >>13];
1214                         }
1215                 }
1216                 else if(dstFormat==IMGFMT_BGR15)
1217                 {
1218                         int i;
1219 #ifdef DITHER1XBPP
1220                         static int ditherb1=1<<14;
1221                         static int ditherg1=1<<14;
1222                         static int ditherr1=2<<14;
1223                         static int ditherb2=3<<14;
1224                         static int ditherg2=3<<14;
1225                         static int ditherr2=0<<14;
1226
1227                         ditherb1 ^= (1^2)<<14;
1228                         ditherg1 ^= (1^2)<<14;
1229                         ditherr1 ^= (1^2)<<14;
1230                         ditherb2 ^= (3^0)<<14;
1231                         ditherg2 ^= (3^0)<<14;
1232                         ditherr2 ^= (3^0)<<14;
1233 #else
1234                         const int ditherb1=0;
1235                         const int ditherg1=0;
1236                         const int ditherr1=0;
1237                         const int ditherb2=0;
1238                         const int ditherg2=0;
1239                         const int ditherr2=0;
1240 #endif
1241                         for(i=0; i<dstW-1; i+=2){
1242                                 // vertical linear interpolation && yuv2rgb in a single step:
1243                                 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1244                                 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1245                                 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1246                                 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1247
1248                                 int Cb= yuvtab_40cf[U];
1249                                 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1250                                 int Cr= yuvtab_3343[V];
1251
1252                                 ((uint16_t*)dest)[i] =
1253                                         clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1254                                         clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1255                                         clip_table15r[(Y1 + Cr + ditherr1) >>13];
1256
1257                                 ((uint16_t*)dest)[i+1] =
1258                                         clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1259                                         clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1260                                         clip_table15r[(Y2 + Cr + ditherr2) >>13];
1261                         }
1262                 }
1263 #endif
1264         } //!FULL_UV_IPOL
1265 }
1266
1267 /**
1268  * YV12 to RGB without scaling or interpolating
1269  */
/*
 * Converts one line of 16 bit luma/chroma samples to packed BGR using a single
 * luma line (buf0) and a weighted blend of two chroma lines (uvbuf0, uvbuf1).
 *
 * buf0       luma samples; the C path uses buf0[i]>>7 as the table index
 * uvbuf0/1   chroma lines; U is read at index i>>1 and V at (i>>1)+2048 of the
 *            same buffer
 * dest       output pixels (BGR32, BGR24, BGR16 or BGR15, chosen by dstFormat)
 * dstW       output width in pixels; the C loops run while i<dstW-1 in steps of
 *            two, so a trailing odd pixel is not written by them
 * uvalpha    weight of uvbuf1; uvbuf0 is weighted with uvalpha^4095
 *            (assumes uvalpha is within 0..4095 -- TODO confirm against caller)
 * dstFormat  one of the IMGFMT_BGR* values tested below; in the code below any
 *            other value writes nothing
 * flags      when SWS_FULL_CHR_H_INT is set the work is handed to yuv2rgb2
 */
1270 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1271                             uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
1272 {
             // complementary chroma weight: for uvalpha in 0..4095 this equals 4095-uvalpha
1273         int uvalpha1=uvalpha^4095;
             // only referenced as an asm operand in the HAVE_MMX build
1274         const int yalpha1=0;
1275
1276         if(flags&SWS_FULL_CHR_H_INT)
1277         {
                     // reuse the two-line converter: buf0 is passed as both luma lines with yalpha 0
1278                 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
1279                 return;
1280         }
1281
1282 #ifdef HAVE_MMX
             // asm operands used below: %0,%1=buf0  %2=uvbuf0  %3=uvbuf1  %4=dest  %5=dstW
             // %6=yalpha1  %7=uvalpha1. The YSCALEYUV2RGB1* and WRITEBGR* macros are defined
             // elsewhere in this file.
1283         if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1284         {
1285                 if(dstFormat==IMGFMT_BGR32)
1286                 {
1287                         asm volatile(
1288                                 YSCALEYUV2RGB1
1289                                 WRITEBGR32
1290                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1291                         "m" (yalpha1), "m" (uvalpha1)
1292                         : "%eax"
1293                         );
1294                 }
1295                 else if(dstFormat==IMGFMT_BGR24)
1296                 {
                             // dest is a memory operand here and is loaded into ebx first,
                             // hence the additional ebx clobber
1297                         asm volatile(
1298                                 "movl %4, %%ebx                 \n\t"
1299                                 YSCALEYUV2RGB1
1300                                 WRITEBGR24
1301                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1302                         "m" (yalpha1), "m" (uvalpha1)
1303                         : "%eax", "%ebx"
1304                         );
1305                 }
1306                 else if(dstFormat==IMGFMT_BGR15)
1307                 {
1308                         asm volatile(
1309                                 YSCALEYUV2RGB1
1310                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1311 #ifdef DITHER1XBPP
                                     // add the dither constants (defined elsewhere) with unsigned saturation
1312                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1313                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1314                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1315 #endif
1316                                 WRITEBGR15
1317                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1318                         "m" (yalpha1), "m" (uvalpha1)
1319                         : "%eax"
1320                         );
1321                 }
1322                 else if(dstFormat==IMGFMT_BGR16)
1323                 {
1324                         asm volatile(
1325                                 YSCALEYUV2RGB1
1326                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327 #ifdef DITHER1XBPP
                                     // same as BGR15 except that green uses g6Dither
1328                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1329                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1330                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1331 #endif
1332
1333                                 WRITEBGR16
1334                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1335                         "m" (yalpha1), "m" (uvalpha1)
1336                         : "%eax"
1337                         );
1338                 }
1339         }
1340         else
1341         {
                     // uvalpha >= 2048: identical structure, but built on YSCALEYUV2RGB1b
1342                 if(dstFormat==IMGFMT_BGR32)
1343                 {
1344                         asm volatile(
1345                                 YSCALEYUV2RGB1b
1346                                 WRITEBGR32
1347                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1348                         "m" (yalpha1), "m" (uvalpha1)
1349                         : "%eax"
1350                         );
1351                 }
1352                 else if(dstFormat==IMGFMT_BGR24)
1353                 {
1354                         asm volatile(
1355                                 "movl %4, %%ebx                 \n\t"
1356                                 YSCALEYUV2RGB1b
1357                                 WRITEBGR24
1358                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1359                         "m" (yalpha1), "m" (uvalpha1)
1360                         : "%eax", "%ebx"
1361                         );
1362                 }
1363                 else if(dstFormat==IMGFMT_BGR15)
1364                 {
1365                         asm volatile(
1366                                 YSCALEYUV2RGB1b
1367                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368 #ifdef DITHER1XBPP
1369                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370                                 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1371                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1372 #endif
1373                                 WRITEBGR15
1374                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1375                         "m" (yalpha1), "m" (uvalpha1)
1376                         : "%eax"
1377                         );
1378                 }
1379                 else if(dstFormat==IMGFMT_BGR16)
1380                 {
1381                         asm volatile(
1382                                 YSCALEYUV2RGB1b
1383                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1384 #ifdef DITHER1XBPP
1385                                 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1386                                 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1387                                 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1388 #endif
1389
1390                                 WRITEBGR16
1391                         :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1392                         "m" (yalpha1), "m" (uvalpha1)
1393                         : "%eax"
1394                         );
1395                 }
1396         }
1397 #else
1398 //FIXME write 2 versions (for even & odd lines)
1399
             // Plain C fallback. Every iteration produces two pixels that share one U/V pair.
             // The chroma blend weights add up to 4095, so >>19 drops the 12 weight bits plus
             // the same 7 bits that are shifted out of the luma samples.
1400         if(dstFormat==IMGFMT_BGR32)
1401         {
1402                 int i;
1403 #ifdef WORDS_BIGENDIAN
                     // start one byte later so the three written bytes land at offsets 1..3 of each pixel
1404                 dest++;
1405 #endif
1406                 for(i=0; i<dstW-1; i+=2){
1407                         // luma straight from buf0 (no vertical luma blend), chroma blended, yuv2rgb via tables:
1408                         int Y1=yuvtab_2568[buf0[i]>>7];
1409                         int Y2=yuvtab_2568[buf0[i+1]>>7];
1410                         int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1411                         int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1412
1413                         int Cb= yuvtab_40cf[U];
1414                         int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1415                         int Cr= yuvtab_3343[V];
1416
                             // three of the four bytes of each pixel are written; the fourth is left untouched
1417                         dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1418                         dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1419                         dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1420
1421                         dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1422                         dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1423                         dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1424                 }
1425         }
1426         else if(dstFormat==IMGFMT_BGR24)
1427         {
1428                 int i;
1429                 for(i=0; i<dstW-1; i+=2){
1430                         // luma straight from buf0 (no vertical luma blend), chroma blended, yuv2rgb via tables:
1431                         int Y1=yuvtab_2568[buf0[i]>>7];
1432                         int Y2=yuvtab_2568[buf0[i+1]>>7];
1433                         int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1434                         int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1435
1436                         int Cb= yuvtab_40cf[U];
1437                         int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1438                         int Cr= yuvtab_3343[V];
1439
1440                         dest[0]=clip_table[((Y1 + Cb) >>13)];
1441                         dest[1]=clip_table[((Y1 + Cg) >>13)];
1442                         dest[2]=clip_table[((Y1 + Cr) >>13)];
1443
1444                         dest[3]=clip_table[((Y2 + Cb) >>13)];
1445                         dest[4]=clip_table[((Y2 + Cg) >>13)];
1446                         dest[5]=clip_table[((Y2 + Cr) >>13)];
                             // dest itself is advanced: 2 pixels * 3 bytes
1447                         dest+=6;
1448                 }
1449         }
1450         else if(dstFormat==IMGFMT_BGR16)
1451         {
1452                 int i;
1453 #ifdef DITHER1XBPP
                     // function-static dither offsets: they persist between calls and are
                     // toggled by the XORs below on every call
1454                 static int ditherb1=1<<14;
1455                 static int ditherg1=1<<13;
1456                 static int ditherr1=2<<14;
1457                 static int ditherb2=3<<14;
1458                 static int ditherg2=3<<13;
1459                 static int ditherr2=0<<14;
1460
1461                 ditherb1 ^= (1^2)<<14;
1462                 ditherg1 ^= (1^2)<<13;
1463                 ditherr1 ^= (1^2)<<14;
1464                 ditherb2 ^= (3^0)<<14;
1465                 ditherg2 ^= (3^0)<<13;
1466                 ditherr2 ^= (3^0)<<14;
1467 #else
1468                 const int ditherb1=0;
1469                 const int ditherg1=0;
1470                 const int ditherr1=0;
1471                 const int ditherb2=0;
1472                 const int ditherg2=0;
1473                 const int ditherr2=0;
1474 #endif
1475                 for(i=0; i<dstW-1; i+=2){
1476                         // luma straight from buf0 (no vertical luma blend), chroma blended, yuv2rgb via tables:
1477                         int Y1=yuvtab_2568[buf0[i]>>7];
1478                         int Y2=yuvtab_2568[buf0[i+1]>>7];
1479                         int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1480                         int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1481
1482                         int Cb= yuvtab_40cf[U];
1483                         int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1484                         int Cr= yuvtab_3343[V];
1485
                             // the per-channel tables yield values that are OR-ed into one 16 bit pixel
1486                         ((uint16_t*)dest)[i] =
1487                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1488                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1489                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
1490
1491                         ((uint16_t*)dest)[i+1] =
1492                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1493                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1494                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
1495                 }
1496         }
1497         else if(dstFormat==IMGFMT_BGR15)
1498         {
1499                 int i;
1500 #ifdef DITHER1XBPP
                     // same scheme as BGR16, except that green uses the same 1<<14 scale as blue and red
1501                 static int ditherb1=1<<14;
1502                 static int ditherg1=1<<14;
1503                 static int ditherr1=2<<14;
1504                 static int ditherb2=3<<14;
1505                 static int ditherg2=3<<14;
1506                 static int ditherr2=0<<14;
1507
1508                 ditherb1 ^= (1^2)<<14;
1509                 ditherg1 ^= (1^2)<<14;
1510                 ditherr1 ^= (1^2)<<14;
1511                 ditherb2 ^= (3^0)<<14;
1512                 ditherg2 ^= (3^0)<<14;
1513                 ditherr2 ^= (3^0)<<14;
1514 #else
1515                 const int ditherb1=0;
1516                 const int ditherg1=0;
1517                 const int ditherr1=0;
1518                 const int ditherb2=0;
1519                 const int ditherg2=0;
1520                 const int ditherr2=0;
1521 #endif
1522                 for(i=0; i<dstW-1; i+=2){
1523                         // luma straight from buf0 (no vertical luma blend), chroma blended, yuv2rgb via tables:
1524                         int Y1=yuvtab_2568[buf0[i]>>7];
1525                         int Y2=yuvtab_2568[buf0[i+1]>>7];
1526                         int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1527                         int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1528
1529                         int Cb= yuvtab_40cf[U];
1530                         int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1531                         int Cr= yuvtab_3343[V];
1532
1533                         ((uint16_t*)dest)[i] =
1534                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1535                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1536                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
1537
1538                         ((uint16_t*)dest)[i+1] =
1539                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1540                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1541                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
1542                 }
1543         }
1544 #endif
1545 }
1546
1547 //FIXME yuy2* can read up to 7 samples too many
1548
/*
 * Copies every second byte of src to dst: dst[i] = src[2*i] for i in [0, width).
 * Going by the function name these are presumably the luma bytes of packed
 * YUY2 data -- TODO confirm against caller.
 *
 * dst    output, width bytes
 * src    input, 2 bytes per output sample
 * width  number of output samples
 *
 * The MMX path produces 8 outputs (from 16 input bytes) per iteration and tests
 * its counter only after the body, so it always runs at least once and handles
 * width rounded up to a multiple of 8.
 */
1549 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1550 {
1551 #ifdef HAVE_MMX
             // %0=-width  %1=src+width*2 (end of input)  %2=dst+width (end of output)
             // eax counts from -width up towards 0 and is used as a negative index from both ends
1552         asm volatile(
                     // mm2 = byte mask, defined elsewhere (presumably 0x00FF in every 16 bit word)
1553                 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1554                 "movl %0, %%eax                 \n\t"
1555                 "1:                             \n\t"
1556                 "movq (%1, %%eax,2), %%mm0      \n\t"
1557                 "movq 8(%1, %%eax,2), %%mm1     \n\t"
1558                 "pand %%mm2, %%mm0              \n\t"
1559                 "pand %%mm2, %%mm1              \n\t"
                     // pack the 8 masked words into 8 bytes and store them
1560                 "packuswb %%mm1, %%mm0          \n\t"
1561                 "movq %%mm0, (%2, %%eax)        \n\t"
1562                 "addl $8, %%eax                 \n\t"
                     // loop while the counter is still negative
1563                 " js 1b                         \n\t"
1564                 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1565                 : "%eax"
1566         );
1567 #else
1568         int i;
1569         for(i=0; i<width; i++)
1570                 dst[i]= src[2*i];
1571 #endif
1572 }
1573
/*
 * Builds one U and one V output line from two packed input lines by averaging
 * them: per output sample i, byte 4*i+1 of both lines goes to dstU and byte
 * 4*i+3 of both lines goes to dstV.
 *
 * dstU, dstV  outputs, width bytes each
 * src1, src2  the two input lines, 4 bytes per output sample
 * width       number of output samples per plane
 *
 * The asm path (MMX2 or 3DNow) produces 4 samples per iteration and tests its
 * counter only after the body, so it always runs at least once and handles
 * width rounded up to a multiple of 4. NOTE(review): pavgb/pavgusb round the
 * average up per the instruction set references, while the C fallback
 * truncates, so the two paths may differ by one.
 */
1574 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1575 {
1576 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
             // %0=-width  %1=src1+width*4  %2=src2+width*4  %3=dstU+width  %4=dstV+width
             // eax counts from -width up towards 0 and indexes all buffers from their ends
1577         asm volatile(
                     // mm4 = byte mask, defined elsewhere (presumably 0x00FF in every 16 bit word)
1578                 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1579                 "movl %0, %%eax                 \n\t"
1580                 "1:                             \n\t"
1581                 "movq (%1, %%eax,4), %%mm0      \n\t"
1582                 "movq 8(%1, %%eax,4), %%mm1     \n\t"
1583                 "movq (%2, %%eax,4), %%mm2      \n\t"
1584                 "movq 8(%2, %%eax,4), %%mm3     \n\t"
                     // average the two lines bytewise
1585                 PAVGB(%%mm2, %%mm0)
1586                 PAVGB(%%mm3, %%mm1)
                     // keep the odd bytes of every word (offsets 1 and 3 of each 4 byte group)
1587                 "psrlw $8, %%mm0                \n\t"
1588                 "psrlw $8, %%mm1                \n\t"
1589                 "packuswb %%mm1, %%mm0          \n\t"
                     // split the interleaved bytes: high byte of each word -> mm0, low byte -> mm1
1590                 "movq %%mm0, %%mm1              \n\t"
1591                 "psrlw $8, %%mm0                \n\t"
1592                 "pand %%mm4, %%mm1              \n\t"
1593                 "packuswb %%mm0, %%mm0          \n\t"
1594                 "packuswb %%mm1, %%mm1          \n\t"
                     // 4 bytes each: mm0 to dstV (%4), mm1 to dstU (%3)
1595                 "movd %%mm0, (%4, %%eax)        \n\t"
1596                 "movd %%mm1, (%3, %%eax)        \n\t"
1597                 "addl $4, %%eax                 \n\t"
1598                 " js 1b                         \n\t"
1599                 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1600                 : "%eax"
1601         );
1602 #else
1603         int i;
1604         for(i=0; i<width; i++)
1605         {
                     // truncating average of the two lines
1606                 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1607                 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1608         }
1609 #endif
1610 }
1611
1612 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1613 {
1614 #ifdef HAVE_MMXFIXME
1615 #else
1616         int i;
1617         for(i=0; i<width; i++)
1618         {
1619                 int b= src[i*4+0];
1620                 int g= src[i*4+1];
1621                 int r= src[i*4+2];
1622
1623                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1624         }
1625 #endif
1626 }
1627
1628 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1629 {
1630 #ifdef HAVE_MMXFIXME
1631 #else
1632         int i;
1633         for(i=0; i<width; i++)
1634         {
1635                 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1636                 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1637                 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1638
1639                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1640                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1641         }
1642 #endif
1643 }
1644
/*
 * Computes one luma line from 3-byte-per-pixel input: bytes 0, 1 and 2 of each
 * pixel are taken as blue, green and red.
 *
 * dst    output, width bytes
 * src    input, 3 bytes per pixel
 * width  number of pixels
 *
 * The MMX path handles 8 pixels (24 input bytes) per iteration and tests its
 * counter only after the body, so it always runs at least once and handles
 * width rounded up to a multiple of 8. Its last 4 byte load at offset 21 also
 * touches one byte beyond those 24 input bytes.
 */
1645 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1646 {
1647 #ifdef HAVE_MMX
             // %0=src+width*3 (end of input)  %1=dst+width (end of output)  %2=-width
             // eax = negative pixel index into dst, ebx = 3*eax = negative byte offset into src
1648         asm volatile(
1649                 "movl %2, %%eax                 \n\t"
                     // mm6 = coefficients, mm5 = w1111, both defined elsewhere
                     // (presumably B,G,R weights plus a zero word, and four 16 bit ones -- TODO confirm)
1650                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1651                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1652                 "pxor %%mm7, %%mm7              \n\t"
1653                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1654                 ".balign 16                     \n\t"
1655                 "1:                             \n\t"
1656                 PREFETCH" 64(%0, %%ebx)         \n\t"
                     // pixels 0..3: load 4 bytes at a 3 byte stride and widen the bytes to words
1657                 "movd (%0, %%ebx), %%mm0        \n\t"
1658                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1659                 "punpcklbw %%mm7, %%mm0         \n\t"
1660                 "punpcklbw %%mm7, %%mm1         \n\t"
1661                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1662                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1663                 "punpcklbw %%mm7, %%mm2         \n\t"
1664                 "punpcklbw %%mm7, %%mm3         \n\t"
                     // multiply by the coefficients; yields two partial sums per pixel
1665                 "pmaddwd %%mm6, %%mm0           \n\t"
1666                 "pmaddwd %%mm6, %%mm1           \n\t"
1667                 "pmaddwd %%mm6, %%mm2           \n\t"
1668                 "pmaddwd %%mm6, %%mm3           \n\t"
1669 #ifndef FAST_BGR2YV12
1670                 "psrad $8, %%mm0                \n\t"
1671                 "psrad $8, %%mm1                \n\t"
1672                 "psrad $8, %%mm2                \n\t"
1673                 "psrad $8, %%mm3                \n\t"
1674 #endif
                     // pack to words, then pmaddwd with mm5 adds the two partial sums of every pixel
1675                 "packssdw %%mm1, %%mm0          \n\t"
1676                 "packssdw %%mm3, %%mm2          \n\t"
1677                 "pmaddwd %%mm5, %%mm0           \n\t"
1678                 "pmaddwd %%mm5, %%mm2           \n\t"
1679                 "packssdw %%mm2, %%mm0          \n\t"
1680                 "psraw $7, %%mm0                \n\t"
1681
                     // pixels 4..7: same sequence, result in mm4
1682                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1683                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1684                 "punpcklbw %%mm7, %%mm4         \n\t"
1685                 "punpcklbw %%mm7, %%mm1         \n\t"
1686                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1687                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1688                 "punpcklbw %%mm7, %%mm2         \n\t"
1689                 "punpcklbw %%mm7, %%mm3         \n\t"
1690                 "pmaddwd %%mm6, %%mm4           \n\t"
1691                 "pmaddwd %%mm6, %%mm1           \n\t"
1692                 "pmaddwd %%mm6, %%mm2           \n\t"
1693                 "pmaddwd %%mm6, %%mm3           \n\t"
1694 #ifndef FAST_BGR2YV12
1695                 "psrad $8, %%mm4                \n\t"
1696                 "psrad $8, %%mm1                \n\t"
1697                 "psrad $8, %%mm2                \n\t"
1698                 "psrad $8, %%mm3                \n\t"
1699 #endif
1700                 "packssdw %%mm1, %%mm4          \n\t"
1701                 "packssdw %%mm3, %%mm2          \n\t"
1702                 "pmaddwd %%mm5, %%mm4           \n\t"
1703                 "pmaddwd %%mm5, %%mm2           \n\t"
                     // advance the source offset by 8 pixels * 3 bytes
1704                 "addl $24, %%ebx                \n\t"
1705                 "packssdw %%mm2, %%mm4          \n\t"
1706                 "psraw $7, %%mm4                \n\t"
1707
                     // 8 results to bytes, add the offset constant (defined elsewhere), store 8 bytes
1708                 "packuswb %%mm4, %%mm0          \n\t"
1709                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1710
1711                 "movq %%mm0, (%1, %%eax)        \n\t"
1712                 "addl $8, %%eax                 \n\t"
                     // loop while the pixel counter is still negative
1713                 " js 1b                         \n\t"
1714                 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1715                 : "%eax", "%ebx"
1716         );
1717 #else
1718         int i;
1719         for(i=0; i<width; i++)
1720         {
1721                 int b= src[i*3+0];
1722                 int g= src[i*3+1];
1723                 int r= src[i*3+2];
1724
                     // weighted sum of the channels, scaled down by RGB2YUV_SHIFT, plus an offset of 16
1725                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1726         }
1727 #endif
1728 }
1729
1730 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1731 {
1732 #ifdef HAVE_MMX
1733         asm volatile(
1734                 "movl %4, %%eax                 \n\t"
1735                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1736                 "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1737                 "pxor %%mm7, %%mm7              \n\t"
1738                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1739                 "addl %%ebx, %%ebx              \n\t"
1740                 ".balign 16                     \n\t"
1741                 "1:                             \n\t"
1742                 PREFETCH" 64(%0, %%ebx)         \n\t"
1743                 PREFETCH" 64(%1, %%ebx)         \n\t"
1744 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1745                 "movq (%0, %%ebx), %%mm0        \n\t"
1746                 "movq (%1, %%ebx), %%mm1        \n\t"
1747                 "movq 6(%0, %%ebx), %%mm2       \n\t"
1748                 "movq 6(%1, %%ebx), %%mm3       \n\t"
1749                 PAVGB(%%mm1, %%mm0)
1750                 PAVGB(%%mm3, %%mm2)
1751                 "movq %%mm0, %%mm1              \n\t"
1752                 "movq %%mm2, %%mm3              \n\t"
1753                 "psrlq $24, %%mm0               \n\t"
1754                 "psrlq $24, %%mm2               \n\t"
1755                 PAVGB(%%mm1, %%mm0)
1756                 PAVGB(%%mm3, %%mm2)
1757                 "punpcklbw %%mm7, %%mm0         \n\t"
1758                 "punpcklbw %%mm7, %%mm2         \n\t"
1759 #else
1760                 "movd (%0, %%ebx), %%mm0        \n\t"
1761                 "movd (%1, %%ebx), %%mm1        \n\t"
1762                 "movd 3(%0, %%ebx), %%mm2       \n\t"
1763                 "movd 3(%1, %%ebx), %%mm3       \n\t"
1764                 "punpcklbw %%mm7, %%mm0         \n\t"
1765                 "punpcklbw %%mm7, %%mm1         \n\t"
1766                 "punpcklbw %%mm7, %%mm2         \n\t"
1767                 "punpcklbw %%mm7, %%mm3         \n\t"
1768                 "paddw %%mm1, %%mm0             \n\t"
1769                 "paddw %%mm3, %%mm2             \n\t"
1770                 "paddw %%mm2, %%mm0             \n\t"
1771                 "movd 6(%0, %%ebx), %%mm4       \n\t"
1772                 "movd 6(%1, %%ebx), %%mm1       \n\t"
1773                 "movd 9(%0, %%ebx), %%mm2       \n\t"
1774                 "movd 9(%1, %%ebx), %%mm3       \n\t"
1775                 "punpcklbw %%mm7, %%mm4         \n\t"
1776                 "punpcklbw %%mm7, %%mm1         \n\t"
1777                 "punpcklbw %%mm7, %%mm2         \n\t"
1778                 "punpcklbw %%mm7, %%mm3         \n\t"
1779                 "paddw %%mm1, %%mm4             \n\t"
1780                 "paddw %%mm3, %%mm2             \n\t"
1781                 "paddw %%mm4, %%mm2             \n\t"
1782                 "psrlw $2, %%mm0                \n\t"
1783                 "psrlw $2, %%mm2                \n\t"
1784 #endif
1785                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1786                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1787
1788                 "pmaddwd %%mm0, %%mm1           \n\t"
1789                 "pmaddwd %%mm2, %%mm3           \n\t"
1790                 "pmaddwd %%mm6, %%mm0           \n\t"
1791                 "pmaddwd %%mm6, %%mm2           \n\t"
1792 #ifndef FAST_BGR2YV12
1793                 "psrad $8, %%mm0                \n\t"
1794                 "psrad $8, %%mm1                \n\t"
1795                 "psrad $8, %%mm2                \n\t"
1796                 "psrad $8, %%mm3                \n\t"
1797 #endif
1798                 "packssdw %%mm2, %%mm0          \n\t"
1799                 "packssdw %%mm3, %%mm1          \n\t"
1800                 "pmaddwd %%mm5, %%mm0           \n\t"
1801                 "pmaddwd %%mm5, %%mm1           \n\t"
1802                 "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1803                 "psraw $7, %%mm0                \n\t"
1804
1805 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1806                 "movq 12(%0, %%ebx), %%mm4      \n\t"
1807                 "movq 12(%1, %%ebx), %%mm1      \n\t"
1808                 "movq 18(%0, %%ebx), %%mm2      \n\t"
1809                 "movq 18(%1, %%ebx), %%mm3      \n\t"
1810                 PAVGB(%%mm1, %%mm4)
1811                 PAVGB(%%mm3, %%mm2)
1812                 "movq %%mm4, %%mm1              \n\t"
1813                 "movq %%mm2, %%mm3              \n\t"
1814                 "psrlq $24, %%mm4               \n\t"
1815                 "psrlq $24, %%mm2               \n\t"
1816                 PAVGB(%%mm1, %%mm4)
1817                 PAVGB(%%mm3, %%mm2)
1818                 "punpcklbw %%mm7, %%mm4         \n\t"
1819                 "punpcklbw %%mm7, %%mm2         \n\t"
1820 #else
1821                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1822                 "movd 12(%1, %%ebx), %%mm1      \n\t"
1823                 "movd 15(%0, %%ebx), %%mm2      \n\t"
1824                 "movd 15(%1, %%ebx), %%mm3      \n\t"
1825                 "punpcklbw %%mm7, %%mm4         \n\t"
1826                 "punpcklbw %%mm7, %%mm1         \n\t"
1827                 "punpcklbw %%mm7, %%mm2         \n\t"
1828                 "punpcklbw %%mm7, %%mm3         \n\t"
1829                 "paddw %%mm1, %%mm4             \n\t"
1830                 "paddw %%mm3, %%mm2             \n\t"
1831                 "paddw %%mm2, %%mm4             \n\t"
1832                 "movd 18(%0, %%ebx), %%mm5      \n\t"
1833                 "movd 18(%1, %%ebx), %%mm1      \n\t"
1834                 "movd 21(%0, %%ebx), %%mm2      \n\t"
1835                 "movd 21(%1, %%ebx), %%mm3      \n\t"
1836                 "punpcklbw %%mm7, %%mm5         \n\t"
1837                 "punpcklbw %%mm7, %%mm1         \n\t"
1838                 "punpcklbw %%mm7, %%mm2         \n\t"
1839                 "punpcklbw %%mm7, %%mm3         \n\t"
1840                 "paddw %%mm1, %%mm5             \n\t"
1841                 "paddw %%mm3, %%mm2             \n\t"
1842                 "paddw %%mm5, %%mm2             \n\t"
1843                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1844                 "psrlw $2, %%mm4                \n\t"
1845                 "psrlw $2, %%mm2                \n\t"
1846 #endif
1847                 "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1848                 "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1849
1850                 "pmaddwd %%mm4, %%mm1           \n\t"
1851                 "pmaddwd %%mm2, %%mm3           \n\t"
1852                 "pmaddwd %%mm6, %%mm4           \n\t"
1853                 "pmaddwd %%mm6, %%mm2           \n\t"
1854 #ifndef FAST_BGR2YV12
1855                 "psrad $8, %%mm4                \n\t"
1856                 "psrad $8, %%mm1                \n\t"
1857                 "psrad $8, %%mm2                \n\t"
1858                 "psrad $8, %%mm3                \n\t"
1859 #endif
1860                 "packssdw %%mm2, %%mm4          \n\t"
1861                 "packssdw %%mm3, %%mm1          \n\t"
1862                 "pmaddwd %%mm5, %%mm4           \n\t"
1863                 "pmaddwd %%mm5, %%mm1           \n\t"
1864                 "addl $24, %%ebx                \n\t"
1865                 "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
1866                 "psraw $7, %%mm4                \n\t"
1867
1868                 "movq %%mm0, %%mm1              \n\t"
1869                 "punpckldq %%mm4, %%mm0         \n\t"
1870                 "punpckhdq %%mm4, %%mm1         \n\t"
1871                 "packsswb %%mm1, %%mm0          \n\t"
1872                 "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
1873
1874                 "movd %%mm0, (%2, %%eax)        \n\t"
1875                 "punpckhdq %%mm0, %%mm0         \n\t"
1876                 "movd %%mm0, (%3, %%eax)        \n\t"
1877                 "addl $4, %%eax                 \n\t"
1878                 " js 1b                         \n\t"
1879                 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1880                 : "%eax", "%ebx"
1881         );
1882 #else
1883         int i;
1884         for(i=0; i<width; i++)
1885         {
1886                 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1887                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1888                 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1889
1890                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1891                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1892         }
1893 #endif
1894 }
1895
1896 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1897 {
1898         int i;
1899         for(i=0; i<width; i++)
1900         {
1901                 int d= src[i*2] + (src[i*2+1]<<8);
1902                 int b= d&0x1F;
1903                 int g= (d>>5)&0x3F;
1904                 int r= (d>>11)&0x1F;
1905
1906                 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1907         }
1908 }
1909
1910 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1911 {
1912         int i;
1913         for(i=0; i<width; i++)
1914         {
1915 #if 1
1916                 int d0= le2me_32( ((uint32_t*)src1)[i] );
1917                 int d1= le2me_32( ((uint32_t*)src2)[i] );
1918
1919                 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1920                 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1921
1922                 int dh2= (dh>>11) + (dh<<21);
1923                 int d= dh2 + dl;
1924
1925                 int b= d&0x7F;
1926                 int r= (d>>11)&0x7F;
1927                 int g= d>>21;
1928 #else
1929                 int d0= src1[i*4] + (src1[i*4+1]<<8);
1930                 int b0= d0&0x1F;
1931                 int g0= (d0>>5)&0x3F;
1932                 int r0= (d0>>11)&0x1F;
1933
1934                 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1935                 int b1= d1&0x1F;
1936                 int g1= (d1>>5)&0x3F;
1937                 int r1= (d1>>11)&0x1F;
1938
1939                 int d2= src2[i*4] + (src2[i*4+1]<<8);
1940                 int b2= d2&0x1F;
1941                 int g2= (d2>>5)&0x3F;
1942                 int r2= (d2>>11)&0x1F;
1943
1944                 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1945                 int b3= d3&0x1F;
1946                 int g3= (d3>>5)&0x3F;
1947                 int r3= (d3>>11)&0x1F;
1948
1949                 int b= b0 + b1 + b2 + b3;
1950                 int g= g0 + g1 + g2 + g3;
1951                 int r= r0 + r1 + r2 + r3;
1952 #endif
1953                 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1954                 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1955         }
1956 }
1957
1958 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1959 {
1960         int i;
1961         for(i=0; i<width; i++)
1962         {
1963                 int d= src[i*2] + (src[i*2+1]<<8);
1964                 int b= d&0x1F;
1965                 int g= (d>>5)&0x1F;
1966                 int r= (d>>10)&0x1F;
1967
1968                 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1969         }
1970 }
1971
1972 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1973 {
1974         int i;
1975         for(i=0; i<width; i++)
1976         {
1977 #if 1
1978                 int d0= le2me_32( ((uint32_t*)src1)[i] );
1979                 int d1= le2me_32( ((uint32_t*)src2)[i] );
1980
1981                 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1982                 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1983
1984                 int dh2= (dh>>11) + (dh<<21);
1985                 int d= dh2 + dl;
1986
1987                 int b= d&0x7F;
1988                 int r= (d>>10)&0x7F;
1989                 int g= d>>21;
1990 #else
1991                 int d0= src1[i*4] + (src1[i*4+1]<<8);
1992                 int b0= d0&0x1F;
1993                 int g0= (d0>>5)&0x1F;
1994                 int r0= (d0>>10)&0x1F;
1995
1996                 int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1997                 int b1= d1&0x1F;
1998                 int g1= (d1>>5)&0x1F;
1999                 int r1= (d1>>10)&0x1F;
2000
2001                 int d2= src2[i*4] + (src2[i*4+1]<<8);
2002                 int b2= d2&0x1F;
2003                 int g2= (d2>>5)&0x1F;
2004                 int r2= (d2>>10)&0x1F;
2005
2006                 int d3= src2[i*4+2] + (src2[i*4+3]<<8);
2007                 int b3= d3&0x1F;
2008                 int g3= (d3>>5)&0x1F;
2009                 int r3= (d3>>10)&0x1F;
2010
2011                 int b= b0 + b1 + b2 + b3;
2012                 int g= g0 + g1 + g2 + g3;
2013                 int r= r0 + r1 + r2 + r3;
2014 #endif
2015                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2016                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2017         }
2018 }
2019
2020
2021 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2022 {
2023         int i;
2024         for(i=0; i<width; i++)
2025         {
2026                 int r= src[i*4+0];
2027                 int g= src[i*4+1];
2028                 int b= src[i*4+2];
2029
2030                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2031         }
2032 }
2033
2034 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2035 {
2036         int i;
2037         for(i=0; i<width; i++)
2038         {
2039                 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
2040                 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
2041                 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
2042
2043                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2044                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2045         }
2046 }
2047
2048 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2049 {
2050         int i;
2051         for(i=0; i<width; i++)
2052         {
2053                 int r= src[i*3+0];
2054                 int g= src[i*3+1];
2055                 int b= src[i*3+2];
2056
2057                 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2058         }
2059 }
2060
2061 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2062 {
2063         int i;
2064         for(i=0; i<width; i++)
2065         {
2066                 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2067                 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2068                 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2069
2070                 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2071                 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2072         }
2073 }
2074
2075
2076 // Bilinear / Bicubic scaling
/*
 * Horizontal FIR scaler: every output sample is a weighted sum of filterSize
 * consecutive source bytes.
 *
 *   dst        receives dstW 16 bit samples
 *   src        8 bit source line
 *   filter     coefficients, filterSize per output sample, stored back to back
 *   filterPos  index of the first source byte used for each output sample
 *   filterSize number of taps per output sample
 *   srcW, xInc are not used anywhere in this function
 *
 * The plain C version at the bottom is the reference: it sums
 * src[filterPos[i] + j]*filter[filterSize*i + j] over j, shifts the sum right
 * by 7 and limits it to 0..32767 (assuming the usual MIN/MAX macros).
 *
 * With HAVE_MMX there are three asm variants (4 taps, 8 taps, any size).
 * All of them produce two output samples per loop iteration and use a negative
 * byte counter that runs up to zero, the loop ends when the add carries.
 * NOTE(review): this presumably requires an even dstW, and in the generic
 * variant a filterSize that is a multiple of 4 - confirm against the callers.
 * NOTE(review): the MMX variants read filterPos with movzwl, i.e. as unsigned
 * 16 bit values, while the C version reads it as signed int16_t.
 */
2077 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2078                                   int16_t *filter, int16_t *filterPos, int filterSize)
2079 {
2080 #ifdef HAVE_MMX
2081         if(filterSize==4) // always true for upscaling, sometimes for down too
2082         {
                // counter is a negative byte offset relative to the end of dst / filterPos,
                // the pointers are moved to the end of their arrays to match
2083                 int counter= -2*dstW;
2084                 filter-= counter*2;
2085                 filterPos-= counter/2;
2086                 dst-= counter/2;
2087                 asm volatile(
2088                         "pxor %%mm7, %%mm7              \n\t"
2089                         "movq "MANGLE(w02)", %%mm6      \n\t"
2090                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
2091                         "movl %%eax, %%ebp              \n\t"
2092                         ".balign 16                     \n\t"
2093                         "1:                             \n\t"
                        // eax / ebx = source positions of the two output samples
2094                         "movzwl (%2, %%ebp), %%eax      \n\t"
2095                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        // mm1 / mm3 = their 4 coefficients each
2096                         "movq (%1, %%ebp, 4), %%mm1     \n\t"
2097                         "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
2098                         "movd (%3, %%eax), %%mm0        \n\t"
2099                         "movd (%3, %%ebx), %%mm2        \n\t"
2100                         "punpcklbw %%mm7, %%mm0         \n\t"
2101                         "punpcklbw %%mm7, %%mm2         \n\t"
2102                         "pmaddwd %%mm1, %%mm0           \n\t"
2103                         "pmaddwd %%mm2, %%mm3           \n\t"
2104                         "psrad $8, %%mm0                \n\t"
2105                         "psrad $8, %%mm3                \n\t"
2106                         "packssdw %%mm3, %%mm0          \n\t"
                        // w02 is defined outside this function
2107                         "pmaddwd %%mm6, %%mm0           \n\t"
2108                         "packssdw %%mm0, %%mm0          \n\t"
                        // store two 16 bit results
2109                         "movd %%mm0, (%4, %%ebp)        \n\t"
2110                         "addl $4, %%ebp                 \n\t"
2111                         " jnc 1b                        \n\t"
2112
2113                         "popl %%ebp                     \n\t"
2114                         : "+a" (counter)
2115                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2116                         : "%ebx"
2117                 );
2118         }
2119         else if(filterSize==8)
2120         {
                // same scheme as the 4 tap variant, with twice as many coefficients per sample
2121                 int counter= -2*dstW;
2122                 filter-= counter*4;
2123                 filterPos-= counter/2;
2124                 dst-= counter/2;
2125                 asm volatile(
2126                         "pxor %%mm7, %%mm7              \n\t"
2127                         "movq "MANGLE(w02)", %%mm6      \n\t"
2128                         "pushl %%ebp                    \n\t" // we use 7 regs here ...
2129                         "movl %%eax, %%ebp              \n\t"
2130                         ".balign 16                     \n\t"
2131                         "1:                             \n\t"
2132                         "movzwl (%2, %%ebp), %%eax      \n\t"
2133                         "movzwl 2(%2, %%ebp), %%ebx     \n\t"
                        // taps 0..3 of both output samples
2134                         "movq (%1, %%ebp, 8), %%mm1     \n\t"
2135                         "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
2136                         "movd (%3, %%eax), %%mm0        \n\t"
2137                         "movd (%3, %%ebx), %%mm2        \n\t"
2138                         "punpcklbw %%mm7, %%mm0         \n\t"
2139                         "punpcklbw %%mm7, %%mm2         \n\t"
2140                         "pmaddwd %%mm1, %%mm0           \n\t"
2141                         "pmaddwd %%mm2, %%mm3           \n\t"
2142
                        // taps 4..7 of both output samples, added to the first half
2143                         "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
2144                         "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
2145                         "movd 4(%3, %%eax), %%mm4       \n\t"
2146                         "movd 4(%3, %%ebx), %%mm2       \n\t"
2147                         "punpcklbw %%mm7, %%mm4         \n\t"
2148                         "punpcklbw %%mm7, %%mm2         \n\t"
2149                         "pmaddwd %%mm1, %%mm4           \n\t"
2150                         "pmaddwd %%mm2, %%mm5           \n\t"
2151                         "paddd %%mm4, %%mm0             \n\t"
2152                         "paddd %%mm5, %%mm3             \n\t"
2153
2154                         "psrad $8, %%mm0                \n\t"
2155                         "psrad $8, %%mm3                \n\t"
2156                         "packssdw %%mm3, %%mm0          \n\t"
2157                         "pmaddwd %%mm6, %%mm0           \n\t"
2158                         "packssdw %%mm0, %%mm0          \n\t"
2159                         "movd %%mm0, (%4, %%ebp)        \n\t"
2160                         "addl $4, %%ebp                 \n\t"
2161                         " jnc 1b                        \n\t"
2162
2163                         "popl %%ebp                     \n\t"
2164                         : "+a" (counter)
2165                         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2166                         : "%ebx"
2167                 );
2168         }
2169         else
2170         {
                // generic variant: filter is not rebased here, the asm advances the
                // pointer itself while it walks through the coefficients
2171                 int counter= -2*dstW;
2172 //              filter-= counter*filterSize/2;
2173                 filterPos-= counter/2;
2174                 dst-= counter/2;
2175                 asm volatile(
2176                         "pxor %%mm7, %%mm7              \n\t"
2177                         "movq "MANGLE(w02)", %%mm6      \n\t"
2178                         ".balign 16                     \n\t"
2179                         "1:                             \n\t"
2180                         "movl %2, %%ecx                 \n\t"
2181                         "movzwl (%%ecx, %0), %%eax      \n\t"
2182                         "movzwl 2(%%ecx, %0), %%ebx     \n\t"
2183                         "movl %5, %%ecx                 \n\t"
2184                         "pxor %%mm4, %%mm4              \n\t"
2185                         "pxor %%mm5, %%mm5              \n\t"
                        // inner loop: 4 taps per pass for both output samples,
                        // ecx runs from src up to src+filterSize
2186                         "2:                             \n\t"
2187                         "movq (%1), %%mm1               \n\t"
2188                         "movq (%1, %6), %%mm3           \n\t"
2189                         "movd (%%ecx, %%eax), %%mm0     \n\t"
2190                         "movd (%%ecx, %%ebx), %%mm2     \n\t"
2191                         "punpcklbw %%mm7, %%mm0         \n\t"
2192                         "punpcklbw %%mm7, %%mm2         \n\t"
2193                         "pmaddwd %%mm1, %%mm0           \n\t"
2194                         "pmaddwd %%mm2, %%mm3           \n\t"
2195                         "paddd %%mm3, %%mm5             \n\t"
2196                         "paddd %%mm0, %%mm4             \n\t"
2197                         "addl $8, %1                    \n\t"
2198                         "addl $4, %%ecx                 \n\t"
2199                         "cmpl %4, %%ecx                 \n\t"
2200                         " jb 2b                         \n\t"
                        // skip the coefficients of the second output sample, they were
                        // already consumed through the (%1, %6) loads above
2201                         "addl %6, %1                    \n\t"
2202                         "psrad $8, %%mm4                \n\t"
2203                         "psrad $8, %%mm5                \n\t"
2204                         "packssdw %%mm5, %%mm4          \n\t"
2205                         "pmaddwd %%mm6, %%mm4           \n\t"
2206                         "packssdw %%mm4, %%mm4          \n\t"
2207                         "movl %3, %%eax                 \n\t"
2208                         "movd %%mm4, (%%eax, %0)        \n\t"
2209                         "addl $4, %0                    \n\t"
2210                         " jnc 1b                        \n\t"
2211
2212                         : "+r" (counter), "+r" (filter)
2213                         : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2214                           "m" (src), "r" (filterSize*2)
2215                         : "%ebx", "%eax", "%ecx"
2216                 );
2217         }
2218 #else
        // portable reference implementation
2219         int i;
2220         for(i=0; i<dstW; i++)
2221         {
2222                 int j;
2223                 int srcPos= filterPos[i];
2224                 int val=0;
2225 //              printf("filterPos: %d\n", filterPos[i]);
2226                 for(j=0; j<filterSize; j++)
2227                 {
2228 //                      printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2229                         val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2230                 }
2231 //              filter += hFilterSize;
2232                 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2233 //              dst[i] = val>>7;
2234         }
2235 #endif
2236 }
2237       // *** horizontal scale Y line to temp buffer
/*
 * Scales one luma line horizontally into dst (dstWidth 16 bit samples).
 *
 * Step 1: if srcFormat is YUY2 or one of the listed BGR / RGB formats, the
 * line is first converted to 8 bit luma in formatConvBuffer and src is
 * redirected to that buffer. Any other format is used as it is.
 *
 * Step 2: the line is scaled, either
 *  - through hScale() with hLumFilter / hLumFilterPos / hLumFilterSize, when
 *    SWS_FAST_BILINEAR is not set in flags (with HAVE_MMX also when
 *    canMMX2BeUsed is 0), or
 *  - by the fast bilinear code: the code behind funnyYCode (MMX2), plain
 *    x86 asm, or the C loop at the end.
 *
 * xInc is the source step per output sample in 16.16 fixed point (see the C
 * loop: xpos>>16 is the source index, the bits below are the fraction). The
 * fast bilinear paths write the source value scaled by 128.
 * mmx2Filter and mmx2FilterPos are only handed to the MMX2 asm block.
 */
2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2239                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2240                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2241                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2242                                    int32_t *mmx2FilterPos)
2243 {
    // convert packed input formats to a plain 8 bit luma line first
2244     if(srcFormat==IMGFMT_YUY2)
2245     {
2246         RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2247         src= formatConvBuffer;
2248     }
2249     else if(srcFormat==IMGFMT_BGR32)
2250     {
2251         RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2252         src= formatConvBuffer;
2253     }
2254     else if(srcFormat==IMGFMT_BGR24)
2255     {
2256         RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2257         src= formatConvBuffer;
2258     }
2259     else if(srcFormat==IMGFMT_BGR16)
2260     {
2261         RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2262         src= formatConvBuffer;
2263     }
2264     else if(srcFormat==IMGFMT_BGR15)
2265     {
2266         RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2267         src= formatConvBuffer;
2268     }
2269     else if(srcFormat==IMGFMT_RGB32)
2270     {
2271         RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2272         src= formatConvBuffer;
2273     }
2274     else if(srcFormat==IMGFMT_RGB24)
2275     {
2276         RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2277         src= formatConvBuffer;
2278     }
2279
2280 #ifdef HAVE_MMX
2281         // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
2282     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2283 #else
2284     if(!(flags&SWS_FAST_BILINEAR))
2285 #endif
2286     {
2287         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2288     }
2289     else // Fast Bilinear upscale / crap downscale
2290     {
2291 #ifdef ARCH_X86
2292 #ifdef HAVE_MMX2
2293         int i;
2294         if(canMMX2BeUsed)
2295         {
                // loads src, dst, mmx2Filter and mmx2FilterPos into ecx, edi, edx, ebx
                // and then calls the code behind funnyYCode 8 times (FUNNY_Y_CODE)
2296                 asm volatile(
2297                         "pxor %%mm7, %%mm7              \n\t"
2298                         "movl %0, %%ecx                 \n\t"
2299                         "movl %1, %%edi                 \n\t"
2300                         "movl %2, %%edx                 \n\t"
2301                         "movl %3, %%ebx                 \n\t"
2302                         "xorl %%eax, %%eax              \n\t" // i
2303                         PREFETCH" (%%ecx)               \n\t"
2304                         PREFETCH" 32(%%ecx)             \n\t"
2305                         PREFETCH" 64(%%ecx)             \n\t"
2306
2307 #define FUNNY_Y_CODE \
2308                         "movl (%%ebx), %%esi            \n\t"\
2309                         "call *%4                       \n\t"\
2310                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
2311                         "addl %%eax, %%edi              \n\t"\
2312                         "xorl %%eax, %%eax              \n\t"\
2313
2314 FUNNY_Y_CODE
2315 FUNNY_Y_CODE
2316 FUNNY_Y_CODE
2317 FUNNY_Y_CODE
2318 FUNNY_Y_CODE
2319 FUNNY_Y_CODE
2320 FUNNY_Y_CODE
2321 FUNNY_Y_CODE
2322
2323                         :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2324                         "m" (funnyYCode)
2325                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2326                 );
                // outputs whose source position is at or beyond the last source pixel
                // are overwritten with that pixel (scaled by 128)
2327                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2328         }
2329         else
2330         {
2331 #endif
2332         //NO MMX just normal asm ...
        // eax = output index, ebx = source index, cx = 16 bit fraction of the position;
        // the loop body is unrolled twice, so two outputs are written per iteration
2333         asm volatile(
2334                 "xorl %%eax, %%eax              \n\t" // i
2335                 "xorl %%ebx, %%ebx              \n\t" // xx
2336                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2337                 ".balign 16                     \n\t"
2338                 "1:                             \n\t"
2339                 "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
2340                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
2341                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2342                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2343                 "shll $16, %%edi                \n\t"
2344                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2345                 "movl %1, %%edi                 \n\t"
2346                 "shrl $9, %%esi                 \n\t"
2347                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
2348                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2349                 "adcl %3, %%ebx                 \n\t" //xx+= (xInc>>16) + carry
2350
2351                 "movzbl (%0, %%ebx), %%edi      \n\t" //src[xx]
2352                 "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
2353                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2354                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2355                 "shll $16, %%edi                \n\t"
2356                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2357                 "movl %1, %%edi                 \n\t"
2358                 "shrl $9, %%esi                 \n\t"
2359                 "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
2360                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFFFF
2361                 "adcl %3, %%ebx                 \n\t" //xx+= (xInc>>16) + carry
2362
2363
2364                 "addl $2, %%eax                 \n\t"
2365                 "cmpl %2, %%eax                 \n\t"
2366                 " jb 1b                         \n\t"
2367
2368
2369                 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2370                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2371                 );
2372 #ifdef HAVE_MMX2
2373         } //if MMX2 cant be used
2374 #endif
2375 #else
        // portable version: xpos is the 16.16 fixed point source position,
        // xalpha its top 7 fraction bits, the result is the source value times 128
2376         int i;
2377         unsigned int xpos=0;
2378         for(i=0;i<dstWidth;i++)
2379         {
2380                 register unsigned int xx=xpos>>16;
2381                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2382                 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2383                 xpos+=xInc;
2384         }
2385 #endif
2386     }
2387 }
2388
2389 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2390                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2391                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2392                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2393                                    int32_t *mmx2FilterPos)
2394 {
2395     if(srcFormat==IMGFMT_YUY2)
2396     {
2397         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2398         src1= formatConvBuffer;
2399         src2= formatConvBuffer+2048;
2400     }
2401     else if(srcFormat==IMGFMT_BGR32)
2402     {
2403         RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2404         src1= formatConvBuffer;
2405         src2= formatConvBuffer+2048;
2406     }
2407     else if(srcFormat==IMGFMT_BGR24)
2408     {
2409         RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2410         src1= formatConvBuffer;
2411         src2= formatConvBuffer+2048;
2412     }
2413     else if(srcFormat==IMGFMT_BGR16)
2414     {
2415         RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2416         src1= formatConvBuffer;
2417         src2= formatConvBuffer+2048;
2418     }
2419     else if(srcFormat==IMGFMT_BGR15)
2420     {
2421         RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2422         src1= formatConvBuffer;
2423         src2= formatConvBuffer+2048;
2424     }
2425     else if(srcFormat==IMGFMT_RGB32)
2426     {
2427         RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2428         src1= formatConvBuffer;
2429         src2= formatConvBuffer+2048;
2430     }
2431     else if(srcFormat==IMGFMT_RGB24)
2432     {
2433         RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2434         src1= formatConvBuffer;
2435         src2= formatConvBuffer+2048;
2436     }
2437     else if(isGray(srcFormat))
2438     {
2439         return;
2440     }
2441
2442 #ifdef HAVE_MMX
2443         // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
2444     if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2445 #else
2446     if(!(flags&SWS_FAST_BILINEAR))
2447 #endif
2448     {
2449         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2450         RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2451     }
2452     else // Fast Bilinear upscale / crap downscale
2453     {
2454 #ifdef ARCH_X86
2455 #ifdef HAVE_MMX2
2456         int i;
2457         if(canMMX2BeUsed)
2458         {
2459                 asm volatile(
2460                         "pxor %%mm7, %%mm7              \n\t"
2461                         "movl %0, %%ecx                 \n\t"
2462                         "movl %1, %%edi                 \n\t"
2463                         "movl %2, %%edx                 \n\t"
2464                         "movl %3, %%ebx                 \n\t"
2465                         "xorl %%eax, %%eax              \n\t" // i
2466                         PREFETCH" (%%ecx)               \n\t"
2467                         PREFETCH" 32(%%ecx)             \n\t"
2468                         PREFETCH" 64(%%ecx)             \n\t"
2469
2470 #define FUNNY_UV_CODE \
2471                         "movl (%%ebx), %%esi            \n\t"\
2472                         "call *%4                       \n\t"\
2473                         "addl (%%ebx, %%eax), %%ecx     \n\t"\
2474                         "addl %%eax, %%edi              \n\t"\
2475                         "xorl %%eax, %%eax              \n\t"\
2476
2477 FUNNY_UV_CODE
2478 FUNNY_UV_CODE
2479 FUNNY_UV_CODE
2480 FUNNY_UV_CODE
2481                         "xorl %%eax, %%eax              \n\t" // i
2482                         "movl %5, %%ecx                 \n\t" // src
2483                         "movl %1, %%edi                 \n\t" // buf1
2484                         "addl $4096, %%edi              \n\t"
2485                         PREFETCH" (%%ecx)               \n\t"
2486                         PREFETCH" 32(%%ecx)             \n\t"
2487                         PREFETCH" 64(%%ecx)             \n\t"
2488
2489 FUNNY_UV_CODE
2490 FUNNY_UV_CODE
2491 FUNNY_UV_CODE
2492 FUNNY_UV_CODE
2493
2494                         :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2495                         "m" (funnyUVCode), "m" (src2)
2496                         : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2497                 );
2498                 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2499                 {
2500 //                      printf("%d %d %d\n", dstWidth, i, srcW);
2501                         dst[i] = src1[srcW-1]*128;
2502                         dst[i+2048] = src2[srcW-1]*128;
2503                 }
2504         }
2505         else
2506         {
2507 #endif
2508         asm volatile(
2509                 "xorl %%eax, %%eax              \n\t" // i
2510                 "xorl %%ebx, %%ebx              \n\t" // xx
2511                 "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
2512                 ".balign 16                     \n\t"
2513                 "1:                             \n\t"
2514                 "movl %0, %%esi                 \n\t"
2515                 "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
2516                 "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
2517                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2518                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2519                 "shll $16, %%edi                \n\t"
2520                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2521                 "movl %1, %%edi                 \n\t"
2522                 "shrl $9, %%esi                 \n\t"
2523                 "movw %%si, (%%edi, %%eax, 2)   \n\t"
2524
2525                 "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
2526                 "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
2527                 "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
2528                 "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
2529                 "shll $16, %%edi                \n\t"
2530                 "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2531                 "movl %1, %%edi                 \n\t"
2532                 "shrl $9, %%esi                 \n\t"
2533                 "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2534
2535                 "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
2536                 "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry
2537                 "addl $1, %%eax                 \n\t"
2538                 "cmpl %2, %%eax                 \n\t"
2539                 " jb 1b                         \n\t"
2540
2541                 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2542                 "r" (src2)
2543                 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2544                 );
2545 #ifdef HAVE_MMX2
2546         } //if MMX2 cant be used
2547 #endif
2548 #else
2549         int i;
2550         unsigned int xpos=0;
2551         for(i=0;i<dstWidth;i++)
2552         {
2553                 register unsigned int xx=xpos>>16;
2554                 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2555                 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2556                 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2557 /* slower
2558           dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2559           dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2560 */
2561                 xpos+=xInc;
2562         }
2563 #endif
2564    }
2565 }
2566
/*
 * swScale: scale one horizontal slice of the source picture into the destination.
 *
 * c               scaler context; filters, line buffers and the progress state
 *                 (dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf) are
 *                 read from it, and the progress state is written back on exit
 * srcParam        source plane pointers; they point at the first line of this slice
 *                 (lines are addressed relative to srcSliceY below)
 * srcStrideParam  source strides, added to uint8_t pointers (i.e. in bytes)
 * srcSliceY       index of the first source line contained in this slice;
 *                 0 resets the progress state
 * srcSliceH       number of source lines in this slice
 * dstParam        destination plane pointers (whole picture, indexed by dstY)
 * dstStrideParam  destination strides
 *
 * For every output line the required source lines are first scaled horizontally
 * into lumPixBuf / chrPixBuf (hyscale / hcscale) and then filtered vertically and
 * written by one of the yuv2yuv* / yuv2rgb* routines. If the slice does not hold
 * all source lines needed for the current output line, the rest of the slice is
 * buffered and the loop stops; the next call resumes at the same dstY.
 */
2567 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
2568              int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
2569
2570         /* load a few things into local vars to make the code more readable? and faster */
2571         const int srcW= c->srcW;
2572         const int dstW= c->dstW;
2573         const int dstH= c->dstH;
2574         const int chrDstW= c->chrDstW;
2575         const int lumXInc= c->lumXInc;
2576         const int chrXInc= c->chrXInc;
2577         const int dstFormat= c->dstFormat;
2578         const int flags= c->flags;
2579         const int canMMX2BeUsed= c->canMMX2BeUsed;
2580         int16_t *vLumFilterPos= c->vLumFilterPos; // per output line: first luma source line needed
2581         int16_t *vChrFilterPos= c->vChrFilterPos; // per chroma output line: first chroma source line needed
2582         int16_t *hLumFilterPos= c->hLumFilterPos;
2583         int16_t *hChrFilterPos= c->hChrFilterPos;
2584         int16_t *vLumFilter= c->vLumFilter;
2585         int16_t *vChrFilter= c->vChrFilter;
2586         int16_t *hLumFilter= c->hLumFilter;
2587         int16_t *hChrFilter= c->hChrFilter;
2588         int16_t *lumMmxFilter= c->lumMmxFilter;
2589         int16_t *chrMmxFilter= c->chrMmxFilter;
2590         const int vLumFilterSize= c->vLumFilterSize;
2591         const int vChrFilterSize= c->vChrFilterSize;
2592         const int hLumFilterSize= c->hLumFilterSize;
2593         const int hChrFilterSize= c->hChrFilterSize;
2594         int16_t **lumPixBuf= c->lumPixBuf; // ring buffer of horizontally scaled luma lines
2595         int16_t **chrPixBuf= c->chrPixBuf; // ring buffer of horizontally scaled chroma lines
2596         const int vLumBufSize= c->vLumBufSize;
2597         const int vChrBufSize= c->vChrBufSize;
2598         uint8_t *funnyYCode= c->funnyYCode;
2599         uint8_t *funnyUVCode= c->funnyUVCode;
2600         uint8_t *formatConvBuffer= c->formatConvBuffer;
2601
2602         /* vars which will change and which we need to store back in the context */
2603         int dstY= c->dstY; // next output line to produce
2604         int lumBufIndex= c->lumBufIndex; // index of the most recently filled lumPixBuf entry
2605         int chrBufIndex= c->chrBufIndex; // index of the most recently filled chrPixBuf entry
2606         int lastInLumBuf= c->lastInLumBuf; // last luma source line already buffered (-1 = none)
2607         int lastInChrBuf= c->lastInChrBuf; // last chroma source line already buffered (-1 = none)
2608         int srcStride[3];
2609         int dstStride[3];
2610         uint8_t *src[3];
2611         uint8_t *dst[3];
2612
             /* Bring the source planes into the internal order (the YV12 branch is the identity mapping).
                NOTE(review): if none of the branches below matches, src[] and srcStride[] stay
                uninitialized - presumably unsupported formats are rejected before this is called; confirm. */
2613         if(c->srcFormat == IMGFMT_I420){
                     // planes 1 and 2 are swapped relative to the YV12 branch
2614                 src[0]= srcParam[0];
2615                 src[1]= srcParam[2];
2616                 src[2]= srcParam[1];
2617                 srcStride[0]= srcStrideParam[0];
2618                 srcStride[1]= srcStrideParam[2];
2619                 srcStride[2]= srcStrideParam[1];
2620         }
2621         else if(c->srcFormat==IMGFMT_YV12){
2622                 src[0]= srcParam[0];
2623                 src[1]= srcParam[1];
2624                 src[2]= srcParam[2];
2625                 srcStride[0]= srcStrideParam[0];
2626                 srcStride[1]= srcStrideParam[1];
2627                 srcStride[2]= srcStrideParam[2];
2628         }
2629         else if(isPacked(c->srcFormat)){
                     // single packed plane: all three pointers alias it; the chroma stride is doubled
                     // because chroma lines are indexed at srcSliceY>>1 below, so chroma line k reads packed row 2*k
2630                 src[0]=
2631                 src[1]=
2632                 src[2]= srcParam[0];
2633                 srcStride[0]= srcStrideParam[0];
2634                 srcStride[1]=
2635                 srcStride[2]= srcStrideParam[0]<<1;
2636         }
2637         else if(isGray(c->srcFormat)){
                     // luma only: no chroma planes
2638                 src[0]= srcParam[0];
2639                 src[1]=
2640                 src[2]= NULL;
2641                 srcStride[0]= srcStrideParam[0];
2642                 srcStride[1]=
2643                 srcStride[2]= 0;
2644         }
2645
             /* Same plane reordering for the destination: only I420 swaps planes 1 and 2 */
2646         if(dstFormat == IMGFMT_I420){
2647                 dst[0]= dstParam[0];
2648                 dst[1]= dstParam[2];
2649                 dst[2]= dstParam[1];
2650                 dstStride[0]= dstStrideParam[0];
2651                 dstStride[1]= dstStrideParam[2];
2652                 dstStride[2]= dstStrideParam[1];
2653         }else{
2654                 dst[0]= dstParam[0];
2655                 dst[1]= dstParam[1];
2656                 dst[2]= dstParam[2];
2657                 dstStride[0]= dstStrideParam[0];
2658                 dstStride[1]= dstStrideParam[1];
2659                 dstStride[2]= dstStrideParam[2];
2660         }
2661
2662 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2663 //dstStride[0],dstStride[1],dstStride[2]);
2664
             /* Warn (once, and only with SWS_PRINT_INFO) when a destination stride is not a multiple of 8;
                nothing else is changed here because of it */
2665         if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2666         {
2667                 static int firstTime=1; //FIXME move this into the context perhaps
2668                 if(flags & SWS_PRINT_INFO && firstTime)
2669                 {
2670                         mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
2671                                         "SwScaler:          ->cannot do aligned memory acesses anymore\n");
2672                         firstTime=0;
2673                 }
2674         }
2675
2676         /* Note: the user might start scaling the picture in the middle, in which case this reset is not executed.
2677            This is not really intended but currently works, so people might do it. */
2678         if(srcSliceY ==0){
2679                 lumBufIndex=0;
2680                 chrBufIndex=0;
2681                 dstY=0;
2682                 lastInLumBuf= -1;
2683                 lastInChrBuf= -1;
2684         }
2685
             /* One iteration per output line; dstY persists across calls through the context */
2686         for(;dstY < dstH; dstY++){
2687                 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2688                 unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1); // chroma rows are addressed at half the vertical resolution
2689                 unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
2690                 const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;
2691
2692                 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2693                 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2694                 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2695                 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2696
2697                 //handle holes (FAST_BILINEAR & weird filters)
                     // source lines before the first needed one are skipped by treating them as already buffered
2698                 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2699                 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2700 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2701                 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2702                 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2703
2704                 // Do we have enough lines in this slice to output the dstY line
                     // (the chroma limit is the slice end halved, rounded up)
2705                 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH + 1)>>1))
2706                 {
2707                         //Do horizontal scaling
                             // buffer luma lines up to the last one the vertical filter needs
2708                         while(lastInLumBuf < lastLumSrcY)
2709                         {
2710                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; // next unbuffered line, relative to the slice start
2711                                 lumBufIndex++;
2712 //                              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2713                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2714                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2715                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2716 //                              printf("%d %d\n", lumBufIndex, vLumBufSize);
2717                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2718                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2719                                                 funnyYCode, c->srcFormat, formatConvBuffer,
2720                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2721                                 lastInLumBuf++;
2722                         }
                             // same for chroma; chroma lines are addressed relative to srcSliceY>>1
2723                         while(lastInChrBuf < lastChrSrcY)
2724                         {
2725                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2726                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2727                                 chrBufIndex++;
2728                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2729                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2730                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2731                                 //FIXME replace parameters through context struct (some at least)
2732                                 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2733                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2734                                                 funnyUVCode, c->srcFormat, formatConvBuffer,
2735                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2736                                 lastInChrBuf++;
2737                         }
2738                         //wrap buf index around to stay inside the ring buffer
                             // (the indices may run up to 2*size-1 inside the loops above, see the ASSERTs;
                             //  presumably the pointer arrays hold 2*size entries with both halves referring to the same lines - TODO confirm)
2739                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2740                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2741                 }
2742                 else // not enough lines left in this slice -> load the rest in the buffer
2743                 {
2744 /*              printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2745                         firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2746                         lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2747                         vChrBufSize, vLumBufSize);
2748 */
2749                         //Do horizontal scaling
                             // buffer every remaining luma line of this slice
2750                         while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2751                         {
2752                                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2753                                 lumBufIndex++;
2754                                 ASSERT(lumBufIndex < 2*vLumBufSize)
2755                                 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2756                                 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2757                                 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2758                                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2759                                                 funnyYCode, c->srcFormat, formatConvBuffer,
2760                                                 c->lumMmx2Filter, c->lumMmx2FilterPos);
2761                                 lastInLumBuf++;
2762                         }
                             // NOTE(review): the chroma limit here rounds down ((srcSliceY + srcSliceH)>>1) while the
                             // availability test above rounds up - confirm this is intended for slices ending on an odd line
2763                         while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2764                         {
2765                                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2766                                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
2767                                 chrBufIndex++;
2768                                 ASSERT(chrBufIndex < 2*vChrBufSize)
2769                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2770                                 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2771                                 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2772                                                 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2773                                                 funnyUVCode, c->srcFormat, formatConvBuffer,
2774                                                 c->chrMmx2Filter, c->chrMmx2FilterPos);
2775                                 lastInChrBuf++;
2776                         }
2777                         //wrap buf index around to stay inside the ring buffer
2778                         if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2779                         if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2780                         break; //we can't output a dstY line, so let's try with the next slice (dstY is not advanced)
2781                 }
2782
2783 #ifdef HAVE_MMX
                     // select the dither values by output line parity (the variables and tables are defined elsewhere in the file)
2784                 b5Dither= dither8[dstY&1];
2785                 g6Dither= dither4[dstY&1];
2786                 g5Dither= dither8[dstY&1];
2787                 r5Dither= dither8[(dstY+1)&1];
2788 #endif
                 // all output lines except the last two go through the RENAME()d vertical scalers
2789             if(dstY < dstH-2)
2790             {
2791                 if(isPlanarYUV(dstFormat)) //YV12 like
2792                 {
                             // odd output lines carry no chroma row: pass NULL so only luma is written
2793                         if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2794                         if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2795                         {
2796                                 int16_t *lumBuf = lumPixBuf[0];
2797                                 int16_t *chrBuf= chrPixBuf[0];
2798                                 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
2799                         }
2800                         else //General YV12
2801                         {
                                     // step back from the newest buffered line to the first needed one; the added
                                     // buffer size keeps the offset non-negative (relies on the 2*size layout assumed above - TODO confirm)
2802                                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2803                                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2804                                 RENAME(yuv2yuvX)(
2805                                         vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
2806                                         vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2807                                         dest, uDest, vDest, dstW,
2808                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
2809                         }
2810                 }
2811                 else
2812                 {
                             // non-planar output: only dest is written, chroma filters are indexed by dstY
2813                         int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2814                         int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2815
2816                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2817                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2818                         if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2819                         {
2820                                 int chrAlpha= vChrFilter[2*dstY+1]; // second tap of this line's 2-tap chroma filter
2821
2822                                 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2823                                                  dest, dstW, chrAlpha, dstFormat, flags);
2824                         }
2825                         else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2826                         {
2827                                 int lumAlpha= vLumFilter[2*dstY+1]; // second tap of this line's 2-tap luma filter
2828                                 int chrAlpha= vChrFilter[2*dstY+1];
2829
2830                                 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2831                                                  dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
2832                         }
2833                         else //General RGB
2834                         {
2835                                 RENAME(yuv2rgbX)(
2836                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2837                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2838                                         dest, dstW, dstFormat,
2839                                         lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
2840                         }
2841                 }
2842             }
2843             else // hmm, looks like we can't use MMX here without overwriting this array's tail (last two output lines)
2844             {
                     // the last two output lines always use the plain C scalers (yuv2yuvXinC / yuv2rgbXinC)
2845                 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2846                 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2847                 if(isPlanarYUV(dstFormat)) //YV12
2848                 {
2849                         if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2850                         yuv2yuvXinC(
2851                                 vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
2852                                 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853                                 dest, uDest, vDest, dstW);
2854                 }
2855                 else
2856                 {
2857                         ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2858                         ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2859                         yuv2rgbXinC(
2860                                 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2861                                 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2862                                 dest, dstW, dstFormat);
2863                 }
2864             }
2865         }
2866
2867 #ifdef HAVE_MMX
             // SFENCE / EMMS are the instruction-name macros from the top of the file
             // (sfence or a no-op without MMX2; femms with 3DNow, else emms)
2868         __asm __volatile(SFENCE:::"memory");
2869         __asm __volatile(EMMS:::"memory");
2870 #endif
2871         /* store changed local vars back in the context */
2872         c->dstY= dstY;
2873         c->lumBufIndex= lumBufIndex;
2874         c->chrBufIndex= chrBufIndex;
2875         c->lastInLumBuf= lastInLumBuf;
2876         c->lastInChrBuf= lastInChrBuf;
2877 }