git.sesse.net Git - mlt/blob - src/modules/xine/yadif.c

   1 /*
   2         Yadif C-plugin for Avisynth 2.5 - Yet Another DeInterlacing Filter
   3         Copyright (C)2007 Alexander G. Balakhnin aka Fizick  http://avisynth.org.ru
   4     Port of YADIF filter from MPlayer
   5         Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
   6
   7     This program is free software; you can redistribute it and/or modify
   8         it under the terms of the GNU General Public License as published by
   9         the Free Software Foundation.
  10
  11         This program is distributed in the hope that it will be useful,
  12         but WITHOUT ANY WARRANTY; without even the implied warranty of
  13         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14         GNU General Public License for more details.
  15
  16         You should have received a copy of the GNU General Public License
  17         along with this program; if not, write to the Free Software
  18         Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20     Avisynth_C plugin
  21         Assembler optimized for GNU C compiler
  22
  23 */
  24 #include "yadif.h"
  25 #include <stdlib.h>
  26 #include <memory.h>
  27
  28 #define MIN(a,b) ((a) > (b) ? (b) : (a))
  29 #define MAX(a,b) ((a) < (b) ? (b) : (a))
  30 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  31
  32 #define MIN3(a,b,c) MIN(MIN(a,b),c)
  33 #define MAX3(a,b,c) MAX(MAX(a,b),c)
  34
  35 static void (*filter_line)(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity);
  36
  37 #if defined(__GNUC__) && defined(USE_SSE)
  38
  39 #define LOAD4(mem,dst) \
  40             "movd      "mem", "#dst" \n\t"\
  41             "punpcklbw %%mm7, "#dst" \n\t"
  42
     /*
      * LOAD4(mem,dst), defined just above: movd loads 4 bytes from `mem` into
      * MMX register `dst`, then punpcklbw interleaves them with the bytes of
      * mm7. FILTER clears mm7 with pxor before the first LOAD4, so the result
      * is four zero-extended 16-bit lanes.
      *
      * PABS(tmp,dst), defined just below: per-16-bit-lane absolute value of
      * `dst`, computed as max(dst, 0 - dst) with signed compares. `tmp` is
      * overwritten.
      */
  43 #define PABS(tmp,dst) \
  44             "pxor     "#tmp", "#tmp" \n\t"\
  45             "psubw    "#dst", "#tmp" \n\t"\
  46             "pmaxsw   "#tmp", "#dst" \n\t"
  47
     /*
      * CHECK(pj,mj): evaluate one candidate edge direction for 4 pixels.
      * Loads 8 bytes from the row above at byte offset pj and 8 bytes from the
      * row below at byte offset mj (pj = -1+j, mj = -1-j for direction j, as
      * the inline comments show).
      * On exit:
      *   mm5 = candidate prediction (above + below) >> 1, as 16-bit lanes.
      *         pavgb rounds up, so the low bit of (above ^ below) is
      *         subtracted to get the truncating average.
      *   mm2 = score: sum of the three neighbouring absolute differences,
      *         as 16-bit lanes.
      * Overwrites mm3 and mm4. Expects mm7 == 0 and the asm operands
      * cur, mrefs, prefs and pb1.
      */
  48 #define CHECK(pj,mj) \
  49             "movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
  50             "movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
  51             "movq      %%mm2, %%mm4 \n\t"\
  52             "movq      %%mm2, %%mm5 \n\t"\
  53             "pxor      %%mm3, %%mm4 \n\t"\
  54             "pavgb     %%mm3, %%mm5 \n\t"\
  55             "pand     %[pb1], %%mm4 \n\t"\
  56             "psubusb   %%mm4, %%mm5 \n\t"\
  57             "psrlq     $8,    %%mm5 \n\t"\
  58             "punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
  59             "movq      %%mm2, %%mm4 \n\t"\
  60             "psubusb   %%mm3, %%mm2 \n\t"\
  61             "psubusb   %%mm4, %%mm3 \n\t"\
  62             "pmaxub    %%mm3, %%mm2 \n\t"\
  63             "movq      %%mm2, %%mm3 \n\t"\
  64             "movq      %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
  65             "psrlq      $8,   %%mm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
  66             "psrlq     $16,   %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
  67             "punpcklbw %%mm7, %%mm2 \n\t"\
  68             "punpcklbw %%mm7, %%mm3 \n\t"\
  69             "punpcklbw %%mm7, %%mm4 \n\t"\
  70             "paddw     %%mm3, %%mm2 \n\t"\
  71             "paddw     %%mm4, %%mm2 \n\t" /* score */
  72
     /*
      * CHECK1: per lane, if the score in mm2 is lower than spatial_score in
      * mm0, keep the lower score in mm0 and copy the candidate from mm5 into
      * spatial_pred (mm1). The comparison mask is also saved in mm6 so that a
      * following CHECK2 can see which lanes were updated.
      * Overwrites mm3 and mm5.
      */
  73 #define CHECK1 \
  74             "movq      %%mm0, %%mm3 \n\t"\
  75             "pcmpgtw   %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
  76             "pminsw    %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
  77             "movq      %%mm3, %%mm6 \n\t"\
  78             "pand      %%mm3, %%mm5 \n\t"\
  79             "pandn     %%mm1, %%mm3 \n\t"\
  80             "por       %%mm5, %%mm3 \n\t"\
  81             "movq      %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
  82
     /*
      * CHECK2: same update as CHECK1, but first penalises lanes that the
      * preceding CHECK1 did not update. mm6 holds that mask (all ones where
      * updated, zero otherwise); adding pw1 turns it into 0 / 1, the shift by
      * 14 into 0 / 0x4000, and the saturating add puts that on the score.
      * Must directly follow a CHECK1 + CHECK pair, since it consumes mm6.
      */
  83 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
  84                   hurts both quality and speed, but matches the C version. */\
  85             "paddw    %[pw1], %%mm6 \n\t"\
  86             "psllw     $14,   %%mm6 \n\t"\
  87             "paddsw    %%mm6, %%mm2 \n\t"\
  88             "movq      %%mm0, %%mm3 \n\t"\
  89             "pcmpgtw   %%mm2, %%mm3 \n\t"\
  90             "pminsw    %%mm2, %%mm0 \n\t"\
  91             "pand      %%mm3, %%mm5 \n\t"\
  92             "pandn     %%mm1, %%mm3 \n\t"\
  93             "por       %%mm5, %%mm3 \n\t"\
  94             "movq      %%mm3, %%mm1 \n\t"
  95
  96 static void filter_line_mmx2(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
         /*
          * Inline-assembly counterpart of filter_line_c(), with the same
          * parameters. filter_plane() can select it when the
          * AVS_CPU_INTEGER_SSE bit is set in cpu.
          *
          * Each loop iteration produces 4 output pixels: x advances by 4 and
          * the result is stored with a 4-byte movd. When w is not a multiple
          * of 4, the last store therefore writes past dst + w.
          * NOTE(review): presumably callers provide padded rows - confirm.
          *
          * The asm issues 8-byte loads at byte offsets -3..+1 around the
          * current position in the rows above and below (see the CHECK
          * invocations), so those rows are read a few bytes outside [0, w).
          *
          * No emms is executed here; filter_plane() issues it after the last
          * line.
          */
  97     static const uint64_t pw_1 = 0x0001000100010001ULL;
  98     static const uint64_t pb_1 = 0x0101010101010101ULL;
  99 //    const int mode = p->mode;
 100     uint64_t tmp0, tmp1, tmp2, tmp3;
 101     int x;
 102
         /*
          * FILTER expands to the whole per-line loop. prev2 and next2 are
          * string macros naming the asm operand ("prev", "cur" or "next") that
          * supplies the two same-parity fields; they are defined around each
          * expansion further down. tmp0..tmp3 are memory spill slots for c, d,
          * e and diff. The first asm statement leaves the packed result in
          * mm1 and the second one stores it, so the code relies on mm1
          * surviving between the two asm volatile statements.
          */
 103 #define FILTER\
 104     for(x=0; x<w; x+=4){\
 105         asm volatile(\
 106             "pxor      %%mm7, %%mm7 \n\t"\
 107             LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\
 108             LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
 109             LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
 110             LOAD4("(%["next2"])", %%mm3) /* next2[x] */\
 111             "movq      %%mm3, %%mm4 \n\t"\
 112             "paddw     %%mm2, %%mm3 \n\t"\
 113             "psraw     $1,    %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
 114             "movq      %%mm0, %[tmp0] \n\t" /* c */\
 115             "movq      %%mm3, %[tmp1] \n\t" /* d */\
 116             "movq      %%mm1, %[tmp2] \n\t" /* e */\
 117             "psubw     %%mm4, %%mm2 \n\t"\
 118             PABS(      %%mm4, %%mm2) /* temporal_diff0 */\
 119             LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\
 120             LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\
 121             "psubw     %%mm0, %%mm3 \n\t"\
 122             "psubw     %%mm1, %%mm4 \n\t"\
 123             PABS(      %%mm5, %%mm3)\
 124             PABS(      %%mm5, %%mm4)\
 125             "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
 126             "psrlw     $1,    %%mm2 \n\t"\
 127             "psrlw     $1,    %%mm3 \n\t"\
 128             "pmaxsw    %%mm3, %%mm2 \n\t"\
 129             LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
 130             LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
 131             "psubw     %%mm0, %%mm3 \n\t"\
 132             "psubw     %%mm1, %%mm4 \n\t"\
 133             PABS(      %%mm5, %%mm3)\
 134             PABS(      %%mm5, %%mm4)\
 135             "paddw     %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
 136             "psrlw     $1,    %%mm3 \n\t"\
 137             "pmaxsw    %%mm3, %%mm2 \n\t"\
 138             "movq      %%mm2, %[tmp3] \n\t" /* diff */\
 139 \
 140             "paddw     %%mm0, %%mm1 \n\t"\
 141             "paddw     %%mm0, %%mm0 \n\t"\
 142             "psubw     %%mm1, %%mm0 \n\t"\
 143             "psrlw     $1,    %%mm1 \n\t" /* spatial_pred */\
 144             PABS(      %%mm2, %%mm0)      /* ABS(c-e) */\
 145 \
 146             "movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
 147             "movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
 148             "movq      %%mm2, %%mm4 \n\t"\
 149             "psubusb   %%mm3, %%mm2 \n\t"\
 150             "psubusb   %%mm4, %%mm3 \n\t"\
 151             "pmaxub    %%mm3, %%mm2 \n\t"\
 152             /*"pshufw $9,%%mm2, %%mm3 \n\t"*/\
 153             "movq %%mm2, %%mm3 \n\t" /* replace for "pshufw $9,%%mm2, %%mm3" - Fizick */\
 154             "psrlq $16, %%mm3 \n\t"/* replace for "pshufw $9,%%mm2, %%mm3" - Fizick*/\
 155             "punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
 156             "punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
 157             "paddw     %%mm2, %%mm0 \n\t"\
 158             "paddw     %%mm3, %%mm0 \n\t"\
 159             "psubw    %[pw1], %%mm0 \n\t" /* spatial_score */\
 160 \
 161             CHECK(-2,0)\
 162             CHECK1\
 163             CHECK(-3,1)\
 164             CHECK2\
 165             CHECK(0,-2)\
 166             CHECK1\
 167             CHECK(1,-3)\
 168             CHECK2\
 169 \
 170             /* if(p->mode<2) ... */\
 171             "movq    %[tmp3], %%mm6 \n\t" /* diff */\
 172             "cmp       $2, %[mode] \n\t"\
 173             "jge       1f \n\t"\
 174             LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
 175             LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
 176             LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
 177             LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
 178             "paddw     %%mm4, %%mm2 \n\t"\
 179             "paddw     %%mm5, %%mm3 \n\t"\
 180             "psrlw     $1,    %%mm2 \n\t" /* b */\
 181             "psrlw     $1,    %%mm3 \n\t" /* f */\
 182             "movq    %[tmp0], %%mm4 \n\t" /* c */\
 183             "movq    %[tmp1], %%mm5 \n\t" /* d */\
 184             "movq    %[tmp2], %%mm7 \n\t" /* e */\
 185             "psubw     %%mm4, %%mm2 \n\t" /* b-c */\
 186             "psubw     %%mm7, %%mm3 \n\t" /* f-e */\
 187             "movq      %%mm5, %%mm0 \n\t"\
 188             "psubw     %%mm4, %%mm5 \n\t" /* d-c */\
 189             "psubw     %%mm7, %%mm0 \n\t" /* d-e */\
 190             "movq      %%mm2, %%mm4 \n\t"\
 191             "pminsw    %%mm3, %%mm2 \n\t"\
 192             "pmaxsw    %%mm4, %%mm3 \n\t"\
 193             "pmaxsw    %%mm5, %%mm2 \n\t"\
 194             "pminsw    %%mm5, %%mm3 \n\t"\
 195             "pmaxsw    %%mm0, %%mm2 \n\t" /* max */\
 196             "pminsw    %%mm0, %%mm3 \n\t" /* min */\
 197             "pxor      %%mm4, %%mm4 \n\t"\
 198             "pmaxsw    %%mm3, %%mm6 \n\t"\
 199             "psubw     %%mm2, %%mm4 \n\t" /* -max */\
 200             "pmaxsw    %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
 201             "1: \n\t"\
 202 \
 203             "movq    %[tmp1], %%mm2 \n\t" /* d */\
 204             "movq      %%mm2, %%mm3 \n\t"\
 205             "psubw     %%mm6, %%mm2 \n\t" /* d-diff */\
 206             "paddw     %%mm6, %%mm3 \n\t" /* d+diff */\
 207             "pmaxsw    %%mm2, %%mm1 \n\t"\
 208             "pminsw    %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
 209             "packuswb  %%mm1, %%mm1 \n\t"\
 210 \
 211             :[tmp0]"=m"(tmp0),\
 212              [tmp1]"=m"(tmp1),\
 213              [tmp2]"=m"(tmp2),\
 214              [tmp3]"=m"(tmp3)\
 215             :[prev] "r"(prev),\
 216              [cur]  "r"(cur),\
 217              [next] "r"(next),\
 218              [prefs]"r"((long)refs),\
 219              [mrefs]"r"((long)-refs),\
 220              [pw1]  "m"(pw_1),\
 221              [pb1]  "m"(pb_1),\
 222              [mode] "g"(mode)\
 223         );\
 224         asm volatile("movd %%mm1, %0" :"=m"(*dst));\
 225         dst += 4;\
 226         prev+= 4;\
 227         cur += 4;\
 228         next+= 4;\
 229     }
 230
         /*
          * Field selection, same as in filter_line_c(): with parity set the
          * same-parity neighbours are prev and cur, otherwise cur and next.
          * FILTER is expanded once per case because prev2/next2 are pasted
          * into the asm operand names at preprocessing time.
          */
 231     if(parity){
 232 #define prev2 "prev"
 233 #define next2 "cur"
 234         FILTER
 235 #undef prev2
 236 #undef next2
 237     }else{
 238 #define prev2 "cur"
 239 #define next2 "next"
 240         FILTER
 241 #undef prev2
 242 #undef next2
 243     }
 244 }
 245 #undef LOAD4
 246 #undef PABS
 247 #undef CHECK
 248 #undef CHECK1
 249 #undef CHECK2
 250 #undef FILTER
 251
 252 #ifndef attribute_align_arg
     /*
      * attribute_align_arg: expands to GCC's force_align_arg_pointer function
      * attribute on GCC 4.2 and newer, and to nothing otherwise.
      */
 253 #if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
 254 #    define attribute_align_arg __attribute__((force_align_arg_pointer))
 255 #else
 256 #    define attribute_align_arg
 257 #endif
 258 #endif
 259
 260 // SSE2 needs proper alignment, which requires GCC 4.2 and above
 261 #if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
 262
 263 #ifndef DECLARE_ALIGNED
     /* DECLARE_ALIGNED(n,t,v): declare variable v of type t aligned to n bytes. */
 264 #define DECLARE_ALIGNED(n,t,v)       t v __attribute__ ((aligned (n)))
 265 #endif
 266
     /*
      * The two sections below each define PABS and FILTER_LINE_FUNC_NAME and
      * then include vf_yadif_template.h. filter_plane() refers to
      * filter_line_sse2 and filter_line_ssse3, so the template presumably
      * defines a line filter under the name given by FILTER_LINE_FUNC_NAME.
      * NOTE(review): when both sections are enabled, PABS and
      * FILTER_LINE_FUNC_NAME are defined twice with no #undef in this file;
      * presumably the template header undefines them - confirm.
      */
 267 // ================= SSE2 =================
 268 #if defined(USE_SSE2) && defined(ARCH_X86_64)
 269 #define PABS(tmp,dst) \
 270             "pxor     "#tmp", "#tmp" \n\t"\
 271             "psubw    "#dst", "#tmp" \n\t"\
 272             "pmaxsw   "#tmp", "#dst" \n\t"
 273
 274 #define FILTER_LINE_FUNC_NAME filter_line_sse2
 275 #include "vf_yadif_template.h"
 276 #endif
 277
 278 // ================ SSSE3 =================
     /* Gated by USE_SSE3, although the variant is named ssse3 and PABS uses pabsw. */
 279 #ifdef USE_SSE3
 280 #define PABS(tmp,dst) \
 281             "pabsw     "#dst", "#dst" \n\t"
 282
 283 #define FILTER_LINE_FUNC_NAME filter_line_ssse3
 284 #include "vf_yadif_template.h"
 285 #endif
 286
 287 #endif // GCC 4.2+
 288 #endif // GNUC, USE_SSE
 289
 290 static void filter_line_c(int mode, uint8_t *dst, const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w, int refs, int parity){
 291     int x;
 292     const uint8_t *prev2= parity ? prev : cur ;
 293     const uint8_t *next2= parity ? cur  : next;
 294     for(x=0; x<w; x++){
 295         int c= cur[-refs];
 296         int d= (prev2[0] + next2[0])>>1;
 297         int e= cur[+refs];
 298         int temporal_diff0= ABS(prev2[0] - next2[0]);
 299         int temporal_diff1=( ABS(prev[-refs] - c) + ABS(prev[+refs] - e) )>>1;
 300         int temporal_diff2=( ABS(next[-refs] - c) + ABS(next[+refs] - e) )>>1;
 301         int diff= MAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2);
 302         int spatial_pred= (c+e)>>1;
 303         int spatial_score= ABS(cur[-refs-1] - cur[+refs-1]) + ABS(c-e)
 304                          + ABS(cur[-refs+1] - cur[+refs+1]) - 1;
 305
 306 #define CHECK(j)\
 307     {   int score= ABS(cur[-refs-1+ j] - cur[+refs-1- j])\
 308                  + ABS(cur[-refs  + j] - cur[+refs  - j])\
 309                  + ABS(cur[-refs+1+ j] - cur[+refs+1- j]);\
 310         if(score < spatial_score){\
 311             spatial_score= score;\
 312             spatial_pred= (cur[-refs  + j] + cur[+refs  - j])>>1;\
 313
 314         CHECK(-1) CHECK(-2) }} }}
 315         CHECK( 1) CHECK( 2) }} }}
 316
 317         if(mode<2){
 318             int b= (prev2[-2*refs] + next2[-2*refs])>>1;
 319             int f= (prev2[+2*refs] + next2[+2*refs])>>1;
 320 #if 0
 321             int a= cur[-3*refs];
 322             int g= cur[+3*refs];
 323             int max= MAX3(d-e, d-c, MIN3(MAX(b-c,f-e),MAX(b-c,b-a),MAX(f-g,f-e)) );
 324             int min= MIN3(d-e, d-c, MAX3(MIN(b-c,f-e),MIN(b-c,b-a),MIN(f-g,f-e)) );
 325 #else
 326             int max= MAX3(d-e, d-c, MIN(b-c, f-e));
 327             int min= MIN3(d-e, d-c, MAX(b-c, f-e));
 328 #endif
 329
 330             diff= MAX3(diff, min, -max);
 331         }
 332
 333         if(spatial_pred > d + diff)
 334            spatial_pred = d + diff;
 335         else if(spatial_pred < d - diff)
 336            spatial_pred = d - diff;
 337
 338         dst[0] = spatial_pred;
 339
 340         dst++;
 341         cur++;
 342         prev++;
 343         next++;
 344         prev2++;
 345         next2++;
 346     }
 347 }
 348
 349 static void interpolate(uint8_t *dst, const uint8_t *cur0,  const uint8_t *cur2, int w)
 350 {
 351     int x;
 352     for (x=0; x<w; x++) {
 353         dst[x] = (cur0[x] + cur2[x] + 1)>>1; // simple average
 354     }
 355 }
 356
 357 void filter_plane(int mode, uint8_t *dst, int dst_stride, const uint8_t *prev0, const uint8_t *cur0, const uint8_t *next0, int refs, int w, int h, int parity, int tff, int cpu){
 358
 359         int y;
 360         filter_line = filter_line_c;
 361 #ifdef __GNUC__
 362 #if (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
 363 #ifdef USE_SSE3
 364         if (cpu & AVS_CPU_SSSE3)
 365                 filter_line = filter_line_ssse3;
 366         else
 367 #endif
 368 #if defined(USE_SSE2) && defined(ARCH_X86_64)
 369         if (cpu & AVS_CPU_SSE2)
 370                 filter_line = filter_line_sse2;
 371         else
 372 #endif
 373 #endif // GCC 4.2+
 374 #ifdef USE_SSE
 375         if (cpu & AVS_CPU_INTEGER_SSE)
 376                 filter_line = filter_line_mmx2;
 377 #endif
 378 #endif // GNUC
 379         y=0;
 380         if(((y ^ parity) & 1)){
 381             memcpy(dst, cur0 + refs, w);// duplicate 1
 382         }else{
 383             memcpy(dst, cur0, w);
 384         }
 385         y=1;
 386         if(((y ^ parity) & 1)){
 387             interpolate(dst + dst_stride, cur0, cur0 + refs*2, w);   // interpolate 0 and 2
 388         }else{
 389             memcpy(dst + dst_stride, cur0 + refs, w); // copy original
 390         }
 391         for(y=2; y<h-2; y++){
 392             if(((y ^ parity) & 1)){
 393                 const uint8_t *prev= prev0 + y*refs;
 394                 const uint8_t *cur = cur0 + y*refs;
 395                 const uint8_t *next= next0 + y*refs;
 396                 uint8_t *dst2= dst + y*dst_stride;
 397                 filter_line(mode, dst2, prev, cur, next, w, refs, (parity ^ tff));
 398             }else{
 399                 memcpy(dst + y*dst_stride, cur0 + y*refs, w); // copy original
 400             }
 401         }
 402        y=h-2;
 403         if(((y ^ parity) & 1)){
 404             interpolate(dst + (h-2)*dst_stride, cur0 + (h-3)*refs, cur0 + (h-1)*refs, w);   // interpolate h-3 and h-1
 405         }else{
 406             memcpy(dst + (h-2)*dst_stride, cur0 + (h-2)*refs, w); // copy original
 407         }
 408         y=h-1;
 409         if(((y ^ parity) & 1)){
 410             memcpy(dst + (h-1)*dst_stride, cur0 + (h-2)*refs, w); // duplicate h-2
 411         }else{
 412             memcpy(dst + (h-1)*dst_stride, cur0 + (h-1)*refs, w); // copy original
 413         }
 414
 415 #if defined(__GNUC__) && defined(USE_SSE)
 416         if (cpu >= AVS_CPU_INTEGER_SSE)
 417                 asm volatile("emms");
 418 #endif
 419 }
 420
 421 #if defined(__GNUC__) && defined(USE_SSE) && !defined(PIC)
 422 static attribute_align_arg void  YUY2ToPlanes_mmx(const unsigned char *srcYUY2, int pitch_yuy2, int width, int height,
 423                     unsigned char *py, int pitch_y,
 424                     unsigned char *pu, unsigned char *pv,  int pitch_uv)
 425 { /* process by 16 bytes (8 pixels), so width is assumed mod 8 */
 426     int widthdiv2 = width>>1;
 427 //    static unsigned __int64 Ymask = 0x00FF00FF00FF00FFULL;
 428     int h;
 429     for (h=0; h<height; h++)
 430     {
 431         asm (\
 432         "pcmpeqb %%mm5, %%mm5 \n\t"  /* prepare Ymask FFFFFFFFFFFFFFFF */\
 433         "psrlw $8, %%mm5 \n\t" /* Ymask = 00FF00FF00FF00FF */\
 434         "xor %%eax, %%eax \n\t"\
 435         "xloop%= : \n\t"\
 436         "prefetchnta 0xc0(%%edi,%%eax,4) \n\t"\
 437         "movq (%%edi,%%eax,4), %%mm0 \n\t" /* src VYUYVYUY - 1 */\
 438         "movq 8(%%edi,%%eax,4), %%mm1 \n\t" /* src VYUYVYUY - 2 */\
 439         "movq %%mm0, %%mm2 \n\t" /* VYUYVYUY - 1 */\
 440         "movq %%mm1, %%mm3 \n\t" /* VYUYVYUY - 2 */\
 441         "pand %%mm5, %%mm0 \n\t" /* 0Y0Y0Y0Y - 1 */\
 442         "psrlw $8, %%mm2 \n\t" /* 0V0U0V0U - 1 */\
 443         "pand %%mm5, %%mm1 \n\t" /* 0Y0Y0Y0Y - 2 */\
 444         "psrlw $8, %%mm3 \n\t" /* 0V0U0V0U - 2 */\
 445         "packuswb %%mm1, %%mm0 \n\t" /* YYYYYYYY */\
 446         "packuswb %%mm3, %%mm2 \n\t" /* VUVUVUVU */\
 447         "movntq %%mm0, (%%ebx,%%eax,2) \n\t" /* store y */\
 448         "movq %%mm2, %%mm4 \n\t" /* VUVUVUVU */\
 449         "pand %%mm5, %%mm2 \n\t" /* 0U0U0U0U */\
 450         "psrlw $8, %%mm4 \n\t" /* 0V0V0V0V */\
 451         "packuswb %%mm2, %%mm2 \n\t" /* xxxxUUUU */\
 452         "packuswb %%mm4, %%mm4 \n\t" /* xxxxVVVV */\
 453         "movd %%mm2, (%%edx,%%eax) \n\t" /* store u */\
 454         "add $4, %%eax \n\t" \
 455         "cmp %%ecx, %%eax \n\t" \
 456         "movd %%mm4, -4(%%esi,%%eax) \n\t" /* store v */\
 457         "jl xloop%= \n\t"\
 458         : : "D"(srcYUY2), "b"(py), "d"(pu), "S"(pv), "c"(widthdiv2) : "%eax");
 459
 460         srcYUY2 += pitch_yuy2;
 461         py += pitch_y;
 462         pu += pitch_uv;
 463         pv += pitch_uv;
 464     }
 465     asm ("sfence \n\t emms");
 466 }
 467
 468 static attribute_align_arg void YUY2FromPlanes_mmx(unsigned char *dstYUY2, int pitch_yuy2, int width, int height,
 469                     const unsigned char *py, int pitch_y,
 470                     const unsigned char *pu, const unsigned char *pv,  int pitch_uv)
 471 {
 472     int widthdiv2 = width >> 1;
 473     int h;
 474     for (h=0; h<height; h++)
 475     {
 476         asm (\
 477         "xor %%eax, %%eax \n\t"\
 478         "xloop%=: \n\t"\
 479         "movd (%%edx,%%eax), %%mm1 \n\t" /* 0000UUUU */\
 480         "movd (%%esi,%%eax), %%mm2 \n\t" /* 0000VVVV */\
 481         "movq (%%ebx,%%eax,2), %%mm0 \n\t" /* YYYYYYYY */\
 482         "punpcklbw %%mm2,%%mm1 \n\t" /* VUVUVUVU */\
 483         "movq %%mm0, %%mm3 \n\t" /* YYYYYYYY */\
 484         "punpcklbw %%mm1, %%mm0 \n\t" /* VYUYVYUY */\
 485         "add $4, %%eax \n\t"\
 486         "punpckhbw %%mm1, %%mm3 \n\t" /* VYUYVYUY */\
 487         "movntq %%mm0, -16(%%edi,%%eax,4) \n\t" /*store */\
 488         "movntq %%mm3, -8(%%edi,%%eax,4) \n\t" /*  store */\
 489         "cmp %%ecx, %%eax \n\t"\
 490         "jl xloop%= \n\t"\
 491         : : "b"(py), "d"(pu), "S"(pv), "D"(dstYUY2), "c"(widthdiv2) : "%eax");
 492         py += pitch_y;
 493         pu += pitch_uv;
 494         pv += pitch_uv;
 495         dstYUY2 += pitch_yuy2;
 496     }
 497     asm ("sfence \n\t emms");
 498 }
 499 #endif // GNUC, USE_SSE, !PIC
 500
 501 //----------------------------------------------------------------------------------------------
 502
 503 void YUY2ToPlanes(const unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
 504                                                            unsigned char * pSrcY, int srcPitchY,
 505                                                            unsigned char * pSrcU,  unsigned char * pSrcV, int srcPitchUV, int cpu)
 506 {
 507
 508     int h,w;
 509     int w0 = 0;
 510 #if defined(__GNUC__) && defined(USE_SSE) && !defined(PIC)
 511     if (cpu & AVS_CPU_INTEGER_SSE) {
 512         w0 = (nWidth/8)*8;
 513         YUY2ToPlanes_mmx(pSrcYUY2, nSrcPitchYUY2, w0, nHeight, pSrcY, srcPitchY, pSrcU, pSrcV, srcPitchUV);
 514     }
 515 #endif
 516         for (h=0; h<nHeight; h++)
 517         {
 518                 for (w=w0; w<nWidth; w+=2)
 519                 {
 520                         int w2 = w+w;
 521                         pSrcY[w] = pSrcYUY2[w2];
 522                         pSrcY[w+1] = pSrcYUY2[w2+2];
 523                         pSrcU[(w>>1)] = pSrcYUY2[w2+1];
 524                         pSrcV[(w>>1)] = pSrcYUY2[w2+3];
 525                 }
 526                 pSrcY += srcPitchY;
 527                 pSrcU += srcPitchUV;
 528                 pSrcV += srcPitchUV;
 529                 pSrcYUY2 += nSrcPitchYUY2;
 530         }
 531 }
 532
 533 //----------------------------------------------------------------------------------------------
 534
 535 void YUY2FromPlanes(unsigned char *pSrcYUY2, int nSrcPitchYUY2, int nWidth, int nHeight,
 536                                                           const unsigned char * pSrcY, int srcPitchY,
 537                                                           const unsigned char * pSrcU, const unsigned char * pSrcV, int srcPitchUV, int cpu)
 538 {
 539     int h,w;
 540     int w0 = 0;
 541 #if defined(__GNUC__) && defined(USE_SSE) && !defined(PIC)
 542     if (cpu & AVS_CPU_INTEGER_SSE) {
 543         w0 = (nWidth/8)*8;
 544         YUY2FromPlanes_mmx(pSrcYUY2, nSrcPitchYUY2, w0, nHeight, pSrcY, srcPitchY, pSrcU, pSrcV, srcPitchUV);
 545     }
 546 #endif
 547   for (h=0; h<nHeight; h++)
 548         {
 549                 for (w=w0; w<nWidth; w+=2)
 550                 {
 551                         int w2 = w+w;
 552                         pSrcYUY2[w2] = pSrcY[w];
 553                         pSrcYUY2[w2+1] = pSrcU[(w>>1)];
 554                         pSrcYUY2[w2+2] = pSrcY[w+1];
 555                         pSrcYUY2[w2+3] = pSrcV[(w>>1)];
 556                 }
 557                 pSrcY += srcPitchY;
 558                 pSrcU += srcPitchUV;
 559                 pSrcV += srcPitchUV;
 560                 pSrcYUY2 += nSrcPitchYUY2;
 561         }
 562 }