git.sesse.net Git - vlc/blob - src/video_output/video_yuv_mmx.S

   1 /*****************************************************************************
   2  * video_yuv_mmx.S: YUV transformation, optimized for MMX processors
   3  *****************************************************************************
   4  * Copyright (C) 1999, 2000 VideoLAN
   5  *
   6  * Authors:
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public
  19  * License along with this program; if not, write to the
  20  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21  * Boston, MA 02111-1307, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Following functions are defined:
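  26  * ConvertYUV420RGB16MMX  (the label exported below; this header used to say
      *            vout_YUV420_16_MMX, which is not defined in this file)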
  27  *            This function performs YUV12-to-RGB16 color conversion for H26x.
  28  *            It handles any format in which there are three fields, the low
  29  *            order field being B and fully contained in the low order byte, the
  30  *            second field being G and being somewhere in bits 4 through 11,
  31  *            and the high order field being R and fully contained in the high
  32  *            order byte.
  33  *
  34  *            The YUV12 input is planar, 8 bits per pel.  The Y plane may have
  35  *            a pitch of up to 768.  It may have a width less than or equal
  36  *            to the pitch.  It must be DWORD aligned, and preferably QWORD
  37  *            aligned.  Pitch and Width must be a multiple of four.  For best
  38  *            performance, Pitch should not be 4 more than a multiple of 32.
  39  *            Height may be any amount, but must be a multiple of two.  The U
  40  *            and V planes may have a different pitch than the Y plane, subject
  41  *            to the same limitations.
  42  *****************************************************************************/
  43
  44 //.include iammx.inc
  45 //.include locals.inc
  46
  47 .data
  48     .align 16
  49
  50 RGB_formats:                    /* setup-code addresses, indexed by CCType */
  51     .long  RGB565               /* CCType 0 */
  52     .long  RGB555               /* CCType 1 */
  53     .long  RGB664               /* CCType 2 */
  54     .long  RGB655               /* CCType 3 */
  55
  56 Minusg:             .quad 0x0080008000800080   /* 4 words of 128: subtracted from U and V */
  57 Yadd:               .quad 0x1010101010101010   /* 8 bytes of 16: subtracted from Y (saturating) */
  58 VtR:                .quad 0x0066006600660066   /* 4 words of 102: V multiplier for R */
  59 VtG:                .quad 0x0034003400340034   /* 4 words of 52: not referenced by the code in this file */
  60 UtG:                .quad 0x0019001900190019   /* 4 words of 25: not referenced by the code in this file */
  61 UtB:                .quad 0x0081008100810081   /* 4 words of 129: U multiplier for B */
  62 Ymul:               .quad 0x004a004a004a004a   /* 4 words of 74: Y multiplier */
  63 UVtG:               .quad 0x0034001900340019   /* word pairs (25, 52): pmaddwd gives 25*u + 52*v for G */
  64 VtRUtB:             .quad 0x0199020501990205   /* not referenced by the code in this file */
  65 fourbitu:           .quad 0xf0f0f0f0f0f0f0f0   /* add-then-subtract (saturating) clamps a byte to 0..15 */
  66 fivebitu:           .quad 0xe0e0e0e0e0e0e0e0   /* same trick, clamps a byte to 0..31 */
  67 sixbitu:            .quad 0xc0c0c0c0c0c0c0c0   /* same trick, clamps a byte to 0..63 */
  68
  69 .text
  70
  71 #define LocalFrameSize  156
  72 #define RegisterStorageSize  16
     /*
      * LocalFrameSize is the number of bytes the function subtracts from %esp
      * for its locals (the last local, BUpperLimit, occupies bytes 148..155).
      * RegisterStorageSize covers the four registers pushed on entry
      * (%esi, %edi, %ebp, %ebx).
      *
      * All macros below are plain textual substitutions used as displacements,
      * e.g. YPlane(%esp) or RLeftShift+4(%esp); they are intentionally left
      * unparenthesised.
      */
  73
  74 //#define DOUBLE /* doubles the number of columns */
  75
  76 /* Arguments: offsets from %esp once the registers are pushed and the local
     * frame is allocated.  The extra 4 bytes in front of the first argument
     * are the return address. */
  77 #define YPlane                    LocalFrameSize + RegisterStorageSize +  4
  78 #define UPlane                    LocalFrameSize + RegisterStorageSize +  8
  79 #define VPlane                    LocalFrameSize + RegisterStorageSize + 12
  80 #define FrameWidth                LocalFrameSize + RegisterStorageSize + 16
  81 #define FrameHeight               LocalFrameSize + RegisterStorageSize + 20
  82 #define YPitch                    LocalFrameSize + RegisterStorageSize + 24
  83 #define ChromaPitch               LocalFrameSize + RegisterStorageSize + 28
  84 #define AspectAdjustmentCount     LocalFrameSize + RegisterStorageSize + 32
  85 #define ColorConvertedFrame       LocalFrameSize + RegisterStorageSize + 36
  86 #define DCIOffset                 LocalFrameSize + RegisterStorageSize + 40
  87 #define CCOffsetToLine0           LocalFrameSize + RegisterStorageSize + 44
  88 #define CCOPitch                  LocalFrameSize + RegisterStorageSize + 48
  89 #define CCType                    LocalFrameSize + RegisterStorageSize + 52
  90 #define EndOfArgList              LocalFrameSize + RegisterStorageSize + 56
  91
  92 /* Locals (on local stack frame).
     * Offsets 0..40 are 4-byte slots.  temp_mmx is 48 bytes, used by the
     * conversion loop as six 8-byte slots (temp_mmx+0 .. temp_mmx+40).
     * Each shift count and upper limit is 8 bytes because it is consumed as a
     * 64-bit MMX memory operand. */
  93 #define CCOCursor        0
  94 #define CCOSkipDistance  4
  95 #define ChromaLineLen    8
  96 #define YCursor          12
  97 #define DistanceFromVToU 16
  98 #define EndOfChromaLine  20
  99 #define AspectCount      24
 100 #define AspectBaseCount  28
 101 #define tmpYCursorEven   32
 102 #define tmpYCursorOdd    36
 103 #define tmpCCOPitch      40
 104 #define temp_mmx         44
 105 #define RLeftShift       92
 106 #define GLeftShift       100
 107 #define RRightShift      108
 108 #define GRightShift      116
 109 #define BRightShift      124
 110 #define RUpperLimit      132
 111 #define GUpperLimit      140
 112 #define BUpperLimit      148
 113
 114 /*
 115  * extern void C ConvertYUV420RGB16MMX (
 116  *                                     U8* YPlane,
 117  *                                     U8* UPlane,
 118  *                                     U8* VPlane,
 119  *                                     UN  FrameWidth,
 120  *                                     UN  FrameHeight,
 121  *                                     UN  YPitch,
 122  *                                     UN  VPitch,
 123  *                                     UN  AspectAdjustmentCount,
 124  *                                     U8* ColorConvertedFrame,
 125  *                                     U32 DCIOffset,
 126  *                                     U32 CCOffsetToLine0,
 127  *                                     IN  CCOPitch,
 128  *                                     IN  CCType)
 129  *
 130  *  The local variables are on the stack,
 131  *  The tables are in the one and only data segment.
 132  *
 133  *  CCOffsetToLine0 is relative to ColorConvertedFrame.
 134  *  CCType  used by RGB color convertors to determine the exact conversion type.
 135  *    RGB565 = 0
 136  *    RGB555 = 1
 137  *    RGB664 = 2
 138  *    RGB655 = 3
      *
      *  Notes derived from the code below:
      *   - All arguments are read from the stack; the function ends with a
      *     plain ret, so the caller removes them.
      *   - The VPitch argument is accessed through the ChromaPitch offset and
      *     is added to both the U and the V row pointers.
      *   - The output starts at ColorConvertedFrame + DCIOffset + CCOffsetToLine0.
      *   - A CCType of 4 or more (compared unsigned) returns without
      *     converting anything; so does AspectAdjustmentCount == 1.
      *   - The FrameWidth and FrameHeight argument slots on the stack are
      *     overwritten while the function runs.
      *   - %esi, %edi, %ebp and %ebx are saved and restored; %eax, %ecx, %edx
      *     and the MMX registers are modified.  emms is executed before
      *     returning.
 139  */
 140
 141 .globl ConvertYUV420RGB16MMX
 142 ConvertYUV420RGB16MMX:
 143   pushl      %esi
 144   pushl      %edi
 145
 146   pushl      %ebp
 147   pushl      %ebx
 148
 149   subl       $LocalFrameSize,%esp    /* room for the locals defined above */
 150   movl       CCType(%esp),%eax
 151   cmpl       $4,%eax                 /* RGB_formats has four entries */
 152   jae        finish                  /* unsigned test: rejects negative values too */
 153
 154   jmp        *RGB_formats(,%eax,4)   /* run the setup code for this pixel format */
 155
 156 RGB555:                                /* CCType 1: R, G and B all clamp to 5 bits */
 157   movq       fivebitu,%mm0
 158   movq       %mm0,RUpperLimit(%esp)
 159   movq       %mm0,GUpperLimit(%esp)
 160   movq       %mm0,BUpperLimit(%esp)
 161   xorl       %eax,%eax               /* high dword of every 64-bit shift count */
 162   movl       %eax,RLeftShift+4(%esp)
 163   movl       %eax,GLeftShift+4(%esp)
 164   movl       %eax,RRightShift+4(%esp)
 165   movl       %eax,GRightShift+4(%esp)
 166   movl       %eax,BRightShift+4(%esp)
 167   movl       $2,%ebx                 /* 10-8 for byte shift */
 168   movl       %ebx,RLeftShift(%esp)
 169   movl       $5,%ebx                 /* G sits above the 5 bits of B */
 170   movl       %ebx,GLeftShift(%esp)
 171   movl       $9,%ebx                 /* same right shift for all three channels */
 172   movl       %ebx,RRightShift(%esp)
 173   movl       %ebx,GRightShift(%esp)
 174   movl       %ebx,BRightShift(%esp)
 175   jmp        RGBEND
 176
 177 RGB664:                                /* CCType 2: R and G clamp to 6 bits, B to 4 */
 178   movq       sixbitu,%mm0
 179   movq       %mm0,RUpperLimit(%esp)
 180   movq       %mm0,GUpperLimit(%esp)
 181   movq       fourbitu,%mm0
 182   movq       %mm0,BUpperLimit(%esp)
 183   xorl       %eax,%eax               /* high dword of every 64-bit shift count */
 184   movl       %eax,RLeftShift+4(%esp)
 185   movl       %eax,GLeftShift+4(%esp)
 186   movl       %eax,RRightShift+4(%esp)
 187   movl       %eax,GRightShift+4(%esp)
 188   movl       %eax,BRightShift+4(%esp)
 189   movl       $2,%ebx                 /* 8-6 */
 190   movl       %ebx,RLeftShift(%esp)
 191   movl       $4,%ebx                 /* G sits above the 4 bits of B */
 192   movl       %ebx,GLeftShift(%esp)
 193   movl       $8,%ebx                 /* R and G share one right shift */
 194   movl       %ebx,RRightShift(%esp)
 195   movl       %ebx,GRightShift(%esp)
 196   movl       $10,%ebx
 197   movl       %ebx,BRightShift(%esp)
 198   jmp        RGBEND
 199
 200 RGB655:                                /* CCType 3: R clamps to 6 bits, G and B to 5 */
 201   movq       sixbitu,%mm0
 202   movq       %mm0,RUpperLimit(%esp)
 203   movq       fivebitu,%mm0
 204   movq       %mm0,GUpperLimit(%esp)
 205   movq       %mm0,BUpperLimit(%esp)
 206   xorl       %eax,%eax               /* high dword of every 64-bit shift count */
 207   movl       %eax,RLeftShift+4(%esp)
 208   movl       %eax,GLeftShift+4(%esp)
 209   movl       %eax,RRightShift+4(%esp)
 210   movl       %eax,GRightShift+4(%esp)
 211   movl       %eax,BRightShift+4(%esp)
 212   movl       $2,%ebx                 /* 8-6 */
 213   movl       %ebx,RLeftShift(%esp)
 214   movl       $5,%ebx                 /* G sits above the 5 bits of B */
 215   movl       %ebx,GLeftShift(%esp)
 216   movl       $8,%ebx
 217   movl       %ebx,RRightShift(%esp)
 218   movl       $9,%ebx                 /* G and B share one right shift */
 219   movl       %ebx,GRightShift(%esp)
 220   movl       %ebx,BRightShift(%esp)
 221   jmp        RGBEND
 222
 223 RGB565:                                /* CCType 0: R and B clamp to 5 bits, G to 6 */
 224   movq       fivebitu,%mm0
 225   movq       %mm0,RUpperLimit(%esp)
 226   movq       %mm0,BUpperLimit(%esp)
 227   movq       sixbitu,%mm0
 228   movq       %mm0,GUpperLimit(%esp)
 229   xorl       %eax,%eax               /* high dword of every 64-bit shift count */
 230   movl       %eax,RLeftShift+4(%esp)
 231   movl       %eax,GLeftShift+4(%esp)
 232   movl       %eax,RRightShift+4(%esp)
 233   movl       %eax,GRightShift+4(%esp)
 234   movl       %eax,BRightShift+4(%esp)
 235   movl       $3,%ebx                 /* 8-5 */
 236   movl       %ebx,RLeftShift(%esp)
 237   movl       $5,%ebx                 /* G sits above the 5 bits of B */
 238   movl       %ebx,GLeftShift(%esp)
 239   movl       $9,%ebx                 /* R and B share one right shift */
 240   movl       %ebx,RRightShift(%esp)
 241   movl       %ebx,BRightShift(%esp)
 242   movl       $8,%ebx
 243   movl       %ebx,GRightShift(%esp)
 244 //  jmp        RGBEND                 (not needed: execution falls through into RGBEND)
 245
 246 RGBEND:
     /*
      * Common setup, reached from every pixel-format block.  It computes the
      * output start address and the per-row pointers, then leaves the
      * following in registers for the conversion loop:
      *   %esi = VPlane + FrameWidth/2
      *   %edx = UPlane + FrameWidth/2
      *   %edi = output cursor
      *   %ebx = -(FrameWidth/2), also stored back into the FrameWidth
      *          argument slot
      * Returns early (via finish) when AspectAdjustmentCount is 1.
      */
 247   movl       VPlane(%esp),%ebx
 248   movl       UPlane(%esp),%ecx
 249   subl       %ebx,%ecx                   /* UPlane - VPlane */
 250   movl       %ecx,DistanceFromVToU(%esp)
 251
 252   movl       ColorConvertedFrame(%esp),%eax
 253   addl       DCIOffset(%esp),%eax
 254   addl       CCOffsetToLine0(%esp),%eax
 255   movl       %eax,CCOCursor(%esp)        /* address of the first output pixel */
 256
 257
 258   movl       YPitch(%esp),%ecx           /* value is not used again in this block */
 259   movl       FrameWidth(%esp),%ebx
 260   movl       CCOPitch(%esp),%eax
 261   subl       %ebx,%eax                   /* CCOPitch-FrameWidth */
 262   subl       %ebx,%eax                   /* CCOPitch-2*FrameWidth */
 263   sarl       %ebx                        /* FrameWidth/2 */
 264   movl       YPlane(%esp),%esi           /* Fetch cursor over luma plane. */
 265   movl       %ebx,ChromaLineLen(%esp)    /* FrameWidth/2 */
 266   movl       %eax,CCOSkipDistance(%esp)  /* CCOPitch-2*FrameWidth */
 267   movl       %esi,YCursor(%esp)
 268   movl       AspectAdjustmentCount(%esp),%edx
 269   movl       VPlane(%esp),%esi
 270
 271   cmpl       $1,%edx
 272   je         finish                      /* AspectAdjustmentCount == 1: convert nothing */
 273   movl       %edx,AspectCount(%esp)
 274   movl       %edx,AspectBaseCount(%esp)
 275   xorl       %eax,%eax
 276
 277   movl       ChromaLineLen(%esp),%edi
 278   movl       %edi,EndOfChromaLine(%esp)
 279   movl       CCOCursor(%esp),%edi        /* edi = output cursor */
 280
 281   movl       DistanceFromVToU(%esp),%edx
 282   movl       YCursor(%esp),%ebp         /* Fetch the luma cursor (start of Y plane). */
 283   movl       FrameWidth(%esp),%ebx
 284
 285   addl       %ebx,%ebp                  /* point just past the first luma row */
 286   movl       %ebp,tmpYCursorEven(%esp)
 287   movl       YPitch(%esp),%eax
 288   addl       %eax,%ebp                  /* same position, one luma row down */
 289   movl       %ebp,tmpYCursorOdd(%esp)
 290
 291   sarl       %ebx                       /* FrameWidth/2 */
 292   addl       %ebx,%esi                  /* esi = VPlane + FrameWidth/2 */
 293   addl       %esi,%edx                  /* edx = UPlane + FrameWidth/2 */
 294   negl       %ebx                       /* negative index, counted up towards 0 */
 295   movl       %ebx,FrameWidth(%esp)      /* overwrites the FrameWidth argument slot */
 296
 297 /*
 298  *  Register Usage:
      *    esi  V row pointer, biased by +FrameWidth/2
      *    edx  U row pointer, biased by +FrameWidth/2
      *    ebx  negative chroma index; starts at -(FrameWidth/2), +4 per block
      *    ebp  luma row pointer (end of the even or odd row), indexed with 2*ebx
      *    edi  output cursor for the even row
      *    eax  tmpCCOPitch: offset from the even output row to the odd one
      *
      *  PrepareChromaLine runs once per pair of luma rows.  It subtracts 2 from
      *  AspectCount; while the result stays above zero the odd output row is
      *  written CCOPitch bytes after the even one.  Otherwise
      *  AspectAdjustmentCount is added back and tmpCCOPitch becomes 0, so the
      *  odd row is stored over the even row and no extra output row is
      *  consumed for this pair.
 299  */
 300
 301 PrepareChromaLine:
 302   movl       AspectCount(%esp),%ebp
 303   movl       FrameWidth(%esp),%ebx      /* reload -(FrameWidth/2) saved during setup */
 304   subl       $2,%ebp
 305   movl       CCOPitch(%esp),%eax        /* movl leaves the flags from subl intact */
 306   movl       %eax,tmpCCOPitch(%esp)
 307   ja         continue                   /* unsigned: taken when no borrow and result != 0 */
 308
 309   xorl       %eax,%eax
 310   addl       AspectAdjustmentCount(%esp),%ebp
 311   movl       %eax,tmpCCOPitch(%esp)     /* 0: odd row lands on top of the even row */
 312 continue:
 313   movl       %ebp,AspectCount(%esp)
 314
 315 do_next_8x2_block:
     /*
      * One pass converts 8 pixels of the even luma row and 8 pixels of the odd
      * luma row, sharing 4 U and 4 V samples between the two rows.  The chroma
      * products computed for the even row are parked in temp_mmx and reused for
      * the odd row.  ebx advances by 4 per pass and the loop repeats while it
      * is still negative.
      *
      * temp_mmx slots:  +0  v-128              +8  chroma G term
      *                  +16 low chroma B       +40 high chroma B
      *                  +24 low chroma R       +32 high chroma R
      */
 316   movl       tmpYCursorEven(%esp),%ebp
 317 /* here is even line */
 318   movd       (%edx,%ebx,),%mm1       /* 4 u values */
 319   pxor       %mm0,%mm0               /* mm0=0 */
 320   movd       (%esi,%ebx,),%mm2       /* 4 v values */
 321   punpcklbw  %mm0,%mm1               /* get 4 unsign u */
 322   psubw      Minusg,%mm1             /* get 4 unsign u-128 */
 323   punpcklbw  %mm0,%mm2               /* get unsign v */
 324   psubw      Minusg,%mm2             /* get unsign v-128 */
 325   movq       %mm1,%mm3               /* save the u-128 unsign */
 326   movq       %mm1,%mm5               /* save u-128 unsign */
 327   punpcklwd  %mm2,%mm1               /* get 2 low u, v unsign pairs */
 328   pmaddwd    UVtG,%mm1               /* 25*u + 52*v for the 2 low pairs */
 329   punpckhwd  %mm2,%mm3               /* create high 2 unsign uv pairs */
 330   pmaddwd    UVtG,%mm3               /* 25*u + 52*v for the 2 high pairs */
 331   movq       %mm2,temp_mmx(%esp)       /* save v-128 */
 332   movq       (%ebp,%ebx,2),%mm6      /* mm6 has 8 y pixels */
 333   psubusb    Yadd,%mm6               /* mm6 has 8 y-16 pixels */
 334   packssdw   %mm3,%mm1               /* packed the results to signed words */
 335   movq       %mm6,%mm7               /* save the 8 y-16 pixels */
 336   punpcklbw  %mm0,%mm6               /* mm6 has 4 low y-16 unsign */
 337   pmullw     Ymul,%mm6
 338   punpckhbw  %mm0,%mm7               /* mm7 has 4 high y-16 unsign */
 339   pmullw     Ymul,%mm7
 340   movq       %mm1,%mm4
 341   movq       %mm1,temp_mmx+8(%esp)     /* save 4 chroma G values */
 342   punpcklwd  %mm1,%mm1               /* chroma G replicate low 2 */
 343   movq       %mm6,%mm0               /* low  y */
 344   punpckhwd  %mm4,%mm4               /* chroma G replicate high 2 */
 345   movq       %mm7,%mm3               /* high y */
 346   psubw      %mm1,%mm6               /* 4 low G */
 347   psraw      GRightShift(%esp),%mm6
 348   psubw      %mm4,%mm7               /* 4 high G values in signed 16 bit */
 349   movq       %mm5,%mm2
 350   punpcklwd  %mm5,%mm5               /* replicate the 2 low u pixels */
 351   pmullw     UtB,%mm5
 352   punpckhwd  %mm2,%mm2               /* replicate the 2 high u pixels */
 353   psraw      GRightShift(%esp),%mm7
 354   pmullw     UtB,%mm2
 355   packuswb   %mm7,%mm6               /* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
 356   movq       %mm5,temp_mmx+16(%esp)    /* low chroma B */
 357   paddw      %mm0,%mm5               /* 4 low B values in signed 16 bit */
 358   movq       %mm2,temp_mmx+40(%esp)    /* high chroma B */
 359   paddw      %mm3,%mm2               /* 4 high B values in signed 16 bit */
 360   psraw      BRightShift(%esp),%mm5  /* low B scaled down by BRightShift */
 361   psraw      BRightShift(%esp),%mm2  /* high B scaled down by BRightShift */
 362   packuswb   %mm2,%mm5               /* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
 363
 364   movq       temp_mmx(%esp),%mm2       /* the 4 v-128 values saved above */
 365   movq       %mm5,%mm1               /* save B */
 366   movq       %mm2,%mm7
 367   punpcklwd  %mm2,%mm2               /* replicate the 2 low v pixels */
 368   pmullw     VtR,%mm2
 369   punpckhwd  %mm7,%mm7               /* replicate the 2 high v pixels */
 370   pmullw     VtR,%mm7
 371   paddusb    BUpperLimit(%esp),%mm1  /* mm1: saturating add of the B limit ... */
 372   movq       %mm2,temp_mmx+24(%esp)    /* low chroma R */
 373   paddw      %mm0,%mm2               /* 4 low R values in signed 16 bit */
 374   psraw      RRightShift(%esp),%mm2  /* low R scaled down by RRightShift */
 375   pxor       %mm4,%mm4               /* mm4=0 for 8->16 conversion */
 376   movq       %mm7,temp_mmx+32(%esp)    /* high chroma R */
 377   paddw      %mm3,%mm7               /* 4 high R values in signed 16 bit */
 378   psraw      RRightShift(%esp),%mm7  /* high R scaled down by RRightShift */
 379   psubusb    BUpperLimit(%esp),%mm1  /* ... then subtract it: B clamped to its field width */
 380   packuswb   %mm7,%mm2               /* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
 381   paddusb    GUpperLimit(%esp),%mm6  /* G fast patch ih */
 382   psubusb    GUpperLimit(%esp),%mm6  /* fast patch ih */
 383   paddusb    RUpperLimit(%esp),%mm2  /* R */
 384   psubusb    RUpperLimit(%esp),%mm2
 385
 386 /*
 387  * here we are packing from RGB24 to RGB16
 388  * input:
 389  *         mm6: G7 G6 G5 G4 G3 G2 G1 G0
 390  *         mm1: B7 B6 B5 B4 B3 B2 B1 B0
 391  *         mm2: R7 R6 R5 R4 R3 R2 R1 R0
 392  * assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
 393  * when  H=2**xBITS-1 (x is for R G B)
 394  * output:
 395  *        mm1- result: 4 low RGB16
 396  *        mm7- result: 4 high RGB16
 397  * using: mm0- zero register
 398  *        mm3- temporary results
 399  * algorithm (shown for RGB565; the shifts come from RLeftShift/GLeftShift):
 400  *   for (i=0; i<8; i++) {
 401  *     RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
 402  *   }
 403  */
 404
 405   psllq      RLeftShift(%esp),%mm2   /* position R in the most significant
 406                                         part of the byte */
 407   movq       %mm1,%mm7               /* mm7: Save B */
 408
 409 /*
 410  * note: no need for shift to place B on the least significant part of the byte
 411  *   R in left position, B in the right position so they can be combined
 412  */
 413
 414   punpcklbw  %mm2,%mm1               /* mm1: 4 low 16 bit RB */
 415   pxor       %mm0,%mm0               /* mm0: 0 */
 416   punpckhbw  %mm2,%mm7               /* mm7: 4 high 16 bit RB */
 417   movq       %mm6,%mm3               /* mm3: G */
 418   punpcklbw  %mm0,%mm6               /* mm6: low 4 G 16 bit */
 419   psllw      GLeftShift(%esp),%mm6   /* shift low G by GLeftShift positions */
 420   punpckhbw  %mm0,%mm3               /* mm3: high 4 G 16 bit */
 421   por        %mm6,%mm1               /* mm1: low RBG16 */
 422   psllw      GLeftShift(%esp),%mm3   /* shift high G by GLeftShift positions */
 423   por        %mm3,%mm7               /* mm7: high RBG16 */
 424
 425   movl       tmpYCursorOdd(%esp),%ebp  /* moved to here to save cycles
 426                                            before odd line */
 427   movq       %mm1,(%edi)             /* !! aligned */
 428
 429 /*- start odd line */
 430   movq       (%ebp,%ebx,2),%mm1      /* mm1 has 8 y pixels */
 431   pxor       %mm2,%mm2
 432   psubusb    Yadd,%mm1               /* mm1 has 8 pixels y-16 */
 433   movq       %mm1,%mm5
 434   punpcklbw  %mm2,%mm1               /* get 4 low y-16 unsign pixels word */
 435   pmullw     Ymul,%mm1               /* low 4 luminance contribution */
 436   punpckhbw  %mm2,%mm5               /* 4 high y-16 */
 437   pmullw     Ymul,%mm5               /* high 4 luminance contribution */
 438   movq       %mm7,8(%edi)            /* !! aligned */
 439   movq       %mm1,%mm0
 440   paddw      temp_mmx+24(%esp),%mm0    /* low 4 R */
 441   movq       %mm5,%mm6
 442   psraw      RRightShift(%esp),%mm0  /* low R scaled down by RRightShift */
 443   paddw      temp_mmx+32(%esp),%mm5    /* high 4 R */
 444   movq       %mm1,%mm2
 445   psraw      RRightShift(%esp),%mm5  /* high R scaled down by RRightShift */
 446   paddw      temp_mmx+16(%esp),%mm2    /* low 4 B */
 447   packuswb   %mm5,%mm0               /* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
 448   psraw      BRightShift(%esp),%mm2  /* low B scaled down by BRightShift */
 449   movq       %mm6,%mm5
 450   paddw      temp_mmx+40(%esp),%mm6    /* high 4 B */
 451   psraw      BRightShift(%esp),%mm6  /* high B scaled down by BRightShift */
 452   movq       temp_mmx+8(%esp),%mm3     /* chroma G  low 4 */
 453   packuswb   %mm6,%mm2               /* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
 454   movq       %mm3,%mm4
 455   punpcklwd  %mm3,%mm3               /* replicate low 2 */
 456   punpckhwd  %mm4,%mm4               /* replicate high 2 */
 457   psubw      %mm3,%mm1               /* 4 low G */
 458   psraw      GRightShift(%esp),%mm1  /* low G scaled down by GRightShift */
 459   psubw      %mm4,%mm5               /* 4 high G values in signed 16 bit */
 460   psraw      GRightShift(%esp),%mm5  /* high G scaled down by GRightShift */
 461   paddusb    BUpperLimit(%esp),%mm2  /* mm2: saturating add of the B limit ... */
 462   packuswb   %mm5,%mm1               /*mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
 463   psubusb    BUpperLimit(%esp),%mm2  /* ... then subtract it: B clamped */
 464   paddusb    GUpperLimit(%esp),%mm1  /* G */
 465   psubusb    GUpperLimit(%esp),%mm1
 466   paddusb    RUpperLimit(%esp),%mm0  /* R */
 467   movl       tmpCCOPitch(%esp),%eax  /* offset of the odd output row (0 when it is dropped) */
 468   psubusb    RUpperLimit(%esp),%mm0
 469
 470 /*
 471  * here we are packing from RGB24 to RGB16
 472  *        mm1: G7 G6 G5 G4 G3 G2 G1 G0
 473  *        mm2: B7 B6 B5 B4 B3 B2 B1 B0
 474  *        mm0: R7 R6 R5 R4 R3 R2 R1 R0
 475  * output:
 476  *        mm2- result: 4 low RGB16
 477  *        mm7- result: 4 high RGB16
 478  * using: mm4- zero register
 479  *        mm3- temporary results
 480  */
 481
 482   psllq      RLeftShift(%esp),%mm0   /* position R in the most significant
 483                                         part of the byte */
 484   movq       %mm2,%mm7               /* mm7: Save B */
 485
 486 /*
 487  * note: no need for shift to place B on the least significant part of the byte
 488  *   R in left position, B in the right position so they can be combined
 489  */
 490
 491   punpcklbw  %mm0,%mm2               /* mm2: 4 low 16 bit RB */
 492   pxor       %mm4,%mm4               /* mm4: 0 */
 493   movq       %mm1,%mm3               /* mm3: G */
 494   punpckhbw  %mm0,%mm7               /* mm7: 4 high 16 bit RB */
 495   punpcklbw  %mm4,%mm1               /* mm1: low 4 G 16 bit */
 496   punpckhbw  %mm4,%mm3               /* mm3: high 4 G 16 bit */
 497   psllw      GLeftShift(%esp),%mm1   /* shift low G by GLeftShift positions */
 498   por        %mm1,%mm2               /* mm2: low RBG16 */
 499   psllw      GLeftShift(%esp),%mm3   /* shift high G by GLeftShift positions */
 500   por        %mm3,%mm7               /* mm7: high RBG16 */
 501 #ifdef DOUBLE
     /* DOUBLE build: every odd-row pixel is duplicated (each word unpacked
      * with itself) and 32 bytes are written.  NOTE(review): the even-row
      * stores above still write 16 bytes at (%edi) in this configuration. */
 502   movq       %mm2,%mm1
 503   movq       %mm7,%mm5
 504   movq       %mm2,%mm0
 505   movq       %mm7,%mm3
 506   punpckhwd  %mm2,%mm1
 507   punpckhwd  %mm7,%mm5
 508   punpcklwd  %mm2,%mm0
 509   punpcklwd  %mm7,%mm3
 510   movq       %mm0,(%edi,%eax,)
 511   movq       %mm1,8(%edi,%eax,)
 512   movq       %mm3,16(%edi,%eax,)
 513   movq       %mm5,24(%edi,%eax,)
 514   addl       $32,%edi
 515   addl       $4,%ebx
 516 #endif
 517 #ifndef DOUBLE
 518   movq       %mm2,(%edi,%eax,)
 519   movq       %mm7,8(%edi,%eax,)      /* aligned */
 520   addl       $16,%edi                /* ih take 16 bytes (8 pixels-16 bit) */
 521   addl       $4,%ebx                 /* 4 chroma samples (8 luma pixels) done;
 522                                         sets the flags tested by jl below */
 523 #endif
 524   jl         do_next_8x2_block       /* loop while the index is still negative */
 525
 526   addl       CCOSkipDistance(%esp),%edi /* go to begin of next line */
 527   addl       tmpCCOPitch(%esp),%edi     /* skip odd line (if it is needed) */
     /*
      * End of one pair of luma rows.  The two adds above move the output
      * cursor past the row padding (CCOSkipDistance) and, when the odd row was
      * written separately (tmpCCOPitch != 0), past that row as well.  The code
      * below advances both luma cursors by two rows and both chroma pointers
      * by one row, then loops while row pairs remain.
      *
      * The lines starting with // are the earlier Intel-syntax version, kept
      * for reference only.
      */
 528 // Leax       AspectCount
 529 // Lebp       CCOPitch               ; skip odd line
 530
 531 // sub        eax, 2
 532 // jg         @f
 533
 534 // Addeax     AspectBaseCount
 535 // xor        ebp, ebp
 536
 537 //@@:
 538 //  Seax       AspectCount
 539 //  add        edi, ebp
 540
 541   movl       YPitch(%esp),%eax
 542   movl       tmpYCursorOdd(%esp),%ebp
 543   addl       %eax,%ebp               /* odd row + 1 row = next even row */
 544 //  lea        ebp, [ebp+2*eax]        /* skip two lines */
 545   movl       %ebp,tmpYCursorEven(%esp)
 546 //  Sebp       tmpYCursorOdd
 547
 548   addl       %eax,%ebp               /* one more row = next odd row */
 549   movl       %ebp,tmpYCursorOdd(%esp)
 550 //  Lebp       tmpYCursorEven
 551 //  lea        ebp, [ebp+2*eax]
 552 //  Sebp       tmpYCursorEven
 553
 554
 555   addl       ChromaPitch(%esp),%esi  /* next V row */
 556   addl       ChromaPitch(%esp),%edx  /* next U row */
 557
 558
 559 //  Leax       YLimit                  /* Done with last line? */
 560 //  cmp        ebp, eax
 561 //  jbe        PrepareChromaLine
     /*
      * FrameHeight is a 32-bit argument, so count it down with a 32-bit
      * subtract.  The previous 16-bit subw only looked at the low half of the
      * slot: a height whose low 16 bits are zero stopped after a single row
      * pair.  For heights below 65536 the flags, and therefore the loop count,
      * are unchanged.  The counter lives in the caller's argument slot, which
      * is overwritten.
      */
 562   subl       $2,FrameHeight(%esp)
 563   ja         PrepareChromaLine       /* unsigned: continue while rows remain (result > 0) */
 564
 565 /******************************************************************************/
 566
 567 finish:
     /* Common exit, also used by the early-out paths (CCType out of range,
      * AspectAdjustmentCount == 1).  Nothing is returned in %eax. */
 568   emms                               /* leave MMX state so x87 code can run again */
 569   addl       $LocalFrameSize,%esp    /* release the local frame */
 570
 571   popl       %ebx                    /* restore in reverse order of the pushes on entry */
 572   popl       %ebp
 573   popl       %edi
 574   popl       %esi
 575   ret
 576