/*****************************************************************************
 * video_yuv_mmx.S: YUV transformation, optimized for MMX processors
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 *
 * Authors:
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/

/*****************************************************************************
 * Following functions are defined:
 * ConvertYUV420RGB16MMX
 *            This function performs YUV12-to-RGB16 color conversion for H26x.
 *            It handles any format in which there are three fields, the low
 *            order field being B and fully contained in the low order byte, the
 *            second field being G and being somewhere in bits 4 through 11,
 *            and the high order field being R and fully contained in the high
 *            order byte.
 *
 *            The YUV12 input is planar, 8 bits per pel.  The Y plane may have
 *            a pitch of up to 768.  It may have a width less than or equal
 *            to the pitch.  It must be DWORD aligned, and preferably QWORD
 *            aligned.  Pitch and Width must be a multiple of four.  For best
 *            performance, Pitch should not be 4 more than a multiple of 32.
 *            Height may be any amount, but must be a multiple of two.  The U
 *            and V planes share a single chroma pitch, which may differ from
 *            the Y plane pitch and is subject to the same limitations.
 *****************************************************************************/

  43 //.include iammx.inc
  44 //.include locals.inc
  45
  46 .data
  47     .align 16
  48
  49 RGB_formats:
  50     .long  RGB565
  51     .long  RGB555
  52     .long  RGB664
  53     .long  RGB655
  54
  55 Minusg:             .long 0x00800080, 0x00800080
  56 Yadd:               .long 0x10101010, 0x10101010
  57 VtR:                .long 0x00660066, 0x00660066
  58 VtG:                .long 0x00340034, 0x00340034
  59 UtG:                .long 0x00190019, 0x00190019
  60 UtB:                .long 0x00810081, 0x00810081
  61 Ymul:               .long 0x004a004a, 0x004a004a
  62 UVtG:               .long 0x00340019, 0x00340019
  63 VtRUtB:             .long 0x01990205, 0x01990205
  64 fourbitu:           .quad 0xf0f0f0f0f0f0f0f0
  65 fivebitu:           .quad 0xe0e0e0e0e0e0e0e0
  66 sixbitu:            .quad 0xc0c0c0c0c0c0c0c0
  67
  68 .text
  69
  70 #define LocalFrameSize  156
  71 #define RegisterStorageSize  16
  72
  73 //#define DOUBLE /*double le nombre de colonnes */
  74
  75 /* Arguments: */
  76 #define YPlane                    LocalFrameSize + RegisterStorageSize +  4
  77 #define UPlane                    LocalFrameSize + RegisterStorageSize +  8
  78 #define VPlane                    LocalFrameSize + RegisterStorageSize + 12
  79 #define FrameWidth                LocalFrameSize + RegisterStorageSize + 16
  80 #define FrameHeight               LocalFrameSize + RegisterStorageSize + 20
  81 #define YPitch                    LocalFrameSize + RegisterStorageSize + 24
  82 #define ChromaPitch               LocalFrameSize + RegisterStorageSize + 28
  83 #define AspectAdjustmentCount     LocalFrameSize + RegisterStorageSize + 32
  84 #define ColorConvertedFrame       LocalFrameSize + RegisterStorageSize + 36
  85 #define DCIOffset                 LocalFrameSize + RegisterStorageSize + 40
  86 #define CCOffsetToLine0           LocalFrameSize + RegisterStorageSize + 44
  87 #define CCOPitch                  LocalFrameSize + RegisterStorageSize + 48
  88 #define CCType                    LocalFrameSize + RegisterStorageSize + 52
  89 #define EndOfArgList              LocalFrameSize + RegisterStorageSize + 56
  90
  91 /* Locals (on local stack frame) */
  92 #define CCOCursor        0
  93 #define CCOSkipDistance  4
  94 #define ChromaLineLen    8
  95 #define YCursor          12
  96 #define DistanceFromVToU 16
  97 #define EndOfChromaLine  20
  98 #define AspectCount      24
  99 #define AspectBaseCount  28
 100 #define tmpYCursorEven   32
 101 #define tmpYCursorOdd    36
 102 #define tmpCCOPitch      40
 103 #define temp_mmx         44
 104 #define RLeftShift       92
 105 #define GLeftShift       100
 106 #define RRightShift      108
 107 #define GRightShift      116
 108 #define BRightShift      124
 109 #define RUpperLimit      132
 110 #define GUpperLimit      140
 111 #define BUpperLimit      148
 112
 113 /*
 114  * extern void C ConvertYUV420RGB16MMX (
 115  *                                     U8* YPlane,
 116  *                                     U8* UPlane,
 117  *                                     U8* VPlane,
 118  *                                     UN  FrameWidth,
 119  *                                     UN  FrameHeight,
 120  *                                     UN  YPitch,
 121  *                                     UN  VPitch,
 122  *                                     UN  AspectAdjustmentCount,
 123  *                                     U8* ColorConvertedFrame,
 124  *                                     U32 DCIOffset,
 125  *                                     U32 CCOffsetToLine0,
 126  *                                     IN  CCOPitch,
 127  *                                     IN  CCType)
 128  *
 129  *  The local variables are on the stack,
 130  *  The tables are in the one and only data segment.
 131  *
 132  *  CCOffsetToLine0 is relative to ColorConvertedFrame.
 133  *  CCType  used by RGB color convertors to determine the exact conversion type.
 134  *    RGB565 = 0
 135  *    RGB555 = 1
 136  *    RGB664 = 2
 137  *    RGB655 = 3
 138  */
 139
 140 .globl ConvertYUV420RGB16MMX
 141 ConvertYUV420RGB16MMX:
 142   pushl      %esi
 143   pushl      %edi
 144
 145   pushl      %ebp
 146   pushl      %ebx
 147
 148   subl       $LocalFrameSize,%esp
 149   movl       CCType(%esp),%eax
 150   cmpl       $4,%eax
 151   jae        finish
 152
 153   jmp        *RGB_formats(,%eax,4)
 154
 155 RGB555:
 156   xorl       %eax,%eax
 157   movl       $2,%ebx                 /* 10-8 for byte shift */
 158   movl       %ebx,RLeftShift(%esp)
 159   movl       %eax,RLeftShift+4(%esp)
 160   movl       $5,%ebx
 161   movl       %ebx,GLeftShift(%esp)
 162   movl       %eax,GLeftShift+4(%esp)
 163   movl       $9,%ebx
 164   movl       %ebx,RRightShift(%esp)
 165   movl       %eax,RRightShift+4(%esp)
 166   movl       %ebx,GRightShift(%esp)
 167   movl       %eax,GRightShift+4(%esp)
 168   movl       %ebx,BRightShift(%esp)
 169   movl       %eax,BRightShift+4(%esp)
 170   movq       fivebitu,%mm0
 171   movq       %mm0,RUpperLimit(%esp)
 172   movq       %mm0,GUpperLimit(%esp)
 173   movq       %mm0,BUpperLimit(%esp)
 174   jmp        RGBEND
 175
 176 RGB664:
 177   xorl       %eax,%eax
 178   movl       $2,%ebx                 /* 8-6 */
 179   movl       %ebx,RLeftShift(%esp)
 180   movl       %eax,RLeftShift+4(%esp)
 181   movl       $4,%ebx
 182   movl       %ebx,GLeftShift(%esp)
 183   movl       %eax,GLeftShift+4(%esp)
 184   movl       $8,%ebx
 185   movl       %ebx,RRightShift(%esp)
 186   movl       %eax,RRightShift+4(%esp)
 187   movl       %ebx,GRightShift(%esp)
 188   movl       %eax,GRightShift+4(%esp)
 189   movl       $10,%ebx
 190   movl       %ebx,BRightShift(%esp)
 191   movl       %eax,BRightShift+4(%esp)
 192   movq       sixbitu,%mm0
 193   movq       %mm0,RUpperLimit(%esp)
 194   movq       %mm0,GUpperLimit(%esp)
 195   movq       fourbitu,%mm0
 196   movq       %mm0,BUpperLimit(%esp)
 197   jmp        RGBEND
 198
 199 RGB655:
 200   xorl       %eax,%eax
 201   movl       $2,%ebx                 /* 8-6 */
 202   movl       %ebx,RLeftShift(%esp)
 203   movl       %eax,RLeftShift+4(%esp)
 204   movl       $5,%ebx
 205   movl       %ebx,GLeftShift(%esp)
 206   movl       %eax,GLeftShift+4(%esp)
 207   movl       $8,%ebx
 208   movl       %ebx,RRightShift(%esp)
 209   movl       %eax,RRightShift+4(%esp)
 210   movl       $9,%ebx
 211   movl       %ebx,GRightShift(%esp)
 212   movl       %eax,GRightShift+4(%esp)
 213   movl       %ebx,BRightShift(%esp)
 214   movl       %eax,BRightShift+4(%esp)
 215   movq       sixbitu,%mm0
 216   movq       %mm0,RUpperLimit(%esp)
 217   movq       fivebitu,%mm0
 218   movq       %mm0,GUpperLimit(%esp)
 219   movq       %mm0,BUpperLimit(%esp)
 220   jmp        RGBEND
 221
 222 RGB565:
 223   xorl       %eax,%eax
 224   movl       $3,%ebx                 /* 8-5 */
 225   movl       %ebx,RLeftShift(%esp)
 226   movl       %eax,RLeftShift+4(%esp)
 227   movl       $5,%ebx
 228   movl       %ebx,GLeftShift(%esp)
 229   movl       %eax,GLeftShift+4(%esp)
 230   movl       $9,%ebx
 231   movl       %ebx,RRightShift(%esp)
 232   movl       %eax,RRightShift+4(%esp)
 233   movl       %ebx,BRightShift(%esp)
 234   movl       %eax,BRightShift+4(%esp)
 235   movl       $8,%ebx
 236   movl       %ebx,GRightShift(%esp)
 237   movl       %eax,GRightShift+4(%esp)
 238   movq       fivebitu,%mm0
 239   movq       %mm0,RUpperLimit(%esp)
 240   movq       %mm0,BUpperLimit(%esp)
 241   movq       sixbitu,%mm0
 242   movq       %mm0,GUpperLimit(%esp)
 243 //  jmp        RGBEND
 244
 245 RGBEND:
 246   movl       VPlane(%esp),%ebx
 247   movl       UPlane(%esp),%ecx
 248   subl       %ebx,%ecx
 249   movl       %ecx,DistanceFromVToU(%esp)
 250
 251   movl       ColorConvertedFrame(%esp),%eax
 252   addl       DCIOffset(%esp),%eax
 253   addl       CCOffsetToLine0(%esp),%eax
 254   movl       %eax,CCOCursor(%esp)
 255
 256
 257   movl       YPitch(%esp),%ecx
 258   movl       FrameWidth(%esp),%ebx
 259   movl       CCOPitch(%esp),%eax
 260   subl       %ebx,%eax                   /* CCOPitch-FrameWidth */
 261   subl       %ebx,%eax                   /* CCOPitch-2*FrameWidth */
 262   sarl       %ebx                        /* FrameWidth/2 */
 263   movl       YPlane(%esp),%esi           /* Fetch cursor over luma plane. */
 264   movl       %ebx,ChromaLineLen(%esp)    /* FrameWidth/2 */
 265   movl       %eax,CCOSkipDistance(%esp)  /* CCOPitch-3*FrameWidth */
 266   movl       %esi,YCursor(%esp)
 267   movl       AspectAdjustmentCount(%esp),%edx
 268   movl       VPlane(%esp),%esi
 269
 270   cmpl       $1,%edx
 271   je         finish
 272   movl       %edx,AspectCount(%esp)
 273   movl       %edx,AspectBaseCount(%esp)
 274   xorl       %eax,%eax
 275
 276   movl       ChromaLineLen(%esp),%edi
 277   movl       %edi,EndOfChromaLine(%esp)
 278   movl       CCOCursor(%esp),%edi
 279
 280   movl       DistanceFromVToU(%esp),%edx
 281   movl       YCursor(%esp),%ebp         /* Fetch Y Pitch. */
 282   movl       FrameWidth(%esp),%ebx
 283
 284   addl       %ebx,%ebp
 285   movl       %ebp,tmpYCursorEven(%esp)
 286   movl       YPitch(%esp),%eax
 287   addl       %eax,%ebp
 288   movl       %ebp,tmpYCursorOdd(%esp)
 289
 290   sarl       %ebx
 291   addl       %ebx,%esi
 292   addl       %esi,%edx
 293   negl       %ebx
 294   movl       %ebx,FrameWidth(%esp)
 295
 296 /*
 297  *  Register Usage:
 298  */
 299
 300 PrepareChromaLine:
 301   movl       AspectCount(%esp),%ebp
 302   movl       FrameWidth(%esp),%ebx
 303   subl       $2,%ebp
 304   movl       CCOPitch(%esp),%eax
 305   movl       %eax,tmpCCOPitch(%esp)
 306   ja         continue
 307
 308   xorl       %eax,%eax
 309   addl       AspectAdjustmentCount(%esp),%ebp
 310   movl       %eax,tmpCCOPitch(%esp)
 311 continue:
 312   movl       %ebp,AspectCount(%esp)
 313
 314 do_next_8x2_block:
 315   movl       tmpYCursorEven(%esp),%ebp
 316 /* here is even line */
 317   movd       (%edx,%ebx,),%mm1       /* 4 u values */
 318   pxor       %mm0,%mm0               /* mm0=0 */
 319   movd       (%esi,%ebx,),%mm2       /* 4 v values */
 320   punpcklbw  %mm0,%mm1               /* get 4 unsign u */
 321   psubw      Minusg,%mm1             /* get 4 unsign u-128 */
 322   punpcklbw  %mm0,%mm2               /* get unsign v */
 323   psubw      Minusg,%mm2             /* get unsign v-128 */
 324   movq       %mm1,%mm3               /* save the u-128 unsign */
 325   movq       %mm1,%mm5               /* save u-128 unsign */
 326   punpcklwd  %mm2,%mm1               /* get 2 low u, v unsign pairs */
 327   pmaddwd    UVtG,%mm1
 328   punpckhwd  %mm2,%mm3               /* create high 2 unsign uv pairs */
 329   pmaddwd    UVtG,%mm3
 330   movq       %mm2,temp_mmx(%esp)       /* save v-128 */
 331   movq       (%ebp,%ebx,2),%mm6      /* mm6 has 8 y pixels */
 332   psubusb    Yadd,%mm6               /* mm6 has 8 y-16 pixels */
 333   packssdw   %mm3,%mm1               /* packed the results to signed words */
 334   movq       %mm6,%mm7               /* save the 8 y-16 pixels */
 335   punpcklbw  %mm0,%mm6               /* mm6 has 4 low y-16 unsign */
 336   pmullw     Ymul,%mm6
 337   punpckhbw  %mm0,%mm7               /* mm7 has 4 high y-16 unsign */
 338   pmullw     Ymul,%mm7
 339   movq       %mm1,%mm4
 340   movq       %mm1,temp_mmx+8(%esp)     /* save 4 chroma G values */
 341   punpcklwd  %mm1,%mm1               /* chroma G replicate low 2 */
 342   movq       %mm6,%mm0               /* low  y */
 343   punpckhwd  %mm4,%mm4               /* chroma G replicate high 2 */
 344   movq       %mm7,%mm3               /* high y */
 345   psubw      %mm1,%mm6               /* 4 low G */
 346   psraw      GRightShift(%esp),%mm6
 347   psubw      %mm4,%mm7               /* 4 high G values in signed 16 bit */
 348   movq       %mm5,%mm2
 349   punpcklwd  %mm5,%mm5               /* replicate the 2 low u pixels */
 350   pmullw     UtB,%mm5
 351   punpckhwd  %mm2,%mm2
 352   psraw      GRightShift(%esp),%mm7
 353   pmullw     UtB,%mm2
 354   packuswb   %mm7,%mm6               /* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
 355   movq       %mm5,temp_mmx+16(%esp)    /* low chroma B */
 356   paddw      %mm0,%mm5               /* 4 low B values in signed 16 bit */
 357   movq       %mm2,temp_mmx+40(%esp)    /* high chroma B */
 358   paddw      %mm3,%mm2               /* 4 high B values in signed 16 bit */
 359   psraw      BRightShift(%esp),%mm5  /* low B scaled down by 6+(8-5) */
 360   psraw      BRightShift(%esp),%mm2  /* high B scaled down by 6+(8-5) */
 361   packuswb   %mm2,%mm5               /* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
 362
 363   movq       temp_mmx(%esp),%mm2       /* 4 v values */
 364   movq       %mm5,%mm1               /* save B */
 365   movq       %mm2,%mm7
 366   punpcklwd  %mm2,%mm2               /* replicate the 2 low v pixels */
 367   pmullw     VtR,%mm2
 368   punpckhwd  %mm7,%mm7
 369   pmullw     VtR,%mm7
 370   paddusb    BUpperLimit(%esp),%mm1  /* mm1: saturate B+0FF-15 */
 371   movq       %mm2,temp_mmx+24(%esp)    /* low chroma R */
 372   paddw      %mm0,%mm2               /* 4 low R values in signed 16 bit */
 373   psraw      RRightShift(%esp),%mm2  /* low R scaled down by 6+(8-5) */
 374   pxor       %mm4,%mm4               /* mm4=0 for 8-&gt;16 conversion */
 375   movq       %mm7,temp_mmx+32(%esp)    /* high chroma R */
 376   paddw      %mm3,%mm7               /* 4 high R values in signed 16 bit */
 377   psraw      RRightShift(%esp),%mm7  /* high R scaled down by 6+(8-5) */
 378   psubusb    BUpperLimit(%esp),%mm1
 379   packuswb   %mm7,%mm2               /* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
 380   paddusb    GUpperLimit(%esp),%mm6  /* G fast patch ih */
 381   psubusb    GUpperLimit(%esp),%mm6  /* fast patch ih */
 382   paddusb    RUpperLimit(%esp),%mm2  /* R */
 383   psubusb    RUpperLimit(%esp),%mm2
 384
 385 /*
 386  * here we are packing from RGB24 to RGB16
 387  * input:
 388  *         mm6: G7 G6 G5 G4 G3 G2 G1 G0
 389  *         mm1: B7 B6 B5 B4 B3 B2 B1 B0
 390  *         mm2: R7 R6 R5 R4 R3 R2 R1 R0
 391  * assuming 8 original pixels in 0-H representation on mm6, mm5, mm2
 392  * when  H=2**xBITS-1 (x is for R G B)
 393  * output:
 394  *        mm1- result: 4 low RGB16
 395  *        mm7- result: 4 high RGB16
 396  * using: mm0- zero register
 397  *        mm3- temporary results
 398  * algorithm:
 399  *   for (i=0; i&lt;8; i++) {
 400  *     RGB[i]=256*(R[i]&lt;&lt;(8-5))+(G[i]&lt;&lt;5)+B[i];
 401  *   }
 402  */
 403
 404   psllq      RLeftShift(%esp),%mm2   /* position R in the most significant
 405                                         part of the byte */
 406   movq       %mm1,%mm7               /* mm1: Save B */
 407
 408 /*
 409  * note: no need for shift to place B on the least significant part of the byte
 410  *   R in left position, B in the right position so they can be combined
 411  */
 412
 413   punpcklbw  %mm2,%mm1               /* mm1: 4 low 16 bit RB */
 414   pxor       %mm0,%mm0               /* mm0: 0 */
 415   punpckhbw  %mm2,%mm7               /* mm5: 4 high 16 bit RB */
 416   movq       %mm6,%mm3               /* mm3: G */
 417   punpcklbw  %mm0,%mm6               /* mm6: low 4 G 16 bit */
 418   psllw      GLeftShift(%esp),%mm6   /* shift low G 5 positions */
 419   punpckhbw  %mm0,%mm3               /* mm3: high 4 G 16 bit */
 420   por        %mm6,%mm1               /* mm1: low RBG16 */
 421   psllw      GLeftShift(%esp),%mm3   /* shift high G 5 positions */
 422   por        %mm3,%mm7               /* mm5: high RBG16 */
 423
 424   movl       tmpYCursorOdd(%esp),%ebp  /* moved to here to save cycles
 425                                            before odd line */
 426   movq       %mm1,(%edi)             /* !! aligned */
 427
 428 /*- start odd line */
 429   movq       (%ebp,%ebx,2),%mm1      /* mm1 has 8 y pixels */
 430   pxor       %mm2,%mm2
 431   psubusb    Yadd,%mm1               /* mm1 has 8 pixels y-16 */
 432   movq       %mm1,%mm5
 433   punpcklbw  %mm2,%mm1               /* get 4 low y-16 unsign pixels word */
 434   pmullw     Ymul,%mm1               /* low 4 luminance contribution */
 435   punpckhbw  %mm2,%mm5               /* 4 high y-16 */
 436   pmullw     Ymul,%mm5               /* high 4 luminance contribution */
 437   movq       %mm7,8(%edi)            /* !! aligned */
 438   movq       %mm1,%mm0
 439   paddw      temp_mmx+24(%esp),%mm0    /* low 4 R */
 440   movq       %mm5,%mm6
 441   psraw      RRightShift(%esp),%mm0  /* low R scaled down by 6+(8-5) */
 442   paddw      temp_mmx+32(%esp),%mm5    /* high 4 R */
 443   movq       %mm1,%mm2
 444   psraw      RRightShift(%esp),%mm5  /* high R scaled down by 6+(8-5) */
 445   paddw      temp_mmx+16(%esp),%mm2    /* low 4 B */
 446   packuswb   %mm5,%mm0               /* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
 447   psraw      BRightShift(%esp),%mm2  /* low B scaled down by 6+(8-5) */
 448   movq       %mm6,%mm5
 449   paddw      temp_mmx+40(%esp),%mm6    /* high 4 B */
 450   psraw      BRightShift(%esp),%mm6  /* high B scaled down by 6+(8-5) */
 451   movq       temp_mmx+8(%esp),%mm3     /* chroma G  low 4 */
 452   packuswb   %mm6,%mm2               /* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
 453   movq       %mm3,%mm4
 454   punpcklwd  %mm3,%mm3               /* replicate low 2 */
 455   punpckhwd  %mm4,%mm4               /* replicate high 2 */
 456   psubw      %mm3,%mm1               /* 4 low G */
 457   psraw      GRightShift(%esp),%mm1  /* low G scaled down by 6+(8-5) */
 458   psubw      %mm4,%mm5               /* 4 high G values in signed 16 bit */
 459   psraw      GRightShift(%esp),%mm5  /* high G scaled down by 6+(8-5) */
 460   paddusb    BUpperLimit(%esp),%mm2  /* mm1: saturate B+0FF-15 */
 461   packuswb   %mm5,%mm1               /*mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
 462   psubusb    BUpperLimit(%esp),%mm2
 463   paddusb    GUpperLimit(%esp),%mm1  /* G */
 464   psubusb    GUpperLimit(%esp),%mm1
 465   paddusb    RUpperLimit(%esp),%mm0  /* R */
 466   movl       tmpCCOPitch(%esp),%eax
 467   psubusb    RUpperLimit(%esp),%mm0
 468
 469 /*
 470  * here we are packing from RGB24 to RGB16
 471  *        mm1: G7 G6 G5 G4 G3 G2 G1 G0
 472  *        mm2: B7 B6 B5 B4 B3 B2 B1 B0
 473  *        mm0: R7 R6 R5 R4 R3 R2 R1 R0
 474  * output:
 475  *        mm2- result: 4 low RGB16
 476  *        mm7- result: 4 high RGB16
 477  * using: mm4- zero register
 478  *        mm3- temporary results
 479  */
 480
 481   psllq      RLeftShift(%esp),%mm0   /* position R in the most significant
 482                                         part of the byte */
 483   movq       %mm2,%mm7               /* mm7: Save B */
 484
 485 /*
 486  * note: no need for shift to place B on the least significant part of the byte
 487  *   R in left position, B in the right position so they can be combined
 488  */
 489
 490   punpcklbw  %mm0,%mm2               /* mm1: 4 low 16 bit RB */
 491   pxor       %mm4,%mm4               /* mm4: 0 */
 492   movq       %mm1,%mm3               /* mm3: G */
 493   punpckhbw  %mm0,%mm7               /* mm7: 4 high 16 bit RB */
 494   punpcklbw  %mm4,%mm1               /* mm1: low 4 G 16 bit */
 495   punpckhbw  %mm4,%mm3               /* mm3: high 4 G 16 bit */
 496   psllw      GLeftShift(%esp),%mm1   /* shift low G 5 positions */
 497   por        %mm1,%mm2               /* mm2: low RBG16 */
 498   psllw      GLeftShift(%esp),%mm3   /* shift high G 5 positions */
 499   por        %mm3,%mm7               /* mm7: high RBG16 */
 500 #ifdef DOUBLE
 501   movq       %mm2,%mm1
 502   movq       %mm7,%mm5
 503   movq       %mm2,%mm0
 504   movq       %mm7,%mm3
 505   punpckhwd  %mm2,%mm1
 506   punpckhwd  %mm7,%mm5
 507   punpcklwd  %mm2,%mm0
 508   punpcklwd  %mm7,%mm3
 509   movq       %mm0,(%edi,%eax,)
 510   movq       %mm1,8(%edi,%eax,)
 511   movq       %mm3,16(%edi,%eax,)
 512   movq       %mm5,24(%edi,%eax,)
 513   addl       $32,%edi
 514   addl       $4,%ebx
 515 #endif
 516 #ifndef DOUBLE
 517   movq       %mm2,(%edi,%eax,)
 518   movq       %mm7,8(%edi,%eax,)      /* aligned */
 519   addl       $16,%edi                /* ih take 16 bytes (8 pixels-16 bit) */
 520   addl       $4,%ebx                 /* ? to take 4 pixels together
 521                                         instead of 2 */
 522 #endif
 523   jl         do_next_8x2_block
 524
 525   addl       CCOSkipDistance(%esp),%edi /* go to begin of next line */
 526   addl       tmpCCOPitch(%esp),%edi     /* skip odd line (if it is needed) */
 527 // Leax       AspectCount
 528 // Lebp       CCOPitch               ; skip odd line
 529
 530 // sub        eax, 2
 531 // jg         @f
 532
 533 // Addeax     AspectBaseCount
 534 // xor        ebp, ebp
 535
 536 //@@:
 537 //  Seax       AspectCount
 538 //  add        edi, ebp
 539
 540   movl       YPitch(%esp),%eax
 541   movl       tmpYCursorOdd(%esp),%ebp
 542   addl       %eax,%ebp               /* skip one line */
 543 //  lea        ebp, [ebp+2*eax]        /* skip two lines */
 544   movl       %ebp,tmpYCursorEven(%esp)
 545 //  Sebp       tmpYCursorOdd
 546
 547   addl       %eax,%ebp               /* skip one line */
 548   movl       %ebp,tmpYCursorOdd(%esp)
 549 //  Lebp       tmpYCursorEven
 550 //  lea        ebp, [ebp+2*eax]
 551 //  Sebp       tmpYCursorEven
 552
 553
 554   addl       ChromaPitch(%esp),%esi
 555   addl       ChromaPitch(%esp),%edx
 556
 557
 558 //  Leax       YLimit                  /* Done with last line? */
 559 //  cmp        ebp, eax
 560 //  jbe        PrepareChromaLine
 561   subw       $2,FrameHeight(%esp)
 562   ja         PrepareChromaLine
 563
 564 /******************************************************************************/
 565
 566 finish:
 567   emms
 568   addl       $LocalFrameSize,%esp
 569
 570   popl       %ebx
 571   popl       %ebp
 572   popl       %edi
 573   popl       %esi
 574   ret
 575