git.sesse.net Git - mlt/blob - src/modules/gtk2/scale_line_22_yuv_mmx.S

   1 /*
   2  * scale_line_22_yuv_mmx.S -- scale line in YUY2 format
   3  * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
   4  * Author: Dan Dennedy <dan@dennedy.org>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  */
  20         .file   "scale_line_22_yuv_mmx.S"
  21         .version        "01.01"
  22 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
  23         .section .note.GNU-stack,"",%progbits
  24 #endif
  25
  26 .extern printf
  27
  28 gcc2_compiled.:
  29 .data
  30 MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
  31
  32 .text
  33         .align 16
  34
  35 #if !defined(__MINGW32__) && !defined(__CYGWIN__)
  36
  37 .globl pixops_scale_line_22_yuv_mmx
  38         .type    pixops_scale_line_22_yuv_mmx,@function
  39 pixops_scale_line_22_yuv_mmx:
  40
  41 #else
  42
  43 .globl _pixops_scale_line_22_yuv_mmx
  44 _pixops_scale_line_22_yuv_mmx:
  45
  46 #endif
  47 /*
  48  * Arguments
  49  *
  50  * weights:          8(%ebp)
  51  * p (dest):    12(%ebp)        %esi
  52  * q1 (src0):   16(%ebp)
  53  * q2 (src1):   20(%ebp)
  54  * xstep:       24(%ebp)
  55  * p_end:       28(%ebp)
  56  * xinit:       32(%ebp)
  57  * dest_x:      36(%ebp)
  58  *
  59  */
  60
  61 /*
  62  * Function call entry
  63  */
  64         pushl %ebp
  65         movl %esp,%ebp
  66         subl $28,%esp
  67         pushl %edi
  68         pushl %esi
  69         pushl %ebx
  70 /* Locals:
  71  * int x                      %ebx
  72  * int x_scaled             -24(%ebp)
  73  * int dest_x               36(%ebp)
  74  */
  75
  76 /*
  77  * Setup
  78  */
  79 /* Initialize variables */
  80         movl 36(%ebp),%eax # destx
  81         movl %eax,36(%ebp)
  82         movl 32(%ebp),%ebx # x
  83         movl 12(%ebp),%esi # dest
  84
  85         cmpl 28(%ebp),%esi # dest == dest_end ?
  86         jnb  .out
  87
  88 /* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
  89  * points we are interpolating between, as:
  90  *
  91  *  00VV00Y200UU00Y1
  92  */
  93
  94         pxor %mm4, %mm4
  95 /*
  96  * Load next component values into mm1 (src0) and mm3 (src1)
  97  */
  98         movl %ebx, %eax          # x_scaled
  99         sarl $15, %eax
 100         andl $0xfffffffe, %eax
 101         movl %eax, %edx          # x_aligned
 102         andl $0xfffffffc, %edx
 103
 104         movl 16(%ebp), %edi      # get src0
 105         movl (%edi,%eax), %ecx   # get y
 106         andl $0x00ff00ff, %ecx   # mask off y
 107         movl (%edi,%edx), %eax   # get uv
 108         andl $0xff00ff00, %eax   # mask off uv
 109         orl %eax, %ecx           # composite y, uv
 110         movd %ecx, %mm1          # move to mmx1
 111         punpcklbw %mm4, %mm1
 112
 113         movl 20(%ebp), %edi      # get src1
 114         movl (%edi,%edx), %ecx   # get y
 115         andl $0x00ff00ff, %ecx   # mask off y
 116         movl (%edi,%edx), %eax   # get uv
 117         andl $0xff00ff00, %eax   # mask off uv
 118         orl %eax, %ecx           # composite y, uv
 119         movd %ecx, %mm3          # move to mmx3
 120         punpcklbw %mm4, %mm3
 121
 122         jmp .newx
 123
 124         .p2align 4,,7
 125 .loop:
 126
 127 /* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 128  *                                             16             4                  0xf            2     2
 129  */
 130         movl 8(%ebp), %edi       # get weights pointer
 131         movl %ebx, %eax
 132         andl $0xf000, %eax
 133         shrl $7, %eax
 134
 135 /* At this point, %edi holds weights. Load the 4 weights into
 136  * %mm4,%mm5,%mm6,%mm7, multiply and accumulate.
 137  */
 138         movq (%edi,%eax), %mm4
 139         pmullw %mm0, %mm4
 140         movq 8(%edi,%eax), %mm5
 141         pmullw %mm1, %mm5
 142         movq 16(%edi,%eax), %mm6
 143         pmullw %mm2,%mm6
 144         movq 24(%edi,%eax), %mm7
 145         pmullw %mm3,%mm7
 146
 147         paddw %mm4, %mm5
 148         paddw %mm6, %mm7
 149         paddw %mm5, %mm7
 150
 151 /* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
 152  */
 153         pxor %mm4, %mm4
 154         movl $0x80808080, %eax
 155         movd %eax, %mm6
 156         punpcklbw %mm4, %mm6
 157         paddw %mm6, %mm7
 158         psrlw $8, %mm7
 159
 160 /* Pack into %eax and store result
 161  */
 162         packuswb %mm7, %mm7
 163         movd %mm7, %eax
 164
 165         movb %al, (%esi)         # *dest = y
 166
 167         movl 36(%ebp), %ecx      # get dest_x
 168         andl $1, %ecx            # select u or v
 169         sall $1, %ecx            # determine offset
 170         addl $1, %ecx            # relative to x_aligned
 171         sall $3, %ecx            # offset * 8 bits/byte
 172
 173         movd %mm7, %eax
 174         shrl %cl, %eax
 175         movb %al, 1(%esi)        # *dest = uv
 176
 177         addl $2, %esi            # dest += 2
 178         cmpl %esi,28(%ebp)       # if dest == dest_end
 179         je   .out                # then exit
 180
 181         addl $1, 36(%ebp)        # dest_x++
 182
 183 .newx:
 184
 185         addl 24(%ebp), %ebx      # x += x_step
 186 /*
 187  * Load current component values into mm0 (src0) and mm2 (src1)
 188  */
 189         movq %mm1, %mm0
 190         movq %mm3, %mm2
 191
 192 /*
 193  * Load next component values into mm1 (src0) and mm3 (src1)
 194  */
 195         movl %ebx, %eax          # x_scaled
 196         sarl $15, %eax
 197         andl $0xfffffffe, %eax
 198         movl %eax, %edx          # x_aligned
 199         andl $0xfffffffc, %edx
 200
 201         movl 16(%ebp), %edi      # get src0
 202         movl (%edi,%eax), %ecx   # get y
 203         andl $0x00ff00ff, %ecx   # mask off y
 204         movl (%edi,%edx), %eax   # get uv
 205         andl $0xff00ff00, %eax   # mask off uv
 206         orl %eax, %ecx           # composite y, uv
 207         movd %ecx, %mm1          # move to mmx1
 208         punpcklbw %mm4, %mm1
 209
 210         movl 20(%ebp), %edi      # get src1
 211         movl (%edi,%edx), %ecx   # get y
 212         andl $0x00ff00ff, %ecx   # mask off y
 213         movl (%edi,%edx), %eax   # get uv
 214         andl $0xff00ff00, %eax   # mask off uv
 215         orl %eax, %ecx           # composite y, uv
 216         movd %ecx, %mm3          # move to mmx3
 217         punpcklbw %mm4, %mm3
 218
 219         jmp .loop
 220
 221 .out:
 222         movl %esi,%eax
 223         emms
 224         leal -40(%ebp),%esp
 225         popl %ebx
 226         popl %esi
 227         popl %edi
 228         movl %ebp,%esp
 229         popl %ebp
 230         ret