/* mmx function declarations */
#ifdef USE_MMX
-guchar *pixops_scale_line_22_33_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init );
+guchar *pixops_scale_line_22_yuv_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init, int destx );
int pixops_have_mmx ( void );
#endif
#ifdef USE_MMX
static inline guchar *
-scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y,
+scale_line_22_yuv_mmx_stub ( int *weights, int n_x, int n_y,
guchar *dest, int dest_x, guchar *dest_end,
guchar **src,
int x_init, int x_step, int src_width )
mmx_weights[ j ][ 7 ] = 0x00010001 * ( weights[ 4 * j + 3 ] >> 8 );
}
- return pixops_scale_line_22_33_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init );
+ return pixops_scale_line_22_yuv_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init, dest_x );
}
#endif /* USE_MMX */
static inline guchar *
-scale_line_22_33 ( int *weights, int n_x, int n_y,
+scale_line_22_yuv ( int *weights, int n_x, int n_y,
guchar *dest, int dest_x, guchar *dest_end,
guchar **src,
int x_init, int x_step, int src_width )
/* process U/V */
x_aligned = ( ( x_scaled >> 1 ) << 2 );
q0 = src0 + x_aligned;
+ uv_index = ( ( dest_x & 1 ) << 1 );
+ //printf( "scale_line_22_yuv: %d %d\n", x_aligned + uv_index, dest_x );
+ p = w1 * q0[ uv_index + 1 ];
+ p += w2 * q0[ uv_index + 1 ];
+
+ x += x_step;
+ x_scaled = x >> SCALE_SHIFT;
+ dest_x++;
+
+ x_aligned = ( ( x_scaled >> 1 ) << 2 );
q1 = src1 + x_aligned;
uv_index = ( ( dest_x & 1 ) << 1 ) + 1;
- p = w1 * q0[ uv_index ];
- p += w2 * q0[ uv_index ];
p += w3 * q1[ uv_index ];
p += w4 * q1[ uv_index ];
*dest++ = ( p + 0x8000 ) >> SCALE_SHIFT;
- x += x_step;
- dest_x++;
}
return dest;
if ( filter.x.n == 2 && filter.y.n == 2 )
{
#ifdef USE_MMX
- if ( 0 && found_mmx )
- line_func = scale_line_22_33_mmx_stub;
+ if ( found_mmx )
+ {
+ //fprintf( stderr, "rescale: using mmx\n" );
+ line_func = scale_line_22_yuv_mmx_stub;
+ }
else
#endif
- line_func = scale_line_22_33;
+ line_func = scale_line_22_yuv;
}
else
line_func = scale_line;
- .file "scale_line_22_33_mmx.S"
+ .file "scale_line_22_yuv_mmx.S"
.version "01.01"
+
+.extern printf
+
gcc2_compiled.:
+.data
+MSG: .ascii "scale_line_22_yuv_mmx: %d %d\n"
+
.text
.align 16
#if !defined(__MINGW32__) && !defined(__CYGWIN__)
-.globl pixops_scale_line_22_33_mmx
- .type pixops_scale_line_22_33_mmx,@function
-pixops_scale_line_22_33_mmx:
+.globl pixops_scale_line_22_yuv_mmx
+ .type pixops_scale_line_22_yuv_mmx,@function
+pixops_scale_line_22_yuv_mmx:
#else
-.globl _pixops_scale_line_22_33_mmx
-_pixops_scale_line_22_33_mmx:
+.globl _pixops_scale_line_22_yuv_mmx
+_pixops_scale_line_22_yuv_mmx:
#endif
/*
* Arguments
*
* weights: 8(%ebp)
- * p: 12(%ebp) %esi
- * q1: 16(%ebp)
- * q2: 20(%ebp)
+ * p (dest): 12(%ebp) %esi
+ * q1 (src0): 16(%ebp)
+ * q2 (src1): 20(%ebp)
* xstep: 24(%ebp)
* p_end: 28(%ebp)
* xinit: 32(%ebp)
- *
+ * destx: 36(%ebp)
+ *
*/
/*
pushl %edi
pushl %esi
pushl %ebx
-/* Locals:
+/* Locals:
* int x %ebx
* int x_scaled -24(%ebp)
+ * int dest_x 36(%ebp)
*/
/*
* Setup
*/
-/* Initialize variables */
- movl 32(%ebp),%ebx
- movl 32(%ebp),%edx
- sarl $16,%edx
- movl 12(%ebp),%esi
+/* Initialize variables */
+ movl 36(%ebp),%eax # destx
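+ movl %eax,36(%ebp) # NOTE(review): stores destx back to the slot it was just loaded from -- appears to be a no-op, confirm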
+ movl 32(%ebp),%ebx # x
+ movl 12(%ebp),%esi # dest
- cmpl 28(%ebp),%esi
+ cmpl 28(%ebp),%esi # dest == dest_end ?
jnb .out
-/* For the body of this loop, %mm01, %mm1, %mm2, %mm3 hold the 4 adjoining
+/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
* points we are interpolating between, as:
*
- * 000000BB00GG00RR
- */
-
+ * 0000000000UV00YY
+ */
+
/* Load initial values into %mm1, %mm3 */
- leal (%edx,%edx,2),%edx # Multiply by 3
- movl 16(%ebp),%edi
- pxor %mm4, %mm4
- movzbl 2(%edi,%edx),%ecx
- shll $16,%ecx
- movzwl (%edi,%edx),%eax
- orl %eax,%ecx
+ /* x_scaled = ( x >> 16 ) * 2 -- the shift left by one doubles the pixel index; it is not a row stride */
+ movl %ebx, %edx
+ sarl $16,%edx
+ sall $1, %edx
+
+ /* load from src0 */
+ movl 16(%ebp), %edi
+ movzbl (%edi,%edx), %ecx
+
+ /* x_aligned = ( x >> 17 ) << 2, i.e. ( x >> 16 ) divided by 2 and multiplied by 4 */
+ movl %ebx, %edx
+ sarl $17, %edx
+ sall $2, %edx
+
+ /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1; */
+ movl 36(%ebp), %eax
+ andl $1, %eax
+ sall $1, %eax
+ addl %eax, %edx
+ movzbl 1(%edi,%edx), %eax
+ shll $8, %eax
+ orl %eax, %ecx
+
movd %ecx, %mm1
+ pxor %mm4, %mm4
punpcklbw %mm4, %mm1
- movl 20(%ebp),%edi
- movzbl 2(%edi,%edx),%ecx
- shll $16,%ecx
- movzwl (%edi,%edx),%eax
- orl %eax,%ecx
+ /* x_scaled = ( x >> 16 ) * 2 -- the shift left by one doubles the pixel index; it is not a row stride */
+ movl %ebx, %edx
+ sarl $16, %edx
+ sall $1, %edx
+
+ /* load from src1 */
+ movl 20(%ebp), %edi
+ movzbl (%edi,%edx), %ecx
+
+ /* x_aligned = ( x >> 17 ) << 2, i.e. ( x >> 16 ) divided by 2 and multiplied by 4 */
+ movl %ebx, %edx
+ sarl $17, %edx
+ sall $2, %edx
+
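+ /* uv_index = ( ( dest_x & 1 ) << 1 ) + 1;
+ * NOTE(review): the movzbl that follows uses displacement 0, whereas the
+ * matching src0 load uses 1(%edi,%edx), so the "+ 1" is not applied to
+ * this src1 load -- confirm whether that is intended. */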
+ movl 36(%ebp), %eax
+ andl $1, %eax
+ sall $1, %eax
+ addl %eax, %edx
+ movzbl (%edi,%edx), %eax
+ shll $8, %eax
+ orl %eax, %ecx
+
movd %ecx, %mm3
punpcklbw %mm4, %mm3
- addl $65536,%ebx
- movl %ebx,%edx
- sarl $16,%edx
+ /* dest_x++; */
+ movl 36(%ebp), %eax
+ addl $1, %eax
+ movl %eax, 36(%ebp)
+
+ /* x_scaled = x >> 16 */
+ addl $65536, %ebx
+ movl %ebx, %edx
+ sarl $16, %edx
+ movl %edx, -24(%ebp)
jmp .newx
.p2align 4,,7
paddw %mm6, %mm7
paddw %mm5, %mm7
-/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
+/* %mm7 holds the accumulated sum. Compute (C + 0x80) / 256
*/
pxor %mm4, %mm4
- movl $8421504, %eax # 0x00808080
- movd %eax, %mm6
+ movl $0x80808080, %eax
+ movd %eax, %mm6
punpcklbw %mm4, %mm6
paddw %mm6, %mm7
psrlw $8, %mm7
/* Pack into %eax and store result
- */
+ */
packuswb %mm7, %mm7
movd %mm7, %eax
-
- movb %al, (%esi)
+
+ movb %al, 0(%esi) # *dest = y
shrl $8, %eax
- movw %ax, 1(%esi)
- addl $3, %esi
-
- cmpl %esi,28(%ebp)
- je .out
-
-/* x += x_step; */
- addl 24(%ebp),%ebx
-/* x_scaled = x >> 16; */
- movl %ebx,%edx
- sarl $16,%edx
+ movb %al, 1(%esi) # dest[ 1 ] = uv
+
+ addl $2, %esi # dest += 2
+
+ cmpl %esi,28(%ebp) # if dest == dest_end ?
+ je .out # then exit
+
+ movl 36(%ebp), %eax # get dest_x
+ addl $1, %eax # dest_x++
+ movl %eax, 36(%ebp) # put dest_x
- cmpl %edx,-24(%ebp)
- je .loop
+ addl 24(%ebp), %ebx # x += x_step
+
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ movl %edx, -24(%ebp) # save x_scaled
.newx:
- movl %edx,-24(%ebp)
+
/*
* Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
*/
movq %mm1, %mm0
movq %mm3, %mm2
-
- leal (%edx,%edx,2),%edx # Multiply by 3
- movl 16(%ebp),%edi
- movzbl 2(%edi,%edx),%ecx
- shll $16,%ecx
- movzwl (%edi,%edx),%eax
- orl %eax,%ecx
- movd %ecx, %mm1
+ sall $1, %edx # x_scaled *= channels
+
+ movl 16(%ebp), %edi # get src0
+ movzbl (%edi,%edx), %ecx # y = src0[ x_scaled ]
+
+ sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
+ sall $2, %edx # << 2
+
+ movl 36(%ebp), %eax # uv_index = dest_x ...
+ #pushl %eax
+ andl $1, %eax # ( dest_x & 1 ) ...
+ sall $1, %eax # << 1
+ addl %eax, %edx # x_aligned += uv_index
+ #pushl %edx
+ #pushl $MSG
+ #call printf
+ #popl %edx
+ #popl %edx
+ #popl %edx
+ movzbl 1(%edi,%edx), %eax # uv = src0[ x_aligned + 1 ]
+ shll $8, %eax # store uv
+ orl %eax, %ecx
+
+ movd %ecx, %mm1 # move to mmx1
punpcklbw %mm4, %mm1
- movl 20(%ebp),%edi
- movzbl 2(%edi,%edx),%ecx
- shll $16,%ecx
- movzwl (%edi,%edx),%eax
- orl %eax,%ecx
- movd %ecx, %mm3
+ movl %ebx, %edx # x_scaled = x ...
+ sarl $16, %edx # >> 16
+ sall $1, %edx # x_scaled *= channels
+
+ movl 20(%ebp), %edi # get src1
+ movzbl (%edi,%edx), %ecx # y = src1[ x_scaled ]
+
+ sarl $2, %edx # x_aligned = ( x_scaled / channels ) >> 1 ...
+ sall $2, %edx # << 2
+
+ movl 36(%ebp), %eax # uv_index = dest_x ...
+ andl $1, %eax # ( dest_x & 1 ) ...
+ sall $1, %eax # << 1
+ addl %eax, %edx # x_aligned += uv_index
+ movzbl 1(%edi,%edx), %eax # uv = src1[ x_aligned + 1 ]
+ shll $8, %eax # store uv
+ orl %eax, %ecx
+
+ movd %ecx, %mm3 # move to mmx3
punpcklbw %mm4, %mm3
-
- movl 8(%ebp),%edi
+
+ movl 8(%ebp), %edi # get weights pointer
jmp .loop