-static void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int softness )
+void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step ) /* Composites `width` pixels of src onto dest in place and ORs the derived per-pixel alpha into alpha_a. */
+{ /* dest/src advance 2 bytes per pixel, alpha_a/alpha_b 1 byte per pixel (presumably packed YUV 4:2:2 -- TODO confirm against callers). */
+ register int j = 0; /* index of the first pixel the scalar loop below still has to process */
+ register int mix; /* per-pixel mix value returned by calculate_mix() */
+
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+ if ( !luma && width > 7 ) /* SSE2 path is used only when no luma map is given and there are at least 8 pixels */
+ {
+ composite_line_yuv_sse2_simple(dest, src, width, alpha_b, alpha_a, weight);
+ j = width - width % 8; /* largest multiple of 8 <= width; presumably the pixel count the SSE2 routine handled -- TODO confirm */
+ dest += j * 2; /* skip the image bytes of those j pixels (two bytes per pixel) */
+ src += j * 2;
+ alpha_a += j; /* skip the matching alpha entries (one byte per pixel) */
+ alpha_b += j;
+ }
+#endif
+
+ for ( ; j < width; j ++ ) /* scalar loop: the whole line, or only the tail left after the SSE2 path */
+ {
+ mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step ); /* consumes one alpha_b byte per pixel */
+ *dest = sample_mix( *dest, *src++, mix ); /* first byte of the pixel, combined in place via sample_mix() */
+ dest++;
+ *dest = sample_mix( *dest, *src++, mix ); /* second byte of the pixel, same mix value */
+ dest++;
+ *alpha_a = ( mix >> 8 ) | *alpha_a; /* mix >> 8 (truncated to 8 bits on store) is OR-ed into the existing alpha_a byte */
+ alpha_a ++;
+ }
+}
+
+static void composite_line_yuv_or( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step )