; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
-%rep 16/mmsize
-%endif ; %1 == 8
+%assign %%repcnt 16/mmsize
+%else
+%assign %%repcnt 1
+%endif
+
+%rep %%repcnt
%if %1 == 8
%if ARCH_X86_32
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
- movsx cntr_reg, r1m ; FIXME should be fltsizem, but the assembler does the wrong thing b/c of SUB above
+ movsx cntr_reg, fltsizem
.filterloop_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
add r5, mmsize/2
sub wd, mmsize/2
-%if %1 == 8
+
%assign %%i %%i+2
%endrep
-%endif ; %1 == 8
jg .pixelloop
%if %1 == 8
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
+%if HAVE_AVX
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
+%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
+%if HAVE_AVX
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
+%endif