pw_8: times 4 dw 8
pw_32: times 4 dw 32
pw_64: times 4 dw 64
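+; 64 as a single dword, loaded by INIT_SHIFT to compute 64 - 8*misalign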
+sw_64: dd 64
SECTION .text
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+%macro PIXEL_AVG_SSE 1
+cglobal x264_pixel_avg2_w16_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w20_sse2, 6,7
+cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
sub r5d, 2
jg .height_loop
REP_RET
+%endmacro
+PIXEL_AVG_SSE sse2
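+; emit an SSE3 variant from the same macro body by temporarily aliasing movdqu
+; to lddqu, SSE3's unaligned load that is cheap across cacheline splits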
+%define movdqu lddqu
+PIXEL_AVG_SSE sse3
+%undef movdqu
+
+; Cacheline split code for processors with high latencies for loads
+; split over cache lines. See sad-a.asm for a more detailed explanation.
+; This particular instance is complicated by the fact that src1 and src2
+; can have different alignments. For simplicity and code size, only the
+; MMX cacheline workaround is used. As a result, in the case of SSE2
+; pixel_avg, the cacheline check function calls the SSE2 version if there
+; is no cacheline split, and the MMX workaround if there is.
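+;
+; The AVG_CACHELINE_CHECK stubs below tail-jump to the plain
+; x264_pixel_avg2_wN_<isa> function when the load cannot straddle a
+; cacheline, and to an x264_pixel_avg2_wN_cache_mmxext workaround when it can.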
+
+%macro INIT_SHIFT 2
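+ ; eax holds a source pointer; derive the two shift counts used to splice a
+ ; misaligned qword out of two aligned loads: %1 = 64-8*misalign (left shift),
+ ; %2 = 8*misalign (right shift)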
+ and eax, 7
+ shl eax, 3
+%ifdef PIC32
+ ; both versions work, but picgetgot is slower than gpr->mmx, which in turn is
+ ; slower than mem->mmx, so the GPR path is used only when PIC forces it
+ mov r2, 64
+ sub r2, eax
+ movd %2, eax
+ movd %1, r2
+%else
+ movd %1, [sw_64 GLOBAL]
+ movd %2, eax
+ psubw %1, %2
+%endif
+%endmacro
+%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
+cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
+ mov eax, r2m
+ and eax, 0x1f|(%2>>1)
+ cmp eax, (32-%1)|(%2>>1)
+ jle x264_pixel_avg2_w%1_%3
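+ ; e.g. width 16, cacheline 64: mask = 0x1f|32 = 0x3f, threshold = (32-16)|32 = 48,
+ ; so the jle is taken when (src1 & 0x3f) <= 48, i.e. the 16-byte load starting
+ ; at src1 stays inside one 64-byte cacheline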
+; w12 has no dedicated cacheline workaround: the w16 one is just as fast
+%if %1 == 12
+ jmp x264_pixel_avg2_w16_cache_mmxext
+%else
+ jmp x264_pixel_avg2_w%1_cache_mmxext
+%endif
+%endmacro
+
+%macro AVG_CACHELINE_START 0
+ %assign stack_offset 0
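+ ; eax arrives from the check stub with src1's low alignment bits intact;
+ ; set up shift counts for src1 (mm6/mm7) and src2 (mm4/mm5), align both
+ ; pointers down to 8 bytes, and turn r4 into an offset relative to r2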
+ INIT_SHIFT mm6, mm7
+ mov eax, r4m
+ INIT_SHIFT mm4, mm5
+ PROLOGUE 6,6,0
+ and r2, ~7
+ and r4, ~7
+ sub r4, r2
+.height_loop:
+%endmacro
+
+%macro AVG_CACHELINE_LOOP 2
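+ ; rebuild one misaligned qword from each source (two aligned loads, shifted by
+ ; the counts from INIT_SHIFT and ORed together), average with pavgb, and store
+ ; with %2 (movq, or movd for the 4-byte tail of w20)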
+ movq mm0, [r2+8+%1]
+ movq mm1, [r2+%1]
+ movq mm2, [r2+r4+8+%1]
+ movq mm3, [r2+r4+%1]
+ psllq mm0, mm6
+ psrlq mm1, mm7
+ psllq mm2, mm4
+ psrlq mm3, mm5
+ por mm0, mm1
+ por mm2, mm3
+ pavgb mm0, mm2
+ %2 [r0+%1], mm0
+%endmacro
+
+x264_pixel_avg2_w8_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w16_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w20_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ AVG_CACHELINE_LOOP 16, movd
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+%ifndef ARCH_X86_64
+AVG_CACHELINE_CHECK 8, 32, mmxext
+AVG_CACHELINE_CHECK 12, 32, mmxext
+AVG_CACHELINE_CHECK 16, 32, mmxext
+AVG_CACHELINE_CHECK 20, 32, mmxext
+AVG_CACHELINE_CHECK 16, 64, mmxext
+AVG_CACHELINE_CHECK 20, 64, mmxext
+%endif
+
+AVG_CACHELINE_CHECK 8, 64, mmxext
+AVG_CACHELINE_CHECK 12, 64, mmxext
+AVG_CACHELINE_CHECK 16, 64, sse2
+AVG_CACHELINE_CHECK 20, 64, sse2
;=============================================================================
; pixel copy
jg .height_loop
REP_RET
-cglobal x264_mc_copy_w16_sse2,5,7
+%macro COPY_W16_SSE2 2
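+; %1 = function name, %2 = load instruction (movdqu, lddqu or movdqa)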
+cglobal %1, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
-.height_loop
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+r3*2]
- movdqu xmm3, [r2+r6]
+.height_loop:
+ %2 xmm0, [r2]
+ %2 xmm1, [r2+r3]
+ %2 xmm2, [r2+r3*2]
+ %2 xmm3, [r2+r6]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+r1*2], xmm2
sub r4d, 4
jg .height_loop
REP_RET
+%endmacro
+
+COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
+; But with SSE3's lddqu the overhead is zero, so there's no reason not to include it.
+COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
+COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
%endif
ALIGN 4
-.height_loop
+.height_loop:
movd mm1, [r2+r3]
movd mm0, [r2]
mov r4d, r7m ; i_height
jmp .height_loop
-.finish
- REP_RET
-
ALIGN 4
-.mc1d
+.mc1d:
mov eax, r4d
or eax, r5d
and eax, 7
je .height_loop1_w8
ALIGN 4
-.height_loop1_w4
+.height_loop1_w4:
movd mm0, [r2+r5]
movd mm1, [r2]
punpcklbw mm0, mm3
add r0, r1
dec r4d
jnz .height_loop1_w4
+.finish:
REP_RET
ALIGN 4
-.height_loop1_w8
+.height_loop1_w8:
movq mm0, [r2+r5]
movq mm1, [r2]
movq mm2, mm0