From 8d369600eca3fe3d059cfa8ba68d74ccdd883e33 Mon Sep 17 00:00:00 2001 From: Marco Costalba Date: Mon, 10 Aug 2009 14:23:19 +0100 Subject: [PATCH] Double prefetch on Windows After fixing the cpu frequency with RightMark tool I was able to test the speed of all the different prefetch combinations. Here are the results: OS Windows Vista 32bit, MSVC compile CPU Intel Core 2 Duo T5220 1.55 GHz bench on depth 12, 1 thread, 26552844 nodes searched results in nodes/sec no-prefetch 402486, 402005, 402767, 401439, 403060 single prefetch (aligned 64) 410145, 409159, 408078, 410443, 409652 double prefetch (aligned 64) 0+32 414739, 411238, 413937, 414641, 413834 double prefetch (aligned 64) 0+64 413537, 414337, 413537, 414842, 414240 And now also some crazy stuff: single prefetch (aligned 128) 410145, 407395, 406230, 410050, 409949 double prefetch (aligned 64) 0+0 409753, 410044, 409456 single prefetch (aligned 64) +32 408379, 408272, 406809 single prefetch (aligned 64) +64 408279, 409059, 407395 So it seems the best is a double prefetch at the address + 32 or +64, I will choose the second one because it seems more natural to me. It is still a mystery why it doesn't work under Linux :-( Signed-off-by: Marco Costalba --- src/tt.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tt.cpp b/src/tt.cpp index 0396b287..ffb8b012 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -174,12 +174,14 @@ TTEntry* TranspositionTable::retrieve(const Key posKey) const { /// blocking function and do not stalls the CPU waiting for data /// to be loaded from RAM, that can be very slow. When we will /// subsequently call retrieve() the TT data will be already -/// quickly accessible in L1/l2 CPU cache. +/// quickly accessible in L1/L2 CPU cache. 
void TranspositionTable::prefetch(const Key posKey) const { #if defined(_MSC_VER) - _mm_prefetch((char*)first_entry(posKey), _MM_HINT_T0); + char* addr = (char*)first_entry(posKey); + _mm_prefetch(addr, _MM_HINT_T0); + _mm_prefetch(addr+64, _MM_HINT_T0); #else // We need to force an asm volatile here because gcc builtin // is optimized away by Intel compiler. -- 2.39.2