git.sesse.net Git - casparcg/commitdiff
2.0. Updated tbb library.
author Ronag <Ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Tue, 2 Aug 2011 14:49:33 +0000 (14:49 +0000)
committer Ronag <Ronag@362d55ac-95cf-4e76-9f9a-cbaa9c17b72d>
Tue, 2 Aug 2011 14:49:33 +0000 (14:49 +0000)
89 files changed:
tbb/bin/ia32/vc10/irml/irml.dll
tbb/bin/ia32/vc10/irml/irml.pdb
tbb/bin/ia32/vc10/irml/irml_debug.dll
tbb/bin/ia32/vc10/irml/irml_debug.pdb
tbb/bin/ia32/vc10/irml_c/irml.dll
tbb/bin/ia32/vc10/irml_c/irml.pdb
tbb/bin/ia32/vc10/irml_c/irml_debug.dll
tbb/bin/ia32/vc10/irml_c/irml_debug.pdb
tbb/bin/ia32/vc10/tbb.dll
tbb/bin/ia32/vc10/tbb.pdb
tbb/bin/ia32/vc10/tbb_debug.dll
tbb/bin/ia32/vc10/tbb_debug.pdb
tbb/bin/ia32/vc10/tbb_preview.dll
tbb/bin/ia32/vc10/tbb_preview.pdb
tbb/bin/ia32/vc10/tbb_preview_debug.dll
tbb/bin/ia32/vc10/tbb_preview_debug.pdb
tbb/bin/ia32/vc10/tbbmalloc.dll
tbb/bin/ia32/vc10/tbbmalloc.pdb
tbb/bin/ia32/vc10/tbbmalloc_debug.dll
tbb/bin/ia32/vc10/tbbmalloc_debug.pdb
tbb/bin/ia32/vc10/tbbmalloc_proxy.dll
tbb/bin/ia32/vc10/tbbmalloc_proxy.pdb
tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.dll
tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.pdb
tbb/include/tbb/atomic.h
tbb/include/tbb/blocked_range2d.h
tbb/include/tbb/blocked_range3d.h
tbb/include/tbb/compat/ppl.h
tbb/include/tbb/compat/tuple
tbb/include/tbb/concurrent_hash_map.h
tbb/include/tbb/concurrent_priority_queue.h
tbb/include/tbb/concurrent_queue.h
tbb/include/tbb/concurrent_unordered_map.h
tbb/include/tbb/concurrent_unordered_set.h [new file with mode: 0644]
tbb/include/tbb/concurrent_vector.h
tbb/include/tbb/enumerable_thread_specific.h
tbb/include/tbb/flow_graph.h [new file with mode: 0644]
tbb/include/tbb/internal/_aggregator_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_concurrent_queue_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_concurrent_unordered_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_item_buffer_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_join_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_node_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_or_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_flow_graph_tagged_buffer_impl.h [new file with mode: 0644]
tbb/include/tbb/internal/_tbb_windef.h [new file with mode: 0644]
tbb/include/tbb/machine/gcc_generic.h
tbb/include/tbb/machine/ibm_aix51.h
tbb/include/tbb/machine/linux_ia32.h
tbb/include/tbb/machine/linux_ia64.h
tbb/include/tbb/machine/linux_intel64.h
tbb/include/tbb/machine/mac_ppc.h
tbb/include/tbb/machine/macos_common.h
tbb/include/tbb/machine/sunos_sparc.h
tbb/include/tbb/machine/windows_ia32.h
tbb/include/tbb/machine/windows_intel64.h
tbb/include/tbb/machine/xbox360_ppc.h
tbb/include/tbb/parallel_for.h
tbb/include/tbb/parallel_for_each.h
tbb/include/tbb/parallel_invoke.h
tbb/include/tbb/parallel_reduce.h
tbb/include/tbb/parallel_sort.h
tbb/include/tbb/queuing_mutex.h
tbb/include/tbb/queuing_rw_mutex.h
tbb/include/tbb/reader_writer_lock.h
tbb/include/tbb/runtime_loader.h [new file with mode: 0644]
tbb/include/tbb/spin_mutex.h
tbb/include/tbb/task.h
tbb/include/tbb/task_group.h
tbb/include/tbb/task_scheduler_init.h
tbb/include/tbb/tbb_config.h
tbb/include/tbb/tbb_machine.h
tbb/include/tbb/tbb_stddef.h
tbb/lib/ia32/vc10/irml/irml.lib
tbb/lib/ia32/vc10/irml/irml_debug.lib
tbb/lib/ia32/vc10/irml_c/irml.lib
tbb/lib/ia32/vc10/irml_c/irml_debug.lib
tbb/lib/ia32/vc10/tbb.def
tbb/lib/ia32/vc10/tbb.lib
tbb/lib/ia32/vc10/tbb_debug.lib
tbb/lib/ia32/vc10/tbb_preview.lib
tbb/lib/ia32/vc10/tbb_preview_debug.lib
tbb/lib/ia32/vc10/tbbmalloc.lib
tbb/lib/ia32/vc10/tbbmalloc_debug.lib
tbb/lib/ia32/vc10/tbbmalloc_proxy.lib
tbb/lib/ia32/vc10/tbbmalloc_proxy_debug.lib
tbb/lib/ia32/vc10/tbbproxy.lib [new file with mode: 0644]
tbb/lib/ia32/vc10/tbbproxy_debug.lib [new file with mode: 0644]

index 2bbcfbdcdb98c4f64eeac2eda3227809db1542a7..0d59f73de2afa5a89b4121f49666503749b6c249 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml/irml.dll and b/tbb/bin/ia32/vc10/irml/irml.dll differ
index 462f8a9b691db3f77c48eebb98e1f320bf09929c..07be7d1838c2524f07798537c45b6684b19c66ff 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml/irml.pdb and b/tbb/bin/ia32/vc10/irml/irml.pdb differ
index c07c6ad8ec23297bb7d85f50b6cf6843893d3ffb..5a93b62e5abd8bc43a506cb8e40049384cc436a5 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml/irml_debug.dll and b/tbb/bin/ia32/vc10/irml/irml_debug.dll differ
index 361fea89a9200297851ac9b0cdae064681d16dd4..1fc98b45b8cbbbfc22b3aa3d0539dd60fb9adade 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml/irml_debug.pdb and b/tbb/bin/ia32/vc10/irml/irml_debug.pdb differ
index 20e67ac910a87d25ab0f61beea8f19d54ae1fc54..7be2081603ccc112dd5f6b0c1996113f50f9b8c1 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml_c/irml.dll and b/tbb/bin/ia32/vc10/irml_c/irml.dll differ
index f4a40b963968fe4ffeec377865a98752a9e19bfd..adbdd639990096dded211603098d3263df0f451a 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml_c/irml.pdb and b/tbb/bin/ia32/vc10/irml_c/irml.pdb differ
index f1f0797eacd3cf4e0834990a2d7e03bdd86af523..ab336fc072aeb7d986bb757ae36a79f381e4c1ea 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml_c/irml_debug.dll and b/tbb/bin/ia32/vc10/irml_c/irml_debug.dll differ
index 57fa7b0e490b72213cf51f74f2ebdb610d60c4e8..a5ced025fff1f642f08951905ab1f951b51cbaeb 100644 (file)
Binary files a/tbb/bin/ia32/vc10/irml_c/irml_debug.pdb and b/tbb/bin/ia32/vc10/irml_c/irml_debug.pdb differ
index 5f4f57a8c4c3d81ae97c89716a1d8329fd9ca928..036412e7622944c24253653c64693b9b466e70a6 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb.dll and b/tbb/bin/ia32/vc10/tbb.dll differ
index 10581cf8dd864fa0d080e8583892380095c19059..2a77e02e9d660a49841d2b66786d0b9930c9ae5c 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb.pdb and b/tbb/bin/ia32/vc10/tbb.pdb differ
index f71394d7863ac2a459b896d00db899d2e3fa5dde..73f9321075449a32df98a12fe3041444e27054b3 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_debug.dll and b/tbb/bin/ia32/vc10/tbb_debug.dll differ
index e3f4cc31ef1bac3265845593c7ebde19c87b2015..dc9b15f73b836955239dfec32e986aef1aca549a 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_debug.pdb and b/tbb/bin/ia32/vc10/tbb_debug.pdb differ
index 0cf5262aeeb568c8b8767c090203e3991c2c046a..a54f107c74eb899f52555432f885d2ed89616752 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_preview.dll and b/tbb/bin/ia32/vc10/tbb_preview.dll differ
index 75215bf13e5a12fde2d865358745e7f8d8158d4f..0a21659e4fde7bb7930bd336ada8f3824c3d675d 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_preview.pdb and b/tbb/bin/ia32/vc10/tbb_preview.pdb differ
index 878e3f0f016f7b87eafa2520868da61c25fd35b2..21aeeb5697e332583b363156c0c88cf98fd9b545 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_preview_debug.dll and b/tbb/bin/ia32/vc10/tbb_preview_debug.dll differ
index f5939181d42d508bcb8e5a9b2596ed1cfc74c07f..8898de1dcc806b1a82520ad13d9836e158d3e90b 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbb_preview_debug.pdb and b/tbb/bin/ia32/vc10/tbb_preview_debug.pdb differ
index 60e7c40eec76eabd241be2038da90ebc6ab71f65..d112dcca923813250d2c1118eb2dc87ef45c452c 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc.dll and b/tbb/bin/ia32/vc10/tbbmalloc.dll differ
index 0c132833cb19899e932750b58627cce5f7488b14..100afd06416c1c1d2481940ff9543c374a472f51 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc.pdb and b/tbb/bin/ia32/vc10/tbbmalloc.pdb differ
index bc94d62f7fd671525c2a31256cf27f63c997d032..ad6402975f89eacacace48c57a2cf304dc4d4ed4 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_debug.dll and b/tbb/bin/ia32/vc10/tbbmalloc_debug.dll differ
index 310d827ae601332a951622472504d82de6166c23..895908cc6cebd25967adc3370d9395d6da682b53 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_debug.pdb and b/tbb/bin/ia32/vc10/tbbmalloc_debug.pdb differ
index eae894f3fc76104ec8d9d4b126d4a7eac98bad53..41dbc4f79a1a84c2c1a75804914bcaa1af75f7cf 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_proxy.dll and b/tbb/bin/ia32/vc10/tbbmalloc_proxy.dll differ
index cb072eb144abee1bb087886746ea83fbca847139..479186fea55574f7ed49b939eda2b6197afdad63 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_proxy.pdb and b/tbb/bin/ia32/vc10/tbbmalloc_proxy.pdb differ
index 439bdfed6693d8b2883a2bd0877555a4a9e9989a..285ef80b6adc989094b5ee9cbf90e6b65d048b0b 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.dll and b/tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.dll differ
index 8150fc5dbb6952f71add9b4a8b9b46aa3e7de70d..025b8b45370f284b808d78cde013eea766f27baa 100644 (file)
Binary files a/tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.pdb and b/tbb/bin/ia32/vc10/tbbmalloc_proxy_debug.pdb differ
index d3600899f8a5c55177bc92727f89c8c8baf24a10..5257ff59ee07f9ae30d8f60d5e833a66d03ffeac 100644 (file)
@@ -50,24 +50,26 @@ namespace tbb {
 
 //! Specifies memory fencing.
 enum memory_semantics {
-    //! For internal use only.
-    __TBB_full_fence,
+    //! Sequentially consistent fence.
+    full_fence,
     //! Acquire fence
     acquire,
     //! Release fence
-    release
+    release,
+    //! No ordering
+    relaxed
 };
 
 //! @cond INTERNAL
 namespace internal {
 
-#if __GNUC__ || __SUNPRO_CC || __IBMCPP__
-#define __TBB_DECL_ATOMIC_FIELD(t,f,a) t f  __attribute__ ((aligned(a)));
-#elif defined(__INTEL_COMPILER)||_MSC_VER >= 1300
-#define __TBB_DECL_ATOMIC_FIELD(t,f,a) __declspec(align(a)) t f;
+#if __TBB_ATTRIBUTE_ALIGNED_PRESENT
+    #define __TBB_DECL_ATOMIC_FIELD(t,f,a) t f  __attribute__ ((aligned(a)));
+#elif __TBB_DECLSPEC_ALIGN_PRESENT
+    #define __TBB_DECL_ATOMIC_FIELD(t,f,a) __declspec(align(a)) t f;
 #else 
-#error Do not know syntax for forcing alignment.
-#endif /* __GNUC__ */
+    #error Do not know syntax for forcing alignment.
+#endif
 
 template<size_t S>
 struct atomic_rep;           // Primary template declared, but never defined.
@@ -103,58 +105,82 @@ struct atomic_rep<8> {       // Specialization
 template<size_t Size, memory_semantics M>
 struct atomic_traits;        // Primary template declared, but not defined.
 
-#define __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(S,M)                         \
-    template<> struct atomic_traits<S,M> {                               \
-        typedef atomic_rep<S>::word word;                               \
-        inline static word compare_and_swap( volatile void* location, word new_value, word comparand ) {\
-            return __TBB_CompareAndSwap##S##M(location,new_value,comparand);    \
-        }                                                                       \
-        inline static word fetch_and_add( volatile void* location, word addend ) { \
-            return __TBB_FetchAndAdd##S##M(location,addend);                    \
-        }                                                                       \
-        inline static word fetch_and_store( volatile void* location, word value ) {\
-            return __TBB_FetchAndStore##S##M(location,value);                   \
-        }                                                                       \
+#define __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(S,M)                                                         \
+    template<> struct atomic_traits<S,M> {                                                               \
+        typedef atomic_rep<S>::word word;                                                                \
+        inline static word compare_and_swap( volatile void* location, word new_value, word comparand ) { \
+            return __TBB_machine_cmpswp##S##M(location,new_value,comparand);                             \
+        }                                                                                                \
+        inline static word fetch_and_add( volatile void* location, word addend ) {                       \
+            return __TBB_machine_fetchadd##S##M(location,addend);                                        \
+        }                                                                                                \
+        inline static word fetch_and_store( volatile void* location, word value ) {                      \
+            return __TBB_machine_fetchstore##S##M(location,value);                                       \
+        }                                                                                                \
     };
 
-#define __TBB_DECL_ATOMIC_PRIMITIVES(S)                                  \
-    template<memory_semantics M>                                         \
-    struct atomic_traits<S,M> {                                          \
-        typedef atomic_rep<S>::word word;                               \
-        inline static word compare_and_swap( volatile void* location, word new_value, word comparand ) {\
-            return __TBB_CompareAndSwap##S(location,new_value,comparand);       \
-        }                                                                       \
-        inline static word fetch_and_add( volatile void* location, word addend ) { \
-            return __TBB_FetchAndAdd##S(location,addend);                       \
-        }                                                                       \
-        inline static word fetch_and_store( volatile void* location, word value ) {\
-            return __TBB_FetchAndStore##S(location,value);                      \
-        }                                                                       \
+#define __TBB_DECL_ATOMIC_PRIMITIVES(S)                                                                  \
+    template<memory_semantics M>                                                                         \
+    struct atomic_traits<S,M> {                                                                          \
+        typedef atomic_rep<S>::word word;                                                                \
+        inline static word compare_and_swap( volatile void* location, word new_value, word comparand ) { \
+            return __TBB_machine_cmpswp##S(location,new_value,comparand);                                \
+        }                                                                                                \
+        inline static word fetch_and_add( volatile void* location, word addend ) {                       \
+            return __TBB_machine_fetchadd##S(location,addend);                                           \
+        }                                                                                                \
+        inline static word fetch_and_store( volatile void* location, word value ) {                      \
+            return __TBB_machine_fetchstore##S(location,value);                                          \
+        }                                                                                                \
     };
 
-#if __TBB_DECL_FENCED_ATOMICS
-__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(1,__TBB_full_fence)
-__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(2,__TBB_full_fence)
-__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(4,__TBB_full_fence)
-__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(8,__TBB_full_fence)
+template<memory_semantics M>
+struct atomic_load_store_traits;    // Primary template declaration
+
+#define __TBB_DECL_ATOMIC_LOAD_STORE_PRIMITIVES(M)                      \
+    template<> struct atomic_load_store_traits<M> {                     \
+        template <typename T>                                           \
+        inline static T load( const volatile T& location ) {            \
+            return __TBB_load_##M( location );                          \
+        }                                                               \
+        template <typename T>                                           \
+        inline static void store( volatile T& location, T value ) {     \
+            __TBB_store_##M( location, value );                         \
+        }                                                               \
+    }
+
+#if __TBB_USE_FENCED_ATOMICS
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(1,full_fence)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(2,full_fence)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(4,full_fence)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(1,acquire)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(2,acquire)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(4,acquire)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(1,release)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(2,release)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(4,release)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(1,relaxed)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(2,relaxed)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(4,relaxed)
 #if __TBB_64BIT_ATOMICS
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(8,full_fence)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(8,acquire)
 __TBB_DECL_FENCED_ATOMIC_PRIMITIVES(8,release)
+__TBB_DECL_FENCED_ATOMIC_PRIMITIVES(8,relaxed)
 #endif
-#else
+#else /* !__TBB_USE_FENCED_ATOMICS */
 __TBB_DECL_ATOMIC_PRIMITIVES(1)
 __TBB_DECL_ATOMIC_PRIMITIVES(2)
 __TBB_DECL_ATOMIC_PRIMITIVES(4)
 #if __TBB_64BIT_ATOMICS
 __TBB_DECL_ATOMIC_PRIMITIVES(8)
 #endif
-#endif
+#endif /* !__TBB_USE_FENCED_ATOMICS */
+
+__TBB_DECL_ATOMIC_LOAD_STORE_PRIMITIVES(full_fence);
+__TBB_DECL_ATOMIC_LOAD_STORE_PRIMITIVES(acquire);
+__TBB_DECL_ATOMIC_LOAD_STORE_PRIMITIVES(release);
+__TBB_DECL_ATOMIC_LOAD_STORE_PRIMITIVES(relaxed);
 
 //! Additive inverse of 1 for type T.
 /** Various compilers issue various warnings if -1 is used with various integer types.
@@ -186,7 +212,7 @@ public:
     }
 
     value_type fetch_and_store( value_type value ) {
-        return fetch_and_store<__TBB_full_fence>(value);
+        return fetch_and_store<full_fence>(value);
     }
 
     template<memory_semantics M>
@@ -199,7 +225,7 @@ public:
     }
 
     value_type compare_and_swap( value_type value, value_type comparand ) {
-        return compare_and_swap<__TBB_full_fence>(value,comparand);
+        return compare_and_swap<full_fence>(value,comparand);
     }
 
     operator value_type() const volatile {                // volatile qualifier here for backwards compatibility 
@@ -208,6 +234,28 @@ public:
         return w.value;
     }
 
+    template<memory_semantics M>
+    value_type load () const {
+        converter u;
+        u.bits = internal::atomic_load_store_traits<M>::load( rep.value );
+        return u.value;
+    }
+
+    value_type load () const {
+        return load<acquire>();
+    }
+
+    template<memory_semantics M>
+    void store ( value_type value ) {
+        converter u;
+        u.value = value;
+        internal::atomic_load_store_traits<M>::store( rep.value, u.bits );
+    }
+
+    void store ( value_type value ) {
+        store<release>( value );
+    }
+
 protected:
     value_type store_with_release( value_type rhs ) {
         converter u;
@@ -232,7 +280,7 @@ public:
     }
 
     value_type fetch_and_add( D addend ) {
-        return fetch_and_add<__TBB_full_fence>(addend);
+        return fetch_and_add<full_fence>(addend);
     }
 
     template<memory_semantics M>
@@ -303,9 +351,10 @@ struct atomic: internal::atomic_impl<T> {
     };
 
 #if __TBB_64BIT_ATOMICS
-// otherwise size is verified by test_atomic
 __TBB_DECL_ATOMIC(__TBB_LONG_LONG)
 __TBB_DECL_ATOMIC(unsigned __TBB_LONG_LONG)
+#else
+// test_atomic will verify that sizeof(long long)==8
 #endif
 __TBB_DECL_ATOMIC(long)
 __TBB_DECL_ATOMIC(unsigned long)
@@ -363,6 +412,15 @@ template<> struct atomic<void*>: internal::atomic_impl<void*> {
     }
 };
 
+// Helpers to workaround ugly syntax of calling template member function of a
+// template class with template argument dependent on template parameters.
+
+template <memory_semantics M, typename T>
+T load ( const atomic<T>& a ) { return a.template load<M>(); }
+
+template <memory_semantics M, typename T>
+void store ( atomic<T>& a, T value ) { return a.template store<M>(value); }
+
 } // namespace tbb
 
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
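
The atomic.h hunks above rename the fence enumerators (full_fence, acquire, release, plus the new relaxed), switch the traits macros to the __TBB_machine_* primitives, and add explicit load/store members together with the tbb::load/tbb::store helpers. A minimal usage sketch of the updated interface, assuming only what the diff shows (the function and variable names are illustrative, not part of the commit):

#include "tbb/atomic.h"

tbb::atomic<int> counter;                       // static storage: zero-initialized

void atomic_example() {
    counter.store<tbb::relaxed>(1);             // unordered store (new 'relaxed' semantics)
    int v = counter.load<tbb::acquire>();       // explicit acquire load
    counter.fetch_and_add<tbb::release>(v);     // fenced read-modify-write, as before

    // The free helpers avoid the awkward "a.template load<M>()" spelling
    // inside templates, which is exactly what the comment in the diff notes.
    int w = tbb::load<tbb::full_fence>(counter);
    tbb::store<tbb::full_fence>(counter, w + 1);
}
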
index 9bd05094c4646588baf8bfbc5cccd61eb2d6f236..5fb182f3c86a2b8e160ae191cce65b75f2fa85d8 100644 (file)
@@ -39,7 +39,7 @@ namespace tbb {
 template<typename RowValue, typename ColValue=RowValue>
 class blocked_range2d {
 public:
-    //! Type for size of an iteation range
+    //! Type for size of an iteration range
     typedef blocked_range<RowValue> row_range_type;
     typedef blocked_range<ColValue> col_range_type;
  
index 85a66f1709d773940e8395151014feb2db002070..f6f58de6b2763720088b2e9cf68a2e5fce566acd 100644 (file)
@@ -39,7 +39,7 @@ namespace tbb {
 template<typename PageValue, typename RowValue=PageValue, typename ColValue=RowValue>
 class blocked_range3d {
 public:
-    //! Type for size of an iteation range
+    //! Type for size of an iteration range
     typedef blocked_range<PageValue> page_range_type;
     typedef blocked_range<RowValue>  row_range_type;
     typedef blocked_range<ColValue>  col_range_type;
index a474b73c16052d952760a0057c324c1ef9b72ba6..6f93148c2adfda7ffaba7b1c4b99b2a0e74b8e1c 100644 (file)
@@ -40,6 +40,7 @@
 
 namespace Concurrency {
 
+#if __TBB_TASK_GROUP_CONTEXT
     using tbb::task_handle;
     using tbb::task_group_status;
     using tbb::task_group;
@@ -53,6 +54,7 @@ namespace Concurrency {
     using tbb::canceled;
 
     using tbb::is_current_task_group_canceling;
+#endif /* __TBB_TASK_GROUP_CONTEXT */
 
     using tbb::parallel_invoke;
     using tbb::strict_ppl::parallel_for;
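
The compat/ppl.h change only wraps the task_group aliases in __TBB_TASK_GROUP_CONTEXT; the PPL-style entry points themselves are unchanged. A short sketch of how that compatibility layer is typically used (assuming a C++0x-capable compiler such as VC10 for the lambdas; names are illustrative):

#include "tbb/compat/ppl.h"

void ppl_compat_example() {
    // Maps onto tbb::strict_ppl::parallel_for, as the using-declaration above shows.
    Concurrency::parallel_for(0, 100, [](int i) { /* process element i */ });

    // task_group and friends are only exported when __TBB_TASK_GROUP_CONTEXT is
    // enabled, which is what the new guard in this hunk expresses.
    Concurrency::task_group g;
    g.run([] { /* background work */ });
    g.wait();
}
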
index 4a4f5f22a9de033456a2c3d02c81459245df982b..3177a14a252b9837343ec994aa0bfe948778e552 100644 (file)
@@ -386,6 +386,11 @@ struct tuple_element {
 template<int N, class T>
 inline static typename tuple_element<N,T>::type& get(T &t) { return t.get<N>(); }
 
+template<int N, class T>
+inline static typename tuple_element<N,T>::type const& get(T const &t) { return 
+    const_cast<typename tuple_element<N,T>::type const &>
+        (const_cast<T &>(t).get<N>()); }
+
 }  // interface5
 } // tbb
 
index 39cc308c75bbb2dcbe1b6135a181da8048185c40..ce589c93cd73111c492c01655197ecc31ea3a1f1 100644 (file)
@@ -52,7 +52,7 @@
 #include "aligned_space.h"
 #include "tbb_exception.h"
 #include "tbb_profiling.h"
-#include "_concurrent_unordered_internal.h" // Need tbb_hasher
+#include "internal/_concurrent_unordered_impl.h" // Need tbb_hasher
 #if TBB_USE_PERFORMANCE_WARNINGS || __TBB_STATISTICS
 #include <typeinfo>
 #endif
index 8a9f252b660309e35c287203620a01749e15da15..fe1839a779a9403321334c35fae5e3d8317cc3a0 100644 (file)
@@ -38,7 +38,7 @@
 #include "tbb_exception.h"
 #include "tbb_stddef.h"
 #include "tbb_profiling.h"
-#include "_aggregator_internal.h"
+#include "internal/_aggregator_impl.h"
 #include <vector>
 #include <iterator>
 #include <functional>
@@ -71,64 +71,71 @@ class concurrent_priority_queue {
     typedef A allocator_type;
 
     //! Constructs a new concurrent_priority_queue with default capacity
-    explicit concurrent_priority_queue(const allocator_type& a = allocator_type()) : mark(0), data(a) {
+    explicit concurrent_priority_queue(const allocator_type& a = allocator_type()) : mark(0), my_size(0), data(a)
+    {
         my_aggregator.initialize_handler(my_functor_t(this));
     }
 
     //! Constructs a new concurrent_priority_queue with init_sz capacity
-    explicit concurrent_priority_queue(size_type init_capacity, const allocator_type& a = allocator_type()) : mark(0), data(a) {
+    explicit concurrent_priority_queue(size_type init_capacity, const allocator_type& a = allocator_type()) :
+        mark(0), my_size(0), data(a)
+    {
         data.reserve(init_capacity);
         my_aggregator.initialize_handler(my_functor_t(this));
     }
 
     //! [begin,end) constructor
     template<typename InputIterator>
-    concurrent_priority_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) : data(begin, end, a)
+    concurrent_priority_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) :
+        data(begin, end, a)
     {
         mark = 0;
         my_aggregator.initialize_handler(my_functor_t(this));
         heapify();
+        my_size = data.size();
     }
 
     //! Copy constructor
-    /** State of this queue may not reflect results of pending
-       operations on the copied queue. */
-    explicit concurrent_priority_queue(const concurrent_priority_queue& src) : mark(src.mark), data(src.data.begin(), src.data.end(), src.data.get_allocator())
+    /** This operation is unsafe if there are pending concurrent operations on the src queue. */
+    explicit concurrent_priority_queue(const concurrent_priority_queue& src) : mark(src.mark),
+        my_size(src.my_size), data(src.data.begin(), src.data.end(), src.data.get_allocator())
     {
         my_aggregator.initialize_handler(my_functor_t(this));
         heapify();
     }
 
-    concurrent_priority_queue(const concurrent_priority_queue& src, const allocator_type& a) : mark(src.mark), data(src.data.begin(), src.data.end(), a)
+    //! Copy constructor with specific allocator
+    /** This operation is unsafe if there are pending concurrent operations on the src queue. */
+    concurrent_priority_queue(const concurrent_priority_queue& src, const allocator_type& a) : mark(src.mark),
+        my_size(src.my_size), data(src.data.begin(), src.data.end(), a)
     {
         my_aggregator.initialize_handler(my_functor_t(this));
         heapify();
     }
 
     //! Assignment operator
-    /** State of this queue may not reflect results of pending
-       operations on the copied queue. */
+    /** This operation is unsafe if there are pending concurrent operations on the src queue. */
     concurrent_priority_queue& operator=(const concurrent_priority_queue& src) {
         if (this != &src) {
             std::vector<value_type, allocator_type>(src.data.begin(), src.data.end(), src.data.get_allocator()).swap(data);
             mark = src.mark;
+            my_size = src.my_size;
         }
         return *this;
     }
 
     //! Returns true if empty, false otherwise
-    /** Returned value may not reflect results of pending operations. */
-    bool empty() const { return data.empty(); }
+    /** Returned value may not reflect results of pending operations.
+        This operation reads shared data and will trigger a race condition. */
+    bool empty() const { return size()==0; }
 
     //! Returns the current number of elements contained in the queue
-    /** Returned value may not reflect results of pending operations. */
-    size_type size() const { return data.size(); }
-
-    //! Returns the current capacity (i.e. allocated storage) of the queue
-    /** Returned value may not reflect results of pending operations. */
-    size_type capacity() const { return data.capacity(); }
+    /** Returned value may not reflect results of pending operations.
+        This operation reads shared data and will trigger a race condition. */
+    size_type size() const { return __TBB_load_with_acquire(my_size); }
 
     //! Pushes elem onto the queue, increasing capacity of queue if necessary
+    /** This operation can be safely used concurrently with other push, try_pop or reserve operations. */
     void push(const_reference elem) {
         cpq_operation op_data(elem, PUSH_OP);
         my_aggregator.execute(&op_data);
@@ -138,7 +145,8 @@ class concurrent_priority_queue {
 
     //! Gets a reference to and removes highest priority element
     /** If a highest priority element was found, sets elem and returns true,
-        otherwise returns false. */
+        otherwise returns false.
+        This operation can be safely used concurrently with other push, try_pop or reserve operations. */
     bool try_pop(reference elem) {
         cpq_operation op_data(POP_OP);
         op_data.elem = &elem;
@@ -146,39 +154,29 @@ class concurrent_priority_queue {
         return op_data.status==SUCCEEDED;
     }
 
-    //! If current capacity is less than new_cap, increases capacity to new_cap
-    void reserve(size_type new_cap) {
-        cpq_operation op_data(RESERVE_OP);
-        op_data.sz = new_cap;
-        my_aggregator.execute(&op_data);
-        if (op_data.status == FAILED) // exception thrown
-            throw_exception(eid_bad_alloc);
-    }
-
     //! Clear the queue; not thread-safe
-    /** Resets size, effectively emptying queue; does not free space.
+    /** This operation is unsafe if there are pending concurrent operations on the queue.
+        Resets size, effectively emptying queue; does not free space.
         May not clear elements added in pending operations. */
     void clear() {
         data.clear();
         mark = 0;
-    }
-
-    //! Shrink queue capacity to current contents; not thread-safe
-    void shrink_to_fit() {
-        std::vector<value_type, allocator_type>(data.begin(), data.end(), data.get_allocator()).swap(data);
+        my_size = 0;
     }
 
     //! Swap this queue with another; not thread-safe
+    /** This operation is unsafe if there are pending concurrent operations on the queue. */
     void swap(concurrent_priority_queue& q) {
         data.swap(q.data);
         std::swap(mark, q.mark);
+        std::swap(my_size, q.my_size);
     }
 
     //! Return allocator object
     allocator_type get_allocator() const { return data.get_allocator(); }
 
  private:
-    enum operation_type {INVALID_OP, PUSH_OP, POP_OP, RESERVE_OP};
+    enum operation_type {INVALID_OP, PUSH_OP, POP_OP};
     enum operation_status { WAIT=0, SUCCEEDED, FAILED };
 
     class cpq_operation : public aggregated_operation<cpq_operation> {
@@ -208,9 +206,10 @@ class concurrent_priority_queue {
     char padding1[NFS_MaxLineSize - sizeof(aggregator< my_functor_t, cpq_operation >)];
     //! The point at which unsorted elements begin
     size_type mark;
+    __TBB_atomic size_type my_size;
     Compare compare;
     //! Padding added to avoid false sharing
-    char padding2[NFS_MaxLineSize - sizeof(size_type) - sizeof(Compare)];
+    char padding2[NFS_MaxLineSize - (2*sizeof(size_type)) - sizeof(Compare)];
     //! Storage for the heap of elements in queue, plus unheapified elements
     /** data has the following structure:
 
@@ -222,14 +221,13 @@ class concurrent_priority_queue {
         [_|...|_|_|...|_| |...| ]
          0       ^       ^       ^
                  |       |       |__capacity
-                 |       |__size
+                 |       |__my_size
                  |__mark
-                 
 
         Thus, data stores the binary heap starting at position 0 through
-        mark-1 (it may be empty).  Then there are 0 or more elements 
-        that have not yet been inserted into the heap, in positions 
-        mark through size-1. */
+        mark-1 (it may be empty).  Then there are 0 or more elements
+        that have not yet been inserted into the heap, in positions
+        mark through my_size-1. */
     std::vector<value_type, allocator_type> data;
 
     void handle_operations(cpq_operation *op_list) {
@@ -254,17 +252,20 @@ class concurrent_priority_queue {
             if (tmp->type == PUSH_OP) {
                 __TBB_TRY {
                     data.push_back(*(tmp->elem));
+                    __TBB_store_with_release(my_size, my_size+1);
                     itt_store_word_with_release(tmp->status, uintptr_t(SUCCEEDED));
                 } __TBB_CATCH(...) {
                     itt_store_word_with_release(tmp->status, uintptr_t(FAILED));
                 }
             }
-            else if (tmp->type == POP_OP) {
+            else { // tmp->type == POP_OP
+                __TBB_ASSERT(tmp->type == POP_OP, NULL);
                 if (mark < data.size() &&
                     compare(data[0], data[data.size()-1])) {
                     // there are newly pushed elems and the last one
                     // is higher than top
                     *(tmp->elem) = data[data.size()-1]; // copy the data
+                    __TBB_store_with_release(my_size, my_size-1);
                     itt_store_word_with_release(tmp->status, uintptr_t(SUCCEEDED));
                     data.pop_back();
                     __TBB_ASSERT(mark<=data.size(), NULL);
@@ -274,15 +275,6 @@ class concurrent_priority_queue {
                     pop_list = tmp;
                 }
             }
-            else {
-                __TBB_ASSERT(tmp->type == RESERVE_OP, NULL);
-                __TBB_TRY {
-                    data.reserve(tmp->sz);
-                    itt_store_word_with_release(tmp->status, uintptr_t(SUCCEEDED));
-                } __TBB_CATCH(...) {
-                    itt_store_word_with_release(tmp->status, uintptr_t(FAILED));
-                }
-            }
         }
 
         // second pass processes pop operations
@@ -300,11 +292,13 @@ class concurrent_priority_queue {
                     // there are newly pushed elems and the last one is
                     // higher than top
                     *(tmp->elem) = data[data.size()-1]; // copy the data
+                    __TBB_store_with_release(my_size, my_size-1);
                     itt_store_word_with_release(tmp->status, uintptr_t(SUCCEEDED));
                     data.pop_back();
                 }
                 else { // extract top and push last element down heap
                     *(tmp->elem) = data[0]; // copy the data
+                    __TBB_store_with_release(my_size, my_size-1);
                     itt_store_word_with_release(tmp->status, uintptr_t(SUCCEEDED));
                     reheap();
                 }
@@ -319,7 +313,7 @@ class concurrent_priority_queue {
 
     //! Merge unsorted elements into heap
     void heapify() {
-        if (!mark) mark = 1;
+        if (!mark && data.size()>0) mark = 1;
         for (; mark<data.size(); ++mark) {
             // for each unheapified element under size
             size_type cur_pos = mark;
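
The concurrent_priority_queue changes above drop reserve(), capacity() and shrink_to_fit(), track the element count in an atomic my_size, and document push()/try_pop() as the only operations that are safe to run concurrently. A small usage sketch under those rules (illustrative only; depending on the TBB snapshot this container may still be a community-preview feature gated by a TBB_PREVIEW_* macro):

#include "tbb/concurrent_priority_queue.h"

void priority_queue_example() {
    tbb::concurrent_priority_queue<int> q;   // default std::less<int>: largest element on top

    // push() and try_pop() may be called from many threads at once.
    q.push(3);
    q.push(7);

    int top;
    if (q.try_pop(top)) {
        // top == 7 in a single-threaded run
    }

    // size()/empty() are best-effort under concurrency (they read the new atomic
    // my_size); reserve()/capacity()/shrink_to_fit() no longer exist.
    size_t pending = q.size();
    (void)pending;
}
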
index 5ea6909d2b0d99c26ca6e5deb7d5c975dc83ccb2..cc7ac33599a02539712dca592924003a3c0d9f0b 100644 (file)
@@ -29,7 +29,7 @@
 #ifndef __TBB_concurrent_queue_H
 #define __TBB_concurrent_queue_H
 
-#include "_concurrent_queue_internal.h"
+#include "internal/_concurrent_queue_impl.h"
 
 namespace tbb {
 
@@ -48,7 +48,7 @@ class concurrent_queue: public internal::concurrent_queue_base_v3<T> {
     page_allocator_type my_allocator;
 
     //! Allocates a block of size n (bytes)
-    /*overide*/ virtual void *allocate_block( size_t n ) {
+    /*override*/ virtual void *allocate_block( size_t n ) {
         void *b = reinterpret_cast<void*>(my_allocator.allocate( n ));
         if( !b )
             internal::throw_exception(internal::eid_bad_alloc); 
@@ -199,7 +199,7 @@ class concurrent_bounded_queue: public internal::concurrent_queue_base_v3 {
         *static_cast<T*>(dst) = from;
     }
 
-    /*overide*/ virtual page *allocate_page() {
+    /*override*/ virtual page *allocate_page() {
         size_t n = sizeof(padded_page) + (items_per_page-1)*sizeof(T);
         page *p = reinterpret_cast<page*>(my_allocator.allocate( n ));
         if( !p )
@@ -208,7 +208,7 @@ class concurrent_bounded_queue: public internal::concurrent_queue_base_v3 {
     }
 
     /*override*/ virtual void deallocate_page( page *p ) {
-        size_t n = sizeof(padded_page) + items_per_page*sizeof(T);
+        size_t n = sizeof(padded_page) + (items_per_page-1)*sizeof(T);
         my_allocator.deallocate( reinterpret_cast<char*>(p), n );
     }
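
The concurrent_queue.h hunks fix the "override" comments and make deallocate_page() use the same (items_per_page-1) size computation as allocate_page(), so allocation and deallocation sizes now match. For context, an illustrative sketch of the two public queue classes whose internals are touched here (names are illustrative, not part of the commit):

#include "tbb/concurrent_queue.h"

void queue_example() {
    tbb::concurrent_queue<int> q;            // unbounded, non-blocking
    q.push(42);
    int x;
    if (q.try_pop(x)) { /* x == 42 */ }

    tbb::concurrent_bounded_queue<int> bq;   // optional capacity, blocking push/pop
    bq.set_capacity(4);
    bq.push(1);                              // blocks while the queue is full
    int y;
    bq.pop(y);                               // blocks until an item is available
}
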
 
index ff13c2a40c5094fc57ca80b8cb62af3f3e3b2d14..e14fad0b5b371f486b578606c2c799f8f36fe082 100644 (file)
 #ifndef __TBB_concurrent_unordered_map_H
 #define __TBB_concurrent_unordered_map_H
 
-#include "_concurrent_unordered_internal.h"
+#include "internal/_concurrent_unordered_impl.h"
 
 namespace tbb
 {
 
-// Template class for hash compare
-template<typename Key>
-class tbb_hash
-{
-public:
-    tbb_hash() {}
-
-    size_t operator()(const Key& key) const
-    {
-        return tbb_hasher(key);
-    }
-};
-
 namespace interface5 {
 
 // Template class for hash map traits
@@ -90,7 +77,7 @@ protected:
     hash_compare my_hash_compare; // the comparator predicate for keys
 };
 
-template <typename Key, typename T, typename Hasher = tbb_hash<Key>, typename Key_equality = std::equal_to<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, T> > >
+template <typename Key, typename T, typename Hasher = tbb::tbb_hash<Key>, typename Key_equality = std::equal_to<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, T> > >
 class concurrent_unordered_map : public internal::concurrent_unordered_base< concurrent_unordered_map_traits<Key, T, internal::hash_compare<Key, Hasher, Key_equality>, Allocator, false> >
 {
     // Base type definitions
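
In concurrent_unordered_map.h the local tbb_hash template is removed and the default hasher becomes tbb::tbb_hash<Key>, so existing declarations keep compiling unchanged. A brief, illustrative sketch of the container as used with those defaults:

#include "tbb/concurrent_unordered_map.h"
#include <string>

void unordered_map_example() {
    // Default hasher is now tbb::tbb_hash<std::string>, supplied by the library
    // rather than defined in this header.
    tbb::concurrent_unordered_map<std::string, int> counts;

    // operator[] inserts the key if it is absent; concurrent insertion and
    // traversal are safe, but the += itself is not an atomic update.
    counts["apple"] += 1;

    tbb::concurrent_unordered_map<std::string, int>::iterator it = counts.find("apple");
    if (it != counts.end()) { /* it->second is 1 in a single-threaded run */ }

    // Erasure is not safe to run concurrently with other operations.
    counts.unsafe_erase("apple");
}
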
diff --git a/tbb/include/tbb/concurrent_unordered_set.h b/tbb/include/tbb/concurrent_unordered_set.h
new file mode 100644 (file)
index 0000000..45b550e
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+/* Container implementations in this header are based on PPL implementations
+   provided by Microsoft. */
+
+#ifndef __TBB_concurrent_unordered_set_H
+#define __TBB_concurrent_unordered_set_H
+
+#include "internal/_concurrent_unordered_impl.h"
+
+namespace tbb
+{
+
+namespace interface5 {
+
+// Template class for hash set traits
+template<typename Key, typename Hash_compare, typename Allocator, bool Allow_multimapping>
+class concurrent_unordered_set_traits
+{
+protected:
+    typedef Key value_type;
+    typedef Key key_type;
+    typedef Hash_compare hash_compare;
+    typedef typename Allocator::template rebind<value_type>::other allocator_type;
+    enum { allow_multimapping = Allow_multimapping };
+
+    concurrent_unordered_set_traits() : my_hash_compare() {}
+    concurrent_unordered_set_traits(const hash_compare& hc) : my_hash_compare(hc) {}
+
+    typedef hash_compare value_compare;
+
+    static const Key& get_key(const value_type& value) {
+        return value;
+    }
+
+    hash_compare my_hash_compare; // the comparator predicate for keys
+};
+
+template <typename Key, typename Hasher = tbb::tbb_hash<Key>, typename Key_equality = std::equal_to<Key>, typename Allocator = tbb::tbb_allocator<Key> >
+class concurrent_unordered_set : public internal::concurrent_unordered_base< concurrent_unordered_set_traits<Key, internal::hash_compare<Key, Hasher, Key_equality>, Allocator, false> >
+{
+    // Base type definitions
+    typedef internal::hash_compare<Key, Hasher, Key_equality> hash_compare;
+    typedef internal::concurrent_unordered_base< concurrent_unordered_set_traits<Key, hash_compare, Allocator, false> > base_type;
+    typedef concurrent_unordered_set_traits<Key, internal::hash_compare<Key, Hasher, Key_equality>, Allocator, false> traits_type;
+    using traits_type::my_hash_compare;
+#if __TBB_EXTRA_DEBUG
+public:
+#endif
+    using traits_type::allow_multimapping;
+public:
+    using base_type::end;
+    using base_type::find;
+    using base_type::insert;
+
+    // Type definitions
+    typedef Key key_type;
+    typedef typename base_type::value_type value_type;
+    typedef Key mapped_type;
+    typedef Hasher hasher;
+    typedef Key_equality key_equal;
+    typedef hash_compare key_compare;
+
+    typedef typename base_type::allocator_type allocator_type;
+    typedef typename base_type::pointer pointer;
+    typedef typename base_type::const_pointer const_pointer;
+    typedef typename base_type::reference reference;
+    typedef typename base_type::const_reference const_reference;
+
+    typedef typename base_type::size_type size_type;
+    typedef typename base_type::difference_type difference_type;
+
+    typedef typename base_type::iterator iterator;
+    typedef typename base_type::const_iterator const_iterator;
+    typedef typename base_type::iterator local_iterator;
+    typedef typename base_type::const_iterator const_local_iterator;
+
+    // Construction/destruction/copying
+    explicit concurrent_unordered_set(size_type n_of_buckets = 8, const hasher& a_hasher = hasher(),
+        const key_equal& a_keyeq = key_equal(), const allocator_type& a = allocator_type())
+        : base_type(n_of_buckets, key_compare(a_hasher, a_keyeq), a)
+    {
+    }
+
+    concurrent_unordered_set(const Allocator& a) : base_type(8, key_compare(), a)
+    {
+    }
+
+    template <typename Iterator>
+    concurrent_unordered_set(Iterator first, Iterator last, size_type n_of_buckets = 8, const hasher& a_hasher = hasher(),
+        const key_equal& a_keyeq = key_equal(), const allocator_type& a = allocator_type())
+        : base_type(n_of_buckets, key_compare(a_hasher, a_keyeq), a)
+    {
+        for (; first != last; ++first)
+            base_type::insert(*first);
+    }
+
+    concurrent_unordered_set(const concurrent_unordered_set& table) : base_type(table)
+    {
+    }
+
+    concurrent_unordered_set(const concurrent_unordered_set& table, const Allocator& a)
+        : base_type(table, a)
+    {
+    }
+
+    concurrent_unordered_set& operator=(const concurrent_unordered_set& table)
+    {
+        base_type::operator=(table);
+        return (*this);
+    }
+
+    iterator unsafe_erase(const_iterator where)
+    {
+        return base_type::unsafe_erase(where);
+    }
+
+    size_type unsafe_erase(const key_type& key)
+    {
+        return base_type::unsafe_erase(key);
+    }
+
+    iterator unsafe_erase(const_iterator first, const_iterator last)
+    {
+        return base_type::unsafe_erase(first, last);
+    }
+
+    void swap(concurrent_unordered_set& table)
+    {
+        base_type::swap(table);
+    }
+
+    // Observers
+    hasher hash_function() const
+    {
+        return my_hash_compare.my_hash_object;
+    }
+
+    key_equal key_eq() const
+    {
+        return my_hash_compare.my_key_compare_object;
+    }
+};
+
+} // namespace interface5
+
+using interface5::concurrent_unordered_set;
+
+} // namespace tbb
+
+#endif// __TBB_concurrent_unordered_set_H
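
The new concurrent_unordered_set.h mirrors the unordered map: concurrent insertion, lookup and traversal, with erasure exposed only through unsafe_erase. A minimal, illustrative usage sketch:

#include "tbb/concurrent_unordered_set.h"
#include <string>

void unordered_set_example() {
    tbb::concurrent_unordered_set<std::string> seen;

    // insert() and find() may be called concurrently from multiple threads.
    seen.insert("casparcg");
    if (seen.find("casparcg") != seen.end()) { /* element is present */ }

    // The unsafe_ prefix marks operations that must not run concurrently
    // with anything else on the same container.
    seen.unsafe_erase("casparcg");
}
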
index abcc645de5ef55627f5ee8e0410c659afee3106b..2a38ff487689df17ae2aea5f1b3d5da3b9d24c62 100644 (file)
@@ -37,6 +37,7 @@
 #include "tbb_machine.h"
 #include "tbb_profiling.h"
 #include <new>
+#include <cstring>   // for memset()
 
 #if !TBB_USE_EXCEPTIONS && _MSC_VER
     // Suppress "C++ exception handler used, but unwind semantics are not enabled" warning in STL headers
index c45a428d1f80061b4a5724bf93efe2d4d790c24f..f0a2b57083fa51dd86465b7a23c4535988c9f57d 100644 (file)
@@ -31,6 +31,7 @@
 
 #include "concurrent_vector.h"
 #include "tbb_thread.h"
+#include "tbb_allocator.h"
 #include "cache_aligned_allocator.h"
 #include "aligned_space.h"
 #include <string.h>  // for memcpy
diff --git a/tbb/include/tbb/flow_graph.h b/tbb/include/tbb/flow_graph.h
new file mode 100644 (file)
index 0000000..5f80b6a
--- /dev/null
@@ -0,0 +1,1556 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_graph_H
+#define __TBB_graph_H
+
+#if !TBB_PREVIEW_GRAPH
+#error Set TBB_PREVIEW_GRAPH to include graph.h
+#endif
+
+#include "tbb_stddef.h"
+#include "atomic.h"
+#include "spin_mutex.h"
+#include "null_mutex.h"
+#include "spin_rw_mutex.h"
+#include "null_rw_mutex.h"
+#include "task.h"
+#include "concurrent_vector.h"
+#include "internal/_aggregator_impl.h"
+
+// use the VC10 or gcc version of tuple if it is available.
+#if TBB_IMPLEMENT_CPP0X && (!defined(_MSC_VER) || _MSC_VER < 1600)
+#define TBB_PREVIEW_TUPLE 1
+#include "compat/tuple"
+#else
+#include <tuple>
+#endif
+
+#include<list>
+#include<queue>
+
+/** @file
+  \brief The graph related classes and functions
+
+  There are some applications that best express dependencies as messages
+  passed between nodes in a graph.  These messages may contain data or
+  simply act as signals that a predecessor has completed. The graph
+  class and its associated node classes can be used to express such
+  applications.
+*/
+
+namespace tbb {
+namespace flow {
+
+//! An enumeration that provides the two most common concurrency levels: unlimited and serial
+enum concurrency { unlimited = 0, serial = 1 };
+
+namespace interface6 {
+
+//! The base of all graph nodes.  Allows them to be stored in a collection for deletion.
+class graph_node {
+public:
+    virtual ~graph_node() {} 
+}; 
+
+//! An empty class used for messages that mean "I'm done" 
+class continue_msg {};
+        
+template< typename T > class sender;
+template< typename T > class receiver;
+class continue_receiver;
+        
+//! Pure virtual template class that defines a sender of messages of type T
+template< typename T >
+class sender {
+public:
+    //! The output type of this sender
+    typedef T output_type;
+        
+    //! The successor type for this node
+    typedef receiver<T> successor_type;
+        
+    virtual ~sender() {}
+        
+    //! Add a new successor to this node
+    virtual bool register_successor( successor_type &r ) = 0;
+        
+    //! Removes a successor from this node
+    virtual bool remove_successor( successor_type &r ) = 0;
+        
+    //! Request an item from the sender
+    virtual bool try_get( T & ) { return false; }
+        
+    //! Reserves an item in the sender 
+    virtual bool try_reserve( T & ) { return false; }
+        
+    //! Releases the reserved item
+    virtual bool try_release( ) { return false; }
+        
+    //! Consumes the reserved item
+    virtual bool try_consume( ) { return false; }
+        
+};
+        
+        
+//! Pure virtual template class that defines a receiver of messages of type T
+template< typename T >
+class receiver {
+public:
+        
+    //! The input type of this receiver
+    typedef T input_type;
+        
+    //! The predecessor type for this node
+    typedef sender<T> predecessor_type;
+        
+    //! Destructor
+    virtual ~receiver() {}
+        
+    //! Put an item to the receiver
+    virtual bool try_put( const T& t ) = 0;
+        
+    //! Add a predecessor to the node
+    virtual bool register_predecessor( predecessor_type & ) { return false; }
+        
+    //! Remove a predecessor from the node
+    virtual bool remove_predecessor( predecessor_type & ) { return false; }
+        
+};
+        
+//! Base class for receivers of completion messages
+/** These receivers automatically reset, but cannot be explicitly waited on */
+class continue_receiver : public receiver< continue_msg > {
+public:
+        
+    //! The input type
+    typedef continue_msg input_type;
+        
+    //! The predecessor type for this node
+    typedef sender< continue_msg > predecessor_type;
+        
+    //! Constructor
+    continue_receiver( int number_of_predecessors = 0 ) { 
+        my_predecessor_count = my_initial_predecessor_count = number_of_predecessors;
+        my_current_count = 0;
+    }
+        
+    //! Copy constructor
+    continue_receiver( const continue_receiver& src ) : receiver<continue_msg>() { 
+        my_predecessor_count = my_initial_predecessor_count = src.my_initial_predecessor_count;
+        my_current_count = 0;
+    }
+        
+    //! Destructor
+    virtual ~continue_receiver() { }
+        
+    //! Increments the trigger threshold
+    /* override */ bool register_predecessor( predecessor_type & ) {
+        spin_mutex::scoped_lock l(my_mutex);
+        ++my_predecessor_count;
+        return true;
+    }
+        
+    //! Decrements the trigger threshold
+    /** Does not check to see if the removal of the predecessor now makes the current count
+        exceed the new threshold.  So removing a predecessor while the graph is active can cause
+        unexpected results. */
+    /* override */ bool remove_predecessor( predecessor_type & ) {
+        spin_mutex::scoped_lock l(my_mutex);
+        --my_predecessor_count;
+        return true;
+    }
+        
+    //! Puts a continue_msg to the receiver
+    /** If the message causes the message count to reach the predecessor count, execute() is called and
+        the message count is reset to 0.  Otherwise the message count is incremented. */
+    /* override */ bool try_put( const input_type & ) {
+        {
+            spin_mutex::scoped_lock l(my_mutex);
+            if ( ++my_current_count < my_predecessor_count ) 
+                return true;
+            else
+                my_current_count = 0;
+        }
+        execute();
+        return true;
+    }
+        
+protected:
+        
+    spin_mutex my_mutex;
+    int my_predecessor_count;
+    int my_current_count;
+    int my_initial_predecessor_count;
+        
+    //! Does whatever should happen when the threshold is reached
+    /** This should be very fast or else spawn a task.  This is
+        called while the sender is blocked in the try_put(). */
+    virtual void execute() = 0;
+        
+};
+
+#include "internal/_flow_graph_impl.h"
+using namespace internal::graph_policy_namespace;
+
+//! The graph class
+/** This class serves as a handle to the graph */
+class graph : tbb::internal::no_copy {
+        
+    template< typename Body >
+    class run_task : public task {
+    public: 
+        run_task( Body& body ) : my_body(body) {}
+        task *execute() {
+            my_body();
+            return NULL;
+        }
+    private:
+        Body my_body;
+    };
+        
+    template< typename Receiver, typename Body >
+    class run_and_put_task : public task {
+    public: 
+        run_and_put_task( Receiver &r, Body& body ) : my_receiver(r), my_body(body) {}
+        task *execute() {
+            my_receiver.try_put( my_body() );
+            return NULL;
+        }
+    private:
+        Receiver &my_receiver;
+        Body my_body;
+    };
+        
+public:
+        
+        
+    //! Constructs a graph with no nodes.
+    graph() : my_root_task( new ( task::allocate_root( ) ) empty_task ) {
+        my_root_task->set_ref_count(1);
+    }
+        
+    //! Destroys the graph.
+    /** Calls wait_for_all on the graph, deletes all of the nodes appended by calls to add, and then 
+        destroys the root task of the graph. */ 
+    ~graph() {
+        wait_for_all();
+        my_root_task->set_ref_count(0);
+        task::destroy( *my_root_task );
+    }
+        
+        
+    //! Used to register that an external entity may still interact with the graph.
+    /** The graph will not return from wait_for_all until a matching number of decrement_wait_count calls
+        is made. */
+    void increment_wait_count() { 
+        if (my_root_task)
+            my_root_task->increment_ref_count();
+    }
+        
+    //! Deregisters an external entity that may have interacted with the graph.
+    /** The graph will not return from wait_for_all until the number of decrement_wait_count calls
+        matches the number of increment_wait_count calls. */
+    void decrement_wait_count() { 
+        if (my_root_task)
+            my_root_task->decrement_ref_count(); 
+    }
+        
+    //! Spawns a task that runs a body and puts its output to a specific receiver
+    /** The task is spawned as a child of the graph. This is useful for running tasks 
+        that need to block a wait_for_all() on the graph.  For example a one-off source. */
+    template< typename Receiver, typename Body >
+        void run( Receiver &r, Body body ) {
+       task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+           run_and_put_task< Receiver, Body >( r, body ) );
+    }
+        
+    //! Spawns a task that runs a function object 
+    /** The task is spawned as a child of the graph. This is useful for running tasks 
+        that need to block a wait_for_all() on the graph. For example a one-off source. */
+    template< typename Body >
+    void run( Body body ) {
+       task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+           run_task< Body >( body ) );
+    }
+        
+    //! Waits until the graph is idle and the number of decrement_wait_count calls equals the number of increment_wait_count calls.
+    /** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */
+    void wait_for_all() {
+        if (my_root_task)
+            my_root_task->wait_for_all();
+        my_root_task->set_ref_count(1);
+    }
+        
+    //! Returns the root task of the graph
+    task * root_task() {
+        return my_root_task;
+    }
+        
+private:
+        
+    task *my_root_task;
+        
+};
+
+#include "internal/_flow_graph_node_impl.h"
+
+//! An executable node that acts as a source, i.e. it has no predecessors
+template < typename Output >
+class source_node : public graph_node, public sender< Output > {
+public:
+        
+    //! The type of the output message, which is complete
+    typedef Output output_type;           
+        
+    //! The type of successors of this node
+    typedef receiver< Output > successor_type;
+        
+    //! Constructor for a node with a successor
+    template< typename Body >
+    source_node( graph &g, Body body, bool is_active = true )
+        : my_root_task(g.root_task()), my_active(is_active), init_my_active(is_active),
+        my_body( new internal::source_body_leaf< output_type, Body>(body) ),
+        my_reserved(false), my_has_cached_item(false) 
+    { 
+        my_successors.set_owner(this);
+    }
+        
+    //! Copy constructor
+    source_node( const source_node& src ) :
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), sender<Output>(),
+#endif
+        my_root_task( src.my_root_task), my_active(src.init_my_active),
+        init_my_active(src.init_my_active), my_body( src.my_body->clone() ),
+        my_reserved(false), my_has_cached_item(false)
+    {
+        my_successors.set_owner(this);
+    }
+
+    //! The destructor
+    ~source_node() { delete my_body; }
+        
+    //! Add a new successor to this node
+    /* override */ bool register_successor( receiver<output_type> &r ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        my_successors.register_successor(r);
+        if ( my_active )
+            spawn_put();
+        return true;
+    }
+        
+    //! Removes a successor from this node
+    /* override */ bool remove_successor( receiver<output_type> &r ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        my_successors.remove_successor(r);
+        return true;
+    }
+        
+    //! Request an item from the node
+    /* override */ bool try_get( output_type &v ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        if ( my_reserved )  
+            return false;
+        
+        if ( my_has_cached_item ) {
+            v = my_cached_item;
+            my_has_cached_item = false;
+        } else if ( (*my_body)(v) == false ) {
+            return false;
+        }
+        return true;
+    }
+        
+    //! Reserves an item.
+    /* override */ bool try_reserve( output_type &v ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        if ( my_reserved ) {
+            return false;
+        }
+        
+        if ( !my_has_cached_item && (*my_body)(my_cached_item) )  
+            my_has_cached_item = true;
+        
+        if ( my_has_cached_item ) {
+            v = my_cached_item;
+            my_reserved = true;
+            return true;
+        } else {
+            return false;
+        }
+    }
+        
+    //! Release a reserved item.  
+    /**  true = item has been released and so remains in sender, dest must request or reserve future items */
+    /* override */ bool try_release( ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        __TBB_ASSERT( my_reserved && my_has_cached_item, "releasing non-existent reservation" );
+        my_reserved = false;
+        spawn_put();
+        return true;
+    }
+        
+    //! Consumes a reserved item
+    /* override */ bool try_consume( ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        __TBB_ASSERT( my_reserved && my_has_cached_item, "consuming non-existent reservation" );
+        my_reserved = false;
+        my_has_cached_item = false;
+        if ( !my_successors.empty() ) {
+            spawn_put();
+        }
+        return true;
+    }
+        
+    //! Activates a node that was created in the inactive state
+    void activate() {
+        spin_mutex::scoped_lock lock(my_mutex);
+        my_active = true;
+        if ( !my_successors.empty() )
+            spawn_put();
+    }
+        
+private:
+        
+    task *my_root_task;
+    spin_mutex my_mutex;
+    bool my_active;
+    bool init_my_active;
+    internal::source_body<output_type> *my_body;
+    internal::broadcast_cache< output_type > my_successors;
+    bool my_reserved;
+    bool my_has_cached_item;
+    output_type my_cached_item;
+        
+    friend class internal::source_task< source_node< output_type > >;
+        
+    //! Applies the body
+    /* override */ void apply_body( ) {
+        output_type v;
+        if ( try_reserve(v) == false )
+            return;
+        
+        if ( my_successors.try_put( v ) ) 
+            try_consume();
+        else
+            try_release();
+    }
+        
+    //! Spawns a task that applies the body
+    /* override */ void spawn_put( ) {
+        task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+           internal::source_task< source_node< output_type > >( *this ) ); 
+    }
+        
+};
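+
+// Usage sketch (editor's illustration, not part of the TBB sources): a source_node body
+// writes the next item into its argument and returns false when it has nothing more to
+// emit.  Names are unqualified; queue_node and make_edge are defined further below.
+//
+//     struct counter_body {
+//         int my_next;
+//         counter_body() : my_next(0) {}
+//         bool operator()( int &out ) {
+//             if ( my_next < 10 ) { out = my_next++; return true; }
+//             return false;                        // done: the node stops spawning tasks
+//         }
+//     };
+//
+//     graph g;
+//     queue_node<int> items( g );
+//     source_node<int> src( g, counter_body(), false );   // constructed inactive
+//     make_edge( src, items );
+//     src.activate();                                      // start emitting 0..9
+//     g.wait_for_all();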
+        
+//! Implements a function node that supports Input -> Output
+template < typename Input, typename Output = continue_msg, graph_buffer_policy = queueing, typename Allocator=cache_aligned_allocator<Input> >
+class function_node : public graph_node, public internal::function_input<Input,Output,Allocator>, public internal::function_output<Output> {
+public:
+        
+    typedef Input input_type;
+    typedef Output output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    template< typename Body >
+    function_node( graph &g, size_t concurrency, Body body )
+    : internal::function_input<input_type,output_type,Allocator>( g, concurrency, body ) {
+        my_successors.set_owner(this);
+    }
+
+    //! Copy constructor
+    function_node( const function_node& src ) : 
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), 
+#endif
+        internal::function_input<input_type,output_type,Allocator>( src )
+#if ( __TBB_GCC_VERSION < 40202 )
+        , internal::function_output<Output>()
+#endif
+    {
+        my_successors.set_owner(this);
+    }
+        
+protected:
+
+    internal::broadcast_cache<output_type> my_successors; 
+    /* override */ internal::broadcast_cache<output_type> &successors () { return my_successors; }
+        
+};
+
+//! Specialization for the queueing policy: inputs that cannot be processed immediately are buffered
+template < typename Input, typename Output, typename Allocator >
+class function_node<Input,Output,queueing,Allocator> : public graph_node, public internal::function_input<Input,Output,Allocator>, public internal::function_output<Output> {
+public:
+        
+    typedef Input input_type;
+    typedef Output output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    template< typename Body >
+    function_node( graph &g, size_t concurrency, Body body )
+    : internal::function_input< input_type, output_type, Allocator >( g, concurrency, body, new internal::function_input_queue< input_type, Allocator >() ) {
+        my_successors.set_owner(this);
+    }
+
+    //! Copy constructor
+    function_node( const function_node& src ) : 
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), 
+#endif
+        internal::function_input<input_type,output_type,Allocator>( src, new internal::function_input_queue< input_type, Allocator >() )
+#if ( __TBB_GCC_VERSION < 40202 )
+        , internal::function_output<Output>()
+#endif
+    {
+        my_successors.set_owner(this);
+    }
+
+protected:
+
+    internal::broadcast_cache<output_type> my_successors; 
+    /* override */ internal::broadcast_cache<output_type> &successors () { return my_successors; }
+        
+};
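+
+// Usage sketch (editor's illustration, not part of the TBB sources): a serial
+// function_node applies its body to each incoming message and broadcasts the result to
+// its successors.  The specialization above additionally queues inputs that arrive while
+// the node is already running at its concurrency limit.
+//
+//     struct square_body { int operator()( int v ) const { return v * v; } };
+//
+//     graph g;
+//     function_node< int, int > squarer( g, 1, square_body() );   // concurrency of one
+//     squarer.try_put( 7 );    // 49 is forwarded to any registered successors
+//     g.wait_for_all();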
+        
+//! Implements an executable node that supports continue_msg -> Output
+template <typename Output>
+class continue_node : public graph_node, public internal::continue_input<Output>, public internal::function_output<Output> {
+public:
+        
+    typedef continue_msg input_type;
+    typedef Output output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+     //! Constructor for executable node with continue_msg -> Output
+     template <typename Body >
+     continue_node( graph &g, Body body )
+             : internal::continue_input<output_type>( g, body ) {
+         my_successors.set_owner(this);
+     }
+        
+    //! Constructor for executable node with continue_msg -> Output
+    template <typename Body >
+    continue_node( graph &g, int number_of_predecessors, Body body )
+        : internal::continue_input<output_type>( g, number_of_predecessors, body )
+    {
+        my_successors.set_owner(this);
+    }
+    //! Copy constructor       
+    continue_node( const continue_node& src ) :
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(),
+#endif
+        internal::continue_input<output_type>(src)
+#if ( __TBB_GCC_VERSION < 40202 )
+        , internal::function_output<Output>()
+#endif
+    {
+        my_successors.set_owner(this);
+    }
+
+protected:
+        
+    internal::broadcast_cache<output_type> my_successors; 
+    /* override */ internal::broadcast_cache<output_type> &successors () { return my_successors; }
+        
+};
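+
+// Usage sketch (editor's illustration, not part of the TBB sources): a continue_node
+// runs its body once it has received a continue_msg from each predecessor, which makes
+// it the building block for dependency-style graphs.  Here the predecessor count is
+// given explicitly through the constructor that takes a predecessor count, shown above.
+//
+//     struct report_body {
+//         continue_msg operator()( const continue_msg & ) const {
+//             std::printf("both signals received\n");
+//             return continue_msg();
+//         }
+//     };
+//
+//     graph g;
+//     continue_node< continue_msg > report( g, 2, report_body() );
+//     report.try_put( continue_msg() );   // first signal: body does not run yet
+//     report.try_put( continue_msg() );   // second signal: body executes
+//     g.wait_for_all();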
+        
+template< typename T >
+class overwrite_node : public graph_node, public receiver<T>, public sender<T> {
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    overwrite_node() : my_buffer_is_valid(false) {
+        my_successors.set_owner( this );
+    }
+
+    // Copy constructor; doesn't take anything from src; default won't work
+    overwrite_node( const overwrite_node& ) : 
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), receiver<T>(), sender<T>(),
+#endif
+        my_buffer_is_valid(false) 
+    {
+        my_successors.set_owner( this );
+    }
+        
+    ~overwrite_node() {}
+        
+    /* override */ bool register_successor( successor_type &s ) {
+        spin_mutex::scoped_lock l( my_mutex );
+        if ( my_buffer_is_valid ) {
+            // We have a valid value that must be forwarded immediately.
+            if ( s.try_put( my_buffer ) || !s.register_predecessor( *this  ) ) {
+                // We add the successor: it accepted our put or it rejected it but won't let us become a predecessor
+                my_successors.register_successor( s );
+                return true;
+            } else {
+                // We don't add the successor: it rejected our put and we became its predecessor instead
+                return false;
+            }
+        } else {
+            // No valid value yet, just add as successor
+            my_successors.register_successor( s );
+            return true;
+        }
+    }
+        
+    /* override */ bool remove_successor( successor_type &s ) {
+        spin_mutex::scoped_lock l( my_mutex );
+        my_successors.remove_successor(s);
+        return true;
+    }
+        
+    /* override */ bool try_put( const T &v ) {
+        spin_mutex::scoped_lock l( my_mutex );
+        my_buffer = v;
+        my_buffer_is_valid = true;
+        my_successors.try_put(v);
+        return true;
+    }
+        
+    /* override */ bool try_get( T &v ) {
+        spin_mutex::scoped_lock l( my_mutex );
+        if ( my_buffer_is_valid ) {
+            v = my_buffer;
+            return true;
+        } else {
+            return false;
+        }
+    }
+        
+    bool is_valid() {
+       spin_mutex::scoped_lock l( my_mutex );
+       return my_buffer_is_valid;
+    }
+        
+    void clear() {
+       spin_mutex::scoped_lock l( my_mutex );
+       my_buffer_is_valid = false;
+    }
+        
+protected:
+        
+    spin_mutex my_mutex;
+    internal::broadcast_cache< T, null_rw_mutex > my_successors;
+    T my_buffer;
+    bool my_buffer_is_valid;
+        
+};
+        
+template< typename T >
+class write_once_node : public overwrite_node<T> {
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    write_once_node() : overwrite_node<T>() {}
+
+    //! Copy constructor: call base class copy constructor
+    write_once_node( const write_once_node& src ) : overwrite_node<T>(src) {}
+
+    /* override */ bool try_put( const T &v ) {
+        spin_mutex::scoped_lock l( this->my_mutex );
+        if ( this->my_buffer_is_valid ) {
+            return false;
+        } else {
+            this->my_buffer = v;
+            this->my_buffer_is_valid = true;
+            this->my_successors.try_put(v);
+            return true;
+        }
+    }
+};
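+
+// Usage sketch (editor's illustration, not part of the TBB sources): both nodes hold a
+// single value; overwrite_node lets later puts replace it, while write_once_node keeps
+// the first value it sees until clear() is called.
+//
+//     overwrite_node<int> latest;
+//     latest.try_put( 1 );
+//     latest.try_put( 2 );      // accepted, replaces the stored value
+//     int v = 0;
+//     latest.try_get( v );      // v == 2; the buffer stays valid
+//
+//     write_once_node<int> first;
+//     first.try_put( 10 );      // accepted
+//     first.try_put( 20 );      // rejected: returns false, 10 is kept
+//     first.clear();            // inherited from overwrite_node; forgets the value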
+        
+//! Forwards messages of type T to all successors
+template <typename T>
+class broadcast_node : public graph_node, public receiver<T>, public sender<T> {
+        
+    internal::broadcast_cache<T> my_successors;
+        
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    broadcast_node( ) {
+       my_successors.set_owner( this ); 
+    }
+        
+    // Copy constructor
+    broadcast_node( const broadcast_node& ) 
+#if ( __TBB_GCC_VERSION < 40202 )
+        : graph_node(), receiver<T>(), sender<T>()
+#endif
+    {
+       my_successors.set_owner( this ); 
+    }
+        
+    //! Adds a successor
+    virtual bool register_successor( receiver<T> &r ) {
+        my_successors.register_successor( r );
+        return true;
+    }
+        
+    //! Removes r as a successor
+    virtual bool remove_successor( receiver<T> &r ) {
+        my_successors.remove_successor( r );
+        return true;
+    }
+        
+    /* override */ bool try_put( const T &t ) {
+        my_successors.try_put(t);
+        return true;
+    }
+        
+};
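+
+// Usage sketch (editor's illustration, not part of the TBB sources): a broadcast_node
+// forwards every message it receives to all registered successors; queue_node and
+// make_edge are defined further below in this header.
+//
+//     graph g;
+//     broadcast_node<int> fan_out;
+//     queue_node<int> copy_a( g ), copy_b( g );
+//     make_edge( fan_out, copy_a );
+//     make_edge( fan_out, copy_b );
+//     fan_out.try_put( 42 );    // both queues now hold 42
+//     g.wait_for_all();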
+
+#include "internal/_flow_graph_item_buffer_impl.h"
+
+//! Forwards messages in arbitrary order
+template <typename T, typename A=cache_aligned_allocator<T> >
+class buffer_node : public graph_node, public reservable_item_buffer<T, A>, public receiver<T>, public sender<T> {
+public:
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+    typedef buffer_node<T, A> my_class;
+protected:
+    typedef size_t size_type;
+    internal::round_robin_cache< T, null_rw_mutex > my_successors;
+        
+    task *my_parent;
+        
+    friend class internal::forward_task< buffer_node< T, A > >;
+        
+    enum op_type {reg_succ, rem_succ, req_item, res_item, rel_res, con_res, put_item, try_fwd};
+    enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        
+    // implements the aggregator_operation concept
+    class buffer_operation : public internal::aggregated_operation< buffer_operation > {
+    public:
+        char type;
+        T *elem;
+        successor_type *r;
+        buffer_operation(const T& e, op_type t) :
+            type(char(t)), elem(const_cast<T*>(&e)), r(NULL) {}
+        buffer_operation(op_type t) : type(char(t)), r(NULL) {}
+    };
+        
+    bool forwarder_busy;
+    typedef internal::aggregating_functor<my_class, buffer_operation> my_handler;
+    friend class internal::aggregating_functor<my_class, buffer_operation>;
+    internal::aggregator< my_handler, buffer_operation> my_aggregator;
+        
+    virtual void handle_operations(buffer_operation *op_list) {
+        buffer_operation *tmp;
+        bool try_forwarding=false;
+        while (op_list) {
+            tmp = op_list;
+            op_list = op_list->next;
+            switch (tmp->type) {
+            case reg_succ: internal_reg_succ(tmp);  try_forwarding = true; break;
+            case rem_succ: internal_rem_succ(tmp); break;
+            case req_item: internal_pop(tmp); break;
+            case res_item: internal_reserve(tmp); break;
+            case rel_res:  internal_release(tmp);  try_forwarding = true; break;
+            case con_res:  internal_consume(tmp);  try_forwarding = true; break;
+            case put_item: internal_push(tmp);  try_forwarding = true; break;
+            case try_fwd:  internal_forward(tmp); break;
+            }
+        }
+        if (try_forwarding && !forwarder_busy) {
+            forwarder_busy = true;
+            task::enqueue(*new(task::allocate_additional_child_of(*my_parent)) internal::forward_task< buffer_node<input_type, A> >(*this));
+        }
+    }
+        
+    //! This is executed by an enqueued task, the "forwarder"
+    virtual void forward() {
+        buffer_operation op_data(try_fwd);
+        do {
+            op_data.status = WAIT;
+            my_aggregator.execute(&op_data);
+        } while (op_data.status == SUCCEEDED);
+    }
+        
+    //! Register successor
+    virtual void internal_reg_succ(buffer_operation *op) {
+        my_successors.register_successor(*(op->r));
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+    //! Remove successor
+    virtual void internal_rem_succ(buffer_operation *op) {
+        my_successors.remove_successor(*(op->r));
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+    //! Tries to forward valid items to successors
+    virtual void internal_forward(buffer_operation *op) {
+        T i_copy;
+        bool success = false; // flagged when a successor accepts
+        size_type counter = my_successors.size();
+        // Try forwarding, giving each successor a chance
+        while (counter>0 && !this->buffer_empty() && this->item_valid(this->my_tail-1)) {
+            this->fetch_back(i_copy);
+            if( my_successors.try_put(i_copy) ) {
+                this->invalidate_back();
+                --(this->my_tail);
+                success = true; // found an accepting successor
+            }
+            --counter;
+        }
+        if (success && !counter)
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        else {
+            __TBB_store_with_release(op->status, FAILED);
+            forwarder_busy = false;
+        }
+    }
+        
+    virtual void internal_push(buffer_operation *op) {
+        this->push_back(*(op->elem));
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+    virtual void internal_pop(buffer_operation *op) {
+        if(this->pop_back(*(op->elem))) {
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        }
+        else {
+            __TBB_store_with_release(op->status, FAILED);
+        }
+    }
+        
+    virtual void internal_reserve(buffer_operation *op) {
+        if(this->reserve_front(*(op->elem))) {
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        }
+        else {
+            __TBB_store_with_release(op->status, FAILED);
+        }
+    }
+        
+    virtual void internal_consume(buffer_operation *op) {
+        this->consume_front();
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+    virtual void internal_release(buffer_operation *op) {
+        this->release_front();
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+public:
+    //! Constructor
+    buffer_node( graph &g ) : reservable_item_buffer<T>(),
+        my_parent( g.root_task() ), forwarder_busy(false) {
+        my_successors.set_owner(this);
+        my_aggregator.initialize_handler(my_handler(this));
+    }
+
+    //! Copy constructor
+    buffer_node( const buffer_node& src ) : 
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), 
+#endif
+        reservable_item_buffer<T>(),
+#if ( __TBB_GCC_VERSION < 40202 )
+        receiver<T>(), sender<T>(),
+#endif
+        my_parent( src.my_parent )  
+    {
+        forwarder_busy = false;
+        my_successors.set_owner(this);
+        my_aggregator.initialize_handler(my_handler(this));
+    }
+
+    virtual ~buffer_node() {}
+        
+    //
+    // message sender implementation
+    //
+        
+    //! Adds a new successor.
+    /** Adds successor r to the list of successors; may forward tasks.  */
+    /* override */ bool register_successor( receiver<output_type> &r ) {
+        buffer_operation op_data(reg_succ);
+        op_data.r = &r;
+        my_aggregator.execute(&op_data);
+        return true;
+    }
+        
+    //! Removes a successor.
+    /** Removes successor r from the list of successors.
+        It also calls r.remove_predecessor(*this) to remove this node as a predecessor. */
+    /* override */ bool remove_successor( receiver<output_type> &r ) {
+        r.remove_predecessor(*this);
+        buffer_operation op_data(rem_succ);
+        op_data.r = &r;
+        my_aggregator.execute(&op_data);
+        return true;
+    }
+        
+    //! Request an item from the buffer_node
+    /**  true = v contains the returned item<BR>
+         false = no item has been returned */
+    /* override */ bool try_get( T &v ) {
+        buffer_operation op_data(req_item);
+        op_data.elem = &v;
+        my_aggregator.execute(&op_data);
+        return (op_data.status==SUCCEEDED);
+    }
+        
+    //! Reserves an item.
+    /**  false = no item can be reserved<BR>
+         true = an item is reserved */
+    /* override */ bool try_reserve( T &v ) {
+        buffer_operation op_data(res_item);
+        op_data.elem = &v;
+        my_aggregator.execute(&op_data);
+        return (op_data.status==SUCCEEDED);
+    }
+        
+    //! Release a reserved item.
+    /**  true = item has been released and so remains in sender */
+    /* override */ bool try_release() {
+        buffer_operation op_data(rel_res);
+        my_aggregator.execute(&op_data);
+        return true;
+    }
+        
+    //! Consumes a reserved item.
+    /** true = item is removed from sender and reservation removed */
+    /* override */ bool try_consume() {
+        buffer_operation op_data(con_res);
+        my_aggregator.execute(&op_data);
+        return true;
+    }
+        
+    //! Receive an item
+    /** true is always returned */
+    /* override */ bool try_put(const T &t) {
+        buffer_operation op_data(t, put_item);
+        my_aggregator.execute(&op_data);
+        return true;
+    }
+};
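+
+// Usage sketch (editor's illustration, not part of the TBB sources): a buffer_node
+// stores items and releases them to successors, or to explicit try_get calls, in no
+// particular order.  Every public call above is turned into a buffer_operation and
+// funneled through the aggregator, so one thread at a time applies the queued
+// operations without taking a lock per call.
+//
+//     graph g;
+//     buffer_node<int> store( g );
+//     store.try_put( 1 );
+//     store.try_put( 2 );
+//     int v = 0;
+//     while ( store.try_get( v ) )
+//         std::printf("got %d\n", v);   // order of retrieval is unspecified
+//     g.wait_for_all();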
+        
+        
+//! Forwards messages in FIFO order
+template <typename T, typename A=cache_aligned_allocator<T> >
+class queue_node : public buffer_node<T, A> {
+protected:
+    typedef typename buffer_node<T, A>::size_type size_type;
+    typedef typename buffer_node<T, A>::buffer_operation queue_operation;
+        
+    enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        
+    //! Tries to forward valid items to successors
+    /* override */ void internal_forward(queue_operation *op) {
+        T i_copy;
+        bool success = false; // flagged when a successor accepts
+        size_type counter = this->my_successors.size();
+        if (this->my_reserved || !this->item_valid(this->my_head)){
+            __TBB_store_with_release(op->status, FAILED);
+            this->forwarder_busy = false;
+            return;
+        }
+        // Keep trying to send items while there is at least one accepting successor
+        while (counter>0 && this->item_valid(this->my_head)) {
+            this->fetch_front(i_copy);
+            if(this->my_successors.try_put(i_copy)) {
+                this->invalidate_front();
+                ++(this->my_head);
+                success = true; // found an accepting successor
+            }
+            --counter;
+        }
+        if (success && !counter)
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        else {
+            __TBB_store_with_release(op->status, FAILED);
+            this->forwarder_busy = false;
+        }
+    }
+        
+    /* override */ void internal_pop(queue_operation *op) {
+        if ( this->my_reserved || !this->item_valid(this->my_head)){
+            __TBB_store_with_release(op->status, FAILED);
+        }
+        else {
+            this->pop_front(*(op->elem));
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        }
+    }
+    /* override */ void internal_reserve(queue_operation *op) {
+        if (this->my_reserved || !this->item_valid(this->my_head)) {
+            __TBB_store_with_release(op->status, FAILED);
+        }
+        else {
+            this->my_reserved = true;
+            this->fetch_front(*(op->elem));
+            this->invalidate_front();
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        }
+    }
+    /* override */ void internal_consume(queue_operation *op) {
+        this->consume_front();
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+        
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    queue_node( graph &g ) : buffer_node<T, A>(g) {}
+
+    //! Copy constructor
+    queue_node( const queue_node& src) : buffer_node<T, A>(src) {}
+};
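+
+// Usage sketch (editor's illustration, not part of the TBB sources): in contrast to
+// buffer_node, queue_node hands items out strictly in the order they were put.
+//
+//     graph g;
+//     queue_node<int> fifo( g );
+//     fifo.try_put( 1 );
+//     fifo.try_put( 2 );
+//     int v = 0;
+//     fifo.try_get( v );   // v == 1
+//     fifo.try_get( v );   // v == 2
+//     g.wait_for_all();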
+        
+//! Forwards messages in sequence order
+template< typename T, typename A=cache_aligned_allocator<T> >
+class sequencer_node : public queue_node<T, A> {
+    internal::function_body< T, size_t > *my_sequencer;
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    template< typename Sequencer >
+    sequencer_node( graph &g, const Sequencer& s ) : queue_node<T, A>(g),
+        my_sequencer(new internal::function_body_leaf< T, size_t, Sequencer>(s) ) {}
+
+    //! Copy constructor
+    sequencer_node( const sequencer_node& src ) : queue_node<T, A>(src),
+        my_sequencer( src.my_sequencer->clone() ) {}
+        
+    //! Destructor
+    ~sequencer_node() { delete my_sequencer; }
+protected:
+    typedef typename buffer_node<T, A>::size_type size_type;
+    typedef typename buffer_node<T, A>::buffer_operation sequencer_operation;
+        
+    enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        
+private:
+    /* override */ void internal_push(sequencer_operation *op) {
+        size_type tag = (*my_sequencer)(*(op->elem));
+        
+        this->my_tail = (tag+1 > this->my_tail) ? tag+1 : this->my_tail;
+        
+        if(this->size() > this->capacity())
+            this->grow_my_array(this->size());  // tail already has 1 added to it
+        this->item(tag) = std::make_pair( *(op->elem), true );
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+};
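+
+// Usage sketch (editor's illustration, not part of the TBB sources): the sequencer
+// functor maps each item to its sequence number, and the node releases items in
+// increasing sequence order even when they arrive out of order.
+//
+//     struct take_index {
+//         size_t operator()( const std::pair<size_t,int> &p ) const { return p.first; }
+//     };
+//
+//     graph g;
+//     sequencer_node< std::pair<size_t,int> > ordered( g, take_index() );
+//     ordered.try_put( std::make_pair( size_t(1), 20 ) );   // held back: sequence 0 missing
+//     ordered.try_put( std::make_pair( size_t(0), 10 ) );   // completes the prefix
+//     std::pair<size_t,int> v;
+//     ordered.try_get( v );   // v.first == 0
+//     ordered.try_get( v );   // v.first == 1
+//     g.wait_for_all();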
+        
+//! Forwards messages in priority order
+template< typename T, typename Compare = std::less<T>, typename A=cache_aligned_allocator<T> >
+class priority_queue_node : public buffer_node<T, A> {
+public:
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+    //! Constructor
+    priority_queue_node( graph &g ) : buffer_node<T, A>(g), mark(0) {}
+
+    //! Copy constructor
+    priority_queue_node( const priority_queue_node &src ) : buffer_node<T, A>(src), mark(0) {}
+        
+protected:
+    typedef typename buffer_node<T, A>::size_type size_type;
+    typedef typename buffer_node<T, A>::item_type item_type;
+    typedef typename buffer_node<T, A>::buffer_operation prio_operation;
+        
+    enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        
+    /* override */ void handle_operations(prio_operation *op_list) {
+        prio_operation *tmp /*, *pop_list*/ ;
+        bool try_forwarding=false;
+        while (op_list) {
+            tmp = op_list;
+            op_list = op_list->next;
+            switch (tmp->type) {
+            case buffer_node<T, A>::reg_succ: this->internal_reg_succ(tmp); try_forwarding = true; break;
+            case buffer_node<T, A>::rem_succ: this->internal_rem_succ(tmp); break;
+            case buffer_node<T, A>::put_item: internal_push(tmp); try_forwarding = true; break;
+            case buffer_node<T, A>::try_fwd: internal_forward(tmp); break;
+            case buffer_node<T, A>::rel_res: internal_release(tmp); try_forwarding = true; break;
+            case buffer_node<T, A>::con_res: internal_consume(tmp); try_forwarding = true; break;
+            case buffer_node<T, A>::req_item: internal_pop(tmp); break;
+            case buffer_node<T, A>::res_item: internal_reserve(tmp); break;
+            }
+        }
+        // process pops!  for now, no special pop processing
+        if (mark<this->my_tail) heapify();
+        if (try_forwarding && !this->forwarder_busy) {
+            this->forwarder_busy = true;
+            task::enqueue(*new(task::allocate_additional_child_of(*(this->my_parent))) internal::forward_task< buffer_node<input_type, A> >(*this));
+        }
+    }
+        
+    //! Tries to forward valid items to successors
+    /* override */ void internal_forward(prio_operation *op) {
+        T i_copy;
+        bool success = false; // flagged when a successor accepts
+        size_type counter = this->my_successors.size();
+        
+        if (this->my_reserved || this->my_tail == 0) {
+            __TBB_store_with_release(op->status, FAILED);
+            this->forwarder_busy = false;
+            return;
+        }
+        // Keep trying to send while there exists an accepting successor
+        while (counter>0 && this->my_tail > 0) {
+            i_copy = this->my_array[0].first;
+            bool msg = this->my_successors.try_put(i_copy);
+            if ( msg == true ) {
+                 if (mark == this->my_tail) --mark;
+                --(this->my_tail);
+                this->my_array[0].first=this->my_array[this->my_tail].first;
+                if (this->my_tail > 1) // don't reheap for heap of size 1
+                    reheap();
+                success = true; // found an accepting successor
+            }
+            --counter;
+        }
+        if (success && !counter)
+            __TBB_store_with_release(op->status, SUCCEEDED);
+        else {
+            __TBB_store_with_release(op->status, FAILED);
+            this->forwarder_busy = false;
+        }
+    }
+        
+    /* override */ void internal_push(prio_operation *op) {
+        if ( this->my_tail >= this->my_array_size )
+            this->grow_my_array( this->my_tail + 1 );
+        this->my_array[this->my_tail] = std::make_pair( *(op->elem), true );
+        ++(this->my_tail);
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+    /* override */ void internal_pop(prio_operation *op) {
+        if ( this->my_reserved == true || this->my_tail == 0 ) {
+            __TBB_store_with_release(op->status, FAILED);
+        }
+        else {
+            if (mark<this->my_tail &&
+                compare(this->my_array[0].first,
+                        this->my_array[this->my_tail-1].first)) {
+                // there are newly pushed elems; last one higher than top
+                // copy the data
+                *(op->elem) = this->my_array[this->my_tail-1].first;
+                --(this->my_tail);
+                __TBB_store_with_release(op->status, SUCCEEDED);
+            }
+            else { // extract and push the last element down heap
+                *(op->elem) = this->my_array[0].first; // copy the data
+                if (mark == this->my_tail) --mark;
+                --(this->my_tail);
+                __TBB_store_with_release(op->status, SUCCEEDED);
+                this->my_array[0].first=this->my_array[this->my_tail].first;
+                if (this->my_tail > 1) // don't reheap for heap of size 1
+                    reheap();
+            }
+        }
+    }
+    /* override */ void internal_reserve(prio_operation *op) {
+        if (this->my_reserved == true || this->my_tail == 0) {
+            __TBB_store_with_release(op->status, FAILED);
+        }
+        else {
+            this->my_reserved = true;
+            *(op->elem) = reserved_item = this->my_array[0].first;
+            if (mark == this->my_tail) --mark;
+            --(this->my_tail);
+            __TBB_store_with_release(op->status, SUCCEEDED);
+            this->my_array[0].first = this->my_array[this->my_tail].first;
+            if (this->my_tail > 1) // don't reheap for heap of size 1
+                reheap();
+        }
+    }
+    /* override */ void internal_consume(prio_operation *op) {
+        this->my_reserved = false;
+        __TBB_store_with_release(op->status, SUCCEEDED);
+    }
+    /* override */ void internal_release(prio_operation *op) {
+        if (this->my_tail >= this->my_array_size)
+            this->grow_my_array( this->my_tail + 1 );
+        this->my_array[this->my_tail] = std::make_pair(reserved_item, true);
+        ++(this->my_tail);
+        this->my_reserved = false;
+        __TBB_store_with_release(op->status, SUCCEEDED);
+        heapify();
+    }
+private:
+    Compare compare;
+    size_type mark;
+    input_type reserved_item;
+        
+    void heapify() {
+        if (!mark) mark = 1;
+        for (; mark<this->my_tail; ++mark) { // for each unheaped element
+            size_type cur_pos = mark;
+            input_type to_place = this->my_array[mark].first;
+            do { // push to_place up the heap
+                size_type parent = (cur_pos-1)>>1;
+                if (!compare(this->my_array[parent].first, to_place))
+                    break;
+                this->my_array[cur_pos].first = this->my_array[parent].first;
+                cur_pos = parent;
+            } while( cur_pos );
+            this->my_array[cur_pos].first = to_place;
+        }
+    }
+        
+    void reheap() {
+        size_type cur_pos=0, child=1;
+        while (child < mark) {
+            size_type target = child;
+            if (child+1<mark &&
+                compare(this->my_array[child].first,
+                        this->my_array[child+1].first))
+                ++target;
+            // target now has the higher priority child
+            if (compare(this->my_array[target].first,
+                        this->my_array[this->my_tail].first))
+                break;
+            this->my_array[cur_pos].first = this->my_array[target].first;
+            cur_pos = target;
+            child = (cur_pos<<1)+1;
+        }
+        this->my_array[cur_pos].first = this->my_array[this->my_tail].first;
+    }
+};
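+
+// Usage sketch (editor's illustration, not part of the TBB sources): with the default
+// Compare of std::less<T>, the largest buffered item is forwarded or popped first.
+//
+//     graph g;
+//     priority_queue_node<int> prio( g );
+//     prio.try_put( 3 );
+//     prio.try_put( 7 );
+//     prio.try_put( 5 );
+//     int v = 0;
+//     prio.try_get( v );   // v == 7
+//     prio.try_get( v );   // v == 5
+//     g.wait_for_all();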
+        
+//! Forwards messages only if the threshold has not been reached
+/** This node forwards items until its threshold is reached.
+    It contains no buffering.  If the downstream node rejects, the
+    message is dropped. */
+template< typename T >
+class limiter_node : public graph_node, public receiver< T >, public sender< T > {
+public:
+        
+    typedef T input_type;
+    typedef T output_type;
+    typedef sender< input_type > predecessor_type;
+    typedef receiver< output_type > successor_type;
+        
+private:
+        
+    task *my_root_task;
+    size_t my_threshold;
+    size_t my_count;
+    internal::predecessor_cache< T > my_predecessors;
+    spin_mutex my_mutex;
+    internal::broadcast_cache< T > my_successors;
+    int init_decrement_predecessors;
+
+    friend class internal::forward_task< limiter_node<T> >;
+        
+    // Let decrementer call decrement_counter()
+    friend class internal::decrementer< limiter_node<T> >;
+        
+    void decrement_counter() {
+        input_type v;
+        
+        // If we can't get / put an item immediately then drop the count
+        if ( my_predecessors.get_item( v ) == false 
+             || my_successors.try_put(v) == false ) {
+            spin_mutex::scoped_lock lock(my_mutex);
+            --my_count;
+            if ( !my_predecessors.empty() ) 
+                task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+                            internal::forward_task< limiter_node<T> >( *this ) );
+        }
+    }
+        
+    void forward() {
+        {
+            spin_mutex::scoped_lock lock(my_mutex);
+            if ( my_count < my_threshold ) 
+                ++my_count;
+            else
+                return;
+        }
+        decrement_counter();
+    }
+        
+public:
+        
+    //! The internal receiver< continue_msg > that decrements the count
+    internal::decrementer< limiter_node<T> > decrement;
+        
+    //! Constructor
+    limiter_node(graph &g, size_t threshold, int num_decrement_predecessors=0) : 
+        my_root_task(g.root_task()), my_threshold(threshold), my_count(0), 
+        init_decrement_predecessors(num_decrement_predecessors), 
+        decrement(num_decrement_predecessors) 
+    {
+        my_predecessors.set_owner(this);
+        my_successors.set_owner(this);
+        decrement.set_owner(this);
+    }
+        
+    //! Copy constructor
+    limiter_node( const limiter_node& src ) : 
+#if ( __TBB_GCC_VERSION < 40202 )
+        graph_node(), receiver<T>(), sender<T>(),
+#endif
+        my_root_task(src.my_root_task), my_threshold(src.my_threshold), my_count(0), 
+        init_decrement_predecessors(src.init_decrement_predecessors), 
+        decrement(src.init_decrement_predecessors) 
+    {
+        my_predecessors.set_owner(this);
+        my_successors.set_owner(this);
+        decrement.set_owner(this);
+    }
+
+    //! Adds a successor to this node
+    /* override */ bool register_successor( receiver<output_type> &r ) {
+        my_successors.register_successor(r);
+        return true;
+    }
+        
+    //! Removes a successor from this node
+    /** r.remove_predecessor(*this) is also called. */
+    /* override */ bool remove_successor( receiver<output_type> &r ) {
+        r.remove_predecessor(*this);
+        my_successors.remove_successor(r);
+        return true;
+    }
+        
+    //! Puts an item to this receiver
+    /* override */ bool try_put( const T &t ) {
+        {
+            spin_mutex::scoped_lock lock(my_mutex);
+            if ( my_count >= my_threshold ) 
+                return false;
+            else
+                ++my_count; 
+        }
+        
+        bool msg = my_successors.try_put(t);
+        
+        if ( msg != true ) {
+            spin_mutex::scoped_lock lock(my_mutex);
+            --my_count;
+            if ( !my_predecessors.empty() ) 
+                task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+                            internal::forward_task< limiter_node<T> >( *this ) );
+        }
+        
+        return msg;
+    }
+        
+    //! Adds src to the list of cached predecessors.
+    /* override */ bool register_predecessor( predecessor_type &src ) {
+        spin_mutex::scoped_lock lock(my_mutex);
+        my_predecessors.add( src );
+        if ( my_count < my_threshold && !my_successors.empty() ) 
+            task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+                           internal::forward_task< limiter_node<T> >( *this ) );
+        return true;
+    }
+        
+    //! Removes src from the list of cached predecessors.
+    /* override */ bool remove_predecessor( predecessor_type &src ) {
+        my_predecessors.remove( src );
+        return true;
+    }
+        
+};
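+
+// Usage sketch (editor's illustration, not part of the TBB sources): the limiter lets
+// at most `threshold` messages through; each continue_msg sent to its public decrement
+// port lowers the in-flight count again, typically as feedback from the end of a
+// pipeline.
+//
+//     graph g;
+//     limiter_node<int> gate( g, 1 );             // at most one item in flight
+//     queue_node<int> sink( g );
+//     make_edge( gate, sink );
+//     gate.try_put( 1 );                          // accepted; the count hits the threshold
+//     gate.try_put( 2 );                          // rejected: returns false
+//     gate.decrement.try_put( continue_msg() );   // completion signal; count drops
+//     gate.try_put( 2 );                          // accepted now
+//     g.wait_for_all();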
+
+#include "internal/_flow_graph_join_impl.h"
+
+using internal::reserving_port;
+using internal::queueing_port;
+using internal::tag_matching_port;
+using internal::input_port;
+using internal::tag_value;
+using internal::NO_TAG;
+
+template<typename OutputTuple, graph_buffer_policy JP=queueing> class join_node;
+
+template<typename OutputTuple>
+class join_node<OutputTuple,reserving>: public internal::unfolded_join_node<std::tuple_size<OutputTuple>::value, reserving_port, OutputTuple, reserving> {
+private:
+    static const int N = std::tuple_size<OutputTuple>::value;
+    typedef typename internal::unfolded_join_node<N, reserving_port, OutputTuple, reserving> unfolded_type;
+public:
+    typedef OutputTuple output_type;
+    typedef typename unfolded_type::input_ports_tuple_type input_ports_tuple_type;
+    join_node(graph &g) : unfolded_type(g) { }
+    join_node(const join_node &other) : unfolded_type(other) {}
+};
+
+template<typename OutputTuple>
+class join_node<OutputTuple,queueing>: public internal::unfolded_join_node<std::tuple_size<OutputTuple>::value, queueing_port, OutputTuple, queueing> {
+private:
+    static const int N = std::tuple_size<OutputTuple>::value;
+    typedef typename internal::unfolded_join_node<N, queueing_port, OutputTuple, queueing> unfolded_type;
+public:
+    typedef OutputTuple output_type;
+    typedef typename unfolded_type::input_ports_tuple_type input_ports_tuple_type;
+    join_node(graph &g) : unfolded_type(g) { }
+    join_node(const join_node &other) : unfolded_type(other) {}
+};
+
+// template for tag_matching join_node
+template<typename OutputTuple>
+class join_node<OutputTuple, tag_matching> : public internal::unfolded_join_node<std::tuple_size<OutputTuple>::value,
+      tag_matching_port, OutputTuple, tag_matching> {
+private:
+    static const int N = std::tuple_size<OutputTuple>::value;
+    typedef typename internal::unfolded_join_node<N, tag_matching_port, OutputTuple, tag_matching> unfolded_type;
+public:
+    typedef OutputTuple output_type;
+    typedef typename unfolded_type::input_ports_tuple_type input_ports_tuple_type;
+    template<typename B0, typename B1>
+    join_node(graph &g, B0 b0, B1 b1) : unfolded_type(g, b0, b1) { }
+    template<typename B0, typename B1, typename B2>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2) : unfolded_type(g, b0, b1, b2) { }
+    template<typename B0, typename B1, typename B2, typename B3>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3) : unfolded_type(g, b0, b1, b2, b3) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4) : unfolded_type(g, b0, b1, b2, b3, b4) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5) : unfolded_type(g, b0, b1, b2, b3, b4, b5) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7, typename B8>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7, B8 b8) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8) { }
+    template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7, typename B8, typename B9>
+    join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7, B8 b8, B9 b9) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9) { }
+    join_node(const join_node &other) : unfolded_type(other) {}
+};
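+
+// Usage sketch (editor's illustration, not part of the TBB sources): a queueing
+// join_node combines one message from each input port into a std::tuple and forwards
+// it.  The free function input_port<N>() used below is the accessor imported from the
+// internal namespace above; its exact spelling in this snapshot is an assumption.
+//
+//     struct print_pair {
+//         continue_msg operator()( const std::tuple<int,float> &t ) const {
+//             std::printf("%d %f\n", std::get<0>(t), (double)std::get<1>(t));
+//             return continue_msg();
+//         }
+//     };
+//
+//     graph g;
+//     join_node< std::tuple<int,float>, queueing > pair_up( g );
+//     function_node< std::tuple<int,float>, continue_msg > printer( g, 1, print_pair() );
+//     make_edge( pair_up, printer );
+//     input_port<0>( pair_up ).try_put( 3 );
+//     input_port<1>( pair_up ).try_put( 1.5f );
+//     g.wait_for_all();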
+
+// or node
+#include "internal/_flow_graph_or_impl.h"
+
+template<typename InputTuple>
+class or_node : public internal::unfolded_or_node<InputTuple> {
+private:
+    static const int N = std::tuple_size<InputTuple>::value;
+public:
+    typedef typename internal::or_output_type<N,InputTuple>::type output_type;
+    typedef typename internal::unfolded_or_node<InputTuple> unfolded_type;
+    or_node() : unfolded_type() { }
+    // Copy constructor
+    or_node( const or_node& /*other*/ ) : unfolded_type() { }
+};
+
+//! Makes an edge between a single predecessor and a single successor
+template< typename T >
+inline void make_edge( sender<T> &p, receiver<T> &s ) {
+    p.register_successor( s );
+}
+        
+//! Removes an edge between a single predecessor and a single successor
+template< typename T >
+inline void remove_edge( sender<T> &p, receiver<T> &s ) {
+    p.remove_successor( s );
+}
+
+//! Returns a copy of the body from a function or continue node
+template< typename Body, typename Node >
+Body copy_body( Node &n ) {
+    return n.template copy_function_object<Body>();
+}
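+
+// Putting the pieces together (editor's illustration, not part of the TBB sources): a
+// small three-stage pipeline built from the nodes above.  remove_edge() disconnects the
+// same pairs, and copy_body<twice>(doubler) would return a copy of the installed functor.
+//
+//     struct gen   { int my_i; gen() : my_i(0) {}
+//                    bool operator()( int &v ) { if ( my_i < 5 ) { v = my_i++; return true; } return false; } };
+//     struct twice { int operator()( int v ) const { return 2 * v; } };
+//     struct show  { continue_msg operator()( int v ) const { std::printf("%d\n", v); return continue_msg(); } };
+//
+//     graph g;
+//     source_node<int> producer( g, gen(), false );
+//     function_node<int,int> doubler( g, 1, twice() );
+//     function_node<int,continue_msg> printer( g, 1, show() );
+//     make_edge( producer, doubler );
+//     make_edge( doubler, printer );
+//     producer.activate();
+//     g.wait_for_all();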
+        
+        
+} // interface6
+
+    using interface6::graph;
+    using interface6::graph_node;
+    using interface6::continue_msg;
+    using interface6::sender;
+    using interface6::receiver;
+    using interface6::continue_receiver;
+
+    using interface6::source_node;
+    using interface6::function_node;
+    using interface6::continue_node;
+    using interface6::overwrite_node;
+    using interface6::write_once_node;
+    using interface6::broadcast_node;
+    using interface6::buffer_node;
+    using interface6::queue_node;
+    using interface6::sequencer_node;
+    using interface6::priority_queue_node;
+    using interface6::limiter_node;
+    using namespace interface6::internal::graph_policy_namespace;
+    using interface6::join_node;
+    using interface6::or_node;
+    using interface6::input_port;
+    using interface6::copy_body; 
+    using interface6::make_edge; 
+    using interface6::remove_edge; 
+    using interface6::internal::NO_TAG;
+    using interface6::internal::tag_value;
+
+} // graph
+} // tbb
+
+#endif
+
diff --git a/tbb/include/tbb/internal/_aggregator_impl.h b/tbb/include/tbb/internal/_aggregator_impl.h
new file mode 100644 (file)
index 0000000..7158500
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_aggregator_internal_H
+#define __TBB_aggregator_internal_H
+
+#include "../atomic.h"
+#include "../tbb_profiling.h"
+
+namespace tbb {
+namespace interface6 {
+namespace internal {
+
+using namespace tbb::internal;
+
+//! aggregated_operation base class
+template <typename Derived>
+class aggregated_operation {
+ public:
+    uintptr_t status;
+    Derived *next;
+    aggregated_operation() : status(0), next(NULL) {}
+};
+
+//! Aggregator base class
+/** An aggregator for collecting operations coming from multiple sources and executing
+    them serially on a single thread.  operation_type must be derived from
+    aggregated_operation. The parameter handler_type is a functor that will be passed the
+    list of operations and is expected to handle each operation appropriately, setting the
+    status of each operation to non-zero.*/
+ template < typename handler_type, typename operation_type >
+class aggregator {
+ public:
+    aggregator() : handler_busy(false) { pending_operations = NULL; }
+    explicit aggregator(handler_type h) : handler_busy(false), handle_operations(h) {
+        pending_operations = NULL; 
+    }
+
+    void initialize_handler(handler_type h) { handle_operations = h; }
+
+    //! Place operation in list
+    /** Place operation in list and either handle list or wait for operation to
+        complete.  */
+    void execute(operation_type *op) {
+        operation_type *res;
+
+        // ITT note: &(op->status) tag is used to cover accesses to this op node. This
+        // thread has created the operation, and now releases it so that the handler
+        // thread may handle the associated operation w/o triggering a race condition;
+        // thus this tag will be acquired just before the operation is handled in the
+        // handle_operations functor.
+        call_itt_notify(releasing, &(op->status));
+        // insert the operation in the queue
+        do {
+            // ITT may flag the following line as a race; it is a false positive:
+            // This is an atomic read; we don't provide itt_hide_load_word for atomics
+            op->next = res = pending_operations; // NOT A RACE 
+        } while (pending_operations.compare_and_swap(op, res) != res);
+        if (!res) { // first in the list; handle the operations
+            // ITT note: &pending_operations tag covers access to the handler_busy flag,
+            // which this waiting handler thread will try to set before entering
+            // handle_operations.
+            call_itt_notify(acquired, &pending_operations);
+            start_handle_operations();
+            __TBB_ASSERT(op->status, NULL);
+        }
+        else { // not first; wait for op to be ready
+            call_itt_notify(prepare, &(op->status));
+            spin_wait_while_eq(op->status, uintptr_t(0));
+            itt_load_word_with_acquire(op->status);
+        }
+    }
+
+ private:
+    //! An atomically updated list (aka mailbox) of pending operations
+    atomic<operation_type *> pending_operations;
+    //! Controls thread access to handle_operations
+    uintptr_t handler_busy;
+    handler_type handle_operations;
+
+    //! Trigger the handling of operations when the handler is free
+    void start_handle_operations() {
+        operation_type *op_list;
+
+        // ITT note: &handler_busy tag covers access to pending_operations as it is passed
+        // between active and waiting handlers.  Below, the waiting handler waits until
+        // the active handler releases, and the waiting handler acquires &handler_busy as
+        // it becomes the active_handler. The release point is at the end of this
+        // function, when all operations in pending_operations have been handled by the
+        // owner of this aggregator.
+        call_itt_notify(prepare, &handler_busy);
+        // get the handler_busy:
+        // only one thread can possibly spin here at a time
+        spin_wait_until_eq(handler_busy, uintptr_t(0));
+        call_itt_notify(acquired, &handler_busy);
+        // acquire fence not necessary here due to causality rule and surrounding atomics
+        __TBB_store_with_release(handler_busy, uintptr_t(1));
+
+        // ITT note: &pending_operations tag covers access to the handler_busy flag
+        // itself. Capturing the state of the pending_operations signifies that
+        // handler_busy has been set and a new active handler will now process that list's
+        // operations.
+        call_itt_notify(releasing, &pending_operations);
+        // grab pending_operations
+        op_list = pending_operations.fetch_and_store(NULL);
+
+        // handle all the operations
+        handle_operations(op_list);
+
+        // release the handler
+        itt_store_word_with_release(handler_busy, uintptr_t(0));
+    }
+};
+
+// the most-compatible friend declaration (vs, gcc, icc) is
+//    template<class U, class V> friend class aggregating_functor;
+template<typename aggregating_class, typename operation_list>
+class aggregating_functor {
+    aggregating_class *fi;
+public:
+    aggregating_functor() {}
+    aggregating_functor(aggregating_class *fi_) : fi(fi_) {}
+    void operator()(operation_list* op_list) { fi->handle_operations(op_list); }
+};
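+
+// Usage sketch (editor's illustration, not part of the TBB sources): the pattern the
+// flow-graph buffer nodes follow.  A container defines an operation record derived from
+// aggregated_operation, a handler that processes a batch of records serially, and an
+// aggregator member; each public method builds a record on the stack and calls execute().
+//
+//     class concurrent_counter {
+//         struct count_op : aggregated_operation<count_op> {
+//             int delta;
+//             count_op( int d ) : delta(d) {}
+//         };
+//         typedef aggregating_functor<concurrent_counter, count_op> handler_type;
+//         friend class aggregating_functor<concurrent_counter, count_op>;
+//         aggregator<handler_type, count_op> my_aggregator;
+//         int my_value;
+//         void handle_operations( count_op *op_list ) {
+//             while ( op_list ) {                        // only one thread runs this at a time
+//                 count_op *op = op_list;
+//                 op_list = op_list->next;
+//                 my_value += op->delta;
+//                 __TBB_store_with_release( op->status, uintptr_t(1) );   // wake the waiter
+//             }
+//         }
+//     public:
+//         concurrent_counter() : my_value(0) { my_aggregator.initialize_handler( handler_type(this) ); }
+//         void add( int d ) { count_op op(d); my_aggregator.execute( &op ); }
+//     };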
+
+} // namespace internal
+} // namespace interface6
+
+namespace internal {
+    using interface6::internal::aggregated_operation;
+    using interface6::internal::aggregator;
+    using interface6::internal::aggregating_functor;
+} // namespace internal
+
+} // namespace tbb
+
+#endif
diff --git a/tbb/include/tbb/internal/_concurrent_queue_impl.h b/tbb/include/tbb/internal/_concurrent_queue_impl.h
new file mode 100644 (file)
index 0000000..7cd4730
--- /dev/null
@@ -0,0 +1,1019 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_concurrent_queue_internal_H
+#define __TBB_concurrent_queue_internal_H
+
+#include "../tbb_stddef.h"
+#include "../tbb_machine.h"
+#include "../atomic.h"
+#include "../spin_mutex.h"
+#include "../cache_aligned_allocator.h"
+#include "../tbb_exception.h"
+#include "../tbb_profiling.h"
+#include <new>
+
+#if !TBB_USE_EXCEPTIONS && _MSC_VER
+    // Suppress "C++ exception handler used, but unwind semantics are not enabled" warning in STL headers
+    #pragma warning (push)
+    #pragma warning (disable: 4530)
+#endif
+
+#include <iterator>
+
+#if !TBB_USE_EXCEPTIONS && _MSC_VER
+    #pragma warning (pop)
+#endif
+
+namespace tbb {
+
+#if !__TBB_TEMPLATE_FRIENDS_BROKEN
+
+// forward declaration
+namespace strict_ppl {
+template<typename T, typename A> class concurrent_queue;
+}
+
+template<typename T, typename A> class concurrent_bounded_queue;
+
+namespace deprecated {
+template<typename T, typename A> class concurrent_queue;
+}
+#endif
+
+//! For internal use only.
+namespace strict_ppl {
+
+//! @cond INTERNAL
+namespace internal {
+
+using namespace tbb::internal;
+
+typedef size_t ticket;
+
+template<typename T> class micro_queue ;
+template<typename T> class micro_queue_pop_finalizer ;
+template<typename T> class concurrent_queue_base_v3;
+
+//! parts of concurrent_queue_rep that do not have references to micro_queue
+/**
+ * For internal use only.
+ */
+struct concurrent_queue_rep_base : no_copy {
+    template<typename T> friend class micro_queue;
+    template<typename T> friend class concurrent_queue_base_v3;
+
+protected:
+    //! Approximately n_queue/golden ratio
+    static const size_t phi = 3;
+
+public:
+    // must be power of 2
+    static const size_t n_queue = 8;
+
+    //! Prefix on a page
+    struct page {
+        page* next;
+        uintptr_t mask; 
+    };
+
+    atomic<ticket> head_counter;
+    char pad1[NFS_MaxLineSize-sizeof(atomic<ticket>)];
+    atomic<ticket> tail_counter;
+    char pad2[NFS_MaxLineSize-sizeof(atomic<ticket>)];
+
+    //! Always a power of 2
+    size_t items_per_page;
+
+    //! Size of an item
+    size_t item_size;
+
+    //! number of invalid entries in the queue
+    atomic<size_t> n_invalid_entries;
+
+    char pad3[NFS_MaxLineSize-sizeof(size_t)-sizeof(size_t)-sizeof(atomic<size_t>)];
+} ;
+
+inline bool is_valid_page(const concurrent_queue_rep_base::page* p) {
+    return uintptr_t(p)>1;
+}
+
+//! Abstract class to define interface for page allocation/deallocation
+/**
+ * For internal use only.
+ */
+class concurrent_queue_page_allocator
+{
+    template<typename T> friend class micro_queue ;
+    template<typename T> friend class micro_queue_pop_finalizer ;
+protected:
+    virtual ~concurrent_queue_page_allocator() {}
+private:
+    virtual concurrent_queue_rep_base::page* allocate_page() = 0;
+    virtual void deallocate_page( concurrent_queue_rep_base::page* p ) = 0;
+} ;
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+// unary minus operator applied to unsigned type, result still unsigned
+#pragma warning( push )
+#pragma warning( disable: 4146 )
+#endif
+
+//! A queue using simple locking.
+/** For efficiency, this class has no constructor.
+    The caller is expected to zero-initialize it. */
+template<typename T>
+class micro_queue : no_copy {
+    typedef concurrent_queue_rep_base::page page;
+
+    //! Class used to ensure exception-safety of method "pop" 
+    class destroyer: no_copy {
+        T& my_value;
+    public:
+        destroyer( T& value ) : my_value(value) {}
+        ~destroyer() {my_value.~T();}          
+    };
+
+    void copy_item( page& dst, size_t index, const void* src ) {
+        new( &get_ref(dst,index) ) T(*static_cast<const T*>(src)); 
+    }
+
+    void copy_item( page& dst, size_t dindex, const page& src, size_t sindex ) {
+        new( &get_ref(dst,dindex) ) T( get_ref(const_cast<page&>(src),sindex) );
+    }
+
+    void assign_and_destroy_item( void* dst, page& src, size_t index ) {
+        T& from = get_ref(src,index);
+        destroyer d(from);
+        *static_cast<T*>(dst) = from;
+    }
+
+    void spin_wait_until_my_turn( atomic<ticket>& counter, ticket k, concurrent_queue_rep_base& rb ) const ;
+
+public:
+    friend class micro_queue_pop_finalizer<T>;
+
+    struct padded_page: page {
+        //! Not defined anywhere - exists to quiet warnings.
+        padded_page(); 
+        //! Not defined anywhere - exists to quiet warnings.
+        void operator=( const padded_page& );
+        //! Must be last field.
+        T last;
+    };
+
+    static T& get_ref( page& p, size_t index ) {
+        return (&static_cast<padded_page*>(static_cast<void*>(&p))->last)[index];
+    }
+
+    atomic<page*> head_page;
+    atomic<ticket> head_counter;
+
+    atomic<page*> tail_page;
+    atomic<ticket> tail_counter;
+
+    spin_mutex page_mutex;
+    
+    void push( const void* item, ticket k, concurrent_queue_base_v3<T>& base ) ;
+
+    bool pop( void* dst, ticket k, concurrent_queue_base_v3<T>& base ) ;
+
+    micro_queue& assign( const micro_queue& src, concurrent_queue_base_v3<T>& base ) ;
+
+    page* make_copy( concurrent_queue_base_v3<T>& base, const page* src_page, size_t begin_in_page, size_t end_in_page, ticket& g_index ) ;
+
+    void invalidate_page_and_rethrow( ticket k ) ;
+};
+
+template<typename T>
+void micro_queue<T>::spin_wait_until_my_turn( atomic<ticket>& counter, ticket k, concurrent_queue_rep_base& rb ) const {
+    atomic_backoff backoff;
+    do {
+        backoff.pause();
+        if( counter&1 ) {
+            ++rb.n_invalid_entries;
+            throw_exception( eid_bad_last_alloc );
+        }
+    } while( counter!=k ) ;
+}
+
+template<typename T>
+void micro_queue<T>::push( const void* item, ticket k, concurrent_queue_base_v3<T>& base ) {
+    k &= -concurrent_queue_rep_base::n_queue;
+    page* p = NULL;
+    size_t index = k/concurrent_queue_rep_base::n_queue & (base.my_rep->items_per_page-1);
+    if( !index ) {
+        __TBB_TRY {
+            concurrent_queue_page_allocator& pa = base;
+            p = pa.allocate_page();
+        } __TBB_CATCH (...) {
+            ++base.my_rep->n_invalid_entries;
+            invalidate_page_and_rethrow( k );
+        }
+        p->mask = 0;
+        p->next = NULL;
+    }
+
+    if( tail_counter!=k ) spin_wait_until_my_turn( tail_counter, k, *base.my_rep );
+    call_itt_notify(acquired, &tail_counter);
+        
+    if( p ) {
+        spin_mutex::scoped_lock lock( page_mutex );
+        page* q = tail_page;
+        if( is_valid_page(q) )
+            q->next = p;
+        else
+            head_page = p; 
+        tail_page = p;
+    } else {
+        p = tail_page;
+    }
+    __TBB_TRY {
+        copy_item( *p, index, item );
+        // If no exception was thrown, mark item as present.
+        itt_hide_store_word(p->mask,  p->mask | uintptr_t(1)<<index);
+        call_itt_notify(releasing, &tail_counter);
+        tail_counter += concurrent_queue_rep_base::n_queue; 
+    } __TBB_CATCH (...) {
+        ++base.my_rep->n_invalid_entries;
+        call_itt_notify(releasing, &tail_counter);
+        tail_counter += concurrent_queue_rep_base::n_queue; 
+        __TBB_RETHROW();
+    }
+}
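
A minimal standalone sketch (illustrative only, not part of this commit; the items_per_page value is just an example): each micro_queue owns every n_queue-th ticket, so k/n_queue counts the pushes it has received, the low bits of that count select the slot inside the current page, and a fresh page is allocated whenever the slot index wraps to 0 (the per-page mask bit then records that the slot was successfully constructed).

#include <cstdio>

int main() {
    const unsigned long n_queue = 8, items_per_page = 4;
    // Tickets 0, 8, 16, ... all map to the same micro_queue (0*phi % 8 == 0).
    for ( unsigned long k = 0; k < 8*n_queue; k += n_queue ) {
        unsigned long slot = k/n_queue & (items_per_page-1);
        std::printf( "ticket %2lu -> slot %lu%s\n", k, slot, slot==0 ? " (allocates a new page)" : "" );
    }
    return 0;
}
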
+
+template<typename T>
+bool micro_queue<T>::pop( void* dst, ticket k, concurrent_queue_base_v3<T>& base ) {
+    k &= -concurrent_queue_rep_base::n_queue;
+    if( head_counter!=k ) spin_wait_until_eq( head_counter, k );
+    call_itt_notify(acquired, &head_counter);
+    if( tail_counter==k ) spin_wait_while_eq( tail_counter, k );
+    call_itt_notify(acquired, &tail_counter);
+    page& p = *head_page;
+    __TBB_ASSERT( &p, NULL );
+    size_t index = k/concurrent_queue_rep_base::n_queue & (base.my_rep->items_per_page-1);
+    bool success = false; 
+    {
+        micro_queue_pop_finalizer<T> finalizer( *this, base, k+concurrent_queue_rep_base::n_queue, index==base.my_rep->items_per_page-1 ? &p : NULL ); 
+        if( p.mask & uintptr_t(1)<<index ) {
+            success = true;
+            assign_and_destroy_item( dst, p, index );
+        } else {
+            --base.my_rep->n_invalid_entries;
+        }
+    }
+    return success;
+}
+
+template<typename T>
+micro_queue<T>& micro_queue<T>::assign( const micro_queue<T>& src, concurrent_queue_base_v3<T>& base ) {
+    head_counter = src.head_counter;
+    tail_counter = src.tail_counter;
+    page_mutex   = src.page_mutex;
+
+    const page* srcp = src.head_page;
+    if( is_valid_page(srcp) ) {
+        ticket g_index = head_counter;
+        __TBB_TRY {
+            size_t n_items  = (tail_counter-head_counter)/concurrent_queue_rep_base::n_queue;
+            size_t index = head_counter/concurrent_queue_rep_base::n_queue & (base.my_rep->items_per_page-1);
+            size_t end_in_first_page = (index+n_items<base.my_rep->items_per_page)?(index+n_items):base.my_rep->items_per_page;
+
+            head_page = make_copy( base, srcp, index, end_in_first_page, g_index );
+            page* cur_page = head_page;
+
+            if( srcp != src.tail_page ) {
+                for( srcp = srcp->next; srcp!=src.tail_page; srcp=srcp->next ) {
+                    cur_page->next = make_copy( base, srcp, 0, base.my_rep->items_per_page, g_index );
+                    cur_page = cur_page->next;
+                }
+
+                __TBB_ASSERT( srcp==src.tail_page, NULL );
+                size_t last_index = tail_counter/concurrent_queue_rep_base::n_queue & (base.my_rep->items_per_page-1);
+                if( last_index==0 ) last_index = base.my_rep->items_per_page;
+
+                cur_page->next = make_copy( base, srcp, 0, last_index, g_index );
+                cur_page = cur_page->next;
+            }
+            tail_page = cur_page;
+        } __TBB_CATCH (...) {
+            invalidate_page_and_rethrow( g_index );
+        }
+    } else {
+        head_page = tail_page = NULL;
+    }
+    return *this;
+}
+
+template<typename T>
+void micro_queue<T>::invalidate_page_and_rethrow( ticket k ) {
+    // Append an invalid page at address 1 so that no more pushes are allowed.
+    page* invalid_page = (page*)uintptr_t(1);
+    {
+        spin_mutex::scoped_lock lock( page_mutex );
+        itt_store_word_with_release(tail_counter, k+concurrent_queue_rep_base::n_queue+1);
+        page* q = tail_page;
+        if( is_valid_page(q) )
+            q->next = invalid_page;
+        else
+            head_page = invalid_page;
+        tail_page = invalid_page;
+    }
+    __TBB_RETHROW();
+}
+
+template<typename T>
+concurrent_queue_rep_base::page* micro_queue<T>::make_copy( concurrent_queue_base_v3<T>& base, const concurrent_queue_rep_base::page* src_page, size_t begin_in_page, size_t end_in_page, ticket& g_index ) {
+    concurrent_queue_page_allocator& pa = base;
+    page* new_page = pa.allocate_page();
+    new_page->next = NULL;
+    new_page->mask = src_page->mask;
+    for( ; begin_in_page!=end_in_page; ++begin_in_page, ++g_index )
+        if( new_page->mask & uintptr_t(1)<<begin_in_page )
+            copy_item( *new_page, begin_in_page, *src_page, begin_in_page );
+    return new_page;
+}
+
+template<typename T>
+class micro_queue_pop_finalizer: no_copy {
+    typedef concurrent_queue_rep_base::page page;
+    ticket my_ticket;
+    micro_queue<T>& my_queue;
+    page* my_page; 
+    concurrent_queue_page_allocator& allocator;
+public:
+    micro_queue_pop_finalizer( micro_queue<T>& queue, concurrent_queue_base_v3<T>& b, ticket k, page* p ) :
+        my_ticket(k), my_queue(queue), my_page(p), allocator(b)
+    {}
+    ~micro_queue_pop_finalizer() ;
+};
+
+template<typename T>
+micro_queue_pop_finalizer<T>::~micro_queue_pop_finalizer() {
+    page* p = my_page;
+    if( is_valid_page(p) ) {
+        spin_mutex::scoped_lock lock( my_queue.page_mutex );
+        page* q = p->next;
+        my_queue.head_page = q;
+        if( !is_valid_page(q) ) {
+            my_queue.tail_page = NULL;
+        }
+    }
+    itt_store_word_with_release(my_queue.head_counter, my_ticket);
+    if( is_valid_page(p) ) {
+        allocator.deallocate_page( p );
+    }
+}
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+#pragma warning( pop )
+#endif // warning 4146 is back
+
+template<typename T> class concurrent_queue_iterator_rep ;
+template<typename T> class concurrent_queue_iterator_base_v3;
+
+//! representation of concurrent_queue_base
+/**
+ * the class inherits from concurrent_queue_rep_base and defines an array of micro_queue<T>'s
+ */
+template<typename T>
+struct concurrent_queue_rep : public concurrent_queue_rep_base {
+    micro_queue<T> array[n_queue];
+
+    //! Map ticket to an array index
+    static size_t index( ticket k ) {
+        return k*phi%n_queue;
+    }
+
+    micro_queue<T>& choose( ticket k ) {
+        // The formula here approximates LRU in a cache-oblivious way.
+        return array[index(k)];
+    }
+};
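
A minimal standalone sketch (illustrative only, not part of this commit) of the mapping above: with n_queue == 8 and phi == 3, consecutive tickets fan out as 0, 3, 6, 1, 4, 7, 2, 5, so pushes issued back to back land on different micro_queues and rarely contend on the same one.

#include <cstdio>

int main() {
    const unsigned long n_queue = 8, phi = 3;
    for ( unsigned long k = 0; k < n_queue; ++k )
        std::printf( "ticket %lu -> micro_queue %lu\n", k, k*phi % n_queue );
    return 0;
}
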
+
+//! base class of concurrent_queue
+/**
+ * The class implements the interface defined by concurrent_queue_page_allocator
+ * and has a pointer to an instance of concurrent_queue_rep.
+ */
+template<typename T>
+class concurrent_queue_base_v3: public concurrent_queue_page_allocator {
+    //! Internal representation
+    concurrent_queue_rep<T>* my_rep;
+
+    friend struct concurrent_queue_rep<T>;
+    friend class micro_queue<T>;
+    friend class concurrent_queue_iterator_rep<T>;
+    friend class concurrent_queue_iterator_base_v3<T>;
+
+protected:
+    typedef typename concurrent_queue_rep<T>::page page;
+
+private:
+    typedef typename micro_queue<T>::padded_page padded_page;
+
+    /* override */ virtual page *allocate_page() {
+        concurrent_queue_rep<T>& r = *my_rep;
+        size_t n = sizeof(padded_page) + (r.items_per_page-1)*sizeof(T);
+        return reinterpret_cast<page*>(allocate_block ( n ));
+    }
+
+    /* override */ virtual void deallocate_page( concurrent_queue_rep_base::page *p ) {
+        concurrent_queue_rep<T>& r = *my_rep;
+        size_t n = sizeof(padded_page) + (r.items_per_page-1)*sizeof(T);
+        deallocate_block( reinterpret_cast<void*>(p), n );
+    }
+
+    //! custom allocator
+    virtual void *allocate_block( size_t n ) = 0;
+
+    //! custom de-allocator
+    virtual void deallocate_block( void *p, size_t n ) = 0;
+
+protected:
+    concurrent_queue_base_v3();
+
+    /* override */ virtual ~concurrent_queue_base_v3() {
+#if TBB_USE_ASSERT
+        size_t nq = my_rep->n_queue;
+        for( size_t i=0; i<nq; i++ )
+            __TBB_ASSERT( my_rep->array[i].tail_page==NULL, "pages were not freed properly" );
+#endif /* TBB_USE_ASSERT */
+        cache_aligned_allocator<concurrent_queue_rep<T> >().deallocate(my_rep,1);
+    }
+
+    //! Enqueue item at tail of queue
+    void internal_push( const void* src ) {
+        concurrent_queue_rep<T>& r = *my_rep;
+        ticket k = r.tail_counter++;
+        r.choose(k).push( src, k, *this );
+    }
+
+    //! Attempt to dequeue item from queue.
+    /** NULL if there was no item to dequeue. */
+    bool internal_try_pop( void* dst ) ;
+
+    //! Get size of queue; result may be invalid if queue is modified concurrently
+    size_t internal_size() const ;
+
+    //! check if the queue is empty; thread safe
+    bool internal_empty() const ;
+
+    //! free any remaining pages
+    /* note that the name may be misleading, but it remains so due to a historical accident. */
+    void internal_finish_clear() ;
+
+    //! Obsolete
+    void internal_throw_exception() const {
+        throw_exception( eid_bad_alloc );
+    }
+
+    //! copy internal representation
+    void assign( const concurrent_queue_base_v3& src ) ;
+};
+
+template<typename T>
+concurrent_queue_base_v3<T>::concurrent_queue_base_v3() {
+    const size_t item_size = sizeof(T);
+    my_rep = cache_aligned_allocator<concurrent_queue_rep<T> >().allocate(1);
+    __TBB_ASSERT( (size_t)my_rep % NFS_GetLineSize()==0, "alignment error" );
+    __TBB_ASSERT( (size_t)&my_rep->head_counter % NFS_GetLineSize()==0, "alignment error" );
+    __TBB_ASSERT( (size_t)&my_rep->tail_counter % NFS_GetLineSize()==0, "alignment error" );
+    __TBB_ASSERT( (size_t)&my_rep->array % NFS_GetLineSize()==0, "alignment error" );
+    memset(my_rep,0,sizeof(concurrent_queue_rep<T>));
+    my_rep->item_size = item_size;
+    my_rep->items_per_page = item_size<=8 ? 32 :
+                             item_size<=16 ? 16 : 
+                             item_size<=32 ? 8 :
+                             item_size<=64 ? 4 :
+                             item_size<=128 ? 2 :
+                             1;
+}
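
A minimal standalone sketch (illustrative only, not part of this commit) of the sizing table above: small items get many slots per page so that a page carries roughly 256 bytes of payload, while types larger than 128 bytes fall back to one item per page.

#include <cstdio>

int main() {
    const unsigned long sizes[] = { 4, 8, 16, 32, 64, 128, 256 };
    for ( unsigned long i = 0; i < sizeof(sizes)/sizeof(sizes[0]); ++i ) {
        unsigned long s = sizes[i];
        unsigned long ipp = s<=8 ? 32 : s<=16 ? 16 : s<=32 ? 8 : s<=64 ? 4 : s<=128 ? 2 : 1;
        std::printf( "item_size %3lu -> items_per_page %2lu (payload %lu bytes)\n", s, ipp, s*ipp );
    }
    return 0;
}
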
+
+template<typename T>
+bool concurrent_queue_base_v3<T>::internal_try_pop( void* dst ) {
+    concurrent_queue_rep<T>& r = *my_rep;
+    ticket k;
+    do {
+        k = r.head_counter;
+        for(;;) {
+            if( r.tail_counter<=k ) {
+                // Queue is empty 
+                return false;
+            }
+            // Queue had item with ticket k when we looked.  Attempt to get that item.
+            ticket tk=k;
+#if defined(_MSC_VER) && defined(_Wp64)
+    #pragma warning (push)
+    #pragma warning (disable: 4267)
+#endif
+            k = r.head_counter.compare_and_swap( tk+1, tk );
+#if defined(_MSC_VER) && defined(_Wp64)
+    #pragma warning (pop)
+#endif
+            if( k==tk )
+                break;
+            // Another thread snatched the item, retry.
+        }
+    } while( !r.choose( k ).pop( dst, k, *this ) );
+    return true;
+}
+
+template<typename T>
+size_t concurrent_queue_base_v3<T>::internal_size() const {
+    concurrent_queue_rep<T>& r = *my_rep;
+    __TBB_ASSERT( sizeof(ptrdiff_t)<=sizeof(size_t), NULL );
+    ticket hc = r.head_counter;
+    size_t nie = r.n_invalid_entries;
+    ticket tc = r.tail_counter;
+    __TBB_ASSERT( hc!=tc || !nie, NULL );
+    ptrdiff_t sz = tc-hc-nie;
+    return sz<0 ? 0 :  size_t(sz);
+}
+
+template<typename T>
+bool concurrent_queue_base_v3<T>::internal_empty() const {
+    concurrent_queue_rep<T>& r = *my_rep;
+    ticket tc = r.tail_counter;
+    ticket hc = r.head_counter;
+    // if tc!=r.tail_counter, the queue was not empty at some point between the two reads.
+    return tc==r.tail_counter && tc==hc+r.n_invalid_entries ;
+}
+
+template<typename T>
+void concurrent_queue_base_v3<T>::internal_finish_clear() {
+    concurrent_queue_rep<T>& r = *my_rep;
+    size_t nq = r.n_queue;
+    for( size_t i=0; i<nq; ++i ) {
+        page* tp = r.array[i].tail_page;
+        if( is_valid_page(tp) ) {
+            __TBB_ASSERT( r.array[i].head_page==tp, "at most one page should remain" );
+            deallocate_page( tp );
+            r.array[i].tail_page = NULL;
+        } else 
+            __TBB_ASSERT( !is_valid_page(r.array[i].head_page), "head page pointer corrupt?" );
+    }
+}
+
+template<typename T>
+void concurrent_queue_base_v3<T>::assign( const concurrent_queue_base_v3& src ) {
+    concurrent_queue_rep<T>& r = *my_rep;
+    r.items_per_page = src.my_rep->items_per_page;
+
+    // copy concurrent_queue_rep.
+    r.head_counter = src.my_rep->head_counter;
+    r.tail_counter = src.my_rep->tail_counter;
+    r.n_invalid_entries = src.my_rep->n_invalid_entries;
+
+    // copy micro_queues
+    for( size_t i = 0; i<r.n_queue; ++i )
+        r.array[i].assign( src.my_rep->array[i], *this);
+
+    __TBB_ASSERT( r.head_counter==src.my_rep->head_counter && r.tail_counter==src.my_rep->tail_counter, 
+            "the source concurrent queue should not be concurrently modified." );
+}
+
+template<typename Container, typename Value> class concurrent_queue_iterator;
+
+template<typename T>
+class concurrent_queue_iterator_rep: no_assign {
+    typedef typename micro_queue<T>::padded_page padded_page;
+public:
+    ticket head_counter;
+    const concurrent_queue_base_v3<T>& my_queue;
+    typename concurrent_queue_base_v3<T>::page* array[concurrent_queue_rep<T>::n_queue];
+    concurrent_queue_iterator_rep( const concurrent_queue_base_v3<T>& queue ) :
+        head_counter(queue.my_rep->head_counter),
+        my_queue(queue)
+    {
+        for( size_t k=0; k<concurrent_queue_rep<T>::n_queue; ++k )
+            array[k] = queue.my_rep->array[k].head_page;
+    }
+
+    //! Set item to point to kth element.  Return true if at end of queue or item is marked valid; false otherwise.
+    bool get_item( T*& item, size_t k ) ;
+};
+
+template<typename T>
+bool concurrent_queue_iterator_rep<T>::get_item( T*& item, size_t k ) {
+    if( k==my_queue.my_rep->tail_counter ) {
+        item = NULL;
+        return true;
+    } else {
+        typename concurrent_queue_base_v3<T>::page* p = array[concurrent_queue_rep<T>::index(k)];
+        __TBB_ASSERT(p,NULL);
+        size_t i = k/concurrent_queue_rep<T>::n_queue & (my_queue.my_rep->items_per_page-1);
+        item = &micro_queue<T>::get_ref(*p,i);
+        return (p->mask & uintptr_t(1)<<i)!=0;
+    }
+}
+
+//! Constness-independent portion of concurrent_queue_iterator.
+/** @ingroup containers */
+template<typename Value>
+class concurrent_queue_iterator_base_v3 : no_assign {
+    //! Represents concurrent_queue over which we are iterating.
+    /** NULL if one past last element in queue. */
+    concurrent_queue_iterator_rep<Value>* my_rep;
+
+    template<typename C, typename T, typename U>
+    friend bool operator==( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j );
+
+    template<typename C, typename T, typename U>
+    friend bool operator!=( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j );
+protected:
+    //! Pointer to current item
+    Value* my_item;
+
+    //! Default constructor
+    concurrent_queue_iterator_base_v3() : my_rep(NULL), my_item(NULL) {
+#if __TBB_GCC_OPTIMIZER_ORDERING_BROKEN
+        __TBB_compiler_fence();
+#endif
+    }
+
+    //! Copy constructor
+    concurrent_queue_iterator_base_v3( const concurrent_queue_iterator_base_v3& i )
+    : no_assign(), my_rep(NULL), my_item(NULL) {
+        assign(i);
+    }
+
+    //! Construct iterator pointing to head of queue.
+    concurrent_queue_iterator_base_v3( const concurrent_queue_base_v3<Value>& queue ) ;
+
+    //! Assignment
+    void assign( const concurrent_queue_iterator_base_v3<Value>& other ) ;
+
+    //! Advance iterator one step towards tail of queue.
+    void advance() ;
+
+    //! Destructor
+    ~concurrent_queue_iterator_base_v3() {
+        cache_aligned_allocator<concurrent_queue_iterator_rep<Value> >().deallocate(my_rep, 1);
+        my_rep = NULL;
+    }
+};
+
+template<typename Value>
+concurrent_queue_iterator_base_v3<Value>::concurrent_queue_iterator_base_v3( const concurrent_queue_base_v3<Value>& queue ) {
+    my_rep = cache_aligned_allocator<concurrent_queue_iterator_rep<Value> >().allocate(1);
+    new( my_rep ) concurrent_queue_iterator_rep<Value>(queue);
+    size_t k = my_rep->head_counter;
+    if( !my_rep->get_item(my_item, k) ) advance();
+}
+
+template<typename Value>
+void concurrent_queue_iterator_base_v3<Value>::assign( const concurrent_queue_iterator_base_v3<Value>& other ) {
+    if( my_rep!=other.my_rep ) {
+        if( my_rep ) {
+            cache_aligned_allocator<concurrent_queue_iterator_rep<Value> >().deallocate(my_rep, 1);
+            my_rep = NULL;
+        }
+        if( other.my_rep ) {
+            my_rep = cache_aligned_allocator<concurrent_queue_iterator_rep<Value> >().allocate(1);
+            new( my_rep ) concurrent_queue_iterator_rep<Value>( *other.my_rep );
+        }
+    }
+    my_item = other.my_item;
+}
+
+template<typename Value>
+void concurrent_queue_iterator_base_v3<Value>::advance() {
+    __TBB_ASSERT( my_item, "attempt to increment iterator past end of queue" );  
+    size_t k = my_rep->head_counter;
+    const concurrent_queue_base_v3<Value>& queue = my_rep->my_queue;
+#if TBB_USE_ASSERT
+    Value* tmp;
+    my_rep->get_item(tmp,k);
+    __TBB_ASSERT( my_item==tmp, NULL );
+#endif /* TBB_USE_ASSERT */
+    size_t i = k/concurrent_queue_rep<Value>::n_queue & (queue.my_rep->items_per_page-1);
+    if( i==queue.my_rep->items_per_page-1 ) {
+        typename concurrent_queue_base_v3<Value>::page*& root = my_rep->array[concurrent_queue_rep<Value>::index(k)];
+        root = root->next;
+    }
+    // advance k
+    my_rep->head_counter = ++k;
+    if( !my_rep->get_item(my_item, k) ) advance();
+}
+
+//! Similar to C++0x std::remove_cv
+/** "tbb_" prefix added to avoid overload confusion with C++0x implementations. */
+template<typename T> struct tbb_remove_cv {typedef T type;};
+template<typename T> struct tbb_remove_cv<const T> {typedef T type;};
+template<typename T> struct tbb_remove_cv<volatile T> {typedef T type;};
+template<typename T> struct tbb_remove_cv<const volatile T> {typedef T type;};
+
+//! Meets requirements of a forward iterator for STL.
+/** Value is either the T or const T type of the container.
+    @ingroup containers */
+template<typename Container, typename Value>
+class concurrent_queue_iterator: public concurrent_queue_iterator_base_v3<typename tbb_remove_cv<Value>::type>,
+        public std::iterator<std::forward_iterator_tag,Value> {
+#if !__TBB_TEMPLATE_FRIENDS_BROKEN
+    template<typename T, class A>
+    friend class ::tbb::strict_ppl::concurrent_queue;
+#else
+public: // workaround for MSVC
+#endif 
+    //! Construct iterator pointing to head of queue.
+    concurrent_queue_iterator( const concurrent_queue_base_v3<Value>& queue ) :
+        concurrent_queue_iterator_base_v3<typename tbb_remove_cv<Value>::type>(queue)
+    {
+    }
+
+public:
+    concurrent_queue_iterator() {}
+
+    concurrent_queue_iterator( const concurrent_queue_iterator<Container,typename Container::value_type>& other ) :
+        concurrent_queue_iterator_base_v3<typename tbb_remove_cv<Value>::type>(other)
+    {}
+
+    //! Iterator assignment
+    concurrent_queue_iterator& operator=( const concurrent_queue_iterator& other ) {
+        this->assign(other);
+        return *this;
+    }
+
+    //! Reference to current item 
+    Value& operator*() const {
+        return *static_cast<Value*>(this->my_item);
+    }
+
+    Value* operator->() const {return &operator*();}
+
+    //! Advance to next item in queue
+    concurrent_queue_iterator& operator++() {
+        this->advance();
+        return *this;
+    }
+
+    //! Post increment
+    Value* operator++(int) {
+        Value* result = &operator*();
+        operator++();
+        return result;
+    }
+}; // concurrent_queue_iterator
+
+
+template<typename C, typename T, typename U>
+bool operator==( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j ) {
+    return i.my_item==j.my_item;
+}
+
+template<typename C, typename T, typename U>
+bool operator!=( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j ) {
+    return i.my_item!=j.my_item;
+}
+
+} // namespace internal
+
+//! @endcond
+
+} // namespace strict_ppl
+
+//! @cond INTERNAL
+namespace internal {
+
+class concurrent_queue_rep;
+class concurrent_queue_iterator_rep;
+class concurrent_queue_iterator_base_v3;
+template<typename Container, typename Value> class concurrent_queue_iterator;
+
+//! For internal use only.
+/** Type-independent portion of concurrent_queue.
+    @ingroup containers */
+class concurrent_queue_base_v3: no_copy {
+    //! Internal representation
+    concurrent_queue_rep* my_rep;
+
+    friend class concurrent_queue_rep;
+    friend struct micro_queue;
+    friend class micro_queue_pop_finalizer;
+    friend class concurrent_queue_iterator_rep;
+    friend class concurrent_queue_iterator_base_v3;
+protected:
+    //! Prefix on a page
+    struct page {
+        page* next;
+        uintptr_t mask; 
+    };
+
+    //! Capacity of the queue
+    ptrdiff_t my_capacity;
+   
+    //! Always a power of 2
+    size_t items_per_page;
+
+    //! Size of an item
+    size_t item_size;
+
+#if __TBB_GCC_3_3_PROTECTED_BROKEN
+public:
+#endif
+    template<typename T>
+    struct padded_page: page {
+        //! Not defined anywhere - exists to quiet warnings.
+        padded_page(); 
+        //! Not defined anywhere - exists to quiet warnings.
+        void operator=( const padded_page& );
+        //! Must be last field.
+        T last;
+    };
+
+private:
+    virtual void copy_item( page& dst, size_t index, const void* src ) = 0;
+    virtual void assign_and_destroy_item( void* dst, page& src, size_t index ) = 0;
+protected:
+    __TBB_EXPORTED_METHOD concurrent_queue_base_v3( size_t item_size );
+    virtual __TBB_EXPORTED_METHOD ~concurrent_queue_base_v3();
+
+    //! Enqueue item at tail of queue
+    void __TBB_EXPORTED_METHOD internal_push( const void* src );
+
+    //! Dequeue item from head of queue
+    void __TBB_EXPORTED_METHOD internal_pop( void* dst );
+
+    //! Attempt to enqueue item onto queue.
+    bool __TBB_EXPORTED_METHOD internal_push_if_not_full( const void* src );
+
+    //! Attempt to dequeue item from queue.
+    /** NULL if there was no item to dequeue. */
+    bool __TBB_EXPORTED_METHOD internal_pop_if_present( void* dst );
+
+    //! Get size of queue
+    ptrdiff_t __TBB_EXPORTED_METHOD internal_size() const;
+
+    //! Check if the queue is empty
+    bool __TBB_EXPORTED_METHOD internal_empty() const;
+
+    //! Set the queue capacity
+    void __TBB_EXPORTED_METHOD internal_set_capacity( ptrdiff_t capacity, size_t element_size );
+
+    //! custom allocator
+    virtual page *allocate_page() = 0;
+
+    //! custom de-allocator
+    virtual void deallocate_page( page *p ) = 0;
+
+    //! free any remaining pages
+    /* note that the name may be misleading, but it remains so due to a historical accident. */
+    void __TBB_EXPORTED_METHOD internal_finish_clear() ;
+
+    //! throw an exception
+    void __TBB_EXPORTED_METHOD internal_throw_exception() const;
+
+    //! copy internal representation
+    void __TBB_EXPORTED_METHOD assign( const concurrent_queue_base_v3& src ) ;
+
+private:
+    virtual void copy_page_item( page& dst, size_t dindex, const page& src, size_t sindex ) = 0;
+};
+
+//! Type-independent portion of concurrent_queue_iterator.
+/** @ingroup containers */
+class concurrent_queue_iterator_base_v3 {
+    //! concurrent_queue over which we are iterating.
+    /** NULL if one past last element in queue. */
+    concurrent_queue_iterator_rep* my_rep;
+
+    template<typename C, typename T, typename U>
+    friend bool operator==( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j );
+
+    template<typename C, typename T, typename U>
+    friend bool operator!=( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j );
+
+    void initialize( const concurrent_queue_base_v3& queue, size_t offset_of_data );
+protected:
+    //! Pointer to current item
+    void* my_item;
+
+    //! Default constructor
+    concurrent_queue_iterator_base_v3() : my_rep(NULL), my_item(NULL) {}
+
+    //! Copy constructor
+    concurrent_queue_iterator_base_v3( const concurrent_queue_iterator_base_v3& i ) : my_rep(NULL), my_item(NULL) {
+        assign(i);
+    }
+
+    //! Obsolete entry point for constructing iterator pointing to head of queue.
+    /** Does not work correctly for SSE types. */
+    __TBB_EXPORTED_METHOD concurrent_queue_iterator_base_v3( const concurrent_queue_base_v3& queue );
+
+    //! Construct iterator pointing to head of queue.
+    __TBB_EXPORTED_METHOD concurrent_queue_iterator_base_v3( const concurrent_queue_base_v3& queue, size_t offset_of_data );
+
+    //! Assignment
+    void __TBB_EXPORTED_METHOD assign( const concurrent_queue_iterator_base_v3& i );
+
+    //! Advance iterator one step towards tail of queue.
+    void __TBB_EXPORTED_METHOD advance();
+
+    //! Destructor
+    __TBB_EXPORTED_METHOD ~concurrent_queue_iterator_base_v3();
+};
+
+typedef concurrent_queue_iterator_base_v3 concurrent_queue_iterator_base;
+
+//! Meets requirements of a forward iterator for STL.
+/** Value is either the T or const T type of the container.
+    @ingroup containers */
+template<typename Container, typename Value>
+class concurrent_queue_iterator: public concurrent_queue_iterator_base,
+        public std::iterator<std::forward_iterator_tag,Value> {
+
+#if !defined(_MSC_VER) || defined(__INTEL_COMPILER)
+    template<typename T, class A>
+    friend class ::tbb::concurrent_bounded_queue;
+
+    template<typename T, class A>
+    friend class ::tbb::deprecated::concurrent_queue;
+#else
+public: // workaround for MSVC
+#endif 
+    //! Construct iterator pointing to head of queue.
+    concurrent_queue_iterator( const concurrent_queue_base_v3& queue ) :
+        concurrent_queue_iterator_base_v3(queue,__TBB_offsetof(concurrent_queue_base_v3::padded_page<Value>,last))
+    {
+    }
+
+public:
+    concurrent_queue_iterator() {}
+
+    /** If Value==Container::value_type, then this routine is the copy constructor. 
+        If Value==const Container::value_type, then this routine is a conversion constructor. */
+    concurrent_queue_iterator( const concurrent_queue_iterator<Container,typename Container::value_type>& other ) :
+        concurrent_queue_iterator_base_v3(other)
+    {}
+
+    //! Iterator assignment
+    concurrent_queue_iterator& operator=( const concurrent_queue_iterator& other ) {
+        assign(other);
+        return *this;
+    }
+
+    //! Reference to current item 
+    Value& operator*() const {
+        return *static_cast<Value*>(my_item);
+    }
+
+    Value* operator->() const {return &operator*();}
+
+    //! Advance to next item in queue
+    concurrent_queue_iterator& operator++() {
+        advance();
+        return *this;
+    }
+
+    //! Post increment
+    Value* operator++(int) {
+        Value* result = &operator*();
+        operator++();
+        return result;
+    }
+}; // concurrent_queue_iterator
+
+
+template<typename C, typename T, typename U>
+bool operator==( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j ) {
+    return i.my_item==j.my_item;
+}
+
+template<typename C, typename T, typename U>
+bool operator!=( const concurrent_queue_iterator<C,T>& i, const concurrent_queue_iterator<C,U>& j ) {
+    return i.my_item!=j.my_item;
+}
+
+} // namespace internal
+
+//! @endcond
+
+} // namespace tbb
+
+#endif /* __TBB_concurrent_queue_internal_H */
diff --git a/tbb/include/tbb/internal/_concurrent_unordered_impl.h b/tbb/include/tbb/internal/_concurrent_unordered_impl.h
new file mode 100644 (file)
index 0000000..497be3e
--- /dev/null
@@ -0,0 +1,1426 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+/* Container implementations in this header are based on PPL implementations 
+   provided by Microsoft. */
+
+#ifndef __TBB_concurrent_unordered_internal_H
+#define __TBB_concurrent_unordered_internal_H
+
+#include "../tbb_stddef.h"
+
+#if !TBB_USE_EXCEPTIONS && _MSC_VER
+    // Suppress "C++ exception handler used, but unwind semantics are not enabled" warning in STL headers
+    #pragma warning (push)
+    #pragma warning (disable: 4530)
+#endif
+
+#include <iterator>
+#include <utility>      // Need std::pair
+#include <functional>
+#include <string>       // For tbb_hasher
+#include <cstring>      // Need std::memset
+
+#if !TBB_USE_EXCEPTIONS && _MSC_VER
+    #pragma warning (pop)
+#endif
+
+#include "../atomic.h"
+#include "../tbb_exception.h"
+#include "../tbb_allocator.h"
+
+namespace tbb {
+namespace interface5 {
+//! @cond INTERNAL
+namespace internal {
+
+template <typename T, typename Allocator>
+class split_ordered_list;
+template <typename Traits>
+class concurrent_unordered_base;
+
+// Forward list iterators (without skipping dummy elements)
+template<class Solist, typename Value>
+class flist_iterator : public std::iterator<std::forward_iterator_tag, Value>
+{
+    template <typename T, typename Allocator>
+    friend class split_ordered_list;
+    template <typename Traits>
+    friend class concurrent_unordered_base;
+    template<class M, typename V>
+    friend class flist_iterator;
+
+    typedef typename Solist::nodeptr_t nodeptr_t;
+public:
+    typedef typename Solist::value_type value_type;
+    typedef typename Solist::difference_type difference_type;
+    typedef typename Solist::pointer pointer;
+    typedef typename Solist::reference reference;
+
+    flist_iterator() : my_node_ptr(0) {}
+    flist_iterator( const flist_iterator<Solist, typename Solist::value_type> &other )
+        : my_node_ptr(other.my_node_ptr) {}
+
+    reference operator*() const { return my_node_ptr->my_element; }
+    pointer operator->() const { return &**this; }
+
+    flist_iterator& operator++() {
+        my_node_ptr = my_node_ptr->my_next;
+        return *this;
+    }
+
+    flist_iterator operator++(int) {
+        flist_iterator tmp = *this;
+        ++*this;
+        return tmp;
+    }
+
+protected:
+    flist_iterator(nodeptr_t pnode) : my_node_ptr(pnode) {}
+    nodeptr_t get_node_ptr() const { return my_node_ptr; }
+
+    nodeptr_t my_node_ptr;
+
+    template<typename M, typename T, typename U>
+    friend bool operator==( const flist_iterator<M,T> &i, const flist_iterator<M,U> &j );
+    template<typename M, typename T, typename U>
+    friend bool operator!=( const flist_iterator<M,T>& i, const flist_iterator<M,U>& j );
+};
+
+template<typename Solist, typename T, typename U>
+bool operator==( const flist_iterator<Solist,T> &i, const flist_iterator<Solist,U> &j ) {
+    return i.my_node_ptr == j.my_node_ptr;
+}
+template<typename Solist, typename T, typename U>
+bool operator!=( const flist_iterator<Solist,T>& i, const flist_iterator<Solist,U>& j ) {
+    return i.my_node_ptr != j.my_node_ptr;
+}
+
+// Split-order list iterators, needed to skip dummy elements
+template<class Solist, typename Value>
+class solist_iterator : public flist_iterator<Solist, Value>
+{
+    typedef flist_iterator<Solist, Value> base_type;
+    typedef typename Solist::nodeptr_t nodeptr_t;
+    using base_type::get_node_ptr;
+    template <typename T, typename Allocator>
+    friend class split_ordered_list;
+    template<class M, typename V>
+    friend class solist_iterator;
+    template<typename M, typename T, typename U>
+    friend bool operator==( const solist_iterator<M,T> &i, const solist_iterator<M,U> &j );
+    template<typename M, typename T, typename U>
+    friend bool operator!=( const solist_iterator<M,T>& i, const solist_iterator<M,U>& j );
+
+    const Solist *my_list_ptr;
+    solist_iterator(nodeptr_t pnode, const Solist *plist) : base_type(pnode), my_list_ptr(plist) {}
+
+public:
+    typedef typename Solist::value_type value_type;
+    typedef typename Solist::difference_type difference_type;
+    typedef typename Solist::pointer pointer;
+    typedef typename Solist::reference reference;
+
+    solist_iterator() {}
+    solist_iterator(const solist_iterator<Solist, typename Solist::value_type> &other )
+        : base_type(other), my_list_ptr(other.my_list_ptr) {}
+
+    reference operator*() const {
+        return this->base_type::operator*();
+    }
+
+    pointer operator->() const {
+        return (&**this);
+    }
+
+    solist_iterator& operator++() {
+        do ++(*(base_type *)this);
+        while (get_node_ptr() != NULL && get_node_ptr()->is_dummy());
+
+        return (*this);
+    }
+
+    solist_iterator operator++(int) {
+        solist_iterator tmp = *this;
+        do ++*this;
+        while (get_node_ptr() != NULL && get_node_ptr()->is_dummy());
+
+        return (tmp);
+    }
+};
+
+template<typename Solist, typename T, typename U>
+bool operator==( const solist_iterator<Solist,T> &i, const solist_iterator<Solist,U> &j ) {
+    return i.my_node_ptr == j.my_node_ptr && i.my_list_ptr == j.my_list_ptr;
+}
+template<typename Solist, typename T, typename U>
+bool operator!=( const solist_iterator<Solist,T>& i, const solist_iterator<Solist,U>& j ) {
+    return i.my_node_ptr != j.my_node_ptr || i.my_list_ptr != j.my_list_ptr;
+}
+
+// Forward type and class definitions
+typedef size_t sokey_t;
+
+// Forward list in which elements are sorted in a split-order
+template <typename T, typename Allocator>
+class split_ordered_list
+{
+public:
+    typedef split_ordered_list<T, Allocator> self_type;
+    typedef typename Allocator::template rebind<T>::other allocator_type;
+    struct node;
+    typedef node *nodeptr_t;
+
+    typedef typename allocator_type::size_type size_type;
+    typedef typename allocator_type::difference_type difference_type;
+    typedef typename allocator_type::pointer pointer;
+    typedef typename allocator_type::const_pointer const_pointer;
+    typedef typename allocator_type::reference reference;
+    typedef typename allocator_type::const_reference const_reference;
+    typedef typename allocator_type::value_type value_type;
+
+    typedef solist_iterator<self_type, const value_type> const_iterator;
+    typedef solist_iterator<self_type, value_type> iterator;
+    typedef flist_iterator<self_type, const value_type> raw_const_iterator;
+    typedef flist_iterator<self_type, value_type> raw_iterator;
+
+    // Node that holds the element in a split-ordered list
+    struct node : tbb::internal::no_assign
+    {
+        // Initialize the node with the given order key
+        void init(sokey_t order_key) {
+            my_order_key = order_key;
+            my_next = NULL;
+        }
+
+        // Return the order key (needed for hashing)
+        sokey_t get_order_key() const { // TODO: remove
+            return my_order_key;
+        }
+
+        // Inserts the new element in the list in an atomic fashion
+        nodeptr_t atomic_set_next(nodeptr_t new_node, nodeptr_t current_node)
+        {
+            // Try to change the next pointer on the current element to a new element, only if it still points to the cached next
+            nodeptr_t exchange_node = (nodeptr_t) __TBB_CompareAndSwapW((void *) &my_next, (uintptr_t)new_node, (uintptr_t)current_node);
+
+            if (exchange_node == current_node) // TODO: why this branch?
+            {
+                // Operation succeeded, return the new node
+                return new_node;
+            }
+            else
+            {
+                // Operation failed, return the "interfering" node
+                return exchange_node;
+            }
+        }
+
+        // Checks if this element in the list is a dummy, order enforcing node. Dummy nodes are used by buckets
+        // in the hash table to quickly index into the right subsection of the split-ordered list.
+        bool is_dummy() const {
+            return (my_order_key & 0x1) == 0;
+        }
+
+
+        nodeptr_t  my_next;      // Next element in the list
+        value_type my_element;   // Element storage
+        sokey_t    my_order_key; // Order key for this element
+    };
+
+    // Allocate a new node with the given order key and value
+    nodeptr_t create_node(sokey_t order_key, const T &value) {
+        nodeptr_t pnode = my_node_allocator.allocate(1);
+
+        __TBB_TRY {
+            new(static_cast<void*>(&pnode->my_element)) T(value);
+            pnode->init(order_key);
+        } __TBB_CATCH(...) {
+            my_node_allocator.deallocate(pnode, 1);
+            __TBB_RETHROW();
+        }
+
+        return (pnode);
+    }
+
+    // Allocate a new node with the given order key; used to allocate dummy nodes
+    nodeptr_t create_node(sokey_t order_key) {
+        nodeptr_t pnode = my_node_allocator.allocate(1);
+
+        __TBB_TRY {
+            new(static_cast<void*>(&pnode->my_element)) T();
+            pnode->init(order_key);
+        } __TBB_CATCH(...) {
+            my_node_allocator.deallocate(pnode, 1);
+            __TBB_RETHROW();
+        }
+
+        return (pnode);
+    }
+
+    split_ordered_list(allocator_type a = allocator_type())
+        : my_node_allocator(a), my_element_count(0)
+    {
+        // Immediately allocate a dummy node with order key of 0. This node
+        // will always be the head of the list.
+        my_head = create_node(0);
+    }
+
+    ~split_ordered_list()
+    {
+        // Clear the list
+        clear();
+
+        // Remove the head element which is not cleared by clear()
+        nodeptr_t pnode = my_head;
+        my_head = NULL;
+
+        __TBB_ASSERT(pnode != NULL && pnode->my_next == NULL, "Invalid head list node");
+
+        destroy_node(pnode);
+    }
+
+    // Common forward list functions
+
+    allocator_type get_allocator() const {
+        return (my_node_allocator);
+    }
+
+    void clear() {
+        nodeptr_t pnext;
+        nodeptr_t pnode = my_head;
+
+        __TBB_ASSERT(my_head != NULL, "Invalid head list node");
+        pnext = pnode->my_next;
+        pnode->my_next = NULL;
+        pnode = pnext;
+
+        while (pnode != NULL)
+        {
+            pnext = pnode->my_next;
+            destroy_node(pnode);
+            pnode = pnext;
+        }
+
+        my_element_count = 0;
+    }
+
+    // Returns a first non-dummy element in the SOL
+    iterator begin() {
+        return first_real_iterator(raw_begin());
+    }
+
+    // Returns a first non-dummy element in the SOL
+    const_iterator begin() const {
+        return first_real_iterator(raw_begin());
+    }
+
+    iterator end() {
+        return (iterator(0, this));
+    }
+
+    const_iterator end() const {
+        return (const_iterator(0, this));
+    }
+
+    const_iterator cbegin() const {
+        return (((const self_type *)this)->begin());
+    }
+
+    const_iterator cend() const {
+        return (((const self_type *)this)->end());
+    }
+
+    // Checks if the number of elements (non-dummy) is 0
+    bool empty() const {
+        return (my_element_count == 0);
+    }
+
+    // Returns the number of non-dummy elements in the list
+    size_type size() const {
+        return my_element_count;
+    }
+
+    // Returns the maximum size of the list, determined by the allocator
+    size_type max_size() const {
+        return my_node_allocator.max_size();
+    }
+
+    // Swaps 'this' list with the passed in one
+    void swap(self_type& other)
+    {
+        if (this == &other)
+        {
+            // Nothing to do
+            return;
+        }
+
+        std::swap(my_element_count, other.my_element_count);
+        std::swap(my_head, other.my_head);
+    }
+
+    // Split-order list functions
+
+    // Returns a first element in the SOL, which is always a dummy
+    raw_iterator raw_begin() {
+        return raw_iterator(my_head);
+    }
+
+    // Returns a first element in the SOL, which is always a dummy
+    raw_const_iterator raw_begin() const {
+        return raw_const_iterator(my_head);
+    }
+
+    raw_iterator raw_end() {
+        return raw_iterator(0);
+    }
+
+    raw_const_iterator raw_end() const {
+        return raw_const_iterator(0);
+    }
+
+    static sokey_t get_order_key(const raw_const_iterator& it) {
+        return it.get_node_ptr()->get_order_key();
+    }
+
+    static sokey_t get_safe_order_key(const raw_const_iterator& it) {
+        if( !it.get_node_ptr() ) return sokey_t(~0U);
+        return it.get_node_ptr()->get_order_key();
+    }
+
+    // Returns a public iterator version of the internal iterator. Public iterator must not
+    // be a dummy private iterator.
+    iterator get_iterator(raw_iterator it) {
+        __TBB_ASSERT(it.get_node_ptr() == NULL || !it.get_node_ptr()->is_dummy(), "Invalid user node (dummy)");
+        return iterator(it.get_node_ptr(), this);
+    }
+
+    // Returns a public iterator version of the internal iterator. Public iterator must not
+    // be a dummy private iterator.
+    const_iterator get_iterator(raw_const_iterator it) const {
+        __TBB_ASSERT(it.get_node_ptr() == NULL || !it.get_node_ptr()->is_dummy(), "Invalid user node (dummy)");
+        return const_iterator(it.get_node_ptr(), this);
+    }
+
+    // Returns a non-const version of the raw_iterator
+    raw_iterator get_iterator(raw_const_iterator it) {
+        return raw_iterator(it.get_node_ptr());
+    }
+
+    // Returns a non-const version of the iterator
+    static iterator get_iterator(const_iterator it) {
+        return iterator(it.my_node_ptr, it.my_list_ptr);
+    }
+
+    // Returns a public iterator version of a first non-dummy internal iterator at or after
+    // the passed in internal iterator.
+    iterator first_real_iterator(raw_iterator it)
+    {
+        // Skip all dummy, internal only iterators
+        while (it != raw_end() && it.get_node_ptr()->is_dummy())
+            ++it;
+
+        return iterator(it.get_node_ptr(), this);
+    }
+
+    // Returns a public iterator version of a first non-dummy internal iterator at or after
+    // the passed in internal iterator.
+    const_iterator first_real_iterator(raw_const_iterator it) const
+    {
+        // Skip all dummy, internal only iterators
+        while (it != raw_end() && it.get_node_ptr()->is_dummy())
+            ++it;
+
+        return const_iterator(it.get_node_ptr(), this);
+    }
+
+    // Erase an element using the allocator
+    void destroy_node(nodeptr_t pnode) {
+        my_node_allocator.destroy(pnode);
+        my_node_allocator.deallocate(pnode, 1);
+    }
+
+    // Try to insert a new element in the list. If insert fails, return the node that
+    // was inserted instead.
+    nodeptr_t try_insert(nodeptr_t previous, nodeptr_t new_node, nodeptr_t current_node) {
+        new_node->my_next = current_node;
+        return previous->atomic_set_next(new_node, current_node);
+    }
+
+    // Insert a new element between passed in iterators
+    std::pair<iterator, bool> try_insert(raw_iterator it, raw_iterator next, const value_type &value, sokey_t order_key, size_type *new_count)
+    {
+        nodeptr_t pnode = create_node(order_key, value);
+        nodeptr_t inserted_node = try_insert(it.get_node_ptr(), pnode, next.get_node_ptr());
+
+        if (inserted_node == pnode)
+        {
+            // If the insert succeeded, check that the order is correct and increment the element count
+            check_range();
+            *new_count = __TBB_FetchAndAddW((uintptr_t*)&my_element_count, uintptr_t(1));
+            return std::pair<iterator, bool>(iterator(pnode, this), true);
+        }
+        else
+        {
+            // If the insert failed (element already there), then delete the new one
+            destroy_node(pnode);
+            return std::pair<iterator, bool>(end(), false);
+        }
+    }
+
+    // Insert a new dummy element, starting search at a parent dummy element
+    raw_iterator insert_dummy(raw_iterator it, sokey_t order_key)
+    {
+        raw_iterator last = raw_end();
+        raw_iterator where = it;
+
+        __TBB_ASSERT(where != last, "Invalid head node");
+
+        ++where;
+
+        // Create a dummy element up front, even though it may be discarded (due to concurrent insertion)
+        nodeptr_t dummy_node = create_node(order_key);
+
+        for (;;)
+        {
+            __TBB_ASSERT(it != last, "Invalid head list node");
+
+            // If the head iterator is at the end of the list, or past the point where this dummy
+            // node needs to be inserted, then try to insert it.
+            if (where == last || get_order_key(where) > order_key)
+            {
+                __TBB_ASSERT(get_order_key(it) < order_key, "Invalid node order in the list");
+
+                // Try to insert it in the right place
+                nodeptr_t inserted_node = try_insert(it.get_node_ptr(), dummy_node, where.get_node_ptr());
+
+                if (inserted_node == dummy_node)
+                {
+                    // Insertion succeeded, check the list for order violations
+                    check_range();
+                    return raw_iterator(dummy_node);
+                }
+                else
+                {
+                    // Insertion failed: either dummy node was inserted by another thread, or
+                    // a real element was inserted at exactly the same place as dummy node.
+                    // Proceed with the search from the previous location where order key was
+                    // known to be larger (note: this is legal only because there is no safe
+                    // concurrent erase operation supported).
+                    where = it;
+                    ++where;
+                    continue;
+                }
+            }
+            else if (get_order_key(where) == order_key)
+            {
+                // Another dummy node with the same value found, discard the new one.
+                destroy_node(dummy_node);
+                return where;
+            }
+
+            // Move the iterator forward
+            it = where;
+            ++where;
+        }
+
+    }
+
+    // This erase function can handle both real and dummy nodes
+    void erase_node(raw_iterator previous, raw_const_iterator& where)
+    {
+        nodeptr_t pnode = (where++).get_node_ptr();
+        nodeptr_t prevnode = previous.get_node_ptr();
+        __TBB_ASSERT(prevnode->my_next == pnode, "Erase must take consecutive iterators");
+        prevnode->my_next = pnode->my_next;
+
+        destroy_node(pnode);
+    }
+
+    // Erase the element (previous node needs to be passed because this is a forward only list)
+    iterator erase_node(raw_iterator previous, const_iterator where)
+    {
+        raw_const_iterator it = where;
+        erase_node(previous, it);
+        my_element_count--;
+
+        return get_iterator(first_real_iterator(it));
+    }
+
+    // Move all elements from the passed in split-ordered list to this one
+    void move_all(self_type& source)
+    {
+        raw_const_iterator first = source.raw_begin();
+        raw_const_iterator last = source.raw_end();
+
+        if (first == last)
+            return;
+
+        nodeptr_t previous_node = my_head;
+        raw_const_iterator begin_iterator = first++;
+
+        // Move all elements one by one, including dummy ones
+        for (raw_const_iterator it = first; it != last;)
+        {
+            nodeptr_t pnode = it.get_node_ptr();
+
+            nodeptr_t dummy_node = pnode->is_dummy() ? create_node(pnode->get_order_key()) : create_node(pnode->get_order_key(), pnode->my_element);
+            previous_node = try_insert(previous_node, dummy_node, NULL);
+            __TBB_ASSERT(previous_node != NULL, "Insertion must succeed");
+            raw_const_iterator where = it++;
+            source.erase_node(get_iterator(begin_iterator), where);
+        }
+        check_range();
+    }
+
+
+private:
+
+    // Check the list for order violations
+    void check_range()
+    {
+#if TBB_USE_ASSERT
+        for (raw_iterator it = raw_begin(); it != raw_end(); ++it)
+        {
+            raw_iterator next_iterator = it;
+            ++next_iterator;
+
+            __TBB_ASSERT(next_iterator == end() || next_iterator.get_node_ptr()->get_order_key() >= it.get_node_ptr()->get_order_key(), "!!! List order inconsistency !!!");
+        }
+#endif
+    }
+
+    typename allocator_type::template rebind<node>::other my_node_allocator;  // allocator object for nodes
+    size_type                                             my_element_count;   // Total item count, not counting dummy nodes
+    nodeptr_t                                             my_head;            // pointer to head node
+};
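
A minimal standalone sketch (illustrative only, not part of this commit; the helpers are hypothetical stand-ins) of the split-order keys this list stores: an order key is a bit-reversed hash, dummy bucket nodes keep the lowest bit clear (exactly what node::is_dummy() tests) and real elements set it, so a bucket's dummy node sorts just before the elements that belong to it.

#include <cstdio>

typedef unsigned char sokey8_t;                     // toy 8-bit order key; the real code uses size_t

sokey8_t reverse_bits8( sokey8_t x ) {              // stand-in for __TBB_ReverseBits
    sokey8_t r = 0;
    for ( int i = 0; i < 8; ++i ) r = (sokey8_t)( r | (((x >> i) & 1u) << (7 - i)) );
    return r;
}

int main() {
    sokey8_t bucket = 3;
    sokey8_t dummy_key   = (sokey8_t)( reverse_bits8(bucket) & ~1u );  // even key -> is_dummy() is true
    sokey8_t element_key = (sokey8_t)( reverse_bits8(bucket) |  1u );  // odd key  -> real element in bucket 3
    std::printf( "bucket %u: dummy key %u precedes element key %u: %d\n",
                 (unsigned)bucket, (unsigned)dummy_key, (unsigned)element_key,
                 dummy_key < element_key );
    return 0;
}
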
+
+// Template class for hash compare
+template<typename Key, typename Hasher, typename Key_equality>
+class hash_compare
+{
+public:
+    hash_compare() {}
+
+    hash_compare(Hasher a_hasher) : my_hash_object(a_hasher) {}
+
+    hash_compare(Hasher a_hasher, Key_equality a_keyeq) : my_hash_object(a_hasher), my_key_compare_object(a_keyeq) {}
+
+    size_t operator()(const Key& key) const {
+        return ((size_t)my_hash_object(key));
+    }
+
+    bool operator()(const Key& key1, const Key& key2) const {
+        return (!my_key_compare_object(key1, key2));
+    }
+
+    Hasher       my_hash_object;        // The hash object
+    Key_equality my_key_compare_object; // The equality comparator object
+};
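
A small standalone check (illustrative only, not part of this commit; the stand-in class below only mirrors the template above): note that the two-argument operator() is a "not equal" test, returning the negation of Key_equality.

#include <cassert>
#include <cstddef>

struct int_hasher { size_t operator()( int key ) const { return (size_t)key; } };
struct int_equal  { bool operator()( int a, int b ) const { return a == b; } };

// Hypothetical stand-in mirroring hash_compare<int, int_hasher, int_equal>.
struct int_hash_compare {
    int_hasher my_hash_object;
    int_equal  my_key_compare_object;
    size_t operator()( const int& key ) const { return my_hash_object(key); }
    bool operator()( const int& k1, const int& k2 ) const { return !my_key_compare_object(k1, k2); }
};

int main() {
    int_hash_compare hc;
    assert( hc(7) == hc(7) );       // single-argument form hashes the key
    assert( hc(1, 1) == false );    // two-argument form: equal keys yield false
    assert( hc(1, 2) == true );     // different keys yield true
    return 0;
}
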
+
+#if _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning 4127 -- while (true) has a constant expression in it (for allow_multimapping)
+#endif
+
+template <typename Traits>
+class concurrent_unordered_base : public Traits
+{
+protected:
+    // Type definitions
+    typedef concurrent_unordered_base<Traits> self_type;
+    typedef typename Traits::value_type value_type;
+    typedef typename Traits::key_type key_type;
+    typedef typename Traits::hash_compare hash_compare;
+    typedef typename Traits::value_compare value_compare;
+    typedef typename Traits::allocator_type allocator_type;
+    typedef typename allocator_type::pointer pointer;
+    typedef typename allocator_type::const_pointer const_pointer;
+    typedef typename allocator_type::reference reference;
+    typedef typename allocator_type::const_reference const_reference;
+    typedef typename allocator_type::size_type size_type;
+    typedef typename allocator_type::difference_type difference_type;
+    typedef split_ordered_list<value_type, typename Traits::allocator_type> solist_t;
+    typedef typename solist_t::nodeptr_t nodeptr_t;
+    // Iterators that walk the entire split-order list, including dummy nodes
+    typedef typename solist_t::raw_iterator raw_iterator;
+    typedef typename solist_t::raw_const_iterator raw_const_iterator;
+    typedef typename solist_t::iterator iterator; // TODO: restore const iterator for unordered_sets
+    typedef typename solist_t::const_iterator const_iterator;
+    typedef iterator local_iterator;
+    typedef const_iterator const_local_iterator;
+    using Traits::my_hash_compare;
+    using Traits::get_key;
+    using Traits::allow_multimapping;
+
+private:
+    typedef std::pair<iterator, iterator> pairii_t;
+    typedef std::pair<const_iterator, const_iterator> paircc_t;
+
+    static size_type const pointers_per_table = sizeof(size_type) * 8;              // One bucket segment per bit
+    static const size_type initial_bucket_number = 8;                               // Initial number of buckets
+    static const size_type initial_bucket_load = 4;                                // Initial maximum number of elements per bucket
+
+protected:
+    // Constructors/Destructors
+    concurrent_unordered_base(size_type n_of_buckets = initial_bucket_number,
+        const hash_compare& hc = hash_compare(), const allocator_type& a = allocator_type())
+        : Traits(hc), my_solist(a),
+          my_allocator(a), my_maximum_bucket_size((float) initial_bucket_load)
+    {
+        if( n_of_buckets == 0) ++n_of_buckets;
+        my_number_of_buckets = 1<<__TBB_Log2((uintptr_t)n_of_buckets*2-1); // round up to power of 2
+        internal_init();
+    }
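
A minimal standalone sketch (illustrative only, not part of this commit; floor_log2 stands in for __TBB_Log2, assumed here to return the floor of log2) of the rounding above: 1 << floor(log2(2*n - 1)) is the smallest power of two that is greater than or equal to n.

#include <cstdio>

unsigned long floor_log2( unsigned long x ) {       // stand-in for __TBB_Log2 (assumption)
    unsigned long r = 0;
    while ( x >>= 1 ) ++r;
    return r;
}

int main() {
    const unsigned long requested[] = { 1, 5, 8, 9, 100 };
    for ( unsigned long i = 0; i < sizeof(requested)/sizeof(requested[0]); ++i ) {
        unsigned long n = requested[i];
        std::printf( "%lu buckets requested -> %lu allocated\n", n, 1ul << floor_log2(2*n - 1) );
    }
    return 0;
}
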
+
+    concurrent_unordered_base(const concurrent_unordered_base& right, const allocator_type& a)
+        : Traits(right.my_hash_compare), my_solist(a), my_allocator(a)
+    {
+        internal_init();
+        internal_copy(right);
+    }
+
+    concurrent_unordered_base(const concurrent_unordered_base& right)
+        : Traits(right.my_hash_compare), my_solist(right.get_allocator()), my_allocator(right.get_allocator())
+    {
+        internal_init();
+        internal_copy(right);
+    }
+
+    concurrent_unordered_base& operator=(const concurrent_unordered_base& right) {
+        if (this != &right)
+            internal_copy(right);
+        return (*this);
+    }
+
+    ~concurrent_unordered_base() {
+        // Delete all node segments
+        internal_clear();
+    }
+
+public:
+    allocator_type get_allocator() const {
+        return my_solist.get_allocator();
+    }
+
+    // Size and capacity functions
+    bool empty() const {
+        return my_solist.empty();
+    }
+
+    size_type size() const {
+        return my_solist.size();
+    }
+
+    size_type max_size() const {
+        return my_solist.max_size();
+    }
+
+    // Iterators 
+    iterator begin() {
+        return my_solist.begin();
+    }
+
+    const_iterator begin() const {
+        return my_solist.begin();
+    }
+
+    iterator end() {
+        return my_solist.end();
+    }
+
+    const_iterator end() const {
+        return my_solist.end();
+    }
+
+    const_iterator cbegin() const {
+        return my_solist.cbegin();
+    }
+
+    const_iterator cend() const {
+        return my_solist.cend();
+    }
+
+    // Parallel traversal support
+    class const_range_type : tbb::internal::no_assign {
+        const concurrent_unordered_base &my_table;
+        raw_const_iterator my_begin_node;
+        raw_const_iterator my_end_node;
+        mutable raw_const_iterator my_midpoint_node;
+    public:
+        //! Type for size of a range
+        typedef typename concurrent_unordered_base::size_type size_type;
+        typedef typename concurrent_unordered_base::value_type value_type;
+        typedef typename concurrent_unordered_base::reference reference;
+        typedef typename concurrent_unordered_base::difference_type difference_type;
+        typedef typename concurrent_unordered_base::const_iterator iterator;
+
+        //! True if range is empty.
+        bool empty() const {return my_begin_node == my_end_node;}
+
+        //! True if range can be partitioned into two subranges.
+        bool is_divisible() const {
+            return my_midpoint_node != my_end_node;
+        }
+        //! Split range.
+        const_range_type( const_range_type &r, split ) : 
+            my_table(r.my_table), my_end_node(r.my_end_node)
+        {
+            r.my_end_node = my_begin_node = r.my_midpoint_node;
+            __TBB_ASSERT( !empty(), "Splitting produced an empty range; the source range was not divisible" );
+            __TBB_ASSERT( !r.empty(), "Splitting produced an empty range; the source range was not divisible" );
+            set_midpoint();
+            r.set_midpoint();
+        }
+        //! Init range with the container specified (grainsize is fixed at 1)
+        const_range_type( const concurrent_unordered_base &a_table ) : 
+            my_table(a_table), my_begin_node(a_table.my_solist.begin()),
+            my_end_node(a_table.my_solist.end())
+        {
+            set_midpoint();
+        }
+        iterator begin() const { return my_table.my_solist.get_iterator(my_begin_node); }
+        iterator end() const { return my_table.my_solist.get_iterator(my_end_node); }
+        //! The grain size for this range.
+        size_type grainsize() const { return 1; }
+
+        //! Set my_midpoint_node to point approximately half way between my_begin_node and my_end_node.
+        void set_midpoint() const {
+            if( my_begin_node == my_end_node ) // not divisible
+                my_midpoint_node = my_end_node;
+            else {
+                sokey_t begin_key = solist_t::get_safe_order_key(my_begin_node);
+                sokey_t end_key = solist_t::get_safe_order_key(my_end_node);
+                size_t mid_bucket = __TBB_ReverseBits( begin_key + (end_key-begin_key)/2 ) % my_table.my_number_of_buckets;
+                while ( !my_table.is_initialized(mid_bucket) ) mid_bucket = my_table.get_parent(mid_bucket);
+                my_midpoint_node = my_table.my_solist.first_real_iterator(my_table.get_bucket( mid_bucket ));
+                if( my_midpoint_node == my_begin_node )
+                    my_midpoint_node = my_end_node;
+#if TBB_USE_ASSERT
+                else {
+                    sokey_t mid_key = solist_t::get_safe_order_key(my_midpoint_node);
+                    __TBB_ASSERT( begin_key < mid_key, "my_begin_node is after my_midpoint_node" );
+                    __TBB_ASSERT( mid_key <= end_key, "my_midpoint_node is after my_end_node" );
+                }
+#endif // TBB_USE_ASSERT
+            }
+        }
+    };
+
+    class range_type : public const_range_type {
+    public:
+        typedef typename concurrent_unordered_base::iterator iterator;
+        //! Split range.
+        range_type( range_type &r, split ) : const_range_type( r, split() ) {}
+        //! Init range with the container specified (grainsize is fixed at 1)
+        range_type( const concurrent_unordered_base &a_table ) : const_range_type(a_table) {}
+
+        iterator begin() const { return solist_t::get_iterator( const_range_type::begin() ); }
+        iterator end() const { return solist_t::get_iterator( const_range_type::end() ); }
+    };
+
+    range_type range() {
+        return range_type( *this );
+    }
+
+    const_range_type range() const {
+        return const_range_type( *this );
+    }
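+
+    // Editorial sketch (not part of the original TBB sources): the range returned by
+    // range() is intended for tbb::parallel_for; e.g., assuming a container `table`
+    // and a hypothetical function `process` over value_type:
+    //
+    //     tbb::parallel_for( table.range(),
+    //         [&]( const range_type &r ) {
+    //             for( iterator it = r.begin(); it != r.end(); ++it )
+    //                 process( *it );
+    //         } );
+    //
+    // Splitting stops once set_midpoint() cannot find an initialized bucket boundary
+    // strictly between the range ends, so subranges are roughly bucket-sized.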
+
+    // Modifiers
+    std::pair<iterator, bool> insert(const value_type& value) {
+        return internal_insert(value);
+    }
+
+    iterator insert(const_iterator, const value_type& value) {
+        // Ignore hint
+        return insert(value).first;
+    }
+
+    template<class Iterator>
+    void insert(Iterator first, Iterator last) {
+        for (Iterator it = first; it != last; ++it)
+            insert(*it);
+    }
+
+    iterator unsafe_erase(const_iterator where) {
+        return internal_erase(where);
+    }
+
+    iterator unsafe_erase(const_iterator first, const_iterator last) {
+        while (first != last)
+            unsafe_erase(first++);
+        return my_solist.get_iterator(first);
+    }
+
+    size_type unsafe_erase(const key_type& key) {
+        pairii_t where = equal_range(key);
+        size_type item_count = internal_distance(where.first, where.second);
+        unsafe_erase(where.first, where.second);
+        return item_count;
+    }
+
+    void swap(concurrent_unordered_base& right) {
+        if (this != &right) {
+            std::swap(my_hash_compare, right.my_hash_compare); // TODO: check what ADL meant here
+            my_solist.swap(right.my_solist);
+            internal_swap_buckets(right);
+            std::swap(my_number_of_buckets, right.my_number_of_buckets);
+            std::swap(my_maximum_bucket_size, right.my_maximum_bucket_size);
+        }
+    }
+
+    // Whole-table operations
+    void clear() {
+        // Clear list
+        my_solist.clear();
+
+        // Clear buckets
+        internal_clear();
+
+        // Initialize bucket 0
+        __TBB_ASSERT(my_buckets[0] == NULL, NULL);
+        raw_iterator dummy_node = my_solist.raw_begin();
+        set_bucket(0, dummy_node);
+    }
+
+    // Lookup
+    iterator find(const key_type& key) {
+        return internal_find(key);
+    }
+
+    const_iterator find(const key_type& key) const {
+        return const_cast<self_type*>(this)->internal_find(key);
+    }
+
+    size_type count(const key_type& key) const {
+        if(allow_multimapping) {
+            paircc_t answer = equal_range(key);
+            size_type item_count = internal_distance(answer.first, answer.second);
+            return item_count;
+        } else {
+            return const_cast<self_type*>(this)->internal_find(key) == end()?0:1;
+        }
+    }
+
+    std::pair<iterator, iterator> equal_range(const key_type& key) {
+        return internal_equal_range(key);
+    }
+
+    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+        return const_cast<self_type*>(this)->internal_equal_range(key);
+    }
+
+    // Bucket interface - for debugging 
+    size_type unsafe_bucket_count() const {
+        return my_number_of_buckets;
+    }
+
+    size_type unsafe_max_bucket_count() const {
+        return segment_size(pointers_per_table-1);
+    }
+
+    size_type unsafe_bucket_size(size_type bucket) {
+        size_type item_count = 0;
+        if (is_initialized(bucket)) {
+            raw_iterator it = get_bucket(bucket);
+            ++it;
+            for (; it != my_solist.raw_end() && !it.get_node_ptr()->is_dummy(); ++it)
+                ++item_count;
+        }
+        return item_count;
+    }
+
+    size_type unsafe_bucket(const key_type& key) const {
+        sokey_t order_key = (sokey_t) my_hash_compare(key);
+        size_type bucket = order_key % my_number_of_buckets;
+        return bucket;
+    }
+
+    // If the bucket is initialized, return the first non-dummy element in it
+    local_iterator unsafe_begin(size_type bucket) {
+        if (!is_initialized(bucket))
+            return end();
+
+        raw_iterator it = get_bucket(bucket);
+        return my_solist.first_real_iterator(it);
+    }
+
+    // If the bucket is initialized, return the first non-dummy element in it
+    const_local_iterator unsafe_begin(size_type bucket) const
+    {
+        if (!is_initialized(bucket))
+            return end();
+
+        raw_const_iterator it = get_bucket(bucket);
+        return my_solist.first_real_iterator(it);
+    }
+
+    // @REVIEW: Takes O(n)
+    // Returns the iterator after the last non-dummy element in the bucket
+    local_iterator unsafe_end(size_type bucket)
+    {
+        if (!is_initialized(bucket))
+            return end();
+
+        raw_iterator it = get_bucket(bucket);
+    
+        // Find the end of the bucket, denoted by the dummy element
+        do ++it;
+        while(it != my_solist.raw_end() && !it.get_node_ptr()->is_dummy());
+
+        // Return the first real element past the end of the bucket
+        return my_solist.first_real_iterator(it);
+    }
+
+    // @REVIEW: Takes O(n)
+    // Returns the iterator after the last non-dummy element in the bucket
+    const_local_iterator unsafe_end(size_type bucket) const
+    {
+        if (!is_initialized(bucket))
+            return end();
+
+        raw_const_iterator it = get_bucket(bucket);
+    
+        // Find the end of the bucket, denoted by the dummy element
+        do ++it;
+        while(it != my_solist.raw_end() && !it.get_node_ptr()->is_dummy());
+
+        // Return the first real element past the end of the bucket
+        return my_solist.first_real_iterator(it);
+    }
+
+    const_local_iterator unsafe_cbegin(size_type bucket) const {
+        return unsafe_begin(bucket);
+    }
+
+    const_local_iterator unsafe_cend(size_type bucket) const {
+        return unsafe_end(bucket);
+    }
+
+    // Hash policy
+    float load_factor() const {
+        return (float) size() / (float) unsafe_bucket_count();
+    }
+
+    float max_load_factor() const {
+        return my_maximum_bucket_size;
+    }
+
+    void max_load_factor(float newmax) {
+        if (newmax != newmax || newmax < 0) // newmax != newmax is true only for NaN
+            tbb::internal::throw_exception(tbb::internal::eid_invalid_load_factor);
+        my_maximum_bucket_size = newmax;
+    }
+
+    // Rehashing never moves elements, because the underlying split-ordered list
+    // is already sorted; it only raises the bucket count, and the new buckets
+    // take effect lazily the next time they are touched.
+    void rehash(size_type buckets) {
+        size_type current_buckets = my_number_of_buckets;
+        if (current_buckets >= buckets)
+            return;
+        my_number_of_buckets = 1<<__TBB_Log2((uintptr_t)buckets*2-1); // round up to power of 2
+    }
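+
+    // Editorial note (illustration only): the rounding above grows the bucket count to
+    // the next power of two; e.g. rehash(100) computes __TBB_Log2(2*100-1) == 7 and sets
+    // my_number_of_buckets to 1<<7 == 128. No elements are moved; buckets that did not
+    // exist before are created lazily by init_bucket() the first time they are accessed.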
+
+private:
+
+    // Initialize the hash and keep the first bucket open
+    void internal_init() {
+        // Allocate an array of segment pointers
+        memset(my_buckets, 0, pointers_per_table * sizeof(void *));
+
+        // Initialize bucket 0
+        raw_iterator dummy_node = my_solist.raw_begin();
+        set_bucket(0, dummy_node);
+    }
+
+    void internal_clear() {
+        for (size_type index = 0; index < pointers_per_table; ++index) {
+            if (my_buckets[index] != NULL) {
+                size_type sz = segment_size(index);
+                for (size_type index2 = 0; index2 < sz; ++index2)
+                    my_allocator.destroy(&my_buckets[index][index2]);
+                my_allocator.deallocate(my_buckets[index], sz);
+                my_buckets[index] = 0;
+            }
+        }
+    }
+
+    void internal_copy(const self_type& right) {
+        clear();
+
+        my_maximum_bucket_size = right.my_maximum_bucket_size;
+        my_number_of_buckets = right.my_number_of_buckets;
+
+        __TBB_TRY {
+            insert(right.begin(), right.end());
+            my_hash_compare = right.my_hash_compare;
+        } __TBB_CATCH(...) {
+            my_solist.clear();
+            __TBB_RETHROW();
+        }
+    }
+
+    void internal_swap_buckets(concurrent_unordered_base& right)
+    {
+        // Swap all node segments
+        for (size_type index = 0; index < pointers_per_table; ++index)
+        {
+            raw_iterator * iterator_pointer = my_buckets[index];
+            my_buckets[index] = right.my_buckets[index];
+            right.my_buckets[index] = iterator_pointer;
+        }
+    }
+
+    // Hash APIs
+    size_type internal_distance(const_iterator first, const_iterator last) const
+    {
+        size_type num = 0;
+
+        for (const_iterator it = first; it != last; ++it)
+            ++num;
+
+        return num;
+    }
+
+    // Insert an element in the hash given its value
+    std::pair<iterator, bool> internal_insert(const value_type& value)
+    {
+        sokey_t order_key = (sokey_t) my_hash_compare(get_key(value));
+        size_type bucket = order_key % my_number_of_buckets;
+
+        // If bucket is empty, initialize it first
+        if (!is_initialized(bucket))
+            init_bucket(bucket);
+
+        size_type new_count;
+        order_key = split_order_key_regular(order_key);
+        raw_iterator it = get_bucket(bucket);
+        raw_iterator last = my_solist.raw_end();
+        raw_iterator where = it;
+
+        __TBB_ASSERT(where != last, "Invalid head node");
+
+        // First node is a dummy node
+        ++where;
+
+        for (;;)
+        {
+            if (where == last || solist_t::get_order_key(where) > order_key)
+            {
+                // Try to insert it in the right place
+                std::pair<iterator, bool> result = my_solist.try_insert(it, where, value, order_key, &new_count);
+                
+                if (result.second)
+                {
+                    // Insertion succeeded, adjust the table size, if needed
+                    adjust_table_size(new_count, my_number_of_buckets);
+                    return result;
+                }
+                else
+                {
+                    // Insertion failed: either the same node was inserted by another thread, or
+                    // another element was inserted at exactly the same place as this node.
+                    // Proceed with the search from the previous location where order key was
+                    // known to be larger (note: this is legal only because there is no safe
+                    // concurrent erase operation supported).
+                    where = it;
+                    ++where;
+                    continue;
+                }
+            }
+            else if (!allow_multimapping && solist_t::get_order_key(where) == order_key && my_hash_compare(get_key(*where), get_key(value)) == 0)
+            {
+                // Element already in the list, return it
+                return std::pair<iterator, bool>(my_solist.get_iterator(where), false);
+            }
+
+            // Move the iterator forward
+            it = where;
+            ++where;
+        }
+    }
+
+    // Find the element in the split-ordered list
+    iterator internal_find(const key_type& key)
+    {
+        sokey_t order_key = (sokey_t) my_hash_compare(key);
+        size_type bucket = order_key % my_number_of_buckets;
+
+        // If bucket is empty, initialize it first
+        if (!is_initialized(bucket))
+            init_bucket(bucket);
+
+        order_key = split_order_key_regular(order_key);
+        raw_iterator last = my_solist.raw_end();
+
+        for (raw_iterator it = get_bucket(bucket); it != last; ++it)
+        {
+            if (solist_t::get_order_key(it) > order_key)
+            {
+                // The current node's order key is larger than the key we are looking
+                // for, so the element is not in the hash.
+                return end();
+            }
+            else if (solist_t::get_order_key(it) == order_key)
+            {
+                // The fact that order keys match does not mean that the element is found.
+                // Key function comparison has to be performed to check whether this is the
+                // right element. If not, keep searching while order key is the same.
+                if (!my_hash_compare(get_key(*it), key))
+                    return my_solist.get_iterator(it);
+            }
+        }
+
+        return end();
+    }
+
+    // Erase an element from the list. This is not a concurrency safe function.
+    iterator internal_erase(const_iterator it)
+    {
+        key_type key = get_key(*it);
+        sokey_t order_key = (sokey_t) my_hash_compare(key);
+        size_type bucket = order_key % my_number_of_buckets;
+
+        // If bucket is empty, initialize it first
+        if (!is_initialized(bucket))
+            init_bucket(bucket);
+
+        order_key = split_order_key_regular(order_key);
+
+        raw_iterator previous = get_bucket(bucket);
+        raw_iterator last = my_solist.raw_end();
+        raw_iterator where = previous;
+
+        __TBB_ASSERT(where != last, "Invalid head node");
+
+        // First node is a dummy node
+        ++where;
+
+        for (;;) {
+            if (where == last)
+                return end();
+            else if (my_solist.get_iterator(where) == it)
+                return my_solist.erase_node(previous, it);
+
+            // Move the iterator forward
+            previous = where;
+            ++where;
+        }
+    }
+
+    // Return the [begin, end) pair of iterators with the same key values.
+    // This operation makes sense only if mapping is many-to-one.
+    pairii_t internal_equal_range(const key_type& key)
+    {
+        sokey_t order_key = (sokey_t) my_hash_compare(key);
+        size_type bucket = order_key % my_number_of_buckets;
+
+        // If bucket is empty, initialize it first
+        if (!is_initialized(bucket))
+            init_bucket(bucket);
+
+        order_key = split_order_key_regular(order_key);
+        raw_iterator end_it = my_solist.raw_end();
+
+        for (raw_iterator it = get_bucket(bucket); it != end_it; ++it)
+        {
+            if (solist_t::get_order_key(it) > order_key)
+            {
+                // There is no element with the given key
+                return pairii_t(end(), end());
+            }
+            else if (solist_t::get_order_key(it) == order_key && !my_hash_compare(get_key(*it), key))
+            {
+                iterator first = my_solist.get_iterator(it);
+                iterator last = first;
+                do ++last; while( allow_multimapping && last != end() && !my_hash_compare(get_key(*last), key) );
+                return pairii_t(first, last);
+            }
+        }
+
+        return pairii_t(end(), end());
+    }
+
+    // Bucket APIs
+    void init_bucket(size_type bucket)
+    {
+        // Bucket 0 has no parent.
+        __TBB_ASSERT( bucket != 0, "The first bucket must always be initialized");
+
+        size_type parent_bucket = get_parent(bucket);
+
+        // All parent_bucket buckets have to be initialized before this bucket is
+        if (!is_initialized(parent_bucket))
+            init_bucket(parent_bucket);
+
+        raw_iterator parent = get_bucket(parent_bucket);
+
+        // Create a dummy first node in this bucket
+        raw_iterator dummy_node = my_solist.insert_dummy(parent, split_order_key_dummy(bucket));
+        set_bucket(bucket, dummy_node);
+    }
+
+    void adjust_table_size(size_type total_elements, size_type current_size)
+    {
+        // Grow the table by a factor of 2 if possible and needed
+        if ( ((float) total_elements / (float) current_size) > my_maximum_bucket_size )
+        {
+            // Double the size of the hash only if size has not changed in between loads
+            __TBB_CompareAndSwapW((uintptr_t*)&my_number_of_buckets, uintptr_t(2u*current_size), uintptr_t(current_size) );
+            //Simple "my_number_of_buckets.compare_and_swap( current_size<<1, current_size );" does not work for VC8
+            //due to overzealous compiler warnings in /Wp64 mode
+        }
+    }
+
+    size_type get_parent(size_type bucket) const
+    {
+        // Clears the bucket's most significant set bit
+        size_type msb = __TBB_Log2((uintptr_t)bucket);
+        return bucket & ~(size_type(1) << msb);
+    }
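+
+    // Editorial note (illustration only): get_parent() clears the highest set bit, so the
+    // parent chain of bucket 6 (binary 110) is 6 -> 2 (binary 010) -> 0. init_bucket()
+    // recurses along this chain, so every ancestor's dummy node exists before the new
+    // bucket's dummy node is spliced in after its parent.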
+
+
+    // Dynamic sized array (segments)
+    //! @return segment index of given index in the array
+    static size_type segment_index_of( size_type index ) {
+        return size_type( __TBB_Log2( uintptr_t(index|1) ) );
+    }
+
+    //! @return the first array index of given segment
+    static size_type segment_base( size_type k ) {
+        return (size_type(1)<<k & ~size_type(1));
+    }
+
+    //! @return segment size
+    static size_type segment_size( size_type k ) {
+        return k? size_type(1)<<k : 2;
+    }
+
+    raw_iterator get_bucket(size_type bucket) const {
+        size_type segment = segment_index_of(bucket);
+        bucket -= segment_base(segment);
+        __TBB_ASSERT( my_buckets[segment], "bucket must be in an allocated segment" );
+        return my_buckets[segment][bucket];
+    }
+
+    void set_bucket(size_type bucket, raw_iterator dummy_head) {
+        size_type segment = segment_index_of(bucket);
+        bucket -= segment_base(segment);
+
+        if (my_buckets[segment] == NULL) {
+            size_type sz = segment_size(segment);
+            raw_iterator * new_segment = my_allocator.allocate(sz);
+            std::memset(new_segment, 0, sz*sizeof(raw_iterator));
+
+            if (__TBB_CompareAndSwapW((void *) &my_buckets[segment], (uintptr_t)new_segment, 0) != 0)
+                my_allocator.deallocate(new_segment, sz);
+        }
+
+        my_buckets[segment][bucket] = dummy_head;
+    }
+
+    bool is_initialized(size_type bucket) const {
+        size_type segment = segment_index_of(bucket);
+        bucket -= segment_base(segment);
+
+        if (my_buckets[segment] == NULL)
+            return false;
+
+        raw_iterator it = my_buckets[segment][bucket];
+        return (it.get_node_ptr() != NULL);
+    }
+
+    // Utilities for keys
+
+    // A regular order key has its original hash value reversed and the last bit set
+    sokey_t split_order_key_regular(sokey_t order_key) const {
+        return __TBB_ReverseBits(order_key) | 0x1;
+    }
+
+    // A dummy order key has its original hash value reversed and the last bit unset
+    sokey_t split_order_key_dummy(sokey_t order_key) const {
+        return __TBB_ReverseBits(order_key) & ~(0x1);
+    }
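+
+    // Editorial note (illustration only, using 8-bit keys for brevity): a hash value of
+    // 0b00000110 reverses to 0b01100000; the regular key of a stored element becomes
+    // 0b01100001 (low bit set) and the dummy key of bucket 6 stays 0b01100000 (low bit
+    // clear), so each bucket's dummy node sorts immediately before the elements that
+    // hash into that bucket.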
+
+    // Shared variables
+    atomic<size_type>                                             my_number_of_buckets;       // Current table size
+    solist_t                                                      my_solist;                  // List where all the elements are kept
+    typename allocator_type::template rebind<raw_iterator>::other my_allocator;               // Allocator object for segments
+    float                                                         my_maximum_bucket_size;     // Maximum size of the bucket
+    atomic<raw_iterator*>                                         my_buckets[pointers_per_table]; // The segment table
+};
+#if _MSC_VER
+#pragma warning(pop) // warning 4127 -- while (true) has a constant expression in it
+#endif
+
+//! Hash multiplier
+static const size_t hash_multiplier = sizeof(size_t)==4? 2654435769U : 11400714819323198485ULL;
+} // namespace internal
+//! @endcond
+//! Hasher functions
+template<typename T>
+inline size_t tbb_hasher( const T& t ) {
+    return static_cast<size_t>( t ) * internal::hash_multiplier;
+}
+template<typename P>
+inline size_t tbb_hasher( P* ptr ) {
+    size_t const h = reinterpret_cast<size_t>( ptr );
+    return (h >> 3) ^ h;
+}
+template<typename E, typename S, typename A>
+inline size_t tbb_hasher( const std::basic_string<E,S,A>& s ) {
+    size_t h = 0;
+    for( const E* c = s.c_str(); *c; ++c )
+        h = static_cast<size_t>(*c) ^ (h * internal::hash_multiplier);
+    return h;
+}
+template<typename F, typename S>
+inline size_t tbb_hasher( const std::pair<F,S>& p ) {
+    return tbb_hasher(p.first) ^ tbb_hasher(p.second);
+}
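+
+// Editorial sketch (not part of the original sources): tbb_hasher is simple multiplicative
+// (Fibonacci) hashing; e.g. tbb_hasher(size_t(42)) is 42 * hash_multiplier truncated to
+// size_t, and pair hashes are combined with XOR:
+//
+//     size_t h = tbb_hasher( std::make_pair( 42, std::string("abc") ) );
+//
+// A user-defined key type only needs a suitable tbb_hasher overload (or a custom
+// hash_compare) visible where the container is instantiated.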
+} // namespace interface5
+using interface5::tbb_hasher;
+
+
+// Template class for hash compare
+template<typename Key>
+class tbb_hash
+{
+public:
+    tbb_hash() {}
+
+    size_t operator()(const Key& key) const
+    {
+        return tbb_hasher(key);
+    }
+};
+
+} // namespace tbb
+#endif// __TBB_concurrent_unordered_internal_H
diff --git a/tbb/include/tbb/internal/_flow_graph_impl.h b/tbb/include/tbb/internal/_flow_graph_impl.h
new file mode 100644 (file)
index 0000000..8c59c6d
--- /dev/null
@@ -0,0 +1,547 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB__graph_internal_H
+#define __TBB__graph_internal_H
+
+namespace internal {
+
+    namespace graph_policy_namespace {
+        enum graph_buffer_policy { rejecting, reserving, queueing, tag_matching };
+    }
+
+    //! A functor that takes no input and generates a value of type Output
+    template< typename Output >
+    class source_body : tbb::internal::no_assign {
+    public:
+        virtual ~source_body() {}
+        virtual bool operator()(Output &output) = 0;
+        virtual source_body* clone() = 0;
+    };
+    
+    //! The leaf for source_body
+    template< typename Output, typename Body>
+    class source_body_leaf : public source_body<Output> {
+    public:
+        source_body_leaf( const Body &_body ) : body(_body), init_body(_body) { }
+        /*override*/ bool operator()(Output &output) { return body( output ); }
+        /*override*/ source_body_leaf* clone() { 
+            return new source_body_leaf< Output, Body >(init_body); 
+        }
+    private:
+        Body body;
+        Body init_body;
+    };
+    
+    //! A functor that takes an Input and generates an Output
+    template< typename Input, typename Output >
+    class function_body : tbb::internal::no_assign {
+    public:
+        virtual ~function_body() {}
+        virtual Output operator()(const Input &input) = 0;
+        virtual function_body* clone() = 0;
+    };
+    
+    //! the leaf for function_body
+    template <typename Input, typename Output, typename B>
+    class function_body_leaf : public function_body< Input, Output > {
+    public:
+        function_body_leaf( const B &_body ) : body(_body), init_body(_body) { }
+        Output operator()(const Input &i) { return body(i); }
+        B get_body() { return body; }
+        /*override*/ function_body_leaf* clone() {
+            return new function_body_leaf< Input, Output, B >(init_body);
+        }
+    private:
+        B body;
+        B init_body;
+    };
+    
+    //! the leaf for function_body specialized for Input and output of continue_msg
+    template <typename B>
+    class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > {
+    public:
+        function_body_leaf( const B &_body ) : body(_body), init_body(_body) { }
+        continue_msg operator()( const continue_msg &i ) { 
+            body(i); 
+            return i; 
+        }
+        B get_body() { return body; }
+        /*override*/ function_body_leaf* clone() {
+           return new function_body_leaf< continue_msg, continue_msg, B >(init_body);
+        }    
+    private:
+        B body;
+        B init_body;
+    };
+    
+    //! the leaf for function_body specialized for Output of continue_msg
+    template <typename Input, typename B>
+    class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > {
+    public:
+        function_body_leaf( const B &_body ) : body(_body), init_body(_body) { }
+        continue_msg operator()(const Input &i) { 
+            body(i); 
+            return continue_msg();
+        }
+        B get_body() { return body; }
+        /*override*/ function_body_leaf* clone() {
+            return new function_body_leaf< Input, continue_msg, B >(init_body);
+        }    
+    private:
+        B body;
+        B init_body;
+    };
+    
+    //! the leaf for function_body specialized for Input of continue_msg
+    template <typename Output, typename B>
+    class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > {
+    public:
+        function_body_leaf( const B &_body ) : body(_body), init_body(_body) { }
+        Output operator()(const continue_msg &i) { 
+            return body(i); 
+        }
+        B get_body() { return body; }
+        /*override*/ function_body_leaf* clone() {
+            return new function_body_leaf< continue_msg, Output, B >(init_body);
+        }    
+    private:
+        B body;
+        B init_body;
+    };
+    
+    //! A task that calls a node's forward function
+    template< typename NodeType >
+    class forward_task : public task {
+    
+        NodeType &my_node;
+    
+    public:
+    
+        forward_task( NodeType &n ) : my_node(n) {}
+    
+        task *execute() {
+            my_node.forward();
+            return NULL;
+        }
+    };
+    
+    //! A task that calls a node's apply_body function, passing in an input of type Input
+    template< typename NodeType, typename Input >
+    class apply_body_task : public task {
+    
+        NodeType &my_node;
+        Input my_input;
+        
+    public:
+        
+        apply_body_task( NodeType &n, const Input &i ) : my_node(n), my_input(i) {}
+        
+        task *execute() {
+            my_node.apply_body( my_input );
+            return NULL;
+        }
+    };
+    
+    //! A task that calls a node's apply_body function with no input
+    template< typename NodeType >
+    class source_task : public task {
+    
+        NodeType &my_node;
+    
+    public:
+    
+        source_task( NodeType &n ) : my_node(n) {}
+    
+        task *execute() {
+            my_node.apply_body( );
+            return NULL;
+        }
+    };
+    
+    //! An empty functor that takes an Input and returns a default constructed Output
+    template< typename Input, typename Output >
+    struct empty_body {
+       Output operator()( const Input & ) const { return Output(); } 
+    };
+    
+    //! A node_cache maintains a std::queue of elements of type T.  Each operation is protected by a lock. 
+    template< typename T, typename M=spin_mutex >
+    class node_cache {
+        public:
+    
+        typedef size_t size_type;
+        
+        bool empty() {
+            typename my_mutex_type::scoped_lock lock( my_mutex );
+            return internal_empty();
+        }
+    
+        void add( T &n ) {
+            typename my_mutex_type::scoped_lock lock( my_mutex );
+            internal_push(n);
+        }
+    
+        void remove( T &n ) {
+            typename my_mutex_type::scoped_lock lock( my_mutex );
+            for ( size_t i = internal_size(); i != 0; --i ) {
+                T &s = internal_pop();
+                if ( &s != &n ) {
+                    internal_push(s);
+                }
+            }
+        }
+        
+    protected:
+    
+        typedef M my_mutex_type;
+        my_mutex_type my_mutex;
+        std::queue< T * > my_q;
+    
+        // Assumes lock is held
+        inline bool internal_empty( )  {
+            return my_q.empty();
+        }
+    
+        // Assumes lock is held
+        inline size_type internal_size( )  {
+            return my_q.size(); 
+        }
+    
+        // Assumes lock is held
+        inline void internal_push( T &n )  {
+            my_q.push(&n);
+        }
+    
+        // Assumes lock is held
+        inline T &internal_pop() {
+            T *v = my_q.front();
+            my_q.pop();
+            return *v;
+        }
+    
+    };
+    
+    //! A cache of predecessors that only supports try_get
+    template< typename T, typename M=spin_mutex >
+    class predecessor_cache : public node_cache< sender<T>, M > {
+        public:
+        typedef M my_mutex_type;
+        typedef T output_type; 
+        typedef sender<output_type> predecessor_type;
+        typedef receiver<output_type> successor_type;
+    
+        predecessor_cache( ) : my_owner( NULL ) { }
+        
+        void set_owner( successor_type *owner ) { my_owner = owner; }
+        
+        bool get_item( output_type &v ) {
+        
+            bool msg = false;
+        
+            do {
+                predecessor_type *src;
+                {
+                    typename my_mutex_type::scoped_lock lock(this->my_mutex);
+                    if ( this->internal_empty() ) {
+                        break;
+                    }
+                    src = &this->internal_pop();
+                }
+        
+                // Try to get from this sender
+                msg = src->try_get( v );
+        
+                if (msg == false) {
+                    // Relinquish ownership of the edge
+                    if ( my_owner) 
+                        src->register_successor( *my_owner );
+                } else {
+                    // Retain ownership of the edge
+                    this->add(*src);
+                }
+            } while ( msg == false );
+            return msg;
+        }
+    
+    protected:
+        successor_type *my_owner;
+    };
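+
+    // Editorial note (illustration only): get_item() above implements the pull side of
+    // an edge: cached senders are polled one at a time; a sender whose try_get() fails
+    // gets the edge handed back by re-registering the owning receiver as its successor,
+    // while a sender that delivered an item stays in the cache for next time.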
+    
+    //! A cache of predecessors that supports requests and reservations
+    template< typename T, typename M=spin_mutex >
+    class reservable_predecessor_cache : public predecessor_cache< T, M > {
+    public:
+        typedef M my_mutex_type;
+        typedef T output_type; 
+        typedef sender<T> predecessor_type;
+        typedef receiver<T> successor_type;
+        
+        reservable_predecessor_cache( ) : reserved_src(NULL) { }
+        
+        bool 
+        try_reserve( output_type &v ) {
+            bool msg = false;
+        
+            do {
+                {
+                    typename my_mutex_type::scoped_lock lock(this->my_mutex);
+                    if ( reserved_src || this->internal_empty() ) 
+                        return false;
+        
+                    reserved_src = &this->internal_pop();
+                }
+        
+                // Try to get from this sender
+                msg = reserved_src->try_reserve( v );
+        
+                if (msg == false) {
+                    typename my_mutex_type::scoped_lock lock(this->my_mutex);
+                    // Relinquish ownership of the edge
+                    reserved_src->register_successor( *this->my_owner );
+                    reserved_src = NULL;
+                } else {
+                    // Retain ownership of the edge
+                    this->add( *reserved_src );
+                }
+            } while ( msg == false );
+        
+            return msg;
+        }
+        
+        bool 
+        try_release( ) {
+            reserved_src->try_release( );
+            reserved_src = NULL;
+            return true;
+        }
+        
+        bool 
+        try_consume( ) {
+            reserved_src->try_consume( );
+            reserved_src = NULL;
+            return true;
+        }
+    
+    private:
+        predecessor_type *reserved_src;
+    };
+    
+    
+    //! An abstract cache of successors
+    template<typename T, typename M=spin_rw_mutex >
+    class successor_cache : tbb::internal::no_copy {
+    protected:
+        
+        typedef M my_mutex_type;
+        my_mutex_type my_mutex;
+        
+        typedef std::list< receiver<T> * > my_successors_type;
+        my_successors_type my_successors;
+        
+        sender<T> *my_owner;
+        
+    public:
+        
+        successor_cache( ) : my_owner(NULL) {}
+        
+        void set_owner( sender<T> *owner ) { my_owner = owner; }
+        
+        virtual ~successor_cache() {}
+        
+        void register_successor( receiver<T> &r ) {
+            typename my_mutex_type::scoped_lock l(my_mutex, true);
+            my_successors.push_back( &r ); 
+        }
+    
+        void remove_successor( receiver<T> &r ) {
+            typename my_mutex_type::scoped_lock l(my_mutex, true);
+            for ( typename my_successors_type::iterator i = my_successors.begin();
+                  i != my_successors.end(); ++i ) { 
+                if ( *i == & r ) { 
+                    my_successors.erase(i);
+                    break;
+                }
+            }
+        }
+        
+        bool empty() { 
+            typename my_mutex_type::scoped_lock l(my_mutex, false);
+            return my_successors.empty(); 
+        }
+        
+        virtual bool try_put( const T &t ) = 0; 
+     };
+    
+    //! An abstract cache of successors, specialized to continue_msg
+    template<>
+    class successor_cache< continue_msg > : tbb::internal::no_copy {
+    protected:
+        
+        typedef spin_rw_mutex my_mutex_type;
+        my_mutex_type my_mutex;
+        
+        typedef std::list< receiver<continue_msg> * > my_successors_type;
+        my_successors_type my_successors;
+        
+        sender<continue_msg> *my_owner;
+        
+    public:
+        
+        successor_cache( ) : my_owner(NULL) {}
+        
+        void set_owner( sender<continue_msg> *owner ) { my_owner = owner; }
+        
+        virtual ~successor_cache() {}
+        
+        void register_successor( receiver<continue_msg> &r ) {
+            my_mutex_type::scoped_lock l(my_mutex, true);
+            my_successors.push_back( &r ); 
+            if ( my_owner )
+                r.register_predecessor( *my_owner );
+        }
+        
+        void remove_successor( receiver<continue_msg> &r ) {
+            my_mutex_type::scoped_lock l(my_mutex, true);
+            for ( my_successors_type::iterator i = my_successors.begin();
+                  i != my_successors.end(); ++i ) { 
+                if ( *i == & r ) { 
+                    if ( my_owner )
+                        r.remove_predecessor( *my_owner );
+                    my_successors.erase(i);
+                    break;
+                }
+            }
+        }
+    
+        bool empty() { 
+            my_mutex_type::scoped_lock l(my_mutex, false);
+            return my_successors.empty(); 
+        }
+    
+        virtual bool try_put( const continue_msg &t ) = 0; 
+        
+     };
+    
+    //! A cache of successors that are broadcast to
+    template<typename T, typename M=spin_rw_mutex>
+    class broadcast_cache : public successor_cache<T, M> {
+        typedef M my_mutex_type;
+        typedef std::list< receiver<T> * > my_successors_type;
+        
+    public:
+        
+        broadcast_cache( ) {}
+        
+        bool try_put( const T &t ) {
+            bool msg = false;
+            bool upgraded = false;
+            typename my_mutex_type::scoped_lock l(this->my_mutex, false);
+            typename my_successors_type::iterator i = this->my_successors.begin();
+            while ( i != this->my_successors.end() ) {
+               if ( (*i)->try_put( t ) == true ) {
+                   ++i;
+                   msg = true;
+               } else {
+                  if ( (*i)->register_predecessor(*this->my_owner) ) {
+                      if (!upgraded) {
+                          l.upgrade_to_writer();
+                          upgraded = true;
+                      }
+                      i = this->my_successors.erase(i);
+                  }
+                  else {
+                      ++i;
+                  }
+               }
+            }
+            return msg;
+        }
+    };
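+
+    // Editorial note (illustration only): try_put() above implements the push protocol:
+    // the message is offered to every cached successor; a successor that rejects it and
+    // successfully registers this node as a predecessor switches the edge to pull mode
+    // and is dropped from the cache (under an upgraded write lock), while successors
+    // that merely reject the message stay registered.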
+    
+    //! A cache of successors that are put in a round-robin fashion
+    template<typename T, typename M=spin_rw_mutex >
+    class round_robin_cache : public successor_cache<T, M> {
+        typedef size_t size_type;
+        typedef M my_mutex_type;
+        typedef std::list< receiver<T> * > my_successors_type;
+    
+    public:
+        
+        round_robin_cache( ) {}
+        
+        size_type size() {
+            typename my_mutex_type::scoped_lock l(this->my_mutex, false);
+            return this->my_successors.size();
+        }
+        
+        bool try_put( const T &t ) {
+            bool upgraded = false;
+            typename my_mutex_type::scoped_lock l(this->my_mutex, false);
+            typename my_successors_type::iterator i = this->my_successors.begin();
+            while ( i != this->my_successors.end() ) {
+               if ( (*i)->try_put( t ) ) {
+                   return true;
+               } else {
+                  if ( (*i)->register_predecessor(*this->my_owner) ) {
+                      if (!upgraded) {
+                          l.upgrade_to_writer();
+                          upgraded = true;
+                      }
+                      i = this->my_successors.erase(i);
+                  }
+                  else {
+                      ++i;
+                  }
+               }
+            }
+            return false;
+        }
+    };
+    
+    template<typename T>
+    class decrementer : public continue_receiver, tbb::internal::no_copy {
+        
+        T *my_node;
+        
+        void execute() {
+            my_node->decrement_counter();
+        }
+        
+    public:
+       
+        typedef continue_msg input_type;
+        typedef continue_msg output_type;
+        decrementer( int number_of_predecessors = 0 ) : continue_receiver( number_of_predecessors ) { }
+        void set_owner( T *node ) { my_node = node; }
+    };
+    
+}
+
+#endif
+
diff --git a/tbb/include/tbb/internal/_flow_graph_item_buffer_impl.h b/tbb/include/tbb/internal/_flow_graph_item_buffer_impl.h
new file mode 100644 (file)
index 0000000..e235fd1
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_item_buffer_H
+#define __TBB_item_buffer_H
+
+    //! Expandable buffer of items.  The possible operations are push, pop,
+    //! tests for empty and so forth.  No mutual exclusion is built in.
+    template <typename T, typename A=cache_aligned_allocator<T> >
+    class item_buffer {
+    public:
+        typedef T input_type;
+        typedef T output_type;
+    protected:
+        typedef size_t size_type;
+        typedef std::pair< T, bool > item_type;
+        typedef typename A::template rebind<item_type>::other allocator_type;
+
+        item_type *my_array;
+        size_type my_array_size;
+        static const size_type initial_buffer_size = 4;
+        size_type my_head;
+        size_type my_tail;
+
+        bool buffer_empty() { return my_head == my_tail; }
+
+        item_type &item(size_type i) { return my_array[i & (my_array_size - 1) ]; } // may not be marked valid
+
+        bool item_valid(size_type i) { return item(i).second; }
+
+        void fetch_front(T &v) { __TBB_ASSERT(item_valid(my_head), "front not valid"); v = item(my_head).first; }
+        void fetch_back(T &v) { __TBB_ASSERT(item_valid(my_tail-1), "back not valid"); v = item(my_tail-1).first; }
+
+        void invalidate(size_type i) { __TBB_ASSERT(item_valid(i), "Item not valid"); item(i).second = false; }
+        void validate(size_type i) { __TBB_ASSERT(!item_valid(i), "Item already valid"); item(i).second = true; }
+
+        void invalidate_front() { invalidate(my_head); }
+        void validate_front() { validate(my_head); }
+        void invalidate_back() { invalidate(my_tail-1); }
+
+        size_type size() { return my_tail - my_head; }
+        size_type capacity() { return my_array_size; }
+        bool buffer_full() { return size() == capacity(); }
+
+        //! Grows the internal array.
+        void grow_my_array( size_t minimum_size ) {
+            size_type old_size = my_array_size;
+            size_type new_size = old_size ? 2*old_size : initial_buffer_size;
+            while( new_size<minimum_size )
+                new_size*=2;
+
+            item_type* new_array = allocator_type().allocate(new_size);
+            item_type* old_array = my_array;
+
+            for( size_type i=0; i<new_size; ++i ) {
+                new (&(new_array[i].first)) input_type;
+                new_array[i].second = false;
+            }
+
+            size_t t=my_head;
+            for( size_type i=0; i<old_size; ++i, ++t )
+                new_array[t&(new_size-1)] = old_array[t&(old_size-1)];
+            my_array = new_array;
+            my_array_size = new_size;
+            if( old_array ) {
+                for( size_type i=0; i<old_size; ++i, ++t )
+                    old_array[i].first.~input_type();
+                allocator_type().deallocate(old_array,old_size);
+            }
+        }
+
+        bool push_back(T &v) {
+            if(buffer_full()) {
+                grow_my_array(size() + 1);
+            }
+            item(my_tail) = std::make_pair( v, true );
+            ++my_tail;
+            return true;
+        }
+
+        bool pop_back(T &v) {
+            if (!item_valid(my_tail-1)) {
+                return false;
+            }
+            fetch_back(v);
+            invalidate_back();
+            --my_tail;
+            return true;
+        }
+
+        bool pop_front(T &v) {
+            if(!item_valid(my_head)) {
+                return false;
+            }
+            fetch_front(v);
+            invalidate_front();
+            ++my_head;
+            return true;
+        }
+
+    public:
+        //! Constructor
+        item_buffer( ) : my_array(NULL), my_array_size(0),
+            my_head(0), my_tail(0) {
+            grow_my_array(initial_buffer_size);
+        }
+
+        ~item_buffer() {
+            if (my_array) {
+                for( size_type i=0; i<my_array_size; ++i ) {
+                    my_array[i].first.~input_type();
+                }
+                allocator_type().deallocate(my_array,my_array_size); 
+            }
+        }
+
+    };
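+
+    // Editorial note (illustration only): my_array_size is always a power of two, so
+    // item(i) maps the monotonically growing my_head/my_tail counters onto the array
+    // with i & (my_array_size - 1). For example, with capacity 4, my_head == 5 and
+    // my_tail == 7, the two live items occupy array slots 1 and 2.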
+
+    //! item_buffer with reservable front-end.  NOTE: if reserving, do not
+    //! complete the operation with pop_front(); use consume_front().
+    //! No synchronization built in.
+    template<typename T, typename A=cache_aligned_allocator<T> >
+    class reservable_item_buffer : public item_buffer<T, A> {
+    protected:
+        using item_buffer<T, A>::buffer_empty;
+        using item_buffer<T, A>::fetch_front;
+        using item_buffer<T, A>::invalidate_front;
+        using item_buffer<T, A>::validate_front;
+        using item_buffer<T, A>::item_valid;
+        using item_buffer<T, A>::my_head;
+
+    public:
+        reservable_item_buffer() : item_buffer<T, A>(), my_reserved(false) {}
+    protected:
+
+        bool reserve_front(T &v) {
+            if(my_reserved || !item_valid(my_head)) return false;
+            my_reserved = true;
+            // reserving the head
+            fetch_front(v);
+            // invalidate the head, but don't commit until consume is called
+            invalidate_front();
+            return true;
+        }
+
+        void consume_front() {
+            __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item");
+            ++my_head;
+            my_reserved = false;
+        }
+
+        void release_front() {
+            __TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item");
+            validate_front();
+            my_reserved = false;
+        }
+
+        bool my_reserved;
+    };
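+
+    // Editorial sketch (not part of the original sources): intended two-phase use of the
+    // reservable front-end by a derived buffer node, assuming a member `buf` of this type:
+    //
+    //     T v;
+    //     if ( buf.reserve_front(v) ) {              // peek at and lock the head item
+    //         if ( successor_accepted(v) ) buf.consume_front();   // commit the pop
+    //         else                         buf.release_front();   // undo; item stays
+    //     }
+    //
+    // successor_accepted is a hypothetical caller-side predicate.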
+
+#endif // __TBB_item_buffer_H
diff --git a/tbb/include/tbb/internal/_flow_graph_join_impl.h b/tbb/include/tbb/internal/_flow_graph_join_impl.h
new file mode 100644 (file)
index 0000000..80b38cf
--- /dev/null
@@ -0,0 +1,1695 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB__graph_join_internal_H
+#define __TBB__graph_join_internal_H
+
+namespace internal {
+
+    typedef size_t tag_value;
+    static const tag_value NO_TAG = tag_value(-1);
+
+    struct forwarding_base {
+        forwarding_base(task *rt) : my_root_task(rt), current_tag(NO_TAG) {}
+        virtual ~forwarding_base() {}
+        virtual void decrement_port_count() = 0;
+        virtual void increment_port_count() = 0;
+        virtual void increment_tag_count(tag_value /*t*/) {}
+        // moved here so input ports can queue tasks
+        task* my_root_task;
+        tag_value current_tag; // so ports can refer to FE's desired items
+    };
+
+    template< int N >
+    struct join_helper {
+
+        template< typename TupleType, typename PortType >
+        static inline void set_join_node_pointer(TupleType &my_input, PortType *port) {
+            std::get<N-1>( my_input ).set_join_node_pointer(port);
+            join_helper<N-1>::set_join_node_pointer( my_input, port );
+        }
+        template< typename TupleType >
+        static inline void consume_reservations( TupleType &my_input ) {
+            std::get<N-1>( my_input ).consume();
+            join_helper<N-1>::consume_reservations( my_input );
+        }
+
+        template< typename TupleType >
+        static inline void release_my_reservation( TupleType &my_input ) {
+            std::get<N-1>( my_input ).release();
+        }
+
+        template <typename TupleType>
+        static inline void release_reservations( TupleType &my_input) {
+            join_helper<N-1>::release_reservations(my_input);
+            release_my_reservation(my_input);
+        }
+
+        template< typename InputTuple, typename OutputTuple >
+        static inline bool reserve( InputTuple &my_input, OutputTuple &out) {
+            if ( !std::get<N-1>( my_input ).reserve( std::get<N-1>( out ) ) ) return false;
+            if ( !join_helper<N-1>::reserve( my_input, out ) ) {
+                release_my_reservation( my_input );
+                return false;
+            }
+            return true;
+        }
+
+        template<typename InputTuple, typename OutputTuple>
+        static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) {
+            bool res = std::get<N-1>(my_input).get_item(std::get<N-1>(out) ); // may fail
+            return join_helper<N-1>::get_my_item(my_input, out) && res;       // do get on other inputs before returning
+        }
+
+        template<typename InputTuple, typename OutputTuple>
+        static inline bool get_items(InputTuple &my_input, OutputTuple &out) {
+            return get_my_item(my_input, out);
+        }
+
+        template<typename InputTuple>
+        static inline void reset_my_port(InputTuple &my_input) {
+            join_helper<N-1>::reset_my_port(my_input);
+            std::get<N-1>(my_input).reset_port();
+        }
+
+        template<typename InputTuple>
+        static inline void reset_ports(InputTuple& my_input) {
+            reset_my_port(my_input);
+        }
+
+        template<typename InputTuple, typename TagFuncTuple>
+        static inline void set_tag_func(InputTuple &my_input, TagFuncTuple &my_tag_funcs) {
+            std::get<N-1>(my_input).set_my_original_tag_func(std::get<N-1>(my_tag_funcs));
+            std::get<N-1>(my_input).set_my_tag_func(std::get<N-1>(my_input).my_original_func()->clone());
+            std::get<N-1>(my_tag_funcs) = NULL;
+            join_helper<N-1>::set_tag_func(my_input, my_tag_funcs);
+        }
+
+        template< typename TagFuncTuple1, typename TagFuncTuple2>
+        static inline void copy_tag_functors(TagFuncTuple1 &my_inputs, TagFuncTuple2 &other_inputs) {
+            if(std::get<N-1>(other_inputs).my_original_func()) {
+                std::get<N-1>(my_inputs).set_my_tag_func(std::get<N-1>(other_inputs).my_original_func()->clone());
+                std::get<N-1>(my_inputs).set_my_original_tag_func(std::get<N-1>(other_inputs).my_original_func()->clone());
+            }
+            join_helper<N-1>::copy_tag_functors(my_inputs, other_inputs);
+        }
+    };
+
+    template< >
+    struct join_helper<1> {
+
+        template< typename TupleType, typename PortType >
+        static inline void set_join_node_pointer(TupleType &my_input, PortType *port) {
+            std::get<0>( my_input ).set_join_node_pointer(port);
+        }
+
+        template< typename TupleType >
+        static inline void consume_reservations( TupleType &my_input ) {
+            std::get<0>( my_input ).consume();
+        }
+
+        template< typename TupleType >
+        static inline void release_my_reservation( TupleType &my_input ) {
+            std::get<0>( my_input ).release();
+        }
+
+        template<typename TupleType>
+        static inline void release_reservations( TupleType &my_input) {
+            release_my_reservation(my_input);
+        }
+
+        template< typename InputTuple, typename OutputTuple >
+        static inline bool reserve( InputTuple &my_input, OutputTuple &out) {
+            return std::get<0>( my_input ).reserve( std::get<0>( out ) );
+        }
+
+        template<typename InputTuple, typename OutputTuple>
+        static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) {
+            return std::get<0>(my_input).get_item(std::get<0>(out));
+        }
+
+        template<typename InputTuple, typename OutputTuple>
+        static inline bool get_items(InputTuple &my_input, OutputTuple &out) {
+            return get_my_item(my_input, out);
+        }
+
+        template<typename InputTuple>
+        static inline void reset_my_port(InputTuple &my_input) {
+            std::get<0>(my_input).reset_port();
+        }
+
+        template<typename InputTuple>
+        static inline void reset_ports(InputTuple& my_input) {
+            reset_my_port(my_input);
+        }
+
+        template<typename InputTuple, typename TagFuncTuple>
+        static inline void set_tag_func(InputTuple &my_input, TagFuncTuple &my_tag_funcs) {
+            std::get<0>(my_input).set_my_original_tag_func(std::get<0>(my_tag_funcs));
+            std::get<0>(my_input).set_my_tag_func(std::get<0>(my_input).my_original_func()->clone());
+            std::get<0>(my_tag_funcs) = NULL;
+        }
+
+        template< typename TagFuncTuple1, typename TagFuncTuple2>
+        static inline void copy_tag_functors(TagFuncTuple1 &my_inputs, TagFuncTuple2 &other_inputs) {
+            if(std::get<0>(other_inputs).my_original_func()) {
+                std::get<0>(my_inputs).set_my_tag_func(std::get<0>(other_inputs).my_original_func()->clone());
+                std::get<0>(my_inputs).set_my_original_tag_func(std::get<0>(other_inputs).my_original_func()->clone());
+            }
+        }
+    };
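+
+    // join_helper<N> and the join_helper<1> base case above walk the input-port tuple at
+    // compile time: each operation acts on std::get<N-1>(...) and then recurses into
+    // join_helper<N-1>, so reserve, get_items, reset_ports, etc. touch every port without
+    // a runtime loop.  For example, join_helper<3>::reserve(in, out) effectively performs
+    //     get<2>(in).reserve(get<2>(out)), then get<1>(in).reserve(...), then get<0>(in).reserve(...)
+    // releasing the reservations already taken if a later one fails.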
+
+    //! The two-phase join port
+    template< typename T >
+    class reserving_port : public receiver<T> {
+    public:
+        typedef T input_type;
+        typedef sender<T> predecessor_type;
+    private:
+        // ----------- Aggregator ------------
+        enum op_type { reg_pred, rem_pred, res_item, rel_res, con_res };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef reserving_port<T> my_class;
+
+        class reserving_port_operation : public aggregated_operation<reserving_port_operation> {
+        public:
+            char type;
+            union {
+                T *my_arg;
+                predecessor_type *my_pred;
+            };
+            reserving_port_operation(const T& e, op_type t) :
+                type(char(t)), my_arg(const_cast<T*>(&e)) {}
+            reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), 
+                my_pred(const_cast<predecessor_type *>(&s)) {}
+            reserving_port_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, reserving_port_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, reserving_port_operation>;
+        aggregator<my_handler, reserving_port_operation> my_aggregator;
+
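+        // handle_operations runs under the aggregator: operations submitted via execute()
+        // are queued, and a single thread drains the queue here, so the port state
+        // (my_predecessors, reserved) is mutated without an explicit lock.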
+        void handle_operations(reserving_port_operation* op_list) {
+            reserving_port_operation *current;
+            bool no_predecessors;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+                case reg_pred:
+                    no_predecessors = my_predecessors.empty();
+                    my_predecessors.add(*(current->my_pred));
+                    if ( no_predecessors ) {
+                        my_join->decrement_port_count( ); // may try to forward
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case rem_pred:
+                    my_predecessors.remove(*(current->my_pred));
+                    if(my_predecessors.empty()) my_join->increment_port_count();
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case res_item:
+                    if ( reserved ) {
+                        __TBB_store_with_release(current->status, FAILED);
+                    }
+                    else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) {
+                        reserved = true;
+                        __TBB_store_with_release(current->status, SUCCEEDED);
+                    } else {
+                        if ( my_predecessors.empty() ) {
+                            my_join->increment_port_count();
+                        }
+                        __TBB_store_with_release(current->status, FAILED);
+                    }
+                    break;
+                case rel_res:
+                    reserved = false;
+                    my_predecessors.try_release( );
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case con_res:
+                    reserved = false;
+                    my_predecessors.try_consume( );
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                }
+            }
+        }
+
+    public:
+
+        //! Constructor
+        reserving_port() : reserved(false) {
+            my_join = NULL;
+            my_predecessors.set_owner( this );
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        // copy constructor
+        reserving_port(const reserving_port& /* other */) : receiver<T>() {
+            reserved = false;
+            my_join = NULL;
+            my_predecessors.set_owner( this );
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        void set_join_node_pointer(forwarding_base *join) {
+            my_join = join;
+        }
+
+        // always rejects, so the arc is reversed (and reservations can be made)
+        bool try_put( const T & ) {
+            return false;
+        }
+
+        //! Add a predecessor
+        bool register_predecessor( sender<T> &src ) {
+            reserving_port_operation op_data(src, reg_pred);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        //! Remove a predecessor
+        bool remove_predecessor( sender<T> &src ) {
+            reserving_port_operation op_data(src, rem_pred);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        //! Reserve an item from the port
+        bool reserve( T &v ) {
+            reserving_port_operation op_data(v, res_item);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        //! Release the port
+        void release( ) {
+            reserving_port_operation op_data(rel_res);
+            my_aggregator.execute(&op_data);
+        }
+
+        //! Complete use of the port
+        void consume( ) {
+            reserving_port_operation op_data(con_res);
+            my_aggregator.execute(&op_data);
+        }
+
+    private:
+        forwarding_base *my_join;
+        reservable_predecessor_cache< T, null_mutex > my_predecessors;
+        bool reserved;
+    };
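+
+    // A reserving_port never buffers items: try_put always returns false, which reverses
+    // the edge so the port can later pull from its predecessors via reserve().  When a
+    // complete tuple is forwarded the join consume()s every reservation; if the successors
+    // reject the tuple, it release()s them instead.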
+
+    //! queueing join_port
+    template<typename T>
+    class queueing_port : public receiver<T>, public item_buffer<T> {
+    public:
+        typedef T input_type;
+        typedef sender<T> predecessor_type;
+        typedef queueing_port<T> my_node_type;
+
+    // ----------- Aggregator ------------
+    private:
+        enum op_type { try__put, get__item, res_port };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef queueing_port<T> my_class;
+
+        class queueing_port_operation : public aggregated_operation<queueing_port_operation> {
+        public:
+            char type;
+            union {
+                T my_val;
+                T *my_arg;
+            };
+            // constructor for value parameter
+            queueing_port_operation(const T& e, op_type t) :
+                type(char(t)), my_val(e) {}
+            // constructor for pointer parameter
+            queueing_port_operation(const T* p, op_type t) :
+                type(char(t)), my_arg(const_cast<T*>(p)) {}
+            // constructor with no parameter
+            queueing_port_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, queueing_port_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, queueing_port_operation>;
+        aggregator<my_handler, queueing_port_operation> my_aggregator;
+
+        void handle_operations(queueing_port_operation* op_list) {
+            queueing_port_operation *current;
+            bool was_empty;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+                case try__put:
+                    was_empty = this->buffer_empty();
+                    this->push_back(current->my_val);
+                    if (was_empty) my_join->decrement_port_count();
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case get__item:
+                    if(!this->buffer_empty()) {
+                        this->fetch_front(*(current->my_arg));
+                        __TBB_store_with_release(current->status, SUCCEEDED);
+                    }
+                    else {
+                        __TBB_store_with_release(current->status, FAILED);
+                    }
+                    break;
+                case res_port:
+                    __TBB_ASSERT(this->item_valid(this->my_head), "No item to reset");
+                    this->invalidate_front(); ++(this->my_head);
+                    if(this->item_valid(this->my_head)) {
+                        my_join->decrement_port_count();
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                }
+            }
+        }
+    // ------------ End Aggregator ---------------
+    public:
+
+        //! Constructor
+        queueing_port() : item_buffer<T>() {
+            my_join = NULL;
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        //! copy constructor
+        queueing_port(const queueing_port& /* other */) : receiver<T>(), item_buffer<T>() {
+            my_join = NULL;
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        //! record parent for tallying available items
+        void set_join_node_pointer(forwarding_base *join) {
+            my_join = join;
+        }
+
+        /*override*/bool try_put(const T &v) {
+            queueing_port_operation op_data(v, try__put);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+
+        bool get_item( T &v ) {
+            queueing_port_operation op_data(&v, get__item);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        // reset_port is called when an item is accepted by a successor, but
+        // the call is initiated by the join_node.
+        void reset_port() {
+            queueing_port_operation op_data(res_port);
+            my_aggregator.execute(&op_data);
+            return;
+        }
+
+    private:
+        forwarding_base *my_join;
+    };
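+
+    // A queueing_port, by contrast, accepts and buffers every pushed item.  The first item
+    // placed into an empty buffer decrements the join's port count, and reset_port() pops
+    // the forwarded item, decrementing again if another item is already waiting behind it.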
+
+#include "_flow_graph_tagged_buffer_impl.h"
+
+    template< typename T >
+    class tag_matching_port : public receiver<T>, public tagged_buffer< tag_value, T, NO_TAG > {
+    public:
+        typedef T input_type;
+        typedef sender<T> predecessor_type;
+        typedef tag_matching_port<T> my_node_type;  // for forwarding, if needed
+        typedef function_body<input_type, tag_value> my_tag_func_type;
+    // ----------- Aggregator ------------
+    private:
+        enum op_type { try__put, get__item, res_port };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef tag_matching_port<T> my_class;
+
+        class tag_matching_port_operation : public aggregated_operation<tag_matching_port_operation> {
+        public:
+            char type;
+            union {
+                T my_val;
+                T *my_arg;
+            };
+            // constructor for value parameter
+            tag_matching_port_operation(const T& e, op_type t) :
+                type(char(t)), my_val(e) {}
+            // constructor for pointer parameter
+            tag_matching_port_operation(const T* p, op_type t) :
+                type(char(t)), my_arg(const_cast<T*>(p)) {}
+            // constructor with no parameter
+            tag_matching_port_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, tag_matching_port_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, tag_matching_port_operation>;
+        aggregator<my_handler, tag_matching_port_operation> my_aggregator;
+
+        void handle_operations(tag_matching_port_operation* op_list) {
+            tag_matching_port_operation *current;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+                case try__put: {
+                        tag_value tval = (*my_tag_func)(current->my_val);
+                        bool was_inserted = this->tagged_insert(tval, current->my_val);
+                        if(was_inserted) {
+                            // report the tag to join_node_FE
+                            my_join->increment_tag_count(tval);  // may spawn
+                        }
+                        // should we make it an error to insert a tag twice?
+                        __TBB_store_with_release(current->status, SUCCEEDED);
+                    }
+                    break;
+                case get__item:
+                    // use current_tag from FE for item
+                    if(!this->tagged_find(my_join->current_tag, *(current->my_arg))) {
+                        __TBB_ASSERT(false, "Failed to find item corresponding to current_tag.");
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case res_port:
+                    // use current_tag from FE for item
+                    this->tagged_delete(my_join->current_tag);
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                }
+            }
+        }
+// ------------ End Aggregator ---------------
+    public:
+
+        tag_matching_port() : receiver<T>(), tagged_buffer<tag_value, T, NO_TAG>() {
+            my_join = NULL;
+            my_tag_func = NULL;
+            my_original_tag_func = NULL;
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        // copy constructor
+        tag_matching_port(const tag_matching_port& /*other*/) : receiver<T>(), tagged_buffer<tag_value,T, NO_TAG>() {
+            my_join = NULL;
+            // setting the tag methods is done in the copy-constructor for the front-end.
+            my_tag_func = NULL;
+            my_original_tag_func = NULL;
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        ~tag_matching_port() {
+            if (my_tag_func) delete my_tag_func;
+            if (my_original_tag_func) delete my_original_tag_func;
+        }
+
+        void set_join_node_pointer(forwarding_base *join) {
+            my_join = join;
+        }
+
+        void set_my_original_tag_func(my_tag_func_type *f) {
+            my_original_tag_func = f;
+        }
+
+        void set_my_tag_func(my_tag_func_type *f) {
+            my_tag_func = f;
+        }
+
+        /*override*/bool try_put(const T& v) {
+            tag_matching_port_operation op_data(v, try__put);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+
+        bool get_item( T &v ) {
+            tag_matching_port_operation op_data(&v, get__item);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        // reset_port is called when an item is accepted by a successor, but
+        // the call is initiated by the join_node.
+        void reset_port() {
+            tag_matching_port_operation op_data(res_port);
+            my_aggregator.execute(&op_data);
+            return;
+        }
+
+        my_tag_func_type *my_func() { return my_tag_func; }
+        my_tag_func_type *my_original_func() { return my_original_tag_func; }
+
+    private:
+        // need map of tags to values
+        forwarding_base *my_join;
+        my_tag_func_type *my_tag_func;
+        my_tag_func_type *my_original_tag_func;
+    };  // tag_matching_port
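+
+    // A tag_matching_port runs each incoming item through the user-supplied tag function
+    // and files it in a tagged_buffer keyed by the resulting tag_value; it then reports the
+    // tag to the front-end, which counts how many ports hold that tag and forwards the
+    // tuple once all of them do.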
+
+    using namespace graph_policy_namespace;
+
+    template<graph_buffer_policy JP, typename InputTuple, typename OutputTuple>
+    class join_node_base;
+
+    //! join_node_FE : implements input port policy
+    template<graph_buffer_policy JP, typename InputTuple, typename OutputTuple>
+    class join_node_FE;
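+    // Each policy tracks readiness differently: the reserving front-end counts ports that
+    // still have no predecessor, the queueing front-end counts ports that still have no
+    // item, and the tag_matching front-end counts, per tag, how many ports hold an item
+    // with that tag.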
+
+    template<typename InputTuple, typename OutputTuple>
+    class join_node_FE<reserving, InputTuple, OutputTuple> : public forwarding_base {
+    public:
+        static const int N = std::tuple_size<OutputTuple>::value;
+        typedef OutputTuple output_type;
+        typedef InputTuple input_type;
+        typedef join_node_base<reserving, InputTuple, OutputTuple> my_node_type; // for forwarding
+
+        join_node_FE(graph &g) : forwarding_base(g.root_task()), my_node(NULL) {
+            ports_with_no_inputs = N;
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+        }
+
+        join_node_FE(const join_node_FE& other) : forwarding_base(other.my_root_task), my_node(NULL) {
+            ports_with_no_inputs = N;
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+        }
+
+        void set_my_node(my_node_type *new_my_node) { my_node = new_my_node; }
+
+        void increment_port_count() {
+            ++ports_with_no_inputs;
+        }
+
+        // if all input_ports have predecessors, spawn forward to try and consume tuples
+        void decrement_port_count() {
+            if(ports_with_no_inputs.fetch_and_decrement() == 1) {
+                task::enqueue( * new ( task::allocate_additional_child_of( *(this->my_root_task) ) )
+                    forward_task<my_node_type>(*my_node) );
+            }
+        }
+
+        input_type &inputs() { return my_inputs; }
+    protected:
+        // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+        bool tuple_build_may_succeed() {
+            return !ports_with_no_inputs;
+        }
+
+        bool try_to_make_tuple(output_type &out) {
+            if(ports_with_no_inputs) return false;
+            return join_helper<N>::reserve(my_inputs, out);
+        }
+
+        void tuple_accepted() {
+            join_helper<N>::consume_reservations(my_inputs);
+        }
+        void tuple_rejected() {
+            join_helper<N>::release_reservations(my_inputs);
+        }
+
+        input_type my_inputs;
+        my_node_type *my_node;
+        atomic<size_t> ports_with_no_inputs;
+    };
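+
+    // ports_with_no_inputs reaches zero only once every port has at least one predecessor;
+    // the decrement that takes it to zero enqueues a forward_task, whose forward() loops
+    // building tuples (reserve, try_put to successors, then consume or release) until a
+    // build or a put fails.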
+
+    template<typename InputTuple, typename OutputTuple>
+    class join_node_FE<queueing, InputTuple, OutputTuple> : public forwarding_base {
+    public:
+        static const int N = std::tuple_size<OutputTuple>::value;
+        typedef OutputTuple output_type;
+        typedef InputTuple input_type;
+        typedef join_node_base<queueing, InputTuple, OutputTuple> my_node_type; // for forwarding
+
+        join_node_FE(graph &g) : forwarding_base(g.root_task()), my_node(NULL) {
+            ports_with_no_items = N;
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+        }
+
+        join_node_FE(const join_node_FE& other) : forwarding_base(other.my_root_task), my_node(NULL) {
+            ports_with_no_items = N;
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+        }
+
+        // needed for forwarding
+        void set_my_node(my_node_type *new_my_node) { my_node = new_my_node; }
+
+        void reset_port_count() {
+            ports_with_no_items = N;
+        }
+
+        // if all input_ports have items, spawn forward to try and consume tuples
+        void decrement_port_count() {
+            if(ports_with_no_items.fetch_and_decrement() == 1) {
+                task::enqueue( * new ( task::allocate_additional_child_of( *(this->my_root_task) ) )
+                    forward_task<my_node_type>(*my_node) );
+            }
+        }
+
+        void increment_port_count() { __TBB_ASSERT(false, NULL); }  // should never be called
+
+        input_type &inputs() { return my_inputs; }
+    protected:
+        // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+        bool tuple_build_may_succeed() {
+            return !ports_with_no_items;
+        }
+
+        bool try_to_make_tuple(output_type &out) {
+            if(ports_with_no_items) return false;
+            return join_helper<N>::get_items(my_inputs, out);
+        }
+
+        void tuple_accepted() {
+            reset_port_count();
+            join_helper<N>::reset_ports(my_inputs);
+        }
+        void tuple_rejected() {
+            // nothing to do.
+        }
+
+        input_type my_inputs;
+        my_node_type *my_node;
+        atomic<size_t> ports_with_no_items;
+    };
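+
+    // Here ports_with_no_items is reset to N after each forwarded tuple; a port decrements
+    // it again when its buffer goes from empty to non-empty (or still holds another item
+    // after the forwarded one is popped), so a new forward_task is spawned only when every
+    // port is non-empty.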
+
+    // tag_matching join input port.
+    template<typename InputTuple, typename OutputTuple>
+    class join_node_FE<tag_matching, InputTuple, OutputTuple> : public forwarding_base, public tagged_buffer<tag_value, size_t, NO_TAG> {
+    public:
+        static const int N = std::tuple_size<OutputTuple>::value;
+        typedef OutputTuple output_type;
+        typedef InputTuple input_type;
+        typedef tagged_buffer<tag_value, size_t, NO_TAG> my_tag_buffer;
+        typedef join_node_base<tag_matching, InputTuple, OutputTuple> my_node_type; // for forwarding
+
+// ----------- Aggregator ------------
+        // the aggregator is only needed to serialize access to the hash table
+        // and the current_tag field.
+    private:
+        enum op_type { res_count, inc_count, may_succeed, try_make };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef join_node_FE<tag_matching, InputTuple, OutputTuple> my_class;
+
+        class tag_matching_FE_operation : public aggregated_operation<tag_matching_FE_operation> {
+        public:
+            char type;
+            union {
+                tag_value my_val;
+                output_type* my_output;
+            };
+            // constructor for value parameter
+            tag_matching_FE_operation(const tag_value& e, op_type t) :
+                type(char(t)), my_val(e) {}
+            tag_matching_FE_operation(output_type *p, op_type t) :
+                type(char(t)), my_output(p) {}
+            // constructor with no parameter
+            tag_matching_FE_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, tag_matching_FE_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, tag_matching_FE_operation>;
+        aggregator<my_handler, tag_matching_FE_operation> my_aggregator;
+
+        void handle_operations(tag_matching_FE_operation* op_list) {
+            tag_matching_FE_operation *current;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+                case res_count:
+                    this->current_tag = NO_TAG;
+                    if(find_value_tag(this->current_tag,N)) {
+                        this->tagged_delete(this->current_tag);
+                        task::enqueue( * new ( task::allocate_additional_child_of( *(this->my_root_task) ) )
+                                forward_task<my_node_type>(*my_node) );
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case inc_count: {
+                        size_t *p = 0;
+                        tag_value t = current->my_val;
+                        if(!(this->tagged_find_ref(t,p))) {
+                            this->tagged_insert(t, 0);
+                            if(!(this->tagged_find_ref(t,p))) {
+                                __TBB_ASSERT(false, NULL);
+                            }
+                        }
+                        if(++(*p) == size_t(N) && this->current_tag == NO_TAG) {
+                            // all items of tuple are available.
+                            this->current_tag = t;
+                            this->tagged_delete(t);
+                            task::enqueue( * new ( task::allocate_additional_child_of( *(this->my_root_task) ) )
+                                forward_task<my_node_type>(*my_node) );
+                        }
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case may_succeed:
+                    if(this->current_tag == NO_TAG) {
+                        __TBB_store_with_release(current->status, FAILED);
+                    }
+                    else {
+                        __TBB_store_with_release(current->status, SUCCEEDED);
+                    }
+                    break;
+                case try_make:
+                    if(this->current_tag == NO_TAG) {
+                        __TBB_store_with_release(current->status, FAILED);
+                    }
+                    else {
+                        if(join_helper<N>::get_items(my_inputs, *(current->my_output))) {
+                            __TBB_store_with_release(current->status, SUCCEEDED);
+                        }
+                        else {
+                            __TBB_ASSERT(false, NULL); // should not be asked to make a tuple unless all items are available.
+                        }
+                    }
+                    break;
+                }
+            }
+        }
+// ------------ End Aggregator ---------------
+
+    public:
+        template<typename FunctionTuple>
+        join_node_FE(graph &g, FunctionTuple tag_funcs) : forwarding_base(g.root_task()), my_node(NULL) {
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+            join_helper<N>::set_tag_func(my_inputs, tag_funcs);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        join_node_FE(const join_node_FE& other) : forwarding_base(other.my_root_task), my_tag_buffer(), my_node(NULL) {
+            join_helper<N>::set_join_node_pointer(my_inputs, this);
+            join_helper<N>::copy_tag_functors(my_inputs, const_cast<input_type &>(other.my_inputs));
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        // needed for forwarding
+        void set_my_node(my_node_type *new_my_node) { my_node = new_my_node; }
+
+        void reset_port_count() {
+            // reset while current_tag has old value.  The current_tag value is still valid, and will
+            // not be reset until we call into res_count in the aggregator.
+            // called from aggregator of back-end of join node (via tuple_accepted()), so this is serial on join.
+            join_helper<N>::reset_ports(my_inputs);
+            // only the hash table ops need to be serialized on our aggregator.
+            tag_matching_FE_operation op_data(res_count);
+            my_aggregator.execute(&op_data);
+            return;
+        }
+
+        // if all input_ports have items, spawn forward to try and consume tuples
+        void increment_tag_count(tag_value t) {
+            tag_matching_FE_operation op_data(t, inc_count);
+            my_aggregator.execute(&op_data);
+            return;
+        }
+
+        void decrement_port_count() { __TBB_ASSERT(false, NULL); }
+
+        void increment_port_count() { __TBB_ASSERT(false, NULL); }  // should never be called
+
+        input_type &inputs() { return my_inputs; }
+    protected:
+        // all methods on input ports should be called under mutual exclusion from join_node_base.
+
+        bool tuple_build_may_succeed() {
+            tag_matching_FE_operation op_data(may_succeed);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        // cannot lock while calling back to input_ports.  current_tag will only be set
+        // and reset under the aggregator, so it will remain consistent.
+        bool try_to_make_tuple(output_type &out) {
+            if(this->current_tag == NO_TAG) {
+                return false;
+            }
+            if(join_helper<N>::get_items(my_inputs, out)) {
+                return true;
+            }
+            __TBB_ASSERT(false, NULL); // should not be asked to make a tuple unless all items are available.
+            return false;
+        }
+
+        void tuple_accepted() {
+            reset_port_count();  // reset current_tag after ports reset.
+        }
+
+        void tuple_rejected() {
+            // nothing to do.
+        }
+
+        input_type my_inputs;  // input ports
+        my_node_type *my_node;
+    }; // join_node_FE<tag_matching, InputTuple, OutputTuple>
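+
+    // Unlike the other front-ends, the tag_matching front-end does not count ports at all:
+    // increment_port_count and decrement_port_count are never called (hence the asserts
+    // above), and forwarding is triggered from increment_tag_count once a tag has been
+    // reported by all N ports.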
+
+    //! join_node_base
+    template<graph_buffer_policy JP, typename InputTuple, typename OutputTuple>
+    class join_node_base : public graph_node, public join_node_FE<JP, InputTuple, OutputTuple>,
+                           public sender<OutputTuple> {
+    public:
+        typedef OutputTuple output_type;
+
+        typedef receiver<output_type> successor_type;
+        typedef join_node_FE<JP, InputTuple, OutputTuple> input_ports_type;
+        using input_ports_type::tuple_build_may_succeed;
+        using input_ports_type::try_to_make_tuple;
+        using input_ports_type::tuple_accepted;
+        using input_ports_type::tuple_rejected;
+
+    private:
+        // ----------- Aggregator ------------
+        enum op_type { reg_succ, rem_succ, try__get, do_fwrd };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef join_node_base<JP,InputTuple,OutputTuple> my_class;
+
+        class join_node_base_operation : public aggregated_operation<join_node_base_operation> {
+        public:
+            char type;
+            union {
+                output_type *my_arg;
+                successor_type *my_succ;
+            };
+            join_node_base_operation(const output_type& e, op_type t) :
+                type(char(t)), my_arg(const_cast<output_type*>(&e)) {}
+            join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), 
+                my_succ(const_cast<successor_type *>(&s)) {}
+            join_node_base_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, join_node_base_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, join_node_base_operation>;
+        bool forwarder_busy;
+        aggregator<my_handler, join_node_base_operation> my_aggregator;
+
+        void handle_operations(join_node_base_operation* op_list) {
+            join_node_base_operation *current;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+                case reg_succ:
+                    my_successors.register_successor(*(current->my_succ));
+                    if(tuple_build_may_succeed() && !forwarder_busy) {
+                        task::enqueue( * new ( task::allocate_additional_child_of(*(this->my_root_task)) )
+                                forward_task<join_node_base<JP,InputTuple,OutputTuple> >(*this));
+                        forwarder_busy = true;
+                    }
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case rem_succ:
+                    my_successors.remove_successor(*(current->my_succ));
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+                case try__get:
+                    if(tuple_build_may_succeed()) {
+                        if(try_to_make_tuple(*(current->my_arg))) {
+                            tuple_accepted();
+                            __TBB_store_with_release(current->status, SUCCEEDED);
+                        }
+                        else __TBB_store_with_release(current->status, FAILED);
+                    }
+                    else __TBB_store_with_release(current->status, FAILED);
+                    break;
+                case do_fwrd: {
+                        bool build_succeeded;
+                        output_type out;
+                        if(tuple_build_may_succeed()) {
+                            do {
+                                build_succeeded = try_to_make_tuple(out);
+                                if(build_succeeded) {
+                                    if(my_successors.try_put(out)) {
+                                        tuple_accepted();
+                                    }
+                                    else {
+                                        tuple_rejected();
+                                        build_succeeded = false;
+                                    }
+                                }
+                            } while(build_succeeded);
+                        }
+                        __TBB_store_with_release(current->status, SUCCEEDED);
+                        forwarder_busy = false;
+                    }
+                    break;
+                }
+            }
+        }
+        // ---------- end aggregator -----------
+    public:
+        join_node_base(graph &g) : input_ports_type(g), forwarder_busy(false) {
+            my_successors.set_owner(this);
+            input_ports_type::set_my_node(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        join_node_base(const join_node_base& other) :
+#if ( __TBB_GCC_VERSION < 40202 )
+            graph_node(),
+#endif
+            input_ports_type(other),
+#if ( __TBB_GCC_VERSION < 40202 )
+            sender<OutputTuple>(),
+#endif
+            forwarder_busy(false), my_successors() {
+            my_successors.set_owner(this);
+            input_ports_type::set_my_node(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        template<typename FunctionTuple>
+        join_node_base(graph &g, FunctionTuple f) : input_ports_type(g, f), forwarder_busy(false) {
+            my_successors.set_owner(this);
+            input_ports_type::set_my_node(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        bool register_successor(successor_type &r) {
+            join_node_base_operation op_data(r, reg_succ);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        bool remove_successor( successor_type &r) {
+            join_node_base_operation op_data(r, rem_succ);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        bool try_get( output_type &v) {
+            join_node_base_operation op_data(v, try__get);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+    private:
+        broadcast_cache<output_type, null_rw_mutex> my_successors;
+
+        friend class forward_task< join_node_base<JP, InputTuple, OutputTuple> >;
+
+        void forward() {
+            join_node_base_operation op_data(do_fwrd);
+            my_aggregator.execute(&op_data);
+        }
+    };
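+
+    // join_node_base ties a front-end (the input ports) to a broadcast_cache of successors
+    // and serializes successor registration, try_get and forwarding through its own
+    // aggregator.  A minimal usage sketch, assuming the public wrapper in flow_graph.h
+    // (not part of this file) exposes this machinery as tbb::flow::join_node and provides
+    // queue_node, make_edge and input_port<>:
+    //
+    //     tbb::flow::graph g;
+    //     tbb::flow::queue_node<int>   q0(g);
+    //     tbb::flow::queue_node<float> q1(g);
+    //     tbb::flow::join_node< std::tuple<int,float>, tbb::flow::queueing > j(g);
+    //     tbb::flow::make_edge(q0, tbb::flow::input_port<0>(j));
+    //     tbb::flow::make_edge(q1, tbb::flow::input_port<1>(j));
+    //     // j broadcasts a std::tuple<int,float> once both ports hold an item.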
+
+    //! unfolded_join_node : passes input_ports_tuple_type to join_node_base.  We build the input port type
+    //  using tuple_element.  The class PT is the port type (reserving_port, queueing_port, tag_matching_port)
+    //  and should match the graph_buffer_policy.
+    template<int N, template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node;
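+    // Specializations for tuple sizes 2 through 10 follow; each spells out the std::tuple
+    // of port types and forwards construction to join_node_base.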
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<2,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type> >,
+        OutputTuple
+                  >
+                  {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<3,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type> >,
+        OutputTuple
+                    >
+                    {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<4,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type> >,
+        OutputTuple
+                    > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<5,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type> >,
+        OutputTuple
+                > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<6,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type>,
+                PT<typename std::tuple_element<5,OutputTuple>::type> >,
+        OutputTuple
+                    > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type>, 
+                PT<typename std::tuple_element<5,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<7,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type>,
+                PT<typename std::tuple_element<5,OutputTuple>::type>,
+                PT<typename std::tuple_element<6,OutputTuple>::type> >,
+        OutputTuple
+                > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type>, 
+                PT<typename std::tuple_element<5,OutputTuple>::type>, 
+                PT<typename std::tuple_element<6,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<8,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type>,
+                PT<typename std::tuple_element<5,OutputTuple>::type>,
+                PT<typename std::tuple_element<6,OutputTuple>::type>,
+                PT<typename std::tuple_element<7,OutputTuple>::type> >,
+        OutputTuple
+                > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type>, 
+                PT<typename std::tuple_element<5,OutputTuple>::type>, 
+                PT<typename std::tuple_element<6,OutputTuple>::type>, 
+                PT<typename std::tuple_element<7,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<9,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type>,
+                PT<typename std::tuple_element<5,OutputTuple>::type>,
+                PT<typename std::tuple_element<6,OutputTuple>::type>,
+                PT<typename std::tuple_element<7,OutputTuple>::type>,
+                PT<typename std::tuple_element<8,OutputTuple>::type> >,
+        OutputTuple
+                > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type>, 
+                PT<typename std::tuple_element<5,OutputTuple>::type>, 
+                PT<typename std::tuple_element<6,OutputTuple>::type>, 
+                PT<typename std::tuple_element<7,OutputTuple>::type>, 
+                PT<typename std::tuple_element<8,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<template<class> class PT, typename OutputTuple, graph_buffer_policy JP>
+    class unfolded_join_node<10,PT,OutputTuple,JP> : public internal::join_node_base<JP,
+        std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>,
+                PT<typename std::tuple_element<1,OutputTuple>::type>,
+                PT<typename std::tuple_element<2,OutputTuple>::type>,
+                PT<typename std::tuple_element<3,OutputTuple>::type>,
+                PT<typename std::tuple_element<4,OutputTuple>::type>,
+                PT<typename std::tuple_element<5,OutputTuple>::type>,
+                PT<typename std::tuple_element<6,OutputTuple>::type>,
+                PT<typename std::tuple_element<7,OutputTuple>::type>,
+                PT<typename std::tuple_element<8,OutputTuple>::type>,
+                PT<typename std::tuple_element<9,OutputTuple>::type> >,
+        OutputTuple
+                > {
+    public:
+        typedef typename std::tuple<
+                PT<typename std::tuple_element<0,OutputTuple>::type>, 
+                PT<typename std::tuple_element<1,OutputTuple>::type>, 
+                PT<typename std::tuple_element<2,OutputTuple>::type>, 
+                PT<typename std::tuple_element<3,OutputTuple>::type>, 
+                PT<typename std::tuple_element<4,OutputTuple>::type>, 
+                PT<typename std::tuple_element<5,OutputTuple>::type>, 
+                PT<typename std::tuple_element<6,OutputTuple>::type>, 
+                PT<typename std::tuple_element<7,OutputTuple>::type>, 
+                PT<typename std::tuple_element<8,OutputTuple>::type>, 
+                PT<typename std::tuple_element<9,OutputTuple>::type> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<JP, input_ports_tuple_type, output_type > base_type;
+    public:
+        unfolded_join_node(graph &g) : base_type(g) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    // tag_matching unfolded_join_node.  This must be a separate specialization because the constructors
+    // differ.
+
+    template<typename OutputTuple>
+    class unfolded_join_node<2,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type> >,
+        OutputTuple
+                  >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename std::tuple< f0_p, f1_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1>
+        unfolded_join_node(graph &g, B0 b0, B1 b1) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
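+
+    // The function_body leaves allocated above are passed to the ports through
+    // join_helper::set_tag_func: each port takes ownership of the original and keeps a
+    // clone as its working tag function, deleting both in ~tag_matching_port().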
+
+    template<typename OutputTuple>
+    class unfolded_join_node<3,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type> >,
+        OutputTuple
+                  >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>
+            > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<4,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<5,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<6,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<5,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+        typedef typename std::tuple_element<5, OutputTuple>::type T5;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4>, tag_matching_port<T5> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename internal::function_body<T5, tag_value> *f5_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4),
+                    new internal::function_body_leaf<T5, tag_value, B5>(b5)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<7,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<5,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<6,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+        typedef typename std::tuple_element<5, OutputTuple>::type T5;
+        typedef typename std::tuple_element<6, OutputTuple>::type T6;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4>, tag_matching_port<T5>, tag_matching_port<T6>
+            > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename internal::function_body<T5, tag_value> *f5_p;
+        typedef typename internal::function_body<T6, tag_value> *f6_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4),
+                    new internal::function_body_leaf<T5, tag_value, B5>(b5),
+                    new internal::function_body_leaf<T6, tag_value, B6>(b6)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<8,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<5,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<6,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<7,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+        typedef typename std::tuple_element<5, OutputTuple>::type T5;
+        typedef typename std::tuple_element<6, OutputTuple>::type T6;
+        typedef typename std::tuple_element<7, OutputTuple>::type T7;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4>, tag_matching_port<T5>, tag_matching_port<T6>,
+                tag_matching_port<T7> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename internal::function_body<T5, tag_value> *f5_p;
+        typedef typename internal::function_body<T6, tag_value> *f6_p;
+        typedef typename internal::function_body<T7, tag_value> *f7_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4),
+                    new internal::function_body_leaf<T5, tag_value, B5>(b5),
+                    new internal::function_body_leaf<T6, tag_value, B6>(b6),
+                    new internal::function_body_leaf<T7, tag_value, B7>(b7)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<9,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<5,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<6,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<7,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<8,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+        typedef typename std::tuple_element<5, OutputTuple>::type T5;
+        typedef typename std::tuple_element<6, OutputTuple>::type T6;
+        typedef typename std::tuple_element<7, OutputTuple>::type T7;
+        typedef typename std::tuple_element<8, OutputTuple>::type T8;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4>, tag_matching_port<T5>, tag_matching_port<T6>,
+                tag_matching_port<T7>, tag_matching_port<T8> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename internal::function_body<T5, tag_value> *f5_p;
+        typedef typename internal::function_body<T6, tag_value> *f6_p;
+        typedef typename internal::function_body<T7, tag_value> *f7_p;
+        typedef typename internal::function_body<T8, tag_value> *f8_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7, typename B8>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7, B8 b8) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4),
+                    new internal::function_body_leaf<T5, tag_value, B5>(b5),
+                    new internal::function_body_leaf<T6, tag_value, B6>(b6),
+                    new internal::function_body_leaf<T7, tag_value, B7>(b7),
+                    new internal::function_body_leaf<T8, tag_value, B8>(b8)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
+
+    template<typename OutputTuple>
+    class unfolded_join_node<10,tag_matching_port,OutputTuple,tag_matching> : public internal::join_node_base<tag_matching,
+        std::tuple<
+                tag_matching_port<typename std::tuple_element<0,OutputTuple>::type>, 
+                tag_matching_port<typename std::tuple_element<1,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<2,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<3,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<4,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<5,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<6,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<7,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<8,OutputTuple>::type>,
+                tag_matching_port<typename std::tuple_element<9,OutputTuple>::type>
+            >, OutputTuple >
+                  {
+        typedef typename std::tuple_element<0, OutputTuple>::type T0;
+        typedef typename std::tuple_element<1, OutputTuple>::type T1;
+        typedef typename std::tuple_element<2, OutputTuple>::type T2;
+        typedef typename std::tuple_element<3, OutputTuple>::type T3;
+        typedef typename std::tuple_element<4, OutputTuple>::type T4;
+        typedef typename std::tuple_element<5, OutputTuple>::type T5;
+        typedef typename std::tuple_element<6, OutputTuple>::type T6;
+        typedef typename std::tuple_element<7, OutputTuple>::type T7;
+        typedef typename std::tuple_element<8, OutputTuple>::type T8;
+        typedef typename std::tuple_element<9, OutputTuple>::type T9;
+    public:
+        typedef typename std::tuple< tag_matching_port<T0>, tag_matching_port<T1>, tag_matching_port<T2>,
+                tag_matching_port<T3>, tag_matching_port<T4>, tag_matching_port<T5>, tag_matching_port<T6>,
+                tag_matching_port<T7>, tag_matching_port<T8>, tag_matching_port<T9> > input_ports_tuple_type;
+        typedef OutputTuple output_type;
+    private:
+        typedef join_node_base<tag_matching, input_ports_tuple_type, output_type > base_type;
+        typedef typename internal::function_body<T0, tag_value> *f0_p;
+        typedef typename internal::function_body<T1, tag_value> *f1_p;
+        typedef typename internal::function_body<T2, tag_value> *f2_p;
+        typedef typename internal::function_body<T3, tag_value> *f3_p;
+        typedef typename internal::function_body<T4, tag_value> *f4_p;
+        typedef typename internal::function_body<T5, tag_value> *f5_p;
+        typedef typename internal::function_body<T6, tag_value> *f6_p;
+        typedef typename internal::function_body<T7, tag_value> *f7_p;
+        typedef typename internal::function_body<T8, tag_value> *f8_p;
+        typedef typename internal::function_body<T9, tag_value> *f9_p;
+        typedef typename std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p, f9_p > func_initializer_type;
+    public:
+        template<typename B0, typename B1, typename B2, typename B3, typename B4, typename B5, typename B6, typename B7, typename B8, typename B9>
+        unfolded_join_node(graph &g, B0 b0, B1 b1, B2 b2, B3 b3, B4 b4, B5 b5, B6 b6, B7 b7, B8 b8, B9 b9) : base_type(g,
+                func_initializer_type(
+                    new internal::function_body_leaf<T0, tag_value, B0>(b0),
+                    new internal::function_body_leaf<T1, tag_value, B1>(b1),
+                    new internal::function_body_leaf<T2, tag_value, B2>(b2),
+                    new internal::function_body_leaf<T3, tag_value, B3>(b3),
+                    new internal::function_body_leaf<T4, tag_value, B4>(b4),
+                    new internal::function_body_leaf<T5, tag_value, B5>(b5),
+                    new internal::function_body_leaf<T6, tag_value, B6>(b6),
+                    new internal::function_body_leaf<T7, tag_value, B7>(b7),
+                    new internal::function_body_leaf<T8, tag_value, B8>(b8),
+                    new internal::function_body_leaf<T9, tag_value, B9>(b9)
+                    ) ) {}
+        unfolded_join_node(const unfolded_join_node &other) : base_type(other) {}
+    };
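+
+    // Each supported arity up to 10 gets its own tag_matching specialization above; every
+    // constructor wraps the user-supplied tag-extraction bodies (b0, b1, ...) in
+    // function_body_leaf objects and hands them to join_node_base as a func_initializer_type
+    // tuple, so that each input port can derive a tag_value from an arriving item.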
+
+    //! templated function to refer to input ports of the join node
+    template<size_t N, typename JNT>
+    typename std::tuple_element<N, typename JNT::input_ports_tuple_type>::type &input_port(JNT &jn) {
+        return std::get<N>(jn.inputs());
+    }
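+
+    // A minimal usage sketch, assuming a tag-matching join node `j` built over
+    // std::tuple<int, float> (the node name `j` is hypothetical):
+    //
+    //     input_port<0>(j).try_put(42);     // arrives at the int port
+    //     input_port<1>(j).try_put(3.14f);  // arrives at the float port
+    //
+    // input_port<N>(jn) simply forwards to std::get<N>(jn.inputs()), so the two spellings
+    // are interchangeable.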
+
+} 
+
+#endif
+
diff --git a/tbb/include/tbb/internal/_flow_graph_node_impl.h b/tbb/include/tbb/internal/_flow_graph_node_impl.h
new file mode 100644 (file)
index 0000000..f3e112d
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB__graph_node_internal_H
+#define __TBB__graph_node_internal_H
+
+#include "_flow_graph_item_buffer_impl.h"
+
+//! @cond INTERNAL
+namespace internal {
+
+    using tbb::internal::aggregated_operation;
+    using tbb::internal::aggregating_functor;
+    using tbb::internal::aggregator;
+
+     template< typename T, typename A >
+     class function_input_queue : public item_buffer<T,A> {
+     public:
+         bool pop( T& t ) {
+             return this->pop_front( t );
+         }
+
+         bool push( T& t ) {
+             return this->push_back( t );
+         }
+     };
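+
+    // function_input_queue adapts item_buffer to the push/pop interface that function_input
+    // (below) uses to buffer inputs when an optional queue is supplied and the node is
+    // already running at its concurrency limit.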
+
+    //! Implements methods for a function node that takes a type Input as input
+    template< typename Input, typename Output, typename A >
+    class function_input : public receiver<Input>, tbb::internal::no_assign {
+        typedef sender<Input> predecessor_type;
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        enum op_type {reg_pred, rem_pred, app_body, tryput, try_fwd};
+        typedef function_input<Input, Output, A> my_class;
+        
+    public:
+
+        //! The input type of this receiver
+        typedef Input input_type;
+        //! The output type of this receiver
+        typedef Output output_type;
+        
+        //! Constructor for function_input
+        template< typename Body >
+        function_input( graph &g, size_t max_concurrency, Body& body, function_input_queue<input_type,A> *q = NULL )
+            : my_root_task(g.root_task()), my_max_concurrency(max_concurrency), my_concurrency(0),
+              my_body( new internal::function_body_leaf< input_type, output_type, Body>(body) ),
+              my_queue(q), forwarder_busy(false) {
+            my_predecessors.set_owner(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+        
+        //! Copy constructor
+        function_input( const function_input& src, function_input_queue<input_type,A> *q = NULL ) : 
+#if (__TBB_GCC_VERSION < 40202 )
+            receiver<Input>(), tbb::internal::no_assign(),
+#endif
+            my_root_task( src.my_root_task), my_max_concurrency(src.my_max_concurrency),
+            my_concurrency(0), my_body( src.my_body->clone() ), my_queue(q), forwarder_busy(false)
+        {
+            my_predecessors.set_owner(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        //! Destructor
+        virtual ~function_input() { 
+            delete my_body; 
+            if ( my_queue ) delete my_queue;
+        }
+        
+        //! Put to the node
+        virtual bool try_put( const input_type &t ) {
+           if ( my_max_concurrency == 0 ) {
+               spawn_body_task( t );
+               return true;
+           } else {
+               my_operation op_data(t, tryput);
+               my_aggregator.execute(&op_data);
+               return op_data.status == SUCCEEDED;
+           }
+        }
+        
+        //! Adds src to the list of cached predecessors.
+        /* override */ bool register_predecessor( predecessor_type &src ) {
+            my_operation op_data(reg_pred);
+            op_data.r = &src;
+            my_aggregator.execute(&op_data);
+            return true;
+        }
+        
+        //! Removes src from the list of cached predecessors.
+        /* override */ bool remove_predecessor( predecessor_type &src ) {
+            my_operation op_data(rem_pred);
+            op_data.r = &src;
+            my_aggregator.execute(&op_data);
+            return true;
+        }
+
+        template< typename Body >
+        Body copy_function_object() {
+            internal::function_body<input_type, output_type> &body_ref = *this->my_body;
+            return dynamic_cast< internal::function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body(); 
+        } 
+        
+    protected:
+
+        task *my_root_task;
+        const size_t my_max_concurrency;
+        size_t my_concurrency;
+        function_body<input_type, output_type> *my_body;
+        function_input_queue<input_type, A> *my_queue;
+        predecessor_cache<input_type, null_mutex > my_predecessors;
+        
+        virtual broadcast_cache<output_type > &successors() = 0;
+
+    private:
+
+        friend class apply_body_task< my_class, input_type >;
+        friend class forward_task< my_class >;
+        
+        class my_operation : public aggregated_operation< my_operation > {
+        public:
+            char type;
+            union {
+                input_type *elem;
+                predecessor_type *r;
+            };
+            my_operation(const input_type& e, op_type t) :
+                type(char(t)), elem(const_cast<input_type*>(&e)) {}
+            my_operation(op_type t) : type(char(t)), r(NULL) {}
+        };
+        
+        bool forwarder_busy;
+        typedef internal::aggregating_functor<my_class, my_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, my_operation>;
+        aggregator< my_handler, my_operation > my_aggregator;
+        
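+        // All state changes (predecessor registration, concurrency accounting, queueing)
+        // are funneled through my_aggregator: callers package a my_operation and execute()
+        // it, and handle_operations below is run on their behalf by whichever thread owns
+        // the aggregator at that moment, so my_concurrency, my_queue and my_predecessors
+        // are only touched by one thread at a time without a conventional lock.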
+        void handle_operations(my_operation *op_list) {
+            my_operation *tmp;
+            while (op_list) {
+                tmp = op_list;
+                op_list = op_list->next;
+                switch (tmp->type) {
+                case reg_pred:
+                    my_predecessors.add(*(tmp->r));
+                    __TBB_store_with_release(tmp->status, SUCCEEDED);
+                    if (!forwarder_busy) {
+                        forwarder_busy = true;
+                        spawn_forward_task();
+                    }
+                    break;
+                case rem_pred:
+                    my_predecessors.remove(*(tmp->r));
+                    __TBB_store_with_release(tmp->status, SUCCEEDED);
+                    break;
+                case app_body:
+                    __TBB_ASSERT(my_max_concurrency != 0, NULL);
+                    --my_concurrency;
+                    __TBB_store_with_release(tmp->status, SUCCEEDED);
+                    if (my_concurrency<my_max_concurrency) {
+                        input_type i;
+                        bool item_was_retrieved = false;
+                        if ( my_queue )
+                            item_was_retrieved = my_queue->pop(i);
+                        else
+                            item_was_retrieved = my_predecessors.get_item(i);
+                        if (item_was_retrieved) {
+                            ++my_concurrency;
+                            spawn_body_task(i);
+                        }
+                    }
+                    break;
+                case tryput: internal_try_put(tmp);  break;
+                case try_fwd: internal_forward(tmp);  break;
+                }
+            }
+        }
+        
+        //! Put to the node
+        void internal_try_put(my_operation *op) {
+            __TBB_ASSERT(my_max_concurrency != 0, NULL);
+            if (my_concurrency < my_max_concurrency) {
+               ++my_concurrency;
+               spawn_body_task(*(op->elem));
+               __TBB_store_with_release(op->status, SUCCEEDED);
+           } else if ( my_queue && my_queue->push(*(op->elem)) ) { 
+               __TBB_store_with_release(op->status, SUCCEEDED);
+           } else {
+               __TBB_store_with_release(op->status, FAILED);
+           }
+        }
+        
+        //! Tries to spawn bodies if available and if concurrency allows
+        void internal_forward(my_operation *op) {
+            if (my_concurrency<my_max_concurrency || !my_max_concurrency) {
+                input_type i;
+                bool item_was_retrieved = false;
+                if ( my_queue )
+                    item_was_retrieved = my_queue->pop(i);
+                else
+                    item_was_retrieved = my_predecessors.get_item(i);
+                if (item_was_retrieved) {
+                    ++my_concurrency;
+                    __TBB_store_with_release(op->status, SUCCEEDED);
+                    spawn_body_task(i);
+                    return;
+                }
+            }
+            __TBB_store_with_release(op->status, FAILED);
+            forwarder_busy = false;
+        }
+        
+        //! Applies the body to the provided input
+        void apply_body( input_type &i ) {
+            successors().try_put( (*my_body)(i) );
+            if ( my_max_concurrency != 0 ) {
+                my_operation op_data(app_body);
+                my_aggregator.execute(&op_data);
+            }
+        }
+        
+       //! Spawns a task that calls apply_body( input )
+       inline void spawn_body_task( const input_type &input ) {
+           task::enqueue(*new(task::allocate_additional_child_of(*my_root_task)) apply_body_task< my_class, input_type >(*this, input));
+       }
+        
+       //! This is executed by an enqueued task, the "forwarder"
+       void forward() {
+           my_operation op_data(try_fwd);
+           do {
+               op_data.status = WAIT;
+               my_aggregator.execute(&op_data);
+           } while (op_data.status == SUCCEEDED);
+       }
+        
+       //! Spawns a task that calls forward()
+       inline void spawn_forward_task() {
+           task::enqueue(*new(task::allocate_additional_child_of(*my_root_task)) forward_task< my_class >(*this));
+       }
+    };
+        
+    //! Implements methods for an executable node that takes continue_msg as input
+    template< typename Output >
+    class continue_input : public continue_receiver {
+    public:
+        
+        //! The input type of this receiver
+        typedef continue_msg input_type;
+            
+        //! The output type of this receiver
+        typedef Output output_type;
+        
+        template< typename Body >
+        continue_input( graph &g, Body& body )
+            : my_root_task(g.root_task()), 
+             my_body( new internal::function_body_leaf< input_type, output_type, Body>(body) ) { }
+        
+        template< typename Body >
+        continue_input( graph &g, int number_of_predecessors, Body& body )
+            : continue_receiver( number_of_predecessors ), my_root_task(g.root_task()), 
+             my_body( new internal::function_body_leaf< input_type, output_type, Body>(body) ) { }
+
+        continue_input( const continue_input& src ) : continue_receiver(src), 
+            my_root_task(src.my_root_task), my_body( src.my_body->clone() ) {}
+
+        template< typename Body >
+        Body copy_function_object() {
+            internal::function_body<input_type, output_type> &body_ref = *my_body;
+            return dynamic_cast< internal::function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body(); 
+        } 
+
+    protected:
+        
+        task *my_root_task;
+        function_body<input_type, output_type> *my_body;
+        
+        virtual broadcast_cache<output_type > &successors() = 0; 
+        
+        friend class apply_body_task< continue_input< Output >, continue_msg >;
+        
+        //! Applies the body to the provided input
+        /* override */ void apply_body( input_type ) {
+            successors().try_put( (*my_body)( continue_msg() ) );
+        }
+        
+        //! Spawns a task that applies the body
+        /* override */ void execute( ) {
+            task::enqueue( * new ( task::allocate_additional_child_of( *my_root_task ) ) 
+               apply_body_task< continue_input< Output >, continue_msg >( *this, continue_msg() ) ); 
+        }
+
+    };
+        
+    //! Implements methods for both executable and function nodes that put Output to their successors
+    template< typename Output >
+    class function_output : public sender<Output> {
+    public:
+        
+        typedef Output output_type;
+        
+        function_output() { }
+        
+        //! Adds a new successor to this node
+        /* override */ bool register_successor( receiver<output_type> &r ) {
+            successors().register_successor( r );
+            return true;
+        }
+        
+        //! Removes a successor from this node
+        /* override */ bool remove_successor( receiver<output_type> &r ) {
+            successors().remove_successor( r );
+            return true;
+        }
+          
+    protected:
+        
+        virtual broadcast_cache<output_type > &successors() = 0; 
+        
+    };
+
+}
+
+#endif
+
diff --git a/tbb/include/tbb/internal/_flow_graph_or_impl.h b/tbb/include/tbb/internal/_flow_graph_or_impl.h
new file mode 100644 (file)
index 0000000..e4d1ba7
--- /dev/null
@@ -0,0 +1,607 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB__flow_graph_or_impl_H
+#define __TBB__flow_graph_or_impl_H
+
+namespace internal {
+
+    // Output of the or_node is a struct containing a union, and will be of
+    // the form
+    //
+    //  struct {
+    //     size_t indx;
+    //     union {
+    //         T0 result0;
+    //         T1 result1;
+    //         ...
+    //         Tn resultn;
+    //     };
+    //  };
+    //
+    //  where the value of indx will indicate which result was put to the
+    //  successor.  indx == 0 => result0 and so on.
+    //
+    //  The order of the items in the union is determined by the tuple that
+    //  defines the input port types (the same way a join_node's inputs are
+    //  defined.)  So the union ordering corresponds to the ordering of the 
+    //  input ports of the node.
+    //
+    //  The types of the elements of the union are represented by tuple_types,
+    //  a typedef in the or_node.  So, for an or_node of type OrType, the 2nd
+    //  type in the union of its output type is
+    //
+    //      std::tuple_element<1,OrType::tuple_types>::type
+
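+    // A consumption sketch for the struct described above, assuming an or_node built over
+    // std::tuple<int, float> whose output `v` reached a successor (the handler names are
+    // hypothetical):
+    //
+    //     if      (v.indx == 0) handle_int(v.result0);    // value came from input port 0
+    //     else if (v.indx == 1) handle_float(v.result1);  // value came from input port 1
+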
+    template<int N, typename OutputTuple>
+    struct or_output_type;
+
+    template<typename OutputTuple>
+    struct or_output_type<2, OutputTuple> {
+        typedef OutputTuple tuple_types;
+        typedef struct {
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<3, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<4, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<5, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<6, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+                typename std::tuple_element<5,OutputTuple>::type result5;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<7, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+                typename std::tuple_element<5,OutputTuple>::type result5;
+                typename std::tuple_element<6,OutputTuple>::type result6;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<8, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+                typename std::tuple_element<5,OutputTuple>::type result5;
+                typename std::tuple_element<6,OutputTuple>::type result6;
+                typename std::tuple_element<7,OutputTuple>::type result7;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<9, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+                typename std::tuple_element<5,OutputTuple>::type result5;
+                typename std::tuple_element<6,OutputTuple>::type result6;
+                typename std::tuple_element<7,OutputTuple>::type result7;
+                typename std::tuple_element<8,OutputTuple>::type result8;
+            };
+        } type;
+    };
+
+    template<typename OutputTuple>
+    struct or_output_type<10, OutputTuple> {
+        typedef struct {
+            typedef OutputTuple tuple_types;
+            size_t indx;
+            union {
+                typename std::tuple_element<0,OutputTuple>::type result0;
+                typename std::tuple_element<1,OutputTuple>::type result1;
+                typename std::tuple_element<2,OutputTuple>::type result2;
+                typename std::tuple_element<3,OutputTuple>::type result3;
+                typename std::tuple_element<4,OutputTuple>::type result4;
+                typename std::tuple_element<5,OutputTuple>::type result5;
+                typename std::tuple_element<6,OutputTuple>::type result6;
+                typename std::tuple_element<7,OutputTuple>::type result7;
+                typename std::tuple_element<8,OutputTuple>::type result8;
+                typename std::tuple_element<9,OutputTuple>::type result9;
+            };
+        } type;
+    };
+
+    template<typename TupleTypes, int N>
+        struct or_item_helper;
+
+    // or_item_helper is specialized for each index 0-9 in its second template parameter
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,0> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 0;
+            o.result0 = *(reinterpret_cast<typename std::tuple_element<0,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,1> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 1;
+            o.result1 = *(reinterpret_cast<typename std::tuple_element<1,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,2> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 2;
+            o.result2 = *(reinterpret_cast<typename std::tuple_element<2,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,3> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 3;
+            o.result3 = *(reinterpret_cast<typename std::tuple_element<3,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,4> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 4;
+            o.result4 = *(reinterpret_cast<typename std::tuple_element<4,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,5> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 5;
+            o.result5 = *(reinterpret_cast<typename std::tuple_element<5,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,6> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 6;
+            o.result6 = *(reinterpret_cast<typename std::tuple_element<6,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,7> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 7;
+            o.result7 = *(reinterpret_cast<typename std::tuple_element<7,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,8> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 8;
+            o.result8 = *(reinterpret_cast<typename std::tuple_element<8,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_item_helper<TupleTypes,9> {
+        template<typename OutputType>
+        static inline void create_output_value(OutputType &o, void *v) {
+            o.indx = 9;
+            o.result9 = *(reinterpret_cast<typename std::tuple_element<9,TupleTypes>::type *>(v));
+        }
+    };
+
+    template<typename TupleTypes,int N>
+    struct or_helper {
+        template<typename OutputType>
+        static inline void create_output(OutputType &o, size_t i, void* v) {
+            if(i == N-1) {
+                or_item_helper<TupleTypes,N-1>::create_output_value(o,v);
+            }
+            else
+                or_helper<TupleTypes,N-1>::create_output(o,i,v);
+        }
+        template<typename PortTuple, typename PutBase>
+        static inline void set_or_node_pointer(PortTuple &my_input, PutBase *p) {
+            std::get<N-1>(my_input).set_up(p, N-1);
+            or_helper<TupleTypes,N-1>::set_or_node_pointer(my_input, p);
+        }
+    };
+
+    template<typename TupleTypes>
+    struct or_helper<TupleTypes,1> {
+        template<typename OutputType>
+        static inline void create_output(OutputType &o, size_t i, void* v) {
+            if(i == 0) {
+                or_item_helper<TupleTypes,0>::create_output_value(o,v);
+            }
+        }
+        template<typename PortTuple, typename PutBase>
+        static inline void set_or_node_pointer(PortTuple &my_input, PutBase *p) {
+            std::get<0>(my_input).set_up(p, 0);
+        }
+    };
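+
+    // The two or_helper templates unroll at compile time: create_output() becomes a chain
+    // of index comparisons ending at index 0, and set_or_node_pointer() visits every port
+    // from N-1 down to 0, handing each one its index and a pointer back to the owning node.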
+
+    struct put_base {
+        virtual bool try_put_with_index(size_t index, void *v) = 0;
+        virtual ~put_base() { }
+    };
+
+    template<typename T>
+    class or_input_port : public receiver<T> {
+    private:
+        size_t my_index;
+        put_base *my_or_node;
+    public:
+        void set_up(put_base *p, size_t i) { my_index = i; my_or_node = p; }
+        bool try_put(const T &v) {
+            return my_or_node->try_put_with_index(my_index, reinterpret_cast<void *>(const_cast<T*>(&v)));
+        }
+    };
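+
+    // Each or_input_port stores only its index and a pointer to the owning node (through
+    // put_base); try_put() type-erases the incoming value to void* and forwards it, together
+    // with the index, to the node's try_put_with_index(), where or_item_helper restores the
+    // static type.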
+
+    template<size_t N, typename OutputTuple>
+    struct or_input_type;
+
+    template<typename OutputTuple>
+    struct or_input_type<2,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<3,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<4,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<5,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<6,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<5,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<7,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<5,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<6,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<8,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<5,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<6,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<7,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<9,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<5,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<6,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<7,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<8,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename OutputTuple>
+    struct or_input_type<10,OutputTuple> {
+        typedef typename std::tuple<
+            or_input_port<typename std::tuple_element<0,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<1,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<2,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<3,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<4,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<5,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<6,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<7,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<8,OutputTuple>::type>,
+            or_input_port<typename std::tuple_element<9,OutputTuple>::type>
+        > type;
+    };
+
+    template<typename InputTuple, typename OutputType, typename StructTypes>
+    class or_node_FE : public put_base {
+    public:
+        static const int N = std::tuple_size<InputTuple>::value;
+        typedef OutputType output_type;
+        typedef InputTuple input_type;
+
+        or_node_FE( ) {
+            or_helper<StructTypes,N>::set_or_node_pointer(my_inputs, this);
+        }
+
+        input_type &inputs() { return my_inputs; }
+    protected:
+        input_type my_inputs;
+    };
+
+    //! or_node_base
+    template<typename InputTuple, typename OutputType, typename StructTypes>
+    class or_node_base : public graph_node, public or_node_FE<InputTuple, OutputType,StructTypes>,
+                           public sender<OutputType> {
+    public:
+        static const size_t N = std::tuple_size<InputTuple>::value;
+        typedef OutputType output_type;
+        typedef StructTypes tuple_types;
+        typedef receiver<output_type> successor_type;
+        typedef or_node_FE<InputTuple, output_type,StructTypes> input_ports_type;
+
+    private:
+        // ----------- Aggregator ------------
+        enum op_type { reg_succ, rem_succ, try__put };
+        enum op_stat {WAIT=0, SUCCEEDED, FAILED};
+        typedef or_node_base<InputTuple,output_type,StructTypes> my_class;
+
+        class or_node_base_operation : public aggregated_operation<or_node_base_operation> {
+        public:
+            char type;
+            size_t indx;
+            union {
+                void *my_arg;
+                successor_type *my_succ;
+            };
+            or_node_base_operation(size_t i, const void* e, op_type t) :
+                type(char(t)), indx(i), my_arg(const_cast<void *>(e)) {}
+            or_node_base_operation(const successor_type &s, op_type t) : type(char(t)), 
+                my_succ(const_cast<successor_type *>(&s)) {}
+            or_node_base_operation(op_type t) : type(char(t)) {}
+        };
+
+        typedef internal::aggregating_functor<my_class, or_node_base_operation> my_handler;
+        friend class internal::aggregating_functor<my_class, or_node_base_operation>;
+        aggregator<my_handler, or_node_base_operation> my_aggregator;
+
+        void handle_operations(or_node_base_operation* op_list) {
+            or_node_base_operation *current;
+            while(op_list) {
+                current = op_list;
+                op_list = op_list->next;
+                switch(current->type) {
+
+                case reg_succ:
+                    my_successors.register_successor(*(current->my_succ));
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+
+                case rem_succ:
+                    my_successors.remove_successor(*(current->my_succ));
+                    __TBB_store_with_release(current->status, SUCCEEDED);
+                    break;
+
+                case try__put:
+                    output_type oval;
+                    or_helper<tuple_types,N>::create_output(oval,current->indx,current->my_arg);
+                    if(my_successors.try_put(oval)) {
+                            __TBB_store_with_release(current->status, SUCCEEDED);
+                    }
+                    else __TBB_store_with_release(current->status, FAILED);
+                    break;
+                }
+            }
+        }
+        // ---------- end aggregator -----------
+    public:
+        or_node_base( ) : input_ports_type() {
+            my_successors.set_owner(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        or_node_base( const or_node_base& /*other*/) : input_ports_type() {
+            my_successors.set_owner(this);
+            my_aggregator.initialize_handler(my_handler(this));
+        }
+
+        bool register_successor(successor_type &r) {
+            or_node_base_operation op_data(r, reg_succ);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        bool remove_successor( successor_type &r) {
+            or_node_base_operation op_data(r, rem_succ);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+        bool try_put_with_index(size_t indx, void *v) {
+            or_node_base_operation op_data(indx, v, try__put);
+            my_aggregator.execute(&op_data);
+            return op_data.status == SUCCEEDED;
+        }
+
+    private:
+        broadcast_cache<output_type, null_rw_mutex> my_successors;
+    };
+
+    // type generators
+    template<typename OutputTuple>
+    struct or_types {
+        static const int N = std::tuple_size<OutputTuple>::value;
+        typedef typename or_input_type<N,OutputTuple>::type input_ports_tuple_type;
+        typedef typename or_output_type<N, OutputTuple>::type output_type;
+        typedef internal::or_node_FE<input_ports_tuple_type,output_type,OutputTuple> or_FE_type;
+        typedef internal::or_node_base<input_ports_tuple_type, output_type, OutputTuple> or_base_type;
+    };
+
+    //! unfolded_or_node : passes input_ports_tuple_type to or_node_base.  We build the input port
+    //  type using tuple_element; each input port is an or_input_port for the corresponding
+    //  element of OutputTuple.
+    template<typename OutputTuple>
+    class unfolded_or_node;
+
+    template<class OutputTuple>
+    class unfolded_or_node : public or_types<OutputTuple>::or_base_type {
+    public:
+        // static const int N = std::tuple_size<OutputTuple>::value;
+        typedef typename or_types<OutputTuple>::input_ports_tuple_type input_ports_tuple_type;
+        typedef OutputTuple tuple_types;
+        typedef typename or_types<OutputTuple>::output_type output_type;
+    private:
+        typedef typename or_types<OutputTuple>::or_base_type base_type;
+    public:
+        unfolded_or_node() : base_type() {}
+    };
+
+
+} /* namespace internal */
+
+#endif  /* __TBB__flow_graph_or_impl_H */
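A minimal standalone sketch (not part of the patched header; the names pending_op and demo_aggregator are illustrative) of the calling convention that or_node_base and the other aggregator-based flow-graph nodes above rely on: each public method packages its arguments into an operation record, hands it to the aggregator, and then reads the status the handler published. The real lock-free operation list lives in _aggregator_impl.h.

    #include <atomic>
    #include <mutex>

    enum op_stat { WAIT = 0, SUCCEEDED, FAILED };

    struct pending_op {
        int type;                        // which operation to perform (cf. reg_succ / rem_succ / try__put)
        std::atomic<int> status{WAIT};   // written by the handler, read back by the caller
    };

    class demo_aggregator {
        std::mutex serializer;           // stand-in for the real lock-free operation list
    public:
        void execute(pending_op* op) {
            std::lock_guard<std::mutex> lock(serializer);   // operations are handled one at a time
            // ... perform op->type against the node's state, then publish the outcome:
            op->status.store(SUCCEEDED, std::memory_order_release);
        }
    };

    // Usage mirrors register_successor()/try_put_with_index() above:
    //   pending_op op_data;  op_data.type = 0;
    //   demo_aggregator agg;
    //   agg.execute(&op_data);
    //   bool ok = (op_data.status.load(std::memory_order_acquire) == SUCCEEDED);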
diff --git a/tbb/include/tbb/internal/_flow_graph_tagged_buffer_impl.h b/tbb/include/tbb/internal/_flow_graph_tagged_buffer_impl.h
new file mode 100644 (file)
index 0000000..7558912
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// A tagged buffer that can expand and supports as many deletions as additions.
+// List-based, with the list elements held in a std::vector (for destruction management);
+// multiplicative hashing (as in enumerable_thread_specific).  No synchronization built in.
+//
+
+template<typename TagType, typename ValueType, size_t NoTagMark>
+struct buffer_element {
+    TagType t;
+    ValueType v;
+    buffer_element *next;
+    buffer_element() : t(NoTagMark), next(NULL) {}
+};
+
+template
+    <
+     typename TagType, 
+     typename ValueType, 
+     size_t   NoTagMark = 0,
+     typename Allocator=tbb::cache_aligned_allocator< buffer_element<TagType,ValueType,NoTagMark> >
+    >
+class tagged_buffer {
+public:
+    static const size_t INITIAL_SIZE = 8;  // initial size of the hash pointer table
+    static const TagType NO_TAG = TagType(NoTagMark);
+    typedef ValueType value_type;
+    typedef buffer_element<TagType,ValueType, NO_TAG> element_type;
+    typedef value_type *pointer_type;
+    typedef std::vector<element_type, Allocator> list_array_type;
+    typedef typename Allocator::template rebind<element_type*>::other pointer_array_allocator_type;
+    typedef typename Allocator::template rebind<list_array_type>::other list_array_allocator;
+private:
+
+    size_t my_size;
+    size_t nelements;
+    element_type** array;
+    std::vector<element_type, Allocator> *lists;
+    element_type* free_list;
+
+    size_t mask() { return my_size - 1; }
+
+// #define ABYSMAL 1
+    static size_t hash(TagType t) {
+#if ABYSMAL
+        return (size_t)1;
+#else
+#if __TBB_WORDSIZE == 4
+        return uintptr_t(t)*0x9E3779B9;
+#else
+        return uintptr_t(t)*0x9E3779B97F4A7C15;
+#endif
+#endif
+    }
+
+    void set_up_free_list( element_type **p_free_list, list_array_type *la, size_t sz) {
+        for(size_t i=0; i < sz - 1; ++i ) {  // construct free list
+            (*la)[i].next = &((*la)[i+1]);
+            (*la)[i].t = NO_TAG;
+        }
+        (*la)[sz-1].next = NULL;
+        *p_free_list = &((*la)[0]);
+    }
+
+    void grow_array() {
+        // make the pointer array larger
+        element_type **new_array;
+        element_type **old_array = array;
+        size_t old_size = my_size;
+        my_size *=2;
+        new_array = pointer_array_allocator_type().allocate(my_size);
+        for(size_t i=0; i < my_size; ++i) new_array[i] = NULL;
+        list_array_type *new_list_array = new list_array_type(old_size, element_type(), Allocator());
+        set_up_free_list(&free_list, new_list_array, old_size );
+
+        for(size_t i=0; i < old_size; ++i) {
+            for( element_type* op = old_array[i]; op; op = op->next) {
+                internal_tagged_insert(new_array, my_size, op->t, op->v);
+            }
+        }
+        pointer_array_allocator_type().deallocate(old_array, old_size);
+
+        delete lists;  // destroy and deallocate instead
+        array = new_array;
+        lists = new_list_array;
+    }
+
+    void internal_tagged_insert( element_type **ar, size_t sz, TagType t, value_type v) {
+        size_t l_mask = sz-1;
+        size_t h = hash(t) & l_mask;
+        __TBB_ASSERT(free_list, "Error: free list not set up.");
+        element_type* my_elem = free_list; free_list = free_list->next;
+        my_elem->t = t;
+        my_elem->v = v;
+        my_elem->next = ar[h];
+        ar[h] = my_elem;
+    }
+
+public:
+    tagged_buffer() : my_size(INITIAL_SIZE), nelements(0) {
+        array = pointer_array_allocator_type().allocate(my_size);
+        for(size_t i = 0; i < my_size; ++i) array[i] = NULL;
+        lists = new list_array_type(INITIAL_SIZE/2, element_type(), Allocator());
+        set_up_free_list(&free_list, lists, INITIAL_SIZE/2);
+    }
+
+    ~tagged_buffer() {
+        if(array) {
+            pointer_array_allocator_type().deallocate(array, my_size); 
+        }
+        if(lists) {
+            delete lists;
+        }
+    }
+
+    bool tagged_insert(TagType t, value_type v) {
+        pointer_type p;
+        if(tagged_find_ref(t, p)) {
+            *p = v;  // replace the value
+            return false;
+        }
+        ++nelements;
+        if(nelements*2 > my_size) grow_array();
+        internal_tagged_insert(array, my_size, t, v);
+        return true;
+    }
+
+    // returns reference to array element.v
+    bool tagged_find_ref(TagType t, pointer_type &v) {
+        size_t i = hash(t) & mask();
+        for(element_type* p = array[i]; p; p = p->next) {
+            if(p->t == t) {
+                v = &(p->v);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    bool tagged_find( TagType t, value_type &v) {
+        value_type *p;
+        if(tagged_find_ref(t, p)) {
+            v = *p;
+            return true;
+        }
+        else
+            return false;
+    }
+
+    void tagged_delete(TagType t) {
+        size_t h = hash(t) & mask();
+        element_type* prev = NULL;
+        for(element_type* p = array[h]; p; prev = p, p = p->next) {
+            if(p->t == t) {
+                p->t = NO_TAG;
+                if(prev) prev->next = p->next;
+                else array[h] = p->next;
+                p->next = free_list;
+                free_list = p;
+                --nelements;
+                return;
+            }
+        }
+        __TBB_ASSERT(false, "tag not found for delete");
+    }
+
+    // search for v in the array; if found {set t, return true} else return false
+    // we use this in join_node_FE to find if a tag's items are all available.
+    bool find_value_tag( TagType &t, value_type v) {
+        for(size_t i= 0; i < my_size / 2; ++i) {  // remember the vector is half the size of the hash array
+            if( (*lists)[i].t != NO_TAG && (*lists)[i].v == v) {
+                t = (*lists)[i].t;
+                return true;
+            }
+        }
+        return false;
+    }
+};
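A small self-contained illustration (not part of the patch) of the hashing and growth policy tagged_buffer uses above: 0x9E3779B9 is 2^32 divided by the golden ratio (Fibonacci/multiplicative hashing; the 64-bit constant 0x9E3779B97F4A7C15 plays the same role), the table size stays a power of two so a mask selects the bucket, and the table doubles once it is half full.

    #include <cstddef>
    #include <cstdint>

    // Bucket selection as in hash()/mask() above; table_size must be a power of two.
    inline std::size_t bucket_of(std::uintptr_t tag, std::size_t table_size) {
        return (tag * std::uintptr_t(0x9E3779B9u)) & (table_size - 1);
    }

    // Growth trigger as in tagged_insert() above: keep the load factor at or below 1/2.
    inline bool needs_growth(std::size_t nelements, std::size_t table_size) {
        return nelements * 2 > table_size;
    }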
diff --git a/tbb/include/tbb/internal/_tbb_windef.h b/tbb/include/tbb/internal/_tbb_windef.h
new file mode 100644 (file)
index 0000000..85326bc
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_tbb_windef_H
+#error Do not #include this file directly.  Use "#include tbb/tbb_stddef.h" instead.
+#endif /* __TBB_tbb_windef_H */
+
+// Check that the target Windows version has all API calls required for TBB.
+// Do not increase the version in condition beyond 0x0500 without prior discussion!
+#if defined(_WIN32_WINNT) && _WIN32_WINNT<0x0400
+#error TBB is unable to run on old Windows versions; _WIN32_WINNT must be 0x0400 or greater.
+#endif
+
+#if !defined(_MT)
+#error TBB requires linkage with multithreaded C/C++ runtime library. \
+       Choose multithreaded DLL runtime in project settings, or use /MD[d] compiler switch.
+#endif
+
+// Workaround for the problem with MSVC headers failing to define namespace std
+namespace std {
+  using ::size_t; using ::ptrdiff_t;
+}
+
+#define __TBB_STRING_AUX(x) #x
+#define __TBB_STRING(x) __TBB_STRING_AUX(x)
+
+// Default setting of TBB_USE_DEBUG
+#ifdef TBB_USE_DEBUG
+#    if TBB_USE_DEBUG 
+#        if !defined(_DEBUG)
+#            pragma message(__FILE__ "(" __TBB_STRING(__LINE__) ") : Warning: Recommend using /MDd if compiling with TBB_USE_DEBUG!=0")
+#        endif
+#    else
+#        if defined(_DEBUG)
+#            pragma message(__FILE__ "(" __TBB_STRING(__LINE__) ") : Warning: Recommend using /MD if compiling with TBB_USE_DEBUG==0")
+#        endif
+#    endif
+#endif
+
+#if __TBB_BUILD && !defined(__TBB_NO_IMPLICIT_LINKAGE)
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#endif
+
+#if _MSC_VER
+    #if !__TBB_NO_IMPLICIT_LINKAGE
+        #ifdef __TBB_LIB_NAME
+            #pragma comment(lib, __TBB_STRING(__TBB_LIB_NAME))
+        #else
+            #ifdef _DEBUG
+                #pragma comment(lib, "tbb_debug.lib")
+            #else
+                #pragma comment(lib, "tbb.lib")
+            #endif
+        #endif
+    #endif
+#endif
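The two-level __TBB_STRING macro above is the standard stringize-after-expansion idiom; a small sketch (macro names STR, STR_AUX, and LIB_NAME are hypothetical) of why one level is not enough:

    #define STR_AUX(x) #x
    #define STR(x)     STR_AUX(x)

    #define LIB_NAME tbb_preview.lib
    // STR_AUX(LIB_NAME) expands to "LIB_NAME"        (the argument is stringized as-is)
    // STR(LIB_NAME)     expands to "tbb_preview.lib" (the argument is expanded first)
    // The same trick turns __LINE__ into a string literal for the #pragma message warnings above.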
diff --git a/tbb/include/tbb/machine/gcc_generic.h b/tbb/include/tbb/machine/gcc_generic.h
index 8bf7922460fea3128df0640267d8ff86e5748078..e87a193f274277382192a84df1d2e4f851def71f 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_gcc_generic_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_gcc_generic_H
+
 #include <stdint.h>
 #include <unistd.h>
 
 #define __TBB_WORDSIZE      __SIZEOF_INT__
 
-//for some unknown reason straight mapping does not work. At least on mingw
+// For some reason straight mapping does not work on mingw
 #if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
     #define __TBB_BIG_ENDIAN    0
 #elif __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
     #define __TBB_BIG_ENDIAN    1
 #else
-#error "This endiannes is not supported."
+#error Unsupported endianness
 #endif
 
-//As the port has absolutely no information about underlying hardware, the performance,
-//most likely, will be sub-optimal, due to usage of full memory fence where a lightweight
-//one would suffice..
+/** As this generic implementation has no information about the underlying
+    hardware, its performance will most likely be sub-optimal: it uses full memory
+    fences where a more lightweight synchronization (or none at all) could suffice.
+    If you use this header to enable TBB on a new platform, consider forking it
+    and relaxing the helpers below as appropriate. **/
 #define __TBB_acquire_consistency_helper()  __sync_synchronize()
 #define __TBB_release_consistency_helper()  __sync_synchronize()
 #define __TBB_full_memory_fence()           __sync_synchronize()
 #define __TBB_control_consistency_helper()  __sync_synchronize()
 
-
-#define __MACHINE_DECL_ATOMICS(S,T)                                                               \
-inline T __TBB_generic_gcc_cmpswp##S(volatile void *ptr, T value, T comparand ) {                 \
+#define __TBB_MACHINE_DEFINE_ATOMICS(S,T)                                                         \
+inline T __TBB_machine_cmpswp##S( volatile void *ptr, T value, T comparand ) {                    \
     return __sync_val_compare_and_swap(reinterpret_cast<volatile T *>(ptr),comparand,value);      \
 }                                                                                                 \
 
-__MACHINE_DECL_ATOMICS(1,int8_t)
-__MACHINE_DECL_ATOMICS(2,int16_t)
-__MACHINE_DECL_ATOMICS(4,int32_t)
-__MACHINE_DECL_ATOMICS(8,int64_t)
+__TBB_MACHINE_DEFINE_ATOMICS(1,int8_t)
+__TBB_MACHINE_DEFINE_ATOMICS(2,int16_t)
+__TBB_MACHINE_DEFINE_ATOMICS(4,int32_t)
+__TBB_MACHINE_DEFINE_ATOMICS(8,int64_t)
 
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_generic_gcc_cmpswp1(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_generic_gcc_cmpswp2(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_generic_gcc_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_generic_gcc_cmpswp8(P,V,C)
+#undef __TBB_MACHINE_DEFINE_ATOMICS
 
-#if (__TBB_WORDSIZE==4)
-    #define __TBB_CompareAndSwapW(P,V,C) __TBB_CompareAndSwap4(P,V,C)
-#elif  (__TBB_WORDSIZE==8)
-    #define __TBB_CompareAndSwapW(P,V,C) __TBB_CompareAndSwap8(P,V,C)
-#else
-    #error "Unsupported word size."
-#endif
+#define __TBB_USE_GENERIC_FETCH_ADD                 1
+#define __TBB_USE_GENERIC_FETCH_STORE               1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
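A standalone sketch (not from the header above) of how the __sync-based compare-and-swap defined in gcc_generic.h is typically consumed, here as a CAS loop implementing fetch-and-add, which this port otherwise obtains through the generic __TBB_USE_GENERIC_FETCH_ADD path:

    #include <stdint.h>

    inline int32_t fetch_add_via_cas(volatile int32_t* p, int32_t addend) {
        int32_t old_val, new_val;
        do {
            old_val = *p;
            new_val = old_val + addend;
            // __sync_val_compare_and_swap is a GCC builtin with full-barrier semantics,
            // which is why this port can rely on __sync_synchronize() for its fences.
        } while (__sync_val_compare_and_swap(p, old_val, new_val) != old_val);
        return old_val;   // value observed before the addition
    }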
diff --git a/tbb/include/tbb/machine/ibm_aix51.h b/tbb/include/tbb/machine/ibm_aix51.h
index 54bd080068cf5069a330837f77bc12032f6c60e6..a5c1d3aadb1fe8f0ce167224c98c4dc211a64835 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+// TODO: revise by comparing with mac_ppc.h
+
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_ibm_aix51_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_ibm_aix51_H
+
 #define __TBB_WORDSIZE 8
 #define __TBB_BIG_ENDIAN 1
 
 #include <sched.h>
 
 extern "C" {
-
 int32_t __TBB_machine_cas_32 (volatile void* ptr, int32_t value, int32_t comparand);
 int64_t __TBB_machine_cas_64 (volatile void* ptr, int64_t value, int64_t comparand);
-void    __TBB_machine_flush  ();
-
+void __TBB_machine_flush ();
+void __TBB_machine_lwsync ();
+void __TBB_machine_isync ();
 }
 
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cas_32(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cas_64(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cas_64(P,V,C)
+// Mapping of old entry point names retained for the sake of backward binary compatibility
+#define __TBB_machine_cmpswp4 __TBB_machine_cas_32
+#define __TBB_machine_cmpswp8 __TBB_machine_cas_64
+
 #define __TBB_Yield() sched_yield()
 
+#define __TBB_USE_GENERIC_PART_WORD_CAS             1
+#define __TBB_USE_GENERIC_FETCH_ADD                 1
+#define __TBB_USE_GENERIC_FETCH_STORE               1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
+
 #if __GNUC__
-#define __TBB_full_memory_fence() __asm__ __volatile__("sync": : :"memory")
-#define __TBB_release_consistency_helper() __asm__ __volatile__("lwsync": : :"memory")
+    #define __TBB_control_consistency_helper() __asm__ __volatile__( "isync": : :"memory")
+    #define __TBB_acquire_consistency_helper() __asm__ __volatile__("lwsync": : :"memory")
+    #define __TBB_release_consistency_helper() __asm__ __volatile__("lwsync": : :"memory")
+    #define __TBB_full_memory_fence()          __asm__ __volatile__(  "sync": : :"memory")
 #else
-// IBM C++ Compiler does not support inline assembly
-#define __TBB_full_memory_fence() __TBB_machine_flush ()
-#define __TBB_release_consistency_helper() __TBB_machine_flush ()
+    // IBM C++ Compiler does not support inline assembly
+    // TODO: Since XL 9.0 or earlier GCC syntax is supported. Replace with more
+    //       lightweight implementation (like in mac_ppc.h)
+    #define __TBB_control_consistency_helper() __TBB_machine_isync ()
+    #define __TBB_acquire_consistency_helper() __TBB_machine_lwsync ()
+    #define __TBB_release_consistency_helper() __TBB_machine_lwsync ()
+    #define __TBB_full_memory_fence()          __TBB_machine_flush ()
 #endif
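A sketch (illustrative names, assuming the macros from the header above are in scope) of the publish/consume pattern these helpers are meant to support on Power: lwsync suffices as a release fence before the flag store and as an acquire fence after the flag load, while the heavier sync is reserved for __TBB_full_memory_fence():

    extern volatile int payload;
    extern volatile int ready_flag;

    inline void publish(int value) {
        payload = value;
        __TBB_release_consistency_helper();   // lwsync: make payload visible before the flag
        ready_flag = 1;
    }

    inline int consume() {
        while( !ready_flag ) __TBB_Yield();   // sched_yield() between polls
        __TBB_acquire_consistency_helper();   // lwsync: no later loads hoisted above the flag read
        return payload;
    }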
diff --git a/tbb/include/tbb/machine/linux_ia32.h b/tbb/include/tbb/machine/linux_ia32.h
index 547bf50432be4dc3e9ad60a58cc086e7233b21bc..03ddef5b0cae9cb66c1cbb7f76dee785d03a2108 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_linux_ia32_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_linux_ia32_H
+
 #include <stdint.h>
 #include <unistd.h>
 
 #define __TBB_WORDSIZE 4
 #define __TBB_BIG_ENDIAN 0
 
-#define __TBB_release_consistency_helper() __asm__ __volatile__("": : :"memory")
-#define __TBB_full_memory_fence() __asm__ __volatile__("mfence": : :"memory")
+#define __TBB_compiler_fence() __asm__ __volatile__("": : :"memory")
+#define __TBB_control_consistency_helper() __TBB_compiler_fence()
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+#define __TBB_release_consistency_helper() __TBB_compiler_fence()
+#define __TBB_full_memory_fence()          __asm__ __volatile__("mfence": : :"memory")
 
 #if __TBB_ICC_ASM_VOLATILE_BROKEN
 #define __TBB_VOLATILE
@@ -45,7 +50,7 @@
 #define __TBB_VOLATILE volatile
 #endif
 
-#define __MACHINE_DECL_ATOMICS(S,T,X,R) \
+#define __TBB_MACHINE_DEFINE_ATOMICS(S,T,X,R)                                        \
 static inline T __TBB_machine_cmpswp##S (volatile void *ptr, T value, T comparand )  \
 {                                                                                    \
     T result;                                                                        \
@@ -61,7 +66,7 @@ static inline T __TBB_machine_fetchadd##S(volatile void *ptr, T addend)
 {                                                                                    \
     T result;                                                                        \
     __asm__ __volatile__("lock\nxadd" X " %0,%1"                                     \
-                          : R (result), "=m"(*(__TBB_VOLATILE T*)ptr)            \
+                          : R (result), "=m"(*(__TBB_VOLATILE T*)ptr)                \
                           : "0"(addend), "m"(*(__TBB_VOLATILE T*)ptr)                \
                           : "memory");                                               \
     return result;                                                                   \
@@ -71,15 +76,21 @@ static inline  T __TBB_machine_fetchstore##S(volatile void *ptr, T value)
 {                                                                                    \
     T result;                                                                        \
     __asm__ __volatile__("lock\nxchg" X " %0,%1"                                     \
-                          : R (result), "=m"(*(__TBB_VOLATILE T*)ptr)            \
+                          : R (result), "=m"(*(__TBB_VOLATILE T*)ptr)                \
                           : "0"(value), "m"(*(__TBB_VOLATILE T*)ptr)                 \
                           : "memory");                                               \
     return result;                                                                   \
 }                                                                                    \
                                                                                      
-__MACHINE_DECL_ATOMICS(1,int8_t,"","=q")
-__MACHINE_DECL_ATOMICS(2,int16_t,"","=r")
-__MACHINE_DECL_ATOMICS(4,int32_t,"l","=r")
+__TBB_MACHINE_DEFINE_ATOMICS(1,int8_t,"","=q")
+__TBB_MACHINE_DEFINE_ATOMICS(2,int16_t,"","=r")
+__TBB_MACHINE_DEFINE_ATOMICS(4,int32_t,"l","=r")
+
+#if __INTEL_COMPILER
+#pragma warning( push )
+// reference to EBX in a function requiring stack alignment
+#pragma warning( disable: 998 )
+#endif
 
 static inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
 {
@@ -130,6 +141,10 @@ static inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value,
     return result;
 }
 
+#if __INTEL_COMPILER
+#pragma warning( pop )
+#endif // warning 998 is back
+
 static inline int32_t __TBB_machine_lg( uint32_t x ) {
     int32_t j;
     __asm__ ("bsr %1,%0" : "=r"(j) : "r"(x));
@@ -184,46 +199,18 @@ static inline void __TBB_machine_store8(volatile void *ptr, int64_t value) {
 }
  
 // Machine specific atomic operations
-
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-
-#define __TBB_FetchAndAdd1(P,V) __TBB_machine_fetchadd1(P,V)
-#define __TBB_FetchAndAdd2(P,V) __TBB_machine_fetchadd2(P,V)
-#define __TBB_FetchAndAdd4(P,V) __TBB_machine_fetchadd4(P,V)
-#define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd4(P,V)
-
-#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1(P,V)
-#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2(P,V)
-#define __TBB_FetchAndStore4(P,V) __TBB_machine_fetchstore4(P,V)
-#define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstore4(P,V)
-
-#define __TBB_Store8(P,V) __TBB_machine_store8(P,V)
-#define __TBB_Load8(P)    __TBB_machine_load8(P)
-
 #define __TBB_AtomicOR(P,V) __TBB_machine_or(P,V)
 #define __TBB_AtomicAND(P,V) __TBB_machine_and(P,V)
 
-
-// Those we chose not to implement (they will be implemented generically using CMPSWP8)
-#undef __TBB_FetchAndAdd8
-#undef __TBB_FetchAndStore8
-
 // Definition of other functions
 #define __TBB_Pause(V) __TBB_machine_pause(V)
 #define __TBB_Log2(V)  __TBB_machine_lg(V)
 
-// Special atomic functions
-#define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAddW(P,V)
-#define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1)
-#define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,-1)
-
-// Use generic definitions from tbb_machine.h
-#undef __TBB_TryLockByte
-#undef __TBB_LockByte
+#define __TBB_USE_GENERIC_DWORD_FETCH_ADD           1
+#define __TBB_USE_GENERIC_DWORD_FETCH_STORE         1
+#define __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE   1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
 
 // API to retrieve/update FPU control setting
 #define __TBB_CPU_CTL_ENV_PRESENT 1
@@ -247,4 +234,3 @@ inline void __TBB_set_cpu_ctl_env ( const __TBB_cpu_ctl_env_t* ctl ) {
             : : "m"(ctl->mxcsr), "m"(ctl->x87cw)
     );
 }
-
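The 4-byte fetch-and-add produced by __TBB_MACHINE_DEFINE_ATOMICS above, written out as a standalone function (illustrative name) so the constraints are easier to read; the "0" constraint ties the addend to the same register as the result:

    #include <stdint.h>

    static inline int32_t fetchadd4_demo(volatile void* ptr, int32_t addend) {
        int32_t result;
        __asm__ __volatile__("lock\nxaddl %0,%1"
                             : "=r"(result), "=m"(*(volatile int32_t*)ptr)
                             : "0"(addend), "m"(*(volatile int32_t*)ptr)
                             : "memory");
        return result;   // the value of *ptr before the addition
    }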
diff --git a/tbb/include/tbb/machine/linux_ia64.h b/tbb/include/tbb/machine/linux_ia64.h
index b815d3c086c2d480e03594f2056945aba77eb220..c6313cbd468f68e7fa5e2529452d772449621049 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_linux_ia64_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_linux_ia64_H
+
 #include <stdint.h>
 #include <unistd.h>
 #include <ia64intrin.h>
 
 #define __TBB_WORDSIZE 8
 #define __TBB_BIG_ENDIAN 0
-#define __TBB_DECL_FENCED_ATOMICS 1
+
+#if __INTEL_COMPILER
+    #define __TBB_compiler_fence()
+    #define __TBB_control_consistency_helper() __TBB_compiler_fence()
+    #define __TBB_acquire_consistency_helper()
+    #define __TBB_release_consistency_helper()
+    #define __TBB_full_memory_fence()          __mf()
+#else
+    #define __TBB_compiler_fence() __asm__ __volatile__("": : :"memory")
+    #define __TBB_control_consistency_helper() __TBB_compiler_fence()
+    // Even though GCC imbues volatile loads with acquire semantics, it sometimes moves 
+    // loads over the acquire fence. The following helpers stop such incorrect code motion.
+    #define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+    #define __TBB_release_consistency_helper() __TBB_compiler_fence()
+    #define __TBB_full_memory_fence()          __asm__ __volatile__("mf": : :"memory")
+#endif /* !__INTEL_COMPILER */
 
 // Most of the functions will be in a .s file
 
 extern "C" {
-    int8_t __TBB_machine_cmpswp1__TBB_full_fence (volatile void *ptr, int8_t value, int8_t comparand); 
     int8_t __TBB_machine_fetchadd1__TBB_full_fence (volatile void *ptr, int8_t addend);
     int8_t __TBB_machine_fetchadd1acquire(volatile void *ptr, int8_t addend);
     int8_t __TBB_machine_fetchadd1release(volatile void *ptr, int8_t addend);
-    int8_t __TBB_machine_fetchstore1acquire(volatile void *ptr, int8_t value);
-    int8_t __TBB_machine_fetchstore1release(volatile void *ptr, int8_t value);
 
-    int16_t __TBB_machine_cmpswp2__TBB_full_fence (volatile void *ptr, int16_t value, int16_t comparand);
     int16_t __TBB_machine_fetchadd2__TBB_full_fence (volatile void *ptr, int16_t addend);
     int16_t __TBB_machine_fetchadd2acquire(volatile void *ptr, int16_t addend);
     int16_t __TBB_machine_fetchadd2release(volatile void *ptr, int16_t addend);
+
+    int32_t __TBB_machine_fetchadd4__TBB_full_fence (volatile void *ptr, int32_t value);
+    int32_t __TBB_machine_fetchadd4acquire(volatile void *ptr, int32_t addend);
+    int32_t __TBB_machine_fetchadd4release(volatile void *ptr, int32_t addend);
+
+    int64_t __TBB_machine_fetchadd8__TBB_full_fence (volatile void *ptr, int64_t value);
+    int64_t __TBB_machine_fetchadd8acquire(volatile void *ptr, int64_t addend);
+    int64_t __TBB_machine_fetchadd8release(volatile void *ptr, int64_t addend);
+
+    int8_t __TBB_machine_fetchstore1__TBB_full_fence (volatile void *ptr, int8_t value);
+    int8_t __TBB_machine_fetchstore1acquire(volatile void *ptr, int8_t value);
+    int8_t __TBB_machine_fetchstore1release(volatile void *ptr, int8_t value);
+
+    int16_t __TBB_machine_fetchstore2__TBB_full_fence (volatile void *ptr, int16_t value);
     int16_t __TBB_machine_fetchstore2acquire(volatile void *ptr, int16_t value);
     int16_t __TBB_machine_fetchstore2release(volatile void *ptr, int16_t value);
 
     int32_t __TBB_machine_fetchstore4__TBB_full_fence (volatile void *ptr, int32_t value);
     int32_t __TBB_machine_fetchstore4acquire(volatile void *ptr, int32_t value);
     int32_t __TBB_machine_fetchstore4release(volatile void *ptr, int32_t value);
-    int32_t __TBB_machine_fetchadd4acquire(volatile void *ptr, int32_t addend);
-    int32_t __TBB_machine_fetchadd4release(volatile void *ptr, int32_t addend);
 
-    int64_t __TBB_machine_cmpswp8__TBB_full_fence (volatile void *ptr, int64_t value, int64_t comparand);
     int64_t __TBB_machine_fetchstore8__TBB_full_fence (volatile void *ptr, int64_t value);
     int64_t __TBB_machine_fetchstore8acquire(volatile void *ptr, int64_t value);
     int64_t __TBB_machine_fetchstore8release(volatile void *ptr, int64_t value);
-    int64_t __TBB_machine_fetchadd8acquire(volatile void *ptr, int64_t addend);
-    int64_t __TBB_machine_fetchadd8release(volatile void *ptr, int64_t addend);
 
+    int8_t __TBB_machine_cmpswp1__TBB_full_fence (volatile void *ptr, int8_t value, int8_t comparand); 
     int8_t __TBB_machine_cmpswp1acquire(volatile void *ptr, int8_t value, int8_t comparand); 
     int8_t __TBB_machine_cmpswp1release(volatile void *ptr, int8_t value, int8_t comparand); 
-    int8_t __TBB_machine_fetchstore1__TBB_full_fence (volatile void *ptr, int8_t value);
 
+    int16_t __TBB_machine_cmpswp2__TBB_full_fence (volatile void *ptr, int16_t value, int16_t comparand);
     int16_t __TBB_machine_cmpswp2acquire(volatile void *ptr, int16_t value, int16_t comparand); 
     int16_t __TBB_machine_cmpswp2release(volatile void *ptr, int16_t value, int16_t comparand); 
-    int16_t __TBB_machine_fetchstore2__TBB_full_fence (volatile void *ptr, int16_t value);
 
     int32_t __TBB_machine_cmpswp4__TBB_full_fence (volatile void *ptr, int32_t value, int32_t comparand);
     int32_t __TBB_machine_cmpswp4acquire(volatile void *ptr, int32_t value, int32_t comparand); 
     int32_t __TBB_machine_cmpswp4release(volatile void *ptr, int32_t value, int32_t comparand); 
-    int32_t __TBB_machine_fetchadd4__TBB_full_fence (volatile void *ptr, int32_t value);
 
+    int64_t __TBB_machine_cmpswp8__TBB_full_fence (volatile void *ptr, int64_t value, int64_t comparand);
     int64_t __TBB_machine_cmpswp8acquire(volatile void *ptr, int64_t value, int64_t comparand); 
     int64_t __TBB_machine_cmpswp8release(volatile void *ptr, int64_t value, int64_t comparand); 
-    int64_t __TBB_machine_fetchadd8__TBB_full_fence (volatile void *ptr, int64_t value);
 
     int64_t __TBB_machine_lg(uint64_t value);
     void __TBB_machine_pause(int32_t delay);
@@ -92,73 +113,71 @@ extern "C" {
 
     //! Retrieves the current RSE backing store pointer. IA64 specific.
     void* __TBB_get_bsp();
-}
-
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1__TBB_full_fence(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2__TBB_full_fence(P,V,C) 
-
-#define __TBB_FetchAndAdd1(P,V)        __TBB_machine_fetchadd1__TBB_full_fence(P,V)
-#define __TBB_FetchAndAdd1acquire(P,V) __TBB_machine_fetchadd1acquire(P,V)
-#define __TBB_FetchAndAdd1release(P,V) __TBB_machine_fetchadd1release(P,V)
-#define __TBB_FetchAndAdd2(P,V)        __TBB_machine_fetchadd2__TBB_full_fence(P,V)
-#define __TBB_FetchAndAdd2acquire(P,V) __TBB_machine_fetchadd2acquire(P,V)
-#define __TBB_FetchAndAdd2release(P,V) __TBB_machine_fetchadd2release(P,V)
-#define __TBB_FetchAndAdd4acquire(P,V) __TBB_machine_fetchadd4acquire(P,V)
-#define __TBB_FetchAndAdd4release(P,V) __TBB_machine_fetchadd4release(P,V)
-#define __TBB_FetchAndAdd8acquire(P,V) __TBB_machine_fetchadd8acquire(P,V)
-#define __TBB_FetchAndAdd8release(P,V) __TBB_machine_fetchadd8release(P,V)
-
-#define __TBB_FetchAndStore1acquire(P,V) __TBB_machine_fetchstore1acquire(P,V)
-#define __TBB_FetchAndStore1release(P,V) __TBB_machine_fetchstore1release(P,V)
-#define __TBB_FetchAndStore2acquire(P,V) __TBB_machine_fetchstore2acquire(P,V)
-#define __TBB_FetchAndStore2release(P,V) __TBB_machine_fetchstore2release(P,V)
-#define __TBB_FetchAndStore4acquire(P,V) __TBB_machine_fetchstore4acquire(P,V)
-#define __TBB_FetchAndStore4release(P,V) __TBB_machine_fetchstore4release(P,V)
-#define __TBB_FetchAndStore8acquire(P,V) __TBB_machine_fetchstore8acquire(P,V)
-#define __TBB_FetchAndStore8release(P,V) __TBB_machine_fetchstore8release(P,V)
-
-#define __TBB_CompareAndSwap1acquire(P,V,C) __TBB_machine_cmpswp1acquire(P,V,C)
-#define __TBB_CompareAndSwap1release(P,V,C) __TBB_machine_cmpswp1release(P,V,C)
-#define __TBB_CompareAndSwap2acquire(P,V,C) __TBB_machine_cmpswp2acquire(P,V,C)
-#define __TBB_CompareAndSwap2release(P,V,C) __TBB_machine_cmpswp2release(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C)        __TBB_machine_cmpswp4__TBB_full_fence(P,V,C)
-#define __TBB_CompareAndSwap4acquire(P,V,C) __TBB_machine_cmpswp4acquire(P,V,C)
-#define __TBB_CompareAndSwap4release(P,V,C) __TBB_machine_cmpswp4release(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C)        __TBB_machine_cmpswp8__TBB_full_fence(P,V,C)
-#define __TBB_CompareAndSwap8acquire(P,V,C) __TBB_machine_cmpswp8acquire(P,V,C)
-#define __TBB_CompareAndSwap8release(P,V,C) __TBB_machine_cmpswp8release(P,V,C)
-
-#define __TBB_FetchAndAdd4(P,V) __TBB_machine_fetchadd4__TBB_full_fence(P,V)
-#define __TBB_FetchAndAdd8(P,V) __TBB_machine_fetchadd8__TBB_full_fence(P,V)
-
-#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1__TBB_full_fence(P,V)
-#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2__TBB_full_fence(P,V)
-#define __TBB_FetchAndStore4(P,V) __TBB_machine_fetchstore4__TBB_full_fence(P,V)
-#define __TBB_FetchAndStore8(P,V) __TBB_machine_fetchstore8__TBB_full_fence(P,V)
-
-#define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAdd8acquire(P,1)
-#define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAdd8release(P,-1)
-
-#ifndef __INTEL_COMPILER
-/* Even though GCC imbues volatile loads with acquire semantics, 
-   it sometimes moves loads over the acquire fence.  The
-   fences defined here stop such incorrect code motion. */
-#define __TBB_release_consistency_helper() __asm__ __volatile__("": : :"memory")
-#define __TBB_full_memory_fence() __asm__ __volatile__("mf": : :"memory")
-#else
-#define __TBB_release_consistency_helper()
-#define __TBB_full_memory_fence() __mf()
-#endif /* __INTEL_COMPILER */
 
-// Special atomic functions
-#define __TBB_CompareAndSwapW(P,V,C)   __TBB_CompareAndSwap8(P,V,C)
-#define __TBB_FetchAndStoreW(P,V)      __TBB_FetchAndStore8(P,V)
-#define __TBB_FetchAndAddW(P,V)        __TBB_FetchAndAdd8(P,V)
-#define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAdd8release(P,V)
-
-// Not needed
-#undef __TBB_Store8
-#undef __TBB_Load8
+    int32_t __TBB_machine_load1_relaxed(const void *ptr);
+    int32_t __TBB_machine_load2_relaxed(const void *ptr);
+    int32_t __TBB_machine_load4_relaxed(const void *ptr);
+    int64_t __TBB_machine_load8_relaxed(const void *ptr);
+
+    void __TBB_machine_store1_relaxed(void *ptr, int32_t value);
+    void __TBB_machine_store2_relaxed(void *ptr, int32_t value);
+    void __TBB_machine_store4_relaxed(void *ptr, int32_t value);
+    void __TBB_machine_store8_relaxed(void *ptr, int64_t value);
+} // extern "C"
+
+// Mapping old entry points to the names corresponding to the new full_fence identifier.
+#define __TBB_machine_fetchadd1full_fence   __TBB_machine_fetchadd1__TBB_full_fence
+#define __TBB_machine_fetchadd2full_fence   __TBB_machine_fetchadd2__TBB_full_fence
+#define __TBB_machine_fetchadd4full_fence   __TBB_machine_fetchadd4__TBB_full_fence
+#define __TBB_machine_fetchadd8full_fence   __TBB_machine_fetchadd8__TBB_full_fence
+#define __TBB_machine_fetchstore1full_fence __TBB_machine_fetchstore1__TBB_full_fence
+#define __TBB_machine_fetchstore2full_fence __TBB_machine_fetchstore2__TBB_full_fence
+#define __TBB_machine_fetchstore4full_fence __TBB_machine_fetchstore4__TBB_full_fence
+#define __TBB_machine_fetchstore8full_fence __TBB_machine_fetchstore8__TBB_full_fence
+#define __TBB_machine_cmpswp1full_fence     __TBB_machine_cmpswp1__TBB_full_fence
+#define __TBB_machine_cmpswp2full_fence     __TBB_machine_cmpswp2__TBB_full_fence 
+#define __TBB_machine_cmpswp4full_fence     __TBB_machine_cmpswp4__TBB_full_fence
+#define __TBB_machine_cmpswp8full_fence     __TBB_machine_cmpswp8__TBB_full_fence
+
+// Mapping relaxed operations to the entry points implementing them.
+/** On IA64 RMW operations implicitly have acquire semantics. Thus one cannot
+    actually have a completely relaxed RMW operation here. **/
+#define __TBB_machine_fetchadd1relaxed      __TBB_machine_fetchadd1acquire
+#define __TBB_machine_fetchadd2relaxed      __TBB_machine_fetchadd2acquire
+#define __TBB_machine_fetchadd4relaxed      __TBB_machine_fetchadd4acquire
+#define __TBB_machine_fetchadd8relaxed      __TBB_machine_fetchadd8acquire
+#define __TBB_machine_fetchstore1relaxed    __TBB_machine_fetchstore1acquire
+#define __TBB_machine_fetchstore2relaxed    __TBB_machine_fetchstore2acquire
+#define __TBB_machine_fetchstore4relaxed    __TBB_machine_fetchstore4acquire
+#define __TBB_machine_fetchstore8relaxed    __TBB_machine_fetchstore8acquire
+#define __TBB_machine_cmpswp1relaxed        __TBB_machine_cmpswp1acquire
+#define __TBB_machine_cmpswp2relaxed        __TBB_machine_cmpswp2acquire 
+#define __TBB_machine_cmpswp4relaxed        __TBB_machine_cmpswp4acquire
+#define __TBB_machine_cmpswp8relaxed        __TBB_machine_cmpswp8acquire
+
+#define __TBB_MACHINE_DEFINE_ATOMICS(S,V)                               \
+    template <typename T>                                               \
+    struct machine_load_store_relaxed<T,S> {                      \
+        static inline T load ( const T& location ) {                    \
+            return (T)__TBB_machine_load##S##_relaxed(&location);       \
+        }                                                               \
+        static inline void store ( T& location, T value ) {             \
+            __TBB_machine_store##S##_relaxed(&location, (V)value);      \
+        }                                                               \
+    }
+
+namespace tbb {
+namespace internal {
+    __TBB_MACHINE_DEFINE_ATOMICS(1,int8_t);
+    __TBB_MACHINE_DEFINE_ATOMICS(2,int16_t);
+    __TBB_MACHINE_DEFINE_ATOMICS(4,int32_t);
+    __TBB_MACHINE_DEFINE_ATOMICS(8,int64_t);
+}} // namespaces internal, tbb
+
+#undef __TBB_MACHINE_DEFINE_ATOMICS
+
+#define __TBB_USE_FENCED_ATOMICS 1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE 1
 
 // Definition of Lock functions
 #define __TBB_TryLockByte(P) __TBB_machine_trylockbyte(P)
@@ -167,4 +186,3 @@ extern "C" {
 // Definition of other utility functions
 #define __TBB_Pause(V) __TBB_machine_pause(V)
 #define __TBB_Log2(V)  __TBB_machine_lg(V)
-
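For readability, the S=4 expansion of the __TBB_MACHINE_DEFINE_ATOMICS macro above, unrolled by hand; in the header it sits inside namespace tbb { namespace internal { ... } } and specializes the machine_load_store_relaxed template declared in tbb_machine.h:

    template <typename T>
    struct machine_load_store_relaxed<T,4> {
        static inline T load( const T& location ) {
            return (T)__TBB_machine_load4_relaxed(&location);
        }
        static inline void store( T& location, T value ) {
            __TBB_machine_store4_relaxed(&location, (int32_t)value);
        }
    };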
diff --git a/tbb/include/tbb/machine/linux_intel64.h b/tbb/include/tbb/machine/linux_intel64.h
index 8d0576256b392e2e558ed0b54c49671b97941678..353ba09fa5bc1e9039ec28fdc579d56f5bbb2468 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_linux_intel64_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_linux_intel64_H
+
 #include <stdint.h>
 #include <unistd.h>
 
 #define __TBB_WORDSIZE 8
 #define __TBB_BIG_ENDIAN 0
 
-#define __TBB_release_consistency_helper() __asm__ __volatile__("": : :"memory")
+#define __TBB_compiler_fence() __asm__ __volatile__("": : :"memory")
+#define __TBB_control_consistency_helper() __TBB_compiler_fence()
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+#define __TBB_release_consistency_helper() __TBB_compiler_fence()
 
-// __TBB_full_memory_fence can be predefined
 #ifndef __TBB_full_memory_fence
 #define __TBB_full_memory_fence() __asm__ __volatile__("mfence": : :"memory")
 #endif
 
-#define __MACHINE_DECL_ATOMICS(S,T,X) \
+#define __TBB_MACHINE_DEFINE_ATOMICS(S,T,X)                                          \
 static inline T __TBB_machine_cmpswp##S (volatile void *ptr, T value, T comparand )  \
 {                                                                                    \
     T result;                                                                        \
@@ -75,10 +79,12 @@ static inline  T __TBB_machine_fetchstore##S(volatile void *ptr, T value)
     return result;                                                                   \
 }                                                                                    \
                                                                                      
-__MACHINE_DECL_ATOMICS(1,int8_t,"")
-__MACHINE_DECL_ATOMICS(2,int16_t,"")
-__MACHINE_DECL_ATOMICS(4,int32_t,"")
-__MACHINE_DECL_ATOMICS(8,int64_t,"q")
+__TBB_MACHINE_DEFINE_ATOMICS(1,int8_t,"")
+__TBB_MACHINE_DEFINE_ATOMICS(2,int16_t,"")
+__TBB_MACHINE_DEFINE_ATOMICS(4,int32_t,"")
+__TBB_MACHINE_DEFINE_ATOMICS(8,int64_t,"q")
+
+#undef __TBB_MACHINE_DEFINE_ATOMICS
 
 static inline int64_t __TBB_machine_lg( uint64_t x ) {
     int64_t j;
@@ -94,29 +100,6 @@ static inline void __TBB_machine_and( volatile void *ptr, uint64_t addend ) {
     __asm__ __volatile__("lock\nandq %1,%0" : "=m"(*(volatile uint64_t*)ptr) : "r"(addend), "m"(*(volatile uint64_t*)ptr) : "memory");
 }
 
-// Machine specific atomic operations
-
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-
-#define __TBB_FetchAndAdd1(P,V) __TBB_machine_fetchadd1(P,V)
-#define __TBB_FetchAndAdd2(P,V) __TBB_machine_fetchadd2(P,V)
-#define __TBB_FetchAndAdd4(P,V) __TBB_machine_fetchadd4(P,V)
-#define __TBB_FetchAndAdd8(P,V)  __TBB_machine_fetchadd8(P,V)
-#define __TBB_FetchAndAddW(P,V)  __TBB_machine_fetchadd8(P,V)
-
-#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1(P,V)
-#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2(P,V)
-#define __TBB_FetchAndStore4(P,V) __TBB_machine_fetchstore4(P,V)
-#define __TBB_FetchAndStore8(P,V)  __TBB_machine_fetchstore8(P,V)
-#define __TBB_FetchAndStoreW(P,V)  __TBB_machine_fetchstore8(P,V)
-
-#undef __TBB_Store8
-#undef __TBB_Load8
-
 #define __TBB_AtomicOR(P,V) __TBB_machine_or(P,V)
 #define __TBB_AtomicAND(P,V) __TBB_machine_and(P,V)
 
@@ -129,17 +112,13 @@ static inline void __TBB_machine_pause( int32_t delay ) {
     return;
 }
 #define __TBB_Pause(V) __TBB_machine_pause(V)
-#endif
-#define __TBB_Log2(V)    __TBB_machine_lg(V)
+#endif /* !__TBB_Pause */
 
-// Special atomic functions
-#define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAddW(P,V)
-#define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1)
-#define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,-1)
+#define __TBB_Log2(V)  __TBB_machine_lg(V)
 
-// Use generic definitions from tbb_machine.h
-#undef __TBB_TryLockByte
-#undef __TBB_LockByte
+#define __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE   1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
 
 // API to retrieve/update FPU control setting
 #ifndef __TBB_CPU_CTL_ENV_PRESENT
@@ -151,11 +130,21 @@ struct __TBB_cpu_ctl_env_t {
 };
 
 inline void __TBB_get_cpu_ctl_env ( __TBB_cpu_ctl_env_t* ctl ) {
+#if __TBB_ICC_12_0_INL_ASM_FSTCW_BROKEN
+    __TBB_cpu_ctl_env_t loc_ctl;
+    __asm__ __volatile__ (
+            "stmxcsr %0\n\t"
+            "fstcw %1"
+            : "=m"(loc_ctl.mxcsr), "=m"(loc_ctl.x87cw)
+    );
+    *ctl = loc_ctl;
+#else
     __asm__ __volatile__ (
             "stmxcsr %0\n\t"
             "fstcw %1"
             : "=m"(ctl->mxcsr), "=m"(ctl->x87cw)
     );
+#endif
 }
 inline void __TBB_set_cpu_ctl_env ( const __TBB_cpu_ctl_env_t* ctl ) {
     __asm__ __volatile__ (
@@ -164,4 +153,4 @@ inline void __TBB_set_cpu_ctl_env ( const __TBB_cpu_ctl_env_t* ctl ) {
             : : "m"(ctl->mxcsr), "m"(ctl->x87cw)
     );
 }
-#endif
+#endif /* !__TBB_CPU_CTL_ENV_PRESENT */
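What the bsr-based __TBB_machine_lg() above computes: BSR returns the bit index of the most significant set bit, i.e. floor(log2(x)) for x > 0 (the result is undefined for x == 0, as in the header). A standalone copy for reference:

    #include <stdint.h>

    static inline int64_t log2_floor_demo( uint64_t x ) {
        int64_t j;
        __asm__ ("bsr %1,%0" : "=r"(j) : "r"(x));   // e.g. x = 1 -> 0, x = 2 -> 1, x = 1000 -> 9
        return j;
    }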
diff --git a/tbb/include/tbb/machine/mac_ppc.h b/tbb/include/tbb/machine/mac_ppc.h
index 311403927ab9b115956b95d433c5f76bef72eff3..ad154fdee5d6976265442a9a88733eb17684d7a7 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_gcc_power_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_gcc_power_H
+
 #include <stdint.h>
 #include <unistd.h>
 
-// This file is for PowerPC with compilers supporting GNU inline-assembler syntax (currently GNU g++ and IBM XL).
+// TODO: rename to gcc_power.h?
+// This file is for Power Architecture with compilers supporting GNU inline-assembler syntax (currently GNU g++ and IBM XL).
+// Note that XL V9.0 (sometimes?) has trouble dealing with empty input and/or clobber lists, so they should be avoided.
+
+#if __powerpc64__ || __ppc64__
+    // IBM XL documents __powerpc64__ (and __PPC64__).
+    // Apple documents __ppc64__ (with __ppc__ only on 32-bit).
+    #define __TBB_WORDSIZE 8
+#else
+    #define __TBB_WORDSIZE 4
+#endif
+
+// On Power Architecture, (lock-free) 64-bit atomics require 64-bit hardware:
+#if __TBB_WORDSIZE==8
+    // Do not change the following definition, because TBB itself will use 64-bit atomics in 64-bit builds.
+    #define __TBB_64BIT_ATOMICS 1
+#elif __bgp__
+    // Do not change the following definition on known 32-bit hardware.
+    #define __TBB_64BIT_ATOMICS 0
+#else
+    // To enable 64-bit atomics in 32-bit builds, set the value below to 1 instead of 0.
+    // You must make certain that the program will only use them on actual 64-bit hardware
+    // (which typically means that the entire program is only executed on such hardware),
+    // because their implementation involves machine instructions that are illegal elsewhere.
+    // The setting can be chosen independently per compilation unit,
+    // which also means that TBB itself does not need to be rebuilt.
+    // Alternatively (but only for the current architecture and TBB version),
+    // override the default as a predefined macro when invoking the compiler.
+    #ifndef __TBB_64BIT_ATOMICS
+    #define __TBB_64BIT_ATOMICS 0
+    #endif
+#endif
 
-// Motivation for use of "#if defined(__powerpc64__) || defined(__ppc64__)" to detect a 64-bit environment:
-// IBM XL documents both __powerpc64__ and __PPC64__, and these also appear to work on g++ (documentation?)
-// Apple documents __ppc64__ (with __ppc__ only 32-bit, which is not portable even to other environments using g++)
 inline int32_t __TBB_machine_cmpswp4 (volatile void *ptr, int32_t value, int32_t comparand )
 {
     int32_t result;
 
     __asm__ __volatile__("sync\n"
-                         "0: lwarx %0,0,%2\n\t"  /* load w/ reservation */
-                         "cmpw %0,%4\n\t"        /* compare against comparand */
-                         "bne- 1f\n\t"           /* exit if not same */
-                         "stwcx. %3,0,%2\n\t"    /* store new_value */
-                         "bne- 0b\n"             /* retry if reservation lost */
-                         "1: sync"               /* the exit */
-                          : "=&r"(result), "=m"(* (int32_t*) ptr)
-                          : "r"(ptr), "r"(value), "r"(comparand), "m"(* (int32_t*) ptr)
-                          : "cr0", "memory");
+                         "0:\n\t"
+                         "lwarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
+                         "cmpw %[res],%[cmp]\n\t"        /* compare against comparand */
+                         "bne- 1f\n\t"                   /* exit if not same */
+                         "stwcx. %[val],0,%[ptr]\n\t"    /* store new value */
+                         "bne- 0b\n"                     /* retry if reservation lost */
+                         "1:\n\t"                        /* the exit */
+                         "isync"
+                         : [res]"=&r"(result)
+                         , "+m"(* (int32_t*) ptr)        /* redundant with "memory" */
+                         : [ptr]"r"(ptr)
+                         , [val]"r"(value)
+                         , [cmp]"r"(comparand)
+                         : "memory"                      /* compiler full fence */
+                         , "cr0"                         /* clobbered by cmp and/or stwcx. */
+                         );
     return result;
 }
 
-#if defined(__powerpc64__) || defined(__ppc64__)
+#if __TBB_WORDSIZE==8
 
 inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
 {
     int64_t result;
     __asm__ __volatile__("sync\n"
-                         "0: ldarx %0,0,%2\n\t"  /* load w/ reservation */
-                         "cmpd %0,%4\n\t"        /* compare against comparand */
-                         "bne- 1f\n\t"           /* exit if not same */
-                         "stdcx. %3,0,%2\n\t"    /* store new_value */
-                         "bne- 0b\n"             /* retry if reservation lost */
-                         "1: sync"               /* the exit */
-                          : "=&r"(result), "=m"(* (int64_t*) ptr)
-                          : "r"(ptr), "r"(value), "r"(comparand), "m"(* (int64_t*) ptr)
-                          : "cr0", "memory");
+                         "0:\n\t"
+                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
+                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
+                         "bne- 1f\n\t"                   /* exit if not same */
+                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
+                         "bne- 0b\n"                     /* retry if reservation lost */
+                         "1:\n\t"                        /* the exit */
+                         "isync"
+                         : [res]"=&r"(result)
+                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
+                         : [ptr]"r"(ptr)
+                         , [val]"r"(value)
+                         , [cmp]"r"(comparand)
+                         : "memory"                      /* compiler full fence */
+                         , "cr0"                         /* clobbered by cmp and/or stdcx. */
+                         );
     return result;
 }
-#else
-// Except for special circumstances, 32-bit builds are meant to run on actual 32-bit hardware
-// A locked implementation would also be a possibility
-#define __TBB_64BIT_ATOMICS 0
-#endif /* 64bit CAS */
 
-#define __TBB_BIG_ENDIAN 1
+#elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */
 
-#if defined(__powerpc64__) || defined(__ppc64__)
-#define __TBB_WORDSIZE 8
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#else
-#define __TBB_WORDSIZE 4
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#endif
+inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
+{
+    int64_t result;
+    int64_t value_register, comparand_register, result_register; // dummy variables to allocate registers
+    __asm__ __volatile__("sync\n\t"
+                         "ld %[val],%[valm]\n\t"
+                         "ld %[cmp],%[cmpm]\n"
+                         "0:\n\t"
+                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
+                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
+                         "bne- 1f\n\t"                   /* exit if not same */
+                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
+                         "bne- 0b\n"                     /* retry if reservation lost */
+                         "1:\n\t"                        /* the exit */
+                         "std %[res],%[resm]\n\t"
+                         "isync"
+                         : [resm]"=m"(result)
+                         , [res] "=&r"(   result_register)
+                         , [val] "=&r"(    value_register)
+                         , [cmp] "=&r"(comparand_register)
+                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
+                         : [ptr] "r"(ptr)
+                         , [valm]"m"(value)
+                         , [cmpm]"m"(comparand)
+                         : "memory"                      /* compiler full fence */
+                         , "cr0"                         /* clobbered by cmpd and/or stdcx. */
+                         );
+    return result;
+}
+#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
 
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#if __TBB_64BIT_ATOMICS
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#endif
-#define __TBB_full_memory_fence() __asm__ __volatile__("sync": : :"memory")
-#define __TBB_release_consistency_helper() __asm__ __volatile__("lwsync": : :"memory")
+#define __TBB_MACHINE_DEFINE_LOAD_STORE(S,load,store,compare)                                                 \
+    template <typename T>                                                                                     \
+    struct machine_load_store<T,S> {                                                                          \
+        static inline T load_with_acquire(const volatile T& location) {                                       \
+            T result;                                                                                         \
+            __asm__ __volatile__(load " %[res],0(%[ptr])\n"                                                   \
+                                 "0:\n\t"                                                                     \
+                                 compare " %[res],%[res]\n\t"                                                 \
+                                 "bne- 0b\n\t"                                                                \
+                                 "isync"                                                                      \
+                                 : [res]"=r"(result)                                                          \
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
+                                 , "m"(location)       /* redundant with "memory" */                          \
+                                 : "memory"            /* compiler acquire fence */                           \
+                                 , "cr0"               /* clobbered by cmpw/cmpd */);                         \
+            return result;                                                                                    \
+        }                                                                                                     \
+        static inline void store_with_release(volatile T &location, T value) {                                \
+            __asm__ __volatile__("lwsync\n\t"                                                                 \
+                                 store " %[val],0(%[ptr])"                                                    \
+                                 : "=m"(location)      /* redundant with "memory" */                          \
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
+                                 , [val]"r"(value)                                                            \
+                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/);                \
+        }                                                                                                     \
+    };                                                                                                        \
+                                                                                                              \
+    template <typename T>                                                                                     \
+    struct machine_load_store_relaxed<T,S> {                                                            \
+        static inline T load (const __TBB_atomic T& location) {                                               \
+            T result;                                                                                         \
+            __asm__ __volatile__(load " %[res],0(%[ptr])"                                                     \
+                                 : [res]"=r"(result)                                                          \
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
+                                 , "m"(location)                                                              \
+                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/                            \
+            return result;                                                                                    \
+        }                                                                                                     \
+        static inline void store (__TBB_atomic T &location, T value) {                                        \
+            __asm__ __volatile__(store " %[val],0(%[ptr])"                                                    \
+                                 : "=m"(location)                                                             \
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
+                                 , [val]"r"(value)                                                            \
+                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/                            \
+        }                                                                                                     \
+    };
+
+namespace tbb {
+namespace internal {
+    __TBB_MACHINE_DEFINE_LOAD_STORE(1,"lbz","stb","cmpw")
+    __TBB_MACHINE_DEFINE_LOAD_STORE(2,"lhz","sth","cmpw")
+    __TBB_MACHINE_DEFINE_LOAD_STORE(4,"lwz","stw","cmpw")
+
+#if __TBB_WORDSIZE==8
+
+    __TBB_MACHINE_DEFINE_LOAD_STORE(8,"ld" ,"std","cmpd")
+
+#elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */
+
+    template <typename T>
+    struct machine_load_store<T,8> {
+        static inline T load_with_acquire(const volatile T& location) {
+            T result;
+            T result_register; // dummy variable to allocate a register
+            __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
+                                 "std %[res],%[resm]\n"
+                                 "0:\n\t"
+                                 "cmpd %[res],%[res]\n\t"
+                                 "bne- 0b\n\t"
+                                 "isync"
+                                 : [resm]"=m"(result)
+                                 , [res]"=&r"(result_register)
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */
+                                 , "m"(location)       /* redundant with "memory" */
+                                 : "memory"            /* compiler acquire fence */
+                                 , "cr0"               /* clobbered by cmpd */);
+            return result;
+        }
+
+        static inline void store_with_release(volatile T &location, T value) {
+            T value_register; // dummy variable to allocate a register
+            __asm__ __volatile__("lwsync\n\t"
+                                 "ld %[val],%[valm]\n\t"
+                                 "std %[val],0(%[ptr])"
+                                 : "=m"(location)      /* redundant with "memory" */
+                                 , [val]"=&r"(value_register)
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */
+                                 , [valm]"m"(value)
+                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/);
+        }
+    };
+
+    template <typename T>
+    struct machine_load_store_relaxed<T,8> {
+        static inline T load (const volatile T& location) {
+            T result;
+            T result_register; // dummy variable to allocate a register
+            __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
+                                 "std %[res],%[resm]"
+                                 : [resm]"=m"(result)
+                                 , [res]"=&r"(result_register)
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */
+                                 , "m"(location)
+                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/
+            return result;
+        }
+
+        static inline void store (volatile T &location, T value) {
+            T value_register; // dummy variable to allocate a register
+            __asm__ __volatile__("ld %[val],%[valm]\n\t"
+                                 "std %[val],0(%[ptr])"
+                                 : "=m"(location)
+                                 , [val]"=&r"(value_register)
+                                 : [ptr]"b"(&location) /* cannot use register 0 here */
+                                 , [valm]"m"(value)
+                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/
+        }
+    };
+    #define __TBB_machine_load_store_relaxed_8
+
+#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
+
+}} // namespaces internal, tbb
+
+#undef __TBB_MACHINE_DEFINE_LOAD_STORE
+
+#define __TBB_USE_GENERIC_PART_WORD_CAS 1
+#define __TBB_USE_GENERIC_FETCH_ADD     1
+#define __TBB_USE_GENERIC_FETCH_STORE   1
+
+#define __TBB_control_consistency_helper() __asm__ __volatile__("isync": : :"memory")
+#define __TBB_full_memory_fence()          __asm__ __volatile__( "sync": : :"memory")
 
-#if !__IBMCPP__
-// "1501-230 (S) Internal compiler error; please contact your Service Representative"
 static inline intptr_t __TBB_machine_lg( uintptr_t x ) {
-    // TODO: assumes sizeof(uintptr_t)<=8 resp. 4
-    #if defined(__powerpc64__) || defined(__ppc64__)
-    __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x)); // counting starts at 2^63
+    // cntlzd/cntlzw start counting at 2^63/2^31 respectively (higher-order bits are ignored), and neither affects cr0
+#if __TBB_WORDSIZE==8
+    __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
     return 63-static_cast<intptr_t>(x);
-    #else
-    __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x)); // counting starts at 2^31 (on 64-bit hardware, higher-order bits are ignored)
+#else
+    __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
     return 31-static_cast<intptr_t>(x);
-    #endif
+#endif
 }
 #define __TBB_Log2(V) __TBB_machine_lg(V)
-#endif
 
-#define __TBB_Byte uint32_t // TODO: would this ever not be aligned without an alignment specification?
+// Assumes implicit alignment for any 32-bit value
+typedef uint32_t __TBB_Flag;
+#define __TBB_Flag __TBB_Flag
 
-inline bool __TBB_machine_trylockbyte( __TBB_Byte &flag ) {
+inline bool __TBB_machine_trylockbyte( __TBB_atomic __TBB_Flag &flag ) {
     return __TBB_machine_cmpswp4(&flag,1,0)==0;
 }
 #define __TBB_TryLockByte(P) __TBB_machine_trylockbyte(P)
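
Reviewer note: __TBB_machine_trylockbyte above makes a single CAS attempt from 0 to 1; the generic layer in tbb_machine.h is expected to spin on it. A purely illustrative sketch of such a caller (example_spin_lock is a hypothetical name, not part of this patch):

    // Hypothetical caller: busy-wait until the CAS performed by
    // __TBB_machine_trylockbyte succeeds (flag goes from 0 to 1).
    inline void example_spin_lock( __TBB_atomic __TBB_Flag &flag ) {
        while( !__TBB_TryLockByte(flag) )
            ;   // spin; a real implementation would back off (pause/yield)
    }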
index c0e8799b0aa626bdb4d4def27bb217f5b5936659..ec0b0f089d85462700fd1f18067c74ac9ed4e8fa 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_macos_common_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_macos_common_H
+
 #include <sched.h>
 #define __TBB_Yield()  sched_yield()
 
-
 // __TBB_HardwareConcurrency
 
 #include <sys/types.h>
@@ -49,40 +50,21 @@ static inline int __TBB_macos_available_cpu() {
 
 #define __TBB_HardwareConcurrency() __TBB_macos_available_cpu()
 
-
-#ifndef __TBB_WORDSIZE
-#define __TBB_WORDSIZE 4
-#endif
-
-#ifndef __TBB_BIG_ENDIAN
-#if __BIG_ENDIAN__
-#define __TBB_BIG_ENDIAN 1
-#else
-#define __TBB_BIG_ENDIAN 0
-#endif
+#ifndef __TBB_full_memory_fence
+    // TBB has not recognized the architecture (none of the architecture abstraction
+    // headers was included).
+    #define __TBB_UnknownArchitecture 1
 #endif
 
-
-#if !defined(__TBB_CompareAndSwap4) || !defined(__TBB_CompareAndSwap8)
+#if __TBB_UnknownArchitecture || __TBB_WORDSIZE==4
+// In case of IA32 this is a workaround for compiler bugs triggered by inline
+// assembly implementation of __TBB_machine_cmpswp8 in linux_ia32.h, which may
+// lead to incorrect codegen (gcc) or compilation failures (any icc including 12.0.4).
 
 // Implementation of atomic operations based on OS provided primitives
 #include <libkern/OSAtomic.h>
 
-#define __TBB_release_consistency_helper() OSMemoryBarrier()
-#define __TBB_full_memory_fence()          OSMemoryBarrier()
-
-static inline int32_t __TBB_macos_cmpswp4(volatile void *ptr, int32_t value, int32_t comparand)
-{
-    __TBB_ASSERT( !((uintptr_t)ptr&0x3), "address not properly aligned for Mac OS atomics");
-    int32_t* address = (int32_t*)ptr;
-    while( !OSAtomicCompareAndSwap32Barrier(comparand, value, address) ){
-        int32_t snapshot = *address;
-        if( snapshot!=comparand ) return snapshot;
-    }
-    return comparand;
-}
-
-static inline int64_t __TBB_macos_cmpswp8(volatile void *ptr, int64_t value, int64_t comparand)
+static inline int64_t __TBB_machine_cmpswp8_OsX(volatile void *ptr, int64_t value, int64_t comparand)
 {
     __TBB_ASSERT( !((uintptr_t)ptr&0x7), "address not properly aligned for Mac OS atomics");
     int64_t* address = (int64_t*)ptr;
@@ -97,30 +79,58 @@ static inline int64_t __TBB_macos_cmpswp8(volatile void *ptr, int64_t value, int
     return comparand;
 }
 
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_macos_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_macos_cmpswp8(P,V,C)
+#define __TBB_machine_cmpswp8 __TBB_machine_cmpswp8_OsX
 
-static inline int32_t __TBB_macos_fetchadd4(volatile void *ptr, int32_t addend)
+#endif /* __TBB_UnknownArchitecture || __TBB_WORDSIZE==4 */
+
+#if __TBB_UnknownArchitecture
+
+#ifndef __TBB_WORDSIZE
+#define __TBB_WORDSIZE 4
+#endif
+
+#define __TBB_BIG_ENDIAN __BIG_ENDIAN__
+
+/** As this generic implementation has absolutely no information about the underlying
+    hardware, its performance will most likely be sub-optimal, because it uses full
+    memory fences where lighter-weight synchronization (or none at all) could suffice.
+    Thus if you use this header to enable TBB on a new platform, consider forking it
+    and relaxing the helpers below as appropriate. **/
+#define __TBB_control_consistency_helper() OSMemoryBarrier()
+#define __TBB_acquire_consistency_helper() OSMemoryBarrier()
+#define __TBB_release_consistency_helper() OSMemoryBarrier()
+#define __TBB_full_memory_fence()          OSMemoryBarrier()
+
+static inline int32_t __TBB_machine_cmpswp4(volatile void *ptr, int32_t value, int32_t comparand)
+{
+    __TBB_ASSERT( !((uintptr_t)ptr&0x3), "address not properly aligned for Mac OS atomics");
+    int32_t* address = (int32_t*)ptr;
+    while( !OSAtomicCompareAndSwap32Barrier(comparand, value, address) ){
+        int32_t snapshot = *address;
+        if( snapshot!=comparand ) return snapshot;
+    }
+    return comparand;
+}
+
+static inline int32_t __TBB_machine_fetchadd4(volatile void *ptr, int32_t addend)
 {
     __TBB_ASSERT( !((uintptr_t)ptr&0x3), "address not properly aligned for Mac OS atomics");
     return OSAtomicAdd32Barrier(addend, (int32_t*)ptr) - addend;
 }
 
-static inline int64_t __TBB_macos_fetchadd8(volatile void *ptr, int64_t addend)
+static inline int64_t __TBB_machine_fetchadd8(volatile void *ptr, int64_t addend)
 {
     __TBB_ASSERT( !((uintptr_t)ptr&0x7), "address not properly aligned for Mac OS atomics");
     return OSAtomicAdd64Barrier(addend, (int64_t*)ptr) - addend;
 }
 
-#define __TBB_FetchAndAdd4(P,V) __TBB_macos_fetchadd4(P,V)
-#define __TBB_FetchAndAdd8(P,V) __TBB_macos_fetchadd8(P,V)
-
-#if __TBB_WORDSIZE==4
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_CompareAndSwap4(P,V,C)
-#define __TBB_FetchAndAddW(P,V) __TBB_FetchAndAdd4(P,V)
-#else
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_CompareAndSwap8(P,V,C)
-#define __TBB_FetchAndAddW(P,V) __TBB_FetchAndAdd8(P,V)
+#define __TBB_USE_GENERIC_PART_WORD_CAS             1
+#define __TBB_USE_GENERIC_PART_WORD_FETCH_ADD       1
+#define __TBB_USE_GENERIC_FETCH_STORE               1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
+#if __TBB_WORDSIZE == 4
+    #define __TBB_USE_GENERIC_DWORD_LOAD_STORE      1
 #endif
 
-#endif /* !defined(__TBB_CompareAndSwap4) || !defined(__TBB_CompareAndSwap8) */
+#endif /* __TBB_UnknownArchitecture */
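
Reviewer note: the comment above invites ports for known hardware to relax the generic OSMemoryBarrier-based helpers. As a hypothetical illustration (not part of this patch), a strongly-ordered target could downgrade the acquire/release helpers to plain compiler fences, much like the SPARC header below does:

    // Hypothetical relaxation for a strongly-ordered architecture (illustrative only):
    // #define __TBB_acquire_consistency_helper() __asm__ __volatile__("": : :"memory")
    // #define __TBB_release_consistency_helper() __asm__ __volatile__("": : :"memory")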
index ca228fadb47675f0c45d158860c48bee452967c9..47f27a76f25d0fc4c04f3d9a7f2071c5ac99edd7 100644 (file)
 */
 
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_sunos_sparc_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_sunos_sparc_H
+
 #include <stdint.h>
 #include <unistd.h>
 
 #define __TBB_WORDSIZE 8
 #define __TBB_BIG_ENDIAN 1
 
-#define __TBB_release_consistency_helper() __asm__ __volatile__ ("": : :"memory")
-#define __TBB_full_memory_fence() __asm__ __volatile__("membar #LoadLoad|#LoadStore|#StoreStore|#StoreLoad": : : "memory")
+/** For those working on SPARC hardware: consider relaxing the acquire and release
+    consistency helpers to no-ops (this port covers TSO mode only). **/
+#define __TBB_compiler_fence()             __asm__ __volatile__ ("": : :"memory")
+#define __TBB_control_consistency_helper() __TBB_compiler_fence()
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+#define __TBB_release_consistency_helper() __TBB_compiler_fence()
+#define __TBB_full_memory_fence()          __asm__ __volatile__("membar #LoadLoad|#LoadStore|#StoreStore|#StoreLoad": : : "memory")
 
 //--------------------------------------------------
 // Compare and swap
@@ -184,45 +191,17 @@ static inline bool __TBB_machine_trylockbyte(unsigned char &flag){
     return result == 0;
 }
 
-
-// Machine specific atomic operations
-
-//#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1(P,V,C)  // use generic version in tbb_machine.h
-//#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2(P,V,C)  // use generic version in tbb_machine.h
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-
-//#define __TBB_FetchAndAdd1(P,V) __TBB_machine_fetchadd1(P,V)       // use generic version in tbb_machine.h
-//#define __TBB_FetchAndAdd2(P,V) __TBB_machine_fetchadd2(P,V)       // use generic version in tbb_machine.h
-#define __TBB_FetchAndAdd4(P,V) __TBB_machine_fetchadd4(P,V)
-#define __TBB_FetchAndAdd8(P,V)  __TBB_machine_fetchadd8(P,V)
-#define __TBB_FetchAndAddW(P,V)  __TBB_machine_fetchadd8(P,V)
-
-// use generic version in tbb_machine.h
-//#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1(P,V)  
-//#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2(P,V)
-//#define __TBB_FetchAndStore4(P,V) __TBB_machine_fetchstore4(P,V)
-//#define __TBB_FetchAndStore8(P,V)  __TBB_machine_fetchstore8(P,V)
-//#define __TBB_FetchAndStoreW(P,V)  __TBB_machine_fetchstore8(P,V)
-
-#undef __TBB_Store8
-#undef __TBB_Load8
+#define __TBB_USE_GENERIC_PART_WORD_CAS             1
+#define __TBB_USE_GENERIC_PART_WORD_FETCH_ADD       1
+#define __TBB_USE_GENERIC_FETCH_STORE               1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
 
 #define __TBB_AtomicOR(P,V) __TBB_machine_or(P,V)
 #define __TBB_AtomicAND(P,V) __TBB_machine_and(P,V)
 
 // Definition of other functions
 #define __TBB_Pause(V) __TBB_machine_pause(V)
-#define __TBB_Log2(V)    __TBB_machine_lg(V)
-
-// Special atomic functions
-#define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAddW(P,V)
-#define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1)
-#define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,-1)
-
-// Definition of Lock functions
-// Repeatedly runs TryLockByte, no need to implement
-#undef __TBB_LockByte
+#define __TBB_Log2(V)  __TBB_machine_lg(V)
 
 #define __TBB_TryLockByte(P) __TBB_machine_trylockbyte(P)
index 22dbddd6426993a3a3f0c9a1d01ca19ed3f21744..9657dd74ec865a2fd58a6523e5a8eb83d302236e 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_windows_ia32_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
-#if defined(__INTEL_COMPILER)
-#define __TBB_release_consistency_helper() __asm { __asm nop }
+#define __TBB_machine_windows_ia32_H
+
+#define __TBB_WORDSIZE 4
+#define __TBB_BIG_ENDIAN 0
+
+#if __INTEL_COMPILER
+    #define __TBB_compiler_fence() __asm { __asm nop }
 #elif _MSC_VER >= 1300
-extern "C" void _ReadWriteBarrier();
-#pragma intrinsic(_ReadWriteBarrier)
-#define __TBB_release_consistency_helper() _ReadWriteBarrier()
+    extern "C" void _ReadWriteBarrier();
+    #pragma intrinsic(_ReadWriteBarrier)
+    #define __TBB_compiler_fence() _ReadWriteBarrier()
 #else
-#error Unsupported compiler - need to define __TBB_release_consistency_helper to support it
+    #error Unsupported compiler - need to define __TBB_{control,acquire,release}_consistency_helper to support it
 #endif
 
-#define __TBB_full_memory_fence() __asm { __asm mfence }
-
-#define __TBB_WORDSIZE 4
-#define __TBB_BIG_ENDIAN 0
+#define __TBB_control_consistency_helper() __TBB_compiler_fence()
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+#define __TBB_release_consistency_helper() __TBB_compiler_fence()
+#define __TBB_full_memory_fence()          __asm { __asm mfence }
 
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
     // Workaround for overzealous compiler warnings in /Wp64 mode
@@ -59,11 +64,10 @@ extern "C" {
     __int64 __TBB_EXPORTED_FUNC __TBB_machine_load8 (const volatile void *ptr);
 }
 
-#define __TBB_DEFINE_ATOMICS(S,T,U,A,C) \
+#define __TBB_MACHINE_DEFINE_ATOMICS(S,T,U,A,C) \
 static inline T __TBB_machine_cmpswp##S ( volatile void * ptr, U value, U comparand ) { \
     T result; \
     volatile T *p = (T *)ptr; \
-    __TBB_release_consistency_helper(); \
     __asm \
     { \
        __asm mov edx, p \
@@ -72,14 +76,12 @@ static inline T __TBB_machine_cmpswp##S ( volatile void * ptr, U value, U compar
        __asm lock cmpxchg [edx], C \
        __asm mov result, A \
     } \
-    __TBB_release_consistency_helper(); \
     return result; \
 } \
 \
 static inline T __TBB_machine_fetchadd##S ( volatile void * ptr, U addend ) { \
     T result; \
     volatile T *p = (T *)ptr; \
-    __TBB_release_consistency_helper(); \
     __asm \
     { \
         __asm mov edx, p \
@@ -87,14 +89,12 @@ static inline T __TBB_machine_fetchadd##S ( volatile void * ptr, U addend ) { \
         __asm lock xadd [edx], A \
         __asm mov result, A \
     } \
-    __TBB_release_consistency_helper(); \
     return result; \
 }\
 \
 static inline T __TBB_machine_fetchstore##S ( volatile void * ptr, U value ) { \
     T result; \
     volatile T *p = (T *)ptr; \
-    __TBB_release_consistency_helper(); \
     __asm \
     { \
         __asm mov edx, p \
@@ -102,14 +102,15 @@ static inline T __TBB_machine_fetchstore##S ( volatile void * ptr, U value ) { \
         __asm lock xchg [edx], A \
         __asm mov result, A \
     } \
-    __TBB_release_consistency_helper(); \
     return result; \
 }
 
-__TBB_DEFINE_ATOMICS(1, __int8, __int8, al, cl)
-__TBB_DEFINE_ATOMICS(2, __int16, __int16, ax, cx)
-__TBB_DEFINE_ATOMICS(4, __int32, __int32, eax, ecx)
-__TBB_DEFINE_ATOMICS(W, ptrdiff_t, ptrdiff_t, eax, ecx)
+
+__TBB_MACHINE_DEFINE_ATOMICS(1, __int8, __int8, al, cl)
+__TBB_MACHINE_DEFINE_ATOMICS(2, __int16, __int16, ax, cx)
+__TBB_MACHINE_DEFINE_ATOMICS(4, ptrdiff_t, ptrdiff_t, eax, ecx)
+
+#undef __TBB_MACHINE_DEFINE_ATOMICS
 
 static inline __int32 __TBB_machine_lg( unsigned __int64 i ) {
     unsigned __int32 j;
@@ -151,39 +152,18 @@ static inline void __TBB_machine_pause (__int32 delay ) {
     return;
 }
 
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswpW(P,V,C)
-
-#define __TBB_FetchAndAdd1(P,V) __TBB_machine_fetchadd1(P,V)
-#define __TBB_FetchAndAdd2(P,V) __TBB_machine_fetchadd2(P,V)
-#define __TBB_FetchAndAdd4(P,V) __TBB_machine_fetchadd4(P,V)
-#define __TBB_FetchAndAdd8(P,V) __TBB_machine_fetchadd8(P,V)
-#define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchaddW(P,V)
-
-#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1(P,V)
-#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2(P,V)
-#define __TBB_FetchAndStore4(P,V) __TBB_machine_fetchstore4(P,V)
-#define __TBB_FetchAndStore8(P,V) __TBB_machine_fetchstore8(P,V)
-#define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstoreW(P,V)
-
-// Should define this: 
-#define __TBB_Store8(P,V) __TBB_machine_store8(P,V)
-#define __TBB_Load8(P) __TBB_machine_load8(P)
 #define __TBB_AtomicOR(P,V) __TBB_machine_OR(P,V)
 #define __TBB_AtomicAND(P,V) __TBB_machine_AND(P,V)
 
+#define __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE   1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
+
 // Definition of other functions
 extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void );
 #define __TBB_Yield()  SwitchToThread()
 #define __TBB_Pause(V) __TBB_machine_pause(V)
-#define __TBB_Log2(V)    __TBB_machine_lg(V)
-
-// Use generic definitions from tbb_machine.h
-#undef __TBB_TryLockByte
-#undef __TBB_LockByte
+#define __TBB_Log2(V)  __TBB_machine_lg(V)
 
 #if defined(_MSC_VER)&&_MSC_VER<1400
     static inline void* __TBB_machine_get_current_teb () {
index 9a45f5db6b751b9f821b59bbdb9e9edbd623d2a6..76e0b3418bfb416402ff8e17457a7c71b371ccc1 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_windows_intel64_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_windows_intel64_H
+
+#define __TBB_WORDSIZE 8
+#define __TBB_BIG_ENDIAN 0
+
 #include <intrin.h>
-#if !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedOr64)
-#pragma intrinsic(_InterlockedAnd64)
-#pragma intrinsic(_InterlockedCompareExchange)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#pragma intrinsic(_InterlockedExchangeAdd)
-#pragma intrinsic(_InterlockedExchangeAdd64)
-#pragma intrinsic(_InterlockedExchange)
-#pragma intrinsic(_InterlockedExchange64)
+
+#if !__INTEL_COMPILER
+    #pragma intrinsic(_InterlockedOr64)
+    #pragma intrinsic(_InterlockedAnd64)
+    #pragma intrinsic(_InterlockedCompareExchange)
+    #pragma intrinsic(_InterlockedCompareExchange64)
+    #pragma intrinsic(_InterlockedExchangeAdd)
+    #pragma intrinsic(_InterlockedExchangeAdd64)
+    #pragma intrinsic(_InterlockedExchange)
+    #pragma intrinsic(_InterlockedExchange64)
 #endif /* !defined(__INTEL_COMPILER) */
 
-#if defined(__INTEL_COMPILER)
-#define __TBB_release_consistency_helper() __asm { __asm nop }
-#define __TBB_full_memory_fence() __asm { __asm mfence }
+#if __INTEL_COMPILER
+    #define __TBB_compiler_fence()    __asm { __asm nop }
+    #define __TBB_full_memory_fence() __asm { __asm mfence }
 #elif _MSC_VER >= 1300
-extern "C" void _ReadWriteBarrier();
-#pragma intrinsic(_ReadWriteBarrier)
-#define __TBB_release_consistency_helper() _ReadWriteBarrier()
-#pragma intrinsic(_mm_mfence)
-#define __TBB_full_memory_fence() _mm_mfence()
+    extern "C" void _ReadWriteBarrier();
+    #pragma intrinsic(_ReadWriteBarrier)
+    #pragma intrinsic(_mm_mfence)
+    #define __TBB_compiler_fence()    _ReadWriteBarrier()
+    #define __TBB_full_memory_fence() _mm_mfence()
 #endif
 
-#define __TBB_WORDSIZE 8
-#define __TBB_BIG_ENDIAN 0
+#define __TBB_control_consistency_helper() __TBB_compiler_fence()
+#define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
+#define __TBB_release_consistency_helper() __TBB_compiler_fence()
 
 // ATTENTION: if you ever change argument types in machine-specific primitives,
 // please take care of atomic_word<> specializations in tbb/atomic.h
@@ -68,6 +75,29 @@ extern "C" {
     void __TBB_EXPORTED_FUNC __TBB_machine_pause (__int32 delay );
 }
 
+inline long __TBB_machine_cmpswp4 (volatile void *ptr, __int32 value, __int32 comparand ) {
+    return _InterlockedCompareExchange( (long*)ptr, value, comparand );
+}
+inline long __TBB_machine_fetchadd4 (volatile void *ptr, __int32 addend ) {
+    return _InterlockedExchangeAdd( (long*)ptr, addend );
+}
+inline long __TBB_machine_fetchstore4 (volatile void *ptr, __int32 value ) {
+    return _InterlockedExchange( (long*)ptr, value );
+}
+
+inline __int64 __TBB_machine_cmpswp8 (volatile void *ptr, __int64 value, __int64 comparand ) {
+    return _InterlockedCompareExchange64( (__int64*)ptr, value, comparand );
+}
+inline __int64 __TBB_machine_fetchadd8 (volatile void *ptr, __int64 addend ) {
+    return _InterlockedExchangeAdd64( (__int64*)ptr, addend );
+}
+inline __int64 __TBB_machine_fetchstore8 (volatile void *ptr, __int64 value ) {
+    return _InterlockedExchange64( (__int64*)ptr, value );
+}
+
+#define __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE   1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
 
 #if !__INTEL_COMPILER
 extern "C" unsigned char _BitScanReverse64( unsigned long* i, unsigned __int64 w );
@@ -97,39 +127,13 @@ inline void __TBB_machine_AND( volatile void *operand, intptr_t addend ) {
     _InterlockedAnd64((__int64*)operand, addend); 
 }
 
-#define __TBB_CompareAndSwap1(P,V,C) __TBB_machine_cmpswp1(P,V,C)
-#define __TBB_CompareAndSwap2(P,V,C) __TBB_machine_cmpswp2(P,V,C)
-#define __TBB_CompareAndSwap4(P,V,C) _InterlockedCompareExchange( (long*) P , V , C ) 
-#define __TBB_CompareAndSwap8(P,V,C) _InterlockedCompareExchange64( (__int64*) P , V , C )
-#define __TBB_CompareAndSwapW(P,V,C) _InterlockedCompareExchange64( (__int64*) P , V , C )
-
-#define __TBB_FetchAndAdd1(P,V) __TBB_machine_fetchadd1(P,V)
-#define __TBB_FetchAndAdd2(P,V) __TBB_machine_fetchadd2(P,V)
-#define __TBB_FetchAndAdd4(P,V) _InterlockedExchangeAdd((long*) P , V )
-#define __TBB_FetchAndAdd8(P,V) _InterlockedExchangeAdd64((__int64*) P , V )
-#define __TBB_FetchAndAddW(P,V) _InterlockedExchangeAdd64((__int64*) P , V )
-
-#define __TBB_FetchAndStore1(P,V) __TBB_machine_fetchstore1(P,V)
-#define __TBB_FetchAndStore2(P,V) __TBB_machine_fetchstore2(P,V)
-#define __TBB_FetchAndStore4(P,V) _InterlockedExchange((long*) P , V )
-#define __TBB_FetchAndStore8(P,V) _InterlockedExchange64((__int64*) P , V )
-#define __TBB_FetchAndStoreW(P,V) _InterlockedExchange64((__int64*) P , V ) 
-
-// Not used if wordsize == 8
-#undef __TBB_Store8
-#undef __TBB_Load8
-
 #define __TBB_AtomicOR(P,V) __TBB_machine_OR(P,V)
 #define __TBB_AtomicAND(P,V) __TBB_machine_AND(P,V)
 
 extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void );
 #define __TBB_Yield()  SwitchToThread()
 #define __TBB_Pause(V) __TBB_machine_pause(V)
-#define __TBB_Log2(V)    __TBB_machine_lg(V)
-
-// Use generic definitions from tbb_machine.h
-#undef __TBB_TryLockByte
-#undef __TBB_LockByte
+#define __TBB_Log2(V)  __TBB_machine_lg(V)
 
 // API to retrieve/update FPU control setting
 #define __TBB_CPU_CTL_ENV_PRESENT 1
index 7bde3082a045bcff000267a53c041532441e5284..e95dbbd402a2afd687ff59354a05b1dfe1bb0838 100644 (file)
     the GNU General Public License.
 */
 
-#ifndef __TBB_machine_H
+// TODO: revise by comparing with mac_ppc.h
+
+#if !defined(__TBB_machine_H) || defined(__TBB_machine_xbox360_ppc_H)
 #error Do not include this file directly; include tbb_machine.h instead
 #endif
 
+#define __TBB_machine_xbox360_ppc_H
+
 #define NONET
 #define NOD3D
 #include "xtl.h"    
@@ -38,6 +42,8 @@
 #if _MSC_VER >= 1300
 extern "C" void _MemoryBarrier();
 #pragma intrinsic(_MemoryBarrier)
+#define __TBB_control_consistency_helper() __isync()
+#define __TBB_acquire_consistency_helper() _MemoryBarrier()
 #define __TBB_release_consistency_helper() _MemoryBarrier()
 #endif
 
@@ -46,27 +52,30 @@ extern "C" void _MemoryBarrier();
 #define __TBB_WORDSIZE 4
 #define __TBB_BIG_ENDIAN 1
 
-//todo: define __TBB_DECL_FENCED_ATOMICS and define acquire/release primitives to maximize performance
-
-typedef __int64 int64_t;  //required for definition of Store8/Load8 in atomic.h
-typedef unsigned char uint8_t;  //same reason
+//todo: define __TBB_USE_FENCED_ATOMICS and define acquire/release primitives to maximize performance
 
-inline __int32 __TBB_machine_cmpswp4(volatile void *ptr, __int32 value, __int32 comparand )
-{                               
- __lwsync();
+inline __int32 __TBB_machine_cmpswp4(volatile void *ptr, __int32 value, __int32 comparand ) {                               
+ __sync();
  __int32 result = InterlockedCompareExchange((volatile LONG*)ptr, value, comparand);
- __lwsync();
+ __isync();
  return result;
 }
 
 inline __int64 __TBB_machine_cmpswp8(volatile void *ptr, __int64 value, __int64 comparand )
 {
- __lwsync();
+ __sync();
  __int64 result = InterlockedCompareExchange64((volatile LONG64*)ptr, value, comparand);
- __lwsync();
+ __isync();
  return result;
 }
 
+#define __TBB_USE_GENERIC_PART_WORD_CAS             1
+#define __TBB_USE_GENERIC_FETCH_ADD                 1
+#define __TBB_USE_GENERIC_FETCH_STORE               1
+#define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE    1
+#define __TBB_USE_GENERIC_RELAXED_LOAD_STORE        1
+#define __TBB_USE_GENERIC_DWORD_LOAD_STORE          1
+
 #pragma optimize( "", off )
 inline void __TBB_machine_pause (__int32 delay ) 
 {
@@ -74,10 +83,6 @@ inline void __TBB_machine_pause (__int32 delay )
 }
 #pragma optimize( "", on ) 
 
-
-#define __TBB_CompareAndSwap4(P,V,C) __TBB_machine_cmpswp4(P,V,C)
-#define __TBB_CompareAndSwap8(P,V,C) __TBB_machine_cmpswp8(P,V,C)
-#define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp4(P,V,C)
 #define __TBB_Yield()  Sleep(0)
 #define __TBB_Pause(V) __TBB_machine_pause(V)
 
index 31b9f98ccadfea84246aad3b408f372fad6ed036..409534648619802d49cc02d812a27261aa4e299b 100644 (file)
@@ -203,11 +203,6 @@ namespace strict_ppl {
 //! Parallel iteration over a range of integers with a step provided
 template <typename Index, typename Function>
 void parallel_for(Index first, Index last, Index step, const Function& f) {
-    tbb::task_group_context context;
-    parallel_for(first, last, step, f, context);
-}
-template <typename Index, typename Function>
-void parallel_for(Index first, Index last, Index step, const Function& f, tbb::task_group_context &context) {
     if (step <= 0 )
         internal::throw_exception(internal::eid_nonpositive_step); // throws std::invalid_argument
     else if (last > first) {
@@ -215,20 +210,35 @@ void parallel_for(Index first, Index last, Index step, const Function& f, tbb::t
         Index end = (last - first - Index(1)) / step + Index(1);
         tbb::blocked_range<Index> range(static_cast<Index>(0), end);
         internal::parallel_for_body<Function, Index> body(f, first, step);
-        tbb::parallel_for(range, body, tbb::auto_partitioner(), context);
+        tbb::parallel_for(range, body, tbb::auto_partitioner());
     }
 }
 //! Parallel iteration over a range of integers with a default step value
 template <typename Index, typename Function>
 void parallel_for(Index first, Index last, const Function& f) {
-    tbb::task_group_context context;
-    parallel_for(first, last, static_cast<Index>(1), f, context);
+    parallel_for(first, last, static_cast<Index>(1), f);
 }
+
+#if __TBB_TASK_GROUP_CONTEXT
+//! Parallel iteration over a range of integers with explicit step and task group context
+template <typename Index, typename Function>
+void parallel_for(Index first, Index last, Index step, const Function& f, tbb::task_group_context &context) {
+    if (step <= 0 )
+        internal::throw_exception(internal::eid_nonpositive_step); // throws std::invalid_argument
+    else if (last > first) {
+        // Above "else" avoids "potential divide by zero" warning on some platforms
+        Index end = (last - first - Index(1)) / step + Index(1);
+        tbb::blocked_range<Index> range(static_cast<Index>(0), end);
+        internal::parallel_for_body<Function, Index> body(f, first, step);
+        tbb::parallel_for(range, body, tbb::auto_partitioner(), context);
+    }
+}
+//! Parallel iteration over a range of integers with a default step value and explicit task group context
 template <typename Index, typename Function>
 void parallel_for(Index first, Index last, const Function& f, tbb::task_group_context &context) {
     parallel_for(first, last, static_cast<Index>(1), f, context);
 }
-
+#endif /* __TBB_TASK_GROUP_CONTEXT */
 //@}
 
 } // namespace strict_ppl
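
Reviewer note: with this split, the plain integer-range overloads no longer require task_group_context support. A minimal usage sketch of the overload above (the vector, function name, and lambda are illustrative; assumes a compiler with lambda support):

    #include "tbb/parallel_for.h"
    #include <vector>

    // Illustrative only: scale every element using the
    // parallel_for(first, last, step, f) overload shown above.
    void scale_all( std::vector<float>& data ) {
        tbb::parallel_for( size_t(0), data.size(), size_t(1),
                           [&data]( size_t i ) { data[i] *= 2.0f; } );
    }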
index e59ee76bf807c61139301ef085fef9e9de76e317..59200b1ebb1d33d8395abb817f3f7428a26b8aed 100644 (file)
@@ -43,7 +43,7 @@ namespace internal {
         parallel_for_each_body(const Function &_func) : my_func(_func) {}
         parallel_for_each_body(const parallel_for_each_body<Function, Iterator> &_caller) : my_func(_caller.my_func) {}
 
-        void operator() ( typename std::iterator_traits<Iterator>::value_type& value ) const {
+        void operator() ( typename std::iterator_traits<Iterator>::reference value ) const {
             my_func(value);
         }
     };
@@ -55,18 +55,18 @@ namespace internal {
 //@{
 //! Calls function f for all items from [first, last) interval using user-supplied context
 /** @ingroup algorithms */
+#if __TBB_TASK_GROUP_CONTEXT
 template<typename InputIterator, typename Function>
 void parallel_for_each(InputIterator first, InputIterator last, const Function& f, task_group_context &context) {
     internal::parallel_for_each_body<Function, InputIterator> body(f);
-
     tbb::parallel_do (first, last, body, context);
 }
+#endif /* __TBB_TASK_GROUP_CONTEXT */
 
 //! Uses default context
 template<typename InputIterator, typename Function>
 void parallel_for_each(InputIterator first, InputIterator last, const Function& f) {
     internal::parallel_for_each_body<Function, InputIterator> body(f);
-
     tbb::parallel_do (first, last, body);
 }
 
index 3303c41a55d25b41dab82748cb44fa90b31edc1c..6cc38e26534ff1a4ef4087e174887b290faeecb4 100644 (file)
 
 namespace tbb {
 
+#if !__TBB_TASK_GROUP_CONTEXT
+    /** Dummy to avoid cluttering the bulk of the header with an enormous amount of ifdefs. **/
+    struct task_group_context {};
+#endif /* __TBB_TASK_GROUP_CONTEXT */
+
 //! @cond INTERNAL
 namespace internal {
     // Simple task object, executing user method
@@ -137,8 +142,15 @@ namespace internal {
     // The class destroys the root both when an exception occurs and in the normal case
     class parallel_invoke_cleaner: internal::no_copy { 
     public:
-        parallel_invoke_cleaner(int number_of_children, tbb::task_group_context& context) : root(*new(task::allocate_root(context)) internal::parallel_invoke_helper(number_of_children))
+#if __TBB_TASK_GROUP_CONTEXT
+        parallel_invoke_cleaner(int number_of_children, tbb::task_group_context& context)
+            : root(*new(task::allocate_root(context)) internal::parallel_invoke_helper(number_of_children))
+#else
+        parallel_invoke_cleaner(int number_of_children, tbb::task_group_context&)
+            : root(*new(task::allocate_root()) internal::parallel_invoke_helper(number_of_children))
+#endif /* !__TBB_TASK_GROUP_CONTEXT */
         {}
+
         ~parallel_invoke_cleaner(){
             root.destroy(root);
         }
index bef9d6cdffa024281d7fb8b40e70791909079007..935fcf8b409940039bc3f509868d23d0a27c673f 100644 (file)
@@ -148,7 +148,7 @@ public:
         } else {
             finish_type& c = *new( allocate_continuation()) finish_type(my_context);
             recycle_as_child_of(c);
-            c.set_ref_count(2);    
+            c.set_ref_count(2);
             bool delay = my_partition.decide_whether_to_delay();
             start_reduce& b = *new( c.allocate_child() ) start_reduce(*this,split());
             my_partition.spawn_or_delay(delay,b);
@@ -156,6 +156,87 @@ public:
         }
     }
 
+#if TBB_PREVIEW_DETERMINISTIC_REDUCE
+    //! Task type used to combine the partial results of parallel_deterministic_reduce.
+    /** @ingroup algorithms */
+    template<typename Body>
+    class finish_deterministic_reduce: public task {
+        Body &my_left_body;
+        Body my_right_body;
+
+        finish_deterministic_reduce( Body &body ) :
+            my_left_body( body ),
+            my_right_body( body, split() )
+        {
+        }
+        task* execute() {
+            my_left_body.join( my_right_body );
+            return NULL;
+        }
+        template<typename Range,typename Body_>
+        friend class start_deterministic_reduce;
+    };
+
+    //! Task type used to split the work of parallel_deterministic_reduce.
+    /** @ingroup algorithms */
+    template<typename Range, typename Body>
+    class start_deterministic_reduce: public task {
+        typedef finish_deterministic_reduce<Body> finish_type;
+        Body &my_body;
+        Range my_range;
+        /*override*/ task* execute();
+
+        //! Constructor used for root task
+        start_deterministic_reduce( const Range& range, Body& body ) :
+            my_body( body ),
+            my_range( range )
+        {
+        }
+        //! Splitting constructor used to generate children.
+        /** parent_ becomes left child.  Newly constructed object is right child. */
+        start_deterministic_reduce( start_deterministic_reduce& parent_, finish_type& c ) :
+            my_body( c.my_right_body ),
+            my_range( parent_.my_range, split() )
+        {
+        }
+
+public:
+        static void run( const Range& range, Body& body ) {
+            if( !range.empty() ) {
+#if !__TBB_TASK_GROUP_CONTEXT || TBB_JOIN_OUTER_TASK_GROUP
+                task::spawn_root_and_wait( *new(task::allocate_root()) start_deterministic_reduce(range,body) );
+#else
+                // A bound context prevents exceptions thrown from the body from affecting nesting or
+                // sibling algorithms, and allows users to handle exceptions safely by wrapping the call
+                // in a try-block.
+                task_group_context context;
+                task::spawn_root_and_wait( *new(task::allocate_root(context)) start_deterministic_reduce(range,body) );
+#endif /* __TBB_TASK_GROUP_CONTEXT && !TBB_JOIN_OUTER_TASK_GROUP */
+            }
+        }
+#if __TBB_TASK_GROUP_CONTEXT
+        static void run( const Range& range, Body& body, task_group_context& context ) {
+            if( !range.empty() ) 
+                task::spawn_root_and_wait( *new(task::allocate_root(context)) start_deterministic_reduce(range,body) );
+        }
+#endif /* __TBB_TASK_GROUP_CONTEXT */
+    };
+
+    template<typename Range, typename Body>
+    task* start_deterministic_reduce<Range,Body>::execute() {
+        if( !my_range.is_divisible() ) {
+            my_body( my_range );
+            return NULL;
+        } else {
+            finish_type& c = *new( allocate_continuation() ) finish_type( my_body );
+            recycle_as_child_of(c);
+            c.set_ref_count(2);
+            start_deterministic_reduce& b = *new( c.allocate_child() ) start_deterministic_reduce( *this, c );
+            task::spawn(b);
+            return this;
+        }
+    }
+#endif /* TBB_PREVIEW_DETERMINISTIC_REDUCE */
+
     //! Auxiliary class for parallel_reduce; for internal use only.
     /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
         using given \ref parallel_reduce_lambda_req "anonymous function objects".
@@ -357,6 +438,50 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
     return body.result();
 }
 #endif /* __TBB_TASK_GROUP_CONTEXT */
+
+#if TBB_PREVIEW_DETERMINISTIC_REDUCE
+//! Parallel iteration with deterministic reduction and default partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body ) {
+    internal::start_deterministic_reduce<Range,Body>::run( range, body );
+}
+
+#if __TBB_TASK_GROUP_CONTEXT
+//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Body>
+void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
+    internal::start_deterministic_reduce<Range,Body>::run( range, body, context );
+}
+#endif /* __TBB_TASK_GROUP_CONTEXT */
+
+/** parallel_reduce overloads that work with anonymous function objects
+    (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
+
+//! Parallel iteration with deterministic reduction and default partitioner.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
+    internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+    internal::start_deterministic_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction> >
+                          ::run(range, body);
+    return body.result();
+}
+
+#if __TBB_TASK_GROUP_CONTEXT
+//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
+/** @ingroup algorithms **/
+template<typename Range, typename Value, typename RealBody, typename Reduction>
+Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
+                       task_group_context& context ) {
+    internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
+    internal::start_deterministic_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction> >
+                          ::run( range, body, context );
+    return body.result();
+}
+#endif /* __TBB_TASK_GROUP_CONTEXT */
+#endif /* TBB_PREVIEW_DETERMINISTIC_REDUCE */
 //@}
 
 } // namespace tbb
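
Reviewer note: the new TBB_PREVIEW_DETERMINISTIC_REDUCE entry points mirror parallel_reduce but always split the range the same way, so the reduction order is repeatable across runs. A hedged sketch of the functional form added above (the summation, vector, and function name are illustrative):

    #define TBB_PREVIEW_DETERMINISTIC_REDUCE 1
    #include "tbb/parallel_reduce.h"
    #include "tbb/blocked_range.h"
    #include <vector>

    // Illustrative only: run-to-run repeatable floating-point sum.
    float deterministic_sum( const std::vector<float>& v ) {
        return tbb::parallel_deterministic_reduce(
            tbb::blocked_range<size_t>( 0, v.size() ),
            0.0f,
            [&v]( const tbb::blocked_range<size_t>& r, float acc ) -> float {
                for( size_t i = r.begin(); i != r.end(); ++i )
                    acc += v[i];
                return acc;
            },
            []( float x, float y ) { return x + y; } );
    }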
index 00500460aa0035c5a2078f03a7a559f702ef256d..cdfbb6102b4edddeb0c3667a8b37fdc5d83066a5 100644 (file)
@@ -111,6 +111,7 @@ partition:
     }
 };
 
+#if __TBB_TASK_GROUP_CONTEXT
 //! Body class used to test if elements in a range are presorted
 /** @ingroup algorithms */
 template<typename RandomAccessIterator, typename Compare>
@@ -137,6 +138,7 @@ public:
     }
 
 };
+#endif /* __TBB_TASK_GROUP_CONTEXT */
 
 //! Body class used to sort elements in a range that is smaller than the grainsize.
 /** @ingroup algorithms */
@@ -152,6 +154,7 @@ struct quick_sort_body {
 /** @ingroup algorithms */
 template<typename RandomAccessIterator, typename Compare>
 void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) {
+#if __TBB_TASK_GROUP_CONTEXT
     task_group_context my_context;
     const int serial_cutoff = 9;
 
@@ -170,6 +173,7 @@ void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end,
 
     if (my_context.is_group_execution_cancelled())
 do_parallel_quick_sort:
+#endif /* __TBB_TASK_GROUP_CONTEXT */
         parallel_for( quick_sort_range<RandomAccessIterator,Compare>(begin, end-begin, comp ), 
                       quick_sort_body<RandomAccessIterator,Compare>(),
                       auto_partitioner() );
index fe0d5d9a2b4e9316fcaec350f49c78894859cfef..a7e224b2a4c336906c9d9be18b009b517f7cd945 100644 (file)
@@ -104,7 +104,7 @@ public:
         scoped_lock *next;
 
         //! The local spin-wait variable
-        /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of 
+        /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of
             zero-initialization.  Defining it as an entire word instead of
             a byte seems to help performance slightly. */
         uintptr_t going;
@@ -116,8 +116,6 @@ public:
     static const bool is_rw_mutex = false;
     static const bool is_recursive_mutex = false;
     static const bool is_fair_mutex = true;
-
-    friend class scoped_lock;
 private:
     //! The last competitor requesting the lock
     atomic<scoped_lock*> q_tail;
index 3c76332e649a4f9cd02654a51e89185184e9cd7f..644bdfdab1531ba5c447def0740a5718d8e71a06 100644 (file)
@@ -78,11 +78,11 @@ public:
     class scoped_lock: internal::no_copy {
         //! Initialize fields
         void initialize() {
-            mutex = NULL;
+            my_mutex = NULL;
 #if TBB_USE_ASSERT
-            state = 0xFF; // Set to invalid state
-            internal::poison_pointer(next);
-            internal::poison_pointer(prev);
+            my_state = 0xFF; // Set to invalid state
+            internal::poison_pointer(my_next);
+            internal::poison_pointer(my_prev);
 #endif /* TBB_USE_ASSERT */
         }
     public:
@@ -98,7 +98,7 @@ public:
 
         //! Release lock (if lock is held).
         ~scoped_lock() {
-            if( mutex ) release();
+            if( my_mutex ) release();
         }
 
         //! Acquire lock on given mutex.
@@ -119,22 +119,22 @@ public:
 
     private:
         //! The pointer to the current mutex to work
-        queuing_rw_mutex* mutex;
+        queuing_rw_mutex* my_mutex;
 
         //! The pointer to the previous and next competitors for a mutex
-        scoped_lock * prev, * next;
+        scoped_lock *__TBB_atomic my_prev, *__TBB_atomic my_next;
 
         typedef unsigned char state_t;
 
         //! State of the request: reader, writer, active reader, other service states
-        atomic<state_t> state;
+        atomic<state_t> my_state;
 
         //! The local spin-wait variable
         /** Corresponds to "spin" in the pseudocode but inverted for the sake of zero-initialization */
-        unsigned char going;
+        unsigned char __TBB_atomic my_going;
 
         //! A tiny internal lock
-        unsigned char internal_lock;
+        unsigned char my_internal_lock;
 
         //! Acquire the internal lock
         void acquire_internal_lock();
index a5cace99d8b515ec46694d2491bf8e6ab9101a9b..877234b0a3177232b39bcc90aea9437f6b853549 100644 (file)
@@ -93,7 +93,7 @@ namespace interface5 {
 
     //! The scoped lock pattern for write locks
     /** Scoped locks help avoid the common problem of forgetting to release the lock.
-        This type is also serves as the node for queuing locks. */
+        This type also serves as the node for queuing locks. */
     class scoped_lock : tbb::internal::no_copy {
     public:
         friend class reader_writer_lock;
diff --git a/tbb/include/tbb/runtime_loader.h b/tbb/include/tbb/runtime_loader.h
new file mode 100644 (file)
index 0000000..e0d4526
--- /dev/null
@@ -0,0 +1,188 @@
+/*
+    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TBB_runtime_loader_H
+#define __TBB_runtime_loader_H
+
+#if ! TBB_PREVIEW_RUNTIME_LOADER
+    #error Set TBB_PREVIEW_RUNTIME_LOADER to include runtime_loader.h
+#endif
+
+#include "tbb/tbb_stddef.h"
+#include <climits>
+
+#if _MSC_VER
+    #if ! __TBB_NO_IMPLICIT_LINKAGE
+        #ifdef _DEBUG
+            #pragma comment( linker, "/nodefaultlib:tbb_debug.lib" )
+            #pragma comment( linker, "/defaultlib:tbbproxy_debug.lib" )
+        #else
+            #pragma comment( linker, "/nodefaultlib:tbb.lib" )
+            #pragma comment( linker, "/defaultlib:tbbproxy.lib" )
+        #endif
+    #endif
+#endif
+
+namespace tbb {
+
+namespace interface6 {
+
+//! Load TBB at runtime.
+/*!
+
+\b Usage:
+
+In source code:
+
+\code
+#include "tbb/runtime_loader.h"
+
+char const * path[] = { "<install dir>/lib/ia32", NULL };
+tbb::runtime_loader loader( path );
+
+// Now use TBB.
+\endcode
+
+Link with \c tbbproxy.lib (or \c libtbbproxy.a) instead of \c tbb.lib (\c libtbb.dylib,
+\c libtbb.so).
+
+TBB library will be loaded at runtime from \c <install dir>/lib/ia32 directory.
+
+\b Attention:
+
+All \c runtime_loader objects (in the same module, i.e. exe or dll) share some global state.
+The most noticeable piece of global state is the loaded TBB library.
+There are some implications:
+
+    -   Only one TBB library can be loaded per module.
+
+    -   If one object has already loaded the TBB library, another object will not load TBB.
+        If the loaded TBB library is suitable for the second object, both will use TBB
+        cooperatively; otherwise the second object will report an error.
+
+    -   \c runtime_loader objects will not work (correctly) in parallel due to the absence of
+        synchronization.
+
+*/
+
+class runtime_loader : tbb::internal::no_copy {
+
+    public:
+
+        //! Error mode constants.
+        enum error_mode {
+            em_status,     //!< Save status of operation and continue.
+            em_throw,      //!< Throw an exception of tbb::runtime_loader::error_code type.
+            em_abort       //!< Print message to \c stderr and call \c abort().
+        }; // error_mode
+
+        //! Error codes.
+        enum error_code {
+            ec_ok,         //!< No errors.
+            ec_bad_call,   //!< Invalid function call (e. g. load() called when TBB is already loaded).
+            ec_bad_arg,    //!< Invalid argument passed.
+            ec_bad_lib,    //!< Invalid library found (e. g. \c TBB_runtime_version symbol not found).
+            ec_bad_ver,    //!< TBB found but version is not suitable.
+            ec_no_lib      //!< No suitable TBB library found.
+        }; // error_code
+
+        //! Initialize object but do not load TBB.
+        runtime_loader( error_mode mode = em_abort );
+
+        //! Initialize object and load TBB.
+        /*!
+            See load() for details.
+
+            If the error mode is \c em_status, call status() to check whether TBB was loaded or not.
+        */
+        runtime_loader(
+            char const * path[],                           //!< List of directories to search TBB in.
+            int          min_ver = TBB_INTERFACE_VERSION,  //!< Minimal suitable version of TBB.
+            int          max_ver = INT_MAX,                //!< Maximal suitable version of TBB.
+            error_mode   mode    = em_abort                //!< Error mode for this object.
+        );
+
+        //! Destroy object.
+        ~runtime_loader();
+
+        //! Load TBB.
+        /*!
+            The method searches the directories specified in the \c path[] array for the TBB library.
+            When the library is found, it is loaded and its version is checked. If the version is
+            not suitable, the library is unloaded, and the search continues.
+
+            \b Note:
+
+            For security reasons, avoid using relative directory names. For example, never load
+            TBB from current (\c "."), parent (\c "..") or any other relative directory (like
+            \c "lib" ). Use only absolute directory names (e. g. "/usr/local/lib").
+
+            For the same security reasons, avoid using system default directories (\c "") on
+            Windows. (See http://www.microsoft.com/technet/security/advisory/2269637.mspx for
+            details.)
+
+            Neglecting these rules may cause your program to execute third-party malicious code.
+
+            \b Errors:
+                -   \c ec_bad_call - TBB already loaded by this object.
+                -   \c ec_bad_arg - \p min_ver and/or \p max_ver negative or zero,
+                    or \p min_ver > \p max_ver.
+                -   \c ec_bad_ver - TBB of unsuitable version already loaded by another object.
+                -   \c ec_no_lib - No suitable library found.
+        */
+        error_code
+        load(
+            char const * path[],                           //!< List of directories to search TBB in.
+            int          min_ver = TBB_INTERFACE_VERSION,  //!< Minimal suitable version of TBB.
+            int          max_ver = INT_MAX                 //!< Maximal suitable version of TBB.
+
+        );
+
+
+        //! Report status.
+        /*!
+            If the error mode is \c em_status, the function returns the status of the last operation.
+        */
+        error_code status();
+
+    private:
+
+        error_mode const my_mode;
+        error_code       my_status;
+        bool             my_loaded;
+
+}; // class runtime_loader
+
+} // namespace interface6
+
+using interface6::runtime_loader;
+
+} // namespace tbb
+
+#endif /* __TBB_runtime_loader_H */
+
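
Reviewer note: besides the em_abort default shown in the class documentation, em_status defers error handling to the caller via the load()/status() return codes. A hedged usage sketch (the search path is a placeholder, not a recommended location):

    #define TBB_PREVIEW_RUNTIME_LOADER 1
    #include "tbb/runtime_loader.h"
    #include <cstdio>

    int main() {
        char const * path[] = { "/usr/local/lib", NULL };   // placeholder directory
        tbb::runtime_loader loader( tbb::runtime_loader::em_status );
        tbb::runtime_loader::error_code err = loader.load( path );
        if( err != tbb::runtime_loader::ec_ok ) {
            std::fprintf( stderr, "TBB could not be loaded (error code %d)\n", (int)err );
            return 1;
        }
        // Use TBB as usual from here on.
        return 0;
    }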
index 140c6e9e2b9a573b2c3a623d4b6611d4f6e691e4..0d200a1597ab2022151e940e71b5778a701b2c6a 100644 (file)
@@ -46,7 +46,7 @@ namespace tbb {
     @ingroup synchronization */
 class spin_mutex {
     //! 0 if lock is released, 1 if lock is acquired.
-    __TBB_Byte flag;
+    __TBB_atomic_flag flag;
 
 public:
     //! Construct unacquired lock.
@@ -64,7 +64,7 @@ public:
         spin_mutex* my_mutex; 
 
         //! Value to store into spin_mutex::flag to unlock the mutex.
-        uintptr_t my_unlock_value;
+        __TBB_Flag my_unlock_value;
 
         //! Like acquire, but with ITT instrumentation.
         void __TBB_EXPORTED_METHOD internal_acquire( spin_mutex& m );
@@ -122,7 +122,7 @@ public:
 #if TBB_USE_THREADING_TOOLS||TBB_USE_ASSERT
             internal_release();
 #else
-            __TBB_UnlockByte(my_mutex->flag, static_cast<__TBB_Byte>(my_unlock_value));
+            __TBB_UnlockByte(my_mutex->flag, my_unlock_value);
             my_mutex = NULL;
 #endif /* TBB_USE_THREADING_TOOLS||TBB_USE_ASSERT */
         }
@@ -133,7 +133,7 @@ public:
 #if TBB_USE_THREADING_TOOLS||TBB_USE_ASSERT
                 internal_release();
 #else
-                __TBB_UnlockByte(my_mutex->flag, static_cast<__TBB_Byte>(my_unlock_value));
+                __TBB_UnlockByte(my_mutex->flag, my_unlock_value);
 #endif /* TBB_USE_THREADING_TOOLS||TBB_USE_ASSERT */
             }
         }
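
The spin_mutex hunks above only retype the lock flag and the stored unlock value; typical use through scoped_lock is unaffected. A small illustrative sketch:

    #include "tbb/spin_mutex.h"

    static tbb::spin_mutex counter_mutex;
    static long counter = 0;

    void increment_counter() {
        // The constructor spins until the lock is acquired; the destructor
        // releases it (via __TBB_UnlockByte on the non-instrumented path).
        tbb::spin_mutex::scoped_lock lock( counter_mutex );
        ++counter;
    }
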
index 7b8dab85141b77a5df9202c460f01517f79cc816..a887431e556be1466d1efd017fe6eada32ca5c23 100644 (file)
@@ -44,7 +44,7 @@ class task_list;
 class task_group_context;
 #endif /* __TBB_TASK_GROUP_CONTEXT */
 
-// MSVC does not allow taking the address of a member that was defined 
+// MSVC does not allow taking the address of a member that was defined
 // privately in task_base and made public in class task via a using declaration.
 #if _MSC_VER || (__GNUC__==3 && __GNUC_MINOR__<3)
 #define __TBB_TASK_BASE_ACCESS public
@@ -71,8 +71,8 @@ namespace interface5 {
         //! Base class for methods that became static in TBB 3.0.
         /** TBB's evolution caused the "this" argument for several methods to become obsolete.
             However, for backwards binary compatibility, the new methods need distinct names,
-            otherwise the One Definition Rule would be broken.  Hence the new methods are 
-            defined in this private base class, and then exposed in class task via 
+            otherwise the One Definition Rule would be broken.  Hence the new methods are
+            defined in this private base class, and then exposed in class task via
             using declarations. */
         class task_base: tbb::internal::no_copy {
         __TBB_TASK_BASE_ACCESS:
@@ -80,7 +80,7 @@ namespace interface5 {
 
             //! Schedule task for execution when a worker becomes available.
             static void spawn( task& t );
+
             //! Spawn multiple tasks and clear list.
             static void spawn( task_list& list );
 
@@ -97,7 +97,7 @@ namespace interface5 {
                 sometimes a task needs to be explicitly deallocated, such as
                 when a root task is used as the parent in spawn_and_wait_for_all. */
             static void __TBB_EXPORTED_FUNC destroy( task& victim );
-        }; 
+        };
     } // internal
 } // interface5
 
@@ -168,8 +168,14 @@ namespace internal {
     //! Memory prefix to a task object.
     /** This class is internal to the library.
         Do not reference it directly, except within the library itself.
-        Fields are ordered in way that preserves backwards compatibility and yields 
+        Fields are ordered in way that preserves backwards compatibility and yields
         good packing on typical 32-bit and 64-bit platforms.
+
+        If the task prefix size exceeds 32 bytes on IA32 or 64 bytes on Intel64,
+        consider setting task_alignment and task_prefix_reservation_size dynamically,
+        based on the maximal operand size supported by the current CPU.
+
         @ingroup task_scheduling */
     class task_prefix {
     private:
@@ -184,34 +190,34 @@ namespace internal {
 
 #if __TBB_TASK_GROUP_CONTEXT
         //! Shared context that is used to communicate asynchronous state changes
-        /** Currently it is used to broadcast cancellation requests generated both 
+        /** Currently it is used to broadcast cancellation requests generated both
             by users and as the result of unhandled exceptions in the task::execute()
             methods. */
         task_group_context  *context;
 #endif /* __TBB_TASK_GROUP_CONTEXT */
-        
+
         //! The scheduler that allocated the task, or NULL if the task is big.
         /** Small tasks are pooled by the scheduler that allocated the task.
             If a scheduler needs to free a small task allocated by another scheduler,
             it returns the task to that other scheduler.  This policy avoids
-            memory space blowup issues for memory allocators that allocate from 
+            memory space blowup issues for memory allocators that allocate from
             thread-specific pools. */
         scheduler* origin;
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
         union {
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
         //! Obsolete. The scheduler that owns the task.
-        /** Retained only for the sake of backward binary compatibility. 
+        /** Retained only for the sake of backward binary compatibility.
             Still used by inline methods in the task.h header. **/
         scheduler* owner;
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
         //! Pointer to the next offloaded lower priority task.
         /** Used to maintain a list of offloaded tasks inside the scheduler. **/
         task* next_offloaded;
         };
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
         //! The task whose reference count includes me.
         /** In the "blocking style" of programming, this field points to the parent task.
@@ -224,7 +230,7 @@ namespace internal {
             the difference of the number of allocated children minus the
             number of children that have completed.
             In the "blocking style" of programming, this field is one more than the difference. */
-        reference_count ref_count;
+        __TBB_atomic reference_count ref_count;
 
         //! Obsolete. Used to be scheduling depth before TBB 2.2
         /** Retained only for the sake of backward binary compatibility.
@@ -257,7 +263,7 @@ namespace internal {
 
 #if __TBB_TASK_GROUP_CONTEXT
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
 namespace internal {
     static const int priority_stride_v4 = INT_MAX / 4;
 }
@@ -268,7 +274,7 @@ enum priority_t {
     priority_high = priority_normal + internal::priority_stride_v4
 };
 
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
 #if TBB_USE_CAPTURED_EXCEPTION
     class tbb_exception;
@@ -280,25 +286,25 @@ enum priority_t {
 
 class task_scheduler_init;
 
-//! Used to form groups of tasks 
-/** @ingroup task_scheduling 
-    The context services explicit cancellation requests from user code, and unhandled 
-    exceptions intercepted during tasks execution. Intercepting an exception results 
-    in generating internal cancellation requests (which is processed in exactly the 
-    same way as external ones). 
+//! Used to form groups of tasks
+/** @ingroup task_scheduling
+    The context services explicit cancellation requests from user code, and unhandled
+    exceptions intercepted during tasks execution. Intercepting an exception results
+    in generating internal cancellation requests (which is processed in exactly the
+    same way as external ones).
 
-    The context is associated with one or more root tasks and defines the cancellation 
-    group that includes all the descendants of the corresponding root task(s). Association 
+    The context is associated with one or more root tasks and defines the cancellation
+    group that includes all the descendants of the corresponding root task(s). Association
     is established when a context object is passed as an argument to the task::allocate_root()
     method. See task_group_context::task_group_context for more details.
-    
+
     The context can be bound to another one, and other contexts can be bound to it,
     forming a tree-like structure: parent -> this -> children. Arrows here designate
     cancellation propagation direction. If a task in a cancellation group is canceled
     all the other tasks in this group and groups bound to it (as children) get canceled too.
 
-    IMPLEMENTATION NOTE: 
-    When adding new members to task_group_context or changing types of existing ones, 
+    IMPLEMENTATION NOTE:
+    When adding new members to task_group_context or changing types of existing ones,
     update the size of both padding buffers (_leading_padding and _trailing_padding)
     appropriately. See also VERSIONING NOTE at the constructor definition below. **/
 class task_group_context : internal::no_copy {
@@ -349,7 +355,7 @@ private:
     task_group_context *my_parent;
 
     //! Used to form the thread specific list of contexts without additional memory allocation.
-    /** A context is included into the list of the current thread when its binding to 
+    /** A context is included into the list of the current thread when its binding to
         its parent happens. Any context can be present in the list of one thread only. **/
     internal::context_list_node_t my_node;
 
@@ -358,15 +364,15 @@ private:
 
     //! Leading padding protecting accesses to frequently used members from false sharing.
     /** Read accesses to the field my_cancellation_requested are on the hot path inside
-        the scheduler. This padding ensures that this field never shares the same cache 
+        the scheduler. This padding ensures that this field never shares the same cache
         line with a local variable that is frequently written to. **/
     char _leading_padding[internal::NFS_MaxLineSize
                           - 2 * sizeof(uintptr_t)- sizeof(void*) - sizeof(internal::context_list_node_t)
                           - sizeof(__itt_caller)];
-    
+
     //! Specifies whether cancellation was request for this task group.
     uintptr_t my_cancellation_requested;
-    
+
     //! Version for run-time checks and behavioral traits of the context.
     /** Version occupies low 16 bits, and traits (zero or more ORed enumerators
         from the traits_type enumerations) take the next 16 bits.
@@ -382,48 +388,48 @@ private:
     //! Internal state (combination of state flags).
     uintptr_t my_state;
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
     //! Priority level of the task group (in normalized representation)
     intptr_t my_priority;
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
     //! Trailing padding protecting accesses to frequently used members from false sharing
     /** \sa _leading_padding **/
     char _trailing_padding[internal::NFS_MaxLineSize - 2 * sizeof(uintptr_t) - 2 * sizeof(void*)
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
                             - sizeof(intptr_t)
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
                           ];
 
 public:
     //! Default & binding constructor.
-    /** By default a bound context is created. That is this context will be bound 
-        (as child) to the context of the task calling task::allocate_root(this_context) 
+    /** By default a bound context is created. That is this context will be bound
+        (as child) to the context of the task calling task::allocate_root(this_context)
         method. Cancellation requests passed to the parent context are propagated
         to all the contexts bound to it. Similarly priority change is propagated
         from the parent context to its children.
 
         If task_group_context::isolated is used as the argument, then the tasks associated
         with this context will never be affected by events in any other context.
-        
+
         Creating isolated contexts involve much less overhead, but they have limited
         utility. Normally when an exception occurs in an algorithm that has nested
-        ones running, it is desirably to have all the nested algorithms canceled 
+        ones running, it is desirable to have all the nested algorithms canceled
         as well. Such a behavior requires nested algorithms to use bound contexts.
-        
+
         There is one good place where using isolated algorithms is beneficial. It is
         a master thread. That is if a particular algorithm is invoked directly from
-        the master thread (not from a TBB task), supplying it with explicitly 
+        the master thread (not from a TBB task), supplying it with explicitly
         created isolated context will result in a faster algorithm startup.
-        
-        VERSIONING NOTE: 
-        Implementation(s) of task_group_context constructor(s) cannot be made 
-        entirely out-of-line because the run-time version must be set by the user 
-        code. This will become critically important for binary compatibility, if 
+
+        VERSIONING NOTE:
+        Implementation(s) of task_group_context constructor(s) cannot be made
+        entirely out-of-line because the run-time version must be set by the user
+        code. This will become critically important for binary compatibility, if
         we ever have to change the size of the context object.
 
-        Boosting the runtime version will also be necessary if new data fields are 
-        introduced in the currently unused padding areas and these fields are updated 
+        Boosting the runtime version will also be necessary if new data fields are
+        introduced in the currently unused padding areas and these fields are updated
         by inline methods. **/
     task_group_context ( kind_type relation_with_parent = bound,
                          uintptr_t traits = default_traits )
@@ -436,22 +442,22 @@ public:
     __TBB_EXPORTED_METHOD ~task_group_context ();
 
     //! Forcefully reinitializes the context after the task tree it was associated with is completed.
-    /** Because the method assumes that all the tasks that used to be associated with 
-        this context have already finished, calling it while the context is still 
+    /** Because the method assumes that all the tasks that used to be associated with
+        this context have already finished, calling it while the context is still
         in use somewhere in the task hierarchy leads to undefined behavior.
-        
+
         IMPORTANT: This method is not thread safe!
 
-        The method does not change the context's parent if it is set. **/ 
+        The method does not change the context's parent if it is set. **/
     void __TBB_EXPORTED_METHOD reset ();
 
     //! Initiates cancellation of all tasks in this cancellation group and its subordinate groups.
-    /** \return false if cancellation has already been requested, true otherwise. 
+    /** \return false if cancellation has already been requested, true otherwise.
 
-        Note that canceling never fails. When false is returned, it just means that 
+        Note that canceling never fails. When false is returned, it just means that
         another thread (or this one) has already sent cancellation request to this
         context or to one of its ancestors (if this context is bound). It is guaranteed
-        that when this method is concurrently called on the same not yet cancelled 
+        that when this method is concurrently called on the same not yet cancelled
         context, true will be returned by one and only one invocation. **/
     bool __TBB_EXPORTED_METHOD cancel_group_execution ();
 
@@ -459,24 +465,24 @@ public:
     bool __TBB_EXPORTED_METHOD is_group_execution_cancelled () const;
 
     //! Records the pending exception, and cancels the task group.
-    /** May be called only from inside a catch-block. If the context is already 
-        canceled, does nothing. 
-        The method brings the task group associated with this context exactly into 
-        the state it would be in, if one of its tasks threw the currently pending 
-        exception during its execution. In other words, it emulates the actions 
+    /** May be called only from inside a catch-block. If the context is already
+        canceled, does nothing.
+        The method brings the task group associated with this context exactly into
+        the state it would be in, if one of its tasks threw the currently pending
+        exception during its execution. In other words, it emulates the actions
         of the scheduler's dispatch loop exception handler. **/
     void __TBB_EXPORTED_METHOD register_pending_exception ();
 
-#if TBB_PREVIEW_TASK_PRIORITY
-    //! Changes priority of the task grop 
+#if __TBB_TASK_PRIORITY
+    //! Changes priority of the task group
     void set_priority ( priority_t );
 
     //! Retrieves current priority of the current task group
     priority_t priority () const;
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
 protected:
-    //! Out-of-line part of the constructor. 
+    //! Out-of-line part of the constructor.
     /** Singled out to ensure backward binary compatibility of the future versions. **/
     void __TBB_EXPORTED_METHOD init ();
 
@@ -490,7 +496,7 @@ private:
     static const kind_type dying = kind_type(detached+1);
 
     //! Propagates state change (if any) from an ancestor
-    /** Checks if one of this object's ancestors is in a new state, and propagates 
+    /** Checks if one of this object's ancestors is in a new state, and propagates
         the new state to all its descendants in this object's heritage line. **/
     template <typename T>
     void propagate_state_from_ancestors ( T task_group_context::*mptr_state, T new_state );
@@ -542,7 +548,7 @@ public:
         //! task object is on free list, or is going to be put there, or was just taken off.
         freed,
         //! task to be recycled as continuation
-        recycle 
+        recycle
     };
 
     //------------------------------------------------------------------------
@@ -595,8 +601,8 @@ public:
     /** The caller must guarantee that the task's refcount does not become zero until
         after the method execute() returns.  Typically, this is done by having
         method execute() return a pointer to a child of the task.  If the guarantee
-        cannot be made, use method recycle_as_safe_continuation instead. 
-       
+        cannot be made, use method recycle_as_safe_continuation instead.
+
         Because of the hazard, this method may be deprecated in the future. */
     void recycle_as_continuation() {
         __TBB_ASSERT( prefix().state==executing, "execute not running?" );
@@ -605,7 +611,7 @@ public:
 
     //! Recommended to use, safe variant of recycle_as_continuation
     /** For safety, it requires additional increment of ref_count.
-        With no decendants and ref_count of 1, it has the semantics of recycle_to_reexecute. */
+        With no descendants and ref_count of 1, it has the semantics of recycle_to_reexecute. */
     void recycle_as_safe_continuation() {
         __TBB_ASSERT( prefix().state==executing, "execute not running?" );
         prefix().state = recycle;
@@ -634,7 +640,7 @@ public:
         prefix().state = reexecute;
     }
 
-    // All depth-related methods are obsolete, and are retained for the sake 
+    // All depth-related methods are obsolete, and are retained for the sake
     // of backward source compatibility only
     intptr_t depth() const {return 0;}
     void set_depth( intptr_t ) {}
@@ -655,13 +661,13 @@ public:
     }
 
     //! Atomically increment reference count and returns its old value.
-    /** Has acquire semantics */  
+    /** Has acquire semantics */
     void increment_ref_count() {
         __TBB_FetchAndIncrementWacquire( &prefix().ref_count );
     }
 
     //! Atomically decrement reference count and returns its new value.
-    /** Has release semantics. */  
+    /** Has release semantics. */
     int decrement_ref_count() {
 #if TBB_USE_THREADING_TOOLS||TBB_USE_ASSERT
         return int(internal_decrement_ref_count());
@@ -698,29 +704,29 @@ public:
     }
 
     //! Enqueue task for starvation-resistant execution.
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
     /** The task will be enqueued on the normal priority level disregarding the
         priority of its task group.
-        
+
         The rationale of such semantics is that priority of an enqueued task is
         statically fixed at the moment of its enqueuing, while task group priority
         is dynamic. Thus automatic priority inheritance would be generally a subject
-        to the race, which may result in unexpected behavior. 
-        
+        to the race, which may result in unexpected behavior.
+
         Use enqueue() overload with explicit priority value and task::group_priority()
         method to implement such priority inheritance when it is really necessary. **/
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
     static void enqueue( task& t ) {
         t.prefix().owner->enqueue( t, NULL );
     }
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
     //! Enqueue task for starvation-resistant execution on the specified priority level.
     static void enqueue( task& t, priority_t p ) {
         __TBB_ASSERT( p == priority_low || p == priority_normal || p == priority_high, "Invalid priority level value" );
         t.prefix().owner->enqueue( t, (void*)p );
     }
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
     //! The innermost task being executed or destroyed by the current thread at the moment.
     static task& __TBB_EXPORTED_FUNC self();
@@ -728,6 +734,14 @@ public:
     //! task on whose behalf this task is working, or NULL if this is a root.
     task* parent() const {return prefix().parent;}
 
+    //! sets parent task pointer to specified value
+    void set_parent(task* p) {
+#if __TBB_TASK_GROUP_CONTEXT
+        __TBB_ASSERT(prefix().context == p->prefix().context, "The tasks must be in the same context");
+#endif
+        prefix().parent = p;
+    }
+
 #if __TBB_TASK_GROUP_CONTEXT
     //! This method is deprecated and will be removed in the future.
     /** Use method group() instead. **/
@@ -735,7 +749,7 @@ public:
 
     //! Pointer to the task group descriptor.
     task_group_context* group () { return prefix().context; }
-#endif /* __TBB_TASK_GROUP_CONTEXT */   
+#endif /* __TBB_TASK_GROUP_CONTEXT */
 
     //! True if task was stolen from the task pool of another thread.
     bool is_stolen_task() const {
@@ -764,7 +778,7 @@ public:
     //------------------------------------------------------------------------
     // Affinity
     //------------------------------------------------------------------------
+
     //! An id as used for specifying affinity.
     /** Guaranteed to be integral type.  Value of 0 means no affinity. */
     typedef internal::affinity_id affinity_id;
@@ -776,8 +790,8 @@ public:
     affinity_id affinity() const {return prefix().affinity;}
 
     //! Invoked by scheduler to notify task that it ran on unexpected thread.
-    /** Invoked before method execute() runs, if task is stolen, or task has 
-        affinity but will be executed on another thread. 
+    /** Invoked before method execute() runs, if task is stolen, or task has
+        affinity but will be executed on another thread.
 
         The default action does nothing. */
     virtual void __TBB_EXPORTED_METHOD note_affinity( affinity_id id );
@@ -792,7 +806,7 @@ public:
         traditional usage model where task group context are allocated locally on
         the stack inapplicable. Dynamic allocation of context objects is performance
         inefficient. Method change_group() allows to make task group context object
-        a member of the task class, and then associate it with its containing task 
+        a member of the task class, and then associate it with its containing task
         object in the latter's constructor. **/
     void __TBB_EXPORTED_METHOD change_group ( task_group_context& ctx );
 
@@ -804,14 +818,14 @@ public:
     bool is_cancelled () const { return prefix().context->is_group_execution_cancelled(); }
 #endif /* __TBB_TASK_GROUP_CONTEXT */
 
-#if TBB_PREVIEW_TASK_PRIORITY
+#if __TBB_TASK_PRIORITY
     //! Changes priority of the task group this task belongs to.
     void set_group_priority ( priority_t p ) {  prefix().context->set_priority(p); }
 
     //! Retrieves current priority of the task group this task belongs to.
     priority_t group_priority () const { return prefix().context->priority(); }
 
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
+#endif /* __TBB_TASK_PRIORITY */
 
 private:
     friend class interface5::internal::task_base;
@@ -824,7 +838,7 @@ private:
     friend class internal::allocate_continuation_proxy;
     friend class internal::allocate_child_proxy;
     friend class internal::allocate_additional_child_of_proxy;
-    
+
     //! Get reference to corresponding task_prefix.
     /** Version tag prevents loader on Linux from using the wrong symbol in debug builds. **/
     internal::task_prefix& prefix( internal::version_tag* = NULL ) const {
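
The task_group_context documentation above contrasts bound and isolated contexts and notes that cancel_group_execution() never fails. A hedged sketch of associating work with an explicit isolated context and cancelling it (assumes a lambda-capable compiler; the context-taking parallel_for overload comes from tbb/parallel_for.h and the error condition is purely illustrative):

    #include "tbb/task.h"
    #include "tbb/parallel_for.h"
    #include "tbb/blocked_range.h"
    #include "tbb/partitioner.h"

    void run_cancellable_work() {
        // Isolated: tasks under this context are never affected by other contexts.
        tbb::task_group_context ctx( tbb::task_group_context::isolated );

        tbb::parallel_for( tbb::blocked_range<int>( 0, 1000 ),
                           [&]( const tbb::blocked_range<int>& r ) {
                               for ( int i = r.begin(); i != r.end(); ++i ) {
                                   if ( i == 500 )                    // illustrative error condition
                                       ctx.cancel_group_execution();  // returns false if already cancelled
                               }
                           },
                           tbb::auto_partitioner(), ctx );
    }
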
index fd4d552d28252df22be6b55cd5f706ec4d68dd31..2e42544a713344b9b19c82b4b357b1614ab0d11e 100644 (file)
@@ -32,6 +32,8 @@
 #include "task.h"
 #include "tbb_exception.h"
 
+#if __TBB_TASK_GROUP_CONTEXT
+
 namespace tbb {
 
 namespace internal {
@@ -245,4 +247,6 @@ task_handle<F> make_task( const F& f ) {
 
 } // namespace tbb
 
+#endif /* __TBB_TASK_GROUP_CONTEXT */
+
 #endif /* __TBB_task_group_H */
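
task_group.h is now compiled only when __TBB_TASK_GROUP_CONTEXT is enabled; the high-level API it provides is unchanged. A brief sketch of the usual pattern (assumes a lambda-capable compiler; task_group::run/wait as documented in the TBB reference):

    #include "tbb/task_group.h"

    int parallel_fib( int n ) {
        if ( n < 2 ) return n;
        int x = 0, y = 0;
        tbb::task_group g;
        g.run( [&] { x = parallel_fib( n - 1 ); } );  // spawned task
        g.run( [&] { y = parallel_fib( n - 2 ); } );  // spawned task
        g.wait();                                     // wait for both to finish
        return x + y;
    }
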
index 2f8658ec3a628689e869a57f64715640818fed76..576d3702a21438207c2cce8fcfb829f37b7db0f7 100644 (file)
@@ -59,13 +59,11 @@ namespace internal {
     as described in task_scheduler_init::initialize().
     @ingroup task_scheduling */
 class task_scheduler_init: internal::no_copy {
-#if TBB_USE_EXCEPTIONS
     enum ExceptionPropagationMode {
         propagation_mode_exact = 1u,
         propagation_mode_captured = 2u,
         propagation_mode_mask = propagation_mode_exact | propagation_mode_captured
     };
-#endif /* TBB_USE_EXCEPTIONS */
 
     /** NULL if not currently initialized. */
     internal::scheduler* my_scheduler;
@@ -100,8 +98,7 @@ public:
 
     //! Shorthand for default constructor followed by call to initialize(number_of_threads).
     task_scheduler_init( int number_of_threads=automatic, stack_size_type thread_stack_size=0 ) : my_scheduler(NULL)  {
-#if TBB_USE_EXCEPTIONS
-        // Take two lowest order bits of the stack size argument to communicate
+        // The two lowest-order bits of the stack size argument may be used to communicate the
         // default exception propagation mode of the client to be used when the
         // client manually creates tasks in the master thread and does not use
         // explicit task group context object. This is necessary because newer 
@@ -109,6 +106,7 @@ public:
         // by older clients that expect tbb::captured_exception wrapper.
         // All zeros mean old client - no preference. 
         __TBB_ASSERT( !(thread_stack_size & propagation_mode_mask), "Requested stack size is not aligned" );
+#if TBB_USE_EXCEPTIONS
         thread_stack_size |= TBB_USE_CAPTURED_EXCEPTION ? propagation_mode_captured : propagation_mode_exact;
 #endif /* TBB_USE_EXCEPTIONS */
         initialize( number_of_threads, thread_stack_size );
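
With the change above, the propagation-mode bits are asserted unconditionally but OR-ed into the stack size only when TBB_USE_EXCEPTIONS is on; client code is unaffected. A typical explicit initialization (the values are illustrative):

    #include "tbb/task_scheduler_init.h"

    int main() {
        // Four worker threads and a 4 MB stack per worker. The two low-order
        // bits of the stack size must be clear, as asserted in the constructor.
        tbb::task_scheduler_init init( 4, 4 * 1024 * 1024 );
        // ... run parallel algorithms ...
        return 0;
    }
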
index 988035fc811b454b8210dc6f8c6504dca51ff78b..8b86b89e52933bcd58dd9c3e7be6a8d2aa9eb8cf 100644 (file)
 
 /** This header is supposed to contain macro definitions and C style comments only.
     The macros defined here are intended to control such aspects of TBB build as 
+    - presence of compiler features
     - compilation modes
     - feature sets
-    - workarounds presence 
+    - known compiler/platform issues
 **/
 
-/** Compilation modes **/
+#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+/** Presence of compiler features **/
+
+#if (__TBB_GCC_VERSION >= 40400) && !defined(__INTEL_COMPILER)
+    /** warning suppression pragmas available in GCC since 4.4 **/
+    #define __TBB_GCC_WARNING_SUPPRESSION_PRESENT 1
+#endif
+
+/* TODO: The following condition should be extended when new compilers/runtimes 
+         with std::exception_ptr support appear. */
+#define __TBB_EXCEPTION_PTR_PRESENT  ((_MSC_VER >= 1600 || (__GXX_EXPERIMENTAL_CXX0X__ && __GNUC__==4 && __GNUC_MINOR__>=4)) && !__INTEL_COMPILER)
+
+#if __GNUC__ || __SUNPRO_CC || __IBMCPP__
+    /* ICC defines __GNUC__ and so is covered */
+    #define __TBB_ATTRIBUTE_ALIGNED_PRESENT 1
+#elif _MSC_VER && (_MSC_VER >= 1300 || __INTEL_COMPILER)
+    #define __TBB_DECLSPEC_ALIGN_PRESENT 1
+#endif
+
+#if (__TBB_GCC_VERSION >= 40102) && !defined(__INTEL_COMPILER)
+    /** built-in atomics available in GCC since 4.1.2 **/
+    #define __TBB_GCC_BUILTIN_ATOMICS_PRESENT 1
+#endif
+
+/** User controlled TBB features & modes **/
 
 #ifndef TBB_USE_DEBUG
 #ifdef TBB_DO_ASSERT
 #define TBB_USE_DEBUG TBB_DO_ASSERT
 #else
+#ifdef _DEBUG
+#define TBB_USE_DEBUG _DEBUG
+#else
 #define TBB_USE_DEBUG 0
+#endif
 #endif /* TBB_DO_ASSERT */
-#else
-#define TBB_DO_ASSERT TBB_USE_DEBUG
 #endif /* TBB_USE_DEBUG */
 
 #ifndef TBB_USE_ASSERT
     #endif
 #endif /* TBB_IMPLEMENT_CPP0X */
 
+#ifndef TBB_USE_CAPTURED_EXCEPTION
+    #if __TBB_EXCEPTION_PTR_PRESENT
+        #define TBB_USE_CAPTURED_EXCEPTION 0
+    #else
+        #define TBB_USE_CAPTURED_EXCEPTION 1
+    #endif
+#else /* defined TBB_USE_CAPTURED_EXCEPTION */
+    #if !TBB_USE_CAPTURED_EXCEPTION && !__TBB_EXCEPTION_PTR_PRESENT
+        #error Current runtime does not support std::exception_ptr. Set TBB_USE_CAPTURED_EXCEPTION and make sure that your code is ready to catch tbb::captured_exception.
+    #endif
+#endif /* defined TBB_USE_CAPTURED_EXCEPTION */
+
+/** Check whether the request to use GCC atomics can be satisfied **/
+#if (TBB_USE_GCC_BUILTINS && !__TBB_GCC_BUILTIN_ATOMICS_PRESENT)
+    #error "GCC atomic built-ins are not supported."
+#endif
+
+/** Internal TBB features & modes **/
+
 #ifndef __TBB_DYNAMIC_LOAD_ENABLED
     #define __TBB_DYNAMIC_LOAD_ENABLED !__TBB_TASK_CPP_DIRECTLY_INCLUDED
 #elif !__TBB_DYNAMIC_LOAD_ENABLED
     #endif
 #endif
 
-/** Feature sets **/
-
 #ifndef __TBB_COUNT_TASK_NODES
     #define __TBB_COUNT_TASK_NODES TBB_USE_ASSERT
 #endif
     #define __TBB_TASK_GROUP_CONTEXT 1
 #endif /* __TBB_TASK_GROUP_CONTEXT */
 
+#if TBB_USE_EXCEPTIONS && !__TBB_TASK_GROUP_CONTEXT
+    #error TBB_USE_EXCEPTIONS requires __TBB_TASK_GROUP_CONTEXT to be enabled
+#endif
+
 #ifndef __TBB_SCHEDULER_OBSERVER
     #define __TBB_SCHEDULER_OBSERVER 1
 #endif /* __TBB_SCHEDULER_OBSERVER */
 
 #ifndef __TBB_TASK_PRIORITY
-    #define __TBB_TASK_PRIORITY __TBB_CPF_BUILD
+    #define __TBB_TASK_PRIORITY __TBB_TASK_GROUP_CONTEXT
 #endif /* __TBB_TASK_PRIORITY */
 
 #if __TBB_TASK_PRIORITY && !__TBB_TASK_GROUP_CONTEXT
     #error __TBB_TASK_PRIORITY requires __TBB_TASK_GROUP_CONTEXT to be enabled
 #endif
 
-#ifdef TBB_PREVIEW_TASK_PRIORITY
-    #if TBB_PREVIEW_TASK_PRIORITY
-        #define __TBB_NO_IMPLICIT_LINKAGE 1
-        #if __TBB_BUILD && !__TBB_TASK_PRIORITY
-            #error TBB_PREVIEW_TASK_PRIORITY requires __TBB_TASK_PRIORITY to be enabled during TBB build
-        #elif !__TBB_TASK_GROUP_CONTEXT
-            #error TBB_PREVIEW_TASK_PRIORITY requires __TBB_TASK_GROUP_CONTEXT to be enabled
-        #endif
-    #endif
-#else
-    #if __TBB_BUILD
-        #define TBB_PREVIEW_TASK_PRIORITY __TBB_TASK_PRIORITY
-    #endif
-#endif /* TBB_PREVIEW_TASK_PRIORITY */
-
 #if !defined(__TBB_SURVIVE_THREAD_SWITCH) && (_WIN32 || _WIN64 || __linux__)
     #define __TBB_SURVIVE_THREAD_SWITCH 1
 #endif /* __TBB_SURVIVE_THREAD_SWITCH */
 
-
-/* TODO: The following condition should be extended as soon as new compilers/runtimes 
-         with std::exception_ptr support appear. */
-#define __TBB_EXCEPTION_PTR_PRESENT  (_MSC_VER >= 1600 || __GXX_EXPERIMENTAL_CXX0X__ && (__GNUC__==4 && __GNUC_MINOR__>=4))
-
-
-#ifndef TBB_USE_CAPTURED_EXCEPTION
-    #if __TBB_EXCEPTION_PTR_PRESENT
-        #define TBB_USE_CAPTURED_EXCEPTION 0
-    #else
-        #define TBB_USE_CAPTURED_EXCEPTION 1
-    #endif
-#else /* defined TBB_USE_CAPTURED_EXCEPTION */
-    #if !TBB_USE_CAPTURED_EXCEPTION && !__TBB_EXCEPTION_PTR_PRESENT
-        #error Current runtime does not support std::exception_ptr. Set TBB_USE_CAPTURED_EXCEPTION and make sure that your code is ready to catch tbb::captured_exception.
-    #endif
-#endif /* defined TBB_USE_CAPTURED_EXCEPTION */
-
-
 #ifndef __TBB_DEFAULT_PARTITIONER
 #if TBB_DEPRECATED
 /** Default partitioner for parallel loop templates in TBB 1.0-2.1 */
 #define __TBB_DEFAULT_PARTITIONER tbb::simple_partitioner
 #else
-/** Default partitioner for parallel loop templates in TBB 2.2 */
+/** Default partitioner for parallel loop templates since TBB 2.2 */
 #define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner
-#endif /* TBB_DEFAULT_PARTITIONER */
+#endif /* TBB_DEPRECATED */
 #endif /* !defined(__TBB_DEFAULT_PARTITIONER */
 
-/** Workarounds presence **/
-
-#if __GNUC__==4 && __GNUC_MINOR__>=4 && !defined(__INTEL_COMPILER)
-    #define __TBB_GCC_WARNING_SUPPRESSION_ENABLED 1
-#endif
-
 /** Macros of the form __TBB_XXX_BROKEN denote known issues that are caused by
     the bugs in compilers, standard or OS specific libraries. They should be 
     removed as soon as the corresponding bugs are fixed or the buggy OS/compiler
     versions go out of the support list. 
 **/
 
+#if __GNUC__ && __TBB_x86_64 && __INTEL_COMPILER == 1200
+    #define __TBB_ICC_12_0_INL_ASM_FSTCW_BROKEN 1
+#endif
+
 #if _MSC_VER && __INTEL_COMPILER && (__INTEL_COMPILER<1110 || __INTEL_COMPILER==1110 && __INTEL_COMPILER_BUILD_DATE < 20091012)
     /** Necessary to avoid ICL error (or warning in non-strict mode): 
         "exception specification for implicitly declared virtual destructor is 
     #define __TBB_TEMPLATE_FRIENDS_BROKEN 1
 #endif
 
-#if __GLIBC__==2 && __GLIBC_MINOR__==3 || __MINGW32__
+#if __GLIBC__==2 && __GLIBC_MINOR__==3 || __MINGW32__ || (__APPLE__ && __INTEL_COMPILER==1200 && !TBB_USE_DEBUG)
     //! Macro controlling EH usages in TBB tests
     /** Some older versions of glibc crash when exception handling happens concurrently. **/
     #define __TBB_THROW_ACROSS_MODULE_BOUNDARY_BROKEN 1
     #define __TBB_ICC_ASM_VOLATILE_BROKEN 1
 #endif
 
-#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-
-/* #if more recent gcc than 4.1.2 */
-#if (__TBB_GCC_VERSION > 40102 ) && !defined(__INTEL_COMPILER)
-    #define __TBB_GCC_BUILTIN_ATOMICS_PRESENT 1
+#if !__INTEL_COMPILER && (_MSC_VER || __GNUC__==3 && __GNUC_MINOR__<=2)
+    /** The GCC 3.2 and MSVC compilers sometimes return 0 for __alignof(T)
+        when T has not yet been instantiated. **/
+    #define __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN 1
 #endif
 
-#if (TBB_USE_GCC_BUILTINS && !__TBB_GCC_BUILTIN_ATOMICS_PRESENT)
-    #error "generic gcc port is not supported for this os/architecture."
-#endif
 #endif /* __TBB_tbb_config_H */
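
The reorganized config header now derives presence macros from __TBB_GCC_VERSION (major*10000 + minor*100 + patchlevel; GCC 4.4.3 gives 4*10000 + 4*100 + 3 = 40403). A hedged sketch of how client code might key off one of the presence macros defined above:

    #include "tbb/tbb_config.h"

    /* With GCC 4.4.3, __TBB_GCC_VERSION == 40403 >= 40400, so
       __TBB_GCC_WARNING_SUPPRESSION_PRESENT is defined above and
       the suppression pragma can safely be used. */
    #if __TBB_GCC_WARNING_SUPPRESSION_PRESENT
        #pragma GCC diagnostic ignored "-Wuninitialized"
    #endif
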
index 8b43a1285500f5302fb34221f7a8d635917538bb..50636e5d274e8d4d8bd186fdba587d32a8ef8f4b 100644 (file)
 #ifndef __TBB_machine_H
 #define __TBB_machine_H
 
+/** This header provides basic platform abstraction layer by hooking up appropriate
+    architecture/OS/compiler specific headers from the /include/tbb/machine directory.
+    If a plug-in header does not implement all the required APIs, it must specify
+    the missing ones by setting one or more of the following macros:
+
+    __TBB_USE_GENERIC_PART_WORD_CAS
+    __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
+    __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
+    __TBB_USE_GENERIC_FETCH_ADD
+    __TBB_USE_GENERIC_FETCH_STORE
+    __TBB_USE_GENERIC_DWORD_FETCH_ADD
+    __TBB_USE_GENERIC_DWORD_FETCH_STORE
+    __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
+    __TBB_USE_GENERIC_FULL_FENCED_LOAD_STORE
+    __TBB_USE_GENERIC_RELAXED_LOAD_STORE
+    __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
+
+    In this case tbb_machine.h will add the missing functionality based on a minimal set
+    of APIs that are required to be implemented by all plug-in headers, as described
+    further below.
+    Note that these generic implementations may be sub-optimal for a particular
+    architecture, and thus should be relied upon only after careful evaluation
+    or as the last resort.
+
+    Additionally __TBB_64BIT_ATOMICS can be set to 0 on a 32-bit architecture to
+    indicate that the port is not going to support double word atomics. It may also
+    be set to 1 explicitly, though normally this is not necessary as tbb_machine.h
+    will set it automatically.
+
+    Prerequisites for each architecture port
+    ----------------------------------------
+    The following functions have no generic implementation. Therefore they must be 
+    implemented in each machine architecture specific header either as a conventional
+    function or as a functional macro.
+
+    __TBB_Yield()
+        Signals OS that the current thread is willing to relinquish the remainder
+        of its time quantum.
+
+    __TBB_full_memory_fence()
+        Must prevent all memory operations from being reordered across it (both
+        by hardware and compiler). All such fences must be totally ordered (or
+        sequentially consistent).
+
+    __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand )
+        Must be provided if __TBB_USE_FENCED_ATOMICS is not set.
+
+    __TBB_machine_cmpswp8( volatile void *ptr, int64_t value, int64_t comparand )
+        Must be provided for 64-bit architectures if __TBB_USE_FENCED_ATOMICS is not set,
+        and for 32-bit architectures if __TBB_64BIT_ATOMICS is set
+
+    __TBB_machine_<op><S><fence>(...), where
+        <op> = {cmpswp, fetchadd, fetchstore}
+        <S> = {1, 2, 4, 8}
+        <fence> = {full_fence, acquire, release, relaxed}
+        Must be provided if __TBB_USE_FENCED_ATOMICS is set.
+
+    __TBB_control_consistency_helper()
+        Bridges the memory-semantics gap between architectures providing only
+        implicit C++0x "consume" semantics (like Power Architecture) and those
+        also implicitly obeying control dependencies (like Itanium).
+        It must be used only in conditional code where the condition is itself
+        data-dependent, and will then make subsequent code behave as if the
+        original data dependency were acquired.
+        It needs only an empty definition where implied by the architecture
+        either specifically (Itanium) or because generally stronger C++0x "acquire"
+        semantics are enforced (like x86).
+    
+    __TBB_acquire_consistency_helper(), __TBB_release_consistency_helper()
+        Must be provided if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE is set.
+        Enforce acquire and release semantics in generic implementations of fenced
+        store and load operations. Depending on the particular architecture/compiler
+        combination they may be a hardware fence, a compiler fence, both or nothing.
+ **/
+
 #include "tbb_stddef.h"
 
+namespace tbb {
+namespace internal {
+
+////////////////////////////////////////////////////////////////////////////////
+// Overridable helpers declarations
+//
+// A machine/*.h file may choose to define these templates; otherwise it must
+// request the default implementations by setting the appropriate __TBB_USE_GENERIC_XXX macro(s).
+//
+template <typename T, std::size_t S>
+struct machine_load_store;
+
+template <typename T, std::size_t S>
+struct machine_load_store_relaxed;
+
+template <typename T, std::size_t S>
+struct machine_load_store_seq_cst;
+//
+// End of overridable helpers declarations
+////////////////////////////////////////////////////////////////////////////////
+
+template<size_t S> struct atomic_selector;
+
+template<> struct atomic_selector<1> {
+    typedef int8_t word;
+    inline static word fetch_store ( volatile void* location, word value );
+};
+
+template<> struct atomic_selector<2> {
+    typedef int16_t word;
+    inline static word fetch_store ( volatile void* location, word value );
+};
+
+template<> struct atomic_selector<4> {
+#if _MSC_VER && !_WIN64
+    // Work-around that avoids spurious /Wp64 warnings
+    typedef intptr_t word;
+#else
+    typedef int32_t word;
+#endif
+    inline static word fetch_store ( volatile void* location, word value );
+};
+
+template<> struct atomic_selector<8> {
+    typedef int64_t word;
+    inline static word fetch_store ( volatile void* location, word value );
+};
+
+}} // namespaces internal, tbb
+
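
The documentation block above lists what every machine/*.h plug-in must provide and which gaps tbb_machine.h can fill generically. A hedged sketch of the skeleton a hypothetical machine/my_port.h could follow (the port name, the use of GCC builtins and sched_yield are illustrative choices, not part of this commit):

    // machine/my_port.h -- hypothetical minimal 32-bit port built on GCC builtins
    #include <sched.h>   // for sched_yield(), illustrative choice

    #define __TBB_WORDSIZE   4
    #define __TBB_BIG_ENDIAN 0

    // Mandatory primitives (no generic fallback exists for these):
    #define __TBB_Yield()                       sched_yield()
    #define __TBB_full_memory_fence()           __sync_synchronize()
    #define __TBB_control_consistency_helper()  __asm__ __volatile__("": : :"memory")
    #define __TBB_acquire_consistency_helper()  __asm__ __volatile__("": : :"memory")
    #define __TBB_release_consistency_helper()  __asm__ __volatile__("": : :"memory")

    static inline int32_t __TBB_machine_cmpswp4( volatile void* ptr, int32_t value, int32_t comparand ) {
        return __sync_val_compare_and_swap( (volatile int32_t*)ptr, comparand, value );
    }

    // Ask tbb_machine.h to synthesize everything else from the 4-byte CAS:
    #define __TBB_USE_GENERIC_PART_WORD_CAS          1
    #define __TBB_USE_GENERIC_FETCH_ADD              1
    #define __TBB_USE_GENERIC_FETCH_STORE            1
    #define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE 1
    #define __TBB_USE_GENERIC_RELAXED_LOAD_STORE     1
    #define __TBB_64BIT_ATOMICS                      0   // no double-word atomics on this port
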
 #if _WIN32||_WIN64
 
 #ifdef _MANAGED
         #endif
     #elif defined(_M_IX86)
         #include "machine/windows_ia32.h"
-    #elif defined(_M_AMD64) 
+    #elif defined(_M_X64) 
         #include "machine/windows_intel64.h"
-    #elif _XBOX 
+    #elif _XBOX
         #include "machine/xbox360_ppc.h"
     #endif
 
 
 #elif __sun || __SUNPRO_CC
 
-    #define __asm__ asm 
+    #define __asm__ asm
     #define __volatile__ volatile
-    
+
     #if __i386  || __i386__
         #include "machine/linux_ia32.h"
     #elif __x86_64__
 #endif /* OS selection */
 
 #ifndef __TBB_64BIT_ATOMICS
-#define __TBB_64BIT_ATOMICS 1
+    #define __TBB_64BIT_ATOMICS 1
 #endif
 
-//! Prerequisites for each architecture port
-/** There are no generic implementation for these macros so they have to be implemented
-    in each machine architecture specific header.
+// Special atomic functions
+#if __TBB_USE_FENCED_ATOMICS
+    #define __TBB_machine_cmpswp1   __TBB_machine_cmpswp1full_fence
+    #define __TBB_machine_cmpswp2   __TBB_machine_cmpswp2full_fence
+    #define __TBB_machine_cmpswp4   __TBB_machine_cmpswp4full_fence
+    #define __TBB_machine_cmpswp8   __TBB_machine_cmpswp8full_fence
+
+    #if __TBB_WORDSIZE==8
+        #define __TBB_machine_fetchadd8             __TBB_machine_fetchadd8full_fence
+        #define __TBB_machine_fetchstore8           __TBB_machine_fetchstore8full_fence
+        #define __TBB_FetchAndAddWrelease(P,V)      __TBB_machine_fetchadd8release(P,V)
+        #define __TBB_FetchAndIncrementWacquire(P)  __TBB_machine_fetchadd8acquire(P,1)
+        #define __TBB_FetchAndDecrementWrelease(P)  __TBB_machine_fetchadd8release(P,(-1))
+    #else
+        #error Define macros for 4-byte word, similarly to the above __TBB_WORDSIZE==8 branch.
+    #endif /* __TBB_WORDSIZE==4 */
+#else /* !__TBB_USE_FENCED_ATOMICS */
+    #define __TBB_FetchAndAddWrelease(P,V)      __TBB_FetchAndAddW(P,V)
+    #define __TBB_FetchAndIncrementWacquire(P)  __TBB_FetchAndAddW(P,1)
+    #define __TBB_FetchAndDecrementWrelease(P)  __TBB_FetchAndAddW(P,(-1))
+#endif /* !__TBB_USE_FENCED_ATOMICS */
+
+#if __TBB_WORDSIZE==4
+    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp4(P,V,C)
+    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd4(P,V)
+    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore4(P,V)
+#elif  __TBB_WORDSIZE==8
+    #if __TBB_USE_GENERIC_DWORD_LOAD_STORE || __TBB_USE_GENERIC_DWORD_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_STORE
+        #error These macros should only be used on 32-bit platforms.
+    #endif
 
-    __TBB_full_memory_fence must prevent all memory operations from being reordered 
-    across the fence. And all such fences must be totally ordered (or sequentially 
-    consistent). These fence must affect both compiler and hardware.
-    
-    __TBB_release_consistency_helper is used to enforce guarantees of acquire or 
-    release semantics in generic implementations of __TBB_load_with_acquire and 
-    __TBB_store_with_release below. Depending on the particular combination of
-    architecture+compiler it can be a hardware fence, a compiler fence, both or
-    nothing. **/
-#if    !defined(__TBB_CompareAndSwap4) \
-    || !defined(__TBB_CompareAndSwap8) && __TBB_64BIT_ATOMICS \
-    || !defined(__TBB_Yield)           \
-    || !defined(__TBB_full_memory_fence)    \
-    || !defined(__TBB_release_consistency_helper)
-#error Minimal requirements for tbb_machine.h not satisfied; platform is not supported.
-#endif
+    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp8(P,V,C)
+    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd8(P,V)
+    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore8(P,V)
+#else /* __TBB_WORDSIZE != 8 */
+    #error Unsupported machine word size.
+#endif /* __TBB_WORDSIZE */
 
 #ifndef __TBB_Pause
     inline void __TBB_Pause(int32_t) {
@@ -150,7 +292,7 @@ namespace internal {
 //! Class that implements exponential backoff.
 /** See implementation of spin_wait_while_eq for an example. */
 class atomic_backoff : no_copy {
-    //! Time delay, in units of "pause" instructions. 
+    //! Time delay, in units of "pause" instructions.
     /** Should be equal to approximately the number of "pause" instructions
         that take the same time as an context switch. */
     static const int32_t LOOPS_BEFORE_YIELD = 16;
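
The backoff class touched here pairs with spin loops such as spin_wait_while_eq; pause() roughly doubles the delay on each call and yields once it exceeds LOOPS_BEFORE_YIELD. A hedged, purely illustrative sketch of the pattern:

    // Illustrative only: spin until 'location' stops being equal to 'value',
    // backing off exponentially between probes.
    template<typename T, typename U>
    void spin_wait_while_eq_sketch( const volatile T& location, U value ) {
        tbb::internal::atomic_backoff backoff;
        while ( location == value )
            backoff.pause();
    }
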
@@ -221,8 +363,9 @@ inline T __TBB_MaskedCompareAndSwap (volatile T *ptr, T value, T comparand ) {
         result = *base; // reload the base value which might change during the pause
         uint32_t old_value = ( result & ~mask ) | ( comparand << bitoffset );
         uint32_t new_value = ( result & ~mask ) | ( value << bitoffset );
-        // __TBB_CompareAndSwap4 presumed to have full fence. 
-        result = __TBB_CompareAndSwap4( base, new_value, old_value );
+        // __TBB_CompareAndSwap4 presumed to have full fence.
+        // Cast shuts up /Wp64 warning
+        result = (uint32_t)__TBB_machine_cmpswp4( base, new_value, old_value );
         if(  result==old_value               // CAS succeeded
           || ((result^old_value)&mask)!=0 )  // CAS failed and the bits of interest have changed
             break;
@@ -233,37 +376,36 @@ inline T __TBB_MaskedCompareAndSwap (volatile T *ptr, T value, T comparand ) {
 }
 
 template<size_t S, typename T>
-inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand ) { 
-    return __TBB_CompareAndSwapW((T *)ptr,value,comparand);
-}
+inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand );
 
 template<>
 inline uint8_t __TBB_CompareAndSwapGeneric <1,uint8_t> (volatile void *ptr, uint8_t value, uint8_t comparand ) {
-#ifdef __TBB_CompareAndSwap1
-    return __TBB_CompareAndSwap1(ptr,value,comparand);
-#else
+#if __TBB_USE_GENERIC_PART_WORD_CAS
     return __TBB_MaskedCompareAndSwap<1,uint8_t>((volatile uint8_t *)ptr,value,comparand);
+#else
+    return __TBB_machine_cmpswp1(ptr,value,comparand);
 #endif
 }
 
 template<>
 inline uint16_t __TBB_CompareAndSwapGeneric <2,uint16_t> (volatile void *ptr, uint16_t value, uint16_t comparand ) {
-#ifdef __TBB_CompareAndSwap2
-    return __TBB_CompareAndSwap2(ptr,value,comparand);
-#else
+#if __TBB_USE_GENERIC_PART_WORD_CAS
     return __TBB_MaskedCompareAndSwap<2,uint16_t>((volatile uint16_t *)ptr,value,comparand);
+#else
+    return __TBB_machine_cmpswp2(ptr,value,comparand);
 #endif
 }
 
 template<>
-inline uint32_t __TBB_CompareAndSwapGeneric <4,uint32_t> (volatile void *ptr, uint32_t value, uint32_t comparand ) { 
-    return __TBB_CompareAndSwap4(ptr,value,comparand);
+inline uint32_t __TBB_CompareAndSwapGeneric <4,uint32_t> (volatile void *ptr, uint32_t value, uint32_t comparand ) {
+    // Cast shuts up /Wp64 warning
+    return (uint32_t)__TBB_machine_cmpswp4(ptr,value,comparand);
 }
 
 #if __TBB_64BIT_ATOMICS
 template<>
-inline uint64_t __TBB_CompareAndSwapGeneric <8,uint64_t> (volatile void *ptr, uint64_t value, uint64_t comparand ) { 
-    return __TBB_CompareAndSwap8(ptr,value,comparand);
+inline uint64_t __TBB_CompareAndSwapGeneric <8,uint64_t> (volatile void *ptr, uint64_t value, uint64_t comparand ) {
+    return __TBB_machine_cmpswp8(ptr,value,comparand);
 }
 #endif
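
For a port that sets __TBB_USE_GENERIC_PART_WORD_CAS, the 1- and 2-byte specializations above fall back to __TBB_MaskedCompareAndSwap, which widens the operation to the containing aligned 4-byte word. A small worked illustration of that widening (little-endian layout assumed; the concrete values are illustrative):

    // Suppose ptr points at the byte at offset 1 inside an aligned 32-bit word
    // whose current contents are 0x44332211, and we CAS that byte 0x22 -> 0x99.
    //   base      = the containing aligned 4-byte word
    //   bitoffset = 8                                   // byte 1, little-endian
    //   mask      = 0x0000FF00
    //   old_value = (0x44332211 & ~mask) | (0x22 << 8)  // == 0x44332211
    //   new_value = (0x44332211 & ~mask) | (0x99 << 8)  // == 0x44339911
    // A 4-byte __TBB_machine_cmpswp4(base, new_value, old_value) then performs the
    // byte-sized CAS; the loop retries only when the CAS fails because one of the
    // other three bytes changed, and gives up once the byte of interest itself changes.
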
 
@@ -273,8 +415,8 @@ inline T __TBB_FetchAndAddGeneric (volatile void *ptr, T addend) {
     T result;
     for(;;) {
         result = *reinterpret_cast<volatile T *>(ptr);
-        // __TBB_CompareAndSwapGeneric presumed to have full fence. 
-        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result ) 
+        // __TBB_CompareAndSwapGeneric presumed to have full fence.
+        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result )
             break;
         b.pause();
     }
@@ -288,59 +430,275 @@ inline T __TBB_FetchAndStoreGeneric (volatile void *ptr, T value) {
     for(;;) {
         result = *reinterpret_cast<volatile T *>(ptr);
         // __TBB_CompareAndSwapGeneric presumed to have full fence.
-        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result ) 
+        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result )
             break;
         b.pause();
     }
     return result;
 }
 
+#if __TBB_USE_GENERIC_PART_WORD_CAS
+#define __TBB_machine_cmpswp1 tbb::internal::__TBB_CompareAndSwapGeneric<1,uint8_t>
+#define __TBB_machine_cmpswp2 tbb::internal::__TBB_CompareAndSwapGeneric<2,uint16_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
+#define __TBB_machine_fetchadd1 tbb::internal::__TBB_FetchAndAddGeneric<1,uint8_t>
+#define __TBB_machine_fetchadd2 tbb::internal::__TBB_FetchAndAddGeneric<2,uint16_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_ADD 
+#define __TBB_machine_fetchadd4 tbb::internal::__TBB_FetchAndAddGeneric<4,uint32_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_ADD
+#define __TBB_machine_fetchadd8 tbb::internal::__TBB_FetchAndAddGeneric<8,uint64_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
+#define __TBB_machine_fetchstore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,uint8_t>
+#define __TBB_machine_fetchstore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,uint16_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_STORE 
+#define __TBB_machine_fetchstore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,uint32_t>
+#endif
+
+#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_DWORD_FETCH_STORE
+#define __TBB_machine_fetchstore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,uint64_t>
+#endif
+
+#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
+#define __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(S)                                             \
+    atomic_selector<S>::word atomic_selector<S>::fetch_store ( volatile void* location, word value ) {  \
+        return __TBB_machine_fetchstore##S( location, value );                                          \
+    }
+
+__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(1)
+__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(2)
+__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(4)
+__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(8)
+
+#undef __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE
+#endif /* __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
+
+#if __TBB_USE_GENERIC_DWORD_LOAD_STORE
+inline void __TBB_machine_store8 (volatile void *ptr, int64_t value) {
+    for(;;) {
+        int64_t result = *(int64_t *)ptr;
+        if( __TBB_machine_cmpswp8(ptr,value,result)==result ) break;
+    }
+}
+
+inline int64_t __TBB_machine_load8 (const volatile void *ptr) {
+    // Comparand and new value may be anything; they only have to be equal, and
+    // the value should have a low probability of actually being found in 'location'.
+    const int64_t anyvalue = 2305843009213693951;
+    return __TBB_machine_cmpswp8(const_cast<volatile void *>(ptr),anyvalue,anyvalue);
+}
+#endif /* __TBB_USE_GENERIC_DWORD_LOAD_STORE */
+
+#if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
+/** Fenced operations use the volatile qualifier to prevent the compiler from optimizing
+    them out, and, on architectures with weak memory ordering, to induce the compiler
+    to generate code with appropriate acquire/release semantics.
+    On architectures like IA32, Intel64 (and likely Sparc TSO) volatile has
+    no effect on code gen, and the consistency helpers serve as a compiler fence (the
+    latter being true for IA64/gcc as well to fix a bug in some gcc versions). **/
+template <typename T, size_t S>
+struct machine_load_store {
+    static T load_with_acquire ( const volatile T& location ) {
+        T to_return = location;
+        __TBB_acquire_consistency_helper();
+        return to_return;
+    }
+    static void store_with_release ( volatile T &location, T value ) {
+        __TBB_release_consistency_helper();
+        location = value;
+    }
+};
+
+#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
+template <typename T>
+struct machine_load_store<T,8> {
+    static T load_with_acquire ( const volatile T& location ) {
+        return (T)__TBB_machine_load8( (const volatile void*)&location );
+    }
+    static void store_with_release ( volatile T& location, T value ) {
+        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
+    }
+};
+#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
+#endif /* __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE */
+
+template <typename T, size_t S>
+struct machine_load_store_seq_cst {
+    static T load ( const volatile T& location ) {
+        __TBB_full_memory_fence();
+        return machine_load_store<T,S>::load_with_acquire( location );
+    }
+#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
+    static void store ( volatile T &location, T value ) {
+        atomic_selector<S>::fetch_store( (volatile void*)&location, (typename atomic_selector<S>::word)value );
+    }
+#else /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
+    static void store ( volatile T &location, T value ) {
+        machine_load_store<T,S>::store_with_release( location, value );
+        __TBB_full_memory_fence();
+    }
+#endif /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
+};
+
+#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
+/** The implementation does not use functions __TBB_machine_load8/store8 as they
+    are not required to be sequentially consistent. **/
+template <typename T>
+struct machine_load_store_seq_cst<T,8> {
+    static T load ( const volatile T& location ) {
+        // Comparand and new value may be anything; they only have to be equal, and
+        // the value should have a low probability of actually being found in 'location'.
+        const int64_t anyvalue = 2305843009213693951ll;
+        return __TBB_machine_cmpswp8( (volatile void*)const_cast<volatile T*>(&location), anyvalue, anyvalue );
+    }
+    static void store ( volatile T &location, T value ) {
+        int64_t result = (volatile int64_t&)location;
+        while ( __TBB_machine_cmpswp8((volatile void*)&location, (int64_t)value, result) != result )
+            result = (volatile int64_t&)location;
+    }
+};
+#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
+
+#if __TBB_USE_GENERIC_RELAXED_LOAD_STORE
+// Relaxed operations add a volatile qualifier to prevent the compiler from optimizing them out.
+/** Volatile should not incur any additional cost on IA32, Intel64, and Sparc TSO
+    architectures. However, on architectures with weak memory ordering the compiler may
+    generate code with acquire/release semantics for operations on volatile data. **/
+template <typename T, size_t S>
+struct machine_load_store_relaxed {
+    static inline T load ( const volatile T& location ) {
+        return location;
+    }
+    static inline void store ( volatile T& location, T value ) {
+        location = value;
+    }
+};
+
+#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
+template <typename T>
+struct machine_load_store_relaxed<T,8> {
+    static inline T load ( const volatile T& location ) {
+        return (T)__TBB_machine_load8( (const volatile void*)&location );
+    }
+    static inline void store ( volatile T& location, T value ) {
+        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
+    }
+};
+#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
+#endif /* __TBB_USE_GENERIC_RELAXED_LOAD_STORE */
+
+template<typename T>
+inline T __TBB_load_with_acquire(const volatile T &location) {
+    return machine_load_store<T,sizeof(T)>::load_with_acquire( location );
+}
+template<typename T, typename V>
+inline void __TBB_store_with_release(volatile T& location, V value) {
+    machine_load_store<T,sizeof(T)>::store_with_release( location, T(value) );
+}
+//! Overload that exists solely to avoid /Wp64 warnings.
+inline void __TBB_store_with_release(volatile size_t& location, size_t value) {
+    machine_load_store<size_t,sizeof(size_t)>::store_with_release( location, value );
+}
+
+template<typename T>
+inline T __TBB_load_full_fence(const volatile T &location) {
+    return machine_load_store_seq_cst<T,sizeof(T)>::load( location );
+}
+template<typename T, typename V>
+inline void __TBB_store_full_fence(volatile T& location, V value) {
+    machine_load_store_seq_cst<T,sizeof(T)>::store( location, T(value) );
+}
+//! Overload that exists solely to avoid /Wp64 warnings.
+inline void __TBB_store_full_fence(volatile size_t& location, size_t value) {
+    machine_load_store_seq_cst<size_t,sizeof(size_t)>::store( location, value );
+}
+
+template<typename T>
+inline T __TBB_load_relaxed (const volatile T& location) {
+    return machine_load_store_relaxed<T,sizeof(T)>::load( const_cast<T&>(location) );
+}
+template<typename T, typename V>
+inline void __TBB_store_relaxed ( volatile T& location, V value ) {
+    machine_load_store_relaxed<T,sizeof(T)>::store( const_cast<T&>(location), T(value) );
+}
+//! Overload that exists solely to avoid /Wp64 warnings.
+inline void __TBB_store_relaxed ( volatile size_t& location, size_t value ) {
+    machine_load_store_relaxed<size_t,sizeof(size_t)>::store( const_cast<size_t&>(location), value );
+}
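
Taken together, these wrappers are the portable load/store layer used by tbb::atomic and the rest of the library. A hypothetical usage sketch (standalone code, not taken from the library) of publishing a value through a release store and observing it through an acquire load:

    // Sketch: a reader that sees ready==1 via an acquire load is guaranteed to
    // also see the preceding plain write to payload.
    static int payload = 0;
    static volatile char ready = 0;

    void producer() {
        payload = 42;                                          // plain write
        tbb::internal::__TBB_store_with_release( ready, (char)1 );
    }

    void consumer() {
        while ( !tbb::internal::__TBB_load_with_acquire( ready ) )
            continue;                                          // spin until the flag is visible
        // payload now reads 42
    }
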
+
 // Macro __TBB_TypeWithAlignmentAtLeastAsStrict(T) should be a type with alignment at least as 
-// strict as type T.  Type type should have a trivial default constructor and destructor, so that
-// arrays of that type can be declared without initializers.  
+// strict as type T.  The type should have a trivial default constructor and destructor, so that
+// arrays of that type can be declared without initializers.
 // It is correct (but perhaps a waste of space) if __TBB_TypeWithAlignmentAtLeastAsStrict(T) expands
 // to a type bigger than T.
 // The default definition here works on machines where integers are naturally aligned and the
-// strictest alignment is 16.
+// strictest alignment is 64.
 #ifndef __TBB_TypeWithAlignmentAtLeastAsStrict
 
-#if __GNUC__ || __SUNPRO_CC || __IBMCPP__
-struct __TBB_machine_type_with_strictest_alignment {
-    int member[4];
-} __attribute__((aligned(16)));
-#elif _MSC_VER
-__declspec(align(16)) struct __TBB_machine_type_with_strictest_alignment {
-    int member[4];
+#if __TBB_ATTRIBUTE_ALIGNED_PRESENT
+
+#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
+struct __TBB_machine_type_with_alignment_##PowerOf2 { \
+    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
+} __attribute__((aligned(PowerOf2)));
+#define __TBB_alignof(T) __alignof__(T)
+
+#elif __TBB_DECLSPEC_ALIGN_PRESENT
+
+#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
+__declspec(align(PowerOf2))                           \
+struct __TBB_machine_type_with_alignment_##PowerOf2 { \
+    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
 };
-#else
-#error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T) or __TBB_machine_type_with_strictest_alignment
+#define __TBB_alignof(T) __alignof(T)
+
+#else /* A compiler with unknown syntax for data alignment */
+#error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T)
 #endif
 
-template<size_t N> struct type_with_alignment {__TBB_machine_type_with_strictest_alignment member;};
+/* Now declare types aligned to useful powers of two */
+// TODO: Is __TBB_DefineTypeWithAlignment(8) needed on 32-bit platforms?
+__TBB_DefineTypeWithAlignment(16)
+__TBB_DefineTypeWithAlignment(32)
+__TBB_DefineTypeWithAlignment(64)
+
+typedef __TBB_machine_type_with_alignment_64 __TBB_machine_type_with_strictest_alignment;
+
+// Primary template is a declaration of an incomplete type so that instantiation fails for unknown alignments
+template<size_t N> struct type_with_alignment;
+
+// Specializations for allowed alignments
 template<> struct type_with_alignment<1> { char member; };
 template<> struct type_with_alignment<2> { uint16_t member; };
 template<> struct type_with_alignment<4> { uint32_t member; };
 template<> struct type_with_alignment<8> { uint64_t member; };
+template<> struct type_with_alignment<16> {__TBB_machine_type_with_alignment_16 member; };
+template<> struct type_with_alignment<32> {__TBB_machine_type_with_alignment_32 member; };
+template<> struct type_with_alignment<64> {__TBB_machine_type_with_alignment_64 member; };
 
-#if _MSC_VER||defined(__GNUC__)&&__GNUC__==3 && __GNUC_MINOR__<=2  
+#if __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN  
 //! Workaround for a bug in GNU 3.2 and MSVC compilers.
 /** The bug is that the compiler sometimes returns 0 for __alignof(T) when T has not yet been instantiated.
     The workaround forces instantiation by forcing computation of sizeof(T) before __alignof(T). */
-template<size_t Size, typename T> 
+template<size_t Size, typename T>
 struct work_around_alignment_bug {
-#if _MSC_VER
-    static const size_t alignment = __alignof(T);
-#else
-    static const size_t alignment = __alignof__(T);
-#endif
+    static const size_t alignment = __TBB_alignof(T);
 };
 #define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<tbb::internal::work_around_alignment_bug<sizeof(T),T>::alignment>
-#elif __GNUC__ || __SUNPRO_CC || __IBMCPP__
-#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__alignof__(T)>
 #else
-#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) __TBB_machine_type_with_strictest_alignment
-#endif
-#endif  /* ____TBB_TypeWithAlignmentAtLeastAsStrict */
+#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__TBB_alignof(T)>
+#endif  /* __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN */
+
+#endif  /* __TBB_TypeWithAlignmentAtLeastAsStrict */
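
A typical consumer of this macro is raw storage that must be at least as strictly aligned as some type so that the type can later be constructed into it in place, much as tbb::aligned_space does. A hypothetical sketch, with MyType standing in for any user-supplied type:

    // Sketch: uninitialized storage whose alignment is at least as strict as MyType's.
    template<typename MyType>
    union raw_slot {
        __TBB_TypeWithAlignmentAtLeastAsStrict(MyType) aligner; // forces the alignment
        char bytes[sizeof(MyType)];                             // the actual storage
    };
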
 
 // Template class here is to avoid instantiation of the static data for modules that don't use it
 template<typename T>
@@ -372,262 +730,13 @@ const T reverse<T>::byte_table[256] = {
 } // namespace internal
 } // namespace tbb
 
-#ifndef __TBB_CompareAndSwap1
-#define __TBB_CompareAndSwap1 tbb::internal::__TBB_CompareAndSwapGeneric<1,uint8_t>
-#endif
-
-#ifndef __TBB_CompareAndSwap2 
-#define __TBB_CompareAndSwap2 tbb::internal::__TBB_CompareAndSwapGeneric<2,uint16_t>
-#endif
-
-#ifndef __TBB_CompareAndSwapW
-#define __TBB_CompareAndSwapW tbb::internal::__TBB_CompareAndSwapGeneric<sizeof(ptrdiff_t),ptrdiff_t>
-#endif
-
-#ifndef __TBB_FetchAndAdd1
-#define __TBB_FetchAndAdd1 tbb::internal::__TBB_FetchAndAddGeneric<1,uint8_t>
-#endif
-
-#ifndef __TBB_FetchAndAdd2
-#define __TBB_FetchAndAdd2 tbb::internal::__TBB_FetchAndAddGeneric<2,uint16_t>
-#endif
-
-#ifndef __TBB_FetchAndAdd4
-#define __TBB_FetchAndAdd4 tbb::internal::__TBB_FetchAndAddGeneric<4,uint32_t>
-#endif
-
-#ifndef __TBB_FetchAndAdd8
-#define __TBB_FetchAndAdd8 tbb::internal::__TBB_FetchAndAddGeneric<8,uint64_t>
-#endif
-
-#ifndef __TBB_FetchAndAddW
-#define __TBB_FetchAndAddW tbb::internal::__TBB_FetchAndAddGeneric<sizeof(ptrdiff_t),ptrdiff_t>
-#endif
-
-#ifndef __TBB_FetchAndStore1
-#define __TBB_FetchAndStore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,uint8_t>
-#endif
-
-#ifndef __TBB_FetchAndStore2
-#define __TBB_FetchAndStore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,uint16_t>
-#endif
-
-#ifndef __TBB_FetchAndStore4
-#define __TBB_FetchAndStore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,uint32_t>
-#endif
-
-#ifndef __TBB_FetchAndStore8
-#define __TBB_FetchAndStore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,uint64_t>
-#endif
-
-#ifndef __TBB_FetchAndStoreW
-#define __TBB_FetchAndStoreW tbb::internal::__TBB_FetchAndStoreGeneric<sizeof(ptrdiff_t),ptrdiff_t>
-#endif
-
-#if __TBB_DECL_FENCED_ATOMICS
-
-#ifndef __TBB_CompareAndSwap1__TBB_full_fence
-#define __TBB_CompareAndSwap1__TBB_full_fence __TBB_CompareAndSwap1
-#endif 
-#ifndef __TBB_CompareAndSwap1acquire
-#define __TBB_CompareAndSwap1acquire __TBB_CompareAndSwap1__TBB_full_fence
-#endif 
-#ifndef __TBB_CompareAndSwap1release
-#define __TBB_CompareAndSwap1release __TBB_CompareAndSwap1__TBB_full_fence
-#endif 
-
-#ifndef __TBB_CompareAndSwap2__TBB_full_fence
-#define __TBB_CompareAndSwap2__TBB_full_fence __TBB_CompareAndSwap2
-#endif
-#ifndef __TBB_CompareAndSwap2acquire
-#define __TBB_CompareAndSwap2acquire __TBB_CompareAndSwap2__TBB_full_fence
-#endif
-#ifndef __TBB_CompareAndSwap2release
-#define __TBB_CompareAndSwap2release __TBB_CompareAndSwap2__TBB_full_fence
-#endif
-
-#ifndef __TBB_CompareAndSwap4__TBB_full_fence
-#define __TBB_CompareAndSwap4__TBB_full_fence __TBB_CompareAndSwap4
-#endif 
-#ifndef __TBB_CompareAndSwap4acquire
-#define __TBB_CompareAndSwap4acquire __TBB_CompareAndSwap4__TBB_full_fence
-#endif 
-#ifndef __TBB_CompareAndSwap4release
-#define __TBB_CompareAndSwap4release __TBB_CompareAndSwap4__TBB_full_fence
-#endif 
-
-#ifndef __TBB_CompareAndSwap8__TBB_full_fence
-#define __TBB_CompareAndSwap8__TBB_full_fence __TBB_CompareAndSwap8
-#endif
-#ifndef __TBB_CompareAndSwap8acquire
-#define __TBB_CompareAndSwap8acquire __TBB_CompareAndSwap8__TBB_full_fence
-#endif
-#ifndef __TBB_CompareAndSwap8release
-#define __TBB_CompareAndSwap8release __TBB_CompareAndSwap8__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndAdd1__TBB_full_fence
-#define __TBB_FetchAndAdd1__TBB_full_fence __TBB_FetchAndAdd1
-#endif
-#ifndef __TBB_FetchAndAdd1acquire
-#define __TBB_FetchAndAdd1acquire __TBB_FetchAndAdd1__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndAdd1release
-#define __TBB_FetchAndAdd1release __TBB_FetchAndAdd1__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndAdd2__TBB_full_fence
-#define __TBB_FetchAndAdd2__TBB_full_fence __TBB_FetchAndAdd2
-#endif
-#ifndef __TBB_FetchAndAdd2acquire
-#define __TBB_FetchAndAdd2acquire __TBB_FetchAndAdd2__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndAdd2release
-#define __TBB_FetchAndAdd2release __TBB_FetchAndAdd2__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndAdd4__TBB_full_fence
-#define __TBB_FetchAndAdd4__TBB_full_fence __TBB_FetchAndAdd4
-#endif
-#ifndef __TBB_FetchAndAdd4acquire
-#define __TBB_FetchAndAdd4acquire __TBB_FetchAndAdd4__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndAdd4release
-#define __TBB_FetchAndAdd4release __TBB_FetchAndAdd4__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndAdd8__TBB_full_fence
-#define __TBB_FetchAndAdd8__TBB_full_fence __TBB_FetchAndAdd8
-#endif
-#ifndef __TBB_FetchAndAdd8acquire
-#define __TBB_FetchAndAdd8acquire __TBB_FetchAndAdd8__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndAdd8release
-#define __TBB_FetchAndAdd8release __TBB_FetchAndAdd8__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndStore1__TBB_full_fence
-#define __TBB_FetchAndStore1__TBB_full_fence __TBB_FetchAndStore1
-#endif
-#ifndef __TBB_FetchAndStore1acquire
-#define __TBB_FetchAndStore1acquire __TBB_FetchAndStore1__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndStore1release
-#define __TBB_FetchAndStore1release __TBB_FetchAndStore1__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndStore2__TBB_full_fence
-#define __TBB_FetchAndStore2__TBB_full_fence __TBB_FetchAndStore2
-#endif
-#ifndef __TBB_FetchAndStore2acquire
-#define __TBB_FetchAndStore2acquire __TBB_FetchAndStore2__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndStore2release
-#define __TBB_FetchAndStore2release __TBB_FetchAndStore2__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndStore4__TBB_full_fence
-#define __TBB_FetchAndStore4__TBB_full_fence __TBB_FetchAndStore4
-#endif
-#ifndef __TBB_FetchAndStore4acquire
-#define __TBB_FetchAndStore4acquire __TBB_FetchAndStore4__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndStore4release
-#define __TBB_FetchAndStore4release __TBB_FetchAndStore4__TBB_full_fence
-#endif
-
-#ifndef __TBB_FetchAndStore8__TBB_full_fence
-#define __TBB_FetchAndStore8__TBB_full_fence __TBB_FetchAndStore8
-#endif
-#ifndef __TBB_FetchAndStore8acquire
-#define __TBB_FetchAndStore8acquire __TBB_FetchAndStore8__TBB_full_fence
-#endif
-#ifndef __TBB_FetchAndStore8release
-#define __TBB_FetchAndStore8release __TBB_FetchAndStore8__TBB_full_fence
-#endif
-
-#endif // __TBB_DECL_FENCED_ATOMICS
-
-// Special atomic functions
-#ifndef __TBB_FetchAndAddWrelease
-#define __TBB_FetchAndAddWrelease __TBB_FetchAndAddW
-#endif
-
-#ifndef __TBB_FetchAndIncrementWacquire
-#define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1)
-#endif
+// Preserving access to legacy APIs
+using tbb::internal::__TBB_load_with_acquire;
+using tbb::internal::__TBB_store_with_release;
 
-#ifndef __TBB_FetchAndDecrementWrelease
-#define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,(-1))
-#endif
-
-template <typename T, size_t S>
-struct __TBB_machine_load_store {
-    static inline T load_with_acquire(const volatile T& location) {
-        T to_return = location;
-        __TBB_release_consistency_helper();
-        return to_return;
-    }
-
-    static inline void store_with_release(volatile T &location, T value) {
-        __TBB_release_consistency_helper();
-        location = value;
-    }
-};
-
-#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
-#if _MSC_VER
-using tbb::internal::int64_t;
-#endif
-// On 32-bit platforms, there should be definition of __TBB_Store8 and __TBB_Load8
-#ifndef __TBB_Store8
-inline void __TBB_Store8 (volatile void *ptr, int64_t value) {
-    for(;;) {
-        int64_t result = *(int64_t *)ptr;
-        if( __TBB_CompareAndSwap8(ptr,value,result)==result ) break;
-    }
-}
-#endif
-
-#ifndef __TBB_Load8
-inline int64_t __TBB_Load8 (const volatile void *ptr) {
-    const int64_t anyvalue = 3264; // Could be anything, just the same for comparand and new value
-    return __TBB_CompareAndSwap8(const_cast<volatile void *>(ptr),anyvalue,anyvalue);
-}
-#endif
-
-template <typename T>
-struct __TBB_machine_load_store<T,8> {
-    static inline T load_with_acquire(const volatile T& location) {
-        T to_return = (T)__TBB_Load8((const volatile void*)&location);
-        __TBB_release_consistency_helper();
-        return to_return;
-    }
-
-    static inline void store_with_release(volatile T& location, T value) {
-        __TBB_release_consistency_helper();
-        __TBB_Store8((volatile void *)&location,(int64_t)value);
-    }
-};
-#endif /* __TBB_WORDSIZE==4 */
-
-#ifndef __TBB_load_with_acquire
-template<typename T>
-inline T __TBB_load_with_acquire(const volatile T &location) {
-    return __TBB_machine_load_store<T,sizeof(T)>::load_with_acquire(location);
-}
-#endif
-
-#ifndef __TBB_store_with_release
-template<typename T, typename V>
-inline void __TBB_store_with_release(volatile T& location, V value) {
-    __TBB_machine_load_store<T,sizeof(T)>::store_with_release(location,T(value));
-}
-//! Overload that exists solely to avoid /Wp64 warnings.
-inline void __TBB_store_with_release(volatile size_t& location, size_t value) {
-    __TBB_machine_load_store<size_t,sizeof(size_t)>::store_with_release(location,value);
-}
-#endif
+// Mapping historically used names to the ones expected by atomic_load_store_traits
+#define __TBB_load_acquire  __TBB_load_with_acquire
+#define __TBB_store_release __TBB_store_with_release
 
 #ifndef __TBB_Log2
 inline intptr_t __TBB_Log2( uintptr_t x ) {
@@ -669,18 +778,19 @@ inline void __TBB_AtomicAND( volatile void *operand, uintptr_t addend ) {
 }
 #endif
 
-#ifndef __TBB_Byte
-typedef unsigned char __TBB_Byte;
+#ifndef __TBB_Flag
+typedef unsigned char __TBB_Flag;
 #endif
+typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag;
 
 #ifndef __TBB_TryLockByte
-inline bool __TBB_TryLockByte( __TBB_Byte &flag ) {
-    return __TBB_CompareAndSwap1(&flag,1,0)==0;
+inline bool __TBB_TryLockByte( __TBB_atomic_flag &flag ) {
+    return __TBB_machine_cmpswp1(&flag,1,0)==0;
 }
 #endif
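
__TBB_TryLockByte is the primitive behind the library's one-byte spin locks; __TBB_LockByte below simply retries it with backoff. A hypothetical usage sketch (the flag name and the release-store unlock are illustrative assumptions, not library API):

    // Sketch: a minimal test-and-set lock built on the byte primitives above.
    static __TBB_atomic_flag guard = 0;          // 0 == unlocked, 1 == locked

    void enter_critical_section() {
        while ( !__TBB_TryLockByte( guard ) )
            continue;                            // naive spin; real code backs off
    }

    void leave_critical_section() {
        __TBB_store_with_release( guard, (__TBB_Flag)0 );   // publish the unlock
    }
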
 
 #ifndef __TBB_LockByte
-inline uintptr_t __TBB_LockByte( __TBB_Byte& flag ) {
+inline __TBB_Flag __TBB_LockByte( __TBB_atomic_flag& flag ) {
     if ( !__TBB_TryLockByte(flag) ) {
         tbb::internal::atomic_backoff b;
         do {
@@ -700,8 +810,7 @@ inline unsigned char __TBB_ReverseByte(unsigned char src) {
 #endif
 
 template<typename T>
-T __TBB_ReverseBits(T src)
-{
+T __TBB_ReverseBits(T src) {
     T dst;
     unsigned char *original = (unsigned char *) &src;
     unsigned char *reversed = (unsigned char *) &dst;
index 2e834ac43a35047d3a28ae489ec8f400a016953c..31cb92b1af5d5bb5e4550b905195e0837cd72680 100644 (file)
@@ -34,7 +34,7 @@
 #define TBB_VERSION_MINOR 0
 
 // Engineering-focused interface version
-#define TBB_INTERFACE_VERSION 5006
+#define TBB_INTERFACE_VERSION 5008
 #define TBB_INTERFACE_VERSION_MAJOR TBB_INTERFACE_VERSION/1000
 
 // The oldest major interface version still supported
 // tbb_config.h should be included the first since it contains macro definitions used in other headers
 #include "tbb_config.h"
 
-#if _MSC_VER
-// define the parts of stdint.h that are needed, but put them inside tbb::internal
-namespace tbb {
-namespace internal {
-    typedef __int8 int8_t;
-    typedef __int16 int16_t;
-    typedef __int32 int32_t;
-    typedef __int64 int64_t;
-    typedef unsigned __int8 uint8_t;
-    typedef unsigned __int16 uint16_t;
-    typedef unsigned __int32 uint32_t;
-    typedef unsigned __int64 uint64_t;
-} // namespace internal
-} // namespace tbb
-#else
-#include <stdint.h>
-#endif /* _MSC_VER */
-
 #if _MSC_VER >=1400
-#define __TBB_EXPORTED_FUNC   __cdecl
-#define __TBB_EXPORTED_METHOD __thiscall
+    #define __TBB_EXPORTED_FUNC   __cdecl
+    #define __TBB_EXPORTED_METHOD __thiscall
 #else
-#define __TBB_EXPORTED_FUNC
-#define __TBB_EXPORTED_METHOD
+    #define __TBB_EXPORTED_FUNC
+    #define __TBB_EXPORTED_METHOD
 #endif
 
 #include <cstddef>      /* Need size_t and ptrdiff_t */
 
 #if _MSC_VER
-#define __TBB_tbb_windef_H
-#include "_tbb_windef.h"
-#undef __TBB_tbb_windef_H
+    #define __TBB_tbb_windef_H
+    #include "internal/_tbb_windef.h"
+    #undef __TBB_tbb_windef_H
+#else
+    #include <stdint.h>
 #endif
 
 //! The namespace tbb contains all components of the library.
 namespace tbb {
 
-using std::size_t; using std::ptrdiff_t;
+#if _MSC_VER
+    namespace internal {
+        typedef __int8 int8_t;
+        typedef __int16 int16_t;
+        typedef __int32 int32_t;
+        typedef __int64 int64_t;
+        typedef unsigned __int8 uint8_t;
+        typedef unsigned __int16 uint16_t;
+        typedef unsigned __int32 uint32_t;
+        typedef unsigned __int64 uint64_t;
+    } // namespace internal
+#else /* Posix */
+    namespace internal {
+        using ::int8_t;
+        using ::int16_t;
+        using ::int32_t;
+        using ::int64_t;
+        using ::uint8_t;
+        using ::uint16_t;
+        using ::uint32_t;
+        using ::uint64_t;
+    } // namespace internal
+#endif /* Posix */
+
+    using std::size_t;
+    using std::ptrdiff_t;
 
     //! Type for an assertion handler
     typedef void(*assertion_handler_type)( const char* filename, int line, const char* expression, const char * comment );
 
 #if TBB_USE_ASSERT
 
-//! Assert that x is true.
-/** If x is false, print assertion failure message.  
-    If the comment argument is not NULL, it is printed as part of the failure message.  
-    The comment argument has no other effect. */
-#define __TBB_ASSERT(predicate,message) ((predicate)?((void)0):tbb::assertion_failure(__FILE__,__LINE__,#predicate,message))
-#define __TBB_ASSERT_EX __TBB_ASSERT
+    //! Assert that x is true.
+    /** If x is false, print an assertion failure message.
+        If the comment argument is not NULL, it is printed as part of the failure message.  
+        The comment argument has no other effect. */
+    #define __TBB_ASSERT(predicate,message) ((predicate)?((void)0):tbb::assertion_failure(__FILE__,__LINE__,#predicate,message))
+    #define __TBB_ASSERT_EX __TBB_ASSERT
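
A brief hypothetical example of the intended use: with TBB_USE_ASSERT on, a false predicate calls the assertion handler, while __TBB_ASSERT_EX keeps a variable that exists only for checking formally referenced when assertions compile away:

    // Sketch: 'expected' is used only for checking, so __TBB_ASSERT_EX avoids
    // an "unused variable" warning in builds where assertions are disabled.
    void reset_counter( long& counter ) {
        const long expected = counter;
        counter = 0;
        __TBB_ASSERT( counter == 0, "counter must be cleared" );
        __TBB_ASSERT_EX( expected >= 0, NULL );
    }
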
 
     //! Set assertion handler and return previous value of it.
     assertion_handler_type __TBB_EXPORTED_FUNC set_assertion_handler( assertion_handler_type new_handler );
@@ -186,14 +195,14 @@ using std::size_t; using std::ptrdiff_t;
         Otherwise call the assertion handler. */
     void __TBB_EXPORTED_FUNC assertion_failure( const char* filename, int line, const char* expression, const char* comment );
 
-#else
+#else /* !TBB_USE_ASSERT */
 
-//! No-op version of __TBB_ASSERT.
-#define __TBB_ASSERT(predicate,comment) ((void)0)
-//! "Extended" version is useful to suppress warnings if a variable is only used with an assert
-#define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate)))
+    //! No-op version of __TBB_ASSERT.
+    #define __TBB_ASSERT(predicate,comment) ((void)0)
+    //! "Extended" version is useful to suppress warnings if a variable is only used with an assert
+    #define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate)))
 
-#endif /* TBB_USE_ASSERT */
+#endif /* !TBB_USE_ASSERT */
 
 //! The function returns the interface version of the TBB shared library being used.
 /**
@@ -222,6 +231,27 @@ namespace internal {
     @ingroup memory_allocation */
 const size_t NFS_MaxLineSize = 128;
 
+/** Label for data that may be accessed from different threads, and that may eventually become wrapped
+    in a formal atomic type.
+
+    No problems have yet been observed from the definition currently being empty, even though at least
+    "volatile" would seem to be in order to keep data from temporarily hiding in a register (although
+    "volatile" as a "poor man's atomic" lacks several other features of a proper atomic, some of which
+    are now provided instead through specialized functions).
+
+    Usage is intentionally compatible with a definition as the qualifier "volatile", both as a way to
+    have the compiler help enforce use of the label and to quickly rule out one potential issue.
+
+    Note, however, that with some architecture/compiler combinations, e.g. on Itanium, "volatile"
+    also has non-portable memory semantics that are needlessly expensive for "relaxed" operations.
+
+    The label must only be applied to data that will not change bit patterns when cast to/from
+    an integral type of the same length; tbb::atomic must be used instead for, e.g., floating-point types.
+
+    TODO: apply wherever relevant **/
+#define __TBB_atomic // intentionally empty, see above
+
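
For illustration, the way the label is meant to be applied can be seen in tbb_machine.h above, where the lock flag type is declared as typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag. A hypothetical application to a field of one's own:

    // Sketch: mark a field that several threads read and modify concurrently.
    struct progress_counter {
        __TBB_atomic long completed;   // currently expands to nothing; documents shared access
    };
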
 template<class T, int S>
 struct padded_base : T {
     char pad[NFS_MaxLineSize - sizeof(T) % NFS_MaxLineSize];
index 9d6c9e3b09850fb3786bb51abed4f11575882604..574ac770d5deb7fa77671ae2657eb3affdd1bbd0 100644 (file)
Binary files a/tbb/lib/ia32/vc10/irml/irml.lib and b/tbb/lib/ia32/vc10/irml/irml.lib differ
index b7c5fbe2f47ba00e015275ac2b9b4cb63e85171c..dab9237d0300497868cdf140c54eaaae9017e588 100644 (file)
Binary files a/tbb/lib/ia32/vc10/irml/irml_debug.lib and b/tbb/lib/ia32/vc10/irml/irml_debug.lib differ
index 781e150ce61392584bea39b9151bfdc19bf986f8..f1d10b0fce60265088f7aaa20506c45c60a165ff 100644 (file)
Binary files a/tbb/lib/ia32/vc10/irml_c/irml.lib and b/tbb/lib/ia32/vc10/irml_c/irml.lib differ
index 50b044552a948f50c8737b32e3e84af68922fc52..53c562390aa5314f1baa22d3a831bf25a5c07a4d 100644 (file)
Binary files a/tbb/lib/ia32/vc10/irml_c/irml_debug.lib and b/tbb/lib/ia32/vc10/irml_c/irml_debug.lib differ
index dff02dd8ded51285ba08792c2ed2fd52054d36ec..9e98addb21b7326f533ccd00c9e1c18ee7fb19c8 100644 (file)
@@ -90,6 +90,8 @@ EXPORTS
 \r
 \r
 \r
+    \r
+\r
 \r
 \r
 \r
@@ -115,20 +117,14 @@ EXPORTS
 \r
 \r
 \r
-    \r
 \r
 \r
 \r
-    \r
-    \r
 \r
 \r
-        \r
-    \r
 \r
 \r
 \r
-    \r
 \r
 \r
 \r
@@ -141,11 +137,9 @@ EXPORTS
 \r
 \r
 \r
-    \r
 \r
 \r
 \r
-    \r
 \r
 \r
 \r
@@ -154,11 +148,19 @@ EXPORTS
 \r
 \r
     \r
+    \r
 \r
 \r
+        \r
+    \r
 \r
 \r
 \r
+    \r
+        \r
+    \r
+\r
+\r
 \r
 \r
 \r
@@ -170,9 +172,6 @@ EXPORTS
 \r
 \r
 \r
-    \r
-        \r
-    \r
 \r
 \r
 \r
@@ -186,8 +185,11 @@ EXPORTS
 \r
 \r
 \r
+\r
     \r
-        \r
+\r
+\r
+\r
     \r
 \r
 \r
@@ -196,15 +198,20 @@ EXPORTS
 \r
 \r
 \r
+    \r
 \r
 \r
 \r
+    \r
+\r
 \r
 \r
 \r
 \r
 \r
 \r
+    \r
+\r
 \r
 \r
 \r
@@ -283,7 +290,9 @@ EXPORTS
 \r
 \r
 \r
+    \r
 \r
+    \r
 \r
 \r
 \r
@@ -354,8 +363,8 @@ __TBB_machine_trylockbyte
 ?register_pending_exception@task_group_context@tbb@@QAEXXZ\r
 ??1task_group_context@tbb@@QAE@XZ\r
 \r
-\r
-\r
+?set_priority@task_group_context@tbb@@QAEXW4priority_t@2@@Z\r
+?priority@task_group_context@tbb@@QBE?AW4priority_t@2@XZ\r
 \r
 ?name@captured_exception@tbb@@UBEPBDXZ\r
 ?what@captured_exception@tbb@@UBEPBDXZ\r
index fa25f58aad9da64dde4af0e8fdbbc87c193cff48..0c440d2dc03fac8b0cb3d96a4b22a9ac0c7823e4 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbb.lib and b/tbb/lib/ia32/vc10/tbb.lib differ
index 94194947bb1a9534f48dfc187eb949e55befec8f..3d6b801d23d084f4943cfa735f66612672d1418f 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbb_debug.lib and b/tbb/lib/ia32/vc10/tbb_debug.lib differ
index 5de0dd4951a9ce123289b2b14626507c50adb3fe..c425cff3b93afcec6fe4b546abcbf9c7d7c8a599 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbb_preview.lib and b/tbb/lib/ia32/vc10/tbb_preview.lib differ
index 20766cd69b49a0bb784862ce7e0adba6568f0c76..7decbbbba3453157b7a7c4713ac095f114f8eea5 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbb_preview_debug.lib and b/tbb/lib/ia32/vc10/tbb_preview_debug.lib differ
index 8fd46ce7e57dafb9af393735c6e9e7f31690dd64..3dbc2247adac32480202bd6af129efbc172840cd 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbbmalloc.lib and b/tbb/lib/ia32/vc10/tbbmalloc.lib differ
index 237f9427e6d9af319aac4651482d11ac8f747a50..dc83483f3ef468adfc7d21d6ac2cb27f5e1c0b33 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbbmalloc_debug.lib and b/tbb/lib/ia32/vc10/tbbmalloc_debug.lib differ
index d282777154a91a2464592044a38af52f9e6aee1f..1b3af1b57353ba6041ea7ced4dd42d45ee591c94 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbbmalloc_proxy.lib and b/tbb/lib/ia32/vc10/tbbmalloc_proxy.lib differ
index 93f484fb1c15eae8696321bea7a4a1d6e1e798dd..2f290075399e26d021b8af4d1157dd8f211f2e53 100644 (file)
Binary files a/tbb/lib/ia32/vc10/tbbmalloc_proxy_debug.lib and b/tbb/lib/ia32/vc10/tbbmalloc_proxy_debug.lib differ
diff --git a/tbb/lib/ia32/vc10/tbbproxy.lib b/tbb/lib/ia32/vc10/tbbproxy.lib
new file mode 100644 (file)
index 0000000..dd755a6
Binary files /dev/null and b/tbb/lib/ia32/vc10/tbbproxy.lib differ
diff --git a/tbb/lib/ia32/vc10/tbbproxy_debug.lib b/tbb/lib/ia32/vc10/tbbproxy_debug.lib
new file mode 100644 (file)
index 0000000..5e3ea75
Binary files /dev/null and b/tbb/lib/ia32/vc10/tbbproxy_debug.lib differ