git.sesse.net Git - casparcg/blob - tbb30_20100406oss/include/tbb/parallel_reduce.h

   1 /*
   2     Copyright 2005-2010 Intel Corporation.  All Rights Reserved.
   3
   4     This file is part of Threading Building Blocks.
   5
   6     Threading Building Blocks is free software; you can redistribute it
   7     and/or modify it under the terms of the GNU General Public License
   8     version 2 as published by the Free Software Foundation.
   9
  10     Threading Building Blocks is distributed in the hope that it will be
  11     useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  12     of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU General Public License for more details.
  14
  15     You should have received a copy of the GNU General Public License
  16     along with Threading Building Blocks; if not, write to the Free Software
  17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19     As a special exception, you may use this file as part of a free software
  20     library without restriction.  Specifically, if other files instantiate
  21     templates or use macros or inline functions from this file, or you compile
  22     this file and link it with other files to produce an executable, this
  23     file does not by itself cause the resulting executable to be covered by
  24     the GNU General Public License.  This exception does not however
  25     invalidate any other reasons why the executable file might be covered by
  26     the GNU General Public License.
  27 */
  28
  29 #ifndef __TBB_parallel_reduce_H
  30 #define __TBB_parallel_reduce_H
  31
  32 #include "task.h"
  33 #include "aligned_space.h"
  34 #include "partitioner.h"
  35 #include <new>
  36
  37 namespace tbb {
  38
  39 //! @cond INTERNAL
  40 namespace internal {
  41
  42     //! ITT instrumented routine that stores src into location pointed to by dst.
  43     void __TBB_EXPORTED_FUNC itt_store_pointer_with_release_v3( void* dst, void* src );
  44
  45     //! ITT instrumented routine that loads pointer from location pointed to by src.
  46     void* __TBB_EXPORTED_FUNC itt_load_pointer_with_acquire_v3( const void* src );
  47
  48     template<typename T> inline void parallel_reduce_store_body( T*& dst, T* src ) {
  49 #if TBB_USE_THREADING_TOOLS
  50         itt_store_pointer_with_release_v3(&dst,src);
  51 #else
  52         __TBB_store_with_release(dst,src);
  53 #endif /* TBB_USE_THREADING_TOOLS */
  54     }
  55
  56     template<typename T> inline T* parallel_reduce_load_body( T*& src ) {
  57 #if TBB_USE_THREADING_TOOLS
  58         return static_cast<T*>(itt_load_pointer_with_acquire_v3(&src));
  59 #else
  60         return __TBB_load_with_acquire(src);
  61 #endif /* TBB_USE_THREADING_TOOLS */
  62     }
  63
  64     //! 0 if root, 1 if a left child, 2 if a right child.
  65     /** Represented as a char, not enum, for compactness. */
  66     typedef char reduction_context;
  67
  68     //! Task type use to combine the partial results of parallel_reduce.
  69     /** @ingroup algorithms */
  70     template<typename Body>
  71     class finish_reduce: public task {
  72         //! Pointer to body, or NULL if the left child has not yet finished.
  73         Body* my_body;
  74         bool has_right_zombie;
  75         const reduction_context my_context;
  76         aligned_space<Body,1> zombie_space;
  77         finish_reduce( char context_ ) :
  78             my_body(NULL),
  79             has_right_zombie(false),
  80             my_context(context_)
  81         {
  82         }
  83         task* execute() {
  84             if( has_right_zombie ) {
  85                 // Right child was stolen.
  86                 Body* s = zombie_space.begin();
  87                 my_body->join( *s );
  88                 s->~Body();
  89             }
  90             if( my_context==1 )
  91                 parallel_reduce_store_body( static_cast<finish_reduce*>(parent())->my_body, my_body );
  92             return NULL;
  93         }
  94         template<typename Range,typename Body_, typename Partitioner>
  95         friend class start_reduce;
  96     };
  97
  98     //! Task type used to split the work of parallel_reduce.
  99     /** @ingroup algorithms */
 100     template<typename Range, typename Body, typename Partitioner>
 101     class start_reduce: public task {
 102         typedef finish_reduce<Body> finish_type;
 103         Body* my_body;
 104         Range my_range;
 105         typename Partitioner::partition_type my_partition;
 106         reduction_context my_context;
 107         /*override*/ task* execute();
 108         template<typename Body_>
 109         friend class finish_reduce;
 110
 111         //! Constructor used for root task
 112         start_reduce( const Range& range, Body* body, Partitioner& partitioner ) :
 113             my_body(body),
 114             my_range(range),
 115             my_partition(partitioner),
 116             my_context(0)
 117         {
 118         }
 119         //! Splitting constructor used to generate children.
 120         /** this becomes left child.  Newly constructed object is right child. */
 121         start_reduce( start_reduce& parent_, split ) :
 122             my_body(parent_.my_body),
 123             my_range(parent_.my_range,split()),
 124             my_partition(parent_.my_partition,split()),
 125             my_context(2)
 126         {
 127             my_partition.set_affinity(*this);
 128             parent_.my_context = 1;
 129         }
 130         //! Update affinity info, if any
 131         /*override*/ void note_affinity( affinity_id id ) {
 132             my_partition.note_affinity( id );
 133         }
 134
 135 public:
 136         static void run( const Range& range, Body& body, Partitioner& partitioner ) {
 137             if( !range.empty() ) {
 138 #if !__TBB_TASK_GROUP_CONTEXT || TBB_JOIN_OUTER_TASK_GROUP
 139                 task::spawn_root_and_wait( *new(task::allocate_root()) start_reduce(range,&body,partitioner) );
 140 #else
 141                 // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
 142                 // and allows users to handle exceptions safely by wrapping parallel_for in the try-block.
 143                 task_group_context context;
 144                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_reduce(range,&body,partitioner) );
 145 #endif /* __TBB_TASK_GROUP_CONTEXT && !TBB_JOIN_OUTER_TASK_GROUP */
 146             }
 147         }
 148 #if __TBB_TASK_GROUP_CONTEXT
 149         static void run( const Range& range, Body& body, Partitioner& partitioner, task_group_context& context ) {
 150             if( !range.empty() )
 151                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_reduce(range,&body,partitioner) );
 152         }
 153 #endif /* __TBB_TASK_GROUP_CONTEXT */
 154     };
 155
 156     template<typename Range, typename Body, typename Partitioner>
 157     task* start_reduce<Range,Body,Partitioner>::execute() {
 158         if( my_context==2 ) {
 159             finish_type* p = static_cast<finish_type*>(parent() );
 160             if( !parallel_reduce_load_body(p->my_body) ) {
 161                 my_body = new( p->zombie_space.begin() ) Body(*my_body,split());
 162                 p->has_right_zombie = true;
 163             }
 164         }
 165         if( !my_range.is_divisible() || my_partition.should_execute_range(*this) ) {
 166             (*my_body)( my_range );
 167             if( my_context==1 )
 168                 parallel_reduce_store_body(static_cast<finish_type*>(parent())->my_body, my_body );
 169             return my_partition.continue_after_execute_range();
 170         } else {
 171             finish_type& c = *new( allocate_continuation()) finish_type(my_context);
 172             recycle_as_child_of(c);
 173             c.set_ref_count(2);
 174             bool delay = my_partition.decide_whether_to_delay();
 175             start_reduce& b = *new( c.allocate_child() ) start_reduce(*this,split());
 176             my_partition.spawn_or_delay(delay,b);
 177             return this;
 178         }
 179     }
 180
 181     //! Auxiliary class for parallel_reduce; for internal use only.
 182     /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
 183         using given \ref parallel_reduce_lambda_req "anonymous function objects".
 184      **/
 185     /** @ingroup algorithms */
 186     template<typename Range, typename Value, typename RealBody, typename Reduction>
 187     class lambda_reduce_body {
 188
 189 //FIXME: decide if my_real_body, my_reduction, and identity_element should be copied or referenced
 190 //       (might require some performance measurements)
 191
 192         const Value&     identity_element;
 193         const RealBody&  my_real_body;
 194         const Reduction& my_reduction;
 195         Value            my_value;
 196         lambda_reduce_body& operator= ( const lambda_reduce_body& other );
 197     public:
 198         lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
 199             : identity_element(identity)
 200             , my_real_body(body)
 201             , my_reduction(reduction)
 202             , my_value(identity)
 203         { }
 204         lambda_reduce_body( const lambda_reduce_body& other )
 205             : identity_element(other.identity_element)
 206             , my_real_body(other.my_real_body)
 207             , my_reduction(other.my_reduction)
 208             , my_value(other.my_value)
 209         { }
 210         lambda_reduce_body( lambda_reduce_body& other, tbb::split )
 211             : identity_element(other.identity_element)
 212             , my_real_body(other.my_real_body)
 213             , my_reduction(other.my_reduction)
 214             , my_value(other.identity_element)
 215         { }
 216         void operator()(Range& range) {
 217             my_value = my_real_body(range, const_cast<const Value&>(my_value));
 218         }
 219         void join( lambda_reduce_body& rhs ) {
 220             my_value = my_reduction(const_cast<const Value&>(my_value), const_cast<const Value&>(rhs.my_value));
 221         }
 222         Value result() const {
 223             return my_value;
 224         }
 225     };
 226
 227 } // namespace internal
 228 //! @endcond
 229
 230 // Requirements on Range concept are documented in blocked_range.h
 231
 232 /** \page parallel_reduce_body_req Requirements on parallel_reduce body
 233     Class \c Body implementing the concept of parallel_reduce body must define:
 234     - \code Body::Body( Body&, split ); \endcode        Splitting constructor.
 235                                                         Must be able to run concurrently with operator() and method \c join
 236     - \code Body::~Body(); \endcode                     Destructor
 237     - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
 238                                                         and accumulating the result
 239     - \code void Body::join( Body& b ); \endcode        Join results.
 240                                                         The result in \c b should be merged into the result of \c this
 241 **/
 242
 243 /** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
 244     TO BE DOCUMENTED
 245 **/
 246
 247 /** \name parallel_reduce
 248     See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
 249 //@{
 250
 251 //! Parallel iteration with reduction and default partitioner.
 252 /** @ingroup algorithms **/
 253 template<typename Range, typename Body>
 254 void parallel_reduce( const Range& range, Body& body ) {
 255     internal::start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
 256 }
 257
 258 //! Parallel iteration with reduction and simple_partitioner
 259 /** @ingroup algorithms **/
 260 template<typename Range, typename Body>
 261 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
 262     internal::start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
 263 }
 264
 265 //! Parallel iteration with reduction and auto_partitioner
 266 /** @ingroup algorithms **/
 267 template<typename Range, typename Body>
 268 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
 269     internal::start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
 270 }
 271
 272 //! Parallel iteration with reduction and affinity_partitioner
 273 /** @ingroup algorithms **/
 274 template<typename Range, typename Body>
 275 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
 276     internal::start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
 277 }
 278
 279 #if __TBB_TASK_GROUP_CONTEXT
 280 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
 281 /** @ingroup algorithms **/
 282 template<typename Range, typename Body>
 283 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
 284     internal::start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
 285 }
 286
 287 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
 288 /** @ingroup algorithms **/
 289 template<typename Range, typename Body>
 290 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
 291     internal::start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
 292 }
 293
 294 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
 295 /** @ingroup algorithms **/
 296 template<typename Range, typename Body>
 297 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
 298     internal::start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
 299 }
 300 #endif /* __TBB_TASK_GROUP_CONTEXT */
 301
 302 /** parallel_reduce overloads that work with anonymous function objects
 303     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
 304
 305 //! Parallel iteration with reduction and default partitioner.
 306 /** @ingroup algorithms **/
 307 template<typename Range, typename Value, typename RealBody, typename Reduction>
 308 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
 309     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 310     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
 311                           ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
 312     return body.result();
 313 }
 314
 315 //! Parallel iteration with reduction and simple_partitioner.
 316 /** @ingroup algorithms **/
 317 template<typename Range, typename Value, typename RealBody, typename Reduction>
 318 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 319                        const simple_partitioner& partitioner ) {
 320     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 321     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
 322                           ::run(range, body, partitioner );
 323     return body.result();
 324 }
 325
 326 //! Parallel iteration with reduction and auto_partitioner
 327 /** @ingroup algorithms **/
 328 template<typename Range, typename Value, typename RealBody, typename Reduction>
 329 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 330                        const auto_partitioner& partitioner ) {
 331     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 332     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
 333                           ::run( range, body, partitioner );
 334     return body.result();
 335 }
 336
 337 //! Parallel iteration with reduction and affinity_partitioner
 338 /** @ingroup algorithms **/
 339 template<typename Range, typename Value, typename RealBody, typename Reduction>
 340 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 341                        affinity_partitioner& partitioner ) {
 342     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 343     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
 344                                         ::run( range, body, partitioner );
 345     return body.result();
 346 }
 347
 348 #if __TBB_TASK_GROUP_CONTEXT
 349 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
 350 /** @ingroup algorithms **/
 351 template<typename Range, typename Value, typename RealBody, typename Reduction>
 352 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 353                        const simple_partitioner& partitioner, task_group_context& context ) {
 354     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 355     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
 356                           ::run( range, body, partitioner, context );
 357     return body.result();
 358 }
 359
 360 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
 361 /** @ingroup algorithms **/
 362 template<typename Range, typename Value, typename RealBody, typename Reduction>
 363 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 364                        const auto_partitioner& partitioner, task_group_context& context ) {
 365     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 366     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
 367                           ::run( range, body, partitioner, context );
 368     return body.result();
 369 }
 370
 371 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
 372 /** @ingroup algorithms **/
 373 template<typename Range, typename Value, typename RealBody, typename Reduction>
 374 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 375                        affinity_partitioner& partitioner, task_group_context& context ) {
 376     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 377     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
 378                                         ::run( range, body, partitioner, context );
 379     return body.result();
 380 }
 381 #endif /* __TBB_TASK_GROUP_CONTEXT */
 382 //@}
 383
 384 } // namespace tbb
 385
 386 #endif /* __TBB_parallel_reduce_H */
 387