git.sesse.net Git - casparcg/blob - tbb/include/tbb/parallel_reduce.h

   1 /*
   2     Copyright 2005-2011 Intel Corporation.  All Rights Reserved.
   3
   4     This file is part of Threading Building Blocks.
   5
   6     Threading Building Blocks is free software; you can redistribute it
   7     and/or modify it under the terms of the GNU General Public License
   8     version 2 as published by the Free Software Foundation.
   9
  10     Threading Building Blocks is distributed in the hope that it will be
  11     useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  12     of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU General Public License for more details.
  14
  15     You should have received a copy of the GNU General Public License
  16     along with Threading Building Blocks; if not, write to the Free Software
  17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19     As a special exception, you may use this file as part of a free software
  20     library without restriction.  Specifically, if other files instantiate
  21     templates or use macros or inline functions from this file, or you compile
  22     this file and link it with other files to produce an executable, this
  23     file does not by itself cause the resulting executable to be covered by
  24     the GNU General Public License.  This exception does not however
  25     invalidate any other reasons why the executable file might be covered by
  26     the GNU General Public License.
  27 */
  28
  29 #ifndef __TBB_parallel_reduce_H
  30 #define __TBB_parallel_reduce_H
  31
  32 #include "task.h"
  33 #include "aligned_space.h"
  34 #include "partitioner.h"
  35 #include "tbb_profiling.h"
  36 #include <new>
  37
  38 namespace tbb {
  39
  40 //! @cond INTERNAL
  41 namespace internal {
  42     //! 0 if root, 1 if a left child, 2 if a right child.
  43     /** Represented as a char, not enum, for compactness. */
  44     typedef char reduction_context;
  45
  46     //! Task type use to combine the partial results of parallel_reduce.
  47     /** @ingroup algorithms */
  48     template<typename Body>
  49     class finish_reduce: public task {
  50         //! Pointer to body, or NULL if the left child has not yet finished.
  51         Body* my_body;
  52         bool has_right_zombie;
  53         const reduction_context my_context;
  54         aligned_space<Body,1> zombie_space;
  55         finish_reduce( reduction_context context_ ) :
  56             my_body(NULL),
  57             has_right_zombie(false),
  58             my_context(context_)
  59         {
  60         }
  61         task* execute() {
  62             if( has_right_zombie ) {
  63                 // Right child was stolen.
  64                 Body* s = zombie_space.begin();
  65                 my_body->join( *s );
  66                 s->~Body();
  67             }
  68             if( my_context==1 )  // left child
  69                 itt_store_word_with_release( static_cast<finish_reduce*>(parent())->my_body, my_body );
  70             return NULL;
  71         }
  72         template<typename Range,typename Body_, typename Partitioner>
  73         friend class start_reduce;
  74     };
  75
  76     //! Task type used to split the work of parallel_reduce.
  77     /** @ingroup algorithms */
  78     template<typename Range, typename Body, typename Partitioner>
  79     class start_reduce: public task {
  80         typedef finish_reduce<Body> finish_type;
  81         Body* my_body;
  82         Range my_range;
  83         typename Partitioner::partition_type my_partition;
  84         reduction_context my_context;
  85         /*override*/ task* execute();
  86         template<typename Body_>
  87         friend class finish_reduce;
  88
  89         //! Constructor used for root task
  90         start_reduce( const Range& range, Body* body, Partitioner& partitioner ) :
  91             my_body(body),
  92             my_range(range),
  93             my_partition(partitioner),
  94             my_context(0)
  95         {
  96         }
  97         //! Splitting constructor used to generate children.
  98         /** parent_ becomes left child.  Newly constructed object is right child. */
  99         start_reduce( start_reduce& parent_, split ) :
 100             my_body(parent_.my_body),
 101             my_range(parent_.my_range,split()),
 102             my_partition(parent_.my_partition,split()),
 103             my_context(2)
 104         {
 105             my_partition.set_affinity(*this);
 106             parent_.my_context = 1;
 107         }
 108         //! Update affinity info, if any
 109         /*override*/ void note_affinity( affinity_id id ) {
 110             my_partition.note_affinity( id );
 111         }
 112
 113 public:
 114         static void run( const Range& range, Body& body, Partitioner& partitioner ) {
 115             if( !range.empty() ) {
 116 #if !__TBB_TASK_GROUP_CONTEXT || TBB_JOIN_OUTER_TASK_GROUP
 117                 task::spawn_root_and_wait( *new(task::allocate_root()) start_reduce(range,&body,partitioner) );
 118 #else
 119                 // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
 120                 // and allows users to handle exceptions safely by wrapping parallel_for in the try-block.
 121                 task_group_context context;
 122                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_reduce(range,&body,partitioner) );
 123 #endif /* __TBB_TASK_GROUP_CONTEXT && !TBB_JOIN_OUTER_TASK_GROUP */
 124             }
 125         }
 126 #if __TBB_TASK_GROUP_CONTEXT
 127         static void run( const Range& range, Body& body, Partitioner& partitioner, task_group_context& context ) {
 128             if( !range.empty() )
 129                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_reduce(range,&body,partitioner) );
 130         }
 131 #endif /* __TBB_TASK_GROUP_CONTEXT */
 132     };
 133
 134     template<typename Range, typename Body, typename Partitioner>
 135     task* start_reduce<Range,Body,Partitioner>::execute() {
 136         if( my_context==2 ) { // right child
 137             finish_type* p = static_cast<finish_type*>(parent());
 138             if( !itt_load_word_with_acquire(p->my_body) ) {
 139                 my_body = new( p->zombie_space.begin() ) Body(*my_body,split());
 140                 p->has_right_zombie = true;
 141             }
 142         }
 143         if( !my_range.is_divisible() || my_partition.should_execute_range(*this) ) {
 144             (*my_body)( my_range );
 145             if( my_context==1 )
 146                 itt_store_word_with_release(static_cast<finish_type*>(parent())->my_body, my_body );
 147             return my_partition.continue_after_execute_range();
 148         } else {
 149             finish_type& c = *new( allocate_continuation()) finish_type(my_context);
 150             recycle_as_child_of(c);
 151             c.set_ref_count(2);
 152             bool delay = my_partition.decide_whether_to_delay();
 153             start_reduce& b = *new( c.allocate_child() ) start_reduce(*this,split());
 154             my_partition.spawn_or_delay(delay,b);
 155             return this;
 156         }
 157     }
 158
 159 #if TBB_PREVIEW_DETERMINISTIC_REDUCE
 160     //! Task type use to combine the partial results of parallel_deterministic_reduce.
 161     /** @ingroup algorithms */
 162     template<typename Body>
 163     class finish_deterministic_reduce: public task {
 164         Body &my_left_body;
 165         Body my_right_body;
 166
 167         finish_deterministic_reduce( Body &body ) :
 168             my_left_body( body ),
 169             my_right_body( body, split() )
 170         {
 171         }
 172         task* execute() {
 173             my_left_body.join( my_right_body );
 174             return NULL;
 175         }
 176         template<typename Range,typename Body_>
 177         friend class start_deterministic_reduce;
 178     };
 179
 180     //! Task type used to split the work of parallel_deterministic_reduce.
 181     /** @ingroup algorithms */
 182     template<typename Range, typename Body>
 183     class start_deterministic_reduce: public task {
 184         typedef finish_deterministic_reduce<Body> finish_type;
 185         Body &my_body;
 186         Range my_range;
 187         /*override*/ task* execute();
 188
 189         //! Constructor used for root task
 190         start_deterministic_reduce( const Range& range, Body& body ) :
 191             my_body( body ),
 192             my_range( range )
 193         {
 194         }
 195         //! Splitting constructor used to generate children.
 196         /** parent_ becomes left child.  Newly constructed object is right child. */
 197         start_deterministic_reduce( start_deterministic_reduce& parent_, finish_type& c ) :
 198             my_body( c.my_right_body ),
 199             my_range( parent_.my_range, split() )
 200         {
 201         }
 202
 203 public:
 204         static void run( const Range& range, Body& body ) {
 205             if( !range.empty() ) {
 206 #if !__TBB_TASK_GROUP_CONTEXT || TBB_JOIN_OUTER_TASK_GROUP
 207                 task::spawn_root_and_wait( *new(task::allocate_root()) start_deterministic_reduce(range,&body) );
 208 #else
 209                 // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
 210                 // and allows users to handle exceptions safely by wrapping parallel_for in the try-block.
 211                 task_group_context context;
 212                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_deterministic_reduce(range,body) );
 213 #endif /* __TBB_TASK_GROUP_CONTEXT && !TBB_JOIN_OUTER_TASK_GROUP */
 214             }
 215         }
 216 #if __TBB_TASK_GROUP_CONTEXT
 217         static void run( const Range& range, Body& body, task_group_context& context ) {
 218             if( !range.empty() )
 219                 task::spawn_root_and_wait( *new(task::allocate_root(context)) start_deterministic_reduce(range,body) );
 220         }
 221 #endif /* __TBB_TASK_GROUP_CONTEXT */
 222     };
 223
 224     template<typename Range, typename Body>
 225     task* start_deterministic_reduce<Range,Body>::execute() {
 226         if( !my_range.is_divisible() ) {
 227             my_body( my_range );
 228             return NULL;
 229         } else {
 230             finish_type& c = *new( allocate_continuation() ) finish_type( my_body );
 231             recycle_as_child_of(c);
 232             c.set_ref_count(2);
 233             start_deterministic_reduce& b = *new( c.allocate_child() ) start_deterministic_reduce( *this, c );
 234             task::spawn(b);
 235             return this;
 236         }
 237     }
 238 #endif /* TBB_PREVIEW_DETERMINISTIC_REDUCE */
 239
 240     //! Auxiliary class for parallel_reduce; for internal use only.
 241     /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
 242         using given \ref parallel_reduce_lambda_req "anonymous function objects".
 243      **/
 244     /** @ingroup algorithms */
 245     template<typename Range, typename Value, typename RealBody, typename Reduction>
 246     class lambda_reduce_body {
 247
 248 //FIXME: decide if my_real_body, my_reduction, and identity_element should be copied or referenced
 249 //       (might require some performance measurements)
 250
 251         const Value&     identity_element;
 252         const RealBody&  my_real_body;
 253         const Reduction& my_reduction;
 254         Value            my_value;
 255         lambda_reduce_body& operator= ( const lambda_reduce_body& other );
 256     public:
 257         lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
 258             : identity_element(identity)
 259             , my_real_body(body)
 260             , my_reduction(reduction)
 261             , my_value(identity)
 262         { }
 263         lambda_reduce_body( const lambda_reduce_body& other )
 264             : identity_element(other.identity_element)
 265             , my_real_body(other.my_real_body)
 266             , my_reduction(other.my_reduction)
 267             , my_value(other.my_value)
 268         { }
 269         lambda_reduce_body( lambda_reduce_body& other, tbb::split )
 270             : identity_element(other.identity_element)
 271             , my_real_body(other.my_real_body)
 272             , my_reduction(other.my_reduction)
 273             , my_value(other.identity_element)
 274         { }
 275         void operator()(Range& range) {
 276             my_value = my_real_body(range, const_cast<const Value&>(my_value));
 277         }
 278         void join( lambda_reduce_body& rhs ) {
 279             my_value = my_reduction(const_cast<const Value&>(my_value), const_cast<const Value&>(rhs.my_value));
 280         }
 281         Value result() const {
 282             return my_value;
 283         }
 284     };
 285
 286 } // namespace internal
 287 //! @endcond
 288
 289 // Requirements on Range concept are documented in blocked_range.h
 290
 291 /** \page parallel_reduce_body_req Requirements on parallel_reduce body
 292     Class \c Body implementing the concept of parallel_reduce body must define:
 293     - \code Body::Body( Body&, split ); \endcode        Splitting constructor.
 294                                                         Must be able to run concurrently with operator() and method \c join
 295     - \code Body::~Body(); \endcode                     Destructor
 296     - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
 297                                                         and accumulating the result
 298     - \code void Body::join( Body& b ); \endcode        Join results.
 299                                                         The result in \c b should be merged into the result of \c this
 300 **/
 301
 302 /** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
 303     TO BE DOCUMENTED
 304 **/
 305
 306 /** \name parallel_reduce
 307     See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
 308 //@{
 309
 310 //! Parallel iteration with reduction and default partitioner.
 311 /** @ingroup algorithms **/
 312 template<typename Range, typename Body>
 313 void parallel_reduce( const Range& range, Body& body ) {
 314     internal::start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
 315 }
 316
 317 //! Parallel iteration with reduction and simple_partitioner
 318 /** @ingroup algorithms **/
 319 template<typename Range, typename Body>
 320 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
 321     internal::start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
 322 }
 323
 324 //! Parallel iteration with reduction and auto_partitioner
 325 /** @ingroup algorithms **/
 326 template<typename Range, typename Body>
 327 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
 328     internal::start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
 329 }
 330
 331 //! Parallel iteration with reduction and affinity_partitioner
 332 /** @ingroup algorithms **/
 333 template<typename Range, typename Body>
 334 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
 335     internal::start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
 336 }
 337
 338 #if __TBB_TASK_GROUP_CONTEXT
 339 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
 340 /** @ingroup algorithms **/
 341 template<typename Range, typename Body>
 342 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
 343     internal::start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
 344 }
 345
 346 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
 347 /** @ingroup algorithms **/
 348 template<typename Range, typename Body>
 349 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
 350     internal::start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
 351 }
 352
 353 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
 354 /** @ingroup algorithms **/
 355 template<typename Range, typename Body>
 356 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
 357     internal::start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
 358 }
 359 #endif /* __TBB_TASK_GROUP_CONTEXT */
 360
 361 /** parallel_reduce overloads that work with anonymous function objects
 362     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
 363
 364 //! Parallel iteration with reduction and default partitioner.
 365 /** @ingroup algorithms **/
 366 template<typename Range, typename Value, typename RealBody, typename Reduction>
 367 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
 368     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 369     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
 370                           ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
 371     return body.result();
 372 }
 373
 374 //! Parallel iteration with reduction and simple_partitioner.
 375 /** @ingroup algorithms **/
 376 template<typename Range, typename Value, typename RealBody, typename Reduction>
 377 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 378                        const simple_partitioner& partitioner ) {
 379     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 380     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
 381                           ::run(range, body, partitioner );
 382     return body.result();
 383 }
 384
 385 //! Parallel iteration with reduction and auto_partitioner
 386 /** @ingroup algorithms **/
 387 template<typename Range, typename Value, typename RealBody, typename Reduction>
 388 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 389                        const auto_partitioner& partitioner ) {
 390     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 391     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
 392                           ::run( range, body, partitioner );
 393     return body.result();
 394 }
 395
 396 //! Parallel iteration with reduction and affinity_partitioner
 397 /** @ingroup algorithms **/
 398 template<typename Range, typename Value, typename RealBody, typename Reduction>
 399 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 400                        affinity_partitioner& partitioner ) {
 401     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 402     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
 403                                         ::run( range, body, partitioner );
 404     return body.result();
 405 }
 406
 407 #if __TBB_TASK_GROUP_CONTEXT
 408 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
 409 /** @ingroup algorithms **/
 410 template<typename Range, typename Value, typename RealBody, typename Reduction>
 411 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 412                        const simple_partitioner& partitioner, task_group_context& context ) {
 413     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 414     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
 415                           ::run( range, body, partitioner, context );
 416     return body.result();
 417 }
 418
 419 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
 420 /** @ingroup algorithms **/
 421 template<typename Range, typename Value, typename RealBody, typename Reduction>
 422 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 423                        const auto_partitioner& partitioner, task_group_context& context ) {
 424     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 425     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
 426                           ::run( range, body, partitioner, context );
 427     return body.result();
 428 }
 429
 430 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
 431 /** @ingroup algorithms **/
 432 template<typename Range, typename Value, typename RealBody, typename Reduction>
 433 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 434                        affinity_partitioner& partitioner, task_group_context& context ) {
 435     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 436     internal::start_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
 437                                         ::run( range, body, partitioner, context );
 438     return body.result();
 439 }
 440 #endif /* __TBB_TASK_GROUP_CONTEXT */
 441
 442 #if TBB_PREVIEW_DETERMINISTIC_REDUCE
 443 //! Parallel iteration with deterministic reduction and default partitioner.
 444 /** @ingroup algorithms **/
 445 template<typename Range, typename Body>
 446 void parallel_deterministic_reduce( const Range& range, Body& body ) {
 447     internal::start_deterministic_reduce<Range,Body>::run( range, body );
 448 }
 449
 450 #if __TBB_TASK_GROUP_CONTEXT
 451 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
 452 /** @ingroup algorithms **/
 453 template<typename Range, typename Body>
 454 void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
 455     internal::start_deterministic_reduce<Range,Body>::run( range, body, context );
 456 }
 457 #endif /* __TBB_TASK_GROUP_CONTEXT */
 458
 459 /** parallel_reduce overloads that work with anonymous function objects
 460     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
 461
 462 //! Parallel iteration with deterministic reduction and default partitioner.
 463 /** @ingroup algorithms **/
 464 template<typename Range, typename Value, typename RealBody, typename Reduction>
 465 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
 466     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 467     internal::start_deterministic_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction> >
 468                           ::run(range, body);
 469     return body.result();
 470 }
 471
 472 #if __TBB_TASK_GROUP_CONTEXT
 473 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
 474 /** @ingroup algorithms **/
 475 template<typename Range, typename Value, typename RealBody, typename Reduction>
 476 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
 477                        task_group_context& context ) {
 478     internal::lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
 479     internal::start_deterministic_reduce<Range,internal::lambda_reduce_body<Range,Value,RealBody,Reduction> >
 480                           ::run( range, body, context );
 481     return body.result();
 482 }
 483 #endif /* __TBB_TASK_GROUP_CONTEXT */
 484 #endif /* TBB_PREVIEW_DETERMINISTIC_REDUCE */
 485 //@}
 486
 487 } // namespace tbb
 488
 489 #endif /* __TBB_parallel_reduce_H */
 490