Copyright 2005-2014 Intel Corporation. All Rights Reserved.

This file is part of Threading Building Blocks. Threading Building Blocks is free software;
you can redistribute it and/or modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation. Threading Building Blocks is
distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. You should have received a copy of
the GNU General Public License along with Threading Building Blocks; if not, write to the
Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

As a special exception, you may use this file as part of a free software library without
restriction. Specifically, if other files instantiate templates or use macros or inline
functions from this file, or you compile this file and link it with other files to produce
an executable, this file does not by itself cause the resulting executable to be covered
by the GNU General Public License. This exception does not however invalidate any other
reasons why the executable file might be covered by the GNU General Public License.
21 #ifndef __TBB_parallel_scan_H
22 #define __TBB_parallel_scan_H
25 #include "aligned_space.h"
27 #include "partitioner.h"
20 //! Used to indicate that the initial scan is being performed.
21 /** @ingroup algorithms */
// NOTE(review): the `struct pre_scan_tag {` declaration line appears to have been
// dropped by the extraction; this member presumably belongs to pre_scan_tag —
// the tag type dispatched to Body::operator() during the first (pre-scan) pass.
22 static bool is_final_scan() {return false;}
23 //! Used to indicate that the final scan is being performed.
24 /** @ingroup algorithms */
25 struct final_scan_tag {
// Tag type dispatched to Body::operator() to select the final-scan overload,
// which both accumulates and produces the scan output for a subrange.
26 static bool is_final_scan() {return true;}
27 //! Performs final scan for a leaf
28 /** @ingroup algorithms */
29 template<typename Range, typename Body>
30 class final_sum: public task {
// Raw aligned storage for the Range; the Range is placement-new constructed
// later in finish_construction(), not in the task's constructor.
31 aligned_space<Range> my_range;
32 //! Where to put result of last subrange, or NULL if not last subrange.
// NOTE(review): declarations of `Body my_body;` and `Body* my_stuff_last;`
// appear to be missing from this extraction — the constructor initializer and
// the uses below reference both. TODO confirm against the original header.
// Constructor: split a fresh Body state off the caller's body.
33 final_sum( Body& body_ ) :
34 my_body(body_,split())
// Poison the not-yet-assigned pointer so misuse is caught in debug builds.
35 poison_pointer(my_stuff_last);
// Destructor body: explicitly run the Range destructor on the placement-new'd
// object held in the aligned_space.
36 my_range.begin()->~Range();
// Second-phase construction: install the subrange to scan and, if this leaf is
// the last subrange, where to copy the final Body state (NULL otherwise).
37 void finish_construction( const Range& range_, Body* stuff_last_ ) {
38 new( my_range.begin() ) Range(range_);
39 my_stuff_last = stuff_last_;
// Task body: run the final scan over the stored range; if this is the last
// subrange, publish the resulting Body state to the caller via assign().
40 /*override*/ task* execute() {
41 my_body( *my_range.begin(), final_scan_tag() );
// NOTE(review): in the original this assign is guarded by `if( my_stuff_last )`;
// the guard line seems to be missing from this extraction — verify.
42 my_stuff_last->assign(my_body);
43 //! Split work to be done in the scan.
44 /** @ingroup algorithms */
45 template<typename Range, typename Body>
46 class sum_node: public task {
47 typedef final_sum<Range,Body> final_sum_type;
// Partial sum arriving from the left sibling (set by the second pass).
48 final_sum_type *my_incoming;
// Body/state used to rescan the left half when it is not yet final.
49 final_sum_type *my_body;
// Accumulated sum of the left half, joined into the right half's body.
50 final_sum_type *my_left_sum;
// True if the left half was already produced by a final scan in pass 1.
51 bool my_left_is_final;
// NOTE(review): declarations of my_left, my_right, my_stuff_last and my_range
// appear to be missing from this extraction; execute() below uses all of them.
52 sum_node( const Range range_, bool left_is_final_ ) :
53 my_left_is_final(left_is_final_),
54 // Poison fields that will be set by second pass.
55 poison_pointer(my_body);
56 poison_pointer(my_incoming);
// Helper for execute(): either recycle final-sum task f as a direct child for a
// leaf subrange, or thread the bookkeeping pointers into child sum_node n.
57 task* create_child( const Range& range_, final_sum_type& f, sum_node* n, final_sum_type* incoming_, Body* stuff_last_ ) {
// Leaf case (presumably guarded by `if( !n )` on a dropped line — confirm):
// reuse f as a child task that performs the final scan of range_.
58 f.recycle_as_child_of( *this );
59 f.finish_construction( range_, stuff_last_ );
// Interior case: pass the incoming partial sum and output slot down to n.
60 n->my_incoming = incoming_;
61 n->my_stuff_last = stuff_last_;
// Second pass: propagate left sums rightward and respawn children.
62 /*override*/ task* execute() {
// Fold the partial sum arriving from the left into this node's left sum.
63 my_left_sum->my_body.reverse_join( my_incoming->my_body );
// This node survives as the continuation of its respawned children.
64 recycle_as_continuation();
// Right child always gets the (now complete) left sum as its incoming state.
65 task* b = c.create_child(Range(my_range,split()),*my_left_sum,my_right,my_left_sum,my_stuff_last);
// Left child is only respawned if its pass-1 scan was not already final.
66 task* a = my_left_is_final ? NULL : c.create_child(my_range,*my_body,my_left,my_incoming,NULL);
67 set_ref_count( (a!=NULL)+(b!=NULL) );
68 template<typename Range_,typename Body_,typename Partitioner_>
69 friend class start_scan;
70 template<typename Range_,typename Body_>
71 friend class finish_scan;
72 //! Combine partial results
73 /** @ingroup algorithms */
74 template<typename Range, typename Body>
75 class finish_scan: public task {
76 typedef sum_node<Range,Body> sum_node_type;
77 typedef final_sum<Range,Body> final_sum_type;
// Slot where the running sum for the parent is published; NULL-ness indicates
// whether the caller requested a total.
78 final_sum_type** const my_sum;
// Where to report the surviving sum_node (or leave NULL if none is needed).
79 sum_node_type*& my_return_slot;
// Extra final_sum created for a stolen right child; owned (and later
// destroyed) by this continuation — hence "zombie".
80 final_sum_type* my_right_zombie;
81 sum_node_type& my_result;
// Continuation of pass 1 over one node: decide whether my_result survives
// into pass 2, and merge/destroy auxiliary state.
82 /*override*/ task* execute() {
83 __TBB_ASSERT( my_result.ref_count()==(my_result.my_left!=NULL)+(my_result.my_right!=NULL), NULL );
// If a left child exists, its pass-1 scan cannot be treated as final anymore.
84 if( my_result.my_left )
85 my_result.my_left_is_final = false;
// A stolen right child means its sum was computed separately; fold the left
// sum into the parent's running sum if a total was requested.
86 if( my_right_zombie && my_sum )
87 ((*my_sum)->my_body).reverse_join(my_result.my_left_sum->my_body);
88 __TBB_ASSERT( !my_return_slot, NULL );
// my_result is needed for pass 2 only if some right-side work happened;
// otherwise it is destroyed (the destroy() presumably sits in a dropped
// `else` branch — confirm against original).
89 if( my_right_zombie || my_result.my_right ) {
90 my_return_slot = &my_result;
91 destroy( my_result );
// The zombie is no longer referenced: release it now.
92 if( my_right_zombie && !my_sum && !my_result.my_right ) {
93 destroy(*my_right_zombie);
94 my_right_zombie = NULL;
// Constructor: record the output slot and result node; no zombie yet.
95 finish_scan( sum_node_type*& return_slot_, final_sum_type** sum_, sum_node_type& result_ ) :
96 my_return_slot(return_slot_),
97 my_right_zombie(NULL),
98 __TBB_ASSERT( !my_return_slot, NULL );
99 //! Initial task to split the work
100 /** @ingroup algorithms */
101 template<typename Range, typename Body, typename Partitioner=simple_partitioner>
102 class start_scan: public task {
103 typedef sum_node<Range,Body> sum_node_type;
104 typedef final_sum<Range,Body> final_sum_type;
// Body/state this task scans with (replaced if the task is stolen).
105 final_sum_type* my_body;
106 /** Non-null if caller is requesting total. */
107 final_sum_type** my_sum;
// Slot into which the resulting sum_node (if any) is published.
108 sum_node_type** my_return_slot;
109 /** Null if computing root. */
110 sum_node_type* my_parent_sum;
// NOTE(review): declarations of my_is_final and my_range appear to be missing
// from this extraction; both are used by the constructors and execute().
// True only while this invocation is (virtually) the right child.
111 bool my_is_right_child;
112 typename Partitioner::partition_type my_partition;
113 /*override*/ task* execute();
// Splitting constructor: takes the right half of parent_'s range/partition.
114 start_scan( sum_node_type*& return_slot_, start_scan& parent_, sum_node_type* parent_sum_ ) :
115 my_body(parent_.my_body),
116 my_sum(parent_.my_sum),
117 my_return_slot(&return_slot_),
118 my_parent_sum(parent_sum_),
119 my_is_final(parent_.my_is_final),
120 my_is_right_child(false),
121 my_range(parent_.my_range,split()),
122 my_partition(parent_.my_partition,split())
123 __TBB_ASSERT( !*my_return_slot, NULL );
// Root constructor for the whole scan.
124 start_scan( sum_node_type*& return_slot_, const Range& range_, final_sum_type& body_, const Partitioner& partitioner_) :
125 my_return_slot(&return_slot_),
126 my_is_right_child(false),
127 my_partition(partitioner_)
128 __TBB_ASSERT( !*my_return_slot, NULL );
// Entry point: runs pass 1 (pre-scan); if any node survives, wires it up and
// runs pass 2 (final scan), then copies the total back into body_.
129 static void run( const Range& range_, Body& body_, const Partitioner& partitioner_ ) {
// Empty range: nothing to do (scan of zero elements is a no-op).
130 if( !range_.empty() ) {
131 typedef internal::start_scan<Range,Body,Partitioner> start_pass1_type;
132 internal::sum_node<Range,Body>* root = NULL;
133 typedef internal::final_sum<Range,Body> final_sum_type;
134 final_sum_type* temp_body = new(task::allocate_root()) final_sum_type( body_ );
135 start_pass1_type& pass1 = *new(task::allocate_root()) start_pass1_type(
136 /*my_return_slot=*/root,
137 task::spawn_root_and_wait( pass1 );
// Pass 2 only runs if pass 1 left a surviving node (presumably under an
// `if( root )` dropped by this extraction — confirm).
138 root->my_body = temp_body;
139 root->my_incoming = NULL;
// The very last subrange writes its final Body state directly into body_.
140 root->my_stuff_last = &body_;
141 task::spawn_root_and_wait( *root );
// Single-leaf case: copy the accumulated state back to the caller's body.
142 body_.assign(temp_body->my_body);
// Give temp_body a constructed Range so its destructor is well-defined,
// then release the task.
143 temp_body->finish_construction( range_, NULL );
144 temp_body->destroy(*temp_body);
// Pass-1 worker: either scans a leaf range directly or splits, producing a
// sum_node tree that pass 2 later walks to finalize partial sums.
145 template<typename Range, typename Body, typename Partitioner>
146 task* start_scan<Range,Body,Partitioner>::execute() {
147 typedef internal::finish_scan<Range,Body> finish_pass1_type;
// Only non-root tasks have a finish_scan continuation as parent().
148 finish_pass1_type* p = my_parent_sum ? static_cast<finish_pass1_type*>( parent() ) : NULL;
149 // Inspecting p->result.left_sum would ordinarily be a race condition.
150 // But we inspect it only if we are not a stolen task, in which case we
151 // know that task assigning to p->result.left_sum has completed.
152 bool treat_as_stolen = my_is_right_child && (is_stolen_task() || my_body!=p->my_result.my_left_sum);
153 if( treat_as_stolen ) {
154 // Invocation is for right child that has been really stolen or needs to be virtually stolen
// Give the stolen right child its own Body split; the continuation owns and
// later destroys this "zombie".
155 p->my_right_zombie = my_body = new( allocate_root() ) final_sum_type(my_body->my_body);
156 task* next_task = NULL;
// Leaf case: range small enough (or right child running in-order) — scan it
// directly. The final/pre selection presumably sits under an `if(my_is_final)`
// / `else` pair whose guard lines were dropped by this extraction — confirm.
157 if( (my_is_right_child && !treat_as_stolen) || !my_range.is_divisible() || my_partition.should_execute_range(*this) ) {
158 (my_body->my_body)( my_range, final_scan_tag() );
159 (my_body->my_body)( my_range, pre_scan_tag() );
160 __TBB_ASSERT( !*my_return_slot, NULL );
// Split case: build a sum_node for this range and recurse on both halves.
161 sum_node_type* result;
162 result = new(allocate_additional_child_of(*my_parent_sum)) sum_node_type(my_range,/*my_left_is_final=*/my_is_final);
163 result = new(task::allocate_root()) sum_node_type(my_range,/*my_left_is_final=*/my_is_final);
// Continuation combines the two halves when they complete.
164 finish_pass1_type& c = *new( allocate_continuation()) finish_pass1_type(*my_return_slot,my_sum,*result);
165 // Split off right child
166 start_scan& b = *new( c.allocate_child() ) start_scan( /*my_return_slot=*/result->my_right, *this, result );
167 b.my_is_right_child = true;
168 // Left child is recycling of *this. Must recycle this before spawning b,
169 // otherwise b might complete and decrement c.ref_count() to zero, which
170 // would cause c.execute() to run prematurely.
171 recycle_as_child_of(c);
// Rewire *this to act as the left half for the next iteration/spawn.
172 my_sum = &result->my_left_sum;
173 my_return_slot = &result->my_left;
174 my_is_right_child = false;
175 my_parent_sum = result;
176 __TBB_ASSERT( !*my_return_slot, NULL );
298 } // namespace internal
// Requirements on Range concept are documented in blocked_range.h

/** \page parallel_scan_body_req Requirements on parallel_scan body
    Class \c Body implementing the concept of parallel_scan body must define:
    - \code Body::Body( Body&, split ); \endcode Splitting constructor.
      Split \c b so that \c this and \c b can accumulate separately
    - \code Body::~Body(); \endcode Destructor
    - \code void Body::operator()( const Range& r, pre_scan_tag ); \endcode
      Preprocess iterations for range \c r
    - \code void Body::operator()( const Range& r, final_scan_tag ); \endcode
      Do final processing for iterations of range \c r
    - \code void Body::reverse_join( Body& a ); \endcode
      Merge preprocessing state of \c a into \c this, where \c a was
      created earlier from \c b by b's splitting constructor
**/

/** \name parallel_scan
    See also requirements on \ref range_req "Range" and \ref parallel_scan_body_req "parallel_scan Body". **/
193 //! Parallel prefix with default partitioner
194 /** @ingroup algorithms **/
195 template<typename Range, typename Body>
196 void parallel_scan( const Range& range, Body& body ) {
// Delegates to the internal driver with the library's default partitioner.
197 internal::start_scan<Range,Body,__TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
198 //! Parallel prefix with simple_partitioner
199 /** @ingroup algorithms **/
200 template<typename Range, typename Body>
201 void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) {
// Caller controls granularity solely via the Range's grain size.
202 internal::start_scan<Range,Body,simple_partitioner>::run(range,body,partitioner);
203 //! Parallel prefix with auto_partitioner
204 /** @ingroup algorithms **/
205 template<typename Range, typename Body>
206 void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) {
// Lets the runtime choose chunk sizes adaptively based on stealing.
207 internal::start_scan<Range,Body,auto_partitioner>::run(range,body,partitioner);
345 #endif /* __TBB_parallel_scan_H */