1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #include "tsan_annotations.h"
28 
29 #if KMP_MIC && USE_NGO_STORES
30 // ICV copying
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
35 #else
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
40 #endif /* KMP_MIC && USE_NGO_STORES */
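// Illustrative note (not part of the original logic above): the ngo_* macros
// are used as a matched load/store pair when the primary thread propagates
// ICVs to its workers, e.g. in __kmp_linear_barrier_release_template below:
//
//   ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
//   for (i = 1; i < nproc; ++i)
//     ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
//                    &team->t.t_implicit_task_taskdata[0].td_icvs);
//   ngo_sync();
//
// On KMP_MIC the stores are non-globally-ordered MIC stores and ngo_sync()
// supplies the fence; on other targets the macros reduce to copy_icvs(),
// KMP_MEMCPY(), and no-ops.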
41 
42 void __kmp_print_structure(void); // Forward declaration
43 
44 // ---------------------------- Barrier Algorithms ----------------------------
45 // Distributed barrier
46 
47 // Compute how many threads to have polling each cache-line.
48 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
49 void distributedBarrier::computeVarsForN(size_t n) {
50  int nsockets = 1;
51  if (__kmp_topology) {
52  int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
53  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
54  int ncores_per_socket =
55  __kmp_topology->calculate_ratio(core_level, socket_level);
56  nsockets = __kmp_topology->get_count(socket_level);
57 
58  if (nsockets <= 0)
59  nsockets = 1;
60  if (ncores_per_socket <= 0)
61  ncores_per_socket = 1;
62 
63  threads_per_go = ncores_per_socket >> 1;
64  if (!fix_threads_per_go) {
65  // Minimize num_gos
66  if (threads_per_go > 4) {
67  if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
68  threads_per_go = threads_per_go >> 1;
69  }
70  if (threads_per_go > 4 && nsockets == 1)
71  threads_per_go = threads_per_go >> 1;
72  }
73  }
74  if (threads_per_go == 0)
75  threads_per_go = 1;
76  fix_threads_per_go = true;
77  num_gos = n / threads_per_go;
78  if (n % threads_per_go)
79  num_gos++;
80  if (nsockets == 1 || num_gos == 1)
81  num_groups = 1;
82  else {
83  num_groups = num_gos / nsockets;
84  if (num_gos % nsockets)
85  num_groups++;
86  }
87  if (num_groups <= 0)
88  num_groups = 1;
89  gos_per_group = num_gos / num_groups;
90  if (num_gos % num_groups)
91  gos_per_group++;
92  threads_per_group = threads_per_go * gos_per_group;
93  } else {
94  num_gos = n / threads_per_go;
95  if (n % threads_per_go)
96  num_gos++;
97  if (num_gos == 1)
98  num_groups = 1;
99  else {
100  num_groups = num_gos / 2;
101  if (num_gos % 2)
102  num_groups++;
103  }
104  gos_per_group = num_gos / num_groups;
105  if (num_gos % num_groups)
106  gos_per_group++;
107  threads_per_group = threads_per_go * gos_per_group;
108  }
109 }
110 
111 void distributedBarrier::computeGo(size_t n) {
112  // Minimize num_gos
113  for (num_gos = 1;; num_gos++)
114  if (IDEAL_CONTENTION * num_gos >= n)
115  break;
116  threads_per_go = n / num_gos;
117  if (n % num_gos)
118  threads_per_go++;
119  while (num_gos > MAX_GOS) {
120  threads_per_go++;
121  num_gos = n / threads_per_go;
122  if (n % threads_per_go)
123  num_gos++;
124  }
125  computeVarsForN(n);
126 }
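// Worked example (the real IDEAL_CONTENTION and MAX_GOS values are defined in
// kmp_barrier.h; 16 and 8 below are assumed purely for the arithmetic): for
// n = 100 threads, the first loop picks the smallest num_gos with
// IDEAL_CONTENTION * num_gos >= n, i.e. 16 * 7 = 112 >= 100, so num_gos = 7
// and threads_per_go = ceil(100 / 7) = 15. Since 7 does not exceed the
// assumed MAX_GOS of 8, the while loop is skipped, and computeVarsForN(100)
// then derives num_groups, gos_per_group, and threads_per_group from that
// split.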
127 
128 // This function resizes the barrier arrays when the new number of threads
129 // exceeds max_threads, which is the current size of all the arrays
130 void distributedBarrier::resize(size_t nthr) {
131  KMP_DEBUG_ASSERT(nthr > max_threads);
132 
133  // expand to requested size * 2
134  max_threads = nthr * 2;
135 
136  // allocate arrays to new max threads
137  for (int i = 0; i < MAX_ITERS; ++i) {
138  if (flags[i])
139  flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
140  max_threads * sizeof(flags_s));
141  else
142  flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
143  }
144 
145  if (go)
146  go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
147  else
148  go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
149 
150  if (iter)
151  iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
152  else
153  iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
154 
155  if (sleep)
156  sleep =
157  (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
158  else
159  sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
160 }
161 
162 // This function sets all the go flags that threads might be waiting on;
163 // when blocktime is not infinite, it should be followed by a wake-up call
164 // to each thread
165 kmp_uint64 distributedBarrier::go_release() {
166  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
167  for (size_t j = 0; j < num_gos; j++) {
168  go[j].go.store(next_go);
169  }
170  return next_go;
171 }
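// Hypothetical usage sketch (the parameter choices are illustrative only; the
// real call sites live elsewhere in the runtime):
//
//   kmp_uint64 next_go = b->go_release(); // store next go value in every flag
//   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)
//     // waiters may be sleeping rather than spinning, so wake them too
//     __kmp_dist_barrier_wakeup(bt, team, /*start=*/1,
//                               /*stop=*/b->num_threads, /*inc=*/1,
//                               /*tid=*/0);
//
// As the comment above says, go_release() only writes the flags; the wake-up
// pass is needed whenever blocktime is finite.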
172 
173 void distributedBarrier::go_reset() {
174  for (size_t j = 0; j < max_threads; ++j) {
175  for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
176  flags[i][j].stillNeed = 1;
177  }
178  go[j].go.store(0);
179  iter[j].iter = 0;
180  }
181 }
182 
183 // This function initializes/re-initializes the distributed barrier for a
184 // particular number of threads. If the arrays need resizing, it calls resize().
185 void distributedBarrier::init(size_t nthr) {
186  size_t old_max = max_threads;
187  if (nthr > max_threads) { // need more space in arrays
188  resize(nthr);
189  }
190 
191  for (size_t i = 0; i < max_threads; i++) {
192  for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
193  flags[j][i].stillNeed = 1;
194  }
195  go[i].go.store(0);
196  iter[i].iter = 0;
197  if (i >= old_max)
198  sleep[i].sleep = false;
199  }
200 
201  // Recalculate num_gos, etc. based on new nthr
202  computeVarsForN(nthr);
203 
204  num_threads = nthr;
205 
206  if (team_icvs == NULL)
207  team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
208 }
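// Illustrative note on the layout (re)established here: flags[][] is indexed
// first by barrier iteration (modulo MAX_ITERS) and then by tid, which lets a
// thread arm its flag for the next iteration before clearing the current one,
// exactly as the gather code below does:
//
//   b->flags[my_next_iter][tid].stillNeed = 1;    // arm the next iteration
//   b->flags[my_current_iter][tid].stillNeed = 0; // signal arrival now
//
// go[], iter[], and sleep[] are likewise indexed by tid, one entry per cache
// line, so threads polling their own entries never share a line. Only slots
// at index >= old_max get sleep reset, so threads that already existed keep
// whatever sleep state they had.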
209 
210 // This function is used only when KMP_BLOCKTIME is not infinite.
211 // static
212 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
213  size_t start, size_t stop, size_t inc,
214  size_t tid) {
215  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
216  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
217  return;
218 
219  kmp_info_t **other_threads = team->t.t_threads;
220  for (size_t thr = start; thr < stop; thr += inc) {
221  KMP_DEBUG_ASSERT(other_threads[thr]);
222  int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
223  // Wake up the worker whether or not it appears to be sleeping
224  __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
225  }
226 }
227 
228 static void
229 __kmp_dist_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
230  int tid, void (*reduce)(void *, void *)
231  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
232  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
233  kmp_team_t *team;
234  distributedBarrier *b;
235  kmp_info_t **other_threads;
236  kmp_uint64 my_current_iter, my_next_iter;
237  kmp_uint32 nproc;
238  bool group_leader;
239 
240  team = this_thr->th.th_team;
241  nproc = this_thr->th.th_team_nproc;
242  other_threads = team->t.t_threads;
243  b = team->t.b;
244  my_current_iter = b->iter[tid].iter;
245  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
246  group_leader = ((tid % b->threads_per_group) == 0);
247 
248  KA_TRACE(20,
249  ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
250  gtid, team->t.t_id, tid, bt));
251 
252 #if USE_ITT_BUILD && USE_ITT_NOTIFY
253  // Barrier imbalance - save arrive time to the thread
254  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
255  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
256  __itt_get_timestamp();
257  }
258 #endif
259 
260  if (group_leader) {
261  // Start from the thread after the group leader
262  size_t group_start = tid + 1;
263  size_t group_end = tid + b->threads_per_group;
264  size_t threads_pending = 0;
265 
266  if (group_end > nproc)
267  group_end = nproc;
268  do { // wait for threads in my group
269  threads_pending = 0;
270  // Check all the flags every time to avoid branch misprediction
271  for (size_t thr = group_start; thr < group_end; thr++) {
272  // Each thread uses a different cache line
273  threads_pending += b->flags[my_current_iter][thr].stillNeed;
274  }
275  // Execute tasks here
276  if (__kmp_tasking_mode != tskm_immediate_exec) {
277  kmp_task_team_t *task_team = this_thr->th.th_task_team;
278  if (task_team != NULL) {
279  if (TCR_SYNC_4(task_team->tt.tt_active)) {
280  if (KMP_TASKING_ENABLED(task_team)) {
281  int tasks_completed = FALSE;
282  __kmp_atomic_execute_tasks_64(
283  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
284  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
285  } else
286  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287  }
288  } else {
289  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
290  } // if
291  }
292  if (TCR_4(__kmp_global.g.g_done)) {
293  if (__kmp_global.g.g_abort)
294  __kmp_abort_thread();
295  break;
296  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
297  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
298  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
299  }
300  } while (threads_pending > 0);
301 
302  if (reduce) { // Perform reduction if needed
303  OMPT_REDUCTION_DECL(this_thr, gtid);
304  OMPT_REDUCTION_BEGIN;
305  // Group leader reduces all threads in group
306  for (size_t thr = group_start; thr < group_end; thr++) {
307  (*reduce)(this_thr->th.th_local.reduce_data,
308  other_threads[thr]->th.th_local.reduce_data);
309  }
310  OMPT_REDUCTION_END;
311  }
312 
313  // Set flag for next iteration
314  b->flags[my_next_iter][tid].stillNeed = 1;
315  // Each thread uses a different cache line; reset stillNeed to 0 to
316  // indicate that this thread has reached the barrier
317  b->flags[my_current_iter][tid].stillNeed = 0;
318 
319  do { // wait for all group leaders
320  threads_pending = 0;
321  for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
322  threads_pending += b->flags[my_current_iter][thr].stillNeed;
323  }
324  // Execute tasks here
325  if (__kmp_tasking_mode != tskm_immediate_exec) {
326  kmp_task_team_t *task_team = this_thr->th.th_task_team;
327  if (task_team != NULL) {
328  if (TCR_SYNC_4(task_team->tt.tt_active)) {
329  if (KMP_TASKING_ENABLED(task_team)) {
330  int tasks_completed = FALSE;
331  __kmp_atomic_execute_tasks_64(
332  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
333  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
334  } else
335  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336  }
337  } else {
338  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
339  } // if
340  }
341  if (TCR_4(__kmp_global.g.g_done)) {
342  if (__kmp_global.g.g_abort)
343  __kmp_abort_thread();
344  break;
345  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
346  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
347  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
348  }
349  } while (threads_pending > 0);
350 
351  if (reduce) { // Perform reduction if needed
352  if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
353  OMPT_REDUCTION_DECL(this_thr, gtid);
354  OMPT_REDUCTION_BEGIN;
355  for (size_t thr = b->threads_per_group; thr < nproc;
356  thr += b->threads_per_group) {
357  (*reduce)(this_thr->th.th_local.reduce_data,
358  other_threads[thr]->th.th_local.reduce_data);
359  }
360  OMPT_REDUCTION_END;
361  }
362  }
363  } else {
364  // Set flag for next iteration
365  b->flags[my_next_iter][tid].stillNeed = 1;
366  // Each thread uses a different cache line; reset stillNeed to 0 to
367  // indicate that this thread has reached the barrier
368  b->flags[my_current_iter][tid].stillNeed = 0;
369  }
370 
371  KMP_MFENCE();
372 
373  KA_TRACE(20,
374  ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
375  gtid, team->t.t_id, tid, bt));
376 }
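// Summary sketch of the gather above (informal; assume threads_per_group == 4
// and nproc == 8, so the group leaders are tids 0 and 4):
//
//   tids 1..3 clear flags[iter][tid].stillNeed  -> polled by leader 0
//   tids 5..7 clear flags[iter][tid].stillNeed  -> polled by leader 4
//   leaders 0 and 4 then clear their own flags and poll each other
//
// Each flag sits on its own cache line, a leader only polls the lines of its
// own group, and only the leaders' lines are polled across groups, which
// bounds the contention on any one line. Reductions mirror the two levels:
// each leader folds in its own group, then the primary thread (tid 0) folds
// in the other group leaders. The polling loops also call
// __kmp_atomic_execute_tasks_64() so pending tasks keep making progress while
// threads wait.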
377 
378 static void __kmp_dist_barrier_release(
379  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
380  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
381  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
382  kmp_team_t *team;
383  distributedBarrier *b;
384  kmp_bstate_t *thr_bar;
385  kmp_uint64 my_current_iter, next_go;
386  size_t my_go_index;
387  bool group_leader;
388 
389  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
390  gtid, tid, bt));
391 
392  thr_bar = &this_thr->th.th_bar[bt].bb;
393 
394  if (!KMP_MASTER_TID(tid)) {
395  // workers and non-master group leaders need to check their presence in team
396  do {
397  if (this_thr->th.th_used_in_team.load() != 1 &&
398  this_thr->th.th_used_in_team.load() != 3) {
399  // Thread is not in use in a team. Wait on location in tid's thread
400  // struct. The 0 value tells anyone looking that this thread is spinning
401  // or sleeping until this location becomes 3 again; 3 is the transition
402  // state to get to 1, which means waiting on go and being in the team
403  kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
404  if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
405  0) ||
406  this_thr->th.th_used_in_team.load() == 0) {
407  my_flag.wait(this_thr, true, itt_sync_obj);
408  }
409 #if USE_ITT_BUILD && USE_ITT_NOTIFY
410  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
411  // In fork barrier where we could not get the object reliably
412  itt_sync_obj =
413  __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
414  // Cancel wait on previous parallel region...
415  __kmp_itt_task_starting(itt_sync_obj);
416 
417  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
418  return;
419 
420  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
421  if (itt_sync_obj != NULL)
422  // Call prepare as early as possible for "new" barrier
423  __kmp_itt_task_finished(itt_sync_obj);
424  } else
425 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
426  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
427  return;
428  }
429  if (this_thr->th.th_used_in_team.load() != 1 &&
430  this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
431  continue;
432  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433  return;
434 
435  // At this point, the thread thinks it is in use in a team, or in
436  // transition to be used in a team, but it might have reached this barrier
437  // before it was marked unused by the team. Unused threads are awoken and
438  // shifted to wait on local thread struct elsewhere. It also might reach
439  // this point by being picked up for use by a different team. Either way,
440  // we need to update the tid.
441  tid = __kmp_tid_from_gtid(gtid);
442  team = this_thr->th.th_team;
443  KMP_DEBUG_ASSERT(tid >= 0);
444  KMP_DEBUG_ASSERT(team);
445  b = team->t.b;
446  my_current_iter = b->iter[tid].iter;
447  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
448  my_go_index = tid / b->threads_per_go;
449  if (this_thr->th.th_used_in_team.load() == 3) {
450  KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
451  }
452  // Check if go flag is set
453  if (b->go[my_go_index].go.load() != next_go) {
454  // Wait on go flag on team
455  kmp_atomic_flag_64<false, true> my_flag(
456  &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
457  my_flag.wait(this_thr, true, itt_sync_obj);
458  KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
459  b->iter[tid].iter == 0);
460  KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
461  }
462 
463  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
464  return;
465  // At this point, the thread's go location was set. This means the primary
466  // thread is safely in the barrier, and so this thread's data is
467  // up-to-date, but we should check again that this thread is really in
468  // use in the team, as it could have been woken up for the purpose of
469  // changing team size, or reaping threads at shutdown.
470  if (this_thr->th.th_used_in_team.load() == 1)
471  break;
472  } while (1);
473 
474  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
475  return;
476 
477  group_leader = ((tid % b->threads_per_group) == 0);
478  if (group_leader) {
479  // Tell all the threads in my group they can go!
480  for (size_t go_idx = my_go_index + 1;
481  go_idx < my_go_index + b->gos_per_group; go_idx++) {
482  b->go[go_idx].go.store(next_go);
483  }
484  // Fence added so that workers can see changes to go. sfence inadequate.
485  KMP_MFENCE();
486  }
487 
488 #if KMP_BARRIER_ICV_PUSH
489  if (propagate_icvs) { // copy ICVs to final dest
490  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
491  tid, FALSE);
492  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
493  (kmp_internal_control_t *)team->t.b->team_icvs);
494  copy_icvs(&thr_bar->th_fixed_icvs,
495  &team->t.t_implicit_task_taskdata[tid].td_icvs);
496  }
497 #endif
498  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
499  // This thread is now awake and participating in the barrier;
500  // wake up the other threads in the group
501  size_t nproc = this_thr->th.th_team_nproc;
502  size_t group_end = tid + b->threads_per_group;
503  if (nproc < group_end)
504  group_end = nproc;
505  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
506  }
507  } else { // Primary thread
508  team = this_thr->th.th_team;
509  b = team->t.b;
510  my_current_iter = b->iter[tid].iter;
511  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
512 #if KMP_BARRIER_ICV_PUSH
513  if (propagate_icvs) {
514  // primary thread has ICVs in final destination; copy
515  copy_icvs(&thr_bar->th_fixed_icvs,
516  &team->t.t_implicit_task_taskdata[tid].td_icvs);
517  }
518 #endif
519  // Tell all the group leaders they can go!
520  for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
521  b->go[go_idx].go.store(next_go);
522  }
523 
524  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
525  // Wake-up the group leaders
526  size_t nproc = this_thr->th.th_team_nproc;
527  __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
528  b->threads_per_group, tid);
529  }
530 
531  // Tell all the threads in my group they can go!
532  for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
533  b->go[go_idx].go.store(next_go);
534  }
535 
536  // Fence added so that workers can see changes to go. sfence inadequate.
537  KMP_MFENCE();
538 
539  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
540  // Wake-up the other threads in my group
541  size_t nproc = this_thr->th.th_team_nproc;
542  size_t group_end = tid + b->threads_per_group;
543  if (nproc < group_end)
544  group_end = nproc;
545  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
546  }
547  }
548  // Update to next iteration
549  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
550  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
551 
552  KA_TRACE(
553  20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
554  gtid, team->t.t_id, tid, bt));
555 }
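// Summary sketch of the release above, mirroring the stores in the primary
// thread's branch:
//
//   for (go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group)
//     b->go[go_idx].go.store(next_go);       // release each group leader
//   for (go_idx = 1; go_idx < b->gos_per_group; go_idx++)
//     b->go[go_idx].go.store(next_go);       // release the primary's group
//
// Each group leader that observes its go flag then stores next_go into the
// remaining go flags of its group, and every store pass is followed by a
// __kmp_dist_barrier_wakeup() over the matching tid range when blocktime is
// finite, since those waiters may be asleep. The KMP_MFENCE() calls make the
// go stores visible to the spinning workers.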
556 
557 // Linear Barrier
558 template <bool cancellable = false>
559 static bool __kmp_linear_barrier_gather_template(
560  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
561  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
562  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
563  kmp_team_t *team = this_thr->th.th_team;
564  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
565  kmp_info_t **other_threads = team->t.t_threads;
566 
567  KA_TRACE(
568  20,
569  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
570  gtid, team->t.t_id, tid, bt));
571  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
572 
573 #if USE_ITT_BUILD && USE_ITT_NOTIFY
574  // Barrier imbalance - save arrive time to the thread
575  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
576  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
577  __itt_get_timestamp();
578  }
579 #endif
580  // We now perform a linear reduction to signal that all of the threads have
581  // arrived.
582  if (!KMP_MASTER_TID(tid)) {
583  KA_TRACE(20,
584  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
585  "arrived(%p): %llu => %llu\n",
586  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
587  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
588  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
589  // Mark arrival to primary thread
590  /* After performing this write, a worker thread may not assume that the team
591  is valid any more - it could be deallocated by the primary thread at any
592  time. */
593  ANNOTATE_BARRIER_BEGIN(this_thr);
594  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
595  flag.release();
596  } else {
597  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
598  int nproc = this_thr->th.th_team_nproc;
599  int i;
600  // Don't have to worry about sleep bit here or atomic since team setting
601  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
602 
603  // Collect all the worker team member threads.
604  for (i = 1; i < nproc; ++i) {
605 #if KMP_CACHE_MANAGE
606  // Prefetch next thread's arrived count
607  if (i + 1 < nproc)
608  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
609 #endif /* KMP_CACHE_MANAGE */
610  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
611  "arrived(%p) == %llu\n",
612  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
613  team->t.t_id, i,
614  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
615 
616  // Wait for worker thread to arrive
617  if (cancellable) {
618  kmp_flag_64<true, false> flag(
619  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
620  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
621  return true;
622  } else {
623  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
624  new_state);
625  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
626  }
627  ANNOTATE_BARRIER_END(other_threads[i]);
628 #if USE_ITT_BUILD && USE_ITT_NOTIFY
629  // Barrier imbalance - write min of the thread time and the other thread
630  // time to the thread.
631  if (__kmp_forkjoin_frames_mode == 2) {
632  this_thr->th.th_bar_min_time = KMP_MIN(
633  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
634  }
635 #endif
636  if (reduce) {
637  KA_TRACE(100,
638  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
639  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
640  team->t.t_id, i));
641  ANNOTATE_REDUCE_AFTER(reduce);
642  OMPT_REDUCTION_DECL(this_thr, gtid);
643  OMPT_REDUCTION_BEGIN;
644  (*reduce)(this_thr->th.th_local.reduce_data,
645  other_threads[i]->th.th_local.reduce_data);
646  OMPT_REDUCTION_END;
647  ANNOTATE_REDUCE_BEFORE(reduce);
648  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
649  }
650  }
651  // Don't have to worry about sleep bit here or atomic since team setting
652  team_bar->b_arrived = new_state;
653  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
654  "arrived(%p) = %llu\n",
655  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
656  new_state));
657  }
658  KA_TRACE(
659  20,
660  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
661  gtid, team->t.t_id, tid, bt));
662  return false;
663 }
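// Illustrative note on the flag handshake used above: a worker announces its
// arrival by bumping its own b_arrived with the primary registered as waiter,
//
//   kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
//   flag.release(); // b_arrived += KMP_BARRIER_STATE_BUMP (see KA_TRACE above)
//
// while the primary waits for each worker's b_arrived to reach the team-wide
// new_state and folds in that worker's reduce_data. The cancellable variant
// differs only in the kmp_flag_64<true, false> template arguments, whose
// wait() reports cancellation so the template can return true to its caller.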
664 
665 template <bool cancellable = false>
666 static bool __kmp_linear_barrier_release_template(
667  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
668  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
669  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
670  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
671  kmp_team_t *team;
672 
673  if (KMP_MASTER_TID(tid)) {
674  unsigned int i;
675  kmp_uint32 nproc = this_thr->th.th_team_nproc;
676  kmp_info_t **other_threads;
677 
678  team = __kmp_threads[gtid]->th.th_team;
679  KMP_DEBUG_ASSERT(team != NULL);
680  other_threads = team->t.t_threads;
681 
682  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
683  "barrier type %d\n",
684  gtid, team->t.t_id, tid, bt));
685 
686  if (nproc > 1) {
687 #if KMP_BARRIER_ICV_PUSH
688  {
689  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
690  if (propagate_icvs) {
691  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
692  for (i = 1; i < nproc; ++i) {
693  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
694  team, i, FALSE);
695  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
696  &team->t.t_implicit_task_taskdata[0].td_icvs);
697  }
698  ngo_sync();
699  }
700  }
701 #endif // KMP_BARRIER_ICV_PUSH
702 
703  // Now, release all of the worker threads
704  for (i = 1; i < nproc; ++i) {
705 #if KMP_CACHE_MANAGE
706  // Prefetch next thread's go flag
707  if (i + 1 < nproc)
708  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
709 #endif /* KMP_CACHE_MANAGE */
710  KA_TRACE(
711  20,
712  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
713  "go(%p): %u => %u\n",
714  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
715  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
716  other_threads[i]->th.th_bar[bt].bb.b_go,
717  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
718  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
719  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
720  other_threads[i]);
721  flag.release();
722  }
723  }
724  } else { // Wait for the PRIMARY thread to release us
725  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
726  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
727  if (cancellable) {
728  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
729  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
730  return true;
731  } else {
732  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
733  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
734  }
735  ANNOTATE_BARRIER_END(this_thr);
736 #if USE_ITT_BUILD && USE_ITT_NOTIFY
737  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
738  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
739  // disabled)
740  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
741  // Cancel wait on previous parallel region...
742  __kmp_itt_task_starting(itt_sync_obj);
743 
744  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745  return false;
746 
747  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
748  if (itt_sync_obj != NULL)
749  // Call prepare as early as possible for "new" barrier
750  __kmp_itt_task_finished(itt_sync_obj);
751  } else
752 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
753  // Early exit for reaping threads releasing forkjoin barrier
754  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
755  return false;
756 // The worker thread may now assume that the team is valid.
757 #ifdef KMP_DEBUG
758  tid = __kmp_tid_from_gtid(gtid);
759  team = __kmp_threads[gtid]->th.th_team;
760 #endif
761  KMP_DEBUG_ASSERT(team != NULL);
762  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
763  KA_TRACE(20,
764  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
765  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
766  KMP_MB(); // Flush all pending memory write invalidates.
767  }
768  KA_TRACE(
769  20,
770  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
771  gtid, team->t.t_id, tid, bt));
772  return false;
773 }
774 
775 static void __kmp_linear_barrier_gather(
776  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
777  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
778  __kmp_linear_barrier_gather_template<false>(
779  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
780 }
781 
782 static bool __kmp_linear_barrier_gather_cancellable(
783  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
784  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
785  return __kmp_linear_barrier_gather_template<true>(
786  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
787 }
788 
789 static void __kmp_linear_barrier_release(
790  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
791  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
792  __kmp_linear_barrier_release_template<false>(
793  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
794 }
795 
796 static bool __kmp_linear_barrier_release_cancellable(
797  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
798  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
799  return __kmp_linear_barrier_release_template<true>(
800  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
801 }
802 
803 // Tree barrier
804 static void __kmp_tree_barrier_gather(
805  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
806  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
807  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
808  kmp_team_t *team = this_thr->th.th_team;
809  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
810  kmp_info_t **other_threads = team->t.t_threads;
811  kmp_uint32 nproc = this_thr->th.th_team_nproc;
812  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
813  kmp_uint32 branch_factor = 1 << branch_bits;
814  kmp_uint32 child;
815  kmp_uint32 child_tid;
816  kmp_uint64 new_state = 0;
817 
818  KA_TRACE(
819  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
820  gtid, team->t.t_id, tid, bt));
821  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
822 
823 #if USE_ITT_BUILD && USE_ITT_NOTIFY
824  // Barrier imbalance - save arrive time to the thread
825  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
826  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
827  __itt_get_timestamp();
828  }
829 #endif
830  // Perform tree gather to wait until all threads have arrived; reduce any
831  // required data as we go
832  child_tid = (tid << branch_bits) + 1;
833  if (child_tid < nproc) {
834  // Parent threads wait for all their children to arrive
835  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
836  child = 1;
837  do {
838  kmp_info_t *child_thr = other_threads[child_tid];
839  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
840 #if KMP_CACHE_MANAGE
841  // Prefetch next thread's arrived count
842  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
843  KMP_CACHE_PREFETCH(
844  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
845 #endif /* KMP_CACHE_MANAGE */
846  KA_TRACE(20,
847  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
848  "arrived(%p) == %llu\n",
849  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
850  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
851  // Wait for child to arrive
852  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
853  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
854  ANNOTATE_BARRIER_END(child_thr);
855 #if USE_ITT_BUILD && USE_ITT_NOTIFY
856  // Barrier imbalance - write min of the thread time and a child time to
857  // the thread.
858  if (__kmp_forkjoin_frames_mode == 2) {
859  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
860  child_thr->th.th_bar_min_time);
861  }
862 #endif
863  if (reduce) {
864  KA_TRACE(100,
865  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
866  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
867  team->t.t_id, child_tid));
868  ANNOTATE_REDUCE_AFTER(reduce);
869  OMPT_REDUCTION_DECL(this_thr, gtid);
870  OMPT_REDUCTION_BEGIN;
871  (*reduce)(this_thr->th.th_local.reduce_data,
872  child_thr->th.th_local.reduce_data);
873  OMPT_REDUCTION_END;
874  ANNOTATE_REDUCE_BEFORE(reduce);
875  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
876  }
877  child++;
878  child_tid++;
879  } while (child <= branch_factor && child_tid < nproc);
880  }
881 
882  if (!KMP_MASTER_TID(tid)) { // Worker threads
883  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
884 
885  KA_TRACE(20,
886  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
887  "arrived(%p): %llu => %llu\n",
888  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
889  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
890  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
891 
892  // Mark arrival to parent thread
893  /* After performing this write, a worker thread may not assume that the team
894  is valid any more - it could be deallocated by the primary thread at any
895  time. */
896  ANNOTATE_BARRIER_BEGIN(this_thr);
897  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
898  flag.release();
899  } else {
900  // Need to update the team arrived pointer if we are the primary thread
901  if (nproc > 1) // New value was already computed above
902  team->t.t_bar[bt].b_arrived = new_state;
903  else
904  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
905  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
906  "arrived(%p) = %llu\n",
907  gtid, team->t.t_id, tid, team->t.t_id,
908  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
909  }
910  KA_TRACE(20,
911  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
912  gtid, team->t.t_id, tid, bt));
913 }
914 
915 static void __kmp_tree_barrier_release(
916  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
917  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
918  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
919  kmp_team_t *team;
920  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
921  kmp_uint32 nproc;
922  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
923  kmp_uint32 branch_factor = 1 << branch_bits;
924  kmp_uint32 child;
925  kmp_uint32 child_tid;
926 
927  // Perform a tree release for all of the threads that have been gathered
928  if (!KMP_MASTER_TID(
929  tid)) { // Handle fork barrier workers who aren't part of a team yet
930  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
931  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
932  // Wait for parent thread to release us
933  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
934  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
935  ANNOTATE_BARRIER_END(this_thr);
936 #if USE_ITT_BUILD && USE_ITT_NOTIFY
937  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
938  // In fork barrier where we could not get the object reliably (or
939  // ITTNOTIFY is disabled)
940  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
941  // Cancel wait on previous parallel region...
942  __kmp_itt_task_starting(itt_sync_obj);
943 
944  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
945  return;
946 
947  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
948  if (itt_sync_obj != NULL)
949  // Call prepare as early as possible for "new" barrier
950  __kmp_itt_task_finished(itt_sync_obj);
951  } else
952 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
953  // Early exit for reaping threads releasing forkjoin barrier
954  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
955  return;
956 
957  // The worker thread may now assume that the team is valid.
958  team = __kmp_threads[gtid]->th.th_team;
959  KMP_DEBUG_ASSERT(team != NULL);
960  tid = __kmp_tid_from_gtid(gtid);
961 
962  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
963  KA_TRACE(20,
964  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
965  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
966  KMP_MB(); // Flush all pending memory write invalidates.
967  } else {
968  team = __kmp_threads[gtid]->th.th_team;
969  KMP_DEBUG_ASSERT(team != NULL);
970  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
971  "barrier type %d\n",
972  gtid, team->t.t_id, tid, bt));
973  }
974  nproc = this_thr->th.th_team_nproc;
975  child_tid = (tid << branch_bits) + 1;
976 
977  if (child_tid < nproc) {
978  kmp_info_t **other_threads = team->t.t_threads;
979  child = 1;
980  // Parent threads release all their children
981  do {
982  kmp_info_t *child_thr = other_threads[child_tid];
983  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
984 #if KMP_CACHE_MANAGE
985  // Prefetch next thread's go count
986  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
987  KMP_CACHE_PREFETCH(
988  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
989 #endif /* KMP_CACHE_MANAGE */
990 
991 #if KMP_BARRIER_ICV_PUSH
992  {
993  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
994  if (propagate_icvs) {
995  __kmp_init_implicit_task(team->t.t_ident,
996  team->t.t_threads[child_tid], team,
997  child_tid, FALSE);
998  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
999  &team->t.t_implicit_task_taskdata[0].td_icvs);
1000  }
1001  }
1002 #endif // KMP_BARRIER_ICV_PUSH
1003  KA_TRACE(20,
1004  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1005  "go(%p): %u => %u\n",
1006  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1007  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1008  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1009  // Release child from barrier
1010  ANNOTATE_BARRIER_BEGIN(child_thr);
1011  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1012  flag.release();
1013  child++;
1014  child_tid++;
1015  } while (child <= branch_factor && child_tid < nproc);
1016  }
1017  KA_TRACE(
1018  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1019  gtid, team->t.t_id, tid, bt));
1020 }
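// Worked example of the tree indexing shared by the gather/release pair above,
// assuming branch_bits == 2 (branch_factor == 4) and nproc == 10:
//
//   children of tid t : (t << 2) + 1 ... (t << 2) + 4, clipped to nproc
//   parent of tid t   : (t - 1) >> 2
//
//   tid 0 -> children 1, 2, 3, 4
//   tid 1 -> children 5, 6, 7, 8
//   tid 2 -> child    9
//   tids 3..9 have no children
//
// The gather walks this tree bottom-up on b_arrived; the release walks it
// top-down on b_go, pushing ICVs to each child on the way when propagate_icvs
// is set.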
1021 
1022 // Hyper Barrier
1023 static void __kmp_hyper_barrier_gather(
1024  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1025  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1026  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1027  kmp_team_t *team = this_thr->th.th_team;
1028  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1029  kmp_info_t **other_threads = team->t.t_threads;
1030  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1031  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1032  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1033  kmp_uint32 branch_factor = 1 << branch_bits;
1034  kmp_uint32 offset;
1035  kmp_uint32 level;
1036 
1037  KA_TRACE(
1038  20,
1039  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1040  gtid, team->t.t_id, tid, bt));
1041  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1042 
1043 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1044  // Barrier imbalance - save arrive time to the thread
1045  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1046  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1047  __itt_get_timestamp();
1048  }
1049 #endif
1050  /* Perform a hypercube-embedded tree gather to wait until all of the threads
1051  have arrived, and reduce any required data as we go. */
1052  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1053  for (level = 0, offset = 1; offset < num_threads;
1054  level += branch_bits, offset <<= branch_bits) {
1055  kmp_uint32 child;
1056  kmp_uint32 child_tid;
1057 
1058  if (((tid >> level) & (branch_factor - 1)) != 0) {
1059  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1060 
1061  KMP_MB(); // Synchronize parent and child threads.
1062  KA_TRACE(20,
1063  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1064  "arrived(%p): %llu => %llu\n",
1065  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1066  team->t.t_id, parent_tid, &thr_bar->b_arrived,
1067  thr_bar->b_arrived,
1068  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1069  // Mark arrival to parent thread
1070  /* After performing this write (in the last iteration of the enclosing for
1071  loop), a worker thread may not assume that the team is valid any more
1072  - it could be deallocated by the primary thread at any time. */
1073  ANNOTATE_BARRIER_BEGIN(this_thr);
1074  p_flag.set_waiter(other_threads[parent_tid]);
1075  p_flag.release();
1076  break;
1077  }
1078 
1079  // Parent threads wait for children to arrive
1080  if (new_state == KMP_BARRIER_UNUSED_STATE)
1081  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1082  for (child = 1, child_tid = tid + (1 << level);
1083  child < branch_factor && child_tid < num_threads;
1084  child++, child_tid += (1 << level)) {
1085  kmp_info_t *child_thr = other_threads[child_tid];
1086  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1087 #if KMP_CACHE_MANAGE
1088  kmp_uint32 next_child_tid = child_tid + (1 << level);
1089  // Prefetch next thread's arrived count
1090  if (child + 1 < branch_factor && next_child_tid < num_threads)
1091  KMP_CACHE_PREFETCH(
1092  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1093 #endif /* KMP_CACHE_MANAGE */
1094  KA_TRACE(20,
1095  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1096  "arrived(%p) == %llu\n",
1097  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1098  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1099  // Wait for child to arrive
1100  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1101  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1102  ANNOTATE_BARRIER_END(child_thr);
1103  KMP_MB(); // Synchronize parent and child threads.
1104 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1105  // Barrier imbalance - write min of the thread time and a child time to
1106  // the thread.
1107  if (__kmp_forkjoin_frames_mode == 2) {
1108  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1109  child_thr->th.th_bar_min_time);
1110  }
1111 #endif
1112  if (reduce) {
1113  KA_TRACE(100,
1114  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1115  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1116  team->t.t_id, child_tid));
1117  ANNOTATE_REDUCE_AFTER(reduce);
1118  OMPT_REDUCTION_DECL(this_thr, gtid);
1119  OMPT_REDUCTION_BEGIN;
1120  (*reduce)(this_thr->th.th_local.reduce_data,
1121  child_thr->th.th_local.reduce_data);
1122  OMPT_REDUCTION_END;
1123  ANNOTATE_REDUCE_BEFORE(reduce);
1124  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1125  }
1126  }
1127  }
1128 
1129  if (KMP_MASTER_TID(tid)) {
1130  // Need to update the team arrived pointer if we are the primary thread
1131  if (new_state == KMP_BARRIER_UNUSED_STATE)
1132  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1133  else
1134  team->t.t_bar[bt].b_arrived = new_state;
1135  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1136  "arrived(%p) = %llu\n",
1137  gtid, team->t.t_id, tid, team->t.t_id,
1138  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1139  }
1140  KA_TRACE(
1141  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1142  gtid, team->t.t_id, tid, bt));
1143 }
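// Worked example of the hypercube pairing used above, assuming
// branch_bits == 1 (branch_factor == 2) and num_threads == 8:
//
//   level 0 (offset 1): 1->0, 3->2, 5->4, 7->6  (odd tids release and stop)
//   level 1 (offset 2): 2->0, 6->4
//   level 2 (offset 4): 4->0
//
// At each level a thread whose low bits are nonzero releases its parent,
// tid & ~((1 << (level + branch_bits)) - 1), and leaves the loop; the
// remaining threads wait on their children's b_arrived, so after
// log2(num_threads) rounds only tid 0, the primary thread, remains.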
1144 
1145 // The reverse versions seem to beat the forward versions overall
1146 #define KMP_REVERSE_HYPER_BAR
1147 static void __kmp_hyper_barrier_release(
1148  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1149  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1150  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1151  kmp_team_t *team;
1152  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1153  kmp_info_t **other_threads;
1154  kmp_uint32 num_threads;
1155  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1156  kmp_uint32 branch_factor = 1 << branch_bits;
1157  kmp_uint32 child;
1158  kmp_uint32 child_tid;
1159  kmp_uint32 offset;
1160  kmp_uint32 level;
1161 
1162  /* Perform a hypercube-embedded tree release for all of the threads that have
1163  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1164  are released in the reverse order of the corresponding gather, otherwise
1165  threads are released in the same order. */
1166  if (KMP_MASTER_TID(tid)) { // primary thread
1167  team = __kmp_threads[gtid]->th.th_team;
1168  KMP_DEBUG_ASSERT(team != NULL);
1169  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1170  "barrier type %d\n",
1171  gtid, team->t.t_id, tid, bt));
1172 #if KMP_BARRIER_ICV_PUSH
1173  if (propagate_icvs) { // primary already has ICVs in final destination; copy
1174  copy_icvs(&thr_bar->th_fixed_icvs,
1175  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1176  }
1177 #endif
1178  } else { // Handle fork barrier workers who aren't part of a team yet
1179  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1180  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1181  // Wait for parent thread to release us
1182  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1183  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1184  ANNOTATE_BARRIER_END(this_thr);
1185 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1186  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1187  // In fork barrier where we could not get the object reliably
1188  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1189  // Cancel wait on previous parallel region...
1190  __kmp_itt_task_starting(itt_sync_obj);
1191 
1192  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1193  return;
1194 
1195  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1196  if (itt_sync_obj != NULL)
1197  // Call prepare as early as possible for "new" barrier
1198  __kmp_itt_task_finished(itt_sync_obj);
1199  } else
1200 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1201  // Early exit for reaping threads releasing forkjoin barrier
1202  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1203  return;
1204 
1205  // The worker thread may now assume that the team is valid.
1206  team = __kmp_threads[gtid]->th.th_team;
1207  KMP_DEBUG_ASSERT(team != NULL);
1208  tid = __kmp_tid_from_gtid(gtid);
1209 
1210  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1211  KA_TRACE(20,
1212  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1213  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1214  KMP_MB(); // Flush all pending memory write invalidates.
1215  }
1216  num_threads = this_thr->th.th_team_nproc;
1217  other_threads = team->t.t_threads;
1218 
1219 #ifdef KMP_REVERSE_HYPER_BAR
1220  // Count up to correct level for parent
1221  for (level = 0, offset = 1;
1222  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1223  level += branch_bits, offset <<= branch_bits)
1224  ;
1225 
1226  // Now go down from there
1227  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1228  level -= branch_bits, offset >>= branch_bits)
1229 #else
1230  // Go down the tree, level by level
1231  for (level = 0, offset = 1; offset < num_threads;
1232  level += branch_bits, offset <<= branch_bits)
1233 #endif // KMP_REVERSE_HYPER_BAR
1234  {
1235 #ifdef KMP_REVERSE_HYPER_BAR
1236  /* Now go in reverse order through the children, highest to lowest.
1237  Initial setting of child is conservative here. */
1238  child = num_threads >> ((level == 0) ? level : level - 1);
1239  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1240  child_tid = tid + (child << level);
1241  child >= 1; child--, child_tid -= (1 << level))
1242 #else
1243  if (((tid >> level) & (branch_factor - 1)) != 0)
1244  // No need to go lower than this, since this is the level parent would be
1245  // notified
1246  break;
1247  // Iterate through children on this level of the tree
1248  for (child = 1, child_tid = tid + (1 << level);
1249  child < branch_factor && child_tid < num_threads;
1250  child++, child_tid += (1 << level))
1251 #endif // KMP_REVERSE_HYPER_BAR
1252  {
1253  if (child_tid >= num_threads)
1254  continue; // Child doesn't exist so keep going
1255  else {
1256  kmp_info_t *child_thr = other_threads[child_tid];
1257  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1258 #if KMP_CACHE_MANAGE
1259  kmp_uint32 next_child_tid = child_tid - (1 << level);
1260 // Prefetch next thread's go count
1261 #ifdef KMP_REVERSE_HYPER_BAR
1262  if (child - 1 >= 1 && next_child_tid < num_threads)
1263 #else
1264  if (child + 1 < branch_factor && next_child_tid < num_threads)
1265 #endif // KMP_REVERSE_HYPER_BAR
1266  KMP_CACHE_PREFETCH(
1267  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1268 #endif /* KMP_CACHE_MANAGE */
1269 
1270 #if KMP_BARRIER_ICV_PUSH
1271  if (propagate_icvs) // push my fixed ICVs to my child
1272  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1273 #endif // KMP_BARRIER_ICV_PUSH
1274 
1275  KA_TRACE(
1276  20,
1277  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1278  "go(%p): %u => %u\n",
1279  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1280  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1281  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1282  // Release child from barrier
1283  ANNOTATE_BARRIER_BEGIN(child_thr);
1284  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1285  flag.release();
1286  }
1287  }
1288  }
1289 #if KMP_BARRIER_ICV_PUSH
1290  if (propagate_icvs &&
1291  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1292  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1293  FALSE);
1294  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1295  &thr_bar->th_fixed_icvs);
1296  }
1297 #endif
1298  KA_TRACE(
1299  20,
1300  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1301  gtid, team->t.t_id, tid, bt));
1302 }
1303 
1304 // Hierarchical Barrier
1305 
1306 // Initialize thread barrier data
1307 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1308  Performs the minimum amount of initialization required based on how the team
1309  has changed. Returns true if leaf children will require both on-core and
1310  traditional wake-up mechanisms. For example, if the team size increases,
1311  threads already in the team will respond to on-core wakeup on their parent
1312  thread, but threads newly added to the team will only be listening on
1313  their local b_go. */
1314 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1315  kmp_bstate_t *thr_bar,
1316  kmp_uint32 nproc, int gtid,
1317  int tid, kmp_team_t *team) {
1318  // Checks to determine if (re-)initialization is needed
1319  bool uninitialized = thr_bar->team == NULL;
1320  bool team_changed = team != thr_bar->team;
1321  bool team_sz_changed = nproc != thr_bar->nproc;
1322  bool tid_changed = tid != thr_bar->old_tid;
1323  bool retval = false;
1324 
1325  if (uninitialized || team_sz_changed) {
1326  __kmp_get_hierarchy(nproc, thr_bar);
1327  }
1328 
1329  if (uninitialized || team_sz_changed || tid_changed) {
1330  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1331  thr_bar->parent_tid = -1; // default for primary thread
1332  if (!KMP_MASTER_TID(tid)) {
1333  // if not primary thread, find parent thread in hierarchy
1334  kmp_uint32 d = 0;
1335  while (d < thr_bar->depth) { // find parent based on level of thread in
1336  // hierarchy, and note level
1337  kmp_uint32 rem;
1338  if (d == thr_bar->depth - 2) { // reached level right below the primary
1339  thr_bar->parent_tid = 0;
1340  thr_bar->my_level = d;
1341  break;
1342  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1343  // TODO: can we make the above op faster?
1344  // thread is not a subtree root at the next level, so this is its max level
1345  thr_bar->parent_tid = tid - rem;
1346  thr_bar->my_level = d;
1347  break;
1348  }
1349  ++d;
1350  }
1351  }
1352  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1353  (thr_bar->skip_per_level[thr_bar->my_level])),
1354  &(thr_bar->offset));
1355  thr_bar->old_tid = tid;
1356  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1357  thr_bar->team = team;
1358  thr_bar->parent_bar =
1359  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1360  }
1361  if (uninitialized || team_changed || tid_changed) {
1362  thr_bar->team = team;
1363  thr_bar->parent_bar =
1364  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1365  retval = true;
1366  }
1367  if (uninitialized || team_sz_changed || tid_changed) {
1368  thr_bar->nproc = nproc;
1369  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1370  if (thr_bar->my_level == 0)
1371  thr_bar->leaf_kids = 0;
1372  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1373  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1374  thr_bar->leaf_state = 0;
1375  for (int i = 0; i < thr_bar->leaf_kids; ++i)
1376  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1377  }
1378  return retval;
1379 }
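// Illustrative note on leaf_state (a hedged reading of the code above): the
// hierarchical barrier packs leaf-child arrivals into individual bytes of the
// parent's 64-bit b_arrived word rather than using one flag per child. The
// loop above arms one byte per leaf kid, filling from the high end to match
// the 7 - i indexing, and the gather code below waits for b_arrived to
// include leaf_state before clearing those bytes with
// KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)). The
// offset value produced by __kmp_type_convert above appears to be the byte
// index this thread itself uses when checking in with its own parent.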
1380 
1381 static void __kmp_hierarchical_barrier_gather(
1382  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1383  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1384  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1385  kmp_team_t *team = this_thr->th.th_team;
1386  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1387  kmp_uint32 nproc = this_thr->th.th_team_nproc;
1388  kmp_info_t **other_threads = team->t.t_threads;
1389  kmp_uint64 new_state = 0;
1390 
1391  int level = team->t.t_level;
1392  if (other_threads[0]
1393  ->th.th_teams_microtask) // are we inside the teams construct?
1394  if (this_thr->th.th_teams_size.nteams > 1)
1395  ++level; // level was not increased in teams construct for team_of_masters
1396  if (level == 1)
1397  thr_bar->use_oncore_barrier = 1;
1398  else
1399  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1400 
1401  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1402  "barrier type %d\n",
1403  gtid, team->t.t_id, tid, bt));
1404  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1405 
1406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1407  // Barrier imbalance - save arrive time to the thread
1408  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1409  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1410  }
1411 #endif
1412 
1413  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1414  team);
1415 
1416  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1417  kmp_int32 child_tid;
1418  new_state =
1419  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1420  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1421  thr_bar->use_oncore_barrier) {
1422  if (thr_bar->leaf_kids) {
1423  // First, wait for leaf children to check-in on my b_arrived flag
1424  kmp_uint64 leaf_state =
1425  KMP_MASTER_TID(tid)
1426  ? thr_bar->b_arrived | thr_bar->leaf_state
1427  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1428  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1429  "for leaf kids\n",
1430  gtid, team->t.t_id, tid));
1431  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1432  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1433  if (reduce) {
1434  ANNOTATE_REDUCE_AFTER(reduce);
1435  OMPT_REDUCTION_DECL(this_thr, gtid);
1436  OMPT_REDUCTION_BEGIN;
1437  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1438  ++child_tid) {
1439  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1440  "T#%d(%d:%d)\n",
1441  gtid, team->t.t_id, tid,
1442  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1443  child_tid));
1444  ANNOTATE_BARRIER_END(other_threads[child_tid]);
1445  (*reduce)(this_thr->th.th_local.reduce_data,
1446  other_threads[child_tid]->th.th_local.reduce_data);
1447  }
1448  OMPT_REDUCTION_END;
1449  ANNOTATE_REDUCE_BEFORE(reduce);
1450  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1451  }
1452  // clear leaf_state bits
1453  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1454  }
1455  // Next, wait for higher level children on each child's b_arrived flag
1456  for (kmp_uint32 d = 1; d < thr_bar->my_level;
1457  ++d) { // gather lowest level threads first, but skip 0
1458  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1459  skip = thr_bar->skip_per_level[d];
1460  if (last > nproc)
1461  last = nproc;
1462  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1463  kmp_info_t *child_thr = other_threads[child_tid];
1464  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1465  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1466  "T#%d(%d:%d) "
1467  "arrived(%p) == %llu\n",
1468  gtid, team->t.t_id, tid,
1469  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1470  child_tid, &child_bar->b_arrived, new_state));
1471  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1472  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1473  ANNOTATE_BARRIER_END(child_thr);
1474  if (reduce) {
1475  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476  "T#%d(%d:%d)\n",
1477  gtid, team->t.t_id, tid,
1478  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479  child_tid));
1480  ANNOTATE_REDUCE_AFTER(reduce);
1481  (*reduce)(this_thr->th.th_local.reduce_data,
1482  child_thr->th.th_local.reduce_data);
1483  ANNOTATE_REDUCE_BEFORE(reduce);
1484  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1485  }
1486  }
1487  }
1488  } else { // Blocktime is not infinite
1489  for (kmp_uint32 d = 0; d < thr_bar->my_level;
1490  ++d) { // Gather lowest level threads first
1491  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1492  skip = thr_bar->skip_per_level[d];
1493  if (last > nproc)
1494  last = nproc;
1495  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1496  kmp_info_t *child_thr = other_threads[child_tid];
1497  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1498  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1499  "T#%d(%d:%d) "
1500  "arrived(%p) == %llu\n",
1501  gtid, team->t.t_id, tid,
1502  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1503  child_tid, &child_bar->b_arrived, new_state));
1504  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1505  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1506  ANNOTATE_BARRIER_END(child_thr);
1507  if (reduce) {
1508  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1509  "T#%d(%d:%d)\n",
1510  gtid, team->t.t_id, tid,
1511  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1512  child_tid));
1513  ANNOTATE_REDUCE_AFTER(reduce);
1514  (*reduce)(this_thr->th.th_local.reduce_data,
1515  child_thr->th.th_local.reduce_data);
1516  ANNOTATE_REDUCE_BEFORE(reduce);
1517  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1518  }
1519  }
1520  }
1521  }
1522  }
1523  // All subordinates are gathered; now release parent if not primary thread
1524 
1525  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1526  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1527  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1528  gtid, team->t.t_id, tid,
1529  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1530  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1531  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1532  /* Mark arrival to parent: After performing this write, a worker thread may
1533  not assume that the team is valid any more - it could be deallocated by
1534  the primary thread at any time. */
1535  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1536  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1537  // flag; release it
1538  ANNOTATE_BARRIER_BEGIN(this_thr);
1539  kmp_flag_64<> flag(&thr_bar->b_arrived,
1540  other_threads[thr_bar->parent_tid]);
1541  flag.release();
1542  } else {
1543  // Leaf does special release on "offset" bits of parent's b_arrived flag
1544  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1545  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1546  thr_bar->offset + 1);
1547  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1548  flag.release();
1549  }
1550  } else { // Primary thread needs to update the team's b_arrived value
1551  team->t.t_bar[bt].b_arrived = new_state;
1552  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1553  "arrived(%p) = %llu\n",
1554  gtid, team->t.t_id, tid, team->t.t_id,
1555  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1556  }
1557  // Note: the team access below may be technically invalid for workers (see above)
1558  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1559  "barrier type %d\n",
1560  gtid, team->t.t_id, tid, bt));
1561 }
1562 
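// Illustrative sketch (not part of the runtime, and excluded from the build):
// the nested gather loops above enumerate a thread's children at level d as
// child_tid = tid + k * skip_per_level[d] for k = 1, 2, ..., stopping at
// min(tid + skip_per_level[d + 1], nproc). The helper below walks the same
// index pattern; the skip_per_level values themselves are derived from the
// hierarchy's branching factors elsewhere in the runtime (not shown here).
#if 0
static kmp_uint32 __kmp_example_num_children(kmp_uint32 tid, kmp_uint32 nproc,
                                             const kmp_uint32 *skip_per_level,
                                             kmp_uint32 d) {
  kmp_uint32 last = tid + skip_per_level[d + 1];
  if (last > nproc)
    last = nproc;
  kmp_uint32 count = 0;
  for (kmp_uint32 child = tid + skip_per_level[d]; child < last;
       child += skip_per_level[d])
    ++count; // in the gather loops, each child is waited on via its b_arrived
  return count;
}
#endif
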
1563 static void __kmp_hierarchical_barrier_release(
1564  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1565  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1566  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1567  kmp_team_t *team;
1568  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1569  kmp_uint32 nproc;
1570  bool team_change = false; // indicates on-core barrier shouldn't be used
1571 
1572  if (KMP_MASTER_TID(tid)) {
1573  team = __kmp_threads[gtid]->th.th_team;
1574  KMP_DEBUG_ASSERT(team != NULL);
1575  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1576  "entered barrier type %d\n",
1577  gtid, team->t.t_id, tid, bt));
1578  } else { // Worker threads
1579  // Wait for parent thread to release me
1580  if (!thr_bar->use_oncore_barrier ||
1581  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1582  thr_bar->team == NULL) {
1583  // Use traditional method of waiting on my own b_go flag
1584  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1585  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1586  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1587  ANNOTATE_BARRIER_END(this_thr);
1588  TCW_8(thr_bar->b_go,
1589  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1590  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1591  // infinite, not nested
1592  // Wait on my "offset" bits on parent's b_go flag
1593  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1594  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1595  thr_bar->offset + 1, bt,
1596  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1597  flag.wait(this_thr, TRUE);
1598  if (thr_bar->wait_flag ==
1599  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1600  TCW_8(thr_bar->b_go,
1601  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1602  } else { // Reset my bits on parent's b_go flag
1603  (RCAST(volatile char *,
1604  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1605  }
1606  }
1607  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1608  // Early exit for reaping threads releasing forkjoin barrier
1609  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1610  return;
1611  // The worker thread may now assume that the team is valid.
1612  team = __kmp_threads[gtid]->th.th_team;
1613  KMP_DEBUG_ASSERT(team != NULL);
1614  tid = __kmp_tid_from_gtid(gtid);
1615 
1616  KA_TRACE(
1617  20,
1618  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1619  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1620  KMP_MB(); // Flush all pending memory write invalidates.
1621  }
1622 
1623  nproc = this_thr->th.th_team_nproc;
1624  int level = team->t.t_level;
1625  if (team->t.t_threads[0]
1626  ->th.th_teams_microtask) { // are we inside the teams construct?
1627  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1628  this_thr->th.th_teams_level == level)
1629  ++level; // level was not increased in teams construct for team_of_workers
1630  if (this_thr->th.th_teams_size.nteams > 1)
1631  ++level; // level was not increased in teams construct for team_of_masters
1632  }
1633  if (level == 1)
1634  thr_bar->use_oncore_barrier = 1;
1635  else
1636  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1637 
1638  // If the team size has increased, we still communicate with old leaves via
1639  // oncore barrier.
1640  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1641  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1642  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1643  tid, team);
1644  // But if the entire team changes, we won't use oncore barrier at all
1645  if (team_change)
1646  old_leaf_kids = 0;
1647 
1648 #if KMP_BARRIER_ICV_PUSH
1649  if (propagate_icvs) {
1650  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1651  FALSE);
1652  if (KMP_MASTER_TID(
1653  tid)) { // primary already has copy in final destination; copy
1654  copy_icvs(&thr_bar->th_fixed_icvs,
1655  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1656  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1657  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1658  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1659  // leaves (on-core children) pull parent's fixed ICVs directly to local
1660  // ICV store
1661  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1662  &thr_bar->parent_bar->th_fixed_icvs);
1663  // non-leaves will get ICVs piggybacked with b_go via NGO store
1664  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1665  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so the
1666  // children can access them
1667  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1668  else // leaves copy parent's fixed ICVs directly to local ICV store
1669  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1670  &thr_bar->parent_bar->th_fixed_icvs);
1671  }
1672  }
1673 #endif // KMP_BARRIER_ICV_PUSH
1674 
1675  // Now, release my children
1676  if (thr_bar->my_level) { // not a leaf
1677  kmp_int32 child_tid;
1678  kmp_uint32 last;
1679  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1680  thr_bar->use_oncore_barrier) {
1681  if (KMP_MASTER_TID(tid)) { // do a flat release
1682  // Set local b_go to bump children via NGO store of the cache line
1683  // containing ICVs and b_go.
1684  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1685  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1686  // the cache line
1687  ngo_load(&thr_bar->th_fixed_icvs);
1688  // This loops over all the threads skipping only the leaf nodes in the
1689  // hierarchy
1690  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1691  child_tid += thr_bar->skip_per_level[1]) {
1692  kmp_bstate_t *child_bar =
1693  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1694  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1695  "releasing T#%d(%d:%d)"
1696  " go(%p): %u => %u\n",
1697  gtid, team->t.t_id, tid,
1698  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1699  child_tid, &child_bar->b_go, child_bar->b_go,
1700  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1701  // Use ngo store (if available) to both store ICVs and release child
1702  // via child's b_go
1703  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1704  }
1705  ngo_sync();
1706  }
1707  TCW_8(thr_bar->b_go,
1708  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1709  // Now, release leaf children
1710  if (thr_bar->leaf_kids) { // if there are any
1711  // We test team_change on the off-chance that the level 1 team changed.
1712  if (team_change ||
1713  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1714  if (old_leaf_kids) { // release old leaf kids
1715  thr_bar->b_go |= old_leaf_state;
1716  }
1717  // Release new leaf kids
1718  last = tid + thr_bar->skip_per_level[1];
1719  if (last > nproc)
1720  last = nproc;
1721  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1722  ++child_tid) { // skip_per_level[0]=1
1723  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1724  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1725  KA_TRACE(
1726  20,
1727  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1728  " T#%d(%d:%d) go(%p): %u => %u\n",
1729  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1730  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1731  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1732  // Release child using child's b_go flag
1733  ANNOTATE_BARRIER_BEGIN(child_thr);
1734  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1735  flag.release();
1736  }
1737  } else { // Release all children at once with leaf_state bits on my own
1738  // b_go flag
1739  thr_bar->b_go |= thr_bar->leaf_state;
1740  }
1741  }
1742  } else { // Blocktime is not infinite; do a simple hierarchical release
1743  for (int d = thr_bar->my_level - 1; d >= 0;
1744  --d) { // Release highest level threads first
1745  last = tid + thr_bar->skip_per_level[d + 1];
1746  kmp_uint32 skip = thr_bar->skip_per_level[d];
1747  if (last > nproc)
1748  last = nproc;
1749  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1750  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1751  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1752  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1753  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1754  gtid, team->t.t_id, tid,
1755  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1756  child_tid, &child_bar->b_go, child_bar->b_go,
1757  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1758  // Release child using child's b_go flag
1759  ANNOTATE_BARRIER_BEGIN(child_thr);
1760  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1761  flag.release();
1762  }
1763  }
1764  }
1765 #if KMP_BARRIER_ICV_PUSH
1766  if (propagate_icvs && !KMP_MASTER_TID(tid))
1767  // non-leaves copy ICVs from fixed ICVs to local dest
1768  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1769  &thr_bar->th_fixed_icvs);
1770 #endif // KMP_BARRIER_ICV_PUSH
1771  }
1772  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1773  "barrier type %d\n",
1774  gtid, team->t.t_id, tid, bt));
1775 }
1776 
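// Illustrative sketch (not part of the runtime, and excluded from the build):
// in the oncore path above (infinite blocktime, not nested), a leaf does not
// spin on its own b_go; it watches byte (offset + 1) of its parent's 64-bit
// b_go word, and afterwards clears only that byte (the RCAST store in the
// worker wait code). The parent releases all of its leaves at once by OR-ing
// leaf_state into its own b_go.
#if 0
static void __kmp_example_oncore_wait_go(volatile kmp_uint64 *parent_b_go,
                                         kmp_uint32 offset) {
  volatile char *my_byte = &((volatile char *)parent_b_go)[offset + 1];
  while (*my_byte == 0) {
    // in the runtime this spin is a kmp_flag_oncore wait, which may also
    // switch the thread over to waiting on its own b_go flag
  }
  *my_byte = 0; // reset only my byte; siblings' bytes are untouched
}
#endif
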
1777 // End of Barrier Algorithms
1778 
1779 // type traits for cancellable value
1780 // if cancellable is true, then is_cancellable is a normal boolean variable
1781 // if cancellable is false, then is_cancellable is a compile time constant
1782 template <bool cancellable> struct is_cancellable {};
1783 template <> struct is_cancellable<true> {
1784  bool value;
1785  is_cancellable() : value(false) {}
1786  is_cancellable(bool b) : value(b) {}
1787  is_cancellable &operator=(bool b) {
1788  value = b;
1789  return *this;
1790  }
1791  operator bool() const { return value; }
1792 };
1793 template <> struct is_cancellable<false> {
1794  is_cancellable &operator=(bool b) { return *this; }
1795  constexpr operator bool() const { return false; }
1796 };
1797 
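// Illustrative sketch (not part of the runtime, and excluded from the build):
// with cancellable == false the trait above collapses to a compile-time false,
// so every "if (cancelled)" check in __kmp_barrier_template<false> is dead
// code the compiler can fold away; with cancellable == true it behaves like a
// plain bool.
#if 0
template <bool cancellable>
static int __kmp_example_cancel_check(bool observed_cancel) {
  is_cancellable<cancellable> cancelled;
  cancelled = observed_cancel; // no-op in the <false> specialization
  if (cancelled) // constant-folded to false when cancellable == false
    return 1;
  return 0;
}
#endif
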
1798 // Internal function to do a barrier.
1799 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1800  If reduce is non-NULL, do a reduction barrier: the reduce callback is
1801  applied during the gather phase.
1802  When cancellable = false,
1803  Returns 0 if primary thread, 1 if worker thread.
1804  When cancellable = true
1805  Returns 0 if not cancelled, 1 if cancelled. */
1806 template <bool cancellable = false>
1807 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1808  size_t reduce_size, void *reduce_data,
1809  void (*reduce)(void *, void *)) {
1810  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1811  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1812  int tid = __kmp_tid_from_gtid(gtid);
1813  kmp_info_t *this_thr = __kmp_threads[gtid];
1814  kmp_team_t *team = this_thr->th.th_team;
1815  int status = 0;
1816  is_cancellable<cancellable> cancelled;
1817 #if OMPT_SUPPORT && OMPT_OPTIONAL
1818  ompt_data_t *my_task_data;
1819  ompt_data_t *my_parallel_data;
1820  void *return_address;
1821  ompt_sync_region_t barrier_kind;
1822 #endif
1823 
1824  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1825  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1826 
1827  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1828 #if OMPT_SUPPORT
1829  if (ompt_enabled.enabled) {
1830 #if OMPT_OPTIONAL
1831  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1832  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1833  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1834  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1835  if (ompt_enabled.ompt_callback_sync_region) {
1836  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1837  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1838  return_address);
1839  }
1840  if (ompt_enabled.ompt_callback_sync_region_wait) {
1841  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1842  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1843  return_address);
1844  }
1845 #endif
1846  // It is OK to report the barrier state after the barrier begin callback.
1847  // According to the OMPT specification, a compliant implementation may
1848  // even delay reporting this state until the barrier begins to wait.
1849  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1850  }
1851 #endif
1852 
1853  if (!team->t.t_serialized) {
1854 #if USE_ITT_BUILD
1855  // This value will be used in itt notify events below.
1856  void *itt_sync_obj = NULL;
1857 #if USE_ITT_NOTIFY
1858  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1859  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1860 #endif
1861 #endif /* USE_ITT_BUILD */
1862  if (__kmp_tasking_mode == tskm_extra_barrier) {
1863  __kmp_tasking_barrier(team, this_thr, gtid);
1864  KA_TRACE(15,
1865  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1866  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1867  }
1868 
1869  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1870  access it when the team struct is not guaranteed to exist. */
1871  // See note about the corresponding code in __kmp_join_barrier() being
1872  // performance-critical.
1873  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1874 #if KMP_USE_MONITOR
1875  this_thr->th.th_team_bt_intervals =
1876  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1877  this_thr->th.th_team_bt_set =
1878  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1879 #else
1880  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1881 #endif
1882  }
1883 
1884 #if USE_ITT_BUILD
1885  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1886  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1887 #endif /* USE_ITT_BUILD */
1888 #if USE_DEBUGGER
1889  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1890  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1891  team->t.t_bar[bt].b_master_arrived += 1;
1892  } else {
1893  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1894  } // if
1895 #endif /* USE_DEBUGGER */
1896  if (reduce != NULL) {
1897  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1898  this_thr->th.th_local.reduce_data = reduce_data;
1899  }
1900 
1901  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1902  // pass 0 to set up only the current team if nthreads > 1
1903  __kmp_task_team_setup(this_thr, team, 0);
1904 
1905  if (cancellable) {
1906  cancelled = __kmp_linear_barrier_gather_cancellable(
1907  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1908  } else {
1909  switch (__kmp_barrier_gather_pattern[bt]) {
1910  case bp_dist_bar: {
1911  __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1912  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1913  break;
1914  }
1915  case bp_hyper_bar: {
1916  // don't set branch bits to 0; use linear
1917  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1918  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1919  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1920  break;
1921  }
1922  case bp_hierarchical_bar: {
1923  __kmp_hierarchical_barrier_gather(
1924  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1925  break;
1926  }
1927  case bp_tree_bar: {
1928  // don't set branch bits to 0; use linear
1929  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1930  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1931  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1932  break;
1933  }
1934  default: {
1935  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1936  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1937  }
1938  }
1939  }
1940 
1941  KMP_MB();
1942 
1943  if (KMP_MASTER_TID(tid)) {
1944  status = 0;
1945  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1946  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1947  }
1948 #if USE_DEBUGGER
1949  // Let the debugger know: all threads have arrived and are starting to leave
1950  // the barrier.
1951  team->t.t_bar[bt].b_team_arrived += 1;
1952 #endif
1953 
1954  if (__kmp_omp_cancellation) {
1955  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1956  // Reset cancellation flag for worksharing constructs
1957  if (cancel_request == cancel_loop ||
1958  cancel_request == cancel_sections) {
1959  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1960  }
1961  }
1962 #if USE_ITT_BUILD
1963  /* TODO: In case of split reduction barrier, primary thread may send
1964  acquired event early, before the final summation into the shared
1965  variable is done (final summation can be a long operation for array
1966  reductions). */
1967  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1968  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1969 #endif /* USE_ITT_BUILD */
1970 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1971  // Barrier - report frame end (only if active_level == 1)
1972  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1973  __kmp_forkjoin_frames_mode &&
1974  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1975  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1976  team->t.t_active_level == 1) {
1977  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1978  kmp_uint64 cur_time = __itt_get_timestamp();
1979  kmp_info_t **other_threads = team->t.t_threads;
1980  int nproc = this_thr->th.th_team_nproc;
1981  int i;
1982  switch (__kmp_forkjoin_frames_mode) {
1983  case 1:
1984  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1985  loc, nproc);
1986  this_thr->th.th_frame_time = cur_time;
1987  break;
1988  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1989  // be fixed)
1990  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1991  1, loc, nproc);
1992  break;
1993  case 3:
1994  if (__itt_metadata_add_ptr) {
1995  // Initialize with primary thread's wait time
1996  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1997  // Set arrive time to zero to be able to check it in
1998  // __kmp_invoke_task(); the same is done inside the loop below
1999  this_thr->th.th_bar_arrive_time = 0;
2000  for (i = 1; i < nproc; ++i) {
2001  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2002  other_threads[i]->th.th_bar_arrive_time = 0;
2003  }
2004  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2005  cur_time, delta,
2006  (kmp_uint64)(reduce != NULL));
2007  }
2008  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2009  loc, nproc);
2010  this_thr->th.th_frame_time = cur_time;
2011  break;
2012  }
2013  }
2014 #endif /* USE_ITT_BUILD */
2015  } else {
2016  status = 1;
2017 #if USE_ITT_BUILD
2018  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2019  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2020 #endif /* USE_ITT_BUILD */
2021  }
2022  if ((status == 1 || !is_split) && !cancelled) {
2023  if (cancellable) {
2024  cancelled = __kmp_linear_barrier_release_cancellable(
2025  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2026  } else {
2027  switch (__kmp_barrier_release_pattern[bt]) {
2028  case bp_dist_bar: {
2029  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2030  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2031  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2032  break;
2033  }
2034  case bp_hyper_bar: {
2035  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2036  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2037  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2038  break;
2039  }
2040  case bp_hierarchical_bar: {
2041  __kmp_hierarchical_barrier_release(
2042  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2043  break;
2044  }
2045  case bp_tree_bar: {
2046  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2047  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2048  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2049  break;
2050  }
2051  default: {
2052  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2053  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2054  }
2055  }
2056  }
2057  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2058  __kmp_task_team_sync(this_thr, team);
2059  }
2060  }
2061 
2062 #if USE_ITT_BUILD
2063  /* GEH: TODO: Move this under if-condition above and also include in
2064  __kmp_end_split_barrier(). This will more accurately represent the actual
2065  release time of the threads for split barriers. */
2066  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2067  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2068 #endif /* USE_ITT_BUILD */
2069  } else { // Team is serialized.
2070  status = 0;
2071  if (__kmp_tasking_mode != tskm_immediate_exec) {
2072  if (this_thr->th.th_task_team != NULL) {
2073 #if USE_ITT_NOTIFY
2074  void *itt_sync_obj = NULL;
2075  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2076  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2077  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2078  }
2079 #endif
2080 
2081  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
2082  TRUE);
2083  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2084  __kmp_task_team_setup(this_thr, team, 0);
2085 
2086 #if USE_ITT_BUILD
2087  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2088  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2089 #endif /* USE_ITT_BUILD */
2090  }
2091  }
2092  }
2093  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2094  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2095  __kmp_tid_from_gtid(gtid), status));
2096 
2097 #if OMPT_SUPPORT
2098  if (ompt_enabled.enabled) {
2099 #if OMPT_OPTIONAL
2100  if (ompt_enabled.ompt_callback_sync_region_wait) {
2101  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2102  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2103  return_address);
2104  }
2105  if (ompt_enabled.ompt_callback_sync_region) {
2106  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2107  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2108  return_address);
2109  }
2110 #endif
2111  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2112  }
2113 #endif
2114  ANNOTATE_BARRIER_END(&team->t.t_bar);
2115 
2116  if (cancellable)
2117  return (int)cancelled;
2118  return status;
2119 }
2120 
2121 // Returns 0 if primary thread, 1 if worker thread.
2122 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2123  size_t reduce_size, void *reduce_data,
2124  void (*reduce)(void *, void *)) {
2125  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2126  reduce);
2127 }
2128 
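// Illustrative sketch (not part of the runtime, and excluded from the build):
// the call shape implied by the split-barrier logic above. With is_split ==
// TRUE the primary thread (return value 0) skips the release phase inside
// __kmp_barrier; it can finish using the reduced data and then release the
// team via __kmp_end_split_barrier. Workers (return value 1) are released
// before returning. The reduce callback and data below are hypothetical; the
// real callers (the compiler-generated reduction entry points) live outside
// this file.
#if 0
static void __kmp_example_sum_ints(void *lhs, void *rhs) {
  *(int *)lhs += *(int *)rhs; // combine two threads' reduce_data
}

static void __kmp_example_split_reduce(int gtid, int *my_partial_sum) {
  int is_worker = __kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */,
                                sizeof(int), my_partial_sum,
                                __kmp_example_sum_ints);
  if (!is_worker) {
    // Primary thread: partial sums were combined into its reduce_data during
    // the gather phase; publish the result, then let the workers go.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
}
#endif
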
2129 #if defined(KMP_GOMP_COMPAT)
2130 // Returns 1 if cancelled, 0 otherwise
2131 int __kmp_barrier_gomp_cancel(int gtid) {
2132  if (__kmp_omp_cancellation) {
2133  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2134  0, NULL, NULL);
2135  if (cancelled) {
2136  int tid = __kmp_tid_from_gtid(gtid);
2137  kmp_info_t *this_thr = __kmp_threads[gtid];
2138  if (KMP_MASTER_TID(tid)) {
2139  // Primary thread does not need to revert anything
2140  } else {
2141  // Workers need to revert their private b_arrived flag
2142  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2143  KMP_BARRIER_STATE_BUMP;
2144  }
2145  }
2146  return cancelled;
2147  }
2148  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2149  return FALSE;
2150 }
2151 #endif
2152 
2153 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2154  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2155  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2156  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2157  int tid = __kmp_tid_from_gtid(gtid);
2158  kmp_info_t *this_thr = __kmp_threads[gtid];
2159  kmp_team_t *team = this_thr->th.th_team;
2160 
2161  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
2162  if (!team->t.t_serialized) {
2163  if (KMP_MASTER_GTID(gtid)) {
2164  switch (__kmp_barrier_release_pattern[bt]) {
2165  case bp_dist_bar: {
2166  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2167  FALSE USE_ITT_BUILD_ARG(NULL));
2168  break;
2169  }
2170  case bp_hyper_bar: {
2171  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2172  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2173  FALSE USE_ITT_BUILD_ARG(NULL));
2174  break;
2175  }
2176  case bp_hierarchical_bar: {
2177  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2178  FALSE USE_ITT_BUILD_ARG(NULL));
2179  break;
2180  }
2181  case bp_tree_bar: {
2182  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2183  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2184  FALSE USE_ITT_BUILD_ARG(NULL));
2185  break;
2186  }
2187  default: {
2188  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2189  FALSE USE_ITT_BUILD_ARG(NULL));
2190  }
2191  }
2192  if (__kmp_tasking_mode != tskm_immediate_exec) {
2193  __kmp_task_team_sync(this_thr, team);
2194  } // if
2195  }
2196  }
2197  ANNOTATE_BARRIER_END(&team->t.t_bar);
2198 }
2199 
2200 void __kmp_join_barrier(int gtid) {
2201  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2202  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2203 
2204  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2205 
2206  kmp_info_t *this_thr = __kmp_threads[gtid];
2207  kmp_team_t *team;
2208  kmp_uint nproc;
2209  kmp_info_t *master_thread;
2210  int tid;
2211 #ifdef KMP_DEBUG
2212  int team_id;
2213 #endif /* KMP_DEBUG */
2214 #if USE_ITT_BUILD
2215  void *itt_sync_obj = NULL;
2216 #if USE_ITT_NOTIFY
2217  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2218  // Get object created at fork_barrier
2219  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2220 #endif
2221 #endif /* USE_ITT_BUILD */
2222  KMP_MB();
2223 
2224  // Get current info
2225  team = this_thr->th.th_team;
2226  nproc = this_thr->th.th_team_nproc;
2227  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
2228  tid = __kmp_tid_from_gtid(gtid);
2229 #ifdef KMP_DEBUG
2230  team_id = team->t.t_id;
2231 #endif /* KMP_DEBUG */
2232  master_thread = this_thr->th.th_team_master;
2233 #ifdef KMP_DEBUG
2234  if (master_thread != team->t.t_threads[0]) {
2235  __kmp_print_structure();
2236  }
2237 #endif /* KMP_DEBUG */
2238  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2239  KMP_MB();
2240 
2241  // Verify state
2242  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2243  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2244  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2245  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2246  gtid, team_id, tid));
2247 
2248  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
2249 #if OMPT_SUPPORT
2250  if (ompt_enabled.enabled) {
2251 #if OMPT_OPTIONAL
2252  ompt_data_t *my_task_data;
2253  ompt_data_t *my_parallel_data;
2254  void *codeptr = NULL;
2255  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2256  if (KMP_MASTER_TID(ds_tid) &&
2257  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2258  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2259  codeptr = team->t.ompt_team_info.master_return_address;
2260  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2261  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2262  if (ompt_enabled.ompt_callback_sync_region) {
2263  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2264  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2265  my_task_data, codeptr);
2266  }
2267  if (ompt_enabled.ompt_callback_sync_region_wait) {
2268  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2269  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2270  my_task_data, codeptr);
2271  }
2272  if (!KMP_MASTER_TID(ds_tid))
2273  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2274 #endif
2275  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
2276  }
2277 #endif
2278 
2279  if (__kmp_tasking_mode == tskm_extra_barrier) {
2280  __kmp_tasking_barrier(team, this_thr, gtid);
2281  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2282  gtid, team_id, tid));
2283  }
2284 #ifdef KMP_DEBUG
2285  if (__kmp_tasking_mode != tskm_immediate_exec) {
2286  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2287  "%p, th_task_team = %p\n",
2288  __kmp_gtid_from_thread(this_thr), team_id,
2289  team->t.t_task_team[this_thr->th.th_task_state],
2290  this_thr->th.th_task_team));
2291  if (this_thr->th.th_task_team)
2292  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
2293  team->t.t_task_team[this_thr->th.th_task_state]);
2294  }
2295 #endif /* KMP_DEBUG */
2296 
2297  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2298  access it when the team struct is not guaranteed to exist. Doing these
2299  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
2300  we do not perform the copy if blocktime=infinite, since the values are not
2301  used by __kmp_wait_template() in that case. */
2302  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2303 #if KMP_USE_MONITOR
2304  this_thr->th.th_team_bt_intervals =
2305  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2306  this_thr->th.th_team_bt_set =
2307  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2308 #else
2309  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2310 #endif
2311  }
2312 
2313 #if USE_ITT_BUILD
2314  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2315  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2316 #endif /* USE_ITT_BUILD */
2317 
2318  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2319  case bp_dist_bar: {
2320  __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2321  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2322  break;
2323  }
2324  case bp_hyper_bar: {
2325  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2326  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2327  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2328  break;
2329  }
2330  case bp_hierarchical_bar: {
2331  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2332  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2333  break;
2334  }
2335  case bp_tree_bar: {
2336  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2337  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2338  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2339  break;
2340  }
2341  default: {
2342  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2343  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2344  }
2345  }
2346 
2347  /* From this point on, the team data structure may be deallocated at any time
2348  by the primary thread - it is unsafe to reference it in any of the worker
2349  threads. Any per-team data items that need to be referenced before the
2350  end of the barrier should be moved to the kmp_task_team_t structs. */
2351  if (KMP_MASTER_TID(tid)) {
2352  if (__kmp_tasking_mode != tskm_immediate_exec) {
2353  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2354  }
2355  if (__kmp_display_affinity) {
2356  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2357  }
2358 #if KMP_STATS_ENABLED
2359  // Have primary thread flag the workers to indicate they are now waiting for
2360  // the next parallel region. Also wake them up so they switch their timers to
2361  // idle.
2362  for (int i = 0; i < team->t.t_nproc; ++i) {
2363  kmp_info_t *team_thread = team->t.t_threads[i];
2364  if (team_thread == this_thr)
2365  continue;
2366  team_thread->th.th_stats->setIdleFlag();
2367  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2368  team_thread->th.th_sleep_loc != NULL)
2369  __kmp_null_resume_wrapper(team_thread);
2370  }
2371 #endif
2372 #if USE_ITT_BUILD
2373  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2374  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2375 #endif /* USE_ITT_BUILD */
2376 
2377 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2378  // Join barrier - report frame end
2379  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2380  __kmp_forkjoin_frames_mode &&
2381  (this_thr->th.th_teams_microtask == NULL || // either not in teams
2382  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2383  team->t.t_active_level == 1) {
2384  kmp_uint64 cur_time = __itt_get_timestamp();
2385  ident_t *loc = team->t.t_ident;
2386  kmp_info_t **other_threads = team->t.t_threads;
2387  int nproc = this_thr->th.th_team_nproc;
2388  int i;
2389  switch (__kmp_forkjoin_frames_mode) {
2390  case 1:
2391  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2392  loc, nproc);
2393  break;
2394  case 2:
2395  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2396  loc, nproc);
2397  break;
2398  case 3:
2399  if (__itt_metadata_add_ptr) {
2400  // Initialize with primary thread's wait time
2401  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2402  // Set arrive time to zero to be able to check it in
2403  // __kmp_invoke_task(); the same is done inside the loop below
2404  this_thr->th.th_bar_arrive_time = 0;
2405  for (i = 1; i < nproc; ++i) {
2406  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2407  other_threads[i]->th.th_bar_arrive_time = 0;
2408  }
2409  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2410  cur_time, delta, 0);
2411  }
2412  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2413  loc, nproc);
2414  this_thr->th.th_frame_time = cur_time;
2415  break;
2416  }
2417  }
2418 #endif /* USE_ITT_BUILD */
2419  }
2420 #if USE_ITT_BUILD
2421  else {
2422  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2423  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2424  }
2425 #endif /* USE_ITT_BUILD */
2426 
2427 #if KMP_DEBUG
2428  if (KMP_MASTER_TID(tid)) {
2429  KA_TRACE(
2430  15,
2431  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2432  gtid, team_id, tid, nproc));
2433  }
2434 #endif /* KMP_DEBUG */
2435 
2436  // TODO now, mark worker threads as done so they may be disbanded
2437  KMP_MB(); // Flush all pending memory write invalidates.
2438  KA_TRACE(10,
2439  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2440 
2441  ANNOTATE_BARRIER_END(&team->t.t_bar);
2442 }
2443 
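// Illustrative sketch (not part of the runtime, and excluded from the build):
// under __kmp_forkjoin_frames_mode == 3 the imbalance value reported above is
// the sum of every team thread's wait time at the barrier, i.e. the sum over
// i of (cur_time - arrive_time[i]); the arrive times are then zeroed so they
// can be checked in __kmp_invoke_task(), as the comments above note.
#if 0
static kmp_uint64 __kmp_example_barrier_imbalance(kmp_uint64 cur_time,
                                                  kmp_uint64 *arrive_time,
                                                  int nproc) {
  kmp_uint64 delta = 0;
  for (int i = 0; i < nproc; ++i) {
    delta += cur_time - arrive_time[i];
    arrive_time[i] = 0; // reset, as the loops above do
  }
  return delta;
}
#endif
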
2444 // TODO release worker threads' fork barriers as we are ready instead of all at
2445 // once
2446 void __kmp_fork_barrier(int gtid, int tid) {
2447  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2448  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2449  kmp_info_t *this_thr = __kmp_threads[gtid];
2450  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2451 #if USE_ITT_BUILD
2452  void *itt_sync_obj = NULL;
2453 #endif /* USE_ITT_BUILD */
2454  if (team)
2455  ANNOTATE_BARRIER_END(&team->t.t_bar);
2456 
2457  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2458  (team != NULL) ? team->t.t_id : -1, tid));
2459 
2460  // th_team pointer only valid for primary thread here
2461  if (KMP_MASTER_TID(tid)) {
2462 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2463  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2464  // Create itt barrier object
2465  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2466  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2467  }
2468 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2469 
2470 #ifdef KMP_DEBUG
2471  KMP_DEBUG_ASSERT(team);
2472  kmp_info_t **other_threads = team->t.t_threads;
2473  int i;
2474 
2475  // Verify state
2476  KMP_MB();
2477 
2478  for (i = 1; i < team->t.t_nproc; ++i) {
2479  KA_TRACE(500,
2480  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2481  "== %u.\n",
2482  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2483  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2484  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2485  KMP_DEBUG_ASSERT(
2486  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2487  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2488  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2489  }
2490 #endif
2491 
2492  if (__kmp_tasking_mode != tskm_immediate_exec) {
2493  // 0 indicates: set up the current task team if nthreads > 1
2494  __kmp_task_team_setup(this_thr, team, 0);
2495  }
2496 
2497  /* The primary thread may have changed its blocktime between join barrier
2498  and fork barrier. Copy the blocktime info to the thread, where
2499  __kmp_wait_template() can access it when the team struct is not
2500  guaranteed to exist. */
2501  // See note about the corresponding code in __kmp_join_barrier() being
2502  // performance-critical
2503  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2504 #if KMP_USE_MONITOR
2505  this_thr->th.th_team_bt_intervals =
2506  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2507  this_thr->th.th_team_bt_set =
2508  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2509 #else
2510  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2511 #endif
2512  }
2513  } // primary thread
2514 
2515  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2516  case bp_dist_bar: {
2517  __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2518  TRUE USE_ITT_BUILD_ARG(NULL));
2519  break;
2520  }
2521  case bp_hyper_bar: {
2522  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2523  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2524  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2525  break;
2526  }
2527  case bp_hierarchical_bar: {
2528  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2529  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2530  break;
2531  }
2532  case bp_tree_bar: {
2533  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2534  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2535  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2536  break;
2537  }
2538  default: {
2539  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2540  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2541  }
2542  }
2543 
2544 #if OMPT_SUPPORT
2545  if (ompt_enabled.enabled &&
2546  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2547  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2548  ompt_data_t *task_data = (team)
2549  ? OMPT_CUR_TASK_DATA(this_thr)
2550  : &(this_thr->th.ompt_thread_info.task_data);
2551  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2552 #if OMPT_OPTIONAL
2553  void *codeptr = NULL;
2554  if (KMP_MASTER_TID(ds_tid) &&
2555  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2556  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2557  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2558  if (ompt_enabled.ompt_callback_sync_region_wait) {
2559  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2560  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2561  codeptr);
2562  }
2563  if (ompt_enabled.ompt_callback_sync_region) {
2564  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2565  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2566  codeptr);
2567  }
2568 #endif
2569  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2570  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2571  ompt_scope_end, NULL, task_data, 0, ds_tid,
2572  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2573  }
2574  }
2575 #endif
2576 
2577  // Early exit for reaping threads releasing forkjoin barrier
2578  if (TCR_4(__kmp_global.g.g_done)) {
2579  this_thr->th.th_task_team = NULL;
2580 
2581 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2582  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2583  if (!KMP_MASTER_TID(tid)) {
2584  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2585  if (itt_sync_obj)
2586  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2587  }
2588  }
2589 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2590  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2591  return;
2592  }
2593 
2594  /* We can now assume that a valid team structure has been allocated by the
2595  primary thread and propagated to all worker threads. The current thread,
2596  however, may not be part of the team, so we can't blindly assume that the
2597  team pointer is non-null. */
2598  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2599  KMP_DEBUG_ASSERT(team != NULL);
2600  tid = __kmp_tid_from_gtid(gtid);
2601 
2602 #if KMP_BARRIER_ICV_PULL
2603  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2604  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2605  implicit task has this data before this function is called. We cannot
2606  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2607  thread struct, because it is not always the case that the threads arrays
2608  have been allocated when __kmp_fork_call() is executed. */
2609  {
2610  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2611  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2612  // Copy the initial ICVs from the primary thread's thread struct to the
2613  // implicit task for this tid.
2614  KA_TRACE(10,
2615  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2616  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2617  tid, FALSE);
2618  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2619  &team->t.t_threads[0]
2620  ->th.th_bar[bs_forkjoin_barrier]
2621  .bb.th_fixed_icvs);
2622  }
2623  }
2624 #endif // KMP_BARRIER_ICV_PULL
2625 
2626  if (__kmp_tasking_mode != tskm_immediate_exec) {
2627  __kmp_task_team_sync(this_thr, team);
2628  }
2629 
2630 #if KMP_AFFINITY_SUPPORTED
2631  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2632  if (proc_bind == proc_bind_intel) {
2633  // Call dynamic affinity settings
2634  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2635  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2636  }
2637  } else if (proc_bind != proc_bind_false) {
2638  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2639  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2640  __kmp_gtid_from_thread(this_thr),
2641  this_thr->th.th_current_place));
2642  } else {
2643  __kmp_affinity_set_place(gtid);
2644  }
2645  }
2646 #endif // KMP_AFFINITY_SUPPORTED
2647  // Perform the display affinity functionality
2648  if (__kmp_display_affinity) {
2649  if (team->t.t_display_affinity
2650 #if KMP_AFFINITY_SUPPORTED
2651  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2652 #endif
2653  ) {
2654  // NULL means use the affinity-format-var ICV
2655  __kmp_aux_display_affinity(gtid, NULL);
2656  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2657  this_thr->th.th_prev_level = team->t.t_level;
2658  }
2659  }
2660  if (!KMP_MASTER_TID(tid))
2661  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2662 
2663 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2664  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2665  if (!KMP_MASTER_TID(tid)) {
2666  // Get correct barrier object
2667  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2668  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2669  } // (prepare called inside barrier_release)
2670  }
2671 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2672  ANNOTATE_BARRIER_END(&team->t.t_bar);
2673  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2674  team->t.t_id, tid));
2675 }
2676 
2677 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2678  kmp_internal_control_t *new_icvs, ident_t *loc) {
2679  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2680 
2681  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2682  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2683 
2684 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2685  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2686  implicit task has this data before this function is called. */
2687 #if KMP_BARRIER_ICV_PULL
2688  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2689  remains untouched), where all of the worker threads can access them and
2690  make their own copies after the barrier. */
2691  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2692  // allocated at this point
2693  copy_icvs(
2694  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2695  new_icvs);
2696  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2697  team->t.t_threads[0], team));
2698 #elif KMP_BARRIER_ICV_PUSH
2699  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2700  // done here.
2701  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2702  team->t.t_threads[0], team));
2703 #else
2704  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2705  // time.
2706  ngo_load(new_icvs);
2707  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2708  // allocated at this point
2709  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2710  // TODO: GEH - pass in better source location info since usually NULL here
2711  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2712  f, team->t.t_threads[f], team));
2713  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2714  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2715  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2716  f, team->t.t_threads[f], team));
2717  }
2718  ngo_sync();
2719 #endif // KMP_BARRIER_ICV_PULL
2720 }