#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
void __kmp_print_structure(void); // forward declaration
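// distributedBarrier::computeVarsForN: given n threads, derive the barrier
// geometry (threads_per_go, num_gos, num_groups, gos_per_group,
// threads_per_group) from the machine topology when one is available.
// Rough worked example (assuming topology is detected and
// KMP_OPTIMIZE_FOR_REDUCTIONS is off): 2 sockets x 16 cores and n = 64 gives
// threads_per_go = 16 >> 1 = 8, num_gos = 64 / 8 = 8, num_groups = 8 / 2 = 4,
// gos_per_group = 8 / 4 = 2, threads_per_group = 8 * 2 = 16.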
void distributedBarrier::computeVarsForN(size_t n) {
52 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
53 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
54 int ncores_per_socket =
55 __kmp_topology->calculate_ratio(core_level, socket_level);
56 nsockets = __kmp_topology->get_count(socket_level);
60 if (ncores_per_socket <= 0)
61 ncores_per_socket = 1;
63 threads_per_go = ncores_per_socket >> 1;
64 if (!fix_threads_per_go) {
66 if (threads_per_go > 4) {
67 if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
68 threads_per_go = threads_per_go >> 1;
70 if (threads_per_go > 4 && nsockets == 1)
71 threads_per_go = threads_per_go >> 1;
    if (threads_per_go == 0)
      threads_per_go = 1;
    fix_threads_per_go = true;
77 num_gos = n / threads_per_go;
78 if (n % threads_per_go)
80 if (nsockets == 1 || num_gos == 1)
83 num_groups = num_gos / nsockets;
84 if (num_gos % nsockets)
89 gos_per_group = num_gos / num_groups;
90 if (num_gos % num_groups)
92 threads_per_group = threads_per_go * gos_per_group;
94 num_gos = n / threads_per_go;
95 if (n % threads_per_go)
100 num_groups = num_gos / 2;
104 gos_per_group = num_gos / num_groups;
105 if (num_gos % num_groups)
107 threads_per_group = threads_per_go * gos_per_group;
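// distributedBarrier::computeGo: choose the smallest num_gos such that
// IDEAL_CONTENTION * num_gos covers n, i.e. no more than roughly
// IDEAL_CONTENTION threads share a single go flag, capping num_gos at MAX_GOS.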
void distributedBarrier::computeGo(size_t n) {
  for (num_gos = 1;; num_gos++)
    if (IDEAL_CONTENTION * num_gos >= n)
      break;
  threads_per_go = n / num_gos;
  while (num_gos > MAX_GOS) {
    threads_per_go++;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
  }
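// distributedBarrier::resize: grow the per-thread flags/go/iter/sleep arrays
// so the barrier can handle at least nthr threads; max_threads is doubled to
// amortize future growth, and already-allocated arrays are reallocated.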
void distributedBarrier::resize(size_t nthr) {
131 KMP_DEBUG_ASSERT(nthr > max_threads);
134 max_threads = nthr * 2;
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }
  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));

  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));

  if (sleep)
    sleep =
        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
}
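// go_release: store the next go value into every go flag, releasing all
// threads waiting on this barrier instance; the released value (next_go) is
// handed back to the caller.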
165 kmp_uint64 distributedBarrier::go_release() {
166 kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
168 go[j].go.store(next_go);
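// go_reset: re-arm the barrier by marking every (iteration, thread) flag as
// still needed so the next barrier round starts from a clean state.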
void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
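// init: (re)initialize the barrier for nthr threads: resize storage if
// needed, reset all per-thread flags, recompute the geometry for the new
// thread count, and make sure space for the team ICVs exists.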
void distributedBarrier::init(size_t nthr) {
186 size_t old_max = max_threads;
187 if (nthr > max_threads) {
  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    sleep[i].sleep = false;
  }
202 computeVarsForN(nthr);
  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
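// __kmp_dist_barrier_wakeup: wake the (possibly sleeping) threads whose ids
// lie in [start, stop) stepping by inc, so they can observe the newly
// released go flag; only used when blocktime is not "infinite".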
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
216 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
219 kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
221 KMP_DEBUG_ASSERT(other_threads[thr]);
222 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
224 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
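// Distributed barrier, gather phase: each thread clears its own stillNeed
// flag; group leaders spin (executing tasks while they wait) until every
// thread in their group has arrived, and thread 0 then waits for all group
// leaders. Reductions, if requested, are folded in at each level.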
static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
234 distributedBarrier *b;
235 kmp_info_t **other_threads;
236 kmp_uint64 my_current_iter, my_next_iter;
240 team = this_thr->th.th_team;
241 nproc = this_thr->th.th_team_nproc;
242 other_threads = team->t.t_threads;
244 my_current_iter = b->iter[tid].iter;
245 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
246 group_leader = ((tid % b->threads_per_group) == 0);
  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
252 #if USE_ITT_BUILD && USE_ITT_NOTIFY
254 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
255 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
256 __itt_get_timestamp();
262 size_t group_start = tid + 1;
263 size_t group_end = tid + b->threads_per_group;
264 size_t threads_pending = 0;
266 if (group_end > nproc)
      for (size_t thr = group_start; thr < group_end; thr++) {
273 threads_pending += b->flags[my_current_iter][thr].stillNeed;
276 if (__kmp_tasking_mode != tskm_immediate_exec) {
277 kmp_task_team_t *task_team = this_thr->th.th_task_team;
278 if (task_team != NULL) {
279 if (TCR_SYNC_4(task_team->tt.tt_active)) {
280 if (KMP_TASKING_ENABLED(task_team)) {
281 int tasks_completed = FALSE;
282 __kmp_atomic_execute_tasks_64(
283 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
284 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
286 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
289 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
292 if (TCR_4(__kmp_global.g.g_done)) {
293 if (__kmp_global.g.g_abort)
294 __kmp_abort_thread();
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);
303 OMPT_REDUCTION_DECL(this_thr, gtid);
304 OMPT_REDUCTION_BEGIN;
      for (size_t thr = group_start; thr < group_end; thr++) {
307 (*reduce)(this_thr->th.th_local.reduce_data,
308 other_threads[thr]->th.th_local.reduce_data);
314 b->flags[my_next_iter][tid].stillNeed = 1;
317 b->flags[my_current_iter][tid].stillNeed = 0;
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
322 threads_pending += b->flags[my_current_iter][thr].stillNeed;
325 if (__kmp_tasking_mode != tskm_immediate_exec) {
326 kmp_task_team_t *task_team = this_thr->th.th_task_team;
327 if (task_team != NULL) {
328 if (TCR_SYNC_4(task_team->tt.tt_active)) {
329 if (KMP_TASKING_ENABLED(task_team)) {
330 int tasks_completed = FALSE;
331 __kmp_atomic_execute_tasks_64(
332 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
333 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
335 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
338 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
341 if (TCR_4(__kmp_global.g.g_done)) {
342 if (__kmp_global.g.g_abort)
343 __kmp_abort_thread();
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);
352 if (KMP_MASTER_TID(tid)) {
353 OMPT_REDUCTION_DECL(this_thr, gtid);
354 OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
357 (*reduce)(this_thr->th.th_local.reduce_data,
358 other_threads[thr]->th.th_local.reduce_data);
365 b->flags[my_next_iter][tid].stillNeed = 1;
368 b->flags[my_current_iter][tid].stillNeed = 0;
  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
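// Distributed barrier, release phase: the primary thread publishes the next
// go value on the leading go flag of every group; group leaders fan it out to
// the remaining go flags of their group and wake any sleeping group members.
// Workers just wait on the go flag of their own go index. ICVs are pushed
// down when propagate_icvs is set.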
static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
383 distributedBarrier *b;
384 kmp_bstate_t *thr_bar;
385 kmp_uint64 my_current_iter, next_go;
  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));
392 thr_bar = &this_thr->th.th_bar[bt].bb;
394 if (!KMP_MASTER_TID(tid)) {
397 if (this_thr->th.th_used_in_team.load() != 1 &&
398 this_thr->th.th_used_in_team.load() != 3) {
403 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
      if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, 0) ||
          this_thr->th.th_used_in_team.load() == 0) {
        my_flag.wait(this_thr, true, itt_sync_obj);
409 #if USE_ITT_BUILD && USE_ITT_NOTIFY
410 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
413 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
415 __kmp_itt_task_starting(itt_sync_obj);
417 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
420 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
421 if (itt_sync_obj != NULL)
423 __kmp_itt_task_finished(itt_sync_obj);
426 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
429 if (this_thr->th.th_used_in_team.load() != 1 &&
430 this_thr->th.th_used_in_team.load() != 3)
432 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
441 tid = __kmp_tid_from_gtid(gtid);
442 team = this_thr->th.th_team;
443 KMP_DEBUG_ASSERT(tid >= 0);
444 KMP_DEBUG_ASSERT(team);
446 my_current_iter = b->iter[tid].iter;
447 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
448 my_go_index = tid / b->threads_per_go;
449 if (this_thr->th.th_used_in_team.load() == 3) {
450 KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
453 if (b->go[my_go_index].go.load() != next_go) {
455 kmp_atomic_flag_64<false, true> my_flag(
456 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
      my_flag.wait(this_thr, true, itt_sync_obj);
458 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
459 b->iter[tid].iter == 0);
      KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
463 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
470 if (this_thr->th.th_used_in_team.load() == 1)
474 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
477 group_leader = ((tid % b->threads_per_group) == 0);
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
482 b->go[go_idx].go.store(next_go);
488 #if KMP_BARRIER_ICV_PUSH
489 if (propagate_icvs) {
490 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
492 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
493 (kmp_internal_control_t *)team->t.b->team_icvs);
494 copy_icvs(&thr_bar->th_fixed_icvs,
495 &team->t.t_implicit_task_taskdata[tid].td_icvs);
498 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
501 size_t nproc = this_thr->th.th_team_nproc;
502 size_t group_end = tid + b->threads_per_group;
503 if (nproc < group_end)
505 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
508 team = this_thr->th.th_team;
510 my_current_iter = b->iter[tid].iter;
511 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
512 #if KMP_BARRIER_ICV_PUSH
513 if (propagate_icvs) {
515 copy_icvs(&thr_bar->th_fixed_icvs,
516 &team->t.t_implicit_task_taskdata[tid].td_icvs);
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
521 b->go[go_idx].go.store(next_go);
524 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
526 size_t nproc = this_thr->th.th_team_nproc;
527 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
528 b->threads_per_group, tid);
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
533 b->go[go_idx].go.store(next_go);
539 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
541 size_t nproc = this_thr->th.th_team_nproc;
542 size_t group_end = tid + b->threads_per_group;
543 if (nproc < group_end)
545 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
549 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
550 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
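// Linear barrier, gather phase: workers just bump their own b_arrived flag;
// the primary thread waits on every worker's b_arrived in turn, performs the
// reduction, and records the new arrived state for the team. The cancellable
// instantiation returns true when the barrier was cancelled.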
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
562 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
563 kmp_team_t *team = this_thr->th.th_team;
564 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
565 kmp_info_t **other_threads = team->t.t_threads;
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
571 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
573 #if USE_ITT_BUILD && USE_ITT_NOTIFY
575 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
576 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
577 __itt_get_timestamp();
582 if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
593 ANNOTATE_BARRIER_BEGIN(this_thr);
594 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
597 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
598 int nproc = this_thr->th.th_team_nproc;
601 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
604 for (i = 1; i < nproc; ++i) {
608 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
610 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
611 "arrived(%p) == %llu\n",
612 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
614 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
618 kmp_flag_64<true, false> flag(
619 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
620 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
623 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
625 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
627 ANNOTATE_BARRIER_END(other_threads[i]);
628 #if USE_ITT_BUILD && USE_ITT_NOTIFY
631 if (__kmp_forkjoin_frames_mode == 2) {
632 this_thr->th.th_bar_min_time = KMP_MIN(
633 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
638 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
639 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
641 ANNOTATE_REDUCE_AFTER(reduce);
642 OMPT_REDUCTION_DECL(this_thr, gtid);
643 OMPT_REDUCTION_BEGIN;
644 (*reduce)(this_thr->th.th_local.reduce_data,
645 other_threads[i]->th.th_local.reduce_data);
647 ANNOTATE_REDUCE_BEFORE(reduce);
648 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
652 team_bar->b_arrived = new_state;
653 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
654 "arrived(%p) = %llu\n",
655 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
660 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
661 gtid, team->t.t_id, tid, bt));
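// Linear barrier, release phase: the primary thread pushes ICVs (if
// requested) and then bumps each worker's b_go flag; every worker waits only
// on its own b_go.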
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
669 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
670 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
673 if (KMP_MASTER_TID(tid)) {
675 kmp_uint32 nproc = this_thr->th.th_team_nproc;
676 kmp_info_t **other_threads;
678 team = __kmp_threads[gtid]->th.th_team;
679 KMP_DEBUG_ASSERT(team != NULL);
680 other_threads = team->t.t_threads;
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
687 #if KMP_BARRIER_ICV_PUSH
689 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
690 if (propagate_icvs) {
691 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
692 for (i = 1; i < nproc; ++i) {
693 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
695 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
696 &team->t.t_implicit_task_taskdata[0].td_icvs);
704 for (i = 1; i < nproc; ++i) {
708 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
712 (
"__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
713 "go(%p): %u => %u\n",
714 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
715 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
716 other_threads[i]->th.th_bar[bt].bb.b_go,
717 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
718 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
719 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
728 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
729 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
732 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
733 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
735 ANNOTATE_BARRIER_END(this_thr);
736 #if USE_ITT_BUILD && USE_ITT_NOTIFY
737 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
740 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
742 __kmp_itt_task_starting(itt_sync_obj);
744 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
747 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
748 if (itt_sync_obj != NULL)
750 __kmp_itt_task_finished(itt_sync_obj);
754 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
758 tid = __kmp_tid_from_gtid(gtid);
759 team = __kmp_threads[gtid]->th.th_team;
761 KMP_DEBUG_ASSERT(team != NULL);
762 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
764 (
"__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
765 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
770 (
"__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
771 gtid, team->t.t_id, tid, bt));
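// Non-template wrappers selecting the cancellable / non-cancellable
// instantiations of the linear barrier.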
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
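// Tree barrier, gather phase: each thread waits for its children (determined
// by tid and the branch factor), folds in their reduction data, then signals
// its own parent via b_arrived.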
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
807 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
808 kmp_team_t *team = this_thr->th.th_team;
809 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
810 kmp_info_t **other_threads = team->t.t_threads;
811 kmp_uint32 nproc = this_thr->th.th_team_nproc;
812 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
813 kmp_uint32 branch_factor = 1 << branch_bits;
815 kmp_uint32 child_tid;
816 kmp_uint64 new_state = 0;
819 20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
820 gtid, team->t.t_id, tid, bt));
821 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
823 #if USE_ITT_BUILD && USE_ITT_NOTIFY
825 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
826 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
827 __itt_get_timestamp();
832 child_tid = (tid << branch_bits) + 1;
833 if (child_tid < nproc) {
835 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
838 kmp_info_t *child_thr = other_threads[child_tid];
839 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
842 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
844 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
847 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
848 "arrived(%p) == %llu\n",
849 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
850 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
852 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
853 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
854 ANNOTATE_BARRIER_END(child_thr);
855 #if USE_ITT_BUILD && USE_ITT_NOTIFY
858 if (__kmp_forkjoin_frames_mode == 2) {
859 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
860 child_thr->th.th_bar_min_time);
865 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
866 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
867 team->t.t_id, child_tid));
868 ANNOTATE_REDUCE_AFTER(reduce);
869 OMPT_REDUCTION_DECL(this_thr, gtid);
870 OMPT_REDUCTION_BEGIN;
871 (*reduce)(this_thr->th.th_local.reduce_data,
872 child_thr->th.th_local.reduce_data);
874 ANNOTATE_REDUCE_BEFORE(reduce);
875 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
    } while (child <= branch_factor && child_tid < nproc);
882 if (!KMP_MASTER_TID(tid)) {
883 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
886 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
887 "arrived(%p): %llu => %llu\n",
888 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
889 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
890 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
896 ANNOTATE_BARRIER_BEGIN(this_thr);
897 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
902 team->t.t_bar[bt].b_arrived = new_state;
904 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
905 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
906 "arrived(%p) = %llu\n",
907 gtid, team->t.t_id, tid, team->t.t_id,
908 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
911 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
912 gtid, team->t.t_id, tid, bt));
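// Tree barrier, release phase: each thread waits on its own b_go, then
// releases its children in tree order, pushing ICVs down when requested.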
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
918 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
920 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
922 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
923 kmp_uint32 branch_factor = 1 << branch_bits;
925 kmp_uint32 child_tid;
  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
933 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
934 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
935 ANNOTATE_BARRIER_END(this_thr);
936 #if USE_ITT_BUILD && USE_ITT_NOTIFY
937 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
940 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
942 __kmp_itt_task_starting(itt_sync_obj);
944 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
947 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
948 if (itt_sync_obj != NULL)
950 __kmp_itt_task_finished(itt_sync_obj);
954 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
958 team = __kmp_threads[gtid]->th.th_team;
959 KMP_DEBUG_ASSERT(team != NULL);
960 tid = __kmp_tid_from_gtid(gtid);
962 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
964 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
965 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
968 team = __kmp_threads[gtid]->th.th_team;
969 KMP_DEBUG_ASSERT(team != NULL);
970 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
972 gtid, team->t.t_id, tid, bt));
974 nproc = this_thr->th.th_team_nproc;
975 child_tid = (tid << branch_bits) + 1;
977 if (child_tid < nproc) {
978 kmp_info_t **other_threads = team->t.t_threads;
982 kmp_info_t *child_thr = other_threads[child_tid];
983 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
986 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
988 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
991 #if KMP_BARRIER_ICV_PUSH
993 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
994 if (propagate_icvs) {
995 __kmp_init_implicit_task(team->t.t_ident,
996 team->t.t_threads[child_tid], team,
998 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
999 &team->t.t_implicit_task_taskdata[0].td_icvs);
1004 (
"__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1005 "go(%p): %u => %u\n",
1006 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1007 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1008 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1010 ANNOTATE_BARRIER_BEGIN(child_thr);
1011 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
    } while (child <= branch_factor && child_tid < nproc);
1018 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1019 gtid, team->t.t_id, tid, bt));
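// Hypercube-embedded tree barrier, gather phase: at each level a thread
// either signals its parent (when it is not aligned on that level) or
// collects its children for that level, performing the reduction as it goes.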
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1026 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1027 kmp_team_t *team = this_thr->th.th_team;
1028 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1029 kmp_info_t **other_threads = team->t.t_threads;
1030 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1031 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1032 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1033 kmp_uint32 branch_factor = 1 << branch_bits;
1039 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1040 gtid, team->t.t_id, tid, bt));
1041 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1043 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1045 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1046 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1047 __itt_get_timestamp();
1052 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1053 for (level = 0, offset = 1; offset < num_threads;
1054 level += branch_bits, offset <<= branch_bits) {
1056 kmp_uint32 child_tid;
1058 if (((tid >> level) & (branch_factor - 1)) != 0) {
1059 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1063 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1064 "arrived(%p): %llu => %llu\n",
1065 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1066 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1068 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1073 ANNOTATE_BARRIER_BEGIN(this_thr);
1074 p_flag.set_waiter(other_threads[parent_tid]);
1080 if (new_state == KMP_BARRIER_UNUSED_STATE)
1081 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1082 for (child = 1, child_tid = tid + (1 << level);
1083 child < branch_factor && child_tid < num_threads;
1084 child++, child_tid += (1 << level)) {
1085 kmp_info_t *child_thr = other_threads[child_tid];
1086 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1087 #if KMP_CACHE_MANAGE
1088 kmp_uint32 next_child_tid = child_tid + (1 << level);
1090 if (child + 1 < branch_factor && next_child_tid < num_threads)
1092 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1095 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1096 "arrived(%p) == %llu\n",
1097 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1098 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1100 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1101 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1102 ANNOTATE_BARRIER_END(child_thr);
1104 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1107 if (__kmp_forkjoin_frames_mode == 2) {
1108 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1109 child_thr->th.th_bar_min_time);
1114 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1115 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1116 team->t.t_id, child_tid));
1117 ANNOTATE_REDUCE_AFTER(reduce);
1118 OMPT_REDUCTION_DECL(this_thr, gtid);
1119 OMPT_REDUCTION_BEGIN;
1120 (*reduce)(this_thr->th.th_local.reduce_data,
1121 child_thr->th.th_local.reduce_data);
1123 ANNOTATE_REDUCE_BEFORE(reduce);
1124 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1129 if (KMP_MASTER_TID(tid)) {
1131 if (new_state == KMP_BARRIER_UNUSED_STATE)
1132 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1134 team->t.t_bar[bt].b_arrived = new_state;
1135 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1136 "arrived(%p) = %llu\n",
1137 gtid, team->t.t_id, tid, team->t.t_id,
1138 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1141 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1142 gtid, team->t.t_id, tid, bt));
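// Hypercube-embedded tree barrier, release phase. With KMP_REVERSE_HYPER_BAR
// defined (below), the levels and children are walked in the reverse of the
// order used by the gather phase.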
1146 #define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1150 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1152 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1153 kmp_info_t **other_threads;
1154 kmp_uint32 num_threads;
1155 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1156 kmp_uint32 branch_factor = 1 << branch_bits;
1158 kmp_uint32 child_tid;
1166 if (KMP_MASTER_TID(tid)) {
1167 team = __kmp_threads[gtid]->th.th_team;
1168 KMP_DEBUG_ASSERT(team != NULL);
1169 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1170 "barrier type %d\n",
1171 gtid, team->t.t_id, tid, bt));
1172 #if KMP_BARRIER_ICV_PUSH
1173 if (propagate_icvs) {
1174 copy_icvs(&thr_bar->th_fixed_icvs,
1175 &team->t.t_implicit_task_taskdata[tid].td_icvs);
  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1182 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1183 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1184 ANNOTATE_BARRIER_END(this_thr);
1185 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1186 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1188 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1190 __kmp_itt_task_starting(itt_sync_obj);
1192 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1195 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1196 if (itt_sync_obj != NULL)
1198 __kmp_itt_task_finished(itt_sync_obj);
1202 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1206 team = __kmp_threads[gtid]->th.th_team;
1207 KMP_DEBUG_ASSERT(team != NULL);
1208 tid = __kmp_tid_from_gtid(gtid);
1210 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1212 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1213 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1216 num_threads = this_thr->th.th_team_nproc;
1217 other_threads = team->t.t_threads;
1219 #ifdef KMP_REVERSE_HYPER_BAR
1221 for (level = 0, offset = 1;
1222 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1223 level += branch_bits, offset <<= branch_bits)
1227 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1228 level -= branch_bits, offset >>= branch_bits)
1231 for (level = 0, offset = 1; offset < num_threads;
1232 level += branch_bits, offset <<= branch_bits)
1235 #ifdef KMP_REVERSE_HYPER_BAR
1238 child = num_threads >> ((level == 0) ? level : level - 1);
1239 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1240 child_tid = tid + (child << level);
1241 child >= 1; child--, child_tid -= (1 << level))
1243 if (((tid >> level) & (branch_factor - 1)) != 0)
1248 for (child = 1, child_tid = tid + (1 << level);
1249 child < branch_factor && child_tid < num_threads;
1250 child++, child_tid += (1 << level))
1253 if (child_tid >= num_threads)
1256 kmp_info_t *child_thr = other_threads[child_tid];
1257 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1258 #if KMP_CACHE_MANAGE
1259 kmp_uint32 next_child_tid = child_tid - (1 << level);
1261 #ifdef KMP_REVERSE_HYPER_BAR
1262 if (child - 1 >= 1 && next_child_tid < num_threads)
1264 if (child + 1 < branch_factor && next_child_tid < num_threads)
1267 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1270 #if KMP_BARRIER_ICV_PUSH
1272 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1277 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1278 "go(%p): %u => %u\n",
1279 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1280 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1281 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1283 ANNOTATE_BARRIER_BEGIN(child_thr);
1284 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1289 #if KMP_BARRIER_ICV_PUSH
1290 if (propagate_icvs &&
1291 !KMP_MASTER_TID(tid)) {
1292 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1294 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1295 &thr_bar->th_fixed_icvs);
1300 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1301 gtid, team->t.t_id, tid, bt));
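// __kmp_init_hierarchical_barrier_thread: (re)compute this thread's position
// in the machine hierarchy (my_level, parent_tid, offset, leaf_kids,
// leaf_state) whenever the team, team size, or tid has changed; the caller
// uses the return value as a "team changed" indicator.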
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
1319 bool uninitialized = thr_bar->team == NULL;
1320 bool team_changed = team != thr_bar->team;
1321 bool team_sz_changed = nproc != thr_bar->nproc;
1322 bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;
1325 if (uninitialized || team_sz_changed) {
1326 __kmp_get_hierarchy(nproc, thr_bar);
1329 if (uninitialized || team_sz_changed || tid_changed) {
1330 thr_bar->my_level = thr_bar->depth - 1;
1331 thr_bar->parent_tid = -1;
1332 if (!KMP_MASTER_TID(tid)) {
1335 while (d < thr_bar->depth) {
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1345 thr_bar->parent_tid = tid - rem;
1346 thr_bar->my_level = d;
1352 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1353 (thr_bar->skip_per_level[thr_bar->my_level])),
1354 &(thr_bar->offset));
1355 thr_bar->old_tid = tid;
1356 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1357 thr_bar->team = team;
1358 thr_bar->parent_bar =
1359 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1361 if (uninitialized || team_changed || tid_changed) {
1362 thr_bar->team = team;
1363 thr_bar->parent_bar =
1364 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1367 if (uninitialized || team_sz_changed || tid_changed) {
1368 thr_bar->nproc = nproc;
1369 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1370 if (thr_bar->my_level == 0)
1371 thr_bar->leaf_kids = 0;
1372 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1373 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1374 thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
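// Hierarchical (machine-topology aware) barrier, gather phase: leaf children
// signal bytes inside their parent's b_arrived word when the on-core
// optimization is usable, otherwise ordinary 64-bit flags are used; inner
// levels are gathered level by level using the skip_per_level strides.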
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1384 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1385 kmp_team_t *team = this_thr->th.th_team;
1386 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1387 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1388 kmp_info_t **other_threads = team->t.t_threads;
1389 kmp_uint64 new_state = 0;
1391 int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1401 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1402 "barrier type %d\n",
1403 gtid, team->t.t_id, tid, bt));
1404 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1408 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1409 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1413 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1416 if (thr_bar->my_level) {
1417 kmp_int32 child_tid;
1419 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1420 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1421 thr_bar->use_oncore_barrier) {
1422 if (thr_bar->leaf_kids) {
1424 kmp_uint64 leaf_state =
1426 ? thr_bar->b_arrived | thr_bar->leaf_state
1427 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1428 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1430 gtid, team->t.t_id, tid));
1431 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1432 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1434 ANNOTATE_REDUCE_AFTER(reduce);
1435 OMPT_REDUCTION_DECL(this_thr, gtid);
1436 OMPT_REDUCTION_BEGIN;
1437 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1439 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1441 gtid, team->t.t_id, tid,
1442 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1444 ANNOTATE_BARRIER_END(other_threads[child_tid]);
1445 (*reduce)(this_thr->th.th_local.reduce_data,
1446 other_threads[child_tid]->th.th_local.reduce_data);
1449 ANNOTATE_REDUCE_BEFORE(reduce);
1450 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1453 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1456 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1458 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1459 skip = thr_bar->skip_per_level[d];
1462 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1463 kmp_info_t *child_thr = other_threads[child_tid];
1464 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1465 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467 "arrived(%p) == %llu\n",
1468 gtid, team->t.t_id, tid,
1469 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1470 child_tid, &child_bar->b_arrived, new_state));
1471 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1472 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1473 ANNOTATE_BARRIER_END(child_thr);
1475 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1477 gtid, team->t.t_id, tid,
1478 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1480 ANNOTATE_REDUCE_AFTER(reduce);
1481 (*reduce)(this_thr->th.th_local.reduce_data,
1482 child_thr->th.th_local.reduce_data);
1483 ANNOTATE_REDUCE_BEFORE(reduce);
1484 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1489 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1491 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1492 skip = thr_bar->skip_per_level[d];
1495 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1496 kmp_info_t *child_thr = other_threads[child_tid];
1497 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1498 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1500 "arrived(%p) == %llu\n",
1501 gtid, team->t.t_id, tid,
1502 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1503 child_tid, &child_bar->b_arrived, new_state));
1504 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1505 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1506 ANNOTATE_BARRIER_END(child_thr);
1508 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1510 gtid, team->t.t_id, tid,
1511 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1513 ANNOTATE_REDUCE_AFTER(reduce);
1514 (*reduce)(this_thr->th.th_local.reduce_data,
1515 child_thr->th.th_local.reduce_data);
1516 ANNOTATE_REDUCE_BEFORE(reduce);
1517 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1525 if (!KMP_MASTER_TID(tid)) {
1526 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1527 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1528 gtid, team->t.t_id, tid,
1529 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1530 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1531 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1535 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1536 !thr_bar->use_oncore_barrier) {
1538 ANNOTATE_BARRIER_BEGIN(this_thr);
1539 kmp_flag_64<> flag(&thr_bar->b_arrived,
1540 other_threads[thr_bar->parent_tid]);
1544 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1545 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1546 thr_bar->offset + 1);
1547 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1551 team->t.t_bar[bt].b_arrived = new_state;
1552 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1553 "arrived(%p) = %llu\n",
1554 gtid, team->t.t_id, tid, team->t.t_id,
1555 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1558 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1559 "barrier type %d\n",
1560 gtid, team->t.t_id, tid, bt));
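// Hierarchical barrier, release phase: non-primary threads wait either on
// their own b_go or, with the on-core optimization, on a byte of the parent's
// b_go; ICVs are propagated down the hierarchy, and each level's children are
// released using the skip_per_level strides (leaf children can be released by
// flipping leaf_state bits directly).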
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1566 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1568 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  bool team_change = false;
1572 if (KMP_MASTER_TID(tid)) {
1573 team = __kmp_threads[gtid]->th.th_team;
1574 KMP_DEBUG_ASSERT(team != NULL);
1575 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1576 "entered barrier type %d\n",
1577 gtid, team->t.t_id, tid, bt));
1580 if (!thr_bar->use_oncore_barrier ||
1581 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1582 thr_bar->team == NULL) {
1584 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1585 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1586 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1587 ANNOTATE_BARRIER_END(this_thr);
1588 TCW_8(thr_bar->b_go,
1589 KMP_INIT_BARRIER_STATE);
1593 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1594 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1595 thr_bar->offset + 1, bt,
1596 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1597 flag.wait(this_thr, TRUE);
1598 if (thr_bar->wait_flag ==
1599 KMP_BARRIER_SWITCHING) {
1600 TCW_8(thr_bar->b_go,
1601 KMP_INIT_BARRIER_STATE);
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1607 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1609 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1612 team = __kmp_threads[gtid]->th.th_team;
1613 KMP_DEBUG_ASSERT(team != NULL);
1614 tid = __kmp_tid_from_gtid(gtid);
1618 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1619 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1623 nproc = this_thr->th.th_team_nproc;
1624 int level = team->t.t_level;
1625 if (team->t.t_threads[0]
1626 ->th.th_teams_microtask) {
1627 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1628 this_thr->th.th_teams_level == level)
1630 if (this_thr->th.th_teams_size.nteams > 1)
1634 thr_bar->use_oncore_barrier = 1;
1636 thr_bar->use_oncore_barrier = 0;
1640 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1641 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1642 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1648 #if KMP_BARRIER_ICV_PUSH
1649 if (propagate_icvs) {
1650 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1654 copy_icvs(&thr_bar->th_fixed_icvs,
1655 &team->t.t_implicit_task_taskdata[tid].td_icvs);
  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
             thr_bar->use_oncore_barrier) {
1658 if (!thr_bar->my_level)
1661 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1662 &thr_bar->parent_bar->th_fixed_icvs);
1665 if (thr_bar->my_level)
1667 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1669 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1670 &thr_bar->parent_bar->th_fixed_icvs);
1676 if (thr_bar->my_level) {
1677 kmp_int32 child_tid;
1679 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1680 thr_bar->use_oncore_barrier) {
1681 if (KMP_MASTER_TID(tid)) {
1684 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1687 ngo_load(&thr_bar->th_fixed_icvs);
      for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
           child_tid += thr_bar->skip_per_level[1]) {
1692 kmp_bstate_t *child_bar =
1693 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1694 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1695 "releasing T#%d(%d:%d)"
1696 " go(%p): %u => %u\n",
1697 gtid, team->t.t_id, tid,
1698 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1699 child_tid, &child_bar->b_go, child_bar->b_go,
1700 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1703 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1707 TCW_8(thr_bar->b_go,
1708 KMP_INIT_BARRIER_STATE);
1710 if (thr_bar->leaf_kids) {
1713 old_leaf_kids < thr_bar->leaf_kids) {
1714 if (old_leaf_kids) {
1715 thr_bar->b_go |= old_leaf_state;
1718 last = tid + thr_bar->skip_per_level[1];
1721 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1723 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1724 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1727 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1728 " T#%d(%d:%d) go(%p): %u => %u\n",
1729 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1730 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1731 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1733 ANNOTATE_BARRIER_BEGIN(child_thr);
1734 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1739 thr_bar->b_go |= thr_bar->leaf_state;
    for (int d = thr_bar->my_level - 1; d >= 0;
         --d) { // Release other hierarchy levels
      last = tid + thr_bar->skip_per_level[d + 1];
1746 kmp_uint32 skip = thr_bar->skip_per_level[d];
1749 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1750 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1751 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1752 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1753 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1754 gtid, team->t.t_id, tid,
1755 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1756 child_tid, &child_bar->b_go, child_bar->b_go,
1757 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1759 ANNOTATE_BARRIER_BEGIN(child_thr);
1760 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1765 #if KMP_BARRIER_ICV_PUSH
1766 if (propagate_icvs && !KMP_MASTER_TID(tid))
1768 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1769 &thr_bar->th_fixed_icvs);
1772 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1773 "barrier type %d\n",
1774 gtid, team->t.t_id, tid, bt));
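// Tiny helper type used by __kmp_barrier_template below: the <true>
// specialization carries a real "was the barrier cancelled?" bool, while the
// <false> specialization collapses to a constant false so the non-cancellable
// instantiation costs nothing.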
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
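// __kmp_barrier_template: internal entry point for a full barrier. It runs
// the gather phase using the configured pattern (dist/hyper/hierarchical/
// tree/linear), lets the primary thread drain the task team and handle any
// pending cancellation request, and then runs the release phase unless this
// is a split barrier; the cancellable instantiation additionally reports
// whether the barrier was cancelled.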
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
1810 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1811 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1812 int tid = __kmp_tid_from_gtid(gtid);
1813 kmp_info_t *this_thr = __kmp_threads[gtid];
1814 kmp_team_t *team = this_thr->th.th_team;
1816 is_cancellable<cancellable> cancelled;
1817 #if OMPT_SUPPORT && OMPT_OPTIONAL
1818 ompt_data_t *my_task_data;
1819 ompt_data_t *my_parallel_data;
1820 void *return_address;
1821 ompt_sync_region_t barrier_kind;
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1827 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1829 if (ompt_enabled.enabled) {
1831 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1832 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1833 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1834 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1835 if (ompt_enabled.ompt_callback_sync_region) {
1836 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1837 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1840 if (ompt_enabled.ompt_callback_sync_region_wait) {
1841 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1842 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1849 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1853 if (!team->t.t_serialized) {
1856 void *itt_sync_obj = NULL;
1858 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1859 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1862 if (__kmp_tasking_mode == tskm_extra_barrier) {
1863 __kmp_tasking_barrier(team, this_thr, gtid);
1865 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1866 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1873 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1875 this_thr->th.th_team_bt_intervals =
1876 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1877 this_thr->th.th_team_bt_set =
1878 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1880 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1885 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1886 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1890 if (KMP_MASTER_TID(tid)) {
1891 team->t.t_bar[bt].b_master_arrived += 1;
1893 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1896 if (reduce != NULL) {
1898 this_thr->th.th_local.reduce_data = reduce_data;
1901 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1903 __kmp_task_team_setup(this_thr, team, 0);
1906 cancelled = __kmp_linear_barrier_gather_cancellable(
1907 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1909 switch (__kmp_barrier_gather_pattern[bt]) {
1911 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1912 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1915 case bp_hyper_bar: {
1917 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1918 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1919 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1922 case bp_hierarchical_bar: {
1923 __kmp_hierarchical_barrier_gather(
1924 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1929 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1930 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1931 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1935 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1936 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1943 if (KMP_MASTER_TID(tid)) {
1945 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1946 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1951 team->t.t_bar[bt].b_team_arrived += 1;
1954 if (__kmp_omp_cancellation) {
1955 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1957 if (cancel_request == cancel_loop ||
1958 cancel_request == cancel_sections) {
1959 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1967 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1968 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1970 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1972 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1973 __kmp_forkjoin_frames_mode &&
1974 (this_thr->th.th_teams_microtask == NULL ||
1975 this_thr->th.th_teams_size.nteams == 1) &&
1976 team->t.t_active_level == 1) {
1977 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1978 kmp_uint64 cur_time = __itt_get_timestamp();
1979 kmp_info_t **other_threads = team->t.t_threads;
1980 int nproc = this_thr->th.th_team_nproc;
1982 switch (__kmp_forkjoin_frames_mode) {
1984 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1986 this_thr->th.th_frame_time = cur_time;
1990 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1994 if (__itt_metadata_add_ptr) {
1996 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1999 this_thr->th.th_bar_arrive_time = 0;
2000 for (i = 1; i < nproc; ++i) {
2001 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2002 other_threads[i]->th.th_bar_arrive_time = 0;
2004 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2006 (kmp_uint64)(reduce != NULL));
2008 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2010 this_thr->th.th_frame_time = cur_time;
2018 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2019 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2022 if ((status == 1 || !is_split) && !cancelled) {
2024 cancelled = __kmp_linear_barrier_release_cancellable(
2025 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2027 switch (__kmp_barrier_release_pattern[bt]) {
2029 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2030 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2031 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2034 case bp_hyper_bar: {
2035 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2036 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2037 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2040 case bp_hierarchical_bar: {
2041 __kmp_hierarchical_barrier_release(
2042 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2046 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2047 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2048 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2052 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2053 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2057 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2058 __kmp_task_team_sync(this_thr, team);
2066 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2067 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2071 if (__kmp_tasking_mode != tskm_immediate_exec) {
2072 if (this_thr->th.th_task_team != NULL) {
2074 void *itt_sync_obj = NULL;
2075 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2076 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2077 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2081 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
2083 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2084 __kmp_task_team_setup(this_thr, team, 0);
2087 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2088 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));
2098 if (ompt_enabled.enabled) {
2100 if (ompt_enabled.ompt_callback_sync_region_wait) {
2101 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2102 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2105 if (ompt_enabled.ompt_callback_sync_region) {
2106 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2107 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2111 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2114 ANNOTATE_BARRIER_END(&team->t.t_bar);
  return (int)cancelled;
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (!KMP_MASTER_TID(tid)) {
        // Workers need to revert their private b_arrived flag; the primary
        // thread does not need to revert anything.
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif
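// __kmp_end_split_barrier runs only the release half of a barrier: for a split
// barrier, the primary thread calls it once it has finished the work (e.g. the
// final part of a reduction) that followed the gather half, waking the workers.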
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      }
    }
  }
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}
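// __kmp_join_barrier: the implicit barrier executed by every thread of a team
// at the end of a parallel region. All threads gather here; the primary thread
// then finishes any outstanding tasks before the team is torn down or reused.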
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_info_t *master_thread;
  void *itt_sync_obj = NULL;
  // Get the object created at the fork barrier
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);

  // Get current info
  kmp_team_t *team = this_thr->th.th_team;
  int nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  int tid = __kmp_tid_from_gtid(gtid);
  int team_id = team->t.t_id;
  master_thread = this_thr->th.th_team_master;
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
  if (ompt_enabled.enabled) {
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    if (this_thr->th.th_task_team)
      KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                       team->t.t_task_team[this_thr->th.th_task_state]);
  }
  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to still exist. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
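  // After the gather, per-team cleanup is done only by the primary thread: it
  // drains the task team and, when statistics are enabled, flags the workers
  // as idle (waking any that are sleeping).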
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region, not the barrier.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside a single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with the primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Reset arrive times; the same is done for the workers in the loop
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  } else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }

  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}
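// __kmp_fork_barrier: the barrier executed at the start of a parallel region
// (and by workers waiting between regions). Workers block here until the
// primary thread releases them; ICVs, task teams, and affinity settings are
// propagated on the way out.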
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
  void *itt_sync_obj = NULL;

  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));
  // The th_team pointer is only valid for the primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create the itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

    // Verify state
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    for (int i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup of the current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
    }
  }
  // Early exit for reaping threads releasing the forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);
#if KMP_BARRIER_ICV_PULL
  /* Non-primary threads pull their ICVs from the primary thread's fixed ICV
     copy in its fork/join barrier struct. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // the primary thread already has the ICVs
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }
#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display-affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get the correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
    }
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
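// __kmp_setup_icv_copy selects how new ICVs reach the workers' implicit tasks:
// PULL (workers copy from the primary thread's fixed ICVs during the fork
// barrier), PUSH (the barrier release itself propagates them), or a linear
// O(nthreads) copy performed right here.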
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

#if KMP_BARRIER_ICV_PULL
  // Copy the ICVs into the primary thread's fixed ICV slot in its fork/join
  // barrier struct, where the workers can pull them after the fork barrier.
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // threads array should be allocated
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads; this takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // threads array should be allocated
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    KF_TRACE(10,
             ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f,
              team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10,
             ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f,
              team->t.t_threads[f], team));