LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #include "tsan_annotations.h"
51 
52 #if KMP_OS_WINDOWS
53 // Windows does not need these include files because it doesn't use shared memory
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63  KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71  KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87  int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89  kmp_internal_control_t *new_icvs,
90  ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93  int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99  kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113  int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 /* Calculate the identifier of the current thread */
117 /* A fast (and somewhat portable) way to get a unique identifier for the
118  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
119 int __kmp_get_global_thread_id() {
120  int i;
121  kmp_info_t **other_threads;
122  size_t stack_data;
123  char *stack_addr;
124  size_t stack_size;
125  char *stack_base;
126 
127  KA_TRACE(
128  1000,
129  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
130  __kmp_nth, __kmp_all_nth));
131 
132  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135  __kmp_init_gtid for this to work. */
136 
137  if (!TCR_4(__kmp_init_gtid))
138  return KMP_GTID_DNE;
139 
140 #ifdef KMP_TDATA_GTID
141  if (TCR_4(__kmp_gtid_mode) >= 3) {
142  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143  return __kmp_gtid;
144  }
145 #endif
146  if (TCR_4(__kmp_gtid_mode) >= 2) {
147  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148  return __kmp_gtid_get_specific();
149  }
150  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151 
152  stack_addr = (char *)&stack_data;
153  other_threads = __kmp_threads;
154 
155  /* ATT: The code below is a source of potential bugs due to unsynchronized
156  access to __kmp_threads array. For example:
157  1. Current thread loads other_threads[i] to thr and checks it, it is
158  non-NULL.
159  2. Current thread is suspended by OS.
160  3. Another thread unregisters and finishes (debug versions of free()
161  may fill memory with something like 0xEF).
162  4. Current thread is resumed.
163  5. Current thread reads junk from *thr.
164  TODO: Fix it. --ln */
165 
166  for (i = 0; i < __kmp_threads_capacity; i++) {
167 
168  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169  if (!thr)
170  continue;
171 
172  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174 
175  /* stack grows down -- search through all of the active threads */
176 
177  if (stack_addr <= stack_base) {
178  size_t stack_diff = stack_base - stack_addr;
179 
180  if (stack_diff <= stack_size) {
181  /* The only way we can be closer than the allocated */
182  /* stack size is if we are running on this thread. */
183  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
184  return i;
185  }
186  }
187  }
188 
189  /* get specific to try and determine our gtid */
190  KA_TRACE(1000,
191  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
192  "thread, using TLS\n"));
193  i = __kmp_gtid_get_specific();
194 
195  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
196 
197  /* if we haven't been assigned a gtid, then return the code */
198  if (i < 0)
199  return i;
200 
201  /* dynamically updated stack window for uber threads to avoid get_specific
202  call */
203  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
204  KMP_FATAL(StackOverflow, i);
205  }
206 
207  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
208  if (stack_addr > stack_base) {
209  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
210  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
211  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
212  stack_base);
213  } else {
214  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
215  stack_base - stack_addr);
216  }
217 
218  /* Reprint stack bounds for ubermaster since they have been refined */
219  if (__kmp_storage_map) {
220  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
221  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
222  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
223  other_threads[i]->th.th_info.ds.ds_stacksize,
224  "th_%d stack (refinement)", i);
225  }
226  return i;
227 }
228 
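/* Illustrative sketch (not compiled into the runtime): the stack-window test
   used by __kmp_get_global_thread_id() above. Assuming a downward-growing
   stack, an address belongs to a thread's stack iff it is at or below the
   recorded base and within stacksize bytes of it. The helper name below is
   hypothetical. */
#if 0
static int sketch_stack_contains(char *stack_base, size_t stack_size,
                                 char *addr) {
  /* stack_base is the high end of the stack; the stack grows downward. */
  return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
}
/* The runtime applies this test to the address of one of its own locals
   against each registered thread's ds_stackbase / ds_stacksize. */
#endif
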
229 int __kmp_get_global_thread_id_reg() {
230  int gtid;
231 
232  if (!__kmp_init_serial) {
233  gtid = KMP_GTID_DNE;
234  } else
235 #ifdef KMP_TDATA_GTID
236  if (TCR_4(__kmp_gtid_mode) >= 3) {
237  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
238  gtid = __kmp_gtid;
239  } else
240 #endif
241  if (TCR_4(__kmp_gtid_mode) >= 2) {
242  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
243  gtid = __kmp_gtid_get_specific();
244  } else {
245  KA_TRACE(1000,
246  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
247  gtid = __kmp_get_global_thread_id();
248  }
249 
250  /* we must be a new uber master sibling thread */
251  if (gtid == KMP_GTID_DNE) {
252  KA_TRACE(10,
253  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
254  "Registering a new gtid.\n"));
255  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
256  if (!__kmp_init_serial) {
257  __kmp_do_serial_initialize();
258  gtid = __kmp_gtid_get_specific();
259  } else {
260  gtid = __kmp_register_root(FALSE);
261  }
262  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
263  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
264  }
265 
266  KMP_DEBUG_ASSERT(gtid >= 0);
267 
268  return gtid;
269 }
270 
271 /* caller must hold forkjoin_lock */
272 void __kmp_check_stack_overlap(kmp_info_t *th) {
273  int f;
274  char *stack_beg = NULL;
275  char *stack_end = NULL;
276  int gtid;
277 
278  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
279  if (__kmp_storage_map) {
280  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
281  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
282 
283  gtid = __kmp_gtid_from_thread(th);
284 
285  if (gtid == KMP_GTID_MONITOR) {
286  __kmp_print_storage_map_gtid(
287  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
288  "th_%s stack (%s)", "mon",
289  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
290  } else {
291  __kmp_print_storage_map_gtid(
292  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
293  "th_%d stack (%s)", gtid,
294  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
295  }
296  }
297 
298  /* No point in checking ubermaster threads since they use refinement and
299  * cannot overlap */
300  gtid = __kmp_gtid_from_thread(th);
301  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
302  KA_TRACE(10,
303  ("__kmp_check_stack_overlap: performing extensive checking\n"));
304  if (stack_beg == NULL) {
305  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
306  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
307  }
308 
309  for (f = 0; f < __kmp_threads_capacity; f++) {
310  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
311 
312  if (f_th && f_th != th) {
313  char *other_stack_end =
314  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
315  char *other_stack_beg =
316  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
317  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
318  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
319 
320  /* Print the other stack values before the abort */
321  if (__kmp_storage_map)
322  __kmp_print_storage_map_gtid(
323  -1, other_stack_beg, other_stack_end,
324  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
325  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
326 
327  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
328  __kmp_msg_null);
329  }
330  }
331  }
332  }
333  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
334 }
335 
336 /* ------------------------------------------------------------------------ */
337 
338 void __kmp_infinite_loop(void) {
339  static int done = FALSE;
340 
341  while (!done) {
342  KMP_YIELD(TRUE);
343  }
344 }
345 
346 #define MAX_MESSAGE 512
347 
348 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
349  char const *format, ...) {
350  char buffer[MAX_MESSAGE];
351  va_list ap;
352 
353  va_start(ap, format);
354  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
355  p2, (unsigned long)size, format);
356  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
357  __kmp_vprintf(kmp_err, buffer, ap);
358 #if KMP_PRINT_DATA_PLACEMENT
359  int node;
360  if (gtid >= 0) {
361  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
362  if (__kmp_storage_map_verbose) {
363  node = __kmp_get_host_node(p1);
364  if (node < 0) /* doesn't work, so don't try this next time */
365  __kmp_storage_map_verbose = FALSE;
366  else {
367  char *last;
368  int lastNode;
369  int localProc = __kmp_get_cpu_from_gtid(gtid);
370 
371  const int page_size = KMP_GET_PAGE_SIZE();
372 
373  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
374  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
375  if (localProc >= 0)
376  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
377  localProc >> 1);
378  else
379  __kmp_printf_no_lock(" GTID %d\n", gtid);
380 #if KMP_USE_PRCTL
381  /* The more elaborate format is disabled for now because of the prctl
382  * hanging bug. */
383  do {
384  last = p1;
385  lastNode = node;
386  /* This loop collates adjacent pages with the same host node. */
387  do {
388  p1 = (char *)p1 + page_size;
389  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
390  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
391  lastNode);
392  } while (p1 <= p2);
393 #else
394  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
395  (char *)p1 + (page_size - 1),
396  __kmp_get_host_node(p1));
397  if (p1 < p2) {
398  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
399  (char *)p2 + (page_size - 1),
400  __kmp_get_host_node(p2));
401  }
402 #endif
403  }
404  }
405  } else
406  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
407  }
408 #endif /* KMP_PRINT_DATA_PLACEMENT */
409  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
410 }
411 
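/* Illustrative sketch (not compiled into the runtime): the printer above
   splices the caller's format string into a fixed prefix, then forwards the
   caller's va_list in a single vfprintf-style call. A hypothetical standalone
   equivalent, using plain stdio instead of the runtime's locked I/O: */
#if 0
#include <stdarg.h>
#include <stdio.h>
static void sketch_print_map(void *p1, void *p2, size_t size, const char *fmt,
                             ...) {
  char buffer[512];
  va_list ap;
  va_start(ap, fmt);
  /* Embed the user's format inside the fixed prefix, then expand its args. */
  snprintf(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2,
           (unsigned long)size, fmt);
  vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif
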
412 void __kmp_warn(char const *format, ...) {
413  char buffer[MAX_MESSAGE];
414  va_list ap;
415 
416  if (__kmp_generate_warnings == kmp_warnings_off) {
417  return;
418  }
419 
420  va_start(ap, format);
421 
422  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
423  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
424  __kmp_vprintf(kmp_err, buffer, ap);
425  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
426 
427  va_end(ap);
428 }
429 
430 void __kmp_abort_process() {
431  // Later threads may stall here, but that's ok because abort() will kill them.
432  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
433 
434  if (__kmp_debug_buf) {
435  __kmp_dump_debug_buffer();
436  }
437 
438  if (KMP_OS_WINDOWS) {
439  // Let other threads know of abnormal termination and prevent deadlock
440  // if abort happened during library initialization or shutdown
441  __kmp_global.g.g_abort = SIGABRT;
442 
443  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
444  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
445  boxes. _set_abort_behavior() works well, but this function is not
446  available in VS7 (this is not a problem for the DLL, but it is a problem for
447  the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
448  help, at least in some versions of the MS C RTL.
449 
450  It seems the following sequence is the only way to simulate abort() and
451  avoid the pop-up error box. */
452  raise(SIGABRT);
453  _exit(3); // Just in case, if signal ignored, exit anyway.
454  } else {
455  __kmp_unregister_library();
456  abort();
457  }
458 
459  __kmp_infinite_loop();
460  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
461 
462 } // __kmp_abort_process
463 
464 void __kmp_abort_thread(void) {
465  // TODO: Eliminate g_abort global variable and this function.
466  // In case of abort just call abort(), it will kill all the threads.
467  __kmp_infinite_loop();
468 } // __kmp_abort_thread
469 
470 /* Print out the storage map for the major kmp_info_t thread data structures
471  that are allocated together. */
472 
473 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
474  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
475  gtid);
476 
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
478  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
481  sizeof(kmp_local_t), "th_%d.th_local", gtid);
482 
483  __kmp_print_storage_map_gtid(
484  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
485  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
486 
487  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
488  &thr->th.th_bar[bs_plain_barrier + 1],
489  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
490  gtid);
491 
492  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493  &thr->th.th_bar[bs_forkjoin_barrier + 1],
494  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
495  gtid);
496 
497 #if KMP_FAST_REDUCTION_BARRIER
498  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
499  &thr->th.th_bar[bs_reduction_barrier + 1],
500  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
501  gtid);
502 #endif // KMP_FAST_REDUCTION_BARRIER
503 }
504 
505 /* Print out the storage map for the major kmp_team_t team data structures
506  that are allocated together. */
507 
508 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
509  int team_id, int num_thr) {
510  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
511  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
512  header, team_id);
513 
514  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
515  &team->t.t_bar[bs_last_barrier],
516  sizeof(kmp_balign_team_t) * bs_last_barrier,
517  "%s_%d.t_bar", header, team_id);
518 
519  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
520  &team->t.t_bar[bs_plain_barrier + 1],
521  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
522  header, team_id);
523 
524  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
525  &team->t.t_bar[bs_forkjoin_barrier + 1],
526  sizeof(kmp_balign_team_t),
527  "%s_%d.t_bar[forkjoin]", header, team_id);
528 
529 #if KMP_FAST_REDUCTION_BARRIER
530  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
531  &team->t.t_bar[bs_reduction_barrier + 1],
532  sizeof(kmp_balign_team_t),
533  "%s_%d.t_bar[reduction]", header, team_id);
534 #endif // KMP_FAST_REDUCTION_BARRIER
535 
536  __kmp_print_storage_map_gtid(
537  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
538  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
539 
540  __kmp_print_storage_map_gtid(
541  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
542  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
543 
544  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
545  &team->t.t_disp_buffer[num_disp_buff],
546  sizeof(dispatch_shared_info_t) * num_disp_buff,
547  "%s_%d.t_disp_buffer", header, team_id);
548 }
549 
550 static void __kmp_init_allocator() {
551  __kmp_init_memkind();
552  __kmp_init_target_mem();
553 }
554 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
555 
556 /* ------------------------------------------------------------------------ */
557 
558 #if KMP_DYNAMIC_LIB
559 #if KMP_OS_WINDOWS
560 
561 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
562  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
563 
564  switch (fdwReason) {
565 
566  case DLL_PROCESS_ATTACH:
567  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
568 
569  return TRUE;
570 
571  case DLL_PROCESS_DETACH:
572  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
573 
574  // According to Windows* documentation for DllMain entry point:
575  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
576  // lpReserved == NULL when FreeLibrary() is called,
577  // lpReserved != NULL when the process is terminated.
578  // When FreeLibrary() is called, worker threads remain alive. So the
579  // runtime's state is consistent and executing proper shutdown is OK.
580  // When the process is terminated, worker threads have exited or been
581  // forcefully terminated by the OS and only the shutdown thread remains.
582  // This can leave the runtime in an inconsistent state.
583  // Hence, only attempt proper cleanup when FreeLibrary() is called.
584  // Otherwise, rely on OS to reclaim resources.
585  if (lpReserved == NULL)
586  __kmp_internal_end_library(__kmp_gtid_get_specific());
587 
588  return TRUE;
589 
590  case DLL_THREAD_ATTACH:
591  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
592 
593  /* if we wanted to register new sibling threads on every thread attach, we
594  * would call __kmp_get_gtid() here */
595  return TRUE;
596 
597  case DLL_THREAD_DETACH:
598  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
599 
600  __kmp_internal_end_thread(__kmp_gtid_get_specific());
601  return TRUE;
602  }
603 
604  return TRUE;
605 }
606 
607 #endif /* KMP_OS_WINDOWS */
608 #endif /* KMP_DYNAMIC_LIB */
609 
610 /* __kmp_parallel_deo -- Wait until it's our turn. */
611 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
612  int gtid = *gtid_ref;
613 #ifdef BUILD_PARALLEL_ORDERED
614  kmp_team_t *team = __kmp_team_from_gtid(gtid);
615 #endif /* BUILD_PARALLEL_ORDERED */
616 
617  if (__kmp_env_consistency_check) {
618  if (__kmp_threads[gtid]->th.th_root->r.r_active)
619 #if KMP_USE_DYNAMIC_LOCK
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
621 #else
622  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
623 #endif
624  }
625 #ifdef BUILD_PARALLEL_ORDERED
626  if (!team->t.t_serialized) {
627  KMP_MB();
628  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
629  NULL);
630  KMP_MB();
631  }
632 #endif /* BUILD_PARALLEL_ORDERED */
633 }
634 
635 /* __kmp_parallel_dxo -- Signal the next task. */
636 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
637  int gtid = *gtid_ref;
638 #ifdef BUILD_PARALLEL_ORDERED
639  int tid = __kmp_tid_from_gtid(gtid);
640  kmp_team_t *team = __kmp_team_from_gtid(gtid);
641 #endif /* BUILD_PARALLEL_ORDERED */
642 
643  if (__kmp_env_consistency_check) {
644  if (__kmp_threads[gtid]->th.th_root->r.r_active)
645  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
646  }
647 #ifdef BUILD_PARALLEL_ORDERED
648  if (!team->t.t_serialized) {
649  KMP_MB(); /* Flush all pending memory write invalidates. */
650 
651  /* use the tid of the next thread in this team */
652  /* TODO replace with general release procedure */
653  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
654 
655  KMP_MB(); /* Flush all pending memory write invalidates. */
656  }
657 #endif /* BUILD_PARALLEL_ORDERED */
658 }
659 
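/* Illustrative sketch (not compiled into the runtime): deo/dxo above realize
   ordered execution as a turn counter. Each thread waits until the shared
   counter equals its tid, runs its ordered chunk, then hands the turn to
   (tid + 1) % nproc. The names below are hypothetical; the real code uses
   KMP_WAIT on team->t.t_ordered.dt.t_value with memory fences. */
#if 0
static void sketch_ordered_enter(volatile int *turn, int tid) {
  while (*turn != tid) {
    /* spin until it is this thread's turn */
  }
}
static void sketch_ordered_exit(volatile int *turn, int tid, int nproc) {
  *turn = (tid + 1) % nproc; /* release the next thread in the team */
}
#endif
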
660 /* ------------------------------------------------------------------------ */
661 /* The BARRIER for a SINGLE process section is always explicit */
662 
663 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
664  int status;
665  kmp_info_t *th;
666  kmp_team_t *team;
667 
668  if (!TCR_4(__kmp_init_parallel))
669  __kmp_parallel_initialize();
670  __kmp_resume_if_soft_paused();
671 
672  th = __kmp_threads[gtid];
673  team = th->th.th_team;
674  status = 0;
675 
676  th->th.th_ident = id_ref;
677 
678  if (team->t.t_serialized) {
679  status = 1;
680  } else {
681  kmp_int32 old_this = th->th.th_local.this_construct;
682 
683  ++th->th.th_local.this_construct;
684  /* try to set team count to thread count--success means thread got the
685  single block */
686  /* TODO: Should this be acquire or release? */
687  if (team->t.t_construct == old_this) {
688  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
689  th->th.th_local.this_construct);
690  }
691 #if USE_ITT_BUILD
692  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
693  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
694  team->t.t_active_level == 1) {
695  // Only report metadata by primary thread of active team at level 1
696  __kmp_itt_metadata_single(id_ref);
697  }
698 #endif /* USE_ITT_BUILD */
699  }
700 
701  if (__kmp_env_consistency_check) {
702  if (status && push_ws) {
703  __kmp_push_workshare(gtid, ct_psingle, id_ref);
704  } else {
705  __kmp_check_workshare(gtid, ct_psingle, id_ref);
706  }
707  }
708 #if USE_ITT_BUILD
709  if (status) {
710  __kmp_itt_single_start(gtid);
711  }
712 #endif /* USE_ITT_BUILD */
713  return status;
714 }
715 
716 void __kmp_exit_single(int gtid) {
717 #if USE_ITT_BUILD
718  __kmp_itt_single_end(gtid);
719 #endif /* USE_ITT_BUILD */
720  if (__kmp_env_consistency_check)
721  __kmp_pop_workshare(gtid, ct_psingle, NULL);
722 }
723 
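/* Illustrative sketch (not compiled into the runtime): the winner of a
   "single" construct is picked by a compare-and-swap race, as in
   __kmp_enter_single() above. Every thread advances its private
   this_construct counter; the one thread that successfully moves the shared
   team counter from the old value executes the block. The helper below is
   hypothetical and uses std::atomic in place of the runtime's atomics. */
#if 0
#include <atomic>
static bool sketch_try_win_single(std::atomic<kmp_int32> &team_construct,
                                  kmp_int32 &my_construct) {
  kmp_int32 old_val = my_construct++; /* value this construct had for me */
  kmp_int32 expected = old_val;
  /* Exactly one thread per construct instance succeeds in this exchange. */
  return team_construct.compare_exchange_strong(expected, my_construct);
}
#endif
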
724 /* Determine whether we can go parallel or must use a serialized parallel region,
725  * and how many threads we can use.
726  * set_nthreads is the number of threads requested for the team.
727  * Returns 1 if we should serialize or use only one thread,
728  * otherwise the number of threads to use.
729  * The forkjoin lock is held by the caller. */
730 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
731  int master_tid, int set_nthreads,
732  int enter_teams) {
733  int capacity;
734  int new_nthreads;
735  KMP_DEBUG_ASSERT(__kmp_init_serial);
736  KMP_DEBUG_ASSERT(root && parent_team);
737  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
738 
739  // If dyn-var is set, dynamically adjust the number of desired threads,
740  // according to the method specified by dynamic_mode.
741  new_nthreads = set_nthreads;
742  if (!get__dynamic_2(parent_team, master_tid)) {
743  ;
744  }
745 #ifdef USE_LOAD_BALANCE
746  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
747  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
748  if (new_nthreads == 1) {
749  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
750  "reservation to 1 thread\n",
751  master_tid));
752  return 1;
753  }
754  if (new_nthreads < set_nthreads) {
755  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
756  "reservation to %d threads\n",
757  master_tid, new_nthreads));
758  }
759  }
760 #endif /* USE_LOAD_BALANCE */
761  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
762  new_nthreads = __kmp_avail_proc - __kmp_nth +
763  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
764  if (new_nthreads <= 1) {
765  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
766  "reservation to 1 thread\n",
767  master_tid));
768  return 1;
769  }
770  if (new_nthreads < set_nthreads) {
771  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
772  "reservation to %d threads\n",
773  master_tid, new_nthreads));
774  } else {
775  new_nthreads = set_nthreads;
776  }
777  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
778  if (set_nthreads > 2) {
779  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
780  new_nthreads = (new_nthreads % set_nthreads) + 1;
781  if (new_nthreads == 1) {
782  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
783  "reservation to 1 thread\n",
784  master_tid));
785  return 1;
786  }
787  if (new_nthreads < set_nthreads) {
788  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
789  "reservation to %d threads\n",
790  master_tid, new_nthreads));
791  }
792  }
793  } else {
794  KMP_ASSERT(0);
795  }
796 
797  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
798  if (__kmp_nth + new_nthreads -
799  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
800  __kmp_max_nth) {
801  int tl_nthreads = __kmp_max_nth - __kmp_nth +
802  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803  if (tl_nthreads <= 0) {
804  tl_nthreads = 1;
805  }
806 
807  // If dyn-var is false, emit a 1-time warning.
808  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
809  __kmp_reserve_warn = 1;
810  __kmp_msg(kmp_ms_warning,
811  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
812  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
813  }
814  if (tl_nthreads == 1) {
815  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
816  "reduced reservation to 1 thread\n",
817  master_tid));
818  return 1;
819  }
820  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
821  "reservation to %d threads\n",
822  master_tid, tl_nthreads));
823  new_nthreads = tl_nthreads;
824  }
825 
826  // Respect OMP_THREAD_LIMIT
827  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
828  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
829  if (cg_nthreads + new_nthreads -
830  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
831  max_cg_threads) {
832  int tl_nthreads = max_cg_threads - cg_nthreads +
833  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
834  if (tl_nthreads <= 0) {
835  tl_nthreads = 1;
836  }
837 
838  // If dyn-var is false, emit a 1-time warning.
839  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
840  __kmp_reserve_warn = 1;
841  __kmp_msg(kmp_ms_warning,
842  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
843  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
844  }
845  if (tl_nthreads == 1) {
846  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
847  "reduced reservation to 1 thread\n",
848  master_tid));
849  return 1;
850  }
851  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
852  "reservation to %d threads\n",
853  master_tid, tl_nthreads));
854  new_nthreads = tl_nthreads;
855  }
856 
857  // Check if the threads array is large enough, or needs expanding.
858  // See comment in __kmp_register_root() about the adjustment if
859  // __kmp_threads[0] == NULL.
860  capacity = __kmp_threads_capacity;
861  if (TCR_PTR(__kmp_threads[0]) == NULL) {
862  --capacity;
863  }
864  // If it is not for initializing the hidden helper team, we need to take
865  // __kmp_hidden_helper_threads_num out of the capacity because it is included
866  // in __kmp_threads_capacity.
867  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
868  capacity -= __kmp_hidden_helper_threads_num;
869  }
870  if (__kmp_nth + new_nthreads -
871  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
872  capacity) {
873  // Expand the threads array.
874  int slotsRequired = __kmp_nth + new_nthreads -
875  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
876  capacity;
877  int slotsAdded = __kmp_expand_threads(slotsRequired);
878  if (slotsAdded < slotsRequired) {
879  // The threads array was not expanded enough.
880  new_nthreads -= (slotsRequired - slotsAdded);
881  KMP_ASSERT(new_nthreads >= 1);
882 
883  // If dyn-var is false, emit a 1-time warning.
884  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885  __kmp_reserve_warn = 1;
886  if (__kmp_tp_cached) {
887  __kmp_msg(kmp_ms_warning,
888  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
889  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
890  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
891  } else {
892  __kmp_msg(kmp_ms_warning,
893  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
894  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
895  }
896  }
897  }
898  }
899 
900 #ifdef KMP_DEBUG
901  if (new_nthreads == 1) {
902  KC_TRACE(10,
903  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
904  "dead roots and rechecking; requested %d threads\n",
905  __kmp_get_gtid(), set_nthreads));
906  } else {
907  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
908  " %d threads\n",
909  __kmp_get_gtid(), new_nthreads, set_nthreads));
910  }
911 #endif // KMP_DEBUG
912  return new_nthreads;
913 }
914 
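/* Worked example (hypothetical numbers) for the limit checks inside
   __kmp_reserve_threads() below: each cap trims the request to
   "limit - threads already in use", where the primary thread is not counted
   twice because it is already part of the hot team when the root is inactive.
   With __kmp_max_nth = 8, __kmp_nth = 6, a hot team of 4, and an inactive
   root, the check 6 + 10 - 4 > 8 fires for a request of 10 threads, and the
   request is trimmed to 8 - 6 + 4 = 6 threads. */
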
915 /* Allocate threads from the thread pool and assign them to the new team. We are
916  assured that there are enough threads available, because we checked earlier
917  while holding the forkjoin lock. */
918 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
919  kmp_info_t *master_th, int master_gtid) {
920  int i;
921  int use_hot_team;
922 
923  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
924  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
925  KMP_MB();
926 
927  /* first, let's setup the primary thread */
928  master_th->th.th_info.ds.ds_tid = 0;
929  master_th->th.th_team = team;
930  master_th->th.th_team_nproc = team->t.t_nproc;
931  master_th->th.th_team_master = master_th;
932  master_th->th.th_team_serialized = FALSE;
933  master_th->th.th_dispatch = &team->t.t_dispatch[0];
934 
935 /* make sure we are not the optimized hot team */
936 #if KMP_NESTED_HOT_TEAMS
937  use_hot_team = 0;
938  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
939  if (hot_teams) { // hot teams array is not allocated if
940  // KMP_HOT_TEAMS_MAX_LEVEL=0
941  int level = team->t.t_active_level - 1; // index in array of hot teams
942  if (master_th->th.th_teams_microtask) { // are we inside the teams?
943  if (master_th->th.th_teams_size.nteams > 1) {
944  ++level; // level was not increased in teams construct for
945  // team_of_masters
946  }
947  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
948  master_th->th.th_teams_level == team->t.t_level) {
949  ++level; // level was not increased in teams construct for
950  // team_of_workers before the parallel
951  } // team->t.t_level will be increased inside parallel
952  }
953  if (level < __kmp_hot_teams_max_level) {
954  if (hot_teams[level].hot_team) {
955  // hot team has already been allocated for given level
956  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
957  use_hot_team = 1; // the team is ready to use
958  } else {
959  use_hot_team = 0; // AC: threads are not allocated yet
960  hot_teams[level].hot_team = team; // remember new hot team
961  hot_teams[level].hot_team_nth = team->t.t_nproc;
962  }
963  } else {
964  use_hot_team = 0;
965  }
966  }
967 #else
968  use_hot_team = team == root->r.r_hot_team;
969 #endif
970  if (!use_hot_team) {
971 
972  /* install the primary thread */
973  team->t.t_threads[0] = master_th;
974  __kmp_initialize_info(master_th, team, 0, master_gtid);
975 
976  /* now, install the worker threads */
977  for (i = 1; i < team->t.t_nproc; i++) {
978 
979  /* fork or reallocate a new thread and install it in team */
980  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
981  team->t.t_threads[i] = thr;
982  KMP_DEBUG_ASSERT(thr);
983  KMP_DEBUG_ASSERT(thr->th.th_team == team);
984  /* align team and thread arrived states */
985  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
986  "T#%d(%d:%d) join =%llu, plain=%llu\n",
987  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
988  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
989  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
990  team->t.t_bar[bs_plain_barrier].b_arrived));
991  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
992  thr->th.th_teams_level = master_th->th.th_teams_level;
993  thr->th.th_teams_size = master_th->th.th_teams_size;
994  { // Initialize threads' barrier data.
995  int b;
996  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
997  for (b = 0; b < bs_last_barrier; ++b) {
998  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
999  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1000 #if USE_DEBUGGER
1001  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1002 #endif
1003  }
1004  }
1005  }
1006 
1007 #if KMP_AFFINITY_SUPPORTED
1008  __kmp_partition_places(team);
1009 #endif
1010  }
1011 
1012  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1013  for (i = 0; i < team->t.t_nproc; i++) {
1014  kmp_info_t *thr = team->t.t_threads[i];
1015  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1016  thr->th.th_prev_level != team->t.t_level) {
1017  team->t.t_display_affinity = 1;
1018  break;
1019  }
1020  }
1021  }
1022 
1023  KMP_MB();
1024 }
1025 
1026 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1027 // Propagate any changes to the floating-point control registers out to the team.
1028 // We try to avoid unnecessary writes to the relevant cache line in the team
1029 // structure, so we don't make changes unless they are needed.
1030 inline static void propagateFPControl(kmp_team_t *team) {
1031  if (__kmp_inherit_fp_control) {
1032  kmp_int16 x87_fpu_control_word;
1033  kmp_uint32 mxcsr;
1034 
1035  // Get primary thread's values of FPU control flags (both X87 and vector)
1036  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1037  __kmp_store_mxcsr(&mxcsr);
1038  mxcsr &= KMP_X86_MXCSR_MASK;
1039 
1040  // There is no point looking at t_fp_control_saved here.
1041  // If it is TRUE, we still have to update the values if they are different
1042  // from those we now have. If it is FALSE we didn't save anything yet, but
1043  // our objective is the same. We have to ensure that the values in the team
1044  // are the same as those we have.
1045  // So, this code achieves what we need whether or not t_fp_control_saved is
1046  // true. By checking whether the value needs updating we avoid unnecessary
1047  // writes that would put the cache-line into a written state, causing all
1048  // threads in the team to have to read it again.
1049  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1050  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1051  // Although we don't use this value, other code in the runtime wants to know
1052  // whether it should restore them. So we must ensure it is correct.
1053  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1054  } else {
1055  // Similarly here. Don't write to this cache-line in the team structure
1056  // unless we have to.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1058  }
1059 }
1060 
1061 // Do the opposite, setting the hardware registers to the updated values from
1062 // the team.
1063 inline static void updateHWFPControl(kmp_team_t *team) {
1064  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1065  // Only reset the fp control regs if they have been changed in the team by
1066  // the parallel region that we are exiting.
1067  kmp_int16 x87_fpu_control_word;
1068  kmp_uint32 mxcsr;
1069  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070  __kmp_store_mxcsr(&mxcsr);
1071  mxcsr &= KMP_X86_MXCSR_MASK;
1072 
1073  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1074  __kmp_clear_x87_fpu_status_word();
1075  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1076  }
1077 
1078  if (team->t.t_mxcsr != mxcsr) {
1079  __kmp_load_mxcsr(&team->t.t_mxcsr);
1080  }
1081  }
1082 }
1083 #else
1084 #define propagateFPControl(x) ((void)0)
1085 #define updateHWFPControl(x) ((void)0)
1086 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1087 
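/* Illustrative sketch (not compiled into the runtime): KMP_CHECK_UPDATE used
   above is assumed to be the usual "compare before store" idiom, roughly the
   macro below. Skipping the store when the value is unchanged keeps the
   team's cache line in a shared state, so the worker threads that read it do
   not take an extra coherence miss. */
#if 0
#define SKETCH_CHECK_UPDATE(dst, val)                                          \
  do {                                                                         \
    if ((dst) != (val))                                                        \
      (dst) = (val);                                                           \
  } while (0)
#endif
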
1088 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1089  int realloc); // forward declaration
1090 
1091 /* Run a parallel region that has been serialized, so it runs only in a team of
1092  the single primary thread. */
1093 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1094  kmp_info_t *this_thr;
1095  kmp_team_t *serial_team;
1096 
1097  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1098 
1099  /* Skip all this code for autopar serialized loops since it results in
1100  unacceptable overhead */
1101  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1102  return;
1103 
1104  if (!TCR_4(__kmp_init_parallel))
1105  __kmp_parallel_initialize();
1106  __kmp_resume_if_soft_paused();
1107 
1108  this_thr = __kmp_threads[global_tid];
1109  serial_team = this_thr->th.th_serial_team;
1110 
1111  /* utilize the serialized team held by this thread */
1112  KMP_DEBUG_ASSERT(serial_team);
1113  KMP_MB();
1114 
1115  if (__kmp_tasking_mode != tskm_immediate_exec) {
1116  KMP_DEBUG_ASSERT(
1117  this_thr->th.th_task_team ==
1118  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1119  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1120  NULL);
1121  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1122  "team %p, new task_team = NULL\n",
1123  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1124  this_thr->th.th_task_team = NULL;
1125  }
1126 
1127  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1128  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1129  proc_bind = proc_bind_false;
1130  } else if (proc_bind == proc_bind_default) {
1131  // No proc_bind clause was specified, so use the current value
1132  // of proc-bind-var for this parallel region.
1133  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1134  }
1135  // Reset for next parallel region
1136  this_thr->th.th_set_proc_bind = proc_bind_default;
1137 
1138 #if OMPT_SUPPORT
1139  ompt_data_t ompt_parallel_data = ompt_data_none;
1140  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1141  if (ompt_enabled.enabled &&
1142  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1143 
1144  ompt_task_info_t *parent_task_info;
1145  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1146 
1147  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1148  if (ompt_enabled.ompt_callback_parallel_begin) {
1149  int team_size = 1;
1150 
1151  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1152  &(parent_task_info->task_data), &(parent_task_info->frame),
1153  &ompt_parallel_data, team_size,
1154  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1155  }
1156  }
1157 #endif // OMPT_SUPPORT
1158 
1159  if (this_thr->th.th_team != serial_team) {
1160  // Nested level will be an index in the nested nthreads array
1161  int level = this_thr->th.th_team->t.t_level;
1162 
1163  if (serial_team->t.t_serialized) {
1164  /* this serial team was already used
1165  TODO increase performance by making these locks more specific */
1166  kmp_team_t *new_team;
1167 
1168  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1169 
1170  new_team =
1171  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1172 #if OMPT_SUPPORT
1173  ompt_parallel_data,
1174 #endif
1175  proc_bind, &this_thr->th.th_current_task->td_icvs,
1176  0 USE_NESTED_HOT_ARG(NULL));
1177  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1178  KMP_ASSERT(new_team);
1179 
1180  /* setup new serialized team and install it */
1181  new_team->t.t_threads[0] = this_thr;
1182  new_team->t.t_parent = this_thr->th.th_team;
1183  serial_team = new_team;
1184  this_thr->th.th_serial_team = serial_team;
1185 
1186  KF_TRACE(
1187  10,
1188  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1189  global_tid, serial_team));
1190 
1191  /* TODO the above breaks the requirement that if we run out of resources,
1192  then we can still guarantee that serialized teams are ok, since we may
1193  need to allocate a new one */
1194  } else {
1195  KF_TRACE(
1196  10,
1197  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1198  global_tid, serial_team));
1199  }
1200 
1201  /* we have to initialize this serial team */
1202  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1203  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1204  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1205  serial_team->t.t_ident = loc;
1206  serial_team->t.t_serialized = 1;
1207  serial_team->t.t_nproc = 1;
1208  serial_team->t.t_parent = this_thr->th.th_team;
1209  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1210  this_thr->th.th_team = serial_team;
1211  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1212 
1213  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1214  this_thr->th.th_current_task));
1215  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1216  this_thr->th.th_current_task->td_flags.executing = 0;
1217 
1218  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1219 
1220  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1221  implicit task for each serialized task represented by
1222  team->t.t_serialized? */
1223  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1224  &this_thr->th.th_current_task->td_parent->td_icvs);
1225 
1226  // Thread value exists in the nested nthreads array for the next nested
1227  // level
1228  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1229  this_thr->th.th_current_task->td_icvs.nproc =
1230  __kmp_nested_nth.nth[level + 1];
1231  }
1232 
1233  if (__kmp_nested_proc_bind.used &&
1234  (level + 1 < __kmp_nested_proc_bind.used)) {
1235  this_thr->th.th_current_task->td_icvs.proc_bind =
1236  __kmp_nested_proc_bind.bind_types[level + 1];
1237  }
1238 
1239 #if USE_DEBUGGER
1240  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1241 #endif
1242  this_thr->th.th_info.ds.ds_tid = 0;
1243 
1244  /* set thread cache values */
1245  this_thr->th.th_team_nproc = 1;
1246  this_thr->th.th_team_master = this_thr;
1247  this_thr->th.th_team_serialized = 1;
1248 
1249  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1250  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1251  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1252 
1253  propagateFPControl(serial_team);
1254 
1255  /* check if we need to allocate dispatch buffers stack */
1256  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1257  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1258  serial_team->t.t_dispatch->th_disp_buffer =
1259  (dispatch_private_info_t *)__kmp_allocate(
1260  sizeof(dispatch_private_info_t));
1261  }
1262  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1263 
1264  KMP_MB();
1265 
1266  } else {
1267  /* this serialized team is already being used,
1268  * that's fine, just add another nested level */
1269  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1270  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1271  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1272  ++serial_team->t.t_serialized;
1273  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1274 
1275  // Nested level will be an index in the nested nthreads array
1276  int level = this_thr->th.th_team->t.t_level;
1277  // Thread value exists in the nested nthreads array for the next nested
1278  // level
1279  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1280  this_thr->th.th_current_task->td_icvs.nproc =
1281  __kmp_nested_nth.nth[level + 1];
1282  }
1283  serial_team->t.t_level++;
1284  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1285  "of serial team %p to %d\n",
1286  global_tid, serial_team, serial_team->t.t_level));
1287 
1288  /* allocate/push dispatch buffers stack */
1289  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1290  {
1291  dispatch_private_info_t *disp_buffer =
1292  (dispatch_private_info_t *)__kmp_allocate(
1293  sizeof(dispatch_private_info_t));
1294  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1295  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1296  }
1297  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1298 
1299  KMP_MB();
1300  }
1301  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1302 
1303  // Perform the display affinity functionality for
1304  // serialized parallel regions
1305  if (__kmp_display_affinity) {
1306  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1307  this_thr->th.th_prev_num_threads != 1) {
1308  // NULL means use the affinity-format-var ICV
1309  __kmp_aux_display_affinity(global_tid, NULL);
1310  this_thr->th.th_prev_level = serial_team->t.t_level;
1311  this_thr->th.th_prev_num_threads = 1;
1312  }
1313  }
1314 
1315  if (__kmp_env_consistency_check)
1316  __kmp_push_parallel(global_tid, NULL);
1317 #if OMPT_SUPPORT
1318  serial_team->t.ompt_team_info.master_return_address = codeptr;
1319  if (ompt_enabled.enabled &&
1320  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1321  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1322  OMPT_GET_FRAME_ADDRESS(0);
1323 
1324  ompt_lw_taskteam_t lw_taskteam;
1325  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1326  &ompt_parallel_data, codeptr);
1327 
1328  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1329  // don't use lw_taskteam after linking. content was swapped
1330 
1331  /* OMPT implicit task begin */
1332  if (ompt_enabled.ompt_callback_implicit_task) {
1333  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1334  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1335  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1336  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1337  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1338  __kmp_tid_from_gtid(global_tid);
1339  }
1340 
1341  /* OMPT state */
1342  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1343  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1344  OMPT_GET_FRAME_ADDRESS(0);
1345  }
1346 #endif
1347 }
1348 
1349 /* most of the work for a fork */
1350 /* return true if we really went parallel, false if serialized */
1351 int __kmp_fork_call(ident_t *loc, int gtid,
1352  enum fork_context_e call_context, // Intel, GNU, ...
1353  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1354  kmp_va_list ap) {
1355  void **argv;
1356  int i;
1357  int master_tid;
1358  int master_this_cons;
1359  kmp_team_t *team;
1360  kmp_team_t *parent_team;
1361  kmp_info_t *master_th;
1362  kmp_root_t *root;
1363  int nthreads;
1364  int master_active;
1365  int master_set_numthreads;
1366  int level;
1367  int active_level;
1368  int teams_level;
1369 #if KMP_NESTED_HOT_TEAMS
1370  kmp_hot_team_ptr_t **p_hot_teams;
1371 #endif
1372  { // KMP_TIME_BLOCK
1373  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1374  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1375 
1376  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1377  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1378  /* Some systems prefer the stack for the root thread(s) to start with */
1379  /* some gap from the parent stack to prevent false sharing. */
1380  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1381  /* These 2 lines below are so this does not get optimized out */
1382  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1383  __kmp_stkpadding += (short)((kmp_int64)dummy);
1384  }
1385 
1386  /* initialize if needed */
1387  KMP_DEBUG_ASSERT(
1388  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1389  if (!TCR_4(__kmp_init_parallel))
1390  __kmp_parallel_initialize();
1391  __kmp_resume_if_soft_paused();
1392 
1393  /* setup current data */
1394  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1395  // shutdown
1396  parent_team = master_th->th.th_team;
1397  master_tid = master_th->th.th_info.ds.ds_tid;
1398  master_this_cons = master_th->th.th_local.this_construct;
1399  root = master_th->th.th_root;
1400  master_active = root->r.r_active;
1401  master_set_numthreads = master_th->th.th_set_nproc;
1402 
1403 #if OMPT_SUPPORT
1404  ompt_data_t ompt_parallel_data = ompt_data_none;
1405  ompt_data_t *parent_task_data;
1406  ompt_frame_t *ompt_frame;
1407  ompt_data_t *implicit_task_data;
1408  void *return_address = NULL;
1409 
1410  if (ompt_enabled.enabled) {
1411  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1412  NULL, NULL);
1413  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1414  }
1415 #endif
1416 
1417  // Assign affinity to root thread if it hasn't happened yet
1418  __kmp_assign_root_init_mask();
1419 
1420  // Nested level will be an index in the nested nthreads array
1421  level = parent_team->t.t_level;
1422  // used to launch non-serial teams even if nested is not allowed
1423  active_level = parent_team->t.t_active_level;
1424  // needed to check nesting inside the teams
1425  teams_level = master_th->th.th_teams_level;
1426 #if KMP_NESTED_HOT_TEAMS
1427  p_hot_teams = &master_th->th.th_hot_teams;
1428  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1429  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1430  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1431  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1432  // it is either actual or not needed (when active_level > 0)
1433  (*p_hot_teams)[0].hot_team_nth = 1;
1434  }
1435 #endif
1436 
1437 #if OMPT_SUPPORT
1438  if (ompt_enabled.enabled) {
1439  if (ompt_enabled.ompt_callback_parallel_begin) {
1440  int team_size = master_set_numthreads
1441  ? master_set_numthreads
1442  : get__nproc_2(parent_team, master_tid);
1443  int flags = OMPT_INVOKER(call_context) |
1444  ((microtask == (microtask_t)__kmp_teams_master)
1445  ? ompt_parallel_league
1446  : ompt_parallel_team);
1447  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1448  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1449  return_address);
1450  }
1451  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1452  }
1453 #endif
1454 
1455  master_th->th.th_ident = loc;
1456 
1457  if (master_th->th.th_teams_microtask && ap &&
1458  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1459  // AC: This is start of parallel that is nested inside teams construct.
1460  // The team is actual (hot), all workers are ready at the fork barrier.
1461  // No lock needed to initialize the team a bit, then free workers.
1462  parent_team->t.t_ident = loc;
1463  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1464  parent_team->t.t_argc = argc;
1465  argv = (void **)parent_team->t.t_argv;
1466  for (i = argc - 1; i >= 0; --i)
1467  *argv++ = va_arg(kmp_va_deref(ap), void *);
1468  // Increment our nested depth levels, but not increase the serialization
1469  if (parent_team == master_th->th.th_serial_team) {
1470  // AC: we are in serialized parallel
1471  __kmpc_serialized_parallel(loc, gtid);
1472  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1473 
1474  if (call_context == fork_context_gnu) {
1475  // AC: need to decrement t_serialized for enquiry functions to work
1476  // correctly, will restore at join time
1477  parent_team->t.t_serialized--;
1478  return TRUE;
1479  }
1480 
1481 #if OMPD_SUPPORT
1482  parent_team->t.t_pkfn = microtask;
1483 #endif
1484 
1485 #if OMPT_SUPPORT
1486  void *dummy;
1487  void **exit_frame_p;
1488 
1489  ompt_lw_taskteam_t lw_taskteam;
1490 
1491  if (ompt_enabled.enabled) {
1492  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1493  &ompt_parallel_data, return_address);
1494  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1495 
1496  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1497  // don't use lw_taskteam after linking. content was swapped
1498 
1499  /* OMPT implicit task begin */
1500  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1501  if (ompt_enabled.ompt_callback_implicit_task) {
1502  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1503  __kmp_tid_from_gtid(gtid);
1504  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1505  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1506  implicit_task_data, 1,
1507  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1508  }
1509 
1510  /* OMPT state */
1511  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1512  } else {
1513  exit_frame_p = &dummy;
1514  }
1515 #endif
1516  // AC: need to decrement t_serialized for enquiry functions to work
1517  // correctly, will restore at join time
1518  parent_team->t.t_serialized--;
1519 
1520  {
1521  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1522  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1523  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1524 #if OMPT_SUPPORT
1525  ,
1526  exit_frame_p
1527 #endif
1528  );
1529  }
1530 
1531 #if OMPT_SUPPORT
1532  if (ompt_enabled.enabled) {
1533  *exit_frame_p = NULL;
1534  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1535  if (ompt_enabled.ompt_callback_implicit_task) {
1536  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1537  ompt_scope_end, NULL, implicit_task_data, 1,
1538  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1539  }
1540  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1541  __ompt_lw_taskteam_unlink(master_th);
1542  if (ompt_enabled.ompt_callback_parallel_end) {
1543  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1544  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1545  OMPT_INVOKER(call_context) | ompt_parallel_team,
1546  return_address);
1547  }
1548  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1549  }
1550 #endif
1551  return TRUE;
1552  }
1553 
1554  parent_team->t.t_pkfn = microtask;
1555  parent_team->t.t_invoke = invoker;
1556  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1557  parent_team->t.t_active_level++;
1558  parent_team->t.t_level++;
1559  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1560 
1561 #if OMPT_SUPPORT
1562  if (ompt_enabled.enabled) {
1563  ompt_lw_taskteam_t lw_taskteam;
1564  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1565  &ompt_parallel_data, return_address);
1566  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1567  }
1568 #endif
1569 
1570  /* Change number of threads in the team if requested */
1571  if (master_set_numthreads) { // The parallel has num_threads clause
1572  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1573  // AC: can only reduce the number of threads dynamically, can't increase
1574  kmp_info_t **other_threads = parent_team->t.t_threads;
1575  // NOTE: if using distributed barrier, we need to run this code block
1576  // even when the team size appears not to have changed from the max.
1577  int old_proc = master_th->th.th_teams_size.nth;
1578  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1579  bp_dist_bar) {
1580  __kmp_resize_dist_barrier(parent_team, old_proc,
1581  master_set_numthreads);
1582  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1583  }
1584  parent_team->t.t_nproc = master_set_numthreads;
1585  for (i = 0; i < master_set_numthreads; ++i) {
1586  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1587  }
1588  }
1589  // Keep extra threads hot in the team for possible next parallels
1590  master_th->th.th_set_nproc = 0;
1591  }
1592 
1593 #if USE_DEBUGGER
1594  if (__kmp_debugging) { // Let debugger override number of threads.
1595  int nth = __kmp_omp_num_threads(loc);
1596  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1597  master_set_numthreads = nth;
1598  }
1599  }
1600 #endif
1601 
1602 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1603  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1604  KMP_ITT_DEBUG) &&
1605  __kmp_forkjoin_frames_mode == 3 &&
1606  parent_team->t.t_active_level == 1 // only report frames at level 1
1607  && master_th->th.th_teams_size.nteams == 1) {
1608  kmp_uint64 tmp_time = __itt_get_timestamp();
1609  master_th->th.th_frame_time = tmp_time;
1610  parent_team->t.t_region_time = tmp_time;
1611  }
1612  if (__itt_stack_caller_create_ptr) {
1613  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1614  // create new stack stitching id before entering fork barrier
1615  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1616  }
1617 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1618 
1619  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1620  "master_th=%p, gtid=%d\n",
1621  root, parent_team, master_th, gtid));
1622  __kmp_internal_fork(loc, gtid, parent_team);
1623  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1624  "master_th=%p, gtid=%d\n",
1625  root, parent_team, master_th, gtid));
1626 
1627  if (call_context == fork_context_gnu)
1628  return TRUE;
1629 
1630  /* Invoke microtask for PRIMARY thread */
1631  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1632  parent_team->t.t_id, parent_team->t.t_pkfn));
1633 
1634  if (!parent_team->t.t_invoke(gtid)) {
1635  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1636  }
1637  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1638  parent_team->t.t_id, parent_team->t.t_pkfn));
1639  KMP_MB(); /* Flush all pending memory write invalidates. */
1640 
1641  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1642 
1643  return TRUE;
1644  } // Parallel closely nested in teams construct
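  /* Illustrative note (not part of the original source): the branch above handles
     a parallel region closely nested inside a teams construct, reusing the hot
     teams team. A minimal, hedged user-code sketch that reaches this path,
     assuming host execution of the teams construct:

       #include <omp.h>
       void nested_in_teams(void) {
         #pragma omp teams num_teams(2) thread_limit(4)
         {
           #pragma omp parallel  // forked by the branch above on each team's primary
           {
             // work shared among up to thread_limit threads of this team
           }
         }
       }
  */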
1645 
1646 #if KMP_DEBUG
1647  if (__kmp_tasking_mode != tskm_immediate_exec) {
1648  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1649  parent_team->t.t_task_team[master_th->th.th_task_state]);
1650  }
1651 #endif
1652 
1653  // Need this to happen before we determine the number of threads, not while
1654  // we are allocating the team
1655  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1656  int enter_teams = 0;
1657  if (parent_team->t.t_active_level >=
1658  master_th->th.th_current_task->td_icvs.max_active_levels) {
1659  nthreads = 1;
1660  } else {
1661  enter_teams = ((ap == NULL && active_level == 0) ||
1662  (ap && teams_level > 0 && teams_level == level));
1663  nthreads = master_set_numthreads
1664  ? master_set_numthreads
1665  // TODO: get nproc directly from current task
1666  : get__nproc_2(parent_team, master_tid);
1667  // Check whether we need to take the forkjoin lock (no need for a serialized
1668  // parallel outside of a teams construct). This code was moved here from
1669  // __kmp_reserve_threads() to speed up nested serialized parallels.
1670  if (nthreads > 1) {
1671  if ((get__max_active_levels(master_th) == 1 &&
1672  (root->r.r_in_parallel && !enter_teams)) ||
1673  (__kmp_library == library_serial)) {
1674  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1675  " threads\n",
1676  gtid, nthreads));
1677  nthreads = 1;
1678  }
1679  }
1680  if (nthreads > 1) {
1681  /* determine how many new threads we can use */
1682  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1683  /* AC: If we execute a teams construct from a parallel region (on the host),
1684  the teams should be created, but each can have only 1 thread if nesting is
1685  disabled. If teams is called from a serial region, the teams and their
1686  threads should be created regardless of the nesting setting. */
1687  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1688  nthreads, enter_teams);
1689  if (nthreads == 1) {
1690  // Free lock for single thread execution here; for multi-thread
1691  // execution it will be freed later after team of threads created
1692  // and initialized
1693  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1694  }
1695  }
1696  }
1697  KMP_DEBUG_ASSERT(nthreads > 0);
1698 
1699  // If we temporarily changed the set number of threads then restore it now
1700  master_th->th.th_set_nproc = 0;
1701 
1702  /* create a serialized parallel region? */
1703  if (nthreads == 1) {
1704 /* josh todo: hypothetical question: what do we do for OS X*? */
1705 #if KMP_OS_LINUX && \
1706  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1707  void *args[argc];
1708 #else
1709  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1710 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1711  KMP_ARCH_AARCH64) */
1712 
1713  KA_TRACE(20,
1714  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1715 
1716  __kmpc_serialized_parallel(loc, gtid);
1717 
1718 #if OMPD_SUPPORT
1719  master_th->th.th_serial_team->t.t_pkfn = microtask;
1720 #endif
1721 
1722  if (call_context == fork_context_intel) {
1723  /* TODO this sucks, use the compiler itself to pass args! :) */
1724  master_th->th.th_serial_team->t.t_ident = loc;
1725  if (!ap) {
1726  // revert change made in __kmpc_serialized_parallel()
1727  master_th->th.th_serial_team->t.t_level--;
1728  // Get args from parent team for teams construct
1729 
1730 #if OMPT_SUPPORT
1731  void *dummy;
1732  void **exit_frame_p;
1733  ompt_task_info_t *task_info;
1734 
1735  ompt_lw_taskteam_t lw_taskteam;
1736 
1737  if (ompt_enabled.enabled) {
1738  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1739  &ompt_parallel_data, return_address);
1740 
1741  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1742  // don't use lw_taskteam after linking. content was swapped
1743 
1744  task_info = OMPT_CUR_TASK_INFO(master_th);
1745  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1746  if (ompt_enabled.ompt_callback_implicit_task) {
1747  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1748  __kmp_tid_from_gtid(gtid);
1749  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1750  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1751  &(task_info->task_data), 1,
1752  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1753  ompt_task_implicit);
1754  }
1755 
1756  /* OMPT state */
1757  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1758  } else {
1759  exit_frame_p = &dummy;
1760  }
1761 #endif
1762 
1763  {
1764  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1765  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1766  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1767  parent_team->t.t_argv
1768 #if OMPT_SUPPORT
1769  ,
1770  exit_frame_p
1771 #endif
1772  );
1773  }
1774 
1775 #if OMPT_SUPPORT
1776  if (ompt_enabled.enabled) {
1777  *exit_frame_p = NULL;
1778  if (ompt_enabled.ompt_callback_implicit_task) {
1779  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780  ompt_scope_end, NULL, &(task_info->task_data), 1,
1781  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1782  ompt_task_implicit);
1783  }
1784  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1785  __ompt_lw_taskteam_unlink(master_th);
1786  if (ompt_enabled.ompt_callback_parallel_end) {
1787  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1788  &ompt_parallel_data, parent_task_data,
1789  OMPT_INVOKER(call_context) | ompt_parallel_team,
1790  return_address);
1791  }
1792  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1793  }
1794 #endif
1795  } else if (microtask == (microtask_t)__kmp_teams_master) {
1796  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1797  master_th->th.th_serial_team);
1798  team = master_th->th.th_team;
1799  // team->t.t_pkfn = microtask;
1800  team->t.t_invoke = invoker;
1801  __kmp_alloc_argv_entries(argc, team, TRUE);
1802  team->t.t_argc = argc;
1803  argv = (void **)team->t.t_argv;
1804  if (ap) {
1805  for (i = argc - 1; i >= 0; --i)
1806  *argv++ = va_arg(kmp_va_deref(ap), void *);
1807  } else {
1808  for (i = 0; i < argc; ++i)
1809  // Get args from parent team for teams construct
1810  argv[i] = parent_team->t.t_argv[i];
1811  }
1812  // AC: revert change made in __kmpc_serialized_parallel()
1813  // because initial code in teams should have level=0
1814  team->t.t_level--;
1815  // AC: call special invoker for outer "parallel" of teams construct
1816  invoker(gtid);
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 0,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1824  }
1825  if (ompt_enabled.ompt_callback_parallel_end) {
1826  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1827  &ompt_parallel_data, parent_task_data,
1828  OMPT_INVOKER(call_context) | ompt_parallel_league,
1829  return_address);
1830  }
1831  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1832  }
1833 #endif
1834  } else {
1835  argv = args;
1836  for (i = argc - 1; i >= 0; --i)
1837  *argv++ = va_arg(kmp_va_deref(ap), void *);
1838  KMP_MB();
1839 
1840 #if OMPT_SUPPORT
1841  void *dummy;
1842  void **exit_frame_p;
1843  ompt_task_info_t *task_info;
1844 
1845  ompt_lw_taskteam_t lw_taskteam;
1846 
1847  if (ompt_enabled.enabled) {
1848  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1849  &ompt_parallel_data, return_address);
1850  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1851  // don't use lw_taskteam after linking. content was swapped
1852  task_info = OMPT_CUR_TASK_INFO(master_th);
1853  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1854 
1855  /* OMPT implicit task begin */
1856  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1857  if (ompt_enabled.ompt_callback_implicit_task) {
1858  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1859  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1860  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1861  ompt_task_implicit);
1862  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1863  __kmp_tid_from_gtid(gtid);
1864  }
1865 
1866  /* OMPT state */
1867  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1868  } else {
1869  exit_frame_p = &dummy;
1870  }
1871 #endif
1872 
1873  {
1874  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1875  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1876  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1877 #if OMPT_SUPPORT
1878  ,
1879  exit_frame_p
1880 #endif
1881  );
1882  }
1883 
1884 #if OMPT_SUPPORT
1885  if (ompt_enabled.enabled) {
1886  *exit_frame_p = NULL;
1887  if (ompt_enabled.ompt_callback_implicit_task) {
1888  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889  ompt_scope_end, NULL, &(task_info->task_data), 1,
1890  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1891  ompt_task_implicit);
1892  }
1893 
1894  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1895  __ompt_lw_taskteam_unlink(master_th);
1896  if (ompt_enabled.ompt_callback_parallel_end) {
1897  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1898  &ompt_parallel_data, parent_task_data,
1899  OMPT_INVOKER(call_context) | ompt_parallel_team,
1900  return_address);
1901  }
1902  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1903  }
1904 #endif
1905  }
1906  } else if (call_context == fork_context_gnu) {
1907 #if OMPT_SUPPORT
1908  ompt_lw_taskteam_t lwt;
1909  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1910  return_address);
1911 
1912  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1913  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1914 // don't use lw_taskteam after linking. content was swapped
1915 #endif
1916 
1917  // we were called from GNU native code
1918  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1919  return FALSE;
1920  } else {
1921  KMP_ASSERT2(call_context < fork_context_last,
1922  "__kmp_fork_call: unknown fork_context parameter");
1923  }
1924 
1925  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1926  KMP_MB();
1927  return FALSE;
1928  } // if (nthreads == 1)
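  /* Illustrative note (not part of the original source): the nthreads == 1 branch
     above runs the region serialized on the calling thread. A hedged sketch of
     user code that typically ends up here:

       #include <omp.h>
       void serialized_examples(void) {
         omp_set_max_active_levels(1);
         #pragma omp parallel num_threads(1)   // explicit one-thread team
         { }
         #pragma omp parallel num_threads(2)
         {
           #pragma omp parallel                // nested: max-active-levels reached,
           { }                                 // so nthreads is forced to 1 above
         }
       }
  */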
1929 
1930  // GEH: only modify the executing flag in the case when not serialized;
1931  // the serialized case is handled in __kmpc_serialized_parallel
1932  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1933  "curtask=%p, curtask_max_aclevel=%d\n",
1934  parent_team->t.t_active_level, master_th,
1935  master_th->th.th_current_task,
1936  master_th->th.th_current_task->td_icvs.max_active_levels));
1937  // TODO: GEH - cannot do this assertion because root thread not set up as
1938  // executing
1939  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1940  master_th->th.th_current_task->td_flags.executing = 0;
1941 
1942  if (!master_th->th.th_teams_microtask || level > teams_level) {
1943  /* Increment our nested depth level */
1944  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1945  }
1946 
1947  // See if we need to make a copy of the ICVs.
1948  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1949  if ((level + 1 < __kmp_nested_nth.used) &&
1950  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1951  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1952  } else {
1953  nthreads_icv = 0; // don't update
1954  }
1955 
1956  // Figure out the proc_bind_policy for the new team.
1957  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1958  kmp_proc_bind_t proc_bind_icv =
1959  proc_bind_default; // proc_bind_default means don't update
1960  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1961  proc_bind = proc_bind_false;
1962  } else {
1963  if (proc_bind == proc_bind_default) {
1964  // No proc_bind clause specified; use current proc-bind-var for this
1965  // parallel region
1966  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1967  }
1968  /* else: The proc_bind policy was specified explicitly on parallel clause.
1969  This overrides proc-bind-var for this parallel region, but does not
1970  change proc-bind-var. */
1971  // Figure the value of proc-bind-var for the child threads.
1972  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1973  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1974  master_th->th.th_current_task->td_icvs.proc_bind)) {
1975  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1976  }
1977  }
1978 
1979  // Reset for next parallel region
1980  master_th->th.th_set_proc_bind = proc_bind_default;
1981 
1982  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1983  kmp_internal_control_t new_icvs;
1984  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1985  new_icvs.next = NULL;
1986  if (nthreads_icv > 0) {
1987  new_icvs.nproc = nthreads_icv;
1988  }
1989  if (proc_bind_icv != proc_bind_default) {
1990  new_icvs.proc_bind = proc_bind_icv;
1991  }
1992 
1993  /* allocate a new parallel team */
1994  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1995  team = __kmp_allocate_team(root, nthreads, nthreads,
1996 #if OMPT_SUPPORT
1997  ompt_parallel_data,
1998 #endif
1999  proc_bind, &new_icvs,
2000  argc USE_NESTED_HOT_ARG(master_th));
2001  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2002  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2003  } else {
2004  /* allocate a new parallel team */
2005  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2006  team = __kmp_allocate_team(root, nthreads, nthreads,
2007 #if OMPT_SUPPORT
2008  ompt_parallel_data,
2009 #endif
2010  proc_bind,
2011  &master_th->th.th_current_task->td_icvs,
2012  argc USE_NESTED_HOT_ARG(master_th));
2013  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2014  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2015  &master_th->th.th_current_task->td_icvs);
2016  }
2017  KF_TRACE(
2018  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2019 
2020  /* setup the new team */
2021  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2022  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2023  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2024  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2025  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2026 #if OMPT_SUPPORT
2027  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2028  return_address);
2029 #endif
2030  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2031  // TODO: parent_team->t.t_level == INT_MAX ???
2032  if (!master_th->th.th_teams_microtask || level > teams_level) {
2033  int new_level = parent_team->t.t_level + 1;
2034  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2035  new_level = parent_team->t.t_active_level + 1;
2036  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2037  } else {
2038  // AC: Do not increase parallel level at start of the teams construct
2039  int new_level = parent_team->t.t_level;
2040  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2041  new_level = parent_team->t.t_active_level;
2042  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2043  }
2044  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2045  // set primary thread's schedule as new run-time schedule
2046  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2047 
2048  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2049  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2050 
2051  // Update the floating point rounding in the team if required.
2052  propagateFPControl(team);
2053 #if OMPD_SUPPORT
2054  if (ompd_state & OMPD_ENABLE_BP)
2055  ompd_bp_parallel_begin();
2056 #endif
2057 
2058  if (__kmp_tasking_mode != tskm_immediate_exec) {
2059  // Set the primary thread's task team to the team's task team. Unless this is
2060  // a hot team, it should be NULL.
2061  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2062  parent_team->t.t_task_team[master_th->th.th_task_state]);
2063  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2064  "%p, new task_team %p / team %p\n",
2065  __kmp_gtid_from_thread(master_th),
2066  master_th->th.th_task_team, parent_team,
2067  team->t.t_task_team[master_th->th.th_task_state], team));
2068 
2069  if (active_level || master_th->th.th_task_team) {
2070  // Remember the primary thread's task_state
2071  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2072  if (master_th->th.th_task_state_top >=
2073  master_th->th.th_task_state_stack_sz) { // increase size
2074  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2075  kmp_uint8 *old_stack, *new_stack;
2076  kmp_uint32 i;
2077  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2078  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2079  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2080  }
2081  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2082  ++i) { // zero-init rest of stack
2083  new_stack[i] = 0;
2084  }
2085  old_stack = master_th->th.th_task_state_memo_stack;
2086  master_th->th.th_task_state_memo_stack = new_stack;
2087  master_th->th.th_task_state_stack_sz = new_size;
2088  __kmp_free(old_stack);
2089  }
2090  // Store primary thread's task_state on stack
2091  master_th->th
2092  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2093  master_th->th.th_task_state;
2094  master_th->th.th_task_state_top++;
2095 #if KMP_NESTED_HOT_TEAMS
2096  if (master_th->th.th_hot_teams &&
2097  active_level < __kmp_hot_teams_max_level &&
2098  team == master_th->th.th_hot_teams[active_level].hot_team) {
2099  // Restore primary thread's nested state if nested hot team
2100  master_th->th.th_task_state =
2101  master_th->th
2102  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2103  } else {
2104 #endif
2105  master_th->th.th_task_state = 0;
2106 #if KMP_NESTED_HOT_TEAMS
2107  }
2108 #endif
2109  }
2110 #if !KMP_NESTED_HOT_TEAMS
2111  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2112  (team == root->r.r_hot_team));
2113 #endif
2114  }
2115 
2116  KA_TRACE(
2117  20,
2118  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2119  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2120  team->t.t_nproc));
2121  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2122  (team->t.t_master_tid == 0 &&
2123  (team->t.t_parent == root->r.r_root_team ||
2124  team->t.t_parent->t.t_serialized)));
2125  KMP_MB();
2126 
2127  /* now, setup the arguments */
2128  argv = (void **)team->t.t_argv;
2129  if (ap) {
2130  for (i = argc - 1; i >= 0; --i) {
2131  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2132  KMP_CHECK_UPDATE(*argv, new_argv);
2133  argv++;
2134  }
2135  } else {
2136  for (i = 0; i < argc; ++i) {
2137  // Get args from parent team for teams construct
2138  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2139  }
2140  }
2141 
2142  /* now actually fork the threads */
2143  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2144  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2145  root->r.r_active = TRUE;
2146 
2147  __kmp_fork_team_threads(root, team, master_th, gtid);
2148  __kmp_setup_icv_copy(team, nthreads,
2149  &master_th->th.th_current_task->td_icvs, loc);
2150 
2151 #if OMPT_SUPPORT
2152  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2153 #endif
2154 
2155  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2156 
2157 #if USE_ITT_BUILD
2158  if (team->t.t_active_level == 1 // only report frames at level 1
2159  && !master_th->th.th_teams_microtask) { // not in teams construct
2160 #if USE_ITT_NOTIFY
2161  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2162  (__kmp_forkjoin_frames_mode == 3 ||
2163  __kmp_forkjoin_frames_mode == 1)) {
2164  kmp_uint64 tmp_time = 0;
2165  if (__itt_get_timestamp_ptr)
2166  tmp_time = __itt_get_timestamp();
2167  // Internal fork - report frame begin
2168  master_th->th.th_frame_time = tmp_time;
2169  if (__kmp_forkjoin_frames_mode == 3)
2170  team->t.t_region_time = tmp_time;
2171  } else
2172 // only one notification scheme (either "submit" or "forking/joined", not both)
2173 #endif /* USE_ITT_NOTIFY */
2174  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2175  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2176  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2177  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2178  }
2179  }
2180 #endif /* USE_ITT_BUILD */
2181 
2182  /* now go on and do the work */
2183  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2184  KMP_MB();
2185  KF_TRACE(10,
2186  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2187  root, team, master_th, gtid));
2188 
2189 #if USE_ITT_BUILD
2190  if (__itt_stack_caller_create_ptr) {
2191  // create new stack stitching id before entering fork barrier
2192  if (!enter_teams) {
2193  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2194  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2195  } else if (parent_team->t.t_serialized) {
2196  // keep stack stitching id in the serialized parent_team;
2197  // current team will be used for parallel inside the teams;
2198  // if parent_team is active, then it already keeps stack stitching id
2199  // for the league of teams
2200  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2201  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2202  }
2203  }
2204 #endif /* USE_ITT_BUILD */
2205 
2206  // AC: skip __kmp_internal_fork at the teams construct; let only the primary
2207  // threads execute
2208  if (ap) {
2209  __kmp_internal_fork(loc, gtid, team);
2210  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2211  "master_th=%p, gtid=%d\n",
2212  root, team, master_th, gtid));
2213  }
2214 
2215  if (call_context == fork_context_gnu) {
2216  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2217  return TRUE;
2218  }
2219 
2220  /* Invoke microtask for PRIMARY thread */
2221  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2222  team->t.t_id, team->t.t_pkfn));
2223  } // END of timer KMP_fork_call block
2224 
2225 #if KMP_STATS_ENABLED
2226  // If beginning a teams construct, then change thread state
2227  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2228  if (!ap) {
2229  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2230  }
2231 #endif
2232 
2233  if (!team->t.t_invoke(gtid)) {
2234  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2235  }
2236 
2237 #if KMP_STATS_ENABLED
2238  // If was beginning of a teams construct, then reset thread state
2239  if (!ap) {
2240  KMP_SET_THREAD_STATE(previous_state);
2241  }
2242 #endif
2243 
2244  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2245  team->t.t_id, team->t.t_pkfn));
2246  KMP_MB(); /* Flush all pending memory write invalidates. */
2247 
2248  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2249 #if OMPT_SUPPORT
2250  if (ompt_enabled.enabled) {
2251  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2252  }
2253 #endif
2254 
2255  return TRUE;
2256 }
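/* Illustrative note (not part of the original source): __kmp_fork_call is
   normally reached via the compiler-facing entry point __kmpc_fork_call. A
   rough, hedged sketch of how a parallel region is lowered (the outlined
   function, its name, and the exact codegen are illustrative only):

     // int x = 42;
     // #pragma omp parallel shared(x)
     // { use(x); }
     static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
       use(*x);
     }
     // ...at the parallel construct:
     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);  // argc == 1

   The trailing vararg pointers become the t_argv entries copied above and are
   handed to the microtask on every team member via __kmp_invoke_microtask.
*/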
2257 
2258 #if OMPT_SUPPORT
2259 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2260  kmp_team_t *team) {
2261  // restore state outside the region
2262  thread->th.ompt_thread_info.state =
2263  ((team->t.t_serialized) ? ompt_state_work_serial
2264  : ompt_state_work_parallel);
2265 }
2266 
2267 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2268  kmp_team_t *team, ompt_data_t *parallel_data,
2269  int flags, void *codeptr) {
2270  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2271  if (ompt_enabled.ompt_callback_parallel_end) {
2272  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2273  parallel_data, &(task_info->task_data), flags, codeptr);
2274  }
2275 
2276  task_info->frame.enter_frame = ompt_data_none;
2277  __kmp_join_restore_state(thread, team);
2278 }
2279 #endif
2280 
2281 void __kmp_join_call(ident_t *loc, int gtid
2282 #if OMPT_SUPPORT
2283  ,
2284  enum fork_context_e fork_context
2285 #endif
2286  ,
2287  int exit_teams) {
2288  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2289  kmp_team_t *team;
2290  kmp_team_t *parent_team;
2291  kmp_info_t *master_th;
2292  kmp_root_t *root;
2293  int master_active;
2294 
2295  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2296 
2297  /* setup current data */
2298  master_th = __kmp_threads[gtid];
2299  root = master_th->th.th_root;
2300  team = master_th->th.th_team;
2301  parent_team = team->t.t_parent;
2302 
2303  master_th->th.th_ident = loc;
2304 
2305 #if OMPT_SUPPORT
2306  void *team_microtask = (void *)team->t.t_pkfn;
2307  // For the GOMP interface with a serialized parallel, we need
2308  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2309  // end-implicit-task and end-parallel events.
2310  if (ompt_enabled.enabled &&
2311  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2312  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2313  }
2314 #endif
2315 
2316 #if KMP_DEBUG
2317  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2318  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2319  "th_task_team = %p\n",
2320  __kmp_gtid_from_thread(master_th), team,
2321  team->t.t_task_team[master_th->th.th_task_state],
2322  master_th->th.th_task_team));
2323  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2324  team->t.t_task_team[master_th->th.th_task_state]);
2325  }
2326 #endif
2327 
2328  if (team->t.t_serialized) {
2329  if (master_th->th.th_teams_microtask) {
2330  // We are in teams construct
2331  int level = team->t.t_level;
2332  int tlevel = master_th->th.th_teams_level;
2333  if (level == tlevel) {
2334  // AC: we haven't incremented it earlier at start of teams construct,
2335  // so do it here - at the end of teams construct
2336  team->t.t_level++;
2337  } else if (level == tlevel + 1) {
2338  // AC: we are exiting parallel inside teams, need to increment
2339  // serialization in order to restore it in the next call to
2340  // __kmpc_end_serialized_parallel
2341  team->t.t_serialized++;
2342  }
2343  }
2344  __kmpc_end_serialized_parallel(loc, gtid);
2345 
2346 #if OMPT_SUPPORT
2347  if (ompt_enabled.enabled) {
2348  __kmp_join_restore_state(master_th, parent_team);
2349  }
2350 #endif
2351 
2352  return;
2353  }
2354 
2355  master_active = team->t.t_master_active;
2356 
2357  if (!exit_teams) {
2358  // AC: No barrier for internal teams at exit from teams construct.
2359  // But there is barrier for external team (league).
2360  __kmp_internal_join(loc, gtid, team);
2361 #if USE_ITT_BUILD
2362  if (__itt_stack_caller_create_ptr) {
2363  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2364  // destroy the stack stitching id after join barrier
2365  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2366  team->t.t_stack_id = NULL;
2367  }
2368 #endif
2369  } else {
2370  master_th->th.th_task_state =
2371  0; // AC: no tasking in teams (out of any parallel)
2372 #if USE_ITT_BUILD
2373  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2374  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2375  // destroy the stack stitching id on exit from the teams construct
2376  // if parent_team is active, then the id will be destroyed later on
2377  // by master of the league of teams
2378  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2379  parent_team->t.t_stack_id = NULL;
2380  }
2381 #endif
2382 
2383  if (team->t.t_nproc > 1 &&
2384  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2385  team->t.b->update_num_threads(team->t.t_nproc);
2386  __kmp_add_threads_to_team(team, team->t.t_nproc);
2387  }
2388  }
2389 
2390  KMP_MB();
2391 
2392 #if OMPT_SUPPORT
2393  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2394  void *codeptr = team->t.ompt_team_info.master_return_address;
2395 #endif
2396 
2397 #if USE_ITT_BUILD
2398  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2399  if (team->t.t_active_level == 1 &&
2400  (!master_th->th.th_teams_microtask || /* not in teams construct */
2401  master_th->th.th_teams_size.nteams == 1)) {
2402  master_th->th.th_ident = loc;
2403  // only one notification scheme (either "submit" or "forking/joined", not
2404  // both)
2405  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2406  __kmp_forkjoin_frames_mode == 3)
2407  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2408  master_th->th.th_frame_time, 0, loc,
2409  master_th->th.th_team_nproc, 1);
2410  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2411  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2412  __kmp_itt_region_joined(gtid);
2413  } // active_level == 1
2414 #endif /* USE_ITT_BUILD */
2415 
2416  if (master_th->th.th_teams_microtask && !exit_teams &&
2417  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2418  team->t.t_level == master_th->th.th_teams_level + 1) {
2419 // AC: We need to leave the team structure intact at the end of a parallel
2420 // inside the teams construct, so that the next parallel reuses the same (hot)
2421 // team; only adjust the nesting levels
2422 #if OMPT_SUPPORT
2423  ompt_data_t ompt_parallel_data = ompt_data_none;
2424  if (ompt_enabled.enabled) {
2425  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426  if (ompt_enabled.ompt_callback_implicit_task) {
2427  int ompt_team_size = team->t.t_nproc;
2428  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2431  }
2432  task_info->frame.exit_frame = ompt_data_none;
2433  task_info->task_data = ompt_data_none;
2434  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2435  __ompt_lw_taskteam_unlink(master_th);
2436  }
2437 #endif
2438  /* Decrement our nested depth level */
2439  team->t.t_level--;
2440  team->t.t_active_level--;
2441  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2442 
2443  // Restore number of threads in the team if needed. This code relies on
2444  // the proper adjustment of th_teams_size.nth after the fork in
2445  // __kmp_teams_master on each teams primary thread in the case that
2446  // __kmp_reserve_threads reduced it.
2447  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2448  int old_num = master_th->th.th_team_nproc;
2449  int new_num = master_th->th.th_teams_size.nth;
2450  kmp_info_t **other_threads = team->t.t_threads;
2451  team->t.t_nproc = new_num;
2452  for (int i = 0; i < old_num; ++i) {
2453  other_threads[i]->th.th_team_nproc = new_num;
2454  }
2455  // Adjust states of non-used threads of the team
2456  for (int i = old_num; i < new_num; ++i) {
2457  // Re-initialize thread's barrier data.
2458  KMP_DEBUG_ASSERT(other_threads[i]);
2459  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2460  for (int b = 0; b < bs_last_barrier; ++b) {
2461  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2462  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2463 #if USE_DEBUGGER
2464  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2465 #endif
2466  }
2467  if (__kmp_tasking_mode != tskm_immediate_exec) {
2468  // Synchronize thread's task state
2469  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2470  }
2471  }
2472  }
2473 
2474 #if OMPT_SUPPORT
2475  if (ompt_enabled.enabled) {
2476  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2477  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2478  }
2479 #endif
2480 
2481  return;
2482  }
2483 
2484  /* do cleanup and restore the parent team */
2485  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2486  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2487 
2488  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2489 
2490  /* jc: The following lock has instructions with REL and ACQ semantics,
2491  separating the parallel user code called in this parallel region
2492  from the serial user code called after this function returns. */
2493  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2494 
2495  if (!master_th->th.th_teams_microtask ||
2496  team->t.t_level > master_th->th.th_teams_level) {
2497  /* Decrement our nested depth level */
2498  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2499  }
2500  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2501 
2502 #if OMPT_SUPPORT
2503  if (ompt_enabled.enabled) {
2504  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2505  if (ompt_enabled.ompt_callback_implicit_task) {
2506  int flags = (team_microtask == (void *)__kmp_teams_master)
2507  ? ompt_task_initial
2508  : ompt_task_implicit;
2509  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2510  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2511  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2512  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2513  }
2514  task_info->frame.exit_frame = ompt_data_none;
2515  task_info->task_data = ompt_data_none;
2516  }
2517 #endif
2518 
2519  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2520  master_th, team));
2521  __kmp_pop_current_task_from_thread(master_th);
2522 
2523 #if KMP_AFFINITY_SUPPORTED
2524  // Restore master thread's partition.
2525  master_th->th.th_first_place = team->t.t_first_place;
2526  master_th->th.th_last_place = team->t.t_last_place;
2527 #endif // KMP_AFFINITY_SUPPORTED
2528  master_th->th.th_def_allocator = team->t.t_def_allocator;
2529 
2530 #if OMPD_SUPPORT
2531  if (ompd_state & OMPD_ENABLE_BP)
2532  ompd_bp_parallel_end();
2533 #endif
2534  updateHWFPControl(team);
2535 
2536  if (root->r.r_active != master_active)
2537  root->r.r_active = master_active;
2538 
2539  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2540  master_th)); // this will free worker threads
2541 
2542  /* This race was fun to find. Make sure the following is inside the critical
2543  region; otherwise assertions may fail occasionally since the old team may be
2544  reallocated and the hierarchy would appear inconsistent. It is actually safe
2545  to run and won't cause any bugs, but it will cause those assertion failures.
2546  It's only one deref&assign, so we might as well keep it in the critical region. */
2547  master_th->th.th_team = parent_team;
2548  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2549  master_th->th.th_team_master = parent_team->t.t_threads[0];
2550  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2551 
2552  /* restore serialized team, if need be */
2553  if (parent_team->t.t_serialized &&
2554  parent_team != master_th->th.th_serial_team &&
2555  parent_team != root->r.r_root_team) {
2556  __kmp_free_team(root,
2557  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2558  master_th->th.th_serial_team = parent_team;
2559  }
2560 
2561  if (__kmp_tasking_mode != tskm_immediate_exec) {
2562  if (master_th->th.th_task_state_top >
2563  0) { // Restore task state from memo stack
2564  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2565  // Remember primary thread's state if we re-use this nested hot team
2566  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2567  master_th->th.th_task_state;
2568  --master_th->th.th_task_state_top; // pop
2569  // Now restore state at this level
2570  master_th->th.th_task_state =
2571  master_th->th
2572  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2573  }
2574  // Copy the task team from the parent team to the primary thread
2575  master_th->th.th_task_team =
2576  parent_team->t.t_task_team[master_th->th.th_task_state];
2577  KA_TRACE(20,
2578  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2579  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2580  parent_team));
2581  }
2582 
2583  // TODO: GEH - cannot do this assertion because root thread not set up as
2584  // executing
2585  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2586  master_th->th.th_current_task->td_flags.executing = 1;
2587 
2588  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2589 
2590 #if OMPT_SUPPORT
2591  int flags =
2592  OMPT_INVOKER(fork_context) |
2593  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2594  : ompt_parallel_team);
2595  if (ompt_enabled.enabled) {
2596  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2597  codeptr);
2598  }
2599 #endif
2600 
2601  KMP_MB();
2602  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2603 }
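/* Illustrative note (not part of the original source): __kmp_fork_call and
   __kmp_join_call bracket every parallel region; on join the workers are kept
   in the hot team (or thread pool) rather than destroyed, so back-to-back
   regions can reuse them. A hedged sketch:

     #include <omp.h>
     void reuse_hot_team(void) {
       #pragma omp parallel num_threads(4)  // fork: workers created on first use
       { }
       // join: workers parked at the fork barrier of the hot team
       #pragma omp parallel num_threads(4)  // fork: same workers reused, no respawn
       { }
     }
*/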
2604 
2605 /* Check whether we should push an internal control record onto the
2606  serial team stack. If so, do it. */
2607 void __kmp_save_internal_controls(kmp_info_t *thread) {
2608 
2609  if (thread->th.th_team != thread->th.th_serial_team) {
2610  return;
2611  }
2612  if (thread->th.th_team->t.t_serialized > 1) {
2613  int push = 0;
2614 
2615  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2616  push = 1;
2617  } else {
2618  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2619  thread->th.th_team->t.t_serialized) {
2620  push = 1;
2621  }
2622  }
2623  if (push) { /* push a record on the serial team's stack */
2624  kmp_internal_control_t *control =
2625  (kmp_internal_control_t *)__kmp_allocate(
2626  sizeof(kmp_internal_control_t));
2627 
2628  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2629 
2630  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2631 
2632  control->next = thread->th.th_team->t.t_control_stack_top;
2633  thread->th.th_team->t.t_control_stack_top = control;
2634  }
2635  }
2636 }
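/* Illustrative note (not part of the original source): records pushed here are
   popped, and the saved ICVs restored, by __kmpc_end_serialized_parallel, so
   ICV changes made inside a doubly-serialized nested parallel do not leak
   outward. A hedged sketch (both regions serialize, so t_serialized reaches 2):

     #include <omp.h>
     void icv_scoping(void) {
       #pragma omp parallel num_threads(1)     // serialized level 1
       {
         #pragma omp parallel num_threads(1)   // serialized level 2
         {
           omp_set_num_threads(8);  // pushed on the serial team's control stack
         }
         // the enclosing level's ICVs are restored when the inner region ends
       }
     }
*/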
2637 
2638 /* Changes set_nproc */
2639 void __kmp_set_num_threads(int new_nth, int gtid) {
2640  kmp_info_t *thread;
2641  kmp_root_t *root;
2642 
2643  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2644  KMP_DEBUG_ASSERT(__kmp_init_serial);
2645 
2646  if (new_nth < 1)
2647  new_nth = 1;
2648  else if (new_nth > __kmp_max_nth)
2649  new_nth = __kmp_max_nth;
2650 
2651  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2652  thread = __kmp_threads[gtid];
2653  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2654  return; // nothing to do
2655 
2656  __kmp_save_internal_controls(thread);
2657 
2658  set__nproc(thread, new_nth);
2659 
2660  // If this omp_set_num_threads() call will cause the hot team size to be
2661  // reduced (in the absence of a num_threads clause), then reduce it now,
2662  // rather than waiting for the next parallel region.
2663  root = thread->th.th_root;
2664  if (__kmp_init_parallel && (!root->r.r_active) &&
2665  (root->r.r_hot_team->t.t_nproc > new_nth)
2666 #if KMP_NESTED_HOT_TEAMS
2667  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2668 #endif
2669  ) {
2670  kmp_team_t *hot_team = root->r.r_hot_team;
2671  int f;
2672 
2673  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2674 
2675  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2676  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2677  }
2678  // Release the extra threads we don't need any more.
2679  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2680  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2681  if (__kmp_tasking_mode != tskm_immediate_exec) {
2682  // When decreasing team size, threads no longer in the team should unref
2683  // task team.
2684  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2685  }
2686  __kmp_free_thread(hot_team->t.t_threads[f]);
2687  hot_team->t.t_threads[f] = NULL;
2688  }
2689  hot_team->t.t_nproc = new_nth;
2690 #if KMP_NESTED_HOT_TEAMS
2691  if (thread->th.th_hot_teams) {
2692  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2693  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2694  }
2695 #endif
2696 
2697  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2698  hot_team->t.b->update_num_threads(new_nth);
2699  __kmp_add_threads_to_team(hot_team, new_nth);
2700  }
2701 
2702  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2703 
2704  // Update the t_nproc field in the threads that are still active.
2705  for (f = 0; f < new_nth; f++) {
2706  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2707  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2708  }
2709  // Special flag to indicate that omp_set_num_threads() was called
2710  hot_team->t.t_size_changed = -1;
2711  }
2712 }
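/* Illustrative note (not part of the original source): __kmp_set_num_threads
   backs the standard call omp_set_num_threads(); it updates the calling task's
   nproc ICV and may shrink the hot team immediately, as described above. A
   minimal, hedged usage sketch:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_num_threads(4);              // sets nthreads-var for this task
       #pragma omp parallel                 // team of up to 4 threads
       {
         #pragma omp single
         printf("team size: %d\n", omp_get_num_threads());
       }
       omp_set_num_threads(2);              // hot team may be shrunk right here
       return 0;
     }
*/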
2713 
2714 /* Changes max_active_levels */
2715 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2716  kmp_info_t *thread;
2717 
2718  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2719  "%d = (%d)\n",
2720  gtid, max_active_levels));
2721  KMP_DEBUG_ASSERT(__kmp_init_serial);
2722 
2723  // validate max_active_levels
2724  if (max_active_levels < 0) {
2725  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2726  // We ignore this call if the user has specified a negative value.
2727  // The current setting won't be changed. The last valid setting will be
2728  // used. A warning will be issued (if warnings are allowed as controlled by
2729  // the KMP_WARNINGS env var).
2730  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2731  "max_active_levels for thread %d = (%d)\n",
2732  gtid, max_active_levels));
2733  return;
2734  }
2735  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2736  // it's OK, the max_active_levels is within the valid range: [ 0;
2737  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2738  // We allow a zero value. (implementation defined behavior)
2739  } else {
2740  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2741  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2742  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2743  // Current upper limit is MAX_INT. (implementation defined behavior)
2744  // If the input exceeds the upper limit, we correct the input to be the
2745  // upper limit. (implementation defined behavior)
2746  // Actually, the flow should never get here as long as the limit is MAX_INT.
2747  }
2748  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2749  "max_active_levels for thread %d = (%d)\n",
2750  gtid, max_active_levels));
2751 
2752  thread = __kmp_threads[gtid];
2753 
2754  __kmp_save_internal_controls(thread);
2755 
2756  set__max_active_levels(thread, max_active_levels);
2757 }
2758 
2759 /* Gets max_active_levels */
2760 int __kmp_get_max_active_levels(int gtid) {
2761  kmp_info_t *thread;
2762 
2763  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2764  KMP_DEBUG_ASSERT(__kmp_init_serial);
2765 
2766  thread = __kmp_threads[gtid];
2767  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2768  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2769  "curtask_maxaclevel=%d\n",
2770  gtid, thread->th.th_current_task,
2771  thread->th.th_current_task->td_icvs.max_active_levels));
2772  return thread->th.th_current_task->td_icvs.max_active_levels;
2773 }
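/* Illustrative note (not part of the original source): the two routines above
   back omp_set_max_active_levels() / omp_get_max_active_levels(), which control
   how many nested parallel regions may stay active. A hedged sketch:

     #include <omp.h>
     #include <stdio.h>
     void nesting_demo(void) {
       omp_set_max_active_levels(2);          // allow two active parallel levels
       #pragma omp parallel num_threads(2)    // level 1: active
       {
         #pragma omp parallel num_threads(2)  // level 2: active
         {
           #pragma omp parallel               // level 3: serialized (limit reached)
           printf("level %d, team of %d\n", omp_get_level(),
                  omp_get_num_threads());
         }
       }
       printf("max active levels: %d\n", omp_get_max_active_levels());
     }
*/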
2774 
2775 // nteams-var per-device ICV
2776 void __kmp_set_num_teams(int num_teams) {
2777  if (num_teams > 0)
2778  __kmp_nteams = num_teams;
2779 }
2780 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2781 // teams-thread-limit-var per-device ICV
2782 void __kmp_set_teams_thread_limit(int limit) {
2783  if (limit > 0)
2784  __kmp_teams_thread_limit = limit;
2785 }
2786 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
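/* Illustrative note (not part of the original source): these setters/getters
   back the OpenMP 5.1 routines omp_set_num_teams(), omp_get_max_teams(),
   omp_set_teams_thread_limit() and omp_get_teams_thread_limit(), which supply
   defaults for teams constructs that lack explicit clauses (assuming host
   support for the teams construct). A hedged sketch:

     #include <omp.h>
     void teams_defaults(void) {
       omp_set_num_teams(4);            // nteams-var: default league size
       omp_set_teams_thread_limit(8);   // teams-thread-limit-var
       #pragma omp teams                // no num_teams/thread_limit clauses:
       {                                // up to 4 teams of at most 8 threads
         // ...
       }
     }
*/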
2787 
2788 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2789 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2790 
2791 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2792 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2793  kmp_info_t *thread;
2794  kmp_sched_t orig_kind;
2795  // kmp_team_t *team;
2796 
2797  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2798  gtid, (int)kind, chunk));
2799  KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801  // Check if the kind parameter is valid, correct if needed.
2802  // Valid parameters should fit in one of two intervals - standard or extended:
2803  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2804  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2805  orig_kind = kind;
2806  kind = __kmp_sched_without_mods(kind);
2807 
2808  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2809  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2810  // TODO: Hint needs attention in case we change the default schedule.
2811  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2812  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2813  __kmp_msg_null);
2814  kind = kmp_sched_default;
2815  chunk = 0; // ignore chunk value in case of bad kind
2816  }
2817 
2818  thread = __kmp_threads[gtid];
2819 
2820  __kmp_save_internal_controls(thread);
2821 
2822  if (kind < kmp_sched_upper_std) {
2823  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2824  // differentiate static chunked vs. unchunked: chunk should be invalid to
2825  // indicate an unchunked schedule (which is the default)
2826  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2827  } else {
2828  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2829  __kmp_sch_map[kind - kmp_sched_lower - 1];
2830  }
2831  } else {
2832  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2833  // kmp_sched_lower - 2 ];
2834  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2835  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2836  kmp_sched_lower - 2];
2837  }
2838  __kmp_sched_apply_mods_intkind(
2839  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2840  if (kind == kmp_sched_auto || chunk < 1) {
2841  // ignore parameter chunk for schedule auto
2842  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2843  } else {
2844  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2845  }
2846 }
2847 
2848 /* Gets def_sched_var ICV values */
2849 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2850  kmp_info_t *thread;
2851  enum sched_type th_type;
2852 
2853  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2854  KMP_DEBUG_ASSERT(__kmp_init_serial);
2855 
2856  thread = __kmp_threads[gtid];
2857 
2858  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2859  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2860  case kmp_sch_static:
2861  case kmp_sch_static_greedy:
2862  case kmp_sch_static_balanced:
2863  *kind = kmp_sched_static;
2864  __kmp_sched_apply_mods_stdkind(kind, th_type);
2865  *chunk = 0; // chunk was not set, try to show this fact via zero value
2866  return;
2867  case kmp_sch_static_chunked:
2868  *kind = kmp_sched_static;
2869  break;
2870  case kmp_sch_dynamic_chunked:
2871  *kind = kmp_sched_dynamic;
2872  *kind = kmp_sched_dynamic;
2873  break;
2874  case kmp_sch_guided_iterative_chunked:
2875  case kmp_sch_guided_analytical_chunked:
2876  *kind = kmp_sched_guided;
2877  break;
2878  case kmp_sch_auto:
2879  *kind = kmp_sched_auto;
2880  break;
2881  case kmp_sch_trapezoidal:
2882  *kind = kmp_sched_trapezoidal;
2883  break;
2884 #if KMP_STATIC_STEAL_ENABLED
2885  case kmp_sch_static_steal:
2886  *kind = kmp_sched_static_steal;
2887  break;
2888 #endif
2889  default:
2890  KMP_FATAL(UnknownSchedulingType, th_type);
2891  }
2892 
2893  __kmp_sched_apply_mods_stdkind(kind, th_type);
2894  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2895 }
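/* Illustrative note (not part of the original source): __kmp_set_schedule and
   __kmp_get_schedule back omp_set_schedule() / omp_get_schedule(), which
   control the schedule applied to schedule(runtime) loops. A hedged sketch:

     #include <omp.h>
     void runtime_schedule(int n) {
       omp_set_schedule(omp_sched_dynamic, 4);   // run-sched-var = dynamic,4
       #pragma omp parallel for schedule(runtime)
       for (int i = 0; i < n; ++i) {
         // iterations handed out dynamically in chunks of 4
       }
       omp_sched_t kind; int chunk;
       omp_get_schedule(&kind, &chunk);  // kind == omp_sched_dynamic, chunk == 4
     }
*/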
2896 
2897 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2898 
2899  int ii, dd;
2900  kmp_team_t *team;
2901  kmp_info_t *thr;
2902 
2903  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2904  KMP_DEBUG_ASSERT(__kmp_init_serial);
2905 
2906  // validate level
2907  if (level == 0)
2908  return 0;
2909  if (level < 0)
2910  return -1;
2911  thr = __kmp_threads[gtid];
2912  team = thr->th.th_team;
2913  ii = team->t.t_level;
2914  if (level > ii)
2915  return -1;
2916 
2917  if (thr->th.th_teams_microtask) {
2918  // AC: we are in a teams region where multiple nested teams have the same level
2919  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2920  if (level <=
2921  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2922  KMP_DEBUG_ASSERT(ii >= tlevel);
2923  // AC: As we need to pass by the teams league, we need to artificially
2924  // increase ii
2925  if (ii == tlevel) {
2926  ii += 2; // three teams have same level
2927  } else {
2928  ii++; // two teams have same level
2929  }
2930  }
2931  }
2932 
2933  if (ii == level)
2934  return __kmp_tid_from_gtid(gtid);
2935 
2936  dd = team->t.t_serialized;
2937  level++;
2938  while (ii > level) {
2939  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2940  }
2941  if ((team->t.t_serialized) && (!dd)) {
2942  team = team->t.t_parent;
2943  continue;
2944  }
2945  if (ii > level) {
2946  team = team->t.t_parent;
2947  dd = team->t.t_serialized;
2948  ii--;
2949  }
2950  }
2951 
2952  return (dd > 1) ? (0) : (team->t.t_master_tid);
2953 }
2954 
2955 int __kmp_get_team_size(int gtid, int level) {
2956 
2957  int ii, dd;
2958  kmp_team_t *team;
2959  kmp_info_t *thr;
2960 
2961  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2962  KMP_DEBUG_ASSERT(__kmp_init_serial);
2963 
2964  // validate level
2965  if (level == 0)
2966  return 1;
2967  if (level < 0)
2968  return -1;
2969  thr = __kmp_threads[gtid];
2970  team = thr->th.th_team;
2971  ii = team->t.t_level;
2972  if (level > ii)
2973  return -1;
2974 
2975  if (thr->th.th_teams_microtask) {
2976  // AC: we are in a teams region where multiple nested teams have the same level
2977  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2978  if (level <=
2979  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2980  KMP_DEBUG_ASSERT(ii >= tlevel);
2981  // AC: As we need to pass by the teams league, we need to artificially
2982  // increase ii
2983  if (ii == tlevel) {
2984  ii += 2; // three teams have same level
2985  } else {
2986  ii++; // two teams have same level
2987  }
2988  }
2989  }
2990 
2991  while (ii > level) {
2992  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2993  }
2994  if (team->t.t_serialized && (!dd)) {
2995  team = team->t.t_parent;
2996  continue;
2997  }
2998  if (ii > level) {
2999  team = team->t.t_parent;
3000  ii--;
3001  }
3002  }
3003 
3004  return team->t.t_nproc;
3005 }
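/* Illustrative note (not part of the original source): the two routines above
   back omp_get_ancestor_thread_num() and omp_get_team_size(); both walk the
   team hierarchy, skipping serialized levels as shown. A hedged sketch:

     #include <omp.h>
     #include <stdio.h>
     void hierarchy_query(void) {
       omp_set_max_active_levels(2);
       #pragma omp parallel num_threads(3)
       {
         #pragma omp parallel num_threads(2)
         {
           int lvl = omp_get_level();
           printf("outer tid %d of %d, inner tid %d of %d\n",
                  omp_get_ancestor_thread_num(1), omp_get_team_size(1),
                  omp_get_ancestor_thread_num(lvl), omp_get_team_size(lvl));
         }
       }
     }
*/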
3006 
3007 kmp_r_sched_t __kmp_get_schedule_global() {
3008  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3009  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3010  // independently, so the updated schedule can be obtained here.
3011 
3012  kmp_r_sched_t r_sched;
3013 
3014  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3015  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3016  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3017  // different roots (even in OMP 2.5)
3018  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3019  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3020  if (s == kmp_sch_static) {
3021  // replace STATIC with more detailed schedule (balanced or greedy)
3022  r_sched.r_sched_type = __kmp_static;
3023  } else if (s == kmp_sch_guided_chunked) {
3024  // replace GUIDED with more detailed schedule (iterative or analytical)
3025  r_sched.r_sched_type = __kmp_guided;
3026  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3027  r_sched.r_sched_type = __kmp_sched;
3028  }
3029  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3030 
3031  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3032  // __kmp_chunk may be wrong here (if it was not ever set)
3033  r_sched.chunk = KMP_DEFAULT_CHUNK;
3034  } else {
3035  r_sched.chunk = __kmp_chunk;
3036  }
3037 
3038  return r_sched;
3039 }
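/* Illustrative note (not part of the original source): the globals combined here
   (__kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided) are primarily set from
   the OMP_SCHEDULE / KMP_SCHEDULE environment variables and become the initial
   run-sched-var for each root. A hedged usage sketch:

     // shell:  OMP_SCHEDULE="guided,4" ./a.out
     #include <omp.h>
     void env_schedule(int n) {
       #pragma omp parallel for schedule(runtime)  // picks up guided,4 from the env
       for (int i = 0; i < n; ++i) {
         // ...
       }
     }
*/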
3040 
3041 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3042  at least argc *t_argv entries for the requested team. */
3043 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3044 
3045  KMP_DEBUG_ASSERT(team);
3046  if (!realloc || argc > team->t.t_max_argc) {
3047 
3048  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3049  "current entries=%d\n",
3050  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3051  /* if previously allocated heap space for args, free them */
3052  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3053  __kmp_free((void *)team->t.t_argv);
3054 
3055  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3056  /* use unused space in the cache line for arguments */
3057  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3058  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3059  "argv entries\n",
3060  team->t.t_id, team->t.t_max_argc));
3061  team->t.t_argv = &team->t.t_inline_argv[0];
3062  if (__kmp_storage_map) {
3063  __kmp_print_storage_map_gtid(
3064  -1, &team->t.t_inline_argv[0],
3065  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3066  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3067  team->t.t_id);
3068  }
3069  } else {
3070  /* allocate space for arguments in the heap */
3071  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3072  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3073  : 2 * argc;
3074  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3075  "argv entries\n",
3076  team->t.t_id, team->t.t_max_argc));
3077  team->t.t_argv =
3078  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3079  if (__kmp_storage_map) {
3080  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3081  &team->t.t_argv[team->t.t_max_argc],
3082  sizeof(void *) * team->t.t_max_argc,
3083  "team_%d.t_argv", team->t.t_id);
3084  }
3085  }
3086  }
3087 }
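// Illustrative sizing example (not part of the runtime), assuming
// KMP_INLINE_ARGV_ENTRIES == 10 and KMP_MIN_MALLOC_ARGV_ENTRIES == 100:
//   argc ==   3 -> t_argv points into t_inline_argv, t_max_argc == 10
//   argc ==  40 -> heap allocation of KMP_MIN_MALLOC_ARGV_ENTRIES entries
//   argc == 300 -> heap allocation of 2 * argc == 600 entries
// Over-allocating gives headroom so that repeated forks with a growing argc
// do not reallocate on every call.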
3088 
3089 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3090  int i;
3091  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3092  team->t.t_threads =
3093  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3094  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3095  sizeof(dispatch_shared_info_t) * num_disp_buff);
3096  team->t.t_dispatch =
3097  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3098  team->t.t_implicit_task_taskdata =
3099  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3100  team->t.t_max_nproc = max_nth;
3101 
3102  /* setup dispatch buffers */
3103  for (i = 0; i < num_disp_buff; ++i) {
3104  team->t.t_disp_buffer[i].buffer_index = i;
3105  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3106  }
3107 }
3108 
3109 static void __kmp_free_team_arrays(kmp_team_t *team) {
3110  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3111  int i;
3112  for (i = 0; i < team->t.t_max_nproc; ++i) {
3113  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3114  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3115  team->t.t_dispatch[i].th_disp_buffer = NULL;
3116  }
3117  }
3118 #if KMP_USE_HIER_SCHED
3119  __kmp_dispatch_free_hierarchies(team);
3120 #endif
3121  __kmp_free(team->t.t_threads);
3122  __kmp_free(team->t.t_disp_buffer);
3123  __kmp_free(team->t.t_dispatch);
3124  __kmp_free(team->t.t_implicit_task_taskdata);
3125  team->t.t_threads = NULL;
3126  team->t.t_disp_buffer = NULL;
3127  team->t.t_dispatch = NULL;
3128  team->t.t_implicit_task_taskdata = 0;
3129 }
3130 
3131 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3132  kmp_info_t **oldThreads = team->t.t_threads;
3133 
3134  __kmp_free(team->t.t_disp_buffer);
3135  __kmp_free(team->t.t_dispatch);
3136  __kmp_free(team->t.t_implicit_task_taskdata);
3137  __kmp_allocate_team_arrays(team, max_nth);
3138 
3139  KMP_MEMCPY(team->t.t_threads, oldThreads,
3140  team->t.t_nproc * sizeof(kmp_info_t *));
3141 
3142  __kmp_free(oldThreads);
3143 }
3144 
3145 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3146 
3147  kmp_r_sched_t r_sched =
3148  __kmp_get_schedule_global(); // get current state of scheduling globals
3149 
3150  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3151 
3152  kmp_internal_control_t g_icvs = {
3153  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3154  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3155  // adjustment of threads (per thread)
3156  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3157  // whether blocktime is explicitly set
3158  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3159 #if KMP_USE_MONITOR
3160  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3161 // intervals
3162 #endif
3163  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3164  // next parallel region (per thread)
3165  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3166  __kmp_cg_max_nth, // int thread_limit;
3167  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3168  // for max_active_levels
3169  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3170  // {sched,chunk} pair
3171  __kmp_nested_proc_bind.bind_types[0],
3172  __kmp_default_device,
3173  NULL // struct kmp_internal_control *next;
3174  };
3175 
3176  return g_icvs;
3177 }
3178 
3179 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3180 
3181  kmp_internal_control_t gx_icvs;
3182  gx_icvs.serial_nesting_level =
3183  0; // probably =team->t.t_serial like in save_inter_controls
3184  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3185  gx_icvs.next = NULL;
3186 
3187  return gx_icvs;
3188 }
3189 
3190 static void __kmp_initialize_root(kmp_root_t *root) {
3191  int f;
3192  kmp_team_t *root_team;
3193  kmp_team_t *hot_team;
3194  int hot_team_max_nth;
3195  kmp_r_sched_t r_sched =
3196  __kmp_get_schedule_global(); // get current state of scheduling globals
3197  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3198  KMP_DEBUG_ASSERT(root);
3199  KMP_ASSERT(!root->r.r_begin);
3200 
3201  /* setup the root state structure */
3202  __kmp_init_lock(&root->r.r_begin_lock);
3203  root->r.r_begin = FALSE;
3204  root->r.r_active = FALSE;
3205  root->r.r_in_parallel = 0;
3206  root->r.r_blocktime = __kmp_dflt_blocktime;
3207 #if KMP_AFFINITY_SUPPORTED
3208  root->r.r_affinity_assigned = FALSE;
3209 #endif
3210 
3211  /* setup the root team for this task */
3212  /* allocate the root team structure */
3213  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3214 
3215  root_team =
3216  __kmp_allocate_team(root,
3217  1, // new_nproc
3218  1, // max_nproc
3219 #if OMPT_SUPPORT
3220  ompt_data_none, // root parallel id
3221 #endif
3222  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3223  0 // argc
3224  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3225  );
3226 #if USE_DEBUGGER
3227  // A non-NULL value should be assigned to make the debugger display the root
3228  // team.
3229  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3230 #endif
3231 
3232  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3233 
3234  root->r.r_root_team = root_team;
3235  root_team->t.t_control_stack_top = NULL;
3236 
3237  /* initialize root team */
3238  root_team->t.t_threads[0] = NULL;
3239  root_team->t.t_nproc = 1;
3240  root_team->t.t_serialized = 1;
3241  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3242  root_team->t.t_sched.sched = r_sched.sched;
3243  KA_TRACE(
3244  20,
3245  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3246  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3247 
3248  /* setup the hot team for this task */
3249  /* allocate the hot team structure */
3250  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3251 
3252  hot_team =
3253  __kmp_allocate_team(root,
3254  1, // new_nproc
3255  __kmp_dflt_team_nth_ub * 2, // max_nproc
3256 #if OMPT_SUPPORT
3257  ompt_data_none, // root parallel id
3258 #endif
3259  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3260  0 // argc
3261  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3262  );
3263  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3264 
3265  root->r.r_hot_team = hot_team;
3266  root_team->t.t_control_stack_top = NULL;
3267 
3268  /* first-time initialization */
3269  hot_team->t.t_parent = root_team;
3270 
3271  /* initialize hot team */
3272  hot_team_max_nth = hot_team->t.t_max_nproc;
3273  for (f = 0; f < hot_team_max_nth; ++f) {
3274  hot_team->t.t_threads[f] = NULL;
3275  }
3276  hot_team->t.t_nproc = 1;
3277  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3278  hot_team->t.t_sched.sched = r_sched.sched;
3279  hot_team->t.t_size_changed = 0;
3280 }
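// Illustrative note (not part of the runtime): after the routine above, each
// root owns two teams -- a single-slot root team representing the serial
// region, and a hot team (with up to __kmp_dflt_team_nth_ub * 2 slots) that
// successive parallel regions can reuse instead of re-creating worker slots
// on every fork.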
3281 
3282 #ifdef KMP_DEBUG
3283 
3284 typedef struct kmp_team_list_item {
3285  kmp_team_p const *entry;
3286  struct kmp_team_list_item *next;
3287 } kmp_team_list_item_t;
3288 typedef kmp_team_list_item_t *kmp_team_list_t;
3289 
3290 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3291  kmp_team_list_t list, // List of teams.
3292  kmp_team_p const *team // Team to add.
3293 ) {
3294 
3295  // The list must terminate with an item whose entry and next are both NULL.
3296  // A team is added to the list only once.
3297  // The list is sorted in ascending order by team id.
3298  // Team id is *not* a key.
3299 
3300  kmp_team_list_t l;
3301 
3302  KMP_DEBUG_ASSERT(list != NULL);
3303  if (team == NULL) {
3304  return;
3305  }
3306 
3307  __kmp_print_structure_team_accum(list, team->t.t_parent);
3308  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3309 
3310  // Search list for the team.
3311  l = list;
3312  while (l->next != NULL && l->entry != team) {
3313  l = l->next;
3314  }
3315  if (l->next != NULL) {
3316  return; // Team has been added before, exit.
3317  }
3318 
3319  // Team is not found. Search list again for insertion point.
3320  l = list;
3321  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3322  l = l->next;
3323  }
3324 
3325  // Insert team.
3326  {
3327  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3328  sizeof(kmp_team_list_item_t));
3329  *item = *l;
3330  l->entry = team;
3331  l->next = item;
3332  }
3333 }
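// Illustrative example (not part of the runtime): if the list currently holds
// team ids {2, 5} and team 4 is accumulated, the insertion loop above stops at
// the item holding 5; that item's old contents move into a freshly malloc'ed
// node, the slot is overwritten with team 4, and the list becomes {2, 4, 5},
// still sorted by id. Parents and pool links are accumulated first, so the
// whole reachable team graph ends up in the list exactly once.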
3334 
3335 static void __kmp_print_structure_team(char const *title,
3336                                        kmp_team_p const *team) {
3338  __kmp_printf("%s", title);
3339  if (team != NULL) {
3340  __kmp_printf("%2x %p\n", team->t.t_id, team);
3341  } else {
3342  __kmp_printf(" - (nil)\n");
3343  }
3344 }
3345 
3346 static void __kmp_print_structure_thread(char const *title,
3347  kmp_info_p const *thread) {
3348  __kmp_printf("%s", title);
3349  if (thread != NULL) {
3350  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3351  } else {
3352  __kmp_printf(" - (nil)\n");
3353  }
3354 }
3355 
3356 void __kmp_print_structure(void) {
3357 
3358  kmp_team_list_t list;
3359 
3360  // Initialize list of teams.
3361  list =
3362  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3363  list->entry = NULL;
3364  list->next = NULL;
3365 
3366  __kmp_printf("\n------------------------------\nGlobal Thread "
3367  "Table\n------------------------------\n");
3368  {
3369  int gtid;
3370  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3371  __kmp_printf("%2d", gtid);
3372  if (__kmp_threads != NULL) {
3373  __kmp_printf(" %p", __kmp_threads[gtid]);
3374  }
3375  if (__kmp_root != NULL) {
3376  __kmp_printf(" %p", __kmp_root[gtid]);
3377  }
3378  __kmp_printf("\n");
3379  }
3380  }
3381 
3382  // Print out __kmp_threads array.
3383  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3384  "----------\n");
3385  if (__kmp_threads != NULL) {
3386  int gtid;
3387  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3388  kmp_info_t const *thread = __kmp_threads[gtid];
3389  if (thread != NULL) {
3390  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3391  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3392  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3393  __kmp_print_structure_team(" Serial Team: ",
3394  thread->th.th_serial_team);
3395  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3396  __kmp_print_structure_thread(" Primary: ",
3397  thread->th.th_team_master);
3398  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3399  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3400  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3401  __kmp_print_structure_thread(" Next in pool: ",
3402  thread->th.th_next_pool);
3403  __kmp_printf("\n");
3404  __kmp_print_structure_team_accum(list, thread->th.th_team);
3405  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3406  }
3407  }
3408  } else {
3409  __kmp_printf("Threads array is not allocated.\n");
3410  }
3411 
3412  // Print out __kmp_root array.
3413  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3414  "--------\n");
3415  if (__kmp_root != NULL) {
3416  int gtid;
3417  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3418  kmp_root_t const *root = __kmp_root[gtid];
3419  if (root != NULL) {
3420  __kmp_printf("GTID %2d %p:\n", gtid, root);
3421  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3422  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3423  __kmp_print_structure_thread(" Uber Thread: ",
3424  root->r.r_uber_thread);
3425  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3426  __kmp_printf(" In Parallel: %2d\n",
3427  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3428  __kmp_printf("\n");
3429  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3430  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3431  }
3432  }
3433  } else {
3434  __kmp_printf("Ubers array is not allocated.\n");
3435  }
3436 
3437  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3438  "--------\n");
3439  while (list->next != NULL) {
3440  kmp_team_p const *team = list->entry;
3441  int i;
3442  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3443  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3444  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3445  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3446  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3447  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3448  for (i = 0; i < team->t.t_nproc; ++i) {
3449  __kmp_printf(" Thread %2d: ", i);
3450  __kmp_print_structure_thread("", team->t.t_threads[i]);
3451  }
3452  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3453  __kmp_printf("\n");
3454  list = list->next;
3455  }
3456 
3457  // Print out __kmp_thread_pool and __kmp_team_pool.
3458  __kmp_printf("\n------------------------------\nPools\n----------------------"
3459  "--------\n");
3460  __kmp_print_structure_thread("Thread pool: ",
3461  CCAST(kmp_info_t *, __kmp_thread_pool));
3462  __kmp_print_structure_team("Team pool: ",
3463  CCAST(kmp_team_t *, __kmp_team_pool));
3464  __kmp_printf("\n");
3465 
3466  // Free team list.
3467  while (list != NULL) {
3468  kmp_team_list_item_t *item = list;
3469  list = list->next;
3470  KMP_INTERNAL_FREE(item);
3471  }
3472 }
3473 
3474 #endif
3475 
3476 //---------------------------------------------------------------------------
3477 // Stuff for per-thread fast random number generator
3478 // Table of primes
3479 static const unsigned __kmp_primes[] = {
3480  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3481  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3482  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3483  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3484  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3485  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3486  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3487  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3488  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3489  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3490  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3491 
3492 //---------------------------------------------------------------------------
3493 // __kmp_get_random: Get a random number using a linear congruential method.
3494 unsigned short __kmp_get_random(kmp_info_t *thread) {
3495  unsigned x = thread->th.th_x;
3496  unsigned short r = (unsigned short)(x >> 16);
3497 
3498  thread->th.th_x = x * thread->th.th_a + 1;
3499 
3500  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3501  thread->th.th_info.ds.ds_tid, r));
3502 
3503  return r;
3504 }
3505 //--------------------------------------------------------
3506 // __kmp_init_random: Initialize a random number generator
3507 void __kmp_init_random(kmp_info_t *thread) {
3508  unsigned seed = thread->th.th_info.ds.ds_tid;
3509 
3510  thread->th.th_a =
3511  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3512  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3513  KA_TRACE(30,
3514  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3515 }
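// Illustrative sketch (not part of the runtime): together, the two routines
// above form a per-thread linear congruential generator,
//   x_{n+1} = a * x_n + 1   (unsigned 32-bit arithmetic),
// whose output is the high 16 bits of the current state. For example, with
// tid standing for the thread's ds_tid:
//   unsigned a = __kmp_primes[tid % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
//   unsigned x = (tid + 1) * a + 1;               // seed, as in __kmp_init_random
//   unsigned short r = (unsigned short)(x >> 16); // first draw
//   x = x * a + 1;                                // advance, as in __kmp_get_random
// Selecting the multiplier by tid is intended to decorrelate the per-thread
// streams.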
3516 
3517 #if KMP_OS_WINDOWS
3518 /* reclaim array entries for root threads that are already dead, returns number
3519  * reclaimed */
3520 static int __kmp_reclaim_dead_roots(void) {
3521  int i, r = 0;
3522 
3523  for (i = 0; i < __kmp_threads_capacity; ++i) {
3524  if (KMP_UBER_GTID(i) &&
3525  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3526  !__kmp_root[i]
3527  ->r.r_active) { // AC: reclaim only roots died in non-active state
3528  r += __kmp_unregister_root_other_thread(i);
3529  }
3530  }
3531  return r;
3532 }
3533 #endif
3534 
3535 /* This function attempts to create free entries in __kmp_threads and
3536  __kmp_root, and returns the number of free entries generated.
3537 
3538  For Windows* OS static library, the first mechanism used is to reclaim array
3539  entries for root threads that are already dead.
3540 
3541  On all platforms, expansion is attempted on the arrays __kmp_threads and
3542  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3543  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3544  threadprivate cache array has been created. Synchronization with
3545  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3546 
3547  After any dead root reclamation, if the clipping value allows array expansion
3548  to result in the generation of a total of nNeed free slots, the function does
3549  that expansion. If not, nothing is done beyond the possible initial root
3550  thread reclamation.
3551 
3552  If any argument is negative, the behavior is undefined. */
3553 static int __kmp_expand_threads(int nNeed) {
3554  int added = 0;
3555  int minimumRequiredCapacity;
3556  int newCapacity;
3557  kmp_info_t **newThreads;
3558  kmp_root_t **newRoot;
3559 
3560  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3561  // resizing __kmp_threads does not need additional protection if foreign
3562  // threads are present
3563 
3564 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3565  /* only for Windows static library */
3566  /* reclaim array entries for root threads that are already dead */
3567  added = __kmp_reclaim_dead_roots();
3568 
3569  if (nNeed) {
3570  nNeed -= added;
3571  if (nNeed < 0)
3572  nNeed = 0;
3573  }
3574 #endif
3575  if (nNeed <= 0)
3576  return added;
3577 
3578  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3579  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3580  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3581  // > __kmp_max_nth in one of two ways:
3582  //
3583  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3584  // may not be reused by another thread, so we may need to increase
3585  // __kmp_threads_capacity to __kmp_max_nth + 1.
3586  //
3587  // 2) New foreign root(s) are encountered. We always register new foreign
3588  // roots. This may cause a smaller # of threads to be allocated at
3589  // subsequent parallel regions, but the worker threads hang around (and
3590  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3591  //
3592  // Anyway, that is the reason for moving the check to see if
3593  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3594  // instead of having it performed here. -BB
3595 
3596  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3597 
3598  /* compute expansion headroom to check if we can expand */
3599  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3600  /* possible expansion too small -- give up */
3601  return added;
3602  }
3603  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3604 
3605  newCapacity = __kmp_threads_capacity;
3606  do {
3607  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3608  : __kmp_sys_max_nth;
3609  } while (newCapacity < minimumRequiredCapacity);
3610  newThreads = (kmp_info_t **)__kmp_allocate(
3611  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3612  newRoot =
3613  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3614  KMP_MEMCPY(newThreads, __kmp_threads,
3615  __kmp_threads_capacity * sizeof(kmp_info_t *));
3616  KMP_MEMCPY(newRoot, __kmp_root,
3617  __kmp_threads_capacity * sizeof(kmp_root_t *));
3618 
3619  kmp_info_t **temp_threads = __kmp_threads;
3620  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3621  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3622  __kmp_free(temp_threads);
3623  added += newCapacity - __kmp_threads_capacity;
3624  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3625 
3626  if (newCapacity > __kmp_tp_capacity) {
3627  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3628  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3629  __kmp_threadprivate_resize_cache(newCapacity);
3630  } else { // increase __kmp_tp_capacity to match the __kmp_threads size
3631  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3632  }
3633  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3634  }
3635 
3636  return added;
3637 }
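// Illustrative growth example (not part of the runtime): assuming
// __kmp_threads_capacity == 64, nNeed == 70 and __kmp_sys_max_nth == 32768,
// the doubling loop above produces newCapacity == 256 (64 -> 128 -> 256),
// clipping at __kmp_sys_max_nth if a doubling would overshoot it. A single
// allocation holds both the enlarged __kmp_threads and __kmp_root arrays; the
// new pointers are stored through volatile-qualified aliases before the old
// block is freed.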
3638 
3639 /* Register the current thread as a root thread and obtain our gtid. We must
3640  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3641  thread that calls from __kmp_do_serial_initialize() */
3642 int __kmp_register_root(int initial_thread) {
3643  kmp_info_t *root_thread;
3644  kmp_root_t *root;
3645  int gtid;
3646  int capacity;
3647  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3648  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3649  KMP_MB();
3650 
3651  /* 2007-03-02:
3652  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3653  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3654  condition does not work as expected -- it may return false (meaning there
3655  is at least one empty slot in the __kmp_threads array), but it is possible
3656  that the only free slot is #0, which is reserved for the initial thread and
3657  so cannot be used for this one. The following code works around this bug.
3658 
3659  However, the right solution seems to be not reserving slot #0 for the
3660  initial thread, because:
3661  (1) there is no magic in slot #0, and
3662  (2) we cannot detect the initial thread reliably (the first thread that
3663  performs serial initialization may not be the real initial thread).
3664  */
3665  capacity = __kmp_threads_capacity;
3666  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3667  --capacity;
3668  }
3669 
3670  // If it is not for initializing the hidden helper team, we need to take
3671  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3672  // in __kmp_threads_capacity.
3673  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3674  capacity -= __kmp_hidden_helper_threads_num;
3675  }
3676 
3677  /* see if there are too many threads */
3678  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3679  if (__kmp_tp_cached) {
3680  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3681  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3682  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3683  } else {
3684  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3685  __kmp_msg_null);
3686  }
3687  }
3688 
3689  // When hidden helper tasks are enabled, __kmp_threads is organized as follows:
3690  // 0: initial thread, also a regular OpenMP thread.
3691  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3692  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3693  // regular OpenMP threads.
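  // Illustrative example (not part of the runtime): with
  // __kmp_hidden_helper_threads_num == 8, slot 0 holds the initial thread,
  // slots 1..8 hold hidden helper threads, and slots 9 and above hold regular
  // OpenMP threads.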
3694  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3695  // Find an available thread slot for hidden helper thread. Slots for hidden
3696  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3697  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3698  gtid <= __kmp_hidden_helper_threads_num;
3699  gtid++)
3700  ;
3701  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3702  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3703  "hidden helper thread: T#%d\n",
3704  gtid));
3705  } else {
3706  /* find an available thread slot */
3707  // Don't reassign the zero slot since we need that to only be used by
3708  // initial thread. Slots for hidden helper threads should also be skipped.
3709  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3710  gtid = 0;
3711  } else {
3712  for (gtid = __kmp_hidden_helper_threads_num + 1;
3713  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3714  ;
3715  }
3716  KA_TRACE(
3717  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3718  KMP_ASSERT(gtid < __kmp_threads_capacity);
3719  }
3720 
3721  /* update global accounting */
3722  __kmp_all_nth++;
3723  TCW_4(__kmp_nth, __kmp_nth + 1);
3724 
3725  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3726  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3727  if (__kmp_adjust_gtid_mode) {
3728  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3729  if (TCR_4(__kmp_gtid_mode) != 2) {
3730  TCW_4(__kmp_gtid_mode, 2);
3731  }
3732  } else {
3733  if (TCR_4(__kmp_gtid_mode) != 1) {
3734  TCW_4(__kmp_gtid_mode, 1);
3735  }
3736  }
3737  }
3738 
3739 #ifdef KMP_ADJUST_BLOCKTIME
3740  /* Adjust blocktime to zero if necessary */
3741  /* Middle initialization might not have occurred yet */
3742  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3743  if (__kmp_nth > __kmp_avail_proc) {
3744  __kmp_zero_bt = TRUE;
3745  }
3746  }
3747 #endif /* KMP_ADJUST_BLOCKTIME */
3748 
3749  /* setup this new hierarchy */
3750  if (!(root = __kmp_root[gtid])) {
3751  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3752  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3753  }
3754 
3755 #if KMP_STATS_ENABLED
3756  // Initialize stats as soon as possible (right after gtid assignment).
3757  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3758  __kmp_stats_thread_ptr->startLife();
3759  KMP_SET_THREAD_STATE(SERIAL_REGION);
3760  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3761 #endif
3762  __kmp_initialize_root(root);
3763 
3764  /* setup new root thread structure */
3765  if (root->r.r_uber_thread) {
3766  root_thread = root->r.r_uber_thread;
3767  } else {
3768  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3769  if (__kmp_storage_map) {
3770  __kmp_print_thread_storage_map(root_thread, gtid);
3771  }
3772  root_thread->th.th_info.ds.ds_gtid = gtid;
3773 #if OMPT_SUPPORT
3774  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3775 #endif
3776  root_thread->th.th_root = root;
3777  if (__kmp_env_consistency_check) {
3778  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3779  }
3780 #if USE_FAST_MEMORY
3781  __kmp_initialize_fast_memory(root_thread);
3782 #endif /* USE_FAST_MEMORY */
3783 
3784 #if KMP_USE_BGET
3785  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3786  __kmp_initialize_bget(root_thread);
3787 #endif
3788  __kmp_init_random(root_thread); // Initialize random number generator
3789  }
3790 
3791  /* setup the serial team held in reserve by the root thread */
3792  if (!root_thread->th.th_serial_team) {
3793  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3794  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3795  root_thread->th.th_serial_team = __kmp_allocate_team(
3796  root, 1, 1,
3797 #if OMPT_SUPPORT
3798  ompt_data_none, // root parallel id
3799 #endif
3800  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3801  }
3802  KMP_ASSERT(root_thread->th.th_serial_team);
3803  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3804  root_thread->th.th_serial_team));
3805 
3806  /* drop root_thread into place */
3807  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3808 
3809  root->r.r_root_team->t.t_threads[0] = root_thread;
3810  root->r.r_hot_team->t.t_threads[0] = root_thread;
3811  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3812  // AC: the team is created in reserve, not for execution (it is unused for now).
3813  root_thread->th.th_serial_team->t.t_serialized = 0;
3814  root->r.r_uber_thread = root_thread;
3815 
3816  /* initialize the thread, get it ready to go */
3817  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3818  TCW_4(__kmp_init_gtid, TRUE);
3819 
3820  /* prepare the primary thread for get_gtid() */
3821  __kmp_gtid_set_specific(gtid);
3822 
3823 #if USE_ITT_BUILD
3824  __kmp_itt_thread_name(gtid);
3825 #endif /* USE_ITT_BUILD */
3826 
3827 #ifdef KMP_TDATA_GTID
3828  __kmp_gtid = gtid;
3829 #endif
3830  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3831  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3832 
3833  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3834  "plain=%u\n",
3835  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3836  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3837  KMP_INIT_BARRIER_STATE));
3838  { // Initialize barrier data.
3839  int b;
3840  for (b = 0; b < bs_last_barrier; ++b) {
3841  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3842 #if USE_DEBUGGER
3843  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3844 #endif
3845  }
3846  }
3847  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3848  KMP_INIT_BARRIER_STATE);
3849 
3850 #if KMP_AFFINITY_SUPPORTED
3851  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3852  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3853  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3854  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3855 #endif /* KMP_AFFINITY_SUPPORTED */
3856  root_thread->th.th_def_allocator = __kmp_def_allocator;
3857  root_thread->th.th_prev_level = 0;
3858  root_thread->th.th_prev_num_threads = 1;
3859 
3860  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3861  tmp->cg_root = root_thread;
3862  tmp->cg_thread_limit = __kmp_cg_max_nth;
3863  tmp->cg_nthreads = 1;
3864  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3865  " cg_nthreads init to 1\n",
3866  root_thread, tmp));
3867  tmp->up = NULL;
3868  root_thread->th.th_cg_roots = tmp;
3869 
3870  __kmp_root_counter++;
3871 
3872 #if OMPT_SUPPORT
3873  if (!initial_thread && ompt_enabled.enabled) {
3874 
3875  kmp_info_t *root_thread = ompt_get_thread();
3876 
3877  ompt_set_thread_state(root_thread, ompt_state_overhead);
3878 
3879  if (ompt_enabled.ompt_callback_thread_begin) {
3880  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3881  ompt_thread_initial, __ompt_get_thread_data_internal());
3882  }
3883  ompt_data_t *task_data;
3884  ompt_data_t *parallel_data;
3885  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3886  NULL);
3887  if (ompt_enabled.ompt_callback_implicit_task) {
3888  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3889  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3890  }
3891 
3892  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3893  }
3894 #endif
3895 #if OMPD_SUPPORT
3896  if (ompd_state & OMPD_ENABLE_BP)
3897  ompd_bp_thread_begin();
3898 #endif
3899 
3900  KMP_MB();
3901  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3902 
3903  return gtid;
3904 }
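/* Illustrative summary of the registration above (not part of the runtime):
   on return the caller owns a gtid whose __kmp_threads[gtid] slot holds a
   fully set up uber thread -- root structure, root team, hot team, reserve
   serial team, barrier state, affinity placeholders, and a contention-group
   root whose thread limit is __kmp_cg_max_nth. */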
3905 
3906 #if KMP_NESTED_HOT_TEAMS
3907 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3908  const int max_level) {
3909  int i, n, nth;
3910  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3911  if (!hot_teams || !hot_teams[level].hot_team) {
3912  return 0;
3913  }
3914  KMP_DEBUG_ASSERT(level < max_level);
3915  kmp_team_t *team = hot_teams[level].hot_team;
3916  nth = hot_teams[level].hot_team_nth;
3917  n = nth - 1; // primary thread is not freed
3918  if (level < max_level - 1) {
3919  for (i = 0; i < nth; ++i) {
3920  kmp_info_t *th = team->t.t_threads[i];
3921  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3922  if (i > 0 && th->th.th_hot_teams) {
3923  __kmp_free(th->th.th_hot_teams);
3924  th->th.th_hot_teams = NULL;
3925  }
3926  }
3927  }
3928  __kmp_free_team(root, team, NULL);
3929  return n;
3930 }
3931 #endif
3932 
3933 // Resets a root thread and clears its root and hot teams.
3934 // Returns the number of __kmp_threads entries directly and indirectly freed.
3935 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3936  kmp_team_t *root_team = root->r.r_root_team;
3937  kmp_team_t *hot_team = root->r.r_hot_team;
3938  int n = hot_team->t.t_nproc;
3939  int i;
3940 
3941  KMP_DEBUG_ASSERT(!root->r.r_active);
3942 
3943  root->r.r_root_team = NULL;
3944  root->r.r_hot_team = NULL;
3945  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3946  // before call to __kmp_free_team().
3947  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3948 #if KMP_NESTED_HOT_TEAMS
3949  if (__kmp_hot_teams_max_level >
3950  0) { // need to free nested hot teams and their threads if any
3951  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3952  kmp_info_t *th = hot_team->t.t_threads[i];
3953  if (__kmp_hot_teams_max_level > 1) {
3954  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3955  }
3956  if (th->th.th_hot_teams) {
3957  __kmp_free(th->th.th_hot_teams);
3958  th->th.th_hot_teams = NULL;
3959  }
3960  }
3961  }
3962 #endif
3963  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3964 
3965  // Before we can reap the thread, we need to make certain that all other
3966  // threads in the teams that had this root as ancestor have stopped trying to
3967  // steal tasks.
3968  if (__kmp_tasking_mode != tskm_immediate_exec) {
3969  __kmp_wait_to_unref_task_teams();
3970  }
3971 
3972 #if KMP_OS_WINDOWS
3973  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3974  KA_TRACE(
3975  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3976  "\n",
3977  (LPVOID) & (root->r.r_uber_thread->th),
3978  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3979  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3980 #endif /* KMP_OS_WINDOWS */
3981 
3982 #if OMPD_SUPPORT
3983  if (ompd_state & OMPD_ENABLE_BP)
3984  ompd_bp_thread_end();
3985 #endif
3986 
3987 #if OMPT_SUPPORT
3988  ompt_data_t *task_data;
3989  ompt_data_t *parallel_data;
3990  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3991  NULL);
3992  if (ompt_enabled.ompt_callback_implicit_task) {
3993  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3994  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3995  }
3996  if (ompt_enabled.ompt_callback_thread_end) {
3997  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3998  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3999  }
4000 #endif
4001 
4002  TCW_4(__kmp_nth,
4003  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4004  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4005  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4006  " to %d\n",
4007  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4008  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4009  if (i == 1) {
4010  // need to free contention group structure
4011  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4012  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4013  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4014  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4015  root->r.r_uber_thread->th.th_cg_roots = NULL;
4016  }
4017  __kmp_reap_thread(root->r.r_uber_thread, 1);
4018 
4019  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4020  // it instead of freeing it.
4021  root->r.r_uber_thread = NULL;
4022  /* mark root as no longer in use */
4023  root->r.r_begin = FALSE;
4024 
4025  return n;
4026 }
4027 
4028 void __kmp_unregister_root_current_thread(int gtid) {
4029  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4030  /* this lock should be ok, since unregister_root_current_thread is never
4031  called during an abort, only during a normal close. furthermore, if you
4032  have the forkjoin lock, you should never try to get the initz lock */
4033  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4034  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4035  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4036  "exiting T#%d\n",
4037  gtid));
4038  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4039  return;
4040  }
4041  kmp_root_t *root = __kmp_root[gtid];
4042 
4043  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4044  KMP_ASSERT(KMP_UBER_GTID(gtid));
4045  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4046  KMP_ASSERT(root->r.r_active == FALSE);
4047 
4048  KMP_MB();
4049 
4050  kmp_info_t *thread = __kmp_threads[gtid];
4051  kmp_team_t *team = thread->th.th_team;
4052  kmp_task_team_t *task_team = thread->th.th_task_team;
4053 
4054  // we need to wait for the proxy tasks before finishing the thread
4055  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4056 #if OMPT_SUPPORT
4057  // the runtime is shutting down so we won't report any events
4058  thread->th.ompt_thread_info.state = ompt_state_undefined;
4059 #endif
4060  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4061  }
4062 
4063  __kmp_reset_root(gtid, root);
4064 
4065  KMP_MB();
4066  KC_TRACE(10,
4067  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4068 
4069  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4070 }
4071 
4072 #if KMP_OS_WINDOWS
4073 /* __kmp_forkjoin_lock must be already held
4074  Unregisters a root thread that is not the current thread. Returns the number
4075  of __kmp_threads entries freed as a result. */
4076 static int __kmp_unregister_root_other_thread(int gtid) {
4077  kmp_root_t *root = __kmp_root[gtid];
4078  int r;
4079 
4080  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4081  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4082  KMP_ASSERT(KMP_UBER_GTID(gtid));
4083  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4084  KMP_ASSERT(root->r.r_active == FALSE);
4085 
4086  r = __kmp_reset_root(gtid, root);
4087  KC_TRACE(10,
4088  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4089  return r;
4090 }
4091 #endif
4092 
4093 #if KMP_DEBUG
4094 void __kmp_task_info() {
4095 
4096  kmp_int32 gtid = __kmp_entry_gtid();
4097  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4098  kmp_info_t *this_thr = __kmp_threads[gtid];
4099  kmp_team_t *steam = this_thr->th.th_serial_team;
4100  kmp_team_t *team = this_thr->th.th_team;
4101 
4102  __kmp_printf(
4103  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4104  "ptask=%p\n",
4105  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4106  team->t.t_implicit_task_taskdata[tid].td_parent);
4107 }
4108 #endif // KMP_DEBUG
4109 
4110 /* TODO optimize with one big memclr, take out what isn't needed, split
4111  responsibility to workers as much as possible, and delay initialization of
4112  features as much as possible */
4113 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4114  int tid, int gtid) {
4115  /* this_thr->th.th_info.ds.ds_gtid is setup in
4116  kmp_allocate_thread/create_worker.
4117  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4118  KMP_DEBUG_ASSERT(this_thr != NULL);
4119  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4120  KMP_DEBUG_ASSERT(team);
4121  KMP_DEBUG_ASSERT(team->t.t_threads);
4122  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4123  kmp_info_t *master = team->t.t_threads[0];
4124  KMP_DEBUG_ASSERT(master);
4125  KMP_DEBUG_ASSERT(master->th.th_root);
4126 
4127  KMP_MB();
4128 
4129  TCW_SYNC_PTR(this_thr->th.th_team, team);
4130 
4131  this_thr->th.th_info.ds.ds_tid = tid;
4132  this_thr->th.th_set_nproc = 0;
4133  if (__kmp_tasking_mode != tskm_immediate_exec)
4134  // When tasking is possible, threads are not safe to reap until they are
4135  // done tasking; this will be set when tasking code is exited in wait
4136  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4137  else // no tasking --> always safe to reap
4138  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4139  this_thr->th.th_set_proc_bind = proc_bind_default;
4140 #if KMP_AFFINITY_SUPPORTED
4141  this_thr->th.th_new_place = this_thr->th.th_current_place;
4142 #endif
4143  this_thr->th.th_root = master->th.th_root;
4144 
4145  /* setup the thread's cache of the team structure */
4146  this_thr->th.th_team_nproc = team->t.t_nproc;
4147  this_thr->th.th_team_master = master;
4148  this_thr->th.th_team_serialized = team->t.t_serialized;
4149 
4150  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4151 
4152  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4153  tid, gtid, this_thr, this_thr->th.th_current_task));
4154 
4155  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4156  team, tid, TRUE);
4157 
4158  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4159  tid, gtid, this_thr, this_thr->th.th_current_task));
4160  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4161  // __kmp_initialize_team()?
4162 
4163  /* TODO no worksharing in speculative threads */
4164  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4165 
4166  this_thr->th.th_local.this_construct = 0;
4167 
4168  if (!this_thr->th.th_pri_common) {
4169  this_thr->th.th_pri_common =
4170  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4171  if (__kmp_storage_map) {
4172  __kmp_print_storage_map_gtid(
4173  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4174  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4175  }
4176  this_thr->th.th_pri_head = NULL;
4177  }
4178 
4179  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4180  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4181  // Make new thread's CG root same as primary thread's
4182  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4183  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4184  if (tmp) {
4185  // worker changes CG, need to check if old CG should be freed
4186  int i = tmp->cg_nthreads--;
4187  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4188  " on node %p of thread %p to %d\n",
4189  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4190  if (i == 1) {
4191  __kmp_free(tmp); // last thread left CG --> free it
4192  }
4193  }
4194  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4195  // Increment new thread's CG root's counter to add the new thread
4196  this_thr->th.th_cg_roots->cg_nthreads++;
4197  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4198  " node %p of thread %p to %d\n",
4199  this_thr, this_thr->th.th_cg_roots,
4200  this_thr->th.th_cg_roots->cg_root,
4201  this_thr->th.th_cg_roots->cg_nthreads));
4202  this_thr->th.th_current_task->td_icvs.thread_limit =
4203  this_thr->th.th_cg_roots->cg_thread_limit;
4204  }
4205 
4206  /* Initialize dynamic dispatch */
4207  {
4208  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4209  // Use team max_nproc since this will never change for the team.
4210  size_t disp_size =
4211  sizeof(dispatch_private_info_t) *
4212  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4213  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4214  team->t.t_max_nproc));
4215  KMP_ASSERT(dispatch);
4216  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4217  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4218 
4219  dispatch->th_disp_index = 0;
4220  dispatch->th_doacross_buf_idx = 0;
4221  if (!dispatch->th_disp_buffer) {
4222  dispatch->th_disp_buffer =
4223  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4224 
4225  if (__kmp_storage_map) {
4226  __kmp_print_storage_map_gtid(
4227  gtid, &dispatch->th_disp_buffer[0],
4228  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4229  ? 1
4230  : __kmp_dispatch_num_buffers],
4231  disp_size,
4232  "th_%d.th_dispatch.th_disp_buffer "
4233  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4234  gtid, team->t.t_id, gtid);
4235  }
4236  } else {
4237  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4238  }
4239 
4240  dispatch->th_dispatch_pr_current = 0;
4241  dispatch->th_dispatch_sh_current = 0;
4242 
4243  dispatch->th_deo_fcn = 0; /* ORDERED */
4244  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4245  }
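  // Illustrative note (not part of the runtime): a serialized team
  // (t_max_nproc == 1) gets a single private dispatch buffer above, while a
  // real team gets __kmp_dispatch_num_buffers of them so that consecutive
  // dynamically scheduled loops can use distinct buffers.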
4246 
4247  this_thr->th.th_next_pool = NULL;
4248 
4249  if (!this_thr->th.th_task_state_memo_stack) {
4250  size_t i;
4251  this_thr->th.th_task_state_memo_stack =
4252  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4253  this_thr->th.th_task_state_top = 0;
4254  this_thr->th.th_task_state_stack_sz = 4;
4255  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4256  ++i) // zero init the stack
4257  this_thr->th.th_task_state_memo_stack[i] = 0;
4258  }
4259 
4260  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4261  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4262 
4263  KMP_MB();
4264 }
4265 
4266 /* Allocate a new thread for the requesting team. This is only called from
4267  within a fork/join critical section. We first try to get an available
4268  thread from the thread pool; if none is available, we fork a new one,
4269  assuming we are able to create one. This should be assured, as the caller
4270  should have checked on this first. */
4271 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4272  int new_tid) {
4273  kmp_team_t *serial_team;
4274  kmp_info_t *new_thr;
4275  int new_gtid;
4276 
4277  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4278  KMP_DEBUG_ASSERT(root && team);
4279 #if !KMP_NESTED_HOT_TEAMS
4280  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4281 #endif
4282  KMP_MB();
4283 
4284  /* first, try to get one from the thread pool */
4285  if (__kmp_thread_pool) {
4286  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4287  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4288  if (new_thr == __kmp_thread_pool_insert_pt) {
4289  __kmp_thread_pool_insert_pt = NULL;
4290  }
4291  TCW_4(new_thr->th.th_in_pool, FALSE);
4292  __kmp_suspend_initialize_thread(new_thr);
4293  __kmp_lock_suspend_mx(new_thr);
4294  if (new_thr->th.th_active_in_pool == TRUE) {
4295  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4296  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4297  new_thr->th.th_active_in_pool = FALSE;
4298  }
4299  __kmp_unlock_suspend_mx(new_thr);
4300 
4301  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4302  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4303  KMP_ASSERT(!new_thr->th.th_team);
4304  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4305 
4306  /* setup the thread structure */
4307  __kmp_initialize_info(new_thr, team, new_tid,
4308  new_thr->th.th_info.ds.ds_gtid);
4309  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4310 
4311  TCW_4(__kmp_nth, __kmp_nth + 1);
4312 
4313  new_thr->th.th_task_state = 0;
4314  new_thr->th.th_task_state_top = 0;
4315  new_thr->th.th_task_state_stack_sz = 4;
4316 
4317  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4318  // Make sure pool thread has transitioned to waiting on own thread struct
4319  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4320  // Thread activated in __kmp_allocate_team when increasing team size
4321  }
4322 
4323 #ifdef KMP_ADJUST_BLOCKTIME
4324  /* Adjust blocktime back to zero if necessary */
4325  /* Middle initialization might not have occurred yet */
4326  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4327  if (__kmp_nth > __kmp_avail_proc) {
4328  __kmp_zero_bt = TRUE;
4329  }
4330  }
4331 #endif /* KMP_ADJUST_BLOCKTIME */
4332 
4333 #if KMP_DEBUG
4334  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4335  // KMP_BARRIER_PARENT_FLAG.
4336  int b;
4337  kmp_balign_t *balign = new_thr->th.th_bar;
4338  for (b = 0; b < bs_last_barrier; ++b)
4339  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4340 #endif
4341 
4342  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4343  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4344 
4345  KMP_MB();
4346  return new_thr;
4347  }
4348 
4349  /* no, we'll fork a new one */
4350  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4351  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4352 
4353 #if KMP_USE_MONITOR
4354  // If this is the first worker thread the RTL is creating, then also
4355  // launch the monitor thread. We try to do this as early as possible.
4356  if (!TCR_4(__kmp_init_monitor)) {
4357  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4358  if (!TCR_4(__kmp_init_monitor)) {
4359  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4360  TCW_4(__kmp_init_monitor, 1);
4361  __kmp_create_monitor(&__kmp_monitor);
4362  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4363 #if KMP_OS_WINDOWS
4364  // AC: wait until the monitor has started. This is a fix for CQ232808.
4365  // The reason is that if the library is loaded/unloaded in a loop with
4366  // small (parallel) work in between, then there is a high probability that
4367  // the monitor thread starts only after the library shutdown. At shutdown it
4368  // is too late to cope with the problem, because when the primary thread is
4369  // in DllMain (process detach) the monitor has no chance to start (it is
4370  // blocked), and the primary thread has no means to inform the monitor that
4371  // the library is gone, because all the memory which the monitor can
4372  // access is going to be released/reset.
4373  while (TCR_4(__kmp_init_monitor) < 2) {
4374  KMP_YIELD(TRUE);
4375  }
4376  KF_TRACE(10, ("after monitor thread has started\n"));
4377 #endif
4378  }
4379  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4380  }
4381 #endif
4382 
4383  KMP_MB();
4384 
4385  {
4386  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4387  ? 1
4388  : __kmp_hidden_helper_threads_num + 1;
4389 
4390  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4391  ++new_gtid) {
4392  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4393  }
4394 
4395  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4396  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4397  }
4398  }
4399 
4400  /* allocate space for it. */
4401  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4402 
4403  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4404 
4405 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4406  // Suppress race-condition detection on synchronization flags in debug mode;
4407  // this helps to analyze library internals by eliminating false positives.
4408  __itt_suppress_mark_range(
4409  __itt_suppress_range, __itt_suppress_threading_errors,
4410  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4411  __itt_suppress_mark_range(
4412  __itt_suppress_range, __itt_suppress_threading_errors,
4413  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4414 #if KMP_OS_WINDOWS
4415  __itt_suppress_mark_range(
4416  __itt_suppress_range, __itt_suppress_threading_errors,
4417  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4418 #else
4419  __itt_suppress_mark_range(__itt_suppress_range,
4420  __itt_suppress_threading_errors,
4421  &new_thr->th.th_suspend_init_count,
4422  sizeof(new_thr->th.th_suspend_init_count));
4423 #endif
4424  // TODO: check if we need to also suppress b_arrived flags
4425  __itt_suppress_mark_range(__itt_suppress_range,
4426  __itt_suppress_threading_errors,
4427  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4428  sizeof(new_thr->th.th_bar[0].bb.b_go));
4429  __itt_suppress_mark_range(__itt_suppress_range,
4430  __itt_suppress_threading_errors,
4431  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4432  sizeof(new_thr->th.th_bar[1].bb.b_go));
4433  __itt_suppress_mark_range(__itt_suppress_range,
4434  __itt_suppress_threading_errors,
4435  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4436  sizeof(new_thr->th.th_bar[2].bb.b_go));
4437 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4438  if (__kmp_storage_map) {
4439  __kmp_print_thread_storage_map(new_thr, new_gtid);
4440  }
4441 
4442  // add the reserve serialized team, initialized from the team's primary thread
4443  {
4444  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4445  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4446  new_thr->th.th_serial_team = serial_team =
4447  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4448 #if OMPT_SUPPORT
4449  ompt_data_none, // root parallel id
4450 #endif
4451  proc_bind_default, &r_icvs,
4452  0 USE_NESTED_HOT_ARG(NULL));
4453  }
4454  KMP_ASSERT(serial_team);
4455  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4456  // for execution (it is unused for now).
4457  serial_team->t.t_threads[0] = new_thr;
4458  KF_TRACE(10,
4459  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4460  new_thr));
4461 
4462  /* setup the thread structures */
4463  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4464 
4465 #if USE_FAST_MEMORY
4466  __kmp_initialize_fast_memory(new_thr);
4467 #endif /* USE_FAST_MEMORY */
4468 
4469 #if KMP_USE_BGET
4470  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4471  __kmp_initialize_bget(new_thr);
4472 #endif
4473 
4474  __kmp_init_random(new_thr); // Initialize random number generator
4475 
4476  /* Initialize these only once when thread is grabbed for a team allocation */
4477  KA_TRACE(20,
4478  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4479  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4480 
4481  int b;
4482  kmp_balign_t *balign = new_thr->th.th_bar;
4483  for (b = 0; b < bs_last_barrier; ++b) {
4484  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4485  balign[b].bb.team = NULL;
4486  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4487  balign[b].bb.use_oncore_barrier = 0;
4488  }
4489 
4490  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4491  new_thr->th.th_sleep_loc_type = flag_unset;
4492 
4493  new_thr->th.th_spin_here = FALSE;
4494  new_thr->th.th_next_waiting = 0;
4495 #if KMP_OS_UNIX
4496  new_thr->th.th_blocking = false;
4497 #endif
4498 
4499 #if KMP_AFFINITY_SUPPORTED
4500  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4501  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4502  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4503  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4504 #endif
4505  new_thr->th.th_def_allocator = __kmp_def_allocator;
4506  new_thr->th.th_prev_level = 0;
4507  new_thr->th.th_prev_num_threads = 1;
4508 
4509  TCW_4(new_thr->th.th_in_pool, FALSE);
4510  new_thr->th.th_active_in_pool = FALSE;
4511  TCW_4(new_thr->th.th_active, TRUE);
4512 
4513  /* adjust the global counters */
4514  __kmp_all_nth++;
4515  __kmp_nth++;
4516 
4517  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4518  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4519  if (__kmp_adjust_gtid_mode) {
4520  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4521  if (TCR_4(__kmp_gtid_mode) != 2) {
4522  TCW_4(__kmp_gtid_mode, 2);
4523  }
4524  } else {
4525  if (TCR_4(__kmp_gtid_mode) != 1) {
4526  TCW_4(__kmp_gtid_mode, 1);
4527  }
4528  }
4529  }
4530 
4531 #ifdef KMP_ADJUST_BLOCKTIME
4532  /* Adjust blocktime back to zero if necessary */
4533  /* Middle initialization might not have occurred yet */
4534  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4535  if (__kmp_nth > __kmp_avail_proc) {
4536  __kmp_zero_bt = TRUE;
4537  }
4538  }
4539 #endif /* KMP_ADJUST_BLOCKTIME */
4540 
4541  /* actually fork it and create the new worker thread */
4542  KF_TRACE(
4543  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4544  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4545  KF_TRACE(10,
4546  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4547 
4548  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4549  new_gtid));
4550  KMP_MB();
4551  return new_thr;
4552 }
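
// Editorial note (illustrative, not part of the runtime): the gtid-mode
// adjustment above chooses between method #1 (stack-pointer search) and
// method #2 (keyed/TLS lookup) purely from the current thread count. The
// guarded sketch below restates that decision with plain integers; the names
// pick_gtid_mode and tls_threshold are hypothetical stand-ins for
// __kmp_gtid_mode and __kmp_tls_gtid_min.
#if 0
// mode 1 = stack-pointer search (cheap while only a few threads exist),
// mode 2 = keyed/TLS lookup (scales better once many threads are running).
static int pick_gtid_mode(int total_threads, int tls_threshold) {
  return (total_threads >= tls_threshold) ? 2 : 1;
}
#endif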
4553 
4554 /* Reinitialize team for reuse.
4555  The hot team code calls this routine at every fork barrier, so EPCC barrier
4556  tests are extremely sensitive to changes in it, esp. writes to the team
4557  struct, which cause a cache invalidation in all threads.
4558  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4559 static void __kmp_reinitialize_team(kmp_team_t *team,
4560  kmp_internal_control_t *new_icvs,
4561  ident_t *loc) {
4562  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4563  team->t.t_threads[0], team));
4564  KMP_DEBUG_ASSERT(team && new_icvs);
4565  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4566  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4567 
4568  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4569  // Copy ICVs to the primary thread's implicit taskdata
4570  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4571  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4572 
4573  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4574  team->t.t_threads[0], team));
4575 }
4576 
4577 /* Initialize the team data structure.
4578  This assumes the t_threads and t_max_nproc are already set.
4579  Also, we don't touch the arguments */
4580 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4581  kmp_internal_control_t *new_icvs,
4582  ident_t *loc) {
4583  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4584 
4585  /* verify */
4586  KMP_DEBUG_ASSERT(team);
4587  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4588  KMP_DEBUG_ASSERT(team->t.t_threads);
4589  KMP_MB();
4590 
4591  team->t.t_master_tid = 0; /* not needed */
4592  /* team->t.t_master_bar; not needed */
4593  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4594  team->t.t_nproc = new_nproc;
4595 
4596  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4597  team->t.t_next_pool = NULL;
4598  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4599  * up hot team */
4600 
4601  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4602  team->t.t_invoke = NULL; /* not needed */
4603 
4604  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4605  team->t.t_sched.sched = new_icvs->sched.sched;
4606 
4607 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4608  team->t.t_fp_control_saved = FALSE; /* not needed */
4609  team->t.t_x87_fpu_control_word = 0; /* not needed */
4610  team->t.t_mxcsr = 0; /* not needed */
4611 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4612 
4613  team->t.t_construct = 0;
4614 
4615  team->t.t_ordered.dt.t_value = 0;
4616  team->t.t_master_active = FALSE;
4617 
4618 #ifdef KMP_DEBUG
4619  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4620 #endif
4621 #if KMP_OS_WINDOWS
4622  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4623 #endif
4624 
4625  team->t.t_control_stack_top = NULL;
4626 
4627  __kmp_reinitialize_team(team, new_icvs, loc);
4628 
4629  KMP_MB();
4630  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4631 }
4632 
4633 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4634 /* Sets full mask for thread and returns old mask, no changes to structures. */
4635 static void
4636 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4637  if (KMP_AFFINITY_CAPABLE()) {
4638  int status;
4639  if (old_mask != NULL) {
4640  status = __kmp_get_system_affinity(old_mask, TRUE);
4641  int error = errno;
4642  if (status != 0) {
4643  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4644  __kmp_msg_null);
4645  }
4646  }
4647  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4648  }
4649 }
4650 #endif
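
// Editorial note (illustrative, not part of the runtime): the helper above
// implements a save/widen/restore idiom so that workers created while the
// primary thread is temporarily bound to the full mask inherit a wide
// affinity. The guarded sketch below shows the same idiom with the plain
// Linux sched_getaffinity/sched_setaffinity calls instead of the kmp
// wrappers; widen_affinity_tmp and restore_affinity are hypothetical names,
// and the sketch would have to live in its own translation unit so that
// _GNU_SOURCE precedes every system header.
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

// Save the caller's current mask, then bind to every configured CPU.
static int widen_affinity_tmp(cpu_set_t *saved) {
  if (sched_getaffinity(0, sizeof(*saved), saved) != 0)
    return -1; // could not save the current mask
  cpu_set_t full;
  CPU_ZERO(&full);
  for (long cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); ++cpu)
    CPU_SET((int)cpu, &full);
  return sched_setaffinity(0, sizeof(full), &full); // 0 on success
}

// Later, put the caller back on the mask it started with.
static int restore_affinity(const cpu_set_t *saved) {
  return sched_setaffinity(0, sizeof(*saved), saved);
}
#endif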
4651 
4652 #if KMP_AFFINITY_SUPPORTED
4653 
4654 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4655 // It calculates the worker + primary thread's partition based upon the parent
4656 // thread's partition, and binds each worker to a thread in their partition.
4657 // The primary thread's partition should already include its current binding.
4658 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4659  // Do not partition places for the hidden helper team
4660  if (KMP_HIDDEN_HELPER_TEAM(team))
4661  return;
4662  // Copy the primary thread's place partition to the team struct
4663  kmp_info_t *master_th = team->t.t_threads[0];
4664  KMP_DEBUG_ASSERT(master_th != NULL);
4665  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4666  int first_place = master_th->th.th_first_place;
4667  int last_place = master_th->th.th_last_place;
4668  int masters_place = master_th->th.th_current_place;
4669  team->t.t_first_place = first_place;
4670  team->t.t_last_place = last_place;
4671 
4672  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4673  "bound to place %d partition = [%d,%d]\n",
4674  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4675  team->t.t_id, masters_place, first_place, last_place));
4676 
4677  switch (proc_bind) {
4678 
4679  case proc_bind_default:
4680  // Serial teams might have the proc_bind policy set to proc_bind_default.
4681  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4682  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4683  break;
4684 
4685  case proc_bind_primary: {
4686  int f;
4687  int n_th = team->t.t_nproc;
4688  for (f = 1; f < n_th; f++) {
4689  kmp_info_t *th = team->t.t_threads[f];
4690  KMP_DEBUG_ASSERT(th != NULL);
4691  th->th.th_first_place = first_place;
4692  th->th.th_last_place = last_place;
4693  th->th.th_new_place = masters_place;
4694  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4695  team->t.t_display_affinity != 1) {
4696  team->t.t_display_affinity = 1;
4697  }
4698 
4699  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4700  "partition = [%d,%d]\n",
4701  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4702  f, masters_place, first_place, last_place));
4703  }
4704  } break;
4705 
4706  case proc_bind_close: {
4707  int f;
4708  int n_th = team->t.t_nproc;
4709  int n_places;
4710  if (first_place <= last_place) {
4711  n_places = last_place - first_place + 1;
4712  } else {
4713  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4714  }
4715  if (n_th <= n_places) {
4716  int place = masters_place;
4717  for (f = 1; f < n_th; f++) {
4718  kmp_info_t *th = team->t.t_threads[f];
4719  KMP_DEBUG_ASSERT(th != NULL);
4720 
4721  if (place == last_place) {
4722  place = first_place;
4723  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4724  place = 0;
4725  } else {
4726  place++;
4727  }
4728  th->th.th_first_place = first_place;
4729  th->th.th_last_place = last_place;
4730  th->th.th_new_place = place;
4731  if (__kmp_display_affinity && place != th->th.th_current_place &&
4732  team->t.t_display_affinity != 1) {
4733  team->t.t_display_affinity = 1;
4734  }
4735 
4736  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4737  "partition = [%d,%d]\n",
4738  __kmp_gtid_from_thread(team->t.t_threads[f]),
4739  team->t.t_id, f, place, first_place, last_place));
4740  }
4741  } else {
4742  int S, rem, gap, s_count;
4743  S = n_th / n_places;
4744  s_count = 0;
4745  rem = n_th - (S * n_places);
4746  gap = rem > 0 ? n_places / rem : n_places;
4747  int place = masters_place;
4748  int gap_ct = gap;
4749  for (f = 0; f < n_th; f++) {
4750  kmp_info_t *th = team->t.t_threads[f];
4751  KMP_DEBUG_ASSERT(th != NULL);
4752 
4753  th->th.th_first_place = first_place;
4754  th->th.th_last_place = last_place;
4755  th->th.th_new_place = place;
4756  if (__kmp_display_affinity && place != th->th.th_current_place &&
4757  team->t.t_display_affinity != 1) {
4758  team->t.t_display_affinity = 1;
4759  }
4760  s_count++;
4761 
4762  if ((s_count == S) && rem && (gap_ct == gap)) {
4763  // do nothing, add an extra thread to place on next iteration
4764  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4765  // we added an extra thread to this place; move to next place
4766  if (place == last_place) {
4767  place = first_place;
4768  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4769  place = 0;
4770  } else {
4771  place++;
4772  }
4773  s_count = 0;
4774  gap_ct = 1;
4775  rem--;
4776  } else if (s_count == S) { // place full; don't add extra
4777  if (place == last_place) {
4778  place = first_place;
4779  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4780  place = 0;
4781  } else {
4782  place++;
4783  }
4784  gap_ct++;
4785  s_count = 0;
4786  }
4787 
4788  KA_TRACE(100,
4789  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4790  "partition = [%d,%d]\n",
4791  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4792  th->th.th_new_place, first_place, last_place));
4793  }
4794  KMP_DEBUG_ASSERT(place == masters_place);
4795  }
4796  } break;
4797 
4798  case proc_bind_spread: {
4799  int f;
4800  int n_th = team->t.t_nproc;
4801  int n_places;
4802  int thidx;
4803  if (first_place <= last_place) {
4804  n_places = last_place - first_place + 1;
4805  } else {
4806  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4807  }
4808  if (n_th <= n_places) {
4809  int place = -1;
4810 
4811  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4812  int S = n_places / n_th;
4813  int s_count, rem, gap, gap_ct;
4814 
4815  place = masters_place;
4816  rem = n_places - n_th * S;
4817  gap = rem ? n_th / rem : 1;
4818  gap_ct = gap;
4819  thidx = n_th;
4820  if (update_master_only == 1)
4821  thidx = 1;
4822  for (f = 0; f < thidx; f++) {
4823  kmp_info_t *th = team->t.t_threads[f];
4824  KMP_DEBUG_ASSERT(th != NULL);
4825 
4826  th->th.th_first_place = place;
4827  th->th.th_new_place = place;
4828  if (__kmp_display_affinity && place != th->th.th_current_place &&
4829  team->t.t_display_affinity != 1) {
4830  team->t.t_display_affinity = 1;
4831  }
4832  s_count = 1;
4833  while (s_count < S) {
4834  if (place == last_place) {
4835  place = first_place;
4836  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4837  place = 0;
4838  } else {
4839  place++;
4840  }
4841  s_count++;
4842  }
4843  if (rem && (gap_ct == gap)) {
4844  if (place == last_place) {
4845  place = first_place;
4846  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4847  place = 0;
4848  } else {
4849  place++;
4850  }
4851  rem--;
4852  gap_ct = 0;
4853  }
4854  th->th.th_last_place = place;
4855  gap_ct++;
4856 
4857  if (place == last_place) {
4858  place = first_place;
4859  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4860  place = 0;
4861  } else {
4862  place++;
4863  }
4864 
4865  KA_TRACE(100,
4866  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4867  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4868  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4869  f, th->th.th_new_place, th->th.th_first_place,
4870  th->th.th_last_place, __kmp_affinity_num_masks));
4871  }
4872  } else {
4873  /* Having a uniform space of available computation places, we can create
4874  T partitions of roughly P/T places each and put each thread into the
4875  first place of its partition. */
4876  double current = static_cast<double>(masters_place);
4877  double spacing =
4878  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4879  int first, last;
4880  kmp_info_t *th;
4881 
4882  thidx = n_th + 1;
4883  if (update_master_only == 1)
4884  thidx = 1;
4885  for (f = 0; f < thidx; f++) {
4886  first = static_cast<int>(current);
4887  last = static_cast<int>(current + spacing) - 1;
4888  KMP_DEBUG_ASSERT(last >= first);
4889  if (first >= n_places) {
4890  if (masters_place) {
4891  first -= n_places;
4892  last -= n_places;
4893  if (first == (masters_place + 1)) {
4894  KMP_DEBUG_ASSERT(f == n_th);
4895  first--;
4896  }
4897  if (last == masters_place) {
4898  KMP_DEBUG_ASSERT(f == (n_th - 1));
4899  last--;
4900  }
4901  } else {
4902  KMP_DEBUG_ASSERT(f == n_th);
4903  first = 0;
4904  last = 0;
4905  }
4906  }
4907  if (last >= n_places) {
4908  last = (n_places - 1);
4909  }
4910  place = first;
4911  current += spacing;
4912  if (f < n_th) {
4913  KMP_DEBUG_ASSERT(0 <= first);
4914  KMP_DEBUG_ASSERT(n_places > first);
4915  KMP_DEBUG_ASSERT(0 <= last);
4916  KMP_DEBUG_ASSERT(n_places > last);
4917  KMP_DEBUG_ASSERT(last_place >= first_place);
4918  th = team->t.t_threads[f];
4919  KMP_DEBUG_ASSERT(th);
4920  th->th.th_first_place = first;
4921  th->th.th_new_place = place;
4922  th->th.th_last_place = last;
4923  if (__kmp_display_affinity && place != th->th.th_current_place &&
4924  team->t.t_display_affinity != 1) {
4925  team->t.t_display_affinity = 1;
4926  }
4927  KA_TRACE(100,
4928  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4929  "partition = [%d,%d], spacing = %.4f\n",
4930  __kmp_gtid_from_thread(team->t.t_threads[f]),
4931  team->t.t_id, f, th->th.th_new_place,
4932  th->th.th_first_place, th->th.th_last_place, spacing));
4933  }
4934  }
4935  }
4936  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4937  } else {
4938  int S, rem, gap, s_count;
4939  S = n_th / n_places;
4940  s_count = 0;
4941  rem = n_th - (S * n_places);
4942  gap = rem > 0 ? n_places / rem : n_places;
4943  int place = masters_place;
4944  int gap_ct = gap;
4945  thidx = n_th;
4946  if (update_master_only == 1)
4947  thidx = 1;
4948  for (f = 0; f < thidx; f++) {
4949  kmp_info_t *th = team->t.t_threads[f];
4950  KMP_DEBUG_ASSERT(th != NULL);
4951 
4952  th->th.th_first_place = place;
4953  th->th.th_last_place = place;
4954  th->th.th_new_place = place;
4955  if (__kmp_display_affinity && place != th->th.th_current_place &&
4956  team->t.t_display_affinity != 1) {
4957  team->t.t_display_affinity = 1;
4958  }
4959  s_count++;
4960 
4961  if ((s_count == S) && rem && (gap_ct == gap)) {
4962  // do nothing, add an extra thread to place on next iteration
4963  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4964  // we added an extra thread to this place; move on to next place
4965  if (place == last_place) {
4966  place = first_place;
4967  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4968  place = 0;
4969  } else {
4970  place++;
4971  }
4972  s_count = 0;
4973  gap_ct = 1;
4974  rem--;
4975  } else if (s_count == S) { // place is full; don't add extra thread
4976  if (place == last_place) {
4977  place = first_place;
4978  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4979  place = 0;
4980  } else {
4981  place++;
4982  }
4983  gap_ct++;
4984  s_count = 0;
4985  }
4986 
4987  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988  "partition = [%d,%d]\n",
4989  __kmp_gtid_from_thread(team->t.t_threads[f]),
4990  team->t.t_id, f, th->th.th_new_place,
4991  th->th.th_first_place, th->th.th_last_place));
4992  }
4993  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4994  }
4995  } break;
4996 
4997  default:
4998  break;
4999  }
5000 
5001  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5002 }
5003 
5004 #endif // KMP_AFFINITY_SUPPORTED
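
// Editorial note (illustrative, not part of the runtime): when there are at
// least as many places as threads, the proc_bind_close branch above simply
// steps each successive worker to the next place after the primary thread's,
// wrapping at the end of the partition. The guarded sketch restates that
// stepping rule; next_close_place is a hypothetical name.
#if 0
// Advance one place, wrapping from last_place back to first_place, and from
// the end of the global place list (num_masks places) back to 0.
static int next_close_place(int place, int first_place, int last_place,
                            int num_masks) {
  if (place == last_place)
    return first_place; // wrap within the partition
  if (place == num_masks - 1)
    return 0; // wrap around the global place list
  return place + 1;
}
// Example: partition [2,5] on a machine with 8 places, primary on place 3:
// workers 1..3 land on places 4, 5, 2 in turn. For proc_bind_spread over the
// whole machine, the code above instead spaces threads by
// (n_places + 1) / n_th places, e.g. 4 threads over 16 places (primary on
// place 0) start their partitions at places 0, 4, 8 and 12.
#endif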
5005 
5006 /* allocate a new team data structure to use. take one off of the free pool if
5007  available */
5008 kmp_team_t *
5009 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5010 #if OMPT_SUPPORT
5011  ompt_data_t ompt_parallel_data,
5012 #endif
5013  kmp_proc_bind_t new_proc_bind,
5014  kmp_internal_control_t *new_icvs,
5015  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5016  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5017  int f;
5018  kmp_team_t *team;
5019  int use_hot_team = !root->r.r_active;
5020  int level = 0;
5021 
5022  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5023  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5024  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5025  KMP_MB();
5026 
5027 #if KMP_NESTED_HOT_TEAMS
5028  kmp_hot_team_ptr_t *hot_teams;
5029  if (master) {
5030  team = master->th.th_team;
5031  level = team->t.t_active_level;
5032  if (master->th.th_teams_microtask) { // in teams construct?
5033  if (master->th.th_teams_size.nteams > 1 &&
5034  ( // #teams > 1
5035  team->t.t_pkfn ==
5036  (microtask_t)__kmp_teams_master || // inner fork of the teams
5037  master->th.th_teams_level <
5038  team->t.t_level)) { // or nested parallel inside the teams
5039  ++level; // do not increment if #teams==1, or for the outer fork of the
5040  // teams; increment otherwise
5041  }
5042  }
5043  hot_teams = master->th.th_hot_teams;
5044  if (level < __kmp_hot_teams_max_level && hot_teams &&
5045  hot_teams[level].hot_team) {
5046  // hot team has already been allocated for given level
5047  use_hot_team = 1;
5048  } else {
5049  use_hot_team = 0;
5050  }
5051  } else {
5052  // check we won't access uninitialized hot_teams, just in case
5053  KMP_DEBUG_ASSERT(new_nproc == 1);
5054  }
5055 #endif
5056  // Optimization to use a "hot" team
5057  if (use_hot_team && new_nproc > 1) {
5058  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5059 #if KMP_NESTED_HOT_TEAMS
5060  team = hot_teams[level].hot_team;
5061 #else
5062  team = root->r.r_hot_team;
5063 #endif
5064 #if KMP_DEBUG
5065  if (__kmp_tasking_mode != tskm_immediate_exec) {
5066  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5067  "task_team[1] = %p before reinit\n",
5068  team->t.t_task_team[0], team->t.t_task_team[1]));
5069  }
5070 #endif
5071 
5072  if (team->t.t_nproc != new_nproc &&
5073  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5074  // Distributed barrier may need a resize
5075  int old_nthr = team->t.t_nproc;
5076  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5077  }
5078 
5079  // Has the number of threads changed?
5080  /* Let's assume the most common case is that the number of threads is
5081  unchanged, and put that case first. */
5082  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5083  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5084  // This case can mean that omp_set_num_threads() was called and the hot
5085  // team size was already reduced, so we check the special flag
5086  if (team->t.t_size_changed == -1) {
5087  team->t.t_size_changed = 1;
5088  } else {
5089  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5090  }
5091 
5092  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5093  kmp_r_sched_t new_sched = new_icvs->sched;
5094  // set primary thread's schedule as new run-time schedule
5095  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5096 
5097  __kmp_reinitialize_team(team, new_icvs,
5098  root->r.r_uber_thread->th.th_ident);
5099 
5100  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5101  team->t.t_threads[0], team));
5102  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5103 
5104 #if KMP_AFFINITY_SUPPORTED
5105  if ((team->t.t_size_changed == 0) &&
5106  (team->t.t_proc_bind == new_proc_bind)) {
5107  if (new_proc_bind == proc_bind_spread) {
5108  __kmp_partition_places(
5109  team, 1); // add flag to update only master for spread
5110  }
5111  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5112  "proc_bind = %d, partition = [%d,%d]\n",
5113  team->t.t_id, new_proc_bind, team->t.t_first_place,
5114  team->t.t_last_place));
5115  } else {
5116  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5117  __kmp_partition_places(team);
5118  }
5119 #else
5120  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5121 #endif /* KMP_AFFINITY_SUPPORTED */
5122  } else if (team->t.t_nproc > new_nproc) {
5123  KA_TRACE(20,
5124  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5125  new_nproc));
5126 
5127  team->t.t_size_changed = 1;
5128  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5129  // Barrier size already reduced earlier in this function
5130  // Activate team threads via th_used_in_team
5131  __kmp_add_threads_to_team(team, new_nproc);
5132  }
5133 #if KMP_NESTED_HOT_TEAMS
5134  if (__kmp_hot_teams_mode == 0) {
5135  // AC: saved number of threads should correspond to team's value in this
5136  // mode, can be bigger in mode 1, when hot team has threads in reserve
5137  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5138  hot_teams[level].hot_team_nth = new_nproc;
5139 #endif // KMP_NESTED_HOT_TEAMS
5140  /* release the extra threads we don't need any more */
5141  for (f = new_nproc; f < team->t.t_nproc; f++) {
5142  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5143  if (__kmp_tasking_mode != tskm_immediate_exec) {
5144  // When decreasing team size, threads no longer in the team should
5145  // unref task team.
5146  team->t.t_threads[f]->th.th_task_team = NULL;
5147  }
5148  __kmp_free_thread(team->t.t_threads[f]);
5149  team->t.t_threads[f] = NULL;
5150  }
5151 #if KMP_NESTED_HOT_TEAMS
5152  } // (__kmp_hot_teams_mode == 0)
5153  else {
5154  // When keeping extra threads in team, switch threads to wait on own
5155  // b_go flag
5156  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5157  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5158  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5159  for (int b = 0; b < bs_last_barrier; ++b) {
5160  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5161  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5162  }
5163  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5164  }
5165  }
5166  }
5167 #endif // KMP_NESTED_HOT_TEAMS
5168  team->t.t_nproc = new_nproc;
5169  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5170  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5171  __kmp_reinitialize_team(team, new_icvs,
5172  root->r.r_uber_thread->th.th_ident);
5173 
5174  // Update remaining threads
5175  for (f = 0; f < new_nproc; ++f) {
5176  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5177  }
5178 
5179  // restore the current task state of the primary thread: should be the
5180  // implicit task
5181  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5182  team->t.t_threads[0], team));
5183 
5184  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5185 
5186 #ifdef KMP_DEBUG
5187  for (f = 0; f < team->t.t_nproc; f++) {
5188  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5189  team->t.t_threads[f]->th.th_team_nproc ==
5190  team->t.t_nproc);
5191  }
5192 #endif
5193 
5194  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5195 #if KMP_AFFINITY_SUPPORTED
5196  __kmp_partition_places(team);
5197 #endif
5198  } else { // team->t.t_nproc < new_nproc
5199 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5200  kmp_affin_mask_t *old_mask;
5201  if (KMP_AFFINITY_CAPABLE()) {
5202  KMP_CPU_ALLOC(old_mask);
5203  }
5204 #endif
5205 
5206  KA_TRACE(20,
5207  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5208  new_nproc));
5209  int old_nproc = team->t.t_nproc; // save old value and use to update only
5210  team->t.t_size_changed = 1;
5211 
5212 #if KMP_NESTED_HOT_TEAMS
5213  int avail_threads = hot_teams[level].hot_team_nth;
5214  if (new_nproc < avail_threads)
5215  avail_threads = new_nproc;
5216  kmp_info_t **other_threads = team->t.t_threads;
5217  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5218  // Adjust barrier data of reserved threads (if any) of the team
5219  // Other data will be set in __kmp_initialize_info() below.
5220  int b;
5221  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5222  for (b = 0; b < bs_last_barrier; ++b) {
5223  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5224  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5225 #if USE_DEBUGGER
5226  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5227 #endif
5228  }
5229  }
5230  if (hot_teams[level].hot_team_nth >= new_nproc) {
5231  // we have all needed threads in reserve, no need to allocate any
5232  // this is only possible in mode 1; we cannot have reserved threads in mode 0
5233  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5234  team->t.t_nproc = new_nproc; // just get reserved threads involved
5235  } else {
5236  // We may have some threads in reserve, but not enough;
5237  // get reserved threads involved if any.
5238  team->t.t_nproc = hot_teams[level].hot_team_nth;
5239  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5240 #endif // KMP_NESTED_HOT_TEAMS
5241  if (team->t.t_max_nproc < new_nproc) {
5242  /* reallocate larger arrays */
5243  __kmp_reallocate_team_arrays(team, new_nproc);
5244  __kmp_reinitialize_team(team, new_icvs, NULL);
5245  }
5246 
5247 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5248  /* Temporarily set full mask for primary thread before creation of
5249  workers. The reason is that workers inherit the affinity from the
5250  primary thread, so if a lot of workers are created on the single
5251  core quickly, they don't get a chance to set their own affinity for
5252  a long time. */
5253  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5254 #endif
5255 
5256  /* allocate new threads for the hot team */
5257  for (f = team->t.t_nproc; f < new_nproc; f++) {
5258  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5259  KMP_DEBUG_ASSERT(new_worker);
5260  team->t.t_threads[f] = new_worker;
5261 
5262  KA_TRACE(20,
5263  ("__kmp_allocate_team: team %d init T#%d arrived: "
5264  "join=%llu, plain=%llu\n",
5265  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5266  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5267  team->t.t_bar[bs_plain_barrier].b_arrived));
5268 
5269  { // Initialize barrier data for new threads.
5270  int b;
5271  kmp_balign_t *balign = new_worker->th.th_bar;
5272  for (b = 0; b < bs_last_barrier; ++b) {
5273  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5274  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5275  KMP_BARRIER_PARENT_FLAG);
5276 #if USE_DEBUGGER
5277  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5278 #endif
5279  }
5280  }
5281  }
5282 
5283 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5284  if (KMP_AFFINITY_CAPABLE()) {
5285  /* Restore initial primary thread's affinity mask */
5286  __kmp_set_system_affinity(old_mask, TRUE);
5287  KMP_CPU_FREE(old_mask);
5288  }
5289 #endif
5290 #if KMP_NESTED_HOT_TEAMS
5291  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5292 #endif // KMP_NESTED_HOT_TEAMS
5293  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5294  // Barrier size already increased earlier in this function
5295  // Activate team threads via th_used_in_team
5296  __kmp_add_threads_to_team(team, new_nproc);
5297  }
5298  /* make sure everyone is synchronized */
5299  // new threads are initialized below
5300  __kmp_initialize_team(team, new_nproc, new_icvs,
5301  root->r.r_uber_thread->th.th_ident);
5302 
5303  /* reinitialize the threads */
5304  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5305  for (f = 0; f < team->t.t_nproc; ++f)
5306  __kmp_initialize_info(team->t.t_threads[f], team, f,
5307  __kmp_gtid_from_tid(f, team));
5308 
5309  if (level) { // set th_task_state for new threads in nested hot team
5310  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5311  // only need to set the th_task_state for the new threads. th_task_state
5312  // for primary thread will not be accurate until after this in
5313  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5314  // get the correct value.
5315  for (f = old_nproc; f < team->t.t_nproc; ++f)
5316  team->t.t_threads[f]->th.th_task_state =
5317  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5318  } else { // set th_task_state for new threads in non-nested hot team
5319  // copy primary thread's state
5320  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5321  for (f = old_nproc; f < team->t.t_nproc; ++f)
5322  team->t.t_threads[f]->th.th_task_state = old_state;
5323  }
5324 
5325 #ifdef KMP_DEBUG
5326  for (f = 0; f < team->t.t_nproc; ++f) {
5327  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5328  team->t.t_threads[f]->th.th_team_nproc ==
5329  team->t.t_nproc);
5330  }
5331 #endif
5332 
5333  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5334 #if KMP_AFFINITY_SUPPORTED
5335  __kmp_partition_places(team);
5336 #endif
5337  } // Check changes in number of threads
5338 
5339  kmp_info_t *master = team->t.t_threads[0];
5340  if (master->th.th_teams_microtask) {
5341  for (f = 1; f < new_nproc; ++f) {
5342  // propagate teams construct specific info to workers
5343  kmp_info_t *thr = team->t.t_threads[f];
5344  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5345  thr->th.th_teams_level = master->th.th_teams_level;
5346  thr->th.th_teams_size = master->th.th_teams_size;
5347  }
5348  }
5349 #if KMP_NESTED_HOT_TEAMS
5350  if (level) {
5351  // Sync barrier state for nested hot teams, not needed for outermost hot
5352  // team.
5353  for (f = 1; f < new_nproc; ++f) {
5354  kmp_info_t *thr = team->t.t_threads[f];
5355  int b;
5356  kmp_balign_t *balign = thr->th.th_bar;
5357  for (b = 0; b < bs_last_barrier; ++b) {
5358  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5359  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5360 #if USE_DEBUGGER
5361  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5362 #endif
5363  }
5364  }
5365  }
5366 #endif // KMP_NESTED_HOT_TEAMS
5367 
5368  /* reallocate space for arguments if necessary */
5369  __kmp_alloc_argv_entries(argc, team, TRUE);
5370  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5371  // The hot team re-uses the previous task team,
5372  // if untouched during the previous release->gather phase.
5373 
5374  KF_TRACE(10, (" hot_team = %p\n", team));
5375 
5376 #if KMP_DEBUG
5377  if (__kmp_tasking_mode != tskm_immediate_exec) {
5378  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5379  "task_team[1] = %p after reinit\n",
5380  team->t.t_task_team[0], team->t.t_task_team[1]));
5381  }
5382 #endif
5383 
5384 #if OMPT_SUPPORT
5385  __ompt_team_assign_id(team, ompt_parallel_data);
5386 #endif
5387 
5388  KMP_MB();
5389 
5390  return team;
5391  }
5392 
5393  /* next, let's try to take one from the team pool */
5394  KMP_MB();
5395  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5396  /* TODO: consider resizing undersized teams instead of reaping them, now
5397  that we have a resizing mechanism */
5398  if (team->t.t_max_nproc >= max_nproc) {
5399  /* take this team from the team pool */
5400  __kmp_team_pool = team->t.t_next_pool;
5401 
5402  if (max_nproc > 1 &&
5403  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5404  if (!team->t.b) { // Allocate barrier structure
5405  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5406  }
5407  }
5408 
5409  /* setup the team for fresh use */
5410  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5411 
5412  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5413  "task_team[1] %p to NULL\n",
5414  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5415  team->t.t_task_team[0] = NULL;
5416  team->t.t_task_team[1] = NULL;
5417 
5418  /* reallocate space for arguments if necessary */
5419  __kmp_alloc_argv_entries(argc, team, TRUE);
5420  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5421 
5422  KA_TRACE(
5423  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5424  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5425  { // Initialize barrier data.
5426  int b;
5427  for (b = 0; b < bs_last_barrier; ++b) {
5428  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5429 #if USE_DEBUGGER
5430  team->t.t_bar[b].b_master_arrived = 0;
5431  team->t.t_bar[b].b_team_arrived = 0;
5432 #endif
5433  }
5434  }
5435 
5436  team->t.t_proc_bind = new_proc_bind;
5437 
5438  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5439  team->t.t_id));
5440 
5441 #if OMPT_SUPPORT
5442  __ompt_team_assign_id(team, ompt_parallel_data);
5443 #endif
5444 
5445  KMP_MB();
5446 
5447  return team;
5448  }
5449 
5450  /* reap team if it is too small, then loop back and check the next one */
5451  // not sure if this is wise, but it will be redone during the hot-teams
5452  // rewrite.
5453  /* TODO: Use technique to find the right size hot-team, don't reap them */
5454  team = __kmp_reap_team(team);
5455  __kmp_team_pool = team;
5456  }
5457 
5458  /* nothing available in the pool, no matter, make a new team! */
5459  KMP_MB();
5460  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5461 
5462  /* and set it up */
5463  team->t.t_max_nproc = max_nproc;
5464  if (max_nproc > 1 &&
5465  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5466  // Allocate barrier structure
5467  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5468  }
5469 
5470  /* NOTE well, for some reason allocating one big buffer and dividing it up
5471  seems to really hurt performance a lot on the P4, so let's not use this */
5472  __kmp_allocate_team_arrays(team, max_nproc);
5473 
5474  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5475  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5476 
5477  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5478  "%p to NULL\n",
5479  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5480  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5481  // memory, no need to duplicate
5482  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5483  // memory, no need to duplicate
5484 
5485  if (__kmp_storage_map) {
5486  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5487  }
5488 
5489  /* allocate space for arguments */
5490  __kmp_alloc_argv_entries(argc, team, FALSE);
5491  team->t.t_argc = argc;
5492 
5493  KA_TRACE(20,
5494  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5495  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5496  { // Initialize barrier data.
5497  int b;
5498  for (b = 0; b < bs_last_barrier; ++b) {
5499  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5500 #if USE_DEBUGGER
5501  team->t.t_bar[b].b_master_arrived = 0;
5502  team->t.t_bar[b].b_team_arrived = 0;
5503 #endif
5504  }
5505  }
5506 
5507  team->t.t_proc_bind = new_proc_bind;
5508 
5509 #if OMPT_SUPPORT
5510  __ompt_team_assign_id(team, ompt_parallel_data);
5511  team->t.ompt_serialized_team_info = NULL;
5512 #endif
5513 
5514  KMP_MB();
5515 
5516  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5517  team->t.t_id));
5518 
5519  return team;
5520 }
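
// Editorial note (illustrative, not part of the runtime): the routine above
// first tries to reuse the hot team, and its behavior splits on how the
// requested size compares with the hot team's current size. The guarded
// sketch condenses that three-way decision; classify_hot_team and the enum
// are hypothetical, and the real code also reinitializes ICVs, barriers,
// task teams and place partitions along each path.
#if 0
enum hot_team_action { REUSE_AS_IS, SHRINK_TEAM, GROW_TEAM };

static enum hot_team_action classify_hot_team(int current_nproc,
                                              int requested_nproc) {
  if (current_nproc == requested_nproc)
    return REUSE_AS_IS; // most common case: nothing to resize
  if (current_nproc > requested_nproc)
    return SHRINK_TEAM; // release (or park) the extra workers
  return GROW_TEAM;     // allocate or wake additional workers
}
#endif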
5521 
5522 /* TODO implement hot-teams at all levels */
5523 /* TODO implement lazy thread release on demand (disband request) */
5524 
5525 /* free the team. return it to the team pool. release all the threads
5526  * associated with it */
5527 void __kmp_free_team(kmp_root_t *root,
5528  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5529  int f;
5530  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5531  team->t.t_id));
5532 
5533  /* verify state */
5534  KMP_DEBUG_ASSERT(root);
5535  KMP_DEBUG_ASSERT(team);
5536  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5537  KMP_DEBUG_ASSERT(team->t.t_threads);
5538 
5539  int use_hot_team = team == root->r.r_hot_team;
5540 #if KMP_NESTED_HOT_TEAMS
5541  int level;
5542  kmp_hot_team_ptr_t *hot_teams;
5543  if (master) {
5544  level = team->t.t_active_level - 1;
5545  if (master->th.th_teams_microtask) { // in teams construct?
5546  if (master->th.th_teams_size.nteams > 1) {
5547  ++level; // level was not increased in teams construct for
5548  // team_of_masters
5549  }
5550  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5551  master->th.th_teams_level == team->t.t_level) {
5552  ++level; // level was not increased in teams construct for
5553  // team_of_workers before the parallel
5554  } // team->t.t_level will be increased inside parallel
5555  }
5556  hot_teams = master->th.th_hot_teams;
5557  if (level < __kmp_hot_teams_max_level) {
5558  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5559  use_hot_team = 1;
5560  }
5561  }
5562 #endif // KMP_NESTED_HOT_TEAMS
5563 
5564  /* team is done working */
5565  TCW_SYNC_PTR(team->t.t_pkfn,
5566  NULL); // Important for Debugging Support Library.
5567 #if KMP_OS_WINDOWS
5568  team->t.t_copyin_counter = 0; // init counter for possible reuse
5569 #endif
5570  // Do not reset pointer to parent team to NULL for hot teams.
5571 
5572  /* if we are non-hot team, release our threads */
5573  if (!use_hot_team) {
5574  if (__kmp_tasking_mode != tskm_immediate_exec) {
5575  // Wait for threads to reach reapable state
5576  for (f = 1; f < team->t.t_nproc; ++f) {
5577  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5578  kmp_info_t *th = team->t.t_threads[f];
5579  volatile kmp_uint32 *state = &th->th.th_reap_state;
5580  while (*state != KMP_SAFE_TO_REAP) {
5581 #if KMP_OS_WINDOWS
5582  // On Windows a thread can be killed at any time, check this
5583  DWORD ecode;
5584  if (!__kmp_is_thread_alive(th, &ecode)) {
5585  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5586  break;
5587  }
5588 #endif
5589  // first check if thread is sleeping
5590  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5591  if (fl.is_sleeping())
5592  fl.resume(__kmp_gtid_from_thread(th));
5593  KMP_CPU_PAUSE();
5594  }
5595  }
5596 
5597  // Delete task teams
5598  int tt_idx;
5599  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5600  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5601  if (task_team != NULL) {
5602  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5603  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5604  team->t.t_threads[f]->th.th_task_team = NULL;
5605  }
5606  KA_TRACE(
5607  20,
5608  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5609  __kmp_get_gtid(), task_team, team->t.t_id));
5610 #if KMP_NESTED_HOT_TEAMS
5611  __kmp_free_task_team(master, task_team);
5612 #endif
5613  team->t.t_task_team[tt_idx] = NULL;
5614  }
5615  }
5616  }
5617 
5618  // Reset pointer to parent team only for non-hot teams.
5619  team->t.t_parent = NULL;
5620  team->t.t_level = 0;
5621  team->t.t_active_level = 0;
5622 
5623  /* free the worker threads */
5624  for (f = 1; f < team->t.t_nproc; ++f) {
5625  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5626  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5627  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5628  1, 2);
5629  }
5630  __kmp_free_thread(team->t.t_threads[f]);
5631  }
5632 
5633  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5634  if (team->t.b) {
5635  // wake up thread at old location
5636  team->t.b->go_release();
5637  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5638  for (f = 1; f < team->t.t_nproc; ++f) {
5639  if (team->t.b->sleep[f].sleep) {
5640  __kmp_atomic_resume_64(
5641  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5642  (kmp_atomic_flag_64<> *)NULL);
5643  }
5644  }
5645  }
5646  // Wait for threads to be removed from team
5647  for (int f = 1; f < team->t.t_nproc; ++f) {
5648  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5649  KMP_CPU_PAUSE();
5650  }
5651  }
5652  }
5653 
5654  for (f = 1; f < team->t.t_nproc; ++f) {
5655  team->t.t_threads[f] = NULL;
5656  }
5657 
5658  if (team->t.t_max_nproc > 1 &&
5659  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5660  distributedBarrier::deallocate(team->t.b);
5661  team->t.b = NULL;
5662  }
5663  /* put the team back in the team pool */
5664  /* TODO limit size of team pool, call reap_team if pool too large */
5665  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5666  __kmp_team_pool = (volatile kmp_team_t *)team;
5667  } else { // Check if team was created for primary threads in teams construct
5668  // See if first worker is a CG root
5669  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5670  team->t.t_threads[1]->th.th_cg_roots);
5671  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5672  // Clean up the CG root nodes on workers so that this team can be re-used
5673  for (f = 1; f < team->t.t_nproc; ++f) {
5674  kmp_info_t *thr = team->t.t_threads[f];
5675  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5676  thr->th.th_cg_roots->cg_root == thr);
5677  // Pop current CG root off list
5678  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5679  thr->th.th_cg_roots = tmp->up;
5680  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5681  " up to node %p. cg_nthreads was %d\n",
5682  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5683  int i = tmp->cg_nthreads--;
5684  if (i == 1) {
5685  __kmp_free(tmp); // free CG if we are the last thread in it
5686  }
5687  // Restore current task's thread_limit from CG root
5688  if (thr->th.th_cg_roots)
5689  thr->th.th_current_task->td_icvs.thread_limit =
5690  thr->th.th_cg_roots->cg_thread_limit;
5691  }
5692  }
5693  }
5694 
5695  KMP_MB();
5696 }
5697 
5698 /* reap the team. destroy it, reclaim all its resources and free its memory */
5699 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5700  kmp_team_t *next_pool = team->t.t_next_pool;
5701 
5702  KMP_DEBUG_ASSERT(team);
5703  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5704  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5705  KMP_DEBUG_ASSERT(team->t.t_threads);
5706  KMP_DEBUG_ASSERT(team->t.t_argv);
5707 
5708  /* TODO clean the threads that are a part of this? */
5709 
5710  /* free stuff */
5711  __kmp_free_team_arrays(team);
5712  if (team->t.t_argv != &team->t.t_inline_argv[0])
5713  __kmp_free((void *)team->t.t_argv);
5714  __kmp_free(team);
5715 
5716  KMP_MB();
5717  return next_pool;
5718 }
5719 
5720 // Free the thread. Don't reap it, just place it on the pool of available
5721 // threads.
5722 //
5723 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5724 // binding for the affinity mechanism to be useful.
5725 //
5726 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5727 // However, we want to avoid a potential performance problem by always
5728 // scanning through the list to find the correct point at which to insert
5729 // the thread (potential N**2 behavior). To do this we keep track of the
5730 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5731 // With single-level parallelism, threads will always be added to the tail
5732 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5733 // parallelism, all bets are off and we may need to scan through the entire
5734 // free list.
5735 //
5736 // This change also has a potentially large performance benefit, for some
5737 // applications. Previously, as threads were freed from the hot team, they
5738 // would be placed back on the free list in inverse order. If the hot team
5739 // grew back to its original size, then the freed thread would be placed
5740 // back on the hot team in reverse order. This could cause bad cache
5741 // locality problems on programs where the size of the hot team regularly
5742 // grew and shrunk.
5743 //
5744 // Now, for single-level parallelism, the OMP tid is always == gtid. A standalone sketch of this sorted insert appears after the function below.
5745 void __kmp_free_thread(kmp_info_t *this_th) {
5746  int gtid;
5747  kmp_info_t **scan;
5748 
5749  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5750  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5751 
5752  KMP_DEBUG_ASSERT(this_th);
5753 
5754  // When moving the thread to the pool, switch it to wait on its own b_go
5755  // flag, and leave its team uninitialized (NULL team).
5756  int b;
5757  kmp_balign_t *balign = this_th->th.th_bar;
5758  for (b = 0; b < bs_last_barrier; ++b) {
5759  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5760  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5761  balign[b].bb.team = NULL;
5762  balign[b].bb.leaf_kids = 0;
5763  }
5764  this_th->th.th_task_state = 0;
5765  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5766 
5767  /* put thread back on the free pool */
5768  TCW_PTR(this_th->th.th_team, NULL);
5769  TCW_PTR(this_th->th.th_root, NULL);
5770  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5771 
5772  while (this_th->th.th_cg_roots) {
5773  this_th->th.th_cg_roots->cg_nthreads--;
5774  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5775  " %p of thread %p to %d\n",
5776  this_th, this_th->th.th_cg_roots,
5777  this_th->th.th_cg_roots->cg_root,
5778  this_th->th.th_cg_roots->cg_nthreads));
5779  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5780  if (tmp->cg_root == this_th) { // Thread is a cg_root
5781  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5782  KA_TRACE(
5783  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5784  this_th->th.th_cg_roots = tmp->up;
5785  __kmp_free(tmp);
5786  } else { // Worker thread
5787  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5788  __kmp_free(tmp);
5789  }
5790  this_th->th.th_cg_roots = NULL;
5791  break;
5792  }
5793  }
5794 
5795  /* If the implicit task assigned to this thread can be used by other threads
5796  * -> multiple threads can share the data and try to free the task at
5797  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5798  * with higher probability when the hot team is disabled but can occur even when
5799  * the hot team is enabled */
5800  __kmp_free_implicit_task(this_th);
5801  this_th->th.th_current_task = NULL;
5802 
5803  // If the __kmp_thread_pool_insert_pt is already past the new insert
5804  // point, then we need to re-scan the entire list.
5805  gtid = this_th->th.th_info.ds.ds_gtid;
5806  if (__kmp_thread_pool_insert_pt != NULL) {
5807  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5808  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5809  __kmp_thread_pool_insert_pt = NULL;
5810  }
5811  }
5812 
5813  // Scan down the list to find the place to insert the thread.
5814  // scan is the address of a link in the list, possibly the address of
5815  // __kmp_thread_pool itself.
5816  //
5817  // In the absence of nested parallelism, the for loop will have 0 iterations.
5818  if (__kmp_thread_pool_insert_pt != NULL) {
5819  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5820  } else {
5821  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5822  }
5823  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5824  scan = &((*scan)->th.th_next_pool))
5825  ;
5826 
5827  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5828  // to its address.
5829  TCW_PTR(this_th->th.th_next_pool, *scan);
5830  __kmp_thread_pool_insert_pt = *scan = this_th;
5831  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5832  (this_th->th.th_info.ds.ds_gtid <
5833  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5834  TCW_4(this_th->th.th_in_pool, TRUE);
5835  __kmp_suspend_initialize_thread(this_th);
5836  __kmp_lock_suspend_mx(this_th);
5837  if (this_th->th.th_active == TRUE) {
5838  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5839  this_th->th.th_active_in_pool = TRUE;
5840  }
5841 #if KMP_DEBUG
5842  else {
5843  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5844  }
5845 #endif
5846  __kmp_unlock_suspend_mx(this_th);
5847 
5848  TCW_4(__kmp_nth, __kmp_nth - 1);
5849 
5850 #ifdef KMP_ADJUST_BLOCKTIME
5851  /* Adjust blocktime back to user setting or default if necessary */
5852  /* Middle initialization might never have occurred */
5853  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5854  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5855  if (__kmp_nth <= __kmp_avail_proc) {
5856  __kmp_zero_bt = FALSE;
5857  }
5858  }
5859 #endif /* KMP_ADJUST_BLOCKTIME */
5860 
5861  KMP_MB();
5862 }
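
// Editorial note (illustrative, not part of the runtime): the comment before
// __kmp_free_thread describes keeping the free pool sorted by gtid while
// remembering the last insertion point, so single-level parallelism pays no
// scanning cost. The guarded, self-contained sketch below reproduces that
// idea with a hypothetical node type and names (pool, insert_pt,
// pool_insert_sorted).
#if 0
struct node {
  int id;
  struct node *next;
};
static struct node *pool = NULL;      // head of the id-sorted free list
static struct node *insert_pt = NULL; // last insertion point (hint)

static void pool_insert_sorted(struct node *n) {
  if (insert_pt != NULL && insert_pt->id > n->id)
    insert_pt = NULL; // hint is past the new element; rescan from the head
  struct node **scan = insert_pt ? &insert_pt->next : &pool;
  while (*scan != NULL && (*scan)->id < n->id)
    scan = &(*scan)->next; // usually zero iterations for tail inserts
  n->next = *scan;
  *scan = n;
  insert_pt = n; // remember where this insert landed
}
#endif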
5863 
5864 /* ------------------------------------------------------------------------ */
5865 
5866 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5867 #if OMP_PROFILING_SUPPORT
5868  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5869  // TODO: add a configuration option for time granularity
5870  if (ProfileTraceFile)
5871  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5872 #endif
5873 
5874  int gtid = this_thr->th.th_info.ds.ds_gtid;
5875  /* void *stack_data;*/
5876  kmp_team_t **volatile pteam;
5877 
5878  KMP_MB();
5879  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5880 
5881  if (__kmp_env_consistency_check) {
5882  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5883  }
5884 
5885 #if OMPD_SUPPORT
5886  if (ompd_state & OMPD_ENABLE_BP)
5887  ompd_bp_thread_begin();
5888 #endif
5889 
5890 #if OMPT_SUPPORT
5891  ompt_data_t *thread_data = nullptr;
5892  if (ompt_enabled.enabled) {
5893  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5894  *thread_data = ompt_data_none;
5895 
5896  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5897  this_thr->th.ompt_thread_info.wait_id = 0;
5898  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5899  this_thr->th.ompt_thread_info.parallel_flags = 0;
5900  if (ompt_enabled.ompt_callback_thread_begin) {
5901  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5902  ompt_thread_worker, thread_data);
5903  }
5904  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5905  }
5906 #endif
5907 
5908  /* This is the place where threads wait for work */
5909  while (!TCR_4(__kmp_global.g.g_done)) {
5910  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5911  KMP_MB();
5912 
5913  /* wait for work to do */
5914  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5915 
5916  /* No tid yet since not part of a team */
5917  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5918 
5919 #if OMPT_SUPPORT
5920  if (ompt_enabled.enabled) {
5921  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5922  }
5923 #endif
5924 
5925  pteam = &this_thr->th.th_team;
5926 
5927  /* have we been allocated? */
5928  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5929  /* we were just woken up, so run our new task */
5930  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5931  int rc;
5932  KA_TRACE(20,
5933  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5934  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5935  (*pteam)->t.t_pkfn));
5936 
5937  updateHWFPControl(*pteam);
5938 
5939 #if OMPT_SUPPORT
5940  if (ompt_enabled.enabled) {
5941  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5942  }
5943 #endif
5944 
5945  rc = (*pteam)->t.t_invoke(gtid);
5946  KMP_ASSERT(rc);
5947 
5948  KMP_MB();
5949  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5950  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5951  (*pteam)->t.t_pkfn));
5952  }
5953 #if OMPT_SUPPORT
5954  if (ompt_enabled.enabled) {
5955  /* no frame set while outside task */
5956  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5957 
5958  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5959  }
5960 #endif
5961  /* join barrier after parallel region */
5962  __kmp_join_barrier(gtid);
5963  }
5964  }
5965  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5966 
5967 #if OMPD_SUPPORT
5968  if (ompd_state & OMPD_ENABLE_BP)
5969  ompd_bp_thread_end();
5970 #endif
5971 
5972 #if OMPT_SUPPORT
5973  if (ompt_enabled.ompt_callback_thread_end) {
5974  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5975  }
5976 #endif
5977 
5978  this_thr->th.th_task_team = NULL;
5979  /* run the destructors for the threadprivate data for this thread */
5980  __kmp_common_destroy_gtid(gtid);
5981 
5982  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5983  KMP_MB();
5984 
5985 #if OMP_PROFILING_SUPPORT
5986  llvm::timeTraceProfilerFinishThread();
5987 #endif
5988  return this_thr;
5989 }
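
// Editorial note (illustrative, not part of the runtime): __kmp_launch_thread
// above is the worker's whole lifetime: park at the fork barrier until work
// arrives, run the outlined microtask, meet the team at the join barrier, and
// repeat until shutdown. The guarded sketch restates that loop with
// hypothetical callbacks standing in for the barrier and microtask machinery.
#if 0
#include <stdbool.h>

typedef void (*microtask_fn)(void);

// done stands in for __kmp_global.g.g_done; wait_for_work blocks at the fork
// barrier and returns the microtask (or NULL); join waits at the join barrier.
static void worker_loop(volatile bool *done,
                        microtask_fn (*wait_for_work)(void),
                        void (*join)(void)) {
  while (!*done) {
    microtask_fn pkfn = wait_for_work(); // released by the primary thread
    if (*done)
      break; // woken for shutdown, not for work
    if (pkfn != NULL) {
      pkfn(); // run the parallel-region body
      join(); // rejoin the team before waiting for the next region
    }
  }
}
#endif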
5990 
5991 /* ------------------------------------------------------------------------ */
5992 
5993 void __kmp_internal_end_dest(void *specific_gtid) {
5994  // Make sure no significant bits are lost
5995  int gtid;
5996  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5997 
5998  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5999  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
6000  * reserved for the nothing-stored case; see the sketch after this function */
6001 
6002  __kmp_internal_end_thread(gtid);
6003 }
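
// Editorial note (illustrative, not part of the runtime): as noted above, the
// gtid is stored biased by +1 in thread-local storage because the value 0
// means "nothing stored". The guarded sketch shows the encode/decode pair;
// encode_gtid and decode_gtid are hypothetical names.
#if 0
#include <stdint.h>

static void *encode_gtid(int gtid) {
  return (void *)(intptr_t)(gtid + 1); // keep 0 reserved for "not set"
}

static int decode_gtid(void *specific) {
  return (int)(intptr_t)specific - 1; // inverse of the bias above
}
#endif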
6004 
6005 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6006 
6007 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6008  __kmp_internal_end_atexit();
6009 }
6010 
6011 #endif
6012 
6013 /* [Windows] josh: when the atexit handler is called, there may still be more
6014  than one thread alive */
6015 void __kmp_internal_end_atexit(void) {
6016  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6017  /* [Windows]
6018  josh: ideally, we want to completely shutdown the library in this atexit
6019  handler, but stat code that depends on thread specific data for gtid fails
6020  because that data becomes unavailable at some point during the shutdown, so
6021  we call __kmp_internal_end_thread instead. We should eventually remove the
6022  dependency on __kmp_get_specific_gtid in the stat code and use
6023  __kmp_internal_end_library to cleanly shutdown the library.
6024 
6025  // TODO: Can some of this comment about GVS be removed?
6026  I suspect that the offending stat code is executed when the calling thread
6027  tries to clean up a dead root thread's data structures, resulting in GVS
6028  code trying to close the GVS structures for that thread, but since the stat
6029  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6030  the calling thread is cleaning up itself instead of another thread, it get
6031  confused. This happens because allowing a thread to unregister and cleanup
6032  another thread is a recent modification for addressing an issue.
6033  Based on the current design (20050722), a thread may end up
6034  trying to unregister another thread only if thread death does not trigger
6035  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6036  thread specific data destructor function to detect thread death. For
6037  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6038  is nothing. Thus, the workaround is applicable only for Windows static
6039  stat library. */
6040  __kmp_internal_end_library(-1);
6041 #if KMP_OS_WINDOWS
6042  __kmp_close_console();
6043 #endif
6044 }
6045 
6046 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6047  // It is assumed __kmp_forkjoin_lock is acquired.
6048 
6049  int gtid;
6050 
6051  KMP_DEBUG_ASSERT(thread != NULL);
6052 
6053  gtid = thread->th.th_info.ds.ds_gtid;
6054 
6055  if (!is_root) {
6056  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6057  /* Assume the threads are at the fork barrier here */
6058  KA_TRACE(
6059  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6060  gtid));
6061  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6062  while (
6063  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6064  KMP_CPU_PAUSE();
6065  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6066  } else {
6067  /* Need release fence here to prevent seg faults for tree forkjoin
6068  barrier (GEH) */
6069  ANNOTATE_HAPPENS_BEFORE(thread);
6070  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6071  thread);
6072  __kmp_release_64(&flag);
6073  }
6074  }
6075 
6076  // Terminate OS thread.
6077  __kmp_reap_worker(thread);
6078 
6079  // The thread was killed asynchronously. If it was actively
6080  // spinning in the thread pool, decrement the global count.
6081  //
6082  // There is a small timing hole here - if the worker thread was just waking
6083  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6084  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6085  // the global counter might not get updated.
6086  //
6087  // Currently, this can only happen as the library is unloaded,
6088  // so there are no harmful side effects.
6089  if (thread->th.th_active_in_pool) {
6090  thread->th.th_active_in_pool = FALSE;
6091  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6092  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6093  }
6094  }
6095 
6096  __kmp_free_implicit_task(thread);
6097 
6098 // Free the fast memory for tasking
6099 #if USE_FAST_MEMORY
6100  __kmp_free_fast_memory(thread);
6101 #endif /* USE_FAST_MEMORY */
6102 
6103  __kmp_suspend_uninitialize_thread(thread);
6104 
6105  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6106  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6107 
6108  --__kmp_all_nth;
6109  // __kmp_nth was decremented when the thread was added to the pool.
6110 
6111 #ifdef KMP_ADJUST_BLOCKTIME
6112  /* Adjust blocktime back to user setting or default if necessary */
6113  /* Middle initialization might never have occurred */
6114  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6115  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6116  if (__kmp_nth <= __kmp_avail_proc) {
6117  __kmp_zero_bt = FALSE;
6118  }
6119  }
6120 #endif /* KMP_ADJUST_BLOCKTIME */
6121 
6122  /* free the memory being used */
6123  if (__kmp_env_consistency_check) {
6124  if (thread->th.th_cons) {
6125  __kmp_free_cons_stack(thread->th.th_cons);
6126  thread->th.th_cons = NULL;
6127  }
6128  }
6129 
6130  if (thread->th.th_pri_common != NULL) {
6131  __kmp_free(thread->th.th_pri_common);
6132  thread->th.th_pri_common = NULL;
6133  }
6134 
6135  if (thread->th.th_task_state_memo_stack != NULL) {
6136  __kmp_free(thread->th.th_task_state_memo_stack);
6137  thread->th.th_task_state_memo_stack = NULL;
6138  }
6139 
6140 #if KMP_USE_BGET
6141  if (thread->th.th_local.bget_data != NULL) {
6142  __kmp_finalize_bget(thread);
6143  }
6144 #endif
6145 
6146 #if KMP_AFFINITY_SUPPORTED
6147  if (thread->th.th_affin_mask != NULL) {
6148  KMP_CPU_FREE(thread->th.th_affin_mask);
6149  thread->th.th_affin_mask = NULL;
6150  }
6151 #endif /* KMP_AFFINITY_SUPPORTED */
6152 
6153 #if KMP_USE_HIER_SCHED
6154  if (thread->th.th_hier_bar_data != NULL) {
6155  __kmp_free(thread->th.th_hier_bar_data);
6156  thread->th.th_hier_bar_data = NULL;
6157  }
6158 #endif
6159 
6160  __kmp_reap_team(thread->th.th_serial_team);
6161  thread->th.th_serial_team = NULL;
6162  __kmp_free(thread);
6163 
6164  KMP_MB();
6165 
6166 } // __kmp_reap_thread
6167 
6168 static void __kmp_internal_end(void) {
6169  int i;
6170 
6171  /* First, unregister the library */
6172  __kmp_unregister_library();
6173 
6174 #if KMP_OS_WINDOWS
6175  /* In Win static library, we can't tell when a root actually dies, so we
6176  reclaim the data structures for any root threads that have died but not
6177  unregistered themselves, in order to shut down cleanly.
6178  In Win dynamic library we also can't tell when a thread dies. */
6179  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6180 // dead roots
6181 #endif
6182 
6183  for (i = 0; i < __kmp_threads_capacity; i++)
6184  if (__kmp_root[i])
6185  if (__kmp_root[i]->r.r_active)
6186  break;
6187  KMP_MB(); /* Flush all pending memory write invalidates. */
6188  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6189 
6190  if (i < __kmp_threads_capacity) {
6191 #if KMP_USE_MONITOR
6192  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6193  KMP_MB(); /* Flush all pending memory write invalidates. */
6194 
6195  // Need to check that monitor was initialized before reaping it. If we are
6196  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6197  // __kmp_monitor will appear to contain valid data, but it is only valid in
6198  // the parent process, not the child.
6199  // New behavior (201008): instead of keying off of the flag
6200  // __kmp_init_parallel, the monitor thread creation is keyed off
6201  // of the new flag __kmp_init_monitor.
6202  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6203  if (TCR_4(__kmp_init_monitor)) {
6204  __kmp_reap_monitor(&__kmp_monitor);
6205  TCW_4(__kmp_init_monitor, 0);
6206  }
6207  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6208  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6209 #endif // KMP_USE_MONITOR
6210  } else {
6211 /* TODO move this to cleanup code */
6212 #ifdef KMP_DEBUG
6213  /* make sure that everything has properly ended */
6214  for (i = 0; i < __kmp_threads_capacity; i++) {
6215  if (__kmp_root[i]) {
6216  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6217  // there can be uber threads alive here
6218  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6219  }
6220  }
6221 #endif
6222 
6223  KMP_MB();
6224 
6225  // Reap the worker threads.
6226  // This is valid for now, but be careful if threads are reaped sooner.
6227  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6228  // Get the next thread from the pool.
6229  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6230  __kmp_thread_pool = thread->th.th_next_pool;
6231  // Reap it.
6232  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6233  thread->th.th_next_pool = NULL;
6234  thread->th.th_in_pool = FALSE;
6235  __kmp_reap_thread(thread, 0);
6236  }
6237  __kmp_thread_pool_insert_pt = NULL;
6238 
6239  // Reap teams.
6240  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6241  // Get the next team from the pool.
6242  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6243  __kmp_team_pool = team->t.t_next_pool;
6244  // Reap it.
6245  team->t.t_next_pool = NULL;
6246  __kmp_reap_team(team);
6247  }
6248 
6249  __kmp_reap_task_teams();
6250 
6251 #if KMP_OS_UNIX
6252  // Threads that are not reaped should not access any resources since they
6253  // are going to be deallocated soon, so the shutdown sequence should wait
6254  // until all threads either exit the final spin-waiting loop or begin
6255  // sleeping after the given blocktime.
6256  for (i = 0; i < __kmp_threads_capacity; i++) {
6257  kmp_info_t *thr = __kmp_threads[i];
6258  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6259  KMP_CPU_PAUSE();
6260  }
6261 #endif
6262 
6263  for (i = 0; i < __kmp_threads_capacity; ++i) {
6264  // TBD: Add some checking...
6265  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6266  }
6267 
6268  /* Make sure all threadprivate destructors get run by joining with all
6269  worker threads before resetting this flag */
6270  TCW_SYNC_4(__kmp_init_common, FALSE);
6271 
6272  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6273  KMP_MB();
6274 
6275 #if KMP_USE_MONITOR
6276  // See note above: One of the possible fixes for CQ138434 / CQ140126
6277  //
6278  // FIXME: push both code fragments down and CSE them?
6279  // push them into __kmp_cleanup() ?
6280  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6281  if (TCR_4(__kmp_init_monitor)) {
6282  __kmp_reap_monitor(&__kmp_monitor);
6283  TCW_4(__kmp_init_monitor, 0);
6284  }
6285  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6286  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6287 #endif
6288  } /* else !__kmp_global.t_active */
6289  TCW_4(__kmp_init_gtid, FALSE);
6290  KMP_MB(); /* Flush all pending memory write invalidates. */
6291 
6292  __kmp_cleanup();
6293 #if OMPT_SUPPORT
6294  ompt_fini();
6295 #endif
6296 }
6297 
6298 void __kmp_internal_end_library(int gtid_req) {
6299  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6300  /* this shouldn't be a race condition because __kmp_internal_end() is the
6301  only place to clear __kmp_serial_init */
6302  /* we'll check this later too, after we get the lock */
6303  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6304  // redundant, because the next check will work in any case.
6305  if (__kmp_global.g.g_abort) {
6306  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6307  /* TODO abort? */
6308  return;
6309  }
6310  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6312  return;
6313  }
6314 
6315  KMP_MB(); /* Flush all pending memory write invalidates. */
6316  /* find out who we are and what we should do */
6317  {
6318  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6319  KA_TRACE(
6320  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6321  if (gtid == KMP_GTID_SHUTDOWN) {
6322  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6323  "already shutdown\n"));
6324  return;
6325  } else if (gtid == KMP_GTID_MONITOR) {
6326  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6327  "registered, or system shutdown\n"));
6328  return;
6329  } else if (gtid == KMP_GTID_DNE) {
6330  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6331  "shutdown\n"));
6332  /* we don't know who we are, but we may still shutdown the library */
6333  } else if (KMP_UBER_GTID(gtid)) {
6334  /* unregister ourselves as an uber thread. gtid is no longer valid */
6335  if (__kmp_root[gtid]->r.r_active) {
6336  __kmp_global.g.g_abort = -1;
6337  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6338  __kmp_unregister_library();
6339  KA_TRACE(10,
6340  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6341  gtid));
6342  return;
6343  } else {
6344  KA_TRACE(
6345  10,
6346  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6347  __kmp_unregister_root_current_thread(gtid);
6348  }
6349  } else {
6350 /* worker threads may call this function through the atexit handler, if they
6351  * call exit() */
6352 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6353  TODO: do a thorough shutdown instead */
6354 #ifdef DUMP_DEBUG_ON_EXIT
6355  if (__kmp_debug_buf)
6356  __kmp_dump_debug_buffer();
6357 #endif
6358  // The unregister-library call was added here for the switch to shared
6359  // memory (shm) on Linux; without it, stale files would pile up in /dev/shm.
6360  // Clean up the shared memory file before exiting.
6361  __kmp_unregister_library();
6362  return;
6363  }
6364  }
6365  /* synchronize the termination process */
6366  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6367 
6368  /* have we already finished */
6369  if (__kmp_global.g.g_abort) {
6370  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6371  /* TODO abort? */
6372  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373  return;
6374  }
6375  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6376  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6377  return;
6378  }
6379 
6380  /* We need this lock to enforce mutual exclusion between this reading of
6381  __kmp_threads_capacity and the writing by __kmp_register_root.
6382  Alternatively, we can use a counter of roots that is atomically updated by
6383  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6384  __kmp_internal_end_*. */
6385  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6386 
6387  /* now we can safely conduct the actual termination */
6388  __kmp_internal_end();
6389 
6390  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6391  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6392 
6393  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6394 
6395 #ifdef DUMP_DEBUG_ON_EXIT
6396  if (__kmp_debug_buf)
6397  __kmp_dump_debug_buffer();
6398 #endif
6399 
6400 #if KMP_OS_WINDOWS
6401  __kmp_close_console();
6402 #endif
6403 
6404  __kmp_fini_allocator();
6405 
6406 } // __kmp_internal_end_library
6407 
6408 void __kmp_internal_end_thread(int gtid_req) {
6409  int i;
6410 
6411  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6412  /* this shouldn't be a race condition because __kmp_internal_end() is the
6413  * only place to clear __kmp_serial_init */
6414  /* we'll check this later too, after we get the lock */
6415  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6416  // redundant, because the next check will work in any case.
6417  if (__kmp_global.g.g_abort) {
6418  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6419  /* TODO abort? */
6420  return;
6421  }
6422  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6423  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6424  return;
6425  }
6426 
6427  // If hidden helper team has been initialized, we need to deinit it
6428  if (TCR_4(__kmp_init_hidden_helper)) {
6429  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6430  // First release the main thread to let it continue its work
6431  __kmp_hidden_helper_main_thread_release();
6432  // Wait until the hidden helper team has been destroyed
6433  __kmp_hidden_helper_threads_deinitz_wait();
6434  }
6435 
6436  KMP_MB(); /* Flush all pending memory write invalidates. */
6437 
6438  /* find out who we are and what we should do */
6439  {
6440  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6441  KA_TRACE(10,
6442  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6443  if (gtid == KMP_GTID_SHUTDOWN) {
6444  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6445  "already shutdown\n"));
6446  return;
6447  } else if (gtid == KMP_GTID_MONITOR) {
6448  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6449  "registered, or system shutdown\n"));
6450  return;
6451  } else if (gtid == KMP_GTID_DNE) {
6452  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6453  "shutdown\n"));
6454  return;
6455  /* we don't know who we are */
6456  } else if (KMP_UBER_GTID(gtid)) {
6457  /* unregister ourselves as an uber thread. gtid is no longer valid */
6458  if (__kmp_root[gtid]->r.r_active) {
6459  __kmp_global.g.g_abort = -1;
6460  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6461  KA_TRACE(10,
6462  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6463  gtid));
6464  return;
6465  } else {
6466  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6467  gtid));
6468  __kmp_unregister_root_current_thread(gtid);
6469  }
6470  } else {
6471  /* just a worker thread, let's leave */
6472  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6473 
6474  if (gtid >= 0) {
6475  __kmp_threads[gtid]->th.th_task_team = NULL;
6476  }
6477 
6478  KA_TRACE(10,
6479  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6480  gtid));
6481  return;
6482  }
6483  }
6484 #if KMP_DYNAMIC_LIB
6485  if (__kmp_pause_status != kmp_hard_paused)
6486  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6487  // because it is better to shut down later, in the library destructor.
6488  {
6489  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6490  return;
6491  }
6492 #endif
6493  /* synchronize the termination process */
6494  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6495 
6496  /* have we already finished */
6497  if (__kmp_global.g.g_abort) {
6498  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6499  /* TODO abort? */
6500  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6501  return;
6502  }
6503  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6504  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6505  return;
6506  }
6507 
6508  /* We need this lock to enforce mutual exclusion between this reading of
6509  __kmp_threads_capacity and the writing by __kmp_register_root.
6510  Alternatively, we can use a counter of roots that is atomically updated by
6511  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6512  __kmp_internal_end_*. */
6513 
6514  /* should we finish the run-time? are all siblings done? */
6515  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6516 
6517  for (i = 0; i < __kmp_threads_capacity; ++i) {
6518  if (KMP_UBER_GTID(i)) {
6519  KA_TRACE(
6520  10,
6521  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6522  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6523  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6524  return;
6525  }
6526  }
6527 
6528  /* now we can safely conduct the actual termination */
6529 
6530  __kmp_internal_end();
6531 
6532  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6533  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6534 
6535  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6536 
6537 #ifdef DUMP_DEBUG_ON_EXIT
6538  if (__kmp_debug_buf)
6539  __kmp_dump_debug_buffer();
6540 #endif
6541 } // __kmp_internal_end_thread
6542 
6543 // -----------------------------------------------------------------------------
6544 // Library registration stuff.
6545 
6546 static long __kmp_registration_flag = 0;
6547 // Random value used to indicate library initialization.
6548 static char *__kmp_registration_str = NULL;
6549 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6550 
6551 static inline char *__kmp_reg_status_name() {
6552 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6553  each thread. If registration and unregistration go in different threads
6554  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6555  env var cannot be found, because the name will contain a different pid. */
6556 // macOS* complains about name being too long with additional getuid()
6557 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6558  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6559  (int)getuid());
6560 #else
6561  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6562 #endif
6563 } // __kmp_reg_status_name
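// For illustration only (hypothetical pid/uid): with getpid() == 12345 and
// getuid() == 1000, the Unix dynamic-library branch above would produce
// "__KMP_REGISTERED_LIB_12345_1000", and the other branch
// "__KMP_REGISTERED_LIB_12345".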
6564 
6565 void __kmp_register_library_startup(void) {
6566 
6567  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6568  int done = 0;
6569  union {
6570  double dtime;
6571  long ltime;
6572  } time;
6573 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6574  __kmp_initialize_system_tick();
6575 #endif
6576  __kmp_read_system_time(&time.dtime);
6577  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6578  __kmp_registration_str =
6579  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6580  __kmp_registration_flag, KMP_LIBRARY_FILE);
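  // For illustration only (hypothetical values): if the flag lives at
  // 0x7f2a5c0013a0, the flag value is 0xcafe1234 and KMP_LIBRARY_FILE expands
  // to, say, "libomp.so", the "%p-%lx-%s" format above yields something like
  // "0x7f2a5c0013a0-cafe1234-libomp.so", i.e. <flag address>-<flag value>-<file>.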
6581 
6582  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6583  __kmp_registration_str));
6584 
6585  while (!done) {
6586 
6587  char *value = NULL; // Actual value of the environment variable.
6588 
6589 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6590  char *shm_name = __kmp_str_format("/%s", name);
6591  int shm_preexist = 0;
6592  char *data1;
6593  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6594  if ((fd1 == -1) && (errno == EEXIST)) {
6595  // file didn't open because it already exists.
6596  // try opening existing file
6597  fd1 = shm_open(shm_name, O_RDWR, 0666);
6598  if (fd1 == -1) { // file didn't open
6599  // error out here
6600  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6601  __kmp_msg_null);
6602  } else {
6603  // able to open existing file
6604  shm_preexist = 1;
6605  }
6606  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6607  // "already exists" (EEXIST).
6608  // error out here.
6609  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6610  __kmp_msg_null);
6611  }
6612  if (shm_preexist == 0) {
6613  // we created the SHM; now set its size
6614  if (ftruncate(fd1, SHM_SIZE) == -1) {
6615  // error occurred setting size
6616  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6617  KMP_ERR(errno), __kmp_msg_null);
6618  }
6619  }
6620  data1 =
6621  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6622  if (data1 == MAP_FAILED) {
6623  // failed to map shared memory
6624  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6625  __kmp_msg_null);
6626  }
6627  if (shm_preexist == 0) { // set data to SHM, set value
6628  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6629  }
6630  // Read value from either what we just wrote or existing file.
6631  value = __kmp_str_format("%s", data1); // read value from SHM
6632  munmap(data1, SHM_SIZE);
6633  close(fd1);
6634 #else // Windows and unix with static library
6635  // Set the environment variable, but do not overwrite it if it already exists.
6636  __kmp_env_set(name, __kmp_registration_str, 0);
6637  // read value to see if it got set
6638  value = __kmp_env_get(name);
6639 #endif
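  // At this point 'value' holds the registration string that is currently
  // visible to every copy of the library, whether it came from the
  // shared-memory segment or from the environment variable.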
6640 
6641  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6642  done = 1; // Ok, environment variable set successfully, exit the loop.
6643  } else {
6644  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6645  // Check whether it is alive or dead.
6646  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6647  char *tail = value;
6648  char *flag_addr_str = NULL;
6649  char *flag_val_str = NULL;
6650  char const *file_name = NULL;
6651  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6652  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6653  file_name = tail;
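  // Example (hypothetical values): a registration string of
  // "0x7f2a5c0013a0-cafe1234-libomp.so" splits into
  // flag_addr_str == "0x7f2a5c0013a0", flag_val_str == "cafe1234" and
  // file_name == "libomp.so".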
6654  if (tail != NULL) {
6655  long *flag_addr = 0;
6656  unsigned long flag_val = 0;
6657  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6658  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6659  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6660  // First, check whether environment-encoded address is mapped into
6661  // addr space.
6662  // If so, dereference it to see if it still has the right value.
6663  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6664  neighbor = 1;
6665  } else {
6666  // If not, then we know the other copy of the library is no longer
6667  // running.
6668  neighbor = 2;
6669  }
6670  }
6671  }
6672  switch (neighbor) {
6673  case 0: // Cannot parse environment variable -- neighbor status unknown.
6674  // Assume it is the incompatible format of a future version of the
6675  // library. Assume the other library is alive.
6676  // WARN( ... ); // TODO: Issue a warning.
6677  file_name = "unknown library";
6678  KMP_FALLTHROUGH();
6679  // Attention! Falling through to the next case. That's intentional.
6680  case 1: { // Neighbor is alive.
6681  // Check whether it is allowed.
6682  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6683  if (!__kmp_str_match_true(duplicate_ok)) {
6684  // That's not allowed. Issue fatal error.
6685  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6686  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6687  }
6688  KMP_INTERNAL_FREE(duplicate_ok);
6689  __kmp_duplicate_library_ok = 1;
6690  done = 1; // Exit the loop.
6691  } break;
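  // Note: setting KMP_DUPLICATE_LIB_OK to a true value in the environment
  // downgrades the duplicate-library case above from a fatal error to silent
  // acceptance (__kmp_duplicate_library_ok is then set to 1).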
6692  case 2: { // Neighbor is dead.
6693 
6694 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6695  // close shared memory.
6696  shm_unlink(shm_name); // this removes file in /dev/shm
6697 #else
6698  // Clear the variable and try to register library again.
6699  __kmp_env_unset(name);
6700 #endif
6701  } break;
6702  default: {
6703  KMP_DEBUG_ASSERT(0);
6704  } break;
6705  }
6706  }
6707  KMP_INTERNAL_FREE((void *)value);
6708 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6709  KMP_INTERNAL_FREE((void *)shm_name);
6710 #endif
6711  } // while
6712  KMP_INTERNAL_FREE((void *)name);
6713 
6714 } // func __kmp_register_library_startup
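// On Linux with the dynamic library, the registration object created above is
// typically visible as /dev/shm/__KMP_REGISTERED_LIB_<pid>_<uid>; it is removed
// by shm_unlink() either in __kmp_unregister_library() below or when a dead
// neighbor is detected.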
6715 
6716 void __kmp_unregister_library(void) {
6717 
6718  char *name = __kmp_reg_status_name();
6719  char *value = NULL;
6720 
6721 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6722  char *shm_name = __kmp_str_format("/%s", name);
6723  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6724  if (fd1 == -1) {
6725  // file did not open. return.
6726  return;
6727  }
6728  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6729  if (data1 != MAP_FAILED) {
6730  value = __kmp_str_format("%s", data1); // read value from SHM
6731  munmap(data1, SHM_SIZE);
6732  }
6733  close(fd1);
6734 #else
6735  value = __kmp_env_get(name);
6736 #endif
6737 
6738  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6739  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6740  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6741 // Ok, this is our variable. Delete it.
6742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6743  shm_unlink(shm_name); // this removes file in /dev/shm
6744 #else
6745  __kmp_env_unset(name);
6746 #endif
6747  }
6748 
6749 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6750  KMP_INTERNAL_FREE(shm_name);
6751 #endif
6752 
6753  KMP_INTERNAL_FREE(__kmp_registration_str);
6754  KMP_INTERNAL_FREE(value);
6755  KMP_INTERNAL_FREE(name);
6756 
6757  __kmp_registration_flag = 0;
6758  __kmp_registration_str = NULL;
6759 
6760 } // __kmp_unregister_library
6761 
6762 // End of Library registration stuff.
6763 // -----------------------------------------------------------------------------
6764 
6765 #if KMP_MIC_SUPPORTED
6766 
6767 static void __kmp_check_mic_type() {
6768  kmp_cpuid_t cpuid_state = {0};
6769  kmp_cpuid_t *cs_p = &cpuid_state;
6770  __kmp_x86_cpuid(1, 0, cs_p);
6771  // We don't support mic1 at the moment
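  // The masks below select the family/model fields of CPUID leaf 1 EAX:
  // (eax & 0xff0) == 0xB10 corresponds to family 0x0B, model 1 (Knights
  // Corner, reported as mic2), and (eax & 0xf0ff0) == 0x50670 to family 6,
  // extended model 5, model 7 (Knights Landing, reported as mic3).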
6772  if ((cs_p->eax & 0xff0) == 0xB10) {
6773  __kmp_mic_type = mic2;
6774  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6775  __kmp_mic_type = mic3;
6776  } else {
6777  __kmp_mic_type = non_mic;
6778  }
6779 }
6780 
6781 #endif /* KMP_MIC_SUPPORTED */
6782 
6783 #if KMP_HAVE_UMWAIT
6784 static void __kmp_user_level_mwait_init() {
6785  struct kmp_cpuid buf;
6786  __kmp_x86_cpuid(7, 0, &buf);
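  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature flag
  // (umonitor/umwait/tpause); the next line combines it with the value of
  // __kmp_user_level_mwait.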
6787  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6788  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6789  __kmp_umwait_enabled));
6790 }
6791 #elif KMP_HAVE_MWAIT
6792 #ifndef AT_INTELPHIUSERMWAIT
6793 // Spurious, non-existent value that should always fail to return anything.
6794 // Will be replaced with the correct value when we know it.
6795 #define AT_INTELPHIUSERMWAIT 10000
6796 #endif
6797 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
6798 // earlier OS is used to build the RTL, we'll use the following internal
6799 // function when the entry is not found.
6800 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6801 unsigned long getauxval(unsigned long) { return 0; }
6802 
6803 static void __kmp_user_level_mwait_init() {
6804  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6805  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6806  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6807  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6808  if (__kmp_mic_type == mic3) {
6809  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6810  if ((res & 0x1) || __kmp_user_level_mwait) {
6811  __kmp_mwait_enabled = TRUE;
6812  if (__kmp_user_level_mwait) {
6813  KMP_INFORM(EnvMwaitWarn);
6814  }
6815  } else {
6816  __kmp_mwait_enabled = FALSE;
6817  }
6818  }
6819  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6820  "__kmp_mwait_enabled = %d\n",
6821  __kmp_mic_type, __kmp_mwait_enabled));
6822 }
6823 #endif /* KMP_HAVE_UMWAIT */
6824 
6825 static void __kmp_do_serial_initialize(void) {
6826  int i, gtid;
6827  size_t size;
6828 
6829  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6830 
6831  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6832  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6833  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6834  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6835  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6836 
6837 #if OMPT_SUPPORT
6838  ompt_pre_init();
6839 #endif
6840 #if OMPD_SUPPORT
6841  __kmp_env_dump();
6842  ompd_init();
6843 #endif
6844 
6845  __kmp_validate_locks();
6846 
6847  /* Initialize internal memory allocator */
6848  __kmp_init_allocator();
6849 
6850  /* Register the library startup via an environment variable and check to see
6851  whether another copy of the library is already registered. */
6852 
6853  __kmp_register_library_startup();
6854 
6855  /* TODO reinitialization of library */
6856  if (TCR_4(__kmp_global.g.g_done)) {
6857  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6858  }
6859 
6860  __kmp_global.g.g_abort = 0;
6861  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6862 
6863 /* initialize the locks */
6864 #if KMP_USE_ADAPTIVE_LOCKS
6865 #if KMP_DEBUG_ADAPTIVE_LOCKS
6866  __kmp_init_speculative_stats();
6867 #endif
6868 #endif
6869 #if KMP_STATS_ENABLED
6870  __kmp_stats_init();
6871 #endif
6872  __kmp_init_lock(&__kmp_global_lock);
6873  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6874  __kmp_init_lock(&__kmp_debug_lock);
6875  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6876  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6877  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6878  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6879  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6880  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6881  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6882  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6883  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6884  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6885  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6886  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6887  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6888  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6889  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6890 #if KMP_USE_MONITOR
6891  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6892 #endif
6893  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6894 
6895  /* conduct initialization and initial setup of configuration */
6896 
6897  __kmp_runtime_initialize();
6898 
6899 #if KMP_MIC_SUPPORTED
6900  __kmp_check_mic_type();
6901 #endif
6902 
6903 // Some global variable initialization moved here from kmp_env_initialize()
6904 #ifdef KMP_DEBUG
6905  kmp_diag = 0;
6906 #endif
6907  __kmp_abort_delay = 0;
6908 
6909  // From __kmp_init_dflt_team_nth()
6910  /* assume the entire machine will be used */
6911  __kmp_dflt_team_nth_ub = __kmp_xproc;
6912  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6913  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6914  }
6915  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6916  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6917  }
6918  __kmp_max_nth = __kmp_sys_max_nth;
6919  __kmp_cg_max_nth = __kmp_sys_max_nth;
6920  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6921  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6922  __kmp_teams_max_nth = __kmp_sys_max_nth;
6923  }
6924 
6925  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6926  // part
6927  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6928 #if KMP_USE_MONITOR
6929  __kmp_monitor_wakeups =
6930  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6931  __kmp_bt_intervals =
6932  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6933 #endif
6934  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6935  __kmp_library = library_throughput;
6936  // From KMP_SCHEDULE initialization
6937  __kmp_static = kmp_sch_static_balanced;
6938 // AC: do not use analytical here, because it is non-monotonic
6939 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6940 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6941 // need to repeat assignment
6942 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6943 // bit control and barrier method control parts
6944 #if KMP_FAST_REDUCTION_BARRIER
6945 #define kmp_reduction_barrier_gather_bb ((int)1)
6946 #define kmp_reduction_barrier_release_bb ((int)1)
6947 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
6948 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
6949 #endif // KMP_FAST_REDUCTION_BARRIER
6950  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6951  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6952  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6953  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6954  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6955 #if KMP_FAST_REDUCTION_BARRIER
6956  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6957  // lin_64 ): hyper,1
6958  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6959  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6960  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6961  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6962  }
6963 #endif // KMP_FAST_REDUCTION_BARRIER
6964  }
6965 #if KMP_FAST_REDUCTION_BARRIER
6966 #undef kmp_reduction_barrier_release_pat
6967 #undef kmp_reduction_barrier_gather_pat
6968 #undef kmp_reduction_barrier_release_bb
6969 #undef kmp_reduction_barrier_gather_bb
6970 #endif // KMP_FAST_REDUCTION_BARRIER
6971 #if KMP_MIC_SUPPORTED
6972  if (__kmp_mic_type == mic2) { // KNC
6973  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6974  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6975  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6976  1; // forkjoin release
6977  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6978  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6979  }
6980 #if KMP_FAST_REDUCTION_BARRIER
6981  if (__kmp_mic_type == mic2) { // KNC
6982  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6983  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6984  }
6985 #endif // KMP_FAST_REDUCTION_BARRIER
6986 #endif // KMP_MIC_SUPPORTED
6987 
6988 // From KMP_CHECKS initialization
6989 #ifdef KMP_DEBUG
6990  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6991 #else
6992  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6993 #endif
6994 
6995  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6996  __kmp_foreign_tp = TRUE;
6997 
6998  __kmp_global.g.g_dynamic = FALSE;
6999  __kmp_global.g.g_dynamic_mode = dynamic_default;
7000 
7001  __kmp_init_nesting_mode();
7002 
7003  __kmp_env_initialize(NULL);
7004 
7005 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7006  __kmp_user_level_mwait_init();
7007 #endif
7008 // Print all messages in message catalog for testing purposes.
7009 #ifdef KMP_DEBUG
7010  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7011  if (__kmp_str_match_true(val)) {
7012  kmp_str_buf_t buffer;
7013  __kmp_str_buf_init(&buffer);
7014  __kmp_i18n_dump_catalog(&buffer);
7015  __kmp_printf("%s", buffer.str);
7016  __kmp_str_buf_free(&buffer);
7017  }
7018  __kmp_env_free(&val);
7019 #endif
7020 
7021  __kmp_threads_capacity =
7022  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7023  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7024  __kmp_tp_capacity = __kmp_default_tp_capacity(
7025  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7026 
7027  // If the library is shut down properly, both pools must be NULL. Just in
7028  // case, set them to NULL -- some memory may leak, but subsequent code will
7029  // work even if pools are not freed.
7030  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7031  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7032  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7033  __kmp_thread_pool = NULL;
7034  __kmp_thread_pool_insert_pt = NULL;
7035  __kmp_team_pool = NULL;
7036 
7037  /* Allocate all of the variable sized records */
7038  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7039  * expandable */
7040  /* Since allocation is cache-aligned, just add extra padding at the end */
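  /* Layout of the single allocation below: __kmp_threads[0 .. capacity-1]
     immediately followed by __kmp_root[0 .. capacity-1], with CACHE_LINE bytes
     of slack at the end. */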
7041  size =
7042  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7043  CACHE_LINE;
7044  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7045  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7046  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7047 
7048  /* init thread counts */
7049  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7050  0); // Asserts fail if the library is reinitializing and
7051  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7052  __kmp_all_nth = 0;
7053  __kmp_nth = 0;
7054 
7055  /* setup the uber master thread and hierarchy */
7056  gtid = __kmp_register_root(TRUE);
7057  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7058  KMP_ASSERT(KMP_UBER_GTID(gtid));
7059  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7060 
7061  KMP_MB(); /* Flush all pending memory write invalidates. */
7062 
7063  __kmp_common_initialize();
7064 
7065 #if KMP_OS_UNIX
7066  /* invoke the child fork handler */
7067  __kmp_register_atfork();
7068 #endif
7069 
7070 #if !KMP_DYNAMIC_LIB
7071  {
7072  /* Invoke the exit handler when the program finishes, only for static
7073  library. For dynamic library, we already have _fini and DllMain. */
7074  int rc = atexit(__kmp_internal_end_atexit);
7075  if (rc != 0) {
7076  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7077  __kmp_msg_null);
7078  }
7079  }
7080 #endif
7081 
7082 #if KMP_HANDLE_SIGNALS
7083 #if KMP_OS_UNIX
7084  /* NOTE: make sure that this is called before the user installs their own
7085  signal handlers so that the user handlers are called first. this way they
7086  can return false, not call our handler, avoid terminating the library, and
7087  continue execution where they left off. */
7088  __kmp_install_signals(FALSE);
7089 #endif /* KMP_OS_UNIX */
7090 #if KMP_OS_WINDOWS
7091  __kmp_install_signals(TRUE);
7092 #endif /* KMP_OS_WINDOWS */
7093 #endif
7094 
7095  /* we have finished the serial initialization */
7096  __kmp_init_counter++;
7097 
7098  __kmp_init_serial = TRUE;
7099 
7100  if (__kmp_settings) {
7101  __kmp_env_print();
7102  }
7103 
7104  if (__kmp_display_env || __kmp_display_env_verbose) {
7105  __kmp_env_print_2();
7106  }
7107 
7108 #if OMPT_SUPPORT
7109  ompt_post_init();
7110 #endif
7111 
7112  KMP_MB();
7113 
7114  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7115 }
7116 
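// Double-checked initialization: test __kmp_init_serial without the lock for
// the fast path, then re-test after acquiring __kmp_initz_lock so that only
// one thread ever runs __kmp_do_serial_initialize(). The same pattern is used
// by __kmp_middle_initialize() and __kmp_parallel_initialize() below.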
7117 void __kmp_serial_initialize(void) {
7118  if (__kmp_init_serial) {
7119  return;
7120  }
7121  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7122  if (__kmp_init_serial) {
7123  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7124  return;
7125  }
7126  __kmp_do_serial_initialize();
7127  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7128 }
7129 
7130 static void __kmp_do_middle_initialize(void) {
7131  int i, j;
7132  int prev_dflt_team_nth;
7133 
7134  if (!__kmp_init_serial) {
7135  __kmp_do_serial_initialize();
7136  }
7137 
7138  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7139 
7140  // Save the previous value for the __kmp_dflt_team_nth so that
7141  // we can avoid some reinitialization if it hasn't changed.
7142  prev_dflt_team_nth = __kmp_dflt_team_nth;
7143 
7144 #if KMP_AFFINITY_SUPPORTED
7145  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7146  // number of cores on the machine.
7147  __kmp_affinity_initialize();
7148 
7149 #endif /* KMP_AFFINITY_SUPPORTED */
7150 
7151  KMP_ASSERT(__kmp_xproc > 0);
7152  if (__kmp_avail_proc == 0) {
7153  __kmp_avail_proc = __kmp_xproc;
7154  }
7155 
7156  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7157  // correct them now
7158  j = 0;
7159  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7160  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7161  __kmp_avail_proc;
7162  j++;
7163  }
7164 
7165  if (__kmp_dflt_team_nth == 0) {
7166 #ifdef KMP_DFLT_NTH_CORES
7167  // Default #threads = #cores
7168  __kmp_dflt_team_nth = __kmp_ncores;
7169  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7170  "__kmp_ncores (%d)\n",
7171  __kmp_dflt_team_nth));
7172 #else
7173  // Default #threads = #available OS procs
7174  __kmp_dflt_team_nth = __kmp_avail_proc;
7175  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7176  "__kmp_avail_proc(%d)\n",
7177  __kmp_dflt_team_nth));
7178 #endif /* KMP_DFLT_NTH_CORES */
7179  }
7180 
7181  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7182  __kmp_dflt_team_nth = KMP_MIN_NTH;
7183  }
7184  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7185  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7186  }
7187 
7188  if (__kmp_nesting_mode > 0)
7189  __kmp_set_nesting_mode_threads();
7190 
7191  // There's no harm in continuing if the following check fails,
7192  // but it indicates an error in the previous logic.
7193  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7194 
7195  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7196  // Run through the __kmp_threads array and set the num threads icv for each
7197  // root thread that is currently registered with the RTL (which has not
7198  // already explicitly set its nthreads-var with a call to
7199  // omp_set_num_threads()).
7200  for (i = 0; i < __kmp_threads_capacity; i++) {
7201  kmp_info_t *thread = __kmp_threads[i];
7202  if (thread == NULL)
7203  continue;
7204  if (thread->th.th_current_task->td_icvs.nproc != 0)
7205  continue;
7206 
7207  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7208  }
7209  }
7210  KA_TRACE(
7211  20,
7212  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7213  __kmp_dflt_team_nth));
7214 
7215 #ifdef KMP_ADJUST_BLOCKTIME
7216  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7217  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7218  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7219  if (__kmp_nth > __kmp_avail_proc) {
7220  __kmp_zero_bt = TRUE;
7221  }
7222  }
7223 #endif /* KMP_ADJUST_BLOCKTIME */
7224 
7225  /* we have finished middle initialization */
7226  TCW_SYNC_4(__kmp_init_middle, TRUE);
7227 
7228  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7229 }
7230 
7231 void __kmp_middle_initialize(void) {
7232  if (__kmp_init_middle) {
7233  return;
7234  }
7235  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7236  if (__kmp_init_middle) {
7237  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7238  return;
7239  }
7240  __kmp_do_middle_initialize();
7241  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7242 }
7243 
7244 void __kmp_parallel_initialize(void) {
7245  int gtid = __kmp_entry_gtid(); // this might be a new root
7246 
7247  /* synchronize parallel initialization (for sibling) */
7248  if (TCR_4(__kmp_init_parallel))
7249  return;
7250  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7251  if (TCR_4(__kmp_init_parallel)) {
7252  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7253  return;
7254  }
7255 
7256  /* TODO reinitialization after we have already shut down */
7257  if (TCR_4(__kmp_global.g.g_done)) {
7258  KA_TRACE(
7259  10,
7260  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7261  __kmp_infinite_loop();
7262  }
7263 
7264  /* jc: The lock __kmp_initz_lock is already held, so calling
7265  __kmp_serial_initialize would cause a deadlock. So we call
7266  __kmp_do_serial_initialize directly. */
7267  if (!__kmp_init_middle) {
7268  __kmp_do_middle_initialize();
7269  }
7270  __kmp_assign_root_init_mask();
7271  __kmp_resume_if_hard_paused();
7272 
7273  /* begin initialization */
7274  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7275  KMP_ASSERT(KMP_UBER_GTID(gtid));
7276 
7277 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7278  // Save the FP control regs.
7279  // Worker threads will set theirs to these values at thread startup.
7280  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7281  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7282  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7283 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7284 
7285 #if KMP_OS_UNIX
7286 #if KMP_HANDLE_SIGNALS
7287  /* must be after __kmp_serial_initialize */
7288  __kmp_install_signals(TRUE);
7289 #endif
7290 #endif
7291 
7292  __kmp_suspend_initialize();
7293 
7294 #if defined(USE_LOAD_BALANCE)
7295  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7296  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7297  }
7298 #else
7299  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7300  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7301  }
7302 #endif
7303 
7304  if (__kmp_version) {
7305  __kmp_print_version_2();
7306  }
7307 
7308  /* we have finished parallel initialization */
7309  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7310 
7311  KMP_MB();
7312  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7313 
7314  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7315 }
7316 
7317 void __kmp_hidden_helper_initialize() {
7318  if (TCR_4(__kmp_init_hidden_helper))
7319  return;
7320 
7321  // __kmp_parallel_initialize is required before we initialize hidden helper
7322  if (!TCR_4(__kmp_init_parallel))
7323  __kmp_parallel_initialize();
7324 
7325  // Double check. Note that this double check should not be placed before
7326  // __kmp_parallel_initialize as it will cause a deadlock.
7327  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7328  if (TCR_4(__kmp_init_hidden_helper)) {
7329  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7330  return;
7331  }
7332 
7333  // Set the count of hidden helper tasks to be executed to zero
7334  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7335 
7336  // Set the global variable indicating that we're initializing hidden helper
7337  // team/threads
7338  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7339 
7340  // Platform independent initialization
7341  __kmp_do_initialize_hidden_helper_threads();
7342 
7343  // Wait here for the initialization of the hidden helper teams to finish
7344  __kmp_hidden_helper_threads_initz_wait();
7345 
7346  // We have finished hidden helper initialization
7347  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7348 
7349  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7350 }
7351 
7352 /* ------------------------------------------------------------------------ */
7353 
7354 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7355  kmp_team_t *team) {
7356  kmp_disp_t *dispatch;
7357 
7358  KMP_MB();
7359 
7360  /* none of the threads have encountered any constructs, yet. */
7361  this_thr->th.th_local.this_construct = 0;
7362 #if KMP_CACHE_MANAGE
7363  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7364 #endif /* KMP_CACHE_MANAGE */
7365  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7366  KMP_DEBUG_ASSERT(dispatch);
7367  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7368  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7369  // this_thr->th.th_info.ds.ds_tid ] );
7370 
7371  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7372  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7373  if (__kmp_env_consistency_check)
7374  __kmp_push_parallel(gtid, team->t.t_ident);
7375 
7376  KMP_MB(); /* Flush all pending memory write invalidates. */
7377 }
7378 
7379 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7380  kmp_team_t *team) {
7381  if (__kmp_env_consistency_check)
7382  __kmp_pop_parallel(gtid, team->t.t_ident);
7383 
7384  __kmp_finish_implicit_task(this_thr);
7385 }
7386 
7387 int __kmp_invoke_task_func(int gtid) {
7388  int rc;
7389  int tid = __kmp_tid_from_gtid(gtid);
7390  kmp_info_t *this_thr = __kmp_threads[gtid];
7391  kmp_team_t *team = this_thr->th.th_team;
7392 
7393  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7394 #if USE_ITT_BUILD
7395  if (__itt_stack_caller_create_ptr) {
7396  // inform ittnotify about entering user's code
7397  if (team->t.t_stack_id != NULL) {
7398  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7399  } else {
7400  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7401  __kmp_itt_stack_callee_enter(
7402  (__itt_caller)team->t.t_parent->t.t_stack_id);
7403  }
7404  }
7405 #endif /* USE_ITT_BUILD */
7406 #if INCLUDE_SSC_MARKS
7407  SSC_MARK_INVOKING();
7408 #endif
7409 
7410 #if OMPT_SUPPORT
7411  void *dummy;
7412  void **exit_frame_p;
7413  ompt_data_t *my_task_data;
7414  ompt_data_t *my_parallel_data;
7415  int ompt_team_size;
7416 
7417  if (ompt_enabled.enabled) {
7418  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7419  .ompt_task_info.frame.exit_frame.ptr);
7420  } else {
7421  exit_frame_p = &dummy;
7422  }
7423 
7424  my_task_data =
7425  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7426  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7427  if (ompt_enabled.ompt_callback_implicit_task) {
7428  ompt_team_size = team->t.t_nproc;
7429  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7430  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7431  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7432  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7433  }
7434 #endif
7435 
7436 #if KMP_STATS_ENABLED
7437  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7438  if (previous_state == stats_state_e::TEAMS_REGION) {
7439  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7440  } else {
7441  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7442  }
7443  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7444 #endif
7445 
7446  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7447  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7448 #if OMPT_SUPPORT
7449  ,
7450  exit_frame_p
7451 #endif
7452  );
7453 #if OMPT_SUPPORT
7454  *exit_frame_p = NULL;
7455  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7456 #endif
7457 
7458 #if KMP_STATS_ENABLED
7459  if (previous_state == stats_state_e::TEAMS_REGION) {
7460  KMP_SET_THREAD_STATE(previous_state);
7461  }
7462  KMP_POP_PARTITIONED_TIMER();
7463 #endif
7464 
7465 #if USE_ITT_BUILD
7466  if (__itt_stack_caller_create_ptr) {
7467  // inform ittnotify about leaving user's code
7468  if (team->t.t_stack_id != NULL) {
7469  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7470  } else {
7471  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7472  __kmp_itt_stack_callee_leave(
7473  (__itt_caller)team->t.t_parent->t.t_stack_id);
7474  }
7475  }
7476 #endif /* USE_ITT_BUILD */
7477  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7478 
7479  return rc;
7480 }
7481 
7482 void __kmp_teams_master(int gtid) {
7483  // This routine is called by all primary threads in teams construct
7484  kmp_info_t *thr = __kmp_threads[gtid];
7485  kmp_team_t *team = thr->th.th_team;
7486  ident_t *loc = team->t.t_ident;
7487  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7488  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7489  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7490  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7491  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7492 
7493  // This thread is a new CG root. Set up the proper variables.
7494  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7495  tmp->cg_root = thr; // Make thr the CG root
7496  // Init to thread limit stored when league primary threads were forked
7497  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7498  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7499  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7500  " cg_nthreads to 1\n",
7501  thr, tmp));
7502  tmp->up = thr->th.th_cg_roots;
7503  thr->th.th_cg_roots = tmp;
7504 
7505 // Launch the league of teams now, but do not let the workers execute
7506 // (they hang on the fork barrier until the next parallel region)
7507 #if INCLUDE_SSC_MARKS
7508  SSC_MARK_FORKING();
7509 #endif
7510  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7511  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7512  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7513 #if INCLUDE_SSC_MARKS
7514  SSC_MARK_JOINING();
7515 #endif
7516  // If the team size was reduced from the limit, set it to the new size
7517  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7518  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7519  // AC: last parameter "1" eliminates join barrier which won't work because
7520  // worker threads are in a fork barrier waiting for more parallel regions
7521  __kmp_join_call(loc, gtid
7522 #if OMPT_SUPPORT
7523  ,
7524  fork_context_intel
7525 #endif
7526  ,
7527  1);
7528 }
7529 
7530 int __kmp_invoke_teams_master(int gtid) {
7531  kmp_info_t *this_thr = __kmp_threads[gtid];
7532  kmp_team_t *team = this_thr->th.th_team;
7533 #if KMP_DEBUG
7534  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7535  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7536  (void *)__kmp_teams_master);
7537 #endif
7538  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7539 #if OMPT_SUPPORT
7540  int tid = __kmp_tid_from_gtid(gtid);
7541  ompt_data_t *task_data =
7542  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7543  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7544  if (ompt_enabled.ompt_callback_implicit_task) {
7545  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7546  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7547  ompt_task_initial);
7548  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7549  }
7550 #endif
7551  __kmp_teams_master(gtid);
7552 #if OMPT_SUPPORT
7553  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7554 #endif
7555  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7556  return 1;
7557 }
7558 
7559 /* This sets the requested number of threads for the next parallel region
7560  encountered by this team. Since this should be enclosed in the forkjoin
7561  critical section, it should avoid race conditions with asymmetrical nested
7562  parallelism. */
7563 
7564 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7565  kmp_info_t *thr = __kmp_threads[gtid];
7566 
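  // A non-positive num_threads is treated as "no request": th_set_nproc is
  // left unchanged in that case.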
7567  if (num_threads > 0)
7568  thr->th.th_set_nproc = num_threads;
7569 }
7570 
7571 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7572  int num_threads) {
7573  KMP_DEBUG_ASSERT(thr);
7574  // Remember the number of threads for inner parallel regions
7575  if (!TCR_4(__kmp_init_middle))
7576  __kmp_middle_initialize(); // get internal globals calculated
7577  __kmp_assign_root_init_mask();
7578  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7579  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7580 
7581  if (num_threads == 0) {
7582  if (__kmp_teams_thread_limit > 0) {
7583  num_threads = __kmp_teams_thread_limit;
7584  } else {
7585  num_threads = __kmp_avail_proc / num_teams;
7586  }
7587  // adjust num_threads w/o warning as it is not a user setting
7588  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7589  // no thread_limit clause specified - do not change thread-limit-var ICV
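    // Worked example (hypothetical numbers): with __kmp_avail_proc == 64,
    // num_teams == 4 and no KMP_TEAMS_THREAD_LIMIT set, num_threads was just
    // computed as 64 / 4 == 16 above and is then clamped below by nthreads-var,
    // thread-limit-var and __kmp_teams_max_nth.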
7590  if (num_threads > __kmp_dflt_team_nth) {
7591  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7592  }
7593  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7594  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7595  } // prevent the team size from exceeding thread-limit-var
7596  if (num_teams * num_threads > __kmp_teams_max_nth) {
7597  num_threads = __kmp_teams_max_nth / num_teams;
7598  }
7599  if (num_threads == 0) {
7600  num_threads = 1;
7601  }
7602  } else {
7603  // This thread will be the primary thread of the league's primary threads
7604  // Store new thread limit; old limit is saved in th_cg_roots list
7605  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7606  // num_threads = min(num_threads, nthreads-var)
7607  if (num_threads > __kmp_dflt_team_nth) {
7608  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7609  }
7610  if (num_teams * num_threads > __kmp_teams_max_nth) {
7611  int new_threads = __kmp_teams_max_nth / num_teams;
7612  if (new_threads == 0) {
7613  new_threads = 1;
7614  }
7615  if (new_threads != num_threads) {
7616  if (!__kmp_reserve_warn) { // user asked for too many threads
7617  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7618  __kmp_msg(kmp_ms_warning,
7619  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7620  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7621  }
7622  }
7623  num_threads = new_threads;
7624  }
7625  }
7626  thr->th.th_teams_size.nth = num_threads;
7627 }
7628 
7629 /* this sets the requested number of teams for the teams region and/or
7630  the number of threads for the next parallel region encountered */
7631 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7632  int num_threads) {
7633  kmp_info_t *thr = __kmp_threads[gtid];
7634  KMP_DEBUG_ASSERT(num_teams >= 0);
7635  KMP_DEBUG_ASSERT(num_threads >= 0);
7636 
7637  if (num_teams == 0) {
7638  if (__kmp_nteams > 0) {
7639  num_teams = __kmp_nteams;
7640  } else {
7641  num_teams = 1; // default number of teams is 1.
7642  }
7643  }
7644  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7645  if (!__kmp_reserve_warn) {
7646  __kmp_reserve_warn = 1;
7647  __kmp_msg(kmp_ms_warning,
7648  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7649  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7650  }
7651  num_teams = __kmp_teams_max_nth;
7652  }
7653  // Set number of teams (number of threads in the outer "parallel" of the
7654  // teams)
7655  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7656 
7657  __kmp_push_thread_limit(thr, num_teams, num_threads);
7658 }
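// Illustrative sketch (assumed compiler-emitted sequence): for
//   #pragma omp teams num_teams(2) thread_limit(8)
// the __kmpc_push_num_teams entry point is typically called with num_teams=2
// and num_threads=8 before the teams region forks, ending up here:
//
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/2, /*num_threads=*/8);
//
// Passing zeros lets the defaults above take over (__kmp_nteams and
// __kmp_teams_thread_limit, normally populated from OMP_NUM_TEAMS and
// OMP_TEAMS_THREAD_LIMIT).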
7659 
7660 /* This sets the requested number of teams for the teams region and/or
7661  the number of threads for the next parallel region encountered */
7662 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7663  int num_teams_ub, int num_threads) {
7664  kmp_info_t *thr = __kmp_threads[gtid];
7665  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7666  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7667  KMP_DEBUG_ASSERT(num_threads >= 0);
7668 
7669  if (num_teams_lb > num_teams_ub) {
7670  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7671  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7672  }
7673 
7674  int num_teams = 1; // default number of teams is 1.
7675 
7676  if (num_teams_lb == 0 && num_teams_ub > 0)
7677  num_teams_lb = num_teams_ub;
7678 
7679  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7680  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7681  if (num_teams > __kmp_teams_max_nth) {
7682  if (!__kmp_reserve_warn) {
7683  __kmp_reserve_warn = 1;
7684  __kmp_msg(kmp_ms_warning,
7685  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7686  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7687  }
7688  num_teams = __kmp_teams_max_nth;
7689  }
7690  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7691  num_teams = num_teams_ub;
7692  } else { // num_teams_lb <= num_teams <= num_teams_ub
7693  if (num_threads == 0) {
7694  if (num_teams_ub > __kmp_teams_max_nth) {
7695  num_teams = num_teams_lb;
7696  } else {
7697  num_teams = num_teams_ub;
7698  }
7699  } else {
7700  num_teams = (num_threads > __kmp_teams_max_nth)
7701  ? num_teams
7702  : __kmp_teams_max_nth / num_threads;
7703  if (num_teams < num_teams_lb) {
7704  num_teams = num_teams_lb;
7705  } else if (num_teams > num_teams_ub) {
7706  num_teams = num_teams_ub;
7707  }
7708  }
7709  }
7710  // Set number of teams (number of threads in the outer "parallel" of the
7711  // teams)
7712  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7713 
7714  __kmp_push_thread_limit(thr, num_teams, num_threads);
7715 }
7716 
7717 // Set the proc_bind var to use in the following parallel region.
7718 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7719  kmp_info_t *thr = __kmp_threads[gtid];
7720  thr->th.th_set_proc_bind = proc_bind;
7721 }
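// Illustrative note: a clause such as
//   #pragma omp parallel proc_bind(close)
// is typically lowered to a call of the __kmpc_push_proc_bind entry point with
// the matching kmp_proc_bind_t value, e.g.
//
//   __kmpc_push_proc_bind(&loc, gtid, proc_bind_close); // sketch
//
// Like th_set_nproc, th_set_proc_bind only influences the next parallel region
// forked by this thread.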
7722 
7723 /* Launch the worker threads into the microtask. */
7724 
7725 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7726  kmp_info_t *this_thr = __kmp_threads[gtid];
7727 
7728 #ifdef KMP_DEBUG
7729  int f;
7730 #endif /* KMP_DEBUG */
7731 
7732  KMP_DEBUG_ASSERT(team);
7733  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7734  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7735  KMP_MB(); /* Flush all pending memory write invalidates. */
7736 
7737  team->t.t_construct = 0; /* no single directives seen yet */
7738  team->t.t_ordered.dt.t_value =
7739  0; /* thread 0 enters the ordered section first */
7740 
7741  /* Reset the identifiers on the dispatch buffer */
7742  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7743  if (team->t.t_max_nproc > 1) {
7744  int i;
7745  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7746  team->t.t_disp_buffer[i].buffer_index = i;
7747  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7748  }
7749  } else {
7750  team->t.t_disp_buffer[0].buffer_index = 0;
7751  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7752  }
7753 
7754  KMP_MB(); /* Flush all pending memory write invalidates. */
7755  KMP_ASSERT(this_thr->th.th_team == team);
7756 
7757 #ifdef KMP_DEBUG
7758  for (f = 0; f < team->t.t_nproc; f++) {
7759  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7760  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7761  }
7762 #endif /* KMP_DEBUG */
7763 
7764  /* release the worker threads so they may begin working */
7765  __kmp_fork_barrier(gtid, 0);
7766 }
7767 
7768 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7769  kmp_info_t *this_thr = __kmp_threads[gtid];
7770 
7771  KMP_DEBUG_ASSERT(team);
7772  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7773  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7774  KMP_MB(); /* Flush all pending memory write invalidates. */
7775 
7776  /* Join barrier after fork */
7777 
7778 #ifdef KMP_DEBUG
7779  if (__kmp_threads[gtid] &&
7780  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7781  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7782  __kmp_threads[gtid]);
7783  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7784  "team->t.t_nproc=%d\n",
7785  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7786  team->t.t_nproc);
7787  __kmp_print_structure();
7788  }
7789  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7790  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7791 #endif /* KMP_DEBUG */
7792 
7793  __kmp_join_barrier(gtid); /* wait for everyone */
7794 #if OMPT_SUPPORT
7795  if (ompt_enabled.enabled &&
7796  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7797  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7798  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7799  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7800 #if OMPT_OPTIONAL
7801  void *codeptr = NULL;
7802  if (KMP_MASTER_TID(ds_tid) &&
7803  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7804  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7805  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7806 
7807  if (ompt_enabled.ompt_callback_sync_region_wait) {
7808  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7809  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7810  codeptr);
7811  }
7812  if (ompt_enabled.ompt_callback_sync_region) {
7813  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7814  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7815  codeptr);
7816  }
7817 #endif
7818  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7819  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7820  ompt_scope_end, NULL, task_data, 0, ds_tid,
7821  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7822  }
7823  }
7824 #endif
7825 
7826  KMP_MB(); /* Flush all pending memory write invalidates. */
7827  KMP_ASSERT(this_thr->th.th_team == team);
7828 }
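// Rough call sequence on the primary thread (a sketch of how the fork/join
// helpers above are commonly exercised; exact paths vary):
//
//   __kmp_fork_call(...)         // allocate/resize the team, propagate ICVs
//     -> __kmp_internal_fork()   // reset dispatch buffers, release fork barrier
//   ... invoke the microtask ... // primary thread runs the outlined body too
//   __kmp_join_call(...)
//     -> __kmp_internal_join()   // join barrier + OMPT scope_end callbacks
//
// Workers leave the fork barrier, run the same microtask, and then wait in the
// join barrier until the primary thread arrives.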
7829 
7830 /* ------------------------------------------------------------------------ */
7831 
7832 #ifdef USE_LOAD_BALANCE
7833 
7834 // Return the number of worker threads actively spinning in the hot team, if
7835 // we are at the outermost level of parallelism. Otherwise, return 0.
7836 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7837  int i;
7838  int retval;
7839  kmp_team_t *hot_team;
7840 
7841  if (root->r.r_active) {
7842  return 0;
7843  }
7844  hot_team = root->r.r_hot_team;
7845  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7846  return hot_team->t.t_nproc - 1; // Don't count primary thread
7847  }
7848 
7849  // Skip the primary thread - it is accounted for elsewhere.
7850  retval = 0;
7851  for (i = 1; i < hot_team->t.t_nproc; i++) {
7852  if (hot_team->t.t_threads[i]->th.th_active) {
7853  retval++;
7854  }
7855  }
7856  return retval;
7857 }
7858 
7859 // Perform an automatic adjustment to the number of
7860 // threads used by the next parallel region.
7861 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7862  int retval;
7863  int pool_active;
7864  int hot_team_active;
7865  int team_curr_active;
7866  int system_active;
7867 
7868  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7869  set_nproc));
7870  KMP_DEBUG_ASSERT(root);
7871  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7872  ->th.th_current_task->td_icvs.dynamic == TRUE);
7873  KMP_DEBUG_ASSERT(set_nproc > 1);
7874 
7875  if (set_nproc == 1) {
7876  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7877  return 1;
7878  }
7879 
7880  // Threads that are active in the thread pool, active in the hot team for this
7881  // particular root (if we are at the outer par level), and the currently
7882  // executing thread (to become the primary thread) are available to add to the
7883  // new team, but are currently contributing to the system load, and must be
7884  // accounted for.
7885  pool_active = __kmp_thread_pool_active_nth;
7886  hot_team_active = __kmp_active_hot_team_nproc(root);
7887  team_curr_active = pool_active + hot_team_active + 1;
7888 
7889  // Check the system load.
7890  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7891  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7892  "hot team active = %d\n",
7893  system_active, pool_active, hot_team_active));
7894 
7895  if (system_active < 0) {
7896  // There was an error reading the necessary info from /proc, so use the
7897  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7898  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7899  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7900  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7901 
7902  // Make this call behave like the thread limit algorithm.
7903  retval = __kmp_avail_proc - __kmp_nth +
7904  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7905  if (retval > set_nproc) {
7906  retval = set_nproc;
7907  }
7908  if (retval < KMP_MIN_NTH) {
7909  retval = KMP_MIN_NTH;
7910  }
7911 
7912  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7913  retval));
7914  return retval;
7915  }
7916 
7917  // There is a slight delay in the load balance algorithm in detecting new
7918  // running procs. The real system load at this instant should be at least as
7919  // large as the #active OMP threads that are available to add to the team.
7920  if (system_active < team_curr_active) {
7921  system_active = team_curr_active;
7922  }
7923  retval = __kmp_avail_proc - system_active + team_curr_active;
7924  if (retval > set_nproc) {
7925  retval = set_nproc;
7926  }
7927  if (retval < KMP_MIN_NTH) {
7928  retval = KMP_MIN_NTH;
7929  }
7930 
7931  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7932  return retval;
7933 } // __kmp_load_balance_nproc()
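// Worked example (hypothetical numbers): with __kmp_avail_proc = 8, one active
// thread in the pool, three active workers in an idle hot team, and
// set_nproc = 8:
//   team_curr_active = 1 + 3 + 1 = 5
// If __kmp_get_load_balance() reports system_active = 9, then
//   retval = 8 - 9 + 5 = 4
// so the next parallel region gets 4 threads instead of the requested 8; the
// result is always clamped into [KMP_MIN_NTH, set_nproc].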
7934 
7935 #endif /* USE_LOAD_BALANCE */
7936 
7937 /* ------------------------------------------------------------------------ */
7938 
7939 /* NOTE: this is called with the __kmp_init_lock held */
7940 void __kmp_cleanup(void) {
7941  int f;
7942 
7943  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7944 
7945  if (TCR_4(__kmp_init_parallel)) {
7946 #if KMP_HANDLE_SIGNALS
7947  __kmp_remove_signals();
7948 #endif
7949  TCW_4(__kmp_init_parallel, FALSE);
7950  }
7951 
7952  if (TCR_4(__kmp_init_middle)) {
7953 #if KMP_AFFINITY_SUPPORTED
7954  __kmp_affinity_uninitialize();
7955 #endif /* KMP_AFFINITY_SUPPORTED */
7956  __kmp_cleanup_hierarchy();
7957  TCW_4(__kmp_init_middle, FALSE);
7958  }
7959 
7960  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7961 
7962  if (__kmp_init_serial) {
7963  __kmp_runtime_destroy();
7964  __kmp_init_serial = FALSE;
7965  }
7966 
7967  __kmp_cleanup_threadprivate_caches();
7968 
7969  for (f = 0; f < __kmp_threads_capacity; f++) {
7970  if (__kmp_root[f] != NULL) {
7971  __kmp_free(__kmp_root[f]);
7972  __kmp_root[f] = NULL;
7973  }
7974  }
7975  __kmp_free(__kmp_threads);
7976  // __kmp_threads and __kmp_root were allocated at once, as single block, so
7977  // there is no need in freeing __kmp_root.
7978  __kmp_threads = NULL;
7979  __kmp_root = NULL;
7980  __kmp_threads_capacity = 0;
7981 
7982 #if KMP_USE_DYNAMIC_LOCK
7983  __kmp_cleanup_indirect_user_locks();
7984 #else
7985  __kmp_cleanup_user_locks();
7986 #endif
7987 #if OMPD_SUPPORT
7988  if (ompd_state) {
7989  __kmp_free(ompd_env_block);
7990  ompd_env_block = NULL;
7991  ompd_env_block_size = 0;
7992  }
7993 #endif
7994 
7995 #if KMP_AFFINITY_SUPPORTED
7996  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7997  __kmp_cpuinfo_file = NULL;
7998 #endif /* KMP_AFFINITY_SUPPORTED */
7999 
8000 #if KMP_USE_ADAPTIVE_LOCKS
8001 #if KMP_DEBUG_ADAPTIVE_LOCKS
8002  __kmp_print_speculative_stats();
8003 #endif
8004 #endif
8005  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8006  __kmp_nested_nth.nth = NULL;
8007  __kmp_nested_nth.size = 0;
8008  __kmp_nested_nth.used = 0;
8009  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8010  __kmp_nested_proc_bind.bind_types = NULL;
8011  __kmp_nested_proc_bind.size = 0;
8012  __kmp_nested_proc_bind.used = 0;
8013  if (__kmp_affinity_format) {
8014  KMP_INTERNAL_FREE(__kmp_affinity_format);
8015  __kmp_affinity_format = NULL;
8016  }
8017 
8018  __kmp_i18n_catclose();
8019 
8020 #if KMP_USE_HIER_SCHED
8021  __kmp_hier_scheds.deallocate();
8022 #endif
8023 
8024 #if KMP_STATS_ENABLED
8025  __kmp_stats_fini();
8026 #endif
8027 
8028  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8029 }
8030 
8031 /* ------------------------------------------------------------------------ */
8032 
8033 int __kmp_ignore_mppbeg(void) {
8034  char *env;
8035 
8036  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8037  if (__kmp_str_match_false(env))
8038  return FALSE;
8039  }
8040  // By default __kmpc_begin() is no-op.
8041  return TRUE;
8042 }
8043 
8044 int __kmp_ignore_mppend(void) {
8045  char *env;
8046 
8047  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8048  if (__kmp_str_match_false(env))
8049  return FALSE;
8050  }
8051  // By default __kmpc_end() is no-op.
8052  return TRUE;
8053 }
8054 
8055 void __kmp_internal_begin(void) {
8056  int gtid;
8057  kmp_root_t *root;
8058 
8059  /* this is a very important step as it will register new sibling threads
8060  and assign these new uber threads a new gtid */
8061  gtid = __kmp_entry_gtid();
8062  root = __kmp_threads[gtid]->th.th_root;
8063  KMP_ASSERT(KMP_UBER_GTID(gtid));
8064 
8065  if (root->r.r_begin)
8066  return;
8067  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8068  if (root->r.r_begin) {
8069  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8070  return;
8071  }
8072 
8073  root->r.r_begin = TRUE;
8074 
8075  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8076 }
8077 
8078 /* ------------------------------------------------------------------------ */
8079 
8080 void __kmp_user_set_library(enum library_type arg) {
8081  int gtid;
8082  kmp_root_t *root;
8083  kmp_info_t *thread;
8084 
8085  /* first, make sure we are initialized so we can get our gtid */
8086 
8087  gtid = __kmp_entry_gtid();
8088  thread = __kmp_threads[gtid];
8089 
8090  root = thread->th.th_root;
8091 
8092  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8093  library_serial));
8094  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8095  thread */
8096  KMP_WARNING(SetLibraryIncorrectCall);
8097  return;
8098  }
8099 
8100  switch (arg) {
8101  case library_serial:
8102  thread->th.th_set_nproc = 0;
8103  set__nproc(thread, 1);
8104  break;
8105  case library_turnaround:
8106  thread->th.th_set_nproc = 0;
8107  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8108  : __kmp_dflt_team_nth_ub);
8109  break;
8110  case library_throughput:
8111  thread->th.th_set_nproc = 0;
8112  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8113  : __kmp_dflt_team_nth_ub);
8114  break;
8115  default:
8116  KMP_FATAL(UnknownLibraryType, arg);
8117  }
8118 
8119  __kmp_aux_set_library(arg);
8120 }
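// Usage note (assumed libomp conventions, not defined in this file): the
// execution mode is normally chosen with the KMP_LIBRARY environment variable
// (serial | turnaround | throughput) or the corresponding kmp_set_library_*()
// extensions before the first parallel region, e.g.
//
//   KMP_LIBRARY=turnaround ./app   // yield only when oversubscribed
//
// which maps onto the same library_* modes handled by __kmp_aux_set_library()
// below.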
8121 
8122 void __kmp_aux_set_stacksize(size_t arg) {
8123  if (!__kmp_init_serial)
8124  __kmp_serial_initialize();
8125 
8126 #if KMP_OS_DARWIN
8127  if (arg & (0x1000 - 1)) {
8128  arg &= ~(0x1000 - 1);
8129  if (arg + 0x1000) /* check for overflow if we round up */
8130  arg += 0x1000;
8131  }
8132 #endif
8133  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8134 
8135  /* only change the default stacksize before the first parallel region */
8136  if (!TCR_4(__kmp_init_parallel)) {
8137  size_t value = arg; /* argument is in bytes */
8138 
8139  if (value < __kmp_sys_min_stksize)
8140  value = __kmp_sys_min_stksize;
8141  else if (value > KMP_MAX_STKSIZE)
8142  value = KMP_MAX_STKSIZE;
8143 
8144  __kmp_stksize = value;
8145 
8146  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8147  }
8148 
8149  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8150 }
8151 
8152 /* set the behaviour of the runtime library */
8153 /* TODO this can cause some odd behaviour with sibling parallelism... */
8154 void __kmp_aux_set_library(enum library_type arg) {
8155  __kmp_library = arg;
8156 
8157  switch (__kmp_library) {
8158  case library_serial: {
8159  KMP_INFORM(LibraryIsSerial);
8160  } break;
8161  case library_turnaround:
8162  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8163  __kmp_use_yield = 2; // only yield when oversubscribed
8164  break;
8165  case library_throughput:
8166  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8167  __kmp_dflt_blocktime = 200;
8168  break;
8169  default:
8170  KMP_FATAL(UnknownLibraryType, arg);
8171  }
8172 }
8173 
8174 /* Getting team information common for all team API */
8175 // Returns NULL if not in teams construct
8176 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8177  kmp_info_t *thr = __kmp_entry_thread();
8178  teams_serialized = 0;
8179  if (thr->th.th_teams_microtask) {
8180  kmp_team_t *team = thr->th.th_team;
8181  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8182  int ii = team->t.t_level;
8183  teams_serialized = team->t.t_serialized;
8184  int level = tlevel + 1;
8185  KMP_DEBUG_ASSERT(ii >= tlevel);
8186  while (ii > level) {
8187  for (teams_serialized = team->t.t_serialized;
8188  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8189  }
8190  if (team->t.t_serialized && (!teams_serialized)) {
8191  team = team->t.t_parent;
8192  continue;
8193  }
8194  if (ii > level) {
8195  team = team->t.t_parent;
8196  ii--;
8197  }
8198  }
8199  return team;
8200  }
8201  return NULL;
8202 }
8203 
8204 int __kmp_aux_get_team_num() {
8205  int serialized;
8206  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8207  if (team) {
8208  if (serialized > 1) {
8209  return 0; // teams region is serialized ( 1 team of 1 thread ).
8210  } else {
8211  return team->t.t_master_tid;
8212  }
8213  }
8214  return 0;
8215 }
8216 
8217 int __kmp_aux_get_num_teams() {
8218  int serialized;
8219  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8220  if (team) {
8221  if (serialized > 1) {
8222  return 1;
8223  } else {
8224  return team->t.t_parent->t.t_nproc;
8225  }
8226  }
8227  return 1;
8228 }
8229 
8230 /* ------------------------------------------------------------------------ */
8231 
8232 /*
8233  * Affinity Format Parser
8234  *
8235  * Field is in form of: %[[[0].]size]type
8236  * % and type are required (%% means print a literal '%')
8237  * type is either single char or long name surrounded by {},
8238  * e.g., N or {num_threads}
8239  * 0 => leading zeros
8240  * . => right justified when size is specified
8241  * by default output is left justified
8242  * size is the *minimum* field length
8243  * All other characters are printed as is
8244  *
8245  * Available field types (matching __kmp_affinity_format_table below):
8246  * t {team_num} - omp_get_team_num()
8247  * T {num_teams} - omp_get_num_teams()
8248  * L {nesting_level} - omp_get_level()
8249  * n {thread_num} - omp_get_thread_num()
8250  * N {num_threads} - omp_get_num_threads()
8251  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8252  * H {host} - name of host machine; P {process_id} - process id (integer)
8253  * i {native_thread_id} - native thread identifier (integer)
8254  * A {thread_affinity} - comma separated list of integers or integer ranges (values of affinity mask)
8255  *
8256  * Implementation-specific field types can be added
8257  * If a type is unknown, print "undefined"
8258  */
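// Example (illustrative): the format string
//   "OMP: host=%H pid=%P thread=%0.4n bound to {%A}"
// could render, for thread 7 of a parallel region, as something like
//   "OMP: host=node01 pid=12345 thread=0007 bound to {0,4}"
// Here "%0.4n" requests the thread number right-justified in a minimum field
// of 4 with leading zeros, i.e. it is emitted through a "%04d" snprintf format
// by the parser below; host, pid, and mask values are system dependent.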
8259 
8260 // Structure holding the short name, long name, and corresponding data type
8261 // for snprintf. A table of these represents the full set of valid keyword
8262 // field types.
8263 typedef struct kmp_affinity_format_field_t {
8264  char short_name; // from spec e.g., L -> thread level
8265  const char *long_name; // from spec thread_level -> thread level
8266  char field_format; // data type for snprintf (typically 'd' or 's'
8267  // for integer or string)
8268 } kmp_affinity_format_field_t;
8269 
8270 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8271 #if KMP_AFFINITY_SUPPORTED
8272  {'A', "thread_affinity", 's'},
8273 #endif
8274  {'t', "team_num", 'd'},
8275  {'T', "num_teams", 'd'},
8276  {'L', "nesting_level", 'd'},
8277  {'n', "thread_num", 'd'},
8278  {'N', "num_threads", 'd'},
8279  {'a', "ancestor_tnum", 'd'},
8280  {'H', "host", 's'},
8281  {'P', "process_id", 'd'},
8282  {'i', "native_thread_id", 'd'}};
8283 
8284 // Return the number of characters it takes to hold field
8285 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8286  const char **ptr,
8287  kmp_str_buf_t *field_buffer) {
8288  int rc, format_index, field_value;
8289  const char *width_left, *width_right;
8290  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8291  static const int FORMAT_SIZE = 20;
8292  char format[FORMAT_SIZE] = {0};
8293  char absolute_short_name = 0;
8294 
8295  KMP_DEBUG_ASSERT(gtid >= 0);
8296  KMP_DEBUG_ASSERT(th);
8297  KMP_DEBUG_ASSERT(**ptr == '%');
8298  KMP_DEBUG_ASSERT(field_buffer);
8299 
8300  __kmp_str_buf_clear(field_buffer);
8301 
8302  // Skip the initial %
8303  (*ptr)++;
8304 
8305  // Check for %% first
8306  if (**ptr == '%') {
8307  __kmp_str_buf_cat(field_buffer, "%", 1);
8308  (*ptr)++; // skip over the second %
8309  return 1;
8310  }
8311 
8312  // Parse field modifiers if they are present
8313  pad_zeros = false;
8314  if (**ptr == '0') {
8315  pad_zeros = true;
8316  (*ptr)++; // skip over 0
8317  }
8318  right_justify = false;
8319  if (**ptr == '.') {
8320  right_justify = true;
8321  (*ptr)++; // skip over .
8322  }
8323  // Parse width of field: [width_left, width_right)
8324  width_left = width_right = NULL;
8325  if (**ptr >= '0' && **ptr <= '9') {
8326  width_left = *ptr;
8327  SKIP_DIGITS(*ptr);
8328  width_right = *ptr;
8329  }
8330 
8331  // Create the format for KMP_SNPRINTF based on flags parsed above
8332  format_index = 0;
8333  format[format_index++] = '%';
8334  if (!right_justify)
8335  format[format_index++] = '-';
8336  if (pad_zeros)
8337  format[format_index++] = '0';
8338  if (width_left && width_right) {
8339  int i = 0;
8340  // Only allow 8 digit number widths.
8341  // This also prevents overflowing format variable
8342  while (i < 8 && width_left < width_right) {
8343  format[format_index++] = *width_left;
8344  width_left++;
8345  i++;
8346  }
8347  }
8348 
8349  // Parse a name (long or short)
8350  // Canonicalize the name into absolute_short_name
8351  found_valid_name = false;
8352  parse_long_name = (**ptr == '{');
8353  if (parse_long_name)
8354  (*ptr)++; // skip initial left brace
8355  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8356  sizeof(__kmp_affinity_format_table[0]);
8357  ++i) {
8358  char short_name = __kmp_affinity_format_table[i].short_name;
8359  const char *long_name = __kmp_affinity_format_table[i].long_name;
8360  char field_format = __kmp_affinity_format_table[i].field_format;
8361  if (parse_long_name) {
8362  size_t length = KMP_STRLEN(long_name);
8363  if (strncmp(*ptr, long_name, length) == 0) {
8364  found_valid_name = true;
8365  (*ptr) += length; // skip the long name
8366  }
8367  } else if (**ptr == short_name) {
8368  found_valid_name = true;
8369  (*ptr)++; // skip the short name
8370  }
8371  if (found_valid_name) {
8372  format[format_index++] = field_format;
8373  format[format_index++] = '\0';
8374  absolute_short_name = short_name;
8375  break;
8376  }
8377  }
8378  if (parse_long_name) {
8379  if (**ptr != '}') {
8380  absolute_short_name = 0;
8381  } else {
8382  (*ptr)++; // skip over the right brace
8383  }
8384  }
8385 
8386  // Attempt to fill the buffer with the requested
8387  // value using snprintf within __kmp_str_buf_print()
8388  switch (absolute_short_name) {
8389  case 't':
8390  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8391  break;
8392  case 'T':
8393  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8394  break;
8395  case 'L':
8396  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8397  break;
8398  case 'n':
8399  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8400  break;
8401  case 'H': {
8402  static const int BUFFER_SIZE = 256;
8403  char buf[BUFFER_SIZE];
8404  __kmp_expand_host_name(buf, BUFFER_SIZE);
8405  rc = __kmp_str_buf_print(field_buffer, format, buf);
8406  } break;
8407  case 'P':
8408  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8409  break;
8410  case 'i':
8411  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8412  break;
8413  case 'N':
8414  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8415  break;
8416  case 'a':
8417  field_value =
8418  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8419  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8420  break;
8421 #if KMP_AFFINITY_SUPPORTED
8422  case 'A': {
8423  kmp_str_buf_t buf;
8424  __kmp_str_buf_init(&buf);
8425  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8426  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8427  __kmp_str_buf_free(&buf);
8428  } break;
8429 #endif
8430  default:
8431  // According to the spec, if an implementation does not have info for a field
8432  // type, then "undefined" is printed
8433  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8434  // Skip the field
8435  if (parse_long_name) {
8436  SKIP_TOKEN(*ptr);
8437  if (**ptr == '}')
8438  (*ptr)++;
8439  } else {
8440  (*ptr)++;
8441  }
8442  }
8443 
8444  KMP_ASSERT(format_index <= FORMAT_SIZE);
8445  return rc;
8446 }
8447 
8448 /*
8449  * Return number of characters needed to hold the affinity string
8450  * (not including null byte character)
8451  * The resultant string is printed to buffer, which the caller can then
8452  * handle afterwards
8453  */
8454 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8455  kmp_str_buf_t *buffer) {
8456  const char *parse_ptr;
8457  size_t retval;
8458  const kmp_info_t *th;
8459  kmp_str_buf_t field;
8460 
8461  KMP_DEBUG_ASSERT(buffer);
8462  KMP_DEBUG_ASSERT(gtid >= 0);
8463 
8464  __kmp_str_buf_init(&field);
8465  __kmp_str_buf_clear(buffer);
8466 
8467  th = __kmp_threads[gtid];
8468  retval = 0;
8469 
8470  // If format is NULL or zero-length string, then we use
8471  // affinity-format-var ICV
8472  parse_ptr = format;
8473  if (parse_ptr == NULL || *parse_ptr == '\0') {
8474  parse_ptr = __kmp_affinity_format;
8475  }
8476  KMP_DEBUG_ASSERT(parse_ptr);
8477 
8478  while (*parse_ptr != '\0') {
8479  // Parse a field
8480  if (*parse_ptr == '%') {
8481  // Put field in the buffer
8482  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8483  __kmp_str_buf_catbuf(buffer, &field);
8484  retval += rc;
8485  } else {
8486  // Put literal character in buffer
8487  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8488  retval++;
8489  parse_ptr++;
8490  }
8491  }
8492  __kmp_str_buf_free(&field);
8493  return retval;
8494 }
8495 
8496 // Displays the affinity string to stdout
8497 void __kmp_aux_display_affinity(int gtid, const char *format) {
8498  kmp_str_buf_t buf;
8499  __kmp_str_buf_init(&buf);
8500  __kmp_aux_capture_affinity(gtid, format, &buf);
8501  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8502  __kmp_str_buf_free(&buf);
8503 }
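// User-level sketch (standard OpenMP 5.0 display-affinity API, which lands in
// the capture/display helpers above):
//
//   #include <omp.h>
//   int main(void) {
//   #pragma omp parallel
//     omp_display_affinity("tid=%0.3n bound to {%A} on %H");
//     return 0;
//   }
//
// Setting OMP_DISPLAY_AFFINITY=TRUE (optionally with OMP_AFFINITY_FORMAT)
// makes the runtime print similar per-thread lines automatically.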
8504 
8505 /* ------------------------------------------------------------------------ */
8506 
8507 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8508  int blocktime = arg; /* argument is in milliseconds */
8509 #if KMP_USE_MONITOR
8510  int bt_intervals;
8511 #endif
8512  kmp_int8 bt_set;
8513 
8514  __kmp_save_internal_controls(thread);
8515 
8516  /* Normalize and set blocktime for the teams */
8517  if (blocktime < KMP_MIN_BLOCKTIME)
8518  blocktime = KMP_MIN_BLOCKTIME;
8519  else if (blocktime > KMP_MAX_BLOCKTIME)
8520  blocktime = KMP_MAX_BLOCKTIME;
8521 
8522  set__blocktime_team(thread->th.th_team, tid, blocktime);
8523  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8524 
8525 #if KMP_USE_MONITOR
8526  /* Calculate and set blocktime intervals for the teams */
8527  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8528 
8529  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8530  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8531 #endif
8532 
8533  /* Set whether blocktime has been set to "TRUE" */
8534  bt_set = TRUE;
8535 
8536  set__bt_set_team(thread->th.th_team, tid, bt_set);
8537  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8538 #if KMP_USE_MONITOR
8539  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8540  "bt_intervals=%d, monitor_updates=%d\n",
8541  __kmp_gtid_from_tid(tid, thread->th.th_team),
8542  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8543  __kmp_monitor_wakeups));
8544 #else
8545  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8546  __kmp_gtid_from_tid(tid, thread->th.th_team),
8547  thread->th.th_team->t.t_id, tid, blocktime));
8548 #endif
8549 }
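// Usage sketch (assumed libomp conventions): the block time can be set process
// wide with the KMP_BLOCKTIME environment variable or at run time with the
// kmp_set_blocktime() extension, both of which funnel into the normalization
// above:
//
//   kmp_set_blocktime(0);    // yield/sleep at barriers almost immediately
//   kmp_set_blocktime(200);  // spin up to ~200 ms before sleeping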
8550 
8551 void __kmp_aux_set_defaults(char const *str, size_t len) {
8552  if (!__kmp_init_serial) {
8553  __kmp_serial_initialize();
8554  }
8555  __kmp_env_initialize(str);
8556 
8557  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8558  __kmp_env_print();
8559  }
8560 } // __kmp_aux_set_defaults
8561 
8562 /* ------------------------------------------------------------------------ */
8563 /* internal fast reduction routines */
8564 
8565 PACKED_REDUCTION_METHOD_T
8566 __kmp_determine_reduction_method(
8567  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8568  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8569  kmp_critical_name *lck) {
8570 
8571  // Default reduction method: critical construct ( lck != NULL, like in current
8572  // PAROPT )
8573  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8574  // can be selected by RTL
8575  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8576  // can be selected by RTL
8577  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8578  // among generated by PAROPT.
8579 
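  // Worked example of the selection below (hypothetical inputs): on x86_64
  // Linux with a non-MIC CPU (teamsize_cutoff = 4), a team of 16 threads, and
  // both the atomic and tree code paths generated, the tree method with a
  // reduction barrier is chosen; the same request with a team of 4 threads
  // falls back to atomic_reduce_block, and a serialized team (team_size == 1)
  // always gets empty_reduce_block.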
8580  PACKED_REDUCTION_METHOD_T retval;
8581 
8582  int team_size;
8583 
8584  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8585  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8586 
8587 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8588  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8589 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8590 
8591  retval = critical_reduce_block;
8592 
8593  // another choice of getting a team size (with 1 dynamic dereference) is slower
8594  team_size = __kmp_get_team_num_threads(global_tid);
8595  if (team_size == 1) {
8596 
8597  retval = empty_reduce_block;
8598 
8599  } else {
8600 
8601  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8602 
8603 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8604  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8605 
8606 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8607  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8608 
8609  int teamsize_cutoff = 4;
8610 
8611 #if KMP_MIC_SUPPORTED
8612  if (__kmp_mic_type != non_mic) {
8613  teamsize_cutoff = 8;
8614  }
8615 #endif
8616  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8617  if (tree_available) {
8618  if (team_size <= teamsize_cutoff) {
8619  if (atomic_available) {
8620  retval = atomic_reduce_block;
8621  }
8622  } else {
8623  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8624  }
8625  } else if (atomic_available) {
8626  retval = atomic_reduce_block;
8627  }
8628 #else
8629 #error "Unknown or unsupported OS"
8630 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8631  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8632 
8633 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8634 
8635 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8636 
8637  // basic tuning
8638 
8639  if (atomic_available) {
8640  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8641  retval = atomic_reduce_block;
8642  }
8643  } // otherwise: use critical section
8644 
8645 #elif KMP_OS_DARWIN
8646 
8647  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8648  if (atomic_available && (num_vars <= 3)) {
8649  retval = atomic_reduce_block;
8650  } else if (tree_available) {
8651  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8652  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8653  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8654  }
8655  } // otherwise: use critical section
8656 
8657 #else
8658 #error "Unknown or unsupported OS"
8659 #endif
8660 
8661 #else
8662 #error "Unknown or unsupported architecture"
8663 #endif
8664  }
8665 
8666  // KMP_FORCE_REDUCTION
8667 
8668  // If the team is serialized (team_size == 1), ignore the forced reduction
8669  // method and stay with the unsynchronized method (empty_reduce_block)
8670  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8671  team_size != 1) {
8672 
8673  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8674 
8675  int atomic_available, tree_available;
8676 
8677  switch ((forced_retval = __kmp_force_reduction_method)) {
8678  case critical_reduce_block:
8679  KMP_ASSERT(lck); // lck should be != 0
8680  break;
8681 
8682  case atomic_reduce_block:
8683  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8684  if (!atomic_available) {
8685  KMP_WARNING(RedMethodNotSupported, "atomic");
8686  forced_retval = critical_reduce_block;
8687  }
8688  break;
8689 
8690  case tree_reduce_block:
8691  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8692  if (!tree_available) {
8693  KMP_WARNING(RedMethodNotSupported, "tree");
8694  forced_retval = critical_reduce_block;
8695  } else {
8696 #if KMP_FAST_REDUCTION_BARRIER
8697  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8698 #endif
8699  }
8700  break;
8701 
8702  default:
8703  KMP_ASSERT(0); // "unsupported method specified"
8704  }
8705 
8706  retval = forced_retval;
8707  }
8708 
8709  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8710 
8711 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8712 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8713 
8714  return (retval);
8715 }
8716 // this function is for testing set/get/determine reduce method
8717 kmp_int32 __kmp_get_reduce_method(void) {
8718  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8719 }
8720 
8721 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8722 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8723 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8724 
8725 // Hard pause shuts down the runtime completely. Resume happens naturally when
8726 // OpenMP is used subsequently.
8727 void __kmp_hard_pause() {
8728  __kmp_pause_status = kmp_hard_paused;
8729  __kmp_internal_end_thread(-1);
8730 }
8731 
8732 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8733 void __kmp_resume_if_soft_paused() {
8734  if (__kmp_pause_status == kmp_soft_paused) {
8735  __kmp_pause_status = kmp_not_paused;
8736 
8737  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8738  kmp_info_t *thread = __kmp_threads[gtid];
8739  if (thread) { // Wake it if sleeping
8740  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8741  thread);
8742  if (fl.is_sleeping())
8743  fl.resume(gtid);
8744  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8745  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8746  } else { // thread holds the lock and may sleep soon
8747  do { // until either the thread sleeps, or we can get the lock
8748  if (fl.is_sleeping()) {
8749  fl.resume(gtid);
8750  break;
8751  } else if (__kmp_try_suspend_mx(thread)) {
8752  __kmp_unlock_suspend_mx(thread);
8753  break;
8754  }
8755  } while (1);
8756  }
8757  }
8758  }
8759  }
8760 }
8761 
8762 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8763 // TODO: add warning messages
8764 int __kmp_pause_resource(kmp_pause_status_t level) {
8765  if (level == kmp_not_paused) { // requesting resume
8766  if (__kmp_pause_status == kmp_not_paused) {
8767  // error message about runtime not being paused, so can't resume
8768  return 1;
8769  } else {
8770  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8771  __kmp_pause_status == kmp_hard_paused);
8772  __kmp_pause_status = kmp_not_paused;
8773  return 0;
8774  }
8775  } else if (level == kmp_soft_paused) { // requesting soft pause
8776  if (__kmp_pause_status != kmp_not_paused) {
8777  // error message about already being paused
8778  return 1;
8779  } else {
8780  __kmp_soft_pause();
8781  return 0;
8782  }
8783  } else if (level == kmp_hard_paused) { // requesting hard pause
8784  if (__kmp_pause_status != kmp_not_paused) {
8785  // error message about already being paused
8786  return 1;
8787  } else {
8788  __kmp_hard_pause();
8789  return 0;
8790  }
8791  } else {
8792  // error message about invalid level
8793  return 1;
8794  }
8795 }
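// User-level sketch (standard OpenMP 5.0 pause API; __kmpc_pause_resource
// routes it to __kmp_pause_resource() above):
//
//   #include <omp.h>
//   // Release worker resources between two phases of an application:
//   if (omp_pause_resource_all(omp_pause_soft) != 0) {
//     // pause was not honored; safe to continue anyway
//   }
//   // ... serial phase ...
//   // The next parallel region resumes soft-paused threads on demand.
//
// omp_pause_hard additionally shuts the runtime down (__kmp_hard_pause()); it
// is re-initialized the next time OpenMP is used.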
8796 
8797 void __kmp_omp_display_env(int verbose) {
8798  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8799  if (__kmp_init_serial == 0)
8800  __kmp_do_serial_initialize();
8801  __kmp_display_env_impl(!verbose, verbose);
8802  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8803 }
8804 
8805 // The team size is changing, so distributed barrier must be modified
8806 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8807  int new_nthreads) {
8808  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8809  bp_dist_bar);
8810  kmp_info_t **other_threads = team->t.t_threads;
8811 
8812  // We want all the workers to stop waiting on the barrier while we adjust the
8813  // size of the team.
8814  for (int f = 1; f < old_nthreads; ++f) {
8815  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8816  // Ignore threads that are already inactive or not present in the team
8817  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8818  // teams construct causes thread_limit to get passed in, and some of
8819  // those could be inactive; just ignore them
8820  continue;
8821  }
8822  // If the thread is still transitioning to the in_use state, wait for it
8823  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8824  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8825  KMP_CPU_PAUSE();
8826  }
8827  // The thread should be in_use now
8828  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8829  // Transition to unused state
8830  team->t.t_threads[f]->th.th_used_in_team.store(2);
8831  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8832  }
8833  // Release all the workers
8834  kmp_uint64 new_value; // new value for go
8835  new_value = team->t.b->go_release();
8836 
8837  KMP_MFENCE();
8838 
8839  // Workers should see transition status 2 and move to 0; but may need to be
8840  // woken up first
8841  size_t my_go_index;
8842  int count = old_nthreads - 1;
8843  while (count > 0) {
8844  count = old_nthreads - 1;
8845  for (int f = 1; f < old_nthreads; ++f) {
8846  my_go_index = f / team->t.b->threads_per_go;
8847  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8848  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8849  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8850  void *, other_threads[f]->th.th_sleep_loc);
8851  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8852  }
8853  } else {
8854  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8855  count--;
8856  }
8857  }
8858  }
8859  // Now update the barrier size
8860  team->t.b->update_num_threads(new_nthreads);
8861  team->t.b->go_reset();
8862 }
8863 
8864 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8865  // Add the threads back to the team
8866  KMP_DEBUG_ASSERT(team);
8867  // Threads were paused and pointed at th_used_in_team temporarily during a
8868  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8869  // the thread that it should transition itself back into the team. Then, if
8870  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8871  // to wake it up.
8872  for (int f = 1; f < new_nthreads; ++f) {
8873  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8874  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8875  3);
8876  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8877  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8878  (kmp_flag_32<false, false> *)NULL);
8879  }
8880  }
8881  // The threads should be transitioning to the team; when they are done, they
8882  // should have set th_used_in_team to 1. This loop forces the primary thread to wait until
8883  // all threads have moved into the team and are waiting in the barrier.
8884  int count = new_nthreads - 1;
8885  while (count > 0) {
8886  count = new_nthreads - 1;
8887  for (int f = 1; f < new_nthreads; ++f) {
8888  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
8889  count--;
8890  }
8891  }
8892  }
8893 }
8894 
8895 // Globals and functions for hidden helper task
8896 kmp_info_t **__kmp_hidden_helper_threads;
8897 kmp_info_t *__kmp_hidden_helper_main_thread;
8898 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8899 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8900 #if KMP_OS_LINUX
8901 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8902 #else
8903 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8904 #endif
8905 
8906 namespace {
8907 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8908 
8909 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8910  // This is an explicit synchronization of all hidden helper threads, covering
8911  // the case where a regular thread pushes a hidden helper task to a hidden
8912  // helper thread that has not yet been awakened since being released by the
8913  // main thread after the team was created.
8914  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8915  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8916  __kmp_hidden_helper_threads_num)
8917  ;
8918 
8919  // If main thread, then wait for signal
8920  if (__kmpc_master(nullptr, *gtid)) {
8921  // First, unset the initial state and release the initial thread
8922  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8923  __kmp_hidden_helper_initz_release();
8924  __kmp_hidden_helper_main_thread_wait();
8925  // Now wake up all worker threads
8926  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8927  __kmp_hidden_helper_worker_thread_signal();
8928  }
8929  }
8930 }
8931 } // namespace
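// Background sketch: the hidden helper threads form a separate, normally
// invisible team that executes hidden helper tasks -- in the LLVM offloading
// path these are typically the tasks generated for constructs like
//   #pragma omp target nowait
// so host threads are not blocked while asynchronous work completes. The team
// size defaults to __kmp_hidden_helper_threads_num (8 above), and the feature
// is enabled by default only on Linux, per the globals above.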
8932 
8933 void __kmp_hidden_helper_threads_initz_routine() {
8934  // Create a new root for hidden helper team/threads
8935  const int gtid = __kmp_register_root(TRUE);
8936  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8937  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8938  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8939  __kmp_hidden_helper_threads_num;
8940 
8941  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8942 
8943  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8944 
8945  // Set the initialization flag to FALSE
8946  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8947 
8948  __kmp_hidden_helper_threads_deinitz_release();
8949 }
8950 
8951 /* Nesting Mode:
8952  Set via KMP_NESTING_MODE, which takes an integer.
8953  Note: we skip duplicate topology levels, and skip levels with only
8954  one entity.
8955  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8956  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8957  in the topology, and initializes the number of threads at each of those
8958  levels to the number of entities at each level, respectively, below the
8959  entity at the parent level.
8960  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8961  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8962  the user to turn nesting on explicitly. This is an even more experimental
8963  option to this experimental feature, and may change or go away in the
8964  future.
8965 */
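// Worked example (hypothetical machine): with KMP_NESTING_MODE=1 on a
// 2-socket, 8-cores-per-socket, 2-threads-per-core system and no conflicting
// settings, __kmp_set_nesting_mode_threads() below primes roughly three
// nesting levels sized 2 x 8 x 2 and raises max-active-levels to match; the
// exact numbers depend on which topology levels are detected and which
// duplicate or single-entity levels are skipped.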
8966 
8967 // Allocate space to store nesting levels
8968 void __kmp_init_nesting_mode() {
8969  int levels = KMP_HW_LAST;
8970  __kmp_nesting_mode_nlevels = levels;
8971  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8972  for (int i = 0; i < levels; ++i)
8973  __kmp_nesting_nth_level[i] = 0;
8974  if (__kmp_nested_nth.size < levels) {
8975  __kmp_nested_nth.nth =
8976  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8977  __kmp_nested_nth.size = levels;
8978  }
8979 }
8980 
8981 // Set # threads for top levels of nesting; must be called after topology set
8982 void __kmp_set_nesting_mode_threads() {
8983  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8984 
8985  if (__kmp_nesting_mode == 1)
8986  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8987  else if (__kmp_nesting_mode > 1)
8988  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8989 
8990  if (__kmp_topology) { // use topology info
8991  int loc, hw_level;
8992  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8993  loc < __kmp_nesting_mode_nlevels;
8994  loc++, hw_level++) {
8995  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8996  if (__kmp_nesting_nth_level[loc] == 1)
8997  loc--;
8998  }
8999  // Make sure all cores are used
9000  if (__kmp_nesting_mode > 1 && loc > 1) {
9001  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9002  int num_cores = __kmp_topology->get_count(core_level);
9003  int upper_levels = 1;
9004  for (int level = 0; level < loc - 1; ++level)
9005  upper_levels *= __kmp_nesting_nth_level[level];
9006  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9007  __kmp_nesting_nth_level[loc - 1] =
9008  num_cores / __kmp_nesting_nth_level[loc - 2];
9009  }
9010  __kmp_nesting_mode_nlevels = loc;
9011  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9012  } else { // no topology info available; provide a reasonable guesstimation
9013  if (__kmp_avail_proc >= 4) {
9014  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9015  __kmp_nesting_nth_level[1] = 2;
9016  __kmp_nesting_mode_nlevels = 2;
9017  } else {
9018  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9019  __kmp_nesting_mode_nlevels = 1;
9020  }
9021  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9022  }
9023  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9024  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9025  }
9026  set__nproc(thread, __kmp_nesting_nth_level[0]);
9027  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9028  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9029  if (get__max_active_levels(thread) > 1) {
9030  // if max levels was set, set nesting mode levels to same
9031  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9032  }
9033  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9034  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9035 }