Diffstat (limited to 'openmp/runtime/src/kmp.h')
-rw-r--r-- | openmp/runtime/src/kmp.h | 280
1 file changed, 184 insertions, 96 deletions
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 87e91a0f8d10..05264f4433d3 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -138,6 +138,10 @@ typedef unsigned int kmp_hwloc_depth_t;
 #include "ompt-internal.h"
 #endif

+#if OMPD_SUPPORT
+#include "ompd-specific.h"
+#endif
+
 #ifndef UNLIKELY
 #define UNLIKELY(x) (x)
 #endif
@@ -595,6 +599,35 @@ typedef int PACKED_REDUCTION_METHOD_T;
 #include <pthread.h>
 #endif

+enum kmp_hw_t : int {
+  KMP_HW_UNKNOWN = -1,
+  KMP_HW_SOCKET = 0,
+  KMP_HW_PROC_GROUP,
+  KMP_HW_NUMA,
+  KMP_HW_DIE,
+  KMP_HW_LLC,
+  KMP_HW_L3,
+  KMP_HW_TILE,
+  KMP_HW_MODULE,
+  KMP_HW_L2,
+  KMP_HW_L1,
+  KMP_HW_CORE,
+  KMP_HW_THREAD,
+  KMP_HW_LAST
+};
+
+#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type)                                   \
+  KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
+#define KMP_ASSERT_VALID_HW_TYPE(type)                                         \
+  KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
+
+#define KMP_FOREACH_HW_TYPE(type)                                              \
+  for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST;                        \
+       type = (kmp_hw_t)((int)type + 1))
+
+const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false);
+const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false);
+
 /* Only Linux* OS and Windows* OS support thread affinity. */
 #if KMP_AFFINITY_SUPPORTED
@@ -629,8 +662,6 @@ extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
 #if KMP_USE_HWLOC
 extern hwloc_topology_t __kmp_hwloc_topology;
 extern int __kmp_hwloc_error;
-extern int __kmp_numa_detected;
-extern int __kmp_tile_depth;
 #endif

 extern size_t __kmp_affin_mask_size;
@@ -758,27 +789,12 @@ enum affinity_type {
   affinity_default
 };

-enum affinity_gran {
-  affinity_gran_fine = 0,
-  affinity_gran_thread,
-  affinity_gran_core,
-  affinity_gran_tile,
-  affinity_gran_numa,
-  affinity_gran_package,
-  affinity_gran_node,
-#if KMP_GROUP_AFFINITY
-  // The "group" granularity isn't necesssarily coarser than all of the
-  // other levels, but we put it last in the enum.
-  affinity_gran_group,
-#endif /* KMP_GROUP_AFFINITY */
-  affinity_gran_default
-};
-
 enum affinity_top_method {
   affinity_top_method_all = 0, // try all (supported) methods, in order
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
   affinity_top_method_apicid,
   affinity_top_method_x2apicid,
+  affinity_top_method_x2apicid_1f,
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
   affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
 #if KMP_GROUP_AFFINITY
@@ -794,7 +810,7 @@ enum affinity_top_method {
 #define affinity_respect_mask_default (-1)

 extern enum affinity_type __kmp_affinity_type; /* Affinity type */
-extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
+extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */
 extern int __kmp_affinity_gran_levels; /* corresponding int value */
 extern int __kmp_affinity_dups; /* Affinity duplicate masks */
 extern enum affinity_top_method __kmp_affinity_top_method;
@@ -817,7 +833,7 @@ extern char *__kmp_cpuinfo_file;
 typedef enum kmp_proc_bind_t {
   proc_bind_false = 0,
   proc_bind_true,
-  proc_bind_master,
+  proc_bind_primary,
   proc_bind_close,
   proc_bind_spread,
   proc_bind_intel, // use KMP_AFFINITY interface
@@ -835,6 +851,10 @@ extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
 extern int __kmp_display_affinity;
 extern char *__kmp_affinity_format;
 static const size_t KMP_AFFINITY_FORMAT_SIZE = 512;
+#if OMPT_SUPPORT
+extern int __kmp_tool;
+extern char *__kmp_tool_libraries;
+#endif // OMPT_SUPPORT

 #if KMP_AFFINITY_SUPPORTED
 #define KMP_PLACE_ALL (-1)
@@ -863,6 +883,7 @@ typedef struct kmp_hws_item {
 } kmp_hws_item_t;

 extern kmp_hws_item_t __kmp_hws_socket;
+extern kmp_hws_item_t __kmp_hws_die;
 extern kmp_hws_item_t __kmp_hws_node;
 extern kmp_hws_item_t __kmp_hws_tile;
 extern kmp_hws_item_t __kmp_hws_core;
@@ -929,6 +950,10 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
 extern omp_memspace_handle_t const omp_const_mem_space;
 extern omp_memspace_handle_t const omp_high_bw_mem_space;
 extern omp_memspace_handle_t const omp_low_lat_mem_space;
+// Preview of target memory support
+extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
+extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
+extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;

 typedef struct {
   omp_alloctrait_key_t key;
@@ -945,6 +970,10 @@ extern omp_allocator_handle_t const omp_low_lat_mem_alloc;
 extern omp_allocator_handle_t const omp_cgroup_mem_alloc;
 extern omp_allocator_handle_t const omp_pteam_mem_alloc;
 extern omp_allocator_handle_t const omp_thread_mem_alloc;
+// Preview of target memory support
+extern omp_allocator_handle_t const llvm_omp_target_host_mem_alloc;
+extern omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc;
+extern omp_allocator_handle_t const llvm_omp_target_device_mem_alloc;
 extern omp_allocator_handle_t const kmp_max_mem_alloc;
 extern omp_allocator_handle_t __kmp_def_allocator;
@@ -982,6 +1011,7 @@ extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);

 extern void __kmp_init_memkind();
 extern void __kmp_fini_memkind();
+extern void __kmp_init_target_mem();

 /* ------------------------------------------------------------------------ */
@@ -1046,13 +1076,11 @@ extern void __kmp_fini_memkind();
 /* Calculate new number of monitor wakeups for a specific block time based on
    previous monitor_wakeups. Only allow increasing number of wakeups */
 #define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups)                 \
-  (((blocktime) == KMP_MAX_BLOCKTIME)                                          \
+  (((blocktime) == KMP_MAX_BLOCKTIME)   ? (monitor_wakeups)                    \
+   : ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS              \
+   : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))            \
       ? (monitor_wakeups)                                                      \
-      : ((blocktime) == KMP_MIN_BLOCKTIME)                                     \
-            ? KMP_MAX_MONITOR_WAKEUPS                                          \
-            : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))   \
-                  ? (monitor_wakeups)                                          \
-                  : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))
+      : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))

 /* Calculate number of intervals for a specific block time based on
    monitor_wakeups */
@@ -1097,7 +1125,10 @@ extern kmp_uint64 __kmp_now_nsec();
 #define KMP_MAX_CHUNK (INT_MAX - 1)
 #define KMP_DEFAULT_CHUNK 1

+#define KMP_MIN_DISP_NUM_BUFF 1
 #define KMP_DFLT_DISP_NUM_BUFF 7
+#define KMP_MAX_DISP_NUM_BUFF 4096
+
 #define KMP_MAX_ORDERED 8

 #define KMP_MAX_FIELDS 32
@@ -1182,7 +1213,6 @@ typedef struct kmp_cpuinfo {
   int stepping; // CPUID(1).EAX[3:0] ( Stepping )
   int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise.
   int rtm; // 0 if RTM instructions are not supported, 1 otherwise.
-  int cpu_stackoffset;
   int apic_id;
   int physical_id;
   int logical_id;
@@ -1339,8 +1369,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
 #endif
 #endif // KMP_HAVE_WAITPKG_INTRINSICS
 KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int
-__kmp_tpause(uint32_t hint, uint64_t counter) {
+static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
 #if !KMP_HAVE_WAITPKG_INTRINSICS
   uint32_t timeHi = uint32_t(counter >> 32);
   uint32_t timeLo = uint32_t(counter & 0xffffffff);
@@ -1356,8 +1385,7 @@ __kmp_tpause(uint32_t hint, uint64_t counter) {
 #endif
 }
 KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline void
-__kmp_umonitor(void *cacheline) {
+static inline void __kmp_umonitor(void *cacheline) {
 #if !KMP_HAVE_WAITPKG_INTRINSICS
   __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
                    :
@@ -1368,8 +1396,7 @@ __kmp_umonitor(void *cacheline) {
 #endif
 }
 KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int
-__kmp_umwait(uint32_t hint, uint64_t counter) {
+static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
 #if !KMP_HAVE_WAITPKG_INTRINSICS
   uint32_t timeHi = uint32_t(counter >> 32);
   uint32_t timeLo = uint32_t(counter & 0xffffffff);
@@ -1422,7 +1449,8 @@ enum cons_type {
   ct_ordered_in_pdo,
   ct_master,
   ct_reduce,
-  ct_barrier
+  ct_barrier,
+  ct_masked
 };

 #define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered)
@@ -1570,7 +1598,7 @@ struct private_common {
   struct private_common *next;
   struct private_common *link;
   void *gbl_addr;
-  void *par_addr; /* par_addr == gbl_addr for MASTER thread */
+  void *par_addr; /* par_addr == gbl_addr for PRIMARY thread */
   size_t cmn_size;
 };
@@ -1645,14 +1673,12 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 lb;
   kmp_int32 st;
   kmp_int32 tc;
-  kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
-                                     after ub */
-  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
-  // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
+  kmp_lock_t *steal_lock; // lock used for chunk stealing
+  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
   // a) parm3 is properly aligned and
-  // b) all parm1-4 are in the same cache line.
+  // b) all parm1-4 are on the same cache line.
   // Because of parm1-4 are used together, performance seems to be better
-  // if they are in the same line (not measured though).
+  // if they are on the same cache line (not measured though).

   struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
     kmp_int32 parm1; //     structures in kmp_dispatch.cpp. This should
@@ -1664,9 +1690,6 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_uint32 ordered_lower;
   kmp_uint32 ordered_upper;
 #if KMP_OS_WINDOWS
-  // This var can be placed in the hole between 'tc' and 'parm1', instead of
-  // 'static_steal_counter'. It would be nice to measure execution times.
-  // Conditional if/endif can be removed at all.
   kmp_int32 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info32_t;
@@ -1678,9 +1701,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_int64 lb; /* lower-bound */
   kmp_int64 st; /* stride */
   kmp_int64 tc; /* trip count (number of iterations) */
-  kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
-                                     after ub */
-  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
+  kmp_lock_t *steal_lock; // lock used for chunk stealing

   /* parm[1-4] are used in different ways by different scheduling algorithms */

   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
@@ -1699,9 +1720,6 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_uint64 ordered_lower;
   kmp_uint64 ordered_upper;
 #if KMP_OS_WINDOWS
-  // This var can be placed in the hole between 'tc' and 'parm1', instead of
-  // 'static_steal_counter'. It would be nice to measure execution times.
-  // Conditional if/endif can be removed at all.
   kmp_int64 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info64_t;
@@ -1755,9 +1773,8 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info {
   } u;
   enum sched_type schedule; /* scheduling algorithm */
   kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
   kmp_int32 ordered_bumped;
-  // To retain the structure size after making ordered_iteration scalar
-  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
   // Stack of buffers for nest of serial regions
   struct dispatch_private_info *next;
   kmp_int32 type_size; /* the size of types in private_info */
@@ -1772,7 +1789,7 @@ typedef struct dispatch_shared_info32 {
   /* chunk index under dynamic, number of idle threads under static-steal;
      iteration index otherwise */
   volatile kmp_uint32 iteration;
-  volatile kmp_uint32 num_done;
+  volatile kmp_int32 num_done;
   volatile kmp_uint32 ordered_iteration;
   // Dummy to retain the structure size after making ordered_iteration scalar
   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1];
@@ -1782,7 +1799,7 @@ typedef struct dispatch_shared_info64 {
   /* chunk index under dynamic, number of idle threads under static-steal;
      iteration index otherwise */
   volatile kmp_uint64 iteration;
-  volatile kmp_uint64 num_done;
+  volatile kmp_int64 num_done;
   volatile kmp_uint64 ordered_iteration;
   // Dummy to retain the structure size after making ordered_iteration scalar
   kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3];
@@ -1818,7 +1835,7 @@ typedef struct kmp_disp {
   dispatch_private_info_t *th_dispatch_pr_current;

   dispatch_private_info_t *th_disp_buffer;
-  kmp_int32 th_disp_index;
+  kmp_uint32 th_disp_index;
   kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
   volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
   kmp_int64 *th_doacross_info; // info on loop bounds
@@ -1882,9 +1899,8 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */
       0, /* Single level (degenerate) tree */
   bp_tree_bar =
       1, /* Balanced tree with branching factor 2^n */
-  bp_hyper_bar =
-      2, /* Hypercube-embedded tree with min branching
-            factor 2^n */
+  bp_hyper_bar = 2, /* Hypercube-embedded tree with min
+                       branching factor 2^n */
   bp_hierarchical_bar = 3, /* Machine hierarchy tree */
   bp_last_bar /* Placeholder to mark the end */
 } kmp_bar_pat_e;
@@ -1969,9 +1985,9 @@ union KMP_ALIGN_CACHE kmp_barrier_team_union {
     kmp_uint64 b_arrived; /* STATE => task reached synch point. */
 #if USE_DEBUGGER
     // The following two fields are indended for the debugger solely. Only
-    // master of the team accesses these fields: the first one is increased by
-    // 1 when master arrives to a barrier, the second one is increased by one
-    // when all the threads arrived.
+    // primary thread of the team accesses these fields: the first one is
+    // increased by 1 when the primary thread arrives to a barrier, the second
+    // one is increased by one when all the threads arrived.
     kmp_uint b_master_arrived;
     kmp_uint b_team_arrived;
 #endif
@@ -2217,6 +2233,7 @@ typedef struct kmp_taskgroup {
   // Block of data to perform task reduction
   void *reduce_data; // reduction related info
   kmp_int32 reduce_num_data; // number of data items to reduce
+  uintptr_t *gomp_data; // gomp reduction data
 } kmp_taskgroup_t;

 // forward declarations
@@ -2224,15 +2241,24 @@ typedef union kmp_depnode kmp_depnode_t;
 typedef struct kmp_depnode_list kmp_depnode_list_t;
 typedef struct kmp_dephash_entry kmp_dephash_entry_t;

+#define KMP_DEP_IN 0x1
+#define KMP_DEP_OUT 0x2
+#define KMP_DEP_INOUT 0x3
+#define KMP_DEP_MTX 0x4
+#define KMP_DEP_SET 0x8
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
   size_t len;
-  struct {
-    bool in : 1;
-    bool out : 1;
-    bool mtx : 1;
-  } flags;
+  union {
+    kmp_uint8 flag;
+    struct {
+      unsigned in : 1;
+      unsigned out : 1;
+      unsigned mtx : 1;
+      unsigned set : 1;
+    } flags;
+  };
 } kmp_depend_info_t;

 // Internal structures to work with task dependencies:
@@ -2266,9 +2292,9 @@ union KMP_ALIGN_CACHE kmp_depnode {
 struct kmp_dephash_entry {
   kmp_intptr_t addr;
   kmp_depnode_t *last_out;
-  kmp_depnode_list_t *last_ins;
-  kmp_depnode_list_t *last_mtxs;
-  kmp_int32 last_flag;
+  kmp_depnode_list_t *last_set;
+  kmp_depnode_list_t *prev_set;
+  kmp_uint8 last_flag;
   kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */
   kmp_dephash_entry_t *next_in_bucket;
 };
@@ -2501,7 +2527,7 @@ typedef struct kmp_teams_size {

 // This struct stores a thread that acts as a "root" for a contention
 // group. Contention groups are rooted at kmp_root threads, but also at
-// each master thread of each team created in the teams construct.
+// each primary thread of each team created in the teams construct.
 // This struct therefore also stores a thread_limit associated with
 // that contention group, and a counter to track the number of threads
 // active in that contention group. Each thread has a list of these: CG
@@ -2513,7 +2539,7 @@ typedef struct kmp_teams_size {
 typedef struct kmp_cg_root {
   kmp_info_p *cg_root; // "root" thread for a contention group
   // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or
-  // thread_limit clause for teams masters
+  // thread_limit clause for teams primary threads
   kmp_int32 cg_thread_limit;
   kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root
   struct kmp_cg_root *up; // pointer to higher level CG root in list
@@ -2523,8 +2549,9 @@ typedef struct kmp_cg_root {

 typedef struct KMP_ALIGN_CACHE kmp_base_info {
   /* Start with the readonly data which is cache aligned and padded. This is
-     written before the thread starts working by the master. Uber masters may
-     update themselves later. Usage does not consider serialized regions. */
+     written before the thread starts working by the primary thread. Uber
+     masters may update themselves later. Usage does not consider serialized
+     regions. */
   kmp_desc_t th_info;
   kmp_team_p *th_team; /* team we belong to */
   kmp_root_p *th_root; /* pointer to root of task hierarchy */
@@ -2535,7 +2562,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   /* The following are cached from the team info structure */
   /* TODO use these in more places as determined to be needed via profiling */
   int th_team_nproc; /* number of threads in a team */
-  kmp_info_p *th_team_master; /* the team's master thread */
+  kmp_info_p *th_team_master; /* the team's primary thread */
   int th_team_serialized; /* team is serialized */
   microtask_t th_teams_microtask; /* save entry address for teams construct */
   int th_teams_level; /* save initial level of teams construct */
@@ -2556,7 +2583,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
 #endif
   omp_allocator_handle_t th_def_allocator; /* default allocator */
-  /* The data set by the master at reinit, then R/W by the worker */
+  /* The data set by the primary thread at reinit, then R/W by the worker */
   KMP_ALIGN_CACHE int th_set_nproc; /* if > 0, then only use this request for
                                        the next fork */
 #if KMP_NESTED_HOT_TEAMS
@@ -2592,7 +2619,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   ompt_thread_info_t ompt_thread_info;
 #endif

-  /* The following are also read by the master during reinit */
+  /* The following are also read by the primary thread during reinit */
   struct common_table *th_pri_common;

   volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */
@@ -2669,7 +2696,9 @@ typedef union KMP_ALIGN_CACHE kmp_info {

 // OpenMP thread team data structures

-typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t;
+typedef struct kmp_base_data {
+  volatile kmp_uint32 t_value;
+} kmp_base_data_t;

 typedef union KMP_ALIGN_CACHE kmp_sleep_team {
   double dt_align; /* use worst case alignment */
@@ -2690,7 +2719,7 @@ typedef int (*launch_t)(int gtid);

 // Set up how many argv pointers will fit in cache lines containing
 // t_inline_argv. Historically, we have supported at least 96 bytes. Using a
-// larger value for more space between the master write/worker read section and
+// larger value for more space between the primary write/worker read section and
 // read/write by all section seems to buy more performance on EPCC PARALLEL.
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 #define KMP_INLINE_ARGV_BYTES                                                  \
@@ -2716,11 +2745,11 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
   std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions

-  // Master only
+  // Primary thread only
   // ---------------------------------------------------------------------------
-  KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
-  int t_master_this_cons; // "this_construct" single counter of master in parent
-  // team
+  KMP_ALIGN_CACHE int t_master_tid; // tid of primary thread in parent team
+  int t_master_this_cons; // "this_construct" single counter of primary thread
+  // in parent team
   ident_t *t_ident; // if volatile, have to change too much other crud to
   // volatile too
   kmp_team_p *t_parent; // parent team
@@ -2732,7 +2761,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_uint64 t_region_time; // region begin timestamp
 #endif /* USE_ITT_BUILD */

-  // Master write, workers read
+  // Primary thread write, workers read
   // --------------------------------------------------------------------------
   KMP_ALIGN_CACHE void **t_argv;
   int t_argc;
@@ -2768,7 +2797,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_r_sched_t t_sched; // run-time schedule for the team
 #if KMP_AFFINITY_SUPPORTED
   int t_first_place; // first & last place in parent thread's partition.
-  int t_last_place; // Restore these values to master after par region.
+  int t_last_place; // Restore these values to primary thread after par region.
 #endif // KMP_AFFINITY_SUPPORTED
   int t_display_affinity;
   int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via
@@ -2843,6 +2872,9 @@ typedef struct kmp_base_root {
   kmp_lock_t r_begin_lock;
   volatile int r_begin;
   int r_blocktime; /* blocktime for this root and descendants */
+#if KMP_AFFINITY_SUPPORTED
+  int r_affinity_assigned;
+#endif // KMP_AFFINITY_SUPPORTED
 } kmp_base_root_t;

 typedef union KMP_ALIGN_CACHE kmp_root {
@@ -2975,6 +3007,7 @@ extern enum sched_type __kmp_static; /* default static scheduling method */
 extern enum sched_type __kmp_guided; /* default guided scheduling method */
 extern enum sched_type __kmp_auto; /* default auto scheduling method */
 extern int __kmp_chunk; /* default runtime chunk size */
+extern int __kmp_force_monotonic; /* whether monotonic scheduling forced */
 extern size_t __kmp_stksize; /* stack size per thread */

 #if KMP_USE_MONITOR
@@ -3043,9 +3076,8 @@ extern int __kmp_ncores; /* Total number of cores for threads placement */
 extern int __kmp_abort_delay;

 extern int __kmp_need_register_atfork_specified;
-extern int
-    __kmp_need_register_atfork; /* At initialization, call pthread_atfork to
-                                   install fork handler */
+extern int __kmp_need_register_atfork; /* At initialization, call pthread_atfork
+                                          to install fork handler */
 extern int __kmp_gtid_mode; /* Method of getting gtid, values:
                                0 - not set, will be set at runtime
                                1 - using stack search
@@ -3123,6 +3155,8 @@ extern const char *__kmp_speculative_statsfile;
 extern int __kmp_display_env; /* TRUE or FALSE */
 extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
 extern int __kmp_omp_cancellation; /* TRUE or FALSE */
+extern int __kmp_nteams;
+extern int __kmp_teams_thread_limit;

 /* ------------------------------------------------------------------------- */
@@ -3331,6 +3365,8 @@ extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
                                  kmp_proc_bind_t proc_bind);
 extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams,
                                  int num_threads);
+extern void __kmp_push_num_teams_51(ident_t *loc, int gtid, int num_teams_lb,
+                                    int num_teams_ub, int num_threads);
 extern void __kmp_yield();
@@ -3409,7 +3445,7 @@ extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag,
                           ,
                           void *itt_sync_obj
 #endif
-                          );
+);
 extern void __kmp_release_64(kmp_flag_64<> *flag);

 extern void __kmp_infinite_loop(void);
@@ -3430,7 +3466,7 @@ extern void __kmp_check_stack_overlap(kmp_info_t *thr);
 extern void __kmp_expand_host_name(char *buffer, size_t size);
 extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);

-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && KMP_ARCH_AARCH64)
 extern void __kmp_initialize_system_tick(void); /* Initialize timer tick value */
 #endif
@@ -3460,6 +3496,16 @@ extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
 #if KMP_OS_LINUX || KMP_OS_FREEBSD
 extern int kmp_set_thread_affinity_mask_initial(void);
 #endif
+static inline void __kmp_assign_root_init_mask() {
+  int gtid = __kmp_entry_gtid();
+  kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
+  if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
+    __kmp_affinity_set_init_mask(gtid, TRUE);
+    r->r.r_affinity_assigned = TRUE;
+  }
+}
+#else /* KMP_AFFINITY_SUPPORTED */
+#define __kmp_assign_root_init_mask() /* Nothing */
 #endif /* KMP_AFFINITY_SUPPORTED */
 // No need for KMP_AFFINITY_SUPPORTED guard as only one field in the
 // format string is for affinity, so platforms that do not support
@@ -3667,7 +3713,7 @@ extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc,
                                   ,
                                   void **exit_frame_ptr
 #endif
-                                  );
+);

 /* ------------------------------------------------------------------------ */
@@ -3701,6 +3747,9 @@ KMP_EXPORT void __kmpc_flush(ident_t *);
 KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT kmp_int32 __kmpc_masked(ident_t *, kmp_int32 global_tid,
+                                   kmp_int32 filter);
+KMP_EXPORT void __kmpc_end_masked(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid,
@@ -3744,12 +3793,9 @@ KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                              size_t sizeof_kmp_task_t,
                                              size_t sizeof_shareds,
                                              kmp_routine_entry_t task_entry);
-KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
-                                                    kmp_int32 flags,
-                                                    size_t sizeof_kmp_task_t,
-                                                    size_t sizeof_shareds,
-                                                    kmp_routine_entry_t task_entry,
-                                                    kmp_int64 device_id);
+KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t,
+    size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id);
 KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task);
 KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
@@ -3817,6 +3863,10 @@ KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid,
 KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins,
     kmp_task_affinity_info_t *affin_list);
+KMP_EXPORT void __kmp_set_num_teams(int num_teams);
+KMP_EXPORT int __kmp_get_max_teams(void);
+KMP_EXPORT void __kmp_set_teams_thread_limit(int limit);
+KMP_EXPORT int __kmp_get_teams_thread_limit(void);

 /* Lock interface routines (fast versions with gtid passed in) */
 KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid,
@@ -3885,6 +3935,11 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
 KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                                       kmp_int32 num_teams,
                                       kmp_int32 num_threads);
+/* Function for OpenMP 5.1 num_teams clause */
+KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
+                                         kmp_int32 num_teams_lb,
+                                         kmp_int32 num_teams_ub,
+                                         kmp_int32 num_threads);
 KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc,
                                   kmpc_micro microtask, ...);
 struct kmp_dim { // loop bounds info casted to kmp_int64
@@ -3955,6 +4010,11 @@ KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t);
 KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int);
 KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *);
 KMP_EXPORT void KMPC_CONVENTION kmpc_set_disp_num_buffers(int);
+void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format);
+size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size);
+void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format);
+size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
+                                              char const *format);

 enum kmp_target_offload_kind {
   tgt_disabled = 0,
@@ -4031,11 +4091,33 @@ extern void __kmp_hidden_helper_main_thread_release();
 #define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid)                                  \
   ((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num)

+#define KMP_HIDDEN_HELPER_TEAM(team)                                           \
+  (team->t.t_threads[0] == __kmp_hidden_helper_main_thread)
+
 // Map a gtid to a hidden helper thread. The first hidden helper thread, a.k.a
 // main thread, is skipped.
 #define KMP_GTID_TO_SHADOW_GTID(gtid)                                          \
   ((gtid) % (__kmp_hidden_helper_threads_num - 1) + 2)

+// Return the adjusted gtid value by subtracting from gtid the number
+// of hidden helper threads. This adjusted value is the gtid the thread would
+// have received if there were no hidden helper threads.
+static inline int __kmp_adjust_gtid_for_hidden_helpers(int gtid) {
+  int adjusted_gtid = gtid;
+  if (__kmp_hidden_helper_threads_num > 0 && gtid > 0 &&
+      gtid - __kmp_hidden_helper_threads_num >= 0) {
+    adjusted_gtid -= __kmp_hidden_helper_threads_num;
+  }
+  return adjusted_gtid;
+}
+
+// Support for error directive
+typedef enum kmp_severity_t {
+  severity_warning = 1,
+  severity_fatal = 2
+} kmp_severity_t;
+extern void __kmpc_error(ident_t *loc, int severity, const char *message);
+
 #ifdef __cplusplus
 }
 #endif
@@ -4082,6 +4164,12 @@ int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
 #endif /* USE_ITT_BUILD */
                                kmp_int32 is_constrained);

+extern int __kmp_nesting_mode;
+extern int __kmp_nesting_mode_nlevels;
+extern int *__kmp_nesting_nth_level;
+extern void __kmp_init_nesting_mode();
+extern void __kmp_set_nesting_mode_threads();
+
 /// This class safely opens and closes a C-style FILE* object using RAII
 /// semantics. There are also methods which allow using stdout or stderr as
 /// the underlying FILE* object. With the implicit conversion operator to
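
Notes on selected hunks follow. The old affinity_gran enum is gone: granularity is now expressed directly in terms of the new kmp_hw_t topology levels, which is why __kmp_affinity_gran changes type from enum affinity_gran to kmp_hw_t. Below is a minimal standalone sketch of the new enum and the KMP_FOREACH_HW_TYPE iteration macro, both copied from the hunk above; the local names table is purely illustrative, standing in for __kmp_hw_get_keyword(), whose definition lives elsewhere in the runtime.

    #include <cstdio>

    enum kmp_hw_t : int {
      KMP_HW_UNKNOWN = -1,
      KMP_HW_SOCKET = 0,
      KMP_HW_PROC_GROUP,
      KMP_HW_NUMA,
      KMP_HW_DIE,
      KMP_HW_LLC,
      KMP_HW_L3,
      KMP_HW_TILE,
      KMP_HW_MODULE,
      KMP_HW_L2,
      KMP_HW_L1,
      KMP_HW_CORE,
      KMP_HW_THREAD,
      KMP_HW_LAST
    };

    #define KMP_FOREACH_HW_TYPE(type)                                          \
      for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST;                    \
           type = (kmp_hw_t)((int)type + 1))

    int main() {
      // Illustrative names only; the runtime's real strings come from
      // __kmp_hw_get_keyword()/__kmp_hw_get_catalog_string().
      static const char *names[KMP_HW_LAST] = {
          "socket", "proc_group", "numa", "die",  "llc",  "l3",    "tile",
          "module", "l2",         "l1",   "core", "thread"};
      KMP_FOREACH_HW_TYPE(type) {
        std::printf("%d: %s\n", (int)type, names[type]);
      }
      return 0;
    }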
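
The llvm_omp_target_*_mem_space and _mem_alloc handles are a preview of target memory support, initialized via the new __kmp_init_target_mem(). Assuming a matching LLVM omp.h that exposes these handles to user code, usage would look like the sketch below; whether the allocation succeeds without an offload runtime present is implementation-dependent, so treat this as an assumption-laden illustration rather than guaranteed behavior.

    #include <cstdio>
    #include <omp.h>

    int main() {
      // Preview target-memory allocator from the diff; assumes an LLVM omp.h
      // declaring llvm_omp_target_host_mem_alloc. Without an offload runtime
      // the call may fall back or return NULL (assumption).
      void *p = omp_alloc(1024, llvm_omp_target_host_mem_alloc);
      std::printf("host-pinned block at %p\n", p);
      omp_free(p, llvm_omp_target_host_mem_alloc);
      return 0;
    }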
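
The KMP_WAKEUPS_FROM_BLOCKTIME hunk only reflows the nested ternaries into a flat chain; the logic is unchanged: keep the current wakeup rate at either blocktime extreme or when it already delivers at least one wakeup per blocktime interval, otherwise raise it to KMP_BLOCKTIME_MULTIPLIER / blocktime. A standalone sketch with stand-in constants (the real KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME, KMP_MAX_MONITOR_WAKEUPS, and KMP_BLOCKTIME_MULTIPLIER are defined elsewhere in kmp.h, so the values below are assumptions):

    #include <climits>
    #include <cstdio>

    #define KMP_MAX_BLOCKTIME INT_MAX     // assumed "infinite" sentinel
    #define KMP_MIN_BLOCKTIME 0           // assumed
    #define KMP_MAX_MONITOR_WAKEUPS 125   // assumed
    #define KMP_BLOCKTIME_MULTIPLIER 1000 // assumed blocktime units per second

    #define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups)             \
      (((blocktime) == KMP_MAX_BLOCKTIME)   ? (monitor_wakeups)                \
       : ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS          \
       : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))        \
           ? (monitor_wakeups)                                                 \
           : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))

    int main() {
      // A 200 ms blocktime needs at least 1000/200 = 5 wakeups per second,
      // so a current rate of 2 is raised to 5, while 8 is already enough.
      std::printf("%d\n", KMP_WAKEUPS_FROM_BLOCKTIME(200, 2)); // prints 5
      std::printf("%d\n", KMP_WAKEUPS_FROM_BLOCKTIME(200, 8)); // prints 8
      return 0;
    }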
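
kmp_depend_info now overlays its bitfields with a single kmp_uint8 flag, so the runtime can compare a dependence's whole flag byte against the new KMP_DEP_* masks (note KMP_DEP_INOUT == KMP_DEP_IN | KMP_DEP_OUT) instead of testing bits individually; kmp_dephash_entry correspondingly tracks last_set/prev_set lists. A sketch of the layout, with plain C++ types substituted for the kmp_* typedefs; reading flag after writing the bitfields relies on union punning and a low-bit-first bitfield layout, which mainstream ABIs provide but the C++ standard does not strictly guarantee.

    #include <cstdio>

    #define KMP_DEP_IN 0x1
    #define KMP_DEP_OUT 0x2
    #define KMP_DEP_INOUT 0x3
    #define KMP_DEP_MTX 0x4
    #define KMP_DEP_SET 0x8

    struct depend_info_sketch { // stand-in for kmp_depend_info_t
      union {
        unsigned char flag; // aggregate view of the bits below (kmp_uint8)
        struct {
          unsigned in : 1;  // bit 0 == KMP_DEP_IN
          unsigned out : 1; // bit 1 == KMP_DEP_OUT
          unsigned mtx : 1; // bit 2 == KMP_DEP_MTX
          unsigned set : 1; // bit 3 == KMP_DEP_SET
        } flags;
      };
    };

    int main() {
      depend_info_sketch d{};
      d.flags.in = 1;
      d.flags.out = 1; // in + out together form an inout dependence
      std::printf("flag=0x%x inout=%d\n", d.flag, d.flag == KMP_DEP_INOUT);
      return 0;
    }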
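
__kmpc_masked/__kmpc_end_masked (plus the new ct_masked consistency-check entry) back the OpenMP 5.1 masked construct, the successor of master. User code like the following would exercise them, assuming an OpenMP 5.1-capable compiler; the exact lowering onto these entry points is the compiler's choice and is only assumed here.

    #include <cstdio>
    #include <omp.h>

    int main() {
      #pragma omp parallel num_threads(4)
      {
        // Only the thread whose id matches the filter runs the region;
        // plain '#pragma omp masked' defaults to thread 0.
        #pragma omp masked filter(2)
        std::printf("masked region run by thread %d\n", omp_get_thread_num());
      }
      return 0;
    }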
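
__kmpc_push_num_teams_51 and __kmp_push_num_teams_51 carry the OpenMP 5.1 num_teams lower/upper bound pair, while the new globals __kmp_nteams and __kmp_teams_thread_limit hold team defaults. Assuming 5.1 support in the compiler, a user-level trigger would be the following; which exact count in the range the runtime picks is up to the implementation.

    #include <cstdio>
    #include <omp.h>

    int main() {
      // The runtime may create any number of teams in [4, 8]; the bounds are
      // presumably forwarded via __kmpc_push_num_teams_51 (assumed lowering).
      #pragma omp teams num_teams(4 : 8)
      std::printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
      return 0;
    }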
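
__kmp_adjust_gtid_for_hidden_helpers undoes the gtid shift introduced by the hidden helper team, which occupies gtids 1 through __kmp_hidden_helper_threads_num. A standalone sketch mirroring the function's logic from the hunk above, with the runtime global modeled as a local variable (the value 8 is only a demo assumption for the helper count):

    #include <cstdio>

    static int hidden_helper_threads_num = 8; // stand-in for the runtime global

    static inline int adjust_gtid_for_hidden_helpers(int gtid) {
      int adjusted_gtid = gtid;
      if (hidden_helper_threads_num > 0 && gtid > 0 &&
          gtid - hidden_helper_threads_num >= 0) {
        adjusted_gtid -= hidden_helper_threads_num;
      }
      return adjusted_gtid;
    }

    int main() {
      std::printf("%d\n", adjust_gtid_for_hidden_helpers(0));  // 0: initial thread
      std::printf("%d\n", adjust_gtid_for_hidden_helpers(9));  // 1: first worker
      std::printf("%d\n", adjust_gtid_for_hidden_helpers(12)); // 4
      return 0;
    }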