Diffstat (limited to 'openmp')
 openmp/LICENSE.TXT (renamed from openmp/LICENSE.txt)          |    0
 openmp/runtime/src/dllexports                                 |   19
 openmp/runtime/src/exports_so.txt                             |    5
 openmp/runtime/src/i18n/en_US.txt                             |   46
 openmp/runtime/src/include/omp-tools.h.var                    |  201
 openmp/runtime/src/include/omp.h.var                          |   55
 openmp/runtime/src/include/omp_lib.f90.var                    |   49
 openmp/runtime/src/include/omp_lib.h.var                      |   64
 openmp/runtime/src/kmp.h                                      |  280
 openmp/runtime/src/kmp_affinity.cpp                           | 4046
 openmp/runtime/src/kmp_affinity.h                             |  369
 openmp/runtime/src/kmp_alloc.cpp                              |   86
 openmp/runtime/src/kmp_atomic.cpp                             |   72
 openmp/runtime/src/kmp_atomic.h                               |   10
 openmp/runtime/src/kmp_barrier.cpp                            |  186
 openmp/runtime/src/kmp_config.h.cmake                         |   10
 openmp/runtime/src/kmp_csupport.cpp                           |  294
 openmp/runtime/src/kmp_dispatch.cpp                           |  522
 openmp/runtime/src/kmp_dispatch.h                             |    9
 openmp/runtime/src/kmp_dispatch_hier.h                        |   14
 openmp/runtime/src/kmp_environment.cpp                        |   21
 openmp/runtime/src/kmp_error.cpp                              |   22
 openmp/runtime/src/kmp_ftn_entry.h                            |  182
 openmp/runtime/src/kmp_ftn_os.h                               |   31
 openmp/runtime/src/kmp_global.cpp                             |   38
 openmp/runtime/src/kmp_gsupport.cpp                           |  562
 openmp/runtime/src/kmp_i18n.cpp                               |   27
 openmp/runtime/src/kmp_i18n.h                                 |    2
 openmp/runtime/src/kmp_io.cpp                                 |   19
 openmp/runtime/src/kmp_itt.cpp                                |   10
 openmp/runtime/src/kmp_itt.h                                  |   16
 openmp/runtime/src/kmp_itt.inl                                |   29
 openmp/runtime/src/kmp_lock.cpp                               |   37
 openmp/runtime/src/kmp_lock.h                                 |    5
 openmp/runtime/src/kmp_omp.h                                  |    4
 openmp/runtime/src/kmp_os.h                                   |  198
 openmp/runtime/src/kmp_platform.h                             |   16
 openmp/runtime/src/kmp_runtime.cpp                            |  739
 openmp/runtime/src/kmp_safe_c_api.h                           |   11
 openmp/runtime/src/kmp_sched.cpp                              |   43
 openmp/runtime/src/kmp_settings.cpp                           |  853
 openmp/runtime/src/kmp_settings.h                             |    3
 openmp/runtime/src/kmp_stats.cpp                              |   46
 openmp/runtime/src/kmp_stats.h                                |   25
 openmp/runtime/src/kmp_str.cpp                                |   27
 openmp/runtime/src/kmp_stub.cpp                               |   14
 openmp/runtime/src/kmp_taskdeps.cpp                           |  221
 openmp/runtime/src/kmp_taskdeps.h                             |   14
 openmp/runtime/src/kmp_tasking.cpp                            |  270
 openmp/runtime/src/kmp_threadprivate.cpp                      |   11
 openmp/runtime/src/kmp_utility.cpp                            |   12
 openmp/runtime/src/kmp_version.cpp                            |    4
 openmp/runtime/src/kmp_wait_release.h                         |   14
 openmp/runtime/src/ompd-specific.cpp                          |  154
 openmp/runtime/src/ompd-specific.h                            |  154
 openmp/runtime/src/ompt-event-specific.h                      |    5
 openmp/runtime/src/ompt-general.cpp                           |   43
 openmp/runtime/src/ompt-internal.h                            |    2
 openmp/runtime/src/ompt-specific.cpp                          |   15
 openmp/runtime/src/ompt-specific.h                            |   13
 openmp/runtime/src/test-touch.c                               |   14
 openmp/runtime/src/thirdparty/ittnotify/disable_warnings.h    |   23
 openmp/runtime/src/thirdparty/ittnotify/ittnotify.h           | 3060
 openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h    |  774
 openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp  | 1809
 openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h    |  927
 openmp/runtime/src/thirdparty/ittnotify/ittnotify_types.h     |   85
 openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h    |  677
 openmp/runtime/src/tsan_annotations.cpp                       |  107
 openmp/runtime/src/tsan_annotations.h                         |  169
 openmp/runtime/src/z_Linux_util.cpp                           |  229
 openmp/runtime/src/z_Windows_NT-586_util.cpp                  |   79
 openmp/runtime/src/z_Windows_NT_util.cpp                      |   51
 73 files changed, 10419 insertions(+), 7834 deletions(-)
diff --git a/openmp/LICENSE.txt b/openmp/LICENSE.TXT
index 990756638292..990756638292 100644
--- a/openmp/LICENSE.txt
+++ b/openmp/LICENSE.TXT
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 1c29ca90657a..473746887574 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -360,6 +360,7 @@ kmpc_set_defaults 224
__kmpc_team_static_init_4u 256
__kmpc_team_static_init_8 257
__kmpc_team_static_init_8u 258
+ __kmpc_push_num_teams_51 284
%endif
%ifndef stub
@@ -389,6 +390,9 @@ kmpc_set_disp_num_buffers 267
__kmpc_taskred_init 277
__kmpc_taskred_modifier_init 278
__kmpc_omp_target_task_alloc 279
+ __kmpc_error 281
+ __kmpc_masked 282
+ __kmpc_end_masked 283
%endif
# User API entry points that have both lower- and upper- case versions for Fortran.
@@ -537,6 +541,10 @@ kmp_set_disp_num_buffers 890
omp_pause_resource_all 757
omp_get_supported_active_levels 758
omp_fulfill_event 759
+ omp_set_num_teams 802
+ omp_get_max_teams 803
+ omp_set_teams_thread_limit 804
+ omp_get_teams_thread_limit 805
omp_display_env 733
omp_calloc 776
omp_realloc 777
@@ -550,12 +558,20 @@ kmp_set_disp_num_buffers 890
omp_cgroup_mem_alloc DATA
omp_pteam_mem_alloc DATA
omp_thread_mem_alloc DATA
+ # Preview of target memory support
+ llvm_omp_target_host_mem_alloc DATA
+ llvm_omp_target_shared_mem_alloc DATA
+ llvm_omp_target_device_mem_alloc DATA
omp_default_mem_space DATA
omp_large_cap_mem_space DATA
omp_const_mem_space DATA
omp_high_bw_mem_space DATA
omp_low_lat_mem_space DATA
+ # Preview of target memory support
+ llvm_omp_target_host_mem_space DATA
+ llvm_omp_target_shared_mem_space DATA
+ llvm_omp_target_device_mem_space DATA
%ifndef stub
# Ordinals between 900 and 999 are reserved
@@ -776,7 +792,9 @@ kmp_set_disp_num_buffers 890
%endif
+ # These are specific to x86 and x64
%ifndef arch_64
+ %ifndef arch_aarch64
# ATOMIC extensions for OpenMP 3.1 spec (x86 and x64 only)
@@ -1180,6 +1198,7 @@ kmp_set_disp_num_buffers 890
__kmpc_atomic_float10_div_cpt_rev_fp
%endif
+ %endif # arch_aarch64
%endif # arch_64
%ifdef HAVE_QUAD
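
The new __kmpc_masked/__kmpc_end_masked ordinals above back the OpenMP 5.1
masked construct. A minimal user-level sketch (not part of this patch) of
code that a 5.1-capable compiler lowers onto those entry points:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      #pragma omp parallel num_threads(4)
      {
        /* Only the thread selected by filter() executes the block; the
           compiler brackets it with __kmpc_masked()/__kmpc_end_masked(). */
        #pragma omp masked filter(0)
        printf("masked region run by thread %d\n", omp_get_thread_num());
      }
      return 0;
    }
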
diff --git a/openmp/runtime/src/exports_so.txt b/openmp/runtime/src/exports_so.txt
index 30222418163d..cb79ae72e67b 100644
--- a/openmp/runtime/src/exports_so.txt
+++ b/openmp/runtime/src/exports_so.txt
@@ -27,9 +27,6 @@ VERSION {
#
ompt_start_tool; # OMPT start interface
- # icc drops weak attribute at linking step without the following line:
- Annotate*; # TSAN annotation
-
ompc_*; # omp.h renames some standard functions to ompc_*.
kmp_*; # Intel extensions.
kmpc_*; # Intel extensions.
@@ -107,6 +104,8 @@ OMP_4.0 {
} OMP_3.1;
OMP_4.5 {
} OMP_4.0;
+OMP_5.0 {
+} OMP_4.5;
# sets up GCC GOMP_ version dependency chain
GOMP_1.0 {
diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt
index 26f4cf572dab..0b5436fd5801 100644
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@@ -103,6 +103,30 @@ DisplayEnvEnd "OPENMP DISPLAY ENVIRONMENT END"
Device "[device]"
Host "[host]"
Tile "tile"
+Tiles "tiles"
+Threads "threads"
+Cores "cores"
+Socket "socket"
+Sockets "sockets"
+Die "die"
+Dice "dice"
+Module "module"
+Modules "modules"
+L1Cache "L1 cache"
+L1Caches "L1 caches"
+L2Cache "L2 cache"
+L2Caches "L2 caches"
+L3Cache "L3 cache"
+L3Caches "L3 caches"
+NumaDomain "NUMA domain"
+NumaDomains "NUMA domains"
+ProcGroup "processor group"
+ProcGroups "processor groups"
+Unknown "unknown"
+NoLeaf31Support "cpuid leaf 31 not supported"
+HwlocFailed "Hwloc api failure"
+LLCache "LL cache"
+LLCaches "LL caches"
@@ -117,7 +141,7 @@ SysErr "OMP: System error #%1$d: %2$s\n"
Hint "OMP: Hint %1$s\n"
Pragma "%1$s pragma (at %2$s:%3$s():%4$s)"
- # %1 is pragma name (like "parallel" or "master",
+ # %1 is pragma name (like "parallel" or "masked",
# %2 is file name,
# %3 is function (routine) name,
# %4 is the line number (as string, so "s" type specifier should be used).
@@ -333,6 +357,8 @@ TopologyExtraNoTi "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/
OmptOutdatedWorkshare "OMPT: Cannot determine workshare type; using the default (loop) instead. "
"This issue is fixed in an up-to-date compiler."
OmpNoAllocator "Allocator %1$s is not available, will use default allocator."
+TopologyGeneric "%1$s: %2$s (%3$d total cores)"
+AffGranularityBad "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead."
# --- OpenMP errors detected at runtime ---
#
@@ -362,8 +388,8 @@ StaticLibNotSupport "Static %1$s does not support %2$s. Continuing with
OBSOLETE "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0"
IttUnknownGroup "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\"."
IttEnvVarTooLong "ittnotify: Environment variable \"%1$s\" too long: Actual lengths is %2$lu, max allowed length is %3$lu."
-AffUseGlobCpuidL11 "%1$s: Affinity capable, using global cpuid leaf 11 info"
-AffNotCapableUseLocCpuidL11 "%1$s: Affinity not capable, using local cpuid leaf 11 info"
+OBSOLETE "%1$s: Affinity capable, using global cpuid leaf 11 info"
+OBSOLETE "%1$s: Affinity not capable, using local cpuid leaf 11 info"
AffInfoStr "%1$s: %2$s."
AffInfoStrStr "%1$s: %2$s - %3$s."
OSProcToPhysicalThreadMap "%1$s: OS proc to physical thread map:"
@@ -429,6 +455,18 @@ HierSchedInvalid "Hierarchy ignored: unsupported level: %1$s."
AffFormatDefault "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}"
APIDeprecated "%1$s routine deprecated, please use %2$s instead."
GompFeatureNotSupported "libgomp compatibility layer does not support OpenMP feature: %1$s"
+AffHWSubsetManyDies "KMP_HW_SUBSET ignored: too many Dies requested."
+AffUseGlobCpuidL "%1$s: Affinity capable, using global cpuid leaf %2$d info"
+AffNotCapableUseLocCpuidL "%1$s: Affinity not capable, using local cpuid leaf %2$d info"
+AffNotUsingHwloc "%1$s: Affinity not capable, using hwloc."
+UserDirectedError "%1$s: Encountered user-directed error: %2$s."
+UserDirectedWarning "%1$s: Encountered user-directed warning: %2$s."
+FailedToCreateTeam "Failed to create teams between lower bound (%1$d) and upper bound (%2$d)."
+AffHWSubsetManyGeneric "KMP_HW_SUBSET ignored: %1$s: too many requested."
+AffHWSubsetNotExistGeneric "KMP_HW_SUBSET ignored: %1$s: level not detected in machine topology."
+AffHWSubsetEqvLayers "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one."
+AffHWSubsetOutOfOrder "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s."
+AffEqualTopologyTypes "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\"."
# --------------------------------------------------------------------------------------------------
-*- HINTS -*-
@@ -486,7 +524,7 @@ BadExeFormat "System error #193 is \"Bad format of EXE or DLL fi
"a file for another architecture. "
"Check whether \"%1$s\" is a file for %2$s architecture."
SystemLimitOnThreads "System-related limit on the number of threads."
-
+SetNewBound "Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause."
# --------------------------------------------------------------------------------------------------
diff --git a/openmp/runtime/src/include/omp-tools.h.var b/openmp/runtime/src/include/omp-tools.h.var
index 961e767c63c9..5092174d66ef 100644
--- a/openmp/runtime/src/include/omp-tools.h.var
+++ b/openmp/runtime/src/include/omp-tools.h.var
@@ -483,6 +483,8 @@ typedef enum ompd_rc_t {
ompd_rc_device_read_error = 8,
ompd_rc_device_write_error = 9,
ompd_rc_nomem = 10,
+ ompd_rc_incomplete = 11,
+ ompd_rc_callback_error = 12
} ompd_rc_t;
typedef void (*ompt_interface_fn_t) (void);
@@ -1099,6 +1101,13 @@ typedef void (*ompt_callback_error_t) (
const void *codeptr_ra
);
+typedef struct ompt_record_error_t {
+ ompt_severity_t severity;
+ const char *message;
+ size_t length;
+ const void *codeptr_ra;
+} ompt_record_error_t;
+
typedef struct ompd_address_t {
ompd_seg_t segment;
ompd_addr_t address;
@@ -1126,6 +1135,198 @@ typedef struct ompd_device_type_sizes_t {
uint8_t sizeof_pointer;
} ompd_device_type_sizes_t;
+void ompd_dll_locations_valid(void);
+
+typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)(ompd_size_t nbytes,
+ void **ptr);
+
+typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)(void *ptr);
+
+typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)(
+ ompd_address_space_context_t *address_space_context, ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id, const void *thread_id,
+ ompd_thread_context_t **thread_context);
+
+typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_device_type_sizes_t *sizes);
+
+typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const char *symbol_name,
+ ompd_address_t *symbol_addr, const char *file_name);
+
+typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+ ompd_size_t nbytes, void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)(
+ ompd_address_space_context_t *address_space_context,
+ ompd_thread_context_t *thread_context, const ompd_address_t *addr,
+ ompd_size_t nbytes, const void *buffer);
+
+typedef ompd_rc_t (*ompd_callback_device_host_fn_t)(
+ ompd_address_space_context_t *address_space_context, const void *input,
+ ompd_size_t unit_size, ompd_size_t count, void *output);
+
+typedef ompd_rc_t (*ompd_callback_print_string_fn_t)(const char *string,
+ int category);
+
+typedef struct ompd_callbacks_t {
+ ompd_callback_memory_alloc_fn_t alloc_memory;
+ ompd_callback_memory_free_fn_t free_memory;
+ ompd_callback_print_string_fn_t print_string;
+ ompd_callback_sizeof_fn_t sizeof_type;
+ ompd_callback_symbol_addr_fn_t symbol_addr_lookup;
+ ompd_callback_memory_read_fn_t read_memory;
+ ompd_callback_memory_write_fn_t write_memory;
+ ompd_callback_memory_read_fn_t read_string;
+ ompd_callback_device_host_fn_t device_to_host;
+ ompd_callback_device_host_fn_t host_to_device;
+ ompd_callback_get_thread_context_for_thread_id_fn_t
+ get_thread_context_for_thread_id;
+} ompd_callbacks_t;
+
+void ompd_bp_parallel_begin(void);
+
+void ompd_bp_parallel_end(void);
+
+void ompd_bp_task_begin(void);
+
+void ompd_bp_task_end(void);
+
+void ompd_bp_thread_begin(void);
+
+void ompd_bp_thread_end(void);
+
+void ompd_bp_device_begin(void);
+
+void ompd_bp_device_end(void);
+
+ompd_rc_t ompd_initialize(ompd_word_t api_version,
+ const ompd_callbacks_t *callbacks);
+
+ompd_rc_t ompd_get_api_version(ompd_word_t *version);
+
+ompd_rc_t ompd_get_version_string(const char **string);
+
+ompd_rc_t ompd_finalize(void);
+
+ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context,
+ ompd_address_space_handle_t **handle);
+
+ompd_rc_t ompd_device_initialize(ompd_address_space_handle_t *process_handle,
+ ompd_address_space_context_t *device_context,
+ ompd_device_t kind, ompd_size_t sizeof_id,
+ void *id,
+ ompd_address_space_handle_t **device_handle);
+
+ompd_rc_t ompd_rel_address_space_handle(ompd_address_space_handle_t *handle);
+
+ompd_rc_t ompd_get_omp_version(ompd_address_space_handle_t *address_space,
+ ompd_word_t *omp_version);
+
+ompd_rc_t
+ompd_get_omp_version_string(ompd_address_space_handle_t *address_space,
+ const char **string);
+
+ompd_rc_t ompd_get_thread_in_parallel(ompd_parallel_handle_t *parallel_handle,
+ int thread_num,
+ ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *handle,
+ ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id,
+ const void *thread_id,
+ ompd_thread_handle_t **thread_handle);
+
+ompd_rc_t ompd_rel_thread_handle(ompd_thread_handle_t *thread_handle);
+
+ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1,
+ ompd_thread_handle_t *thread_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_thread_id(ompd_thread_handle_t *thread_handle,
+ ompd_thread_id_t kind,
+ ompd_size_t sizeof_thread_id, void *thread_id);
+
+ompd_rc_t
+ompd_get_curr_parallel_handle(ompd_thread_handle_t *thread_handle,
+ ompd_parallel_handle_t **parallel_handle);
+
+ompd_rc_t ompd_get_enclosing_parallel_handle(
+ ompd_parallel_handle_t *parallel_handle,
+ ompd_parallel_handle_t **enclosing_parallel_handle);
+
+ompd_rc_t
+ompd_get_task_parallel_handle(ompd_task_handle_t *task_handle,
+ ompd_parallel_handle_t **task_parallel_handle);
+
+ompd_rc_t ompd_rel_parallel_handle(ompd_parallel_handle_t *parallel_handle);
+
+ompd_rc_t
+ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1,
+ ompd_parallel_handle_t *parallel_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_curr_task_handle(ompd_thread_handle_t *thread_handle,
+ ompd_task_handle_t **task_handle);
+
+ompd_rc_t
+ompd_get_generating_task_handle(ompd_task_handle_t *task_handle,
+ ompd_task_handle_t **generating_task_handle);
+
+ompd_rc_t
+ompd_get_scheduling_task_handle(ompd_task_handle_t *task_handle,
+ ompd_task_handle_t **scheduling_task_handle);
+
+ompd_rc_t ompd_get_task_in_parallel(ompd_parallel_handle_t *parallel_handle,
+ int thread_num,
+ ompd_task_handle_t **task_handle);
+
+ompd_rc_t ompd_rel_task_handle(ompd_task_handle_t *task_handle);
+
+ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1,
+ ompd_task_handle_t *task_handle_2,
+ int *cmp_value);
+
+ompd_rc_t ompd_get_task_function(ompd_task_handle_t *task_handle,
+ ompd_address_t *entry_point);
+
+ompd_rc_t ompd_get_task_frame(ompd_task_handle_t *task_handle,
+ ompd_frame_info_t *exit_frame,
+ ompd_frame_info_t *enter_frame);
+
+ompd_rc_t
+ompd_enumerate_states(ompd_address_space_handle_t *address_space_handle,
+ ompd_word_t current_state, ompd_word_t *next_state,
+ const char **next_state_name, ompd_word_t *more_enums);
+
+ompd_rc_t ompd_get_state(ompd_thread_handle_t *thread_handle,
+ ompd_word_t *state, ompd_wait_id_t *wait_id);
+
+ompd_rc_t
+ompd_get_display_control_vars(ompd_address_space_handle_t *address_space_handle,
+ const char *const **control_vars);
+
+ompd_rc_t ompd_rel_display_control_vars(const char *const **control_vars);
+
+ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle,
+ ompd_icv_id_t current, ompd_icv_id_t *next_id,
+ const char **next_icv_name,
+ ompd_scope_t *next_scope, int *more);
+
+ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope,
+ ompd_icv_id_t icv_id, ompd_word_t *icv_value);
+
+ompd_rc_t ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope,
+ ompd_icv_id_t icv_id,
+ const char **icv_string);
+
+ompd_rc_t ompd_get_tool_data(void *handle, ompd_scope_t scope,
+ ompd_word_t *value, ompd_address_t *ptr);
+
typedef struct ompt_record_ompt_t {
ompt_callbacks_t type;
ompt_device_time_t time;
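
For orientation, a hedged sketch of how a third-party debugger might populate
the ompd_callbacks_t table declared above and call ompd_initialize(). The
my_* helpers are hypothetical stand-ins for a real tool's services, the
api_version value is an assumed placeholder, and ompd_rc_ok is assumed from
the portion of the enum not shown in this hunk:

    #include <stdlib.h>
    #include <stdio.h>
    #include <omp-tools.h>

    static ompd_rc_t my_alloc(ompd_size_t nbytes, void **ptr) {
      *ptr = malloc(nbytes);
      return *ptr ? ompd_rc_ok : ompd_rc_nomem; /* ompd_rc_nomem added above */
    }
    static ompd_rc_t my_free(void *ptr) { free(ptr); return ompd_rc_ok; }
    static ompd_rc_t my_print(const char *s, int category) {
      (void)category;
      fputs(s, stderr);
      return ompd_rc_ok;
    }

    ompd_rc_t tool_attach(void) {
      ompd_callbacks_t cb = {0};  /* remaining callbacks elided for brevity */
      cb.alloc_memory = my_alloc;
      cb.free_memory = my_free;
      cb.print_string = my_print;
      return ompd_initialize(201811, &cb); /* version value: placeholder */
    }
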
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 4d055f905bcb..588c52b02a8f 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -141,12 +141,12 @@
extern int __KAI_KMPC_CONVENTION omp_get_initial_device (void);
extern void* __KAI_KMPC_CONVENTION omp_target_alloc(size_t, int);
extern void __KAI_KMPC_CONVENTION omp_target_free(void *, int);
- extern int __KAI_KMPC_CONVENTION omp_target_is_present(void *, int);
- extern int __KAI_KMPC_CONVENTION omp_target_memcpy(void *, void *, size_t, size_t, size_t, int, int);
- extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect(void *, void *, size_t, int, const size_t *,
+ extern int __KAI_KMPC_CONVENTION omp_target_is_present(const void *, int);
+ extern int __KAI_KMPC_CONVENTION omp_target_memcpy(void *, const void *, size_t, size_t, size_t, int, int);
+ extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int);
- extern int __KAI_KMPC_CONVENTION omp_target_associate_ptr(void *, void *, size_t, size_t, int);
- extern int __KAI_KMPC_CONVENTION omp_target_disassociate_ptr(void *, int);
+ extern int __KAI_KMPC_CONVENTION omp_target_associate_ptr(const void *, const void *, size_t, size_t, int);
+ extern int __KAI_KMPC_CONVENTION omp_target_disassociate_ptr(const void *, int);
/* OpenMP 5.0 */
extern int __KAI_KMPC_CONVENTION omp_get_device_num (void);
@@ -182,6 +182,16 @@
omp_irc_other = -6
} omp_interop_rc_t;
+ typedef enum omp_interop_fr {
+ omp_ifr_cuda = 1,
+ omp_ifr_cuda_driver = 2,
+ omp_ifr_opencl = 3,
+ omp_ifr_sycl = 4,
+ omp_ifr_hip = 5,
+ omp_ifr_level_zero = 6,
+ omp_ifr_last = 7
+ } omp_interop_fr_t;
+
typedef void * omp_interop_t;
/*!
@@ -211,7 +221,7 @@
/*!
* The `omp_get_interop_rc_desc` routine retrieves a description of the return code associated with an `omp_interop_t` object.
*/
- extern const char * __KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_rc_t, omp_interop_rc_t);
+ extern const char * __KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
/* OpenMP 5.1 device memory routines */
@@ -230,6 +240,7 @@
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
extern void * __KAI_KMPC_CONVENTION omp_get_mapped_ptr(const void *, int);
+ extern int __KAI_KMPC_CONVENTION omp_target_is_accessible(const void *, size_t, int);
/* kmp API functions */
extern int __KAI_KMPC_CONVENTION kmp_get_stacksize (void);
@@ -357,12 +368,21 @@
extern __KMP_IMP omp_allocator_handle_t const omp_cgroup_mem_alloc;
extern __KMP_IMP omp_allocator_handle_t const omp_pteam_mem_alloc;
extern __KMP_IMP omp_allocator_handle_t const omp_thread_mem_alloc;
+ /* Preview of target memory support */
+ extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_host_mem_alloc;
+ extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc;
+ extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_device_mem_alloc;
+
typedef omp_uintptr_t omp_memspace_handle_t;
extern __KMP_IMP omp_memspace_handle_t const omp_default_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_large_cap_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
+ /* Preview of target memory support */
+ extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
+ extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
+ extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
# else
# if __cplusplus >= 201103
typedef enum omp_allocator_handle_t : omp_uintptr_t
@@ -379,6 +399,10 @@
omp_cgroup_mem_alloc = 6,
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
+ /* Preview of target memory support */
+ llvm_omp_target_host_mem_alloc = 100,
+ llvm_omp_target_shared_mem_alloc = 101,
+ llvm_omp_target_device_mem_alloc = 102,
KMP_ALLOCATOR_MAX_HANDLE = UINTPTR_MAX
} omp_allocator_handle_t;
# if __cplusplus >= 201103
@@ -392,6 +416,10 @@
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ /* Preview of target memory support */
+ llvm_omp_target_host_mem_space = 100,
+ llvm_omp_target_shared_mem_space = 101,
+ llvm_omp_target_device_mem_space = 102,
KMP_MEMSPACE_MAX_HANDLE = UINTPTR_MAX
} omp_memspace_handle_t;
# endif
@@ -442,9 +470,24 @@
extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void);
+ /* OpenMP 5.1 */
+ extern void __KAI_KMPC_CONVENTION omp_set_num_teams(int num_teams);
+ extern int __KAI_KMPC_CONVENTION omp_get_max_teams(void);
+ extern void __KAI_KMPC_CONVENTION omp_set_teams_thread_limit(int limit);
+ extern int __KAI_KMPC_CONVENTION omp_get_teams_thread_limit(void);
+
/* OpenMP 5.1 Display Environment */
extern void omp_display_env(int verbose);
+# if defined(_OPENMP) && _OPENMP >= 201811
+ #pragma omp begin declare variant match(device={kind(host)})
+ static inline int omp_is_initial_device(void) { return 1; }
+ #pragma omp end declare variant
+ #pragma omp begin declare variant match(device={kind(nohost)})
+ static inline int omp_is_initial_device(void) { return 0; }
+ #pragma omp end declare variant
+# endif
+
# undef __KAI_KMPC_CONVENTION
# undef __KMP_IMP
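
A short usage sketch (not part of the patch) for the OpenMP 5.1 teams
routines and the preview target-memory allocator this header declares.
llvm_omp_target_host_mem_alloc is an LLVM preview extension, so the
allocation may fail or fall back where no supporting device runtime exists:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      omp_set_num_teams(4);           /* ICV bound for later teams regions */
      omp_set_teams_thread_limit(8);  /* per-team thread limit ICV */
      printf("max teams: %d, teams thread limit: %d\n",
             omp_get_max_teams(), omp_get_teams_thread_limit());

      /* Preview allocator: host memory accessible from target regions. */
      int *buf = (int *)omp_alloc(64 * sizeof(int),
                                  llvm_omp_target_host_mem_alloc);
      if (buf) /* may be NULL if the preview allocator is unsupported */
        omp_free(buf, llvm_omp_target_host_mem_alloc);
      return 0;
    }
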
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var
index 2fc8d7c3daa4..48622e2154c0 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.f90.var
@@ -34,6 +34,8 @@
integer, parameter :: omp_memspace_handle_kind = c_intptr_t
integer, parameter :: omp_alloctrait_key_kind = omp_integer_kind
integer, parameter :: omp_alloctrait_val_kind = c_intptr_t
+ integer, parameter :: omp_interop_kind = c_intptr_t
+ integer, parameter :: omp_interop_fr_kind = omp_integer_kind
type omp_alloctrait
integer(kind=omp_alloctrait_key_kind) key
@@ -137,17 +139,35 @@
integer (kind=omp_allocator_handle_kind), parameter :: omp_cgroup_mem_alloc = 6
integer (kind=omp_allocator_handle_kind), parameter :: omp_pteam_mem_alloc = 7
integer (kind=omp_allocator_handle_kind), parameter :: omp_thread_mem_alloc = 8
+ ! Preview of target memory support
+ integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_host_mem_alloc = 100
+ integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_shared_mem_alloc = 101
+ integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_device_mem_alloc = 102
integer (kind=omp_memspace_handle_kind), parameter :: omp_default_mem_space = 0
integer (kind=omp_memspace_handle_kind), parameter :: omp_large_cap_mem_space = 1
integer (kind=omp_memspace_handle_kind), parameter :: omp_const_mem_space = 2
integer (kind=omp_memspace_handle_kind), parameter :: omp_high_bw_mem_space = 3
integer (kind=omp_memspace_handle_kind), parameter :: omp_low_lat_mem_space = 4
+ ! Preview of target memory support
+ integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_host_mem_space = 100
+ integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_shared_mem_space = 101
+ integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_device_mem_space = 102
integer (kind=omp_pause_resource_kind), parameter :: omp_pause_resume = 0
integer (kind=omp_pause_resource_kind), parameter :: omp_pause_soft = 1
integer (kind=omp_pause_resource_kind), parameter :: omp_pause_hard = 2
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_cuda = 1
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_cuda_driver = 2
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_opencl = 3
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_sycl = 4
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_hip = 5
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_level_zero = 6
+ integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_last = 7
+
+ integer (kind=omp_interop_kind), parameter :: omp_interop_none = 0
+
interface
! ***
@@ -504,6 +524,26 @@
integer (kind=kmp_size_t_kind) :: omp_capture_affinity
end function omp_capture_affinity
+ subroutine omp_set_num_teams(num_teams) bind(c)
+ use omp_lib_kinds
+ integer (kind=omp_integer_kind), value :: num_teams
+ end subroutine omp_set_num_teams
+
+ function omp_get_max_teams() bind(c)
+ use omp_lib_kinds
+ integer (kind=omp_integer_kind) omp_get_max_teams
+ end function omp_get_max_teams
+
+ subroutine omp_set_teams_thread_limit(thread_limit) bind(c)
+ use omp_lib_kinds
+ integer (kind=omp_integer_kind), value :: thread_limit
+ end subroutine omp_set_teams_thread_limit
+
+ function omp_get_teams_thread_limit() bind(c)
+ use omp_lib_kinds
+ integer (kind=omp_integer_kind) omp_get_teams_thread_limit
+ end function omp_get_teams_thread_limit
+
subroutine omp_display_env(verbose) bind(c)
use omp_lib_kinds
logical (kind=omp_logical_kind), value :: verbose
@@ -601,6 +641,15 @@
integer(c_int), value :: device_num
end function omp_target_disassociate_ptr
+ function omp_target_is_accessible(ptr, size, device_num) bind(c)
+ use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
+ integer(c_int) omp_target_is_accessible
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: size
+ integer(c_int), value :: device_num
+ end function omp_target_is_accessible
+
! ***
! *** kmp_* entry points
! ***
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 3fb2d25b15f1..9122fb26613e 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -52,6 +52,10 @@
parameter(omp_depend_kind=int_ptr_kind())
integer omp_event_handle_kind
parameter(omp_event_handle_kind=int_ptr_kind())
+ integer omp_interop_kind
+ parameter(omp_interop_kind=int_ptr_kind())
+ integer omp_interop_fr_kind
+ parameter(omp_interop_fr_kind=omp_integer_kind)
integer(kind=omp_integer_kind)openmp_version
parameter(openmp_version=@LIBOMP_OMP_YEAR_MONTH@)
@@ -214,6 +218,13 @@
parameter(omp_pteam_mem_alloc=7)
integer(kind=omp_allocator_handle_kind)omp_thread_mem_alloc
parameter(omp_thread_mem_alloc=8)
+ ! Preview of target memory support
+ integer(kind=omp_allocator_handle_kind)llvm_omp_target_host_mem_alloc
+ parameter(llvm_omp_target_host_mem_alloc=100)
+ integer(kind=omp_allocator_handle_kind)llvm_omp_target_shared_mem_alloc
+ parameter(llvm_omp_target_shared_mem_alloc=101)
+ integer(kind=omp_allocator_handle_kind)llvm_omp_target_device_mem_alloc
+ parameter(llvm_omp_target_device_mem_alloc=102)
integer(kind=omp_memspace_handle_kind)omp_default_mem_space
parameter(omp_default_mem_space=0)
@@ -225,6 +236,13 @@
parameter(omp_high_bw_mem_space=3)
integer(kind=omp_memspace_handle_kind)omp_low_lat_mem_space
parameter(omp_low_lat_mem_space=4)
+ ! Preview of target memory support
+ integer(kind=omp_memspace_handle_kind)llvm_omp_target_host_mem_space
+ parameter(llvm_omp_target_host_mem_space=100)
+ integer(kind=omp_memspace_handle_kind)llvm_omp_target_shared_mem_space
+ parameter(llvm_omp_target_shared_mem_space=101)
+ integer(kind=omp_memspace_handle_kind)llvm_omp_target_device_mem_space
+ parameter(llvm_omp_target_device_mem_space=102)
integer(kind=omp_pause_resource_kind)omp_pause_resume
parameter(omp_pause_resume=0)
@@ -233,6 +251,24 @@
integer(kind=omp_pause_resource_kind)omp_pause_hard
parameter(omp_pause_hard=2)
+ integer(kind=omp_interop_fr_kind)omp_ifr_cuda
+ parameter(omp_ifr_cuda=1)
+ integer(kind=omp_interop_fr_kind)omp_ifr_cuda_driver
+ parameter(omp_ifr_cuda_driver=2)
+ integer(kind=omp_interop_fr_kind)omp_ifr_opencl
+ parameter(omp_ifr_opencl=3)
+ integer(kind=omp_interop_fr_kind)omp_ifr_sycl
+ parameter(omp_ifr_sycl=4)
+ integer(kind=omp_interop_fr_kind)omp_ifr_hip
+ parameter(omp_ifr_hip=5)
+ integer(kind=omp_interop_fr_kind)omp_ifr_level_zero
+ parameter(omp_ifr_level_zero=6)
+ integer(kind=omp_interop_fr_kind)omp_ifr_last
+ parameter(omp_ifr_last=7)
+
+ integer(kind=omp_interop_kind)omp_interop_none
+ parameter(omp_interop_none=0)
+
interface
! ***
@@ -582,6 +618,26 @@
integer (kind=kmp_size_t_kind) :: omp_capture_affinity
end function omp_capture_affinity
+ subroutine omp_set_num_teams(num_teams) bind(c)
+ import
+ integer (kind=omp_integer_kind), value :: num_teams
+ end subroutine omp_set_num_teams
+
+ function omp_get_max_teams() bind(c)
+ import
+ integer (kind=omp_integer_kind) omp_get_max_teams
+ end function omp_get_max_teams
+
+ subroutine omp_set_teams_thread_limit(thread_limit) bind(c)
+ import
+ integer (kind=omp_integer_kind), value :: thread_limit
+ end subroutine omp_set_teams_thread_limit
+
+ function omp_get_teams_thread_limit() bind(c)
+ import
+ integer (kind=omp_integer_kind) omp_get_teams_thread_limit
+ end function omp_get_teams_thread_limit
+
subroutine omp_display_env(verbose) bind(c)
import
logical (kind=omp_logical_kind), value :: verbose
@@ -681,6 +737,14 @@
integer(c_int), value :: device_num
end function omp_target_disassociate_ptr
+ function omp_target_is_accessible(ptr, size, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
+ integer(c_int) omp_target_is_accessible
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: size
+ integer(c_int), value :: device_num
+ end function omp_target_is_accessible
+
! ***
! *** kmp_* entry points
! ***
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 87e91a0f8d10..05264f4433d3 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -138,6 +138,10 @@ typedef unsigned int kmp_hwloc_depth_t;
#include "ompt-internal.h"
#endif
+#if OMPD_SUPPORT
+#include "ompd-specific.h"
+#endif
+
#ifndef UNLIKELY
#define UNLIKELY(x) (x)
#endif
@@ -595,6 +599,35 @@ typedef int PACKED_REDUCTION_METHOD_T;
#include <pthread.h>
#endif
+enum kmp_hw_t : int {
+ KMP_HW_UNKNOWN = -1,
+ KMP_HW_SOCKET = 0,
+ KMP_HW_PROC_GROUP,
+ KMP_HW_NUMA,
+ KMP_HW_DIE,
+ KMP_HW_LLC,
+ KMP_HW_L3,
+ KMP_HW_TILE,
+ KMP_HW_MODULE,
+ KMP_HW_L2,
+ KMP_HW_L1,
+ KMP_HW_CORE,
+ KMP_HW_THREAD,
+ KMP_HW_LAST
+};
+
+#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \
+ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
+#define KMP_ASSERT_VALID_HW_TYPE(type) \
+ KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
+
+#define KMP_FOREACH_HW_TYPE(type) \
+ for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \
+ type = (kmp_hw_t)((int)type + 1))
+
+const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false);
+const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false);
+
/* Only Linux* OS and Windows* OS support thread affinity. */
#if KMP_AFFINITY_SUPPORTED
@@ -629,8 +662,6 @@ extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#if KMP_USE_HWLOC
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
-extern int __kmp_numa_detected;
-extern int __kmp_tile_depth;
#endif
extern size_t __kmp_affin_mask_size;
@@ -758,27 +789,12 @@ enum affinity_type {
affinity_default
};
-enum affinity_gran {
- affinity_gran_fine = 0,
- affinity_gran_thread,
- affinity_gran_core,
- affinity_gran_tile,
- affinity_gran_numa,
- affinity_gran_package,
- affinity_gran_node,
-#if KMP_GROUP_AFFINITY
- // The "group" granularity isn't necesssarily coarser than all of the
- // other levels, but we put it last in the enum.
- affinity_gran_group,
-#endif /* KMP_GROUP_AFFINITY */
- affinity_gran_default
-};
-
enum affinity_top_method {
affinity_top_method_all = 0, // try all (supported) methods, in order
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
affinity_top_method_apicid,
affinity_top_method_x2apicid,
+ affinity_top_method_x2apicid_1f,
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
#if KMP_GROUP_AFFINITY
@@ -794,7 +810,7 @@ enum affinity_top_method {
#define affinity_respect_mask_default (-1)
extern enum affinity_type __kmp_affinity_type; /* Affinity type */
-extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
+extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */
extern int __kmp_affinity_gran_levels; /* corresponding int value */
extern int __kmp_affinity_dups; /* Affinity duplicate masks */
extern enum affinity_top_method __kmp_affinity_top_method;
@@ -817,7 +833,7 @@ extern char *__kmp_cpuinfo_file;
typedef enum kmp_proc_bind_t {
proc_bind_false = 0,
proc_bind_true,
- proc_bind_master,
+ proc_bind_primary,
proc_bind_close,
proc_bind_spread,
proc_bind_intel, // use KMP_AFFINITY interface
@@ -835,6 +851,10 @@ extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
extern int __kmp_display_affinity;
extern char *__kmp_affinity_format;
static const size_t KMP_AFFINITY_FORMAT_SIZE = 512;
+#if OMPT_SUPPORT
+extern int __kmp_tool;
+extern char *__kmp_tool_libraries;
+#endif // OMPT_SUPPORT
#if KMP_AFFINITY_SUPPORTED
#define KMP_PLACE_ALL (-1)
@@ -863,6 +883,7 @@ typedef struct kmp_hws_item {
} kmp_hws_item_t;
extern kmp_hws_item_t __kmp_hws_socket;
+extern kmp_hws_item_t __kmp_hws_die;
extern kmp_hws_item_t __kmp_hws_node;
extern kmp_hws_item_t __kmp_hws_tile;
extern kmp_hws_item_t __kmp_hws_core;
@@ -929,6 +950,10 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
extern omp_memspace_handle_t const omp_const_mem_space;
extern omp_memspace_handle_t const omp_high_bw_mem_space;
extern omp_memspace_handle_t const omp_low_lat_mem_space;
+// Preview of target memory support
+extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
+extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
+extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
typedef struct {
omp_alloctrait_key_t key;
@@ -945,6 +970,10 @@ extern omp_allocator_handle_t const omp_low_lat_mem_alloc;
extern omp_allocator_handle_t const omp_cgroup_mem_alloc;
extern omp_allocator_handle_t const omp_pteam_mem_alloc;
extern omp_allocator_handle_t const omp_thread_mem_alloc;
+// Preview of target memory support
+extern omp_allocator_handle_t const llvm_omp_target_host_mem_alloc;
+extern omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc;
+extern omp_allocator_handle_t const llvm_omp_target_device_mem_alloc;
extern omp_allocator_handle_t const kmp_max_mem_alloc;
extern omp_allocator_handle_t __kmp_def_allocator;
@@ -982,6 +1011,7 @@ extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);
extern void __kmp_init_memkind();
extern void __kmp_fini_memkind();
+extern void __kmp_init_target_mem();
/* ------------------------------------------------------------------------ */
@@ -1046,13 +1076,11 @@ extern void __kmp_fini_memkind();
/* Calculate new number of monitor wakeups for a specific block time based on
previous monitor_wakeups. Only allow increasing number of wakeups */
#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \
- (((blocktime) == KMP_MAX_BLOCKTIME) \
+ (((blocktime) == KMP_MAX_BLOCKTIME) ? (monitor_wakeups) \
+ : ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS \
+ : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) \
? (monitor_wakeups) \
- : ((blocktime) == KMP_MIN_BLOCKTIME) \
- ? KMP_MAX_MONITOR_WAKEUPS \
- : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) \
- ? (monitor_wakeups) \
- : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))
+ : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))
/* Calculate number of intervals for a specific block time based on
monitor_wakeups */
@@ -1097,7 +1125,10 @@ extern kmp_uint64 __kmp_now_nsec();
#define KMP_MAX_CHUNK (INT_MAX - 1)
#define KMP_DEFAULT_CHUNK 1
+#define KMP_MIN_DISP_NUM_BUFF 1
#define KMP_DFLT_DISP_NUM_BUFF 7
+#define KMP_MAX_DISP_NUM_BUFF 4096
+
#define KMP_MAX_ORDERED 8
#define KMP_MAX_FIELDS 32
@@ -1182,7 +1213,6 @@ typedef struct kmp_cpuinfo {
int stepping; // CPUID(1).EAX[3:0] ( Stepping )
int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise.
int rtm; // 0 if RTM instructions are not supported, 1 otherwise.
- int cpu_stackoffset;
int apic_id;
int physical_id;
int logical_id;
@@ -1339,8 +1369,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
#endif
#endif // KMP_HAVE_WAITPKG_INTRINSICS
KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int
-__kmp_tpause(uint32_t hint, uint64_t counter) {
+static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
#if !KMP_HAVE_WAITPKG_INTRINSICS
uint32_t timeHi = uint32_t(counter >> 32);
uint32_t timeLo = uint32_t(counter & 0xffffffff);
@@ -1356,8 +1385,7 @@ __kmp_tpause(uint32_t hint, uint64_t counter) {
#endif
}
KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline void
-__kmp_umonitor(void *cacheline) {
+static inline void __kmp_umonitor(void *cacheline) {
#if !KMP_HAVE_WAITPKG_INTRINSICS
__asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
:
@@ -1368,8 +1396,7 @@ __kmp_umonitor(void *cacheline) {
#endif
}
KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int
-__kmp_umwait(uint32_t hint, uint64_t counter) {
+static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
#if !KMP_HAVE_WAITPKG_INTRINSICS
uint32_t timeHi = uint32_t(counter >> 32);
uint32_t timeLo = uint32_t(counter & 0xffffffff);
@@ -1422,7 +1449,8 @@ enum cons_type {
ct_ordered_in_pdo,
ct_master,
ct_reduce,
- ct_barrier
+ ct_barrier,
+ ct_masked
};
#define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered)
@@ -1570,7 +1598,7 @@ struct private_common {
struct private_common *next;
struct private_common *link;
void *gbl_addr;
- void *par_addr; /* par_addr == gbl_addr for MASTER thread */
+ void *par_addr; /* par_addr == gbl_addr for PRIMARY thread */
size_t cmn_size;
};
@@ -1645,14 +1673,12 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;
kmp_int32 st;
kmp_int32 tc;
- kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
- after ub */
- kmp_lock_t *th_steal_lock; // lock used for chunk stealing
- // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
+ kmp_lock_t *steal_lock; // lock used for chunk stealing
+ // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
- // b) all parm1-4 are in the same cache line.
+ // b) all parm1-4 are on the same cache line.
// Because of parm1-4 are used together, performance seems to be better
- // if they are in the same line (not measured though).
+ // if they are on the same cache line (not measured though).
struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
@@ -1664,9 +1690,6 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_uint32 ordered_lower;
kmp_uint32 ordered_upper;
#if KMP_OS_WINDOWS
- // This var can be placed in the hole between 'tc' and 'parm1', instead of
- // 'static_steal_counter'. It would be nice to measure execution times.
- // Conditional if/endif can be removed at all.
kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;
@@ -1678,9 +1701,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 lb; /* lower-bound */
kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */
- kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
- after ub */
- kmp_lock_t *th_steal_lock; // lock used for chunk stealing
+ kmp_lock_t *steal_lock; // lock used for chunk stealing
/* parm[1-4] are used in different ways by different scheduling algorithms */
// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
@@ -1699,9 +1720,6 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_uint64 ordered_lower;
kmp_uint64 ordered_upper;
#if KMP_OS_WINDOWS
- // This var can be placed in the hole between 'tc' and 'parm1', instead of
- // 'static_steal_counter'. It would be nice to measure execution times.
- // Conditional if/endif can be removed at all.
kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;
@@ -1755,9 +1773,8 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info {
} u;
enum sched_type schedule; /* scheduling algorithm */
kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+ std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
kmp_int32 ordered_bumped;
- // To retain the structure size after making ordered_iteration scalar
- kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
// Stack of buffers for nest of serial regions
struct dispatch_private_info *next;
kmp_int32 type_size; /* the size of types in private_info */
@@ -1772,7 +1789,7 @@ typedef struct dispatch_shared_info32 {
/* chunk index under dynamic, number of idle threads under static-steal;
iteration index otherwise */
volatile kmp_uint32 iteration;
- volatile kmp_uint32 num_done;
+ volatile kmp_int32 num_done;
volatile kmp_uint32 ordered_iteration;
// Dummy to retain the structure size after making ordered_iteration scalar
kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1];
@@ -1782,7 +1799,7 @@ typedef struct dispatch_shared_info64 {
/* chunk index under dynamic, number of idle threads under static-steal;
iteration index otherwise */
volatile kmp_uint64 iteration;
- volatile kmp_uint64 num_done;
+ volatile kmp_int64 num_done;
volatile kmp_uint64 ordered_iteration;
// Dummy to retain the structure size after making ordered_iteration scalar
kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3];
@@ -1818,7 +1835,7 @@ typedef struct kmp_disp {
dispatch_private_info_t *th_dispatch_pr_current;
dispatch_private_info_t *th_disp_buffer;
- kmp_int32 th_disp_index;
+ kmp_uint32 th_disp_index;
kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
kmp_int64 *th_doacross_info; // info on loop bounds
@@ -1882,9 +1899,8 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */
0, /* Single level (degenerate) tree */
bp_tree_bar =
1, /* Balanced tree with branching factor 2^n */
- bp_hyper_bar =
- 2, /* Hypercube-embedded tree with min branching
- factor 2^n */
+ bp_hyper_bar = 2, /* Hypercube-embedded tree with min
+ branching factor 2^n */
bp_hierarchical_bar = 3, /* Machine hierarchy tree */
bp_last_bar /* Placeholder to mark the end */
} kmp_bar_pat_e;
@@ -1969,9 +1985,9 @@ union KMP_ALIGN_CACHE kmp_barrier_team_union {
kmp_uint64 b_arrived; /* STATE => task reached synch point. */
#if USE_DEBUGGER
// The following two fields are indended for the debugger solely. Only
- // master of the team accesses these fields: the first one is increased by
- // 1 when master arrives to a barrier, the second one is increased by one
- // when all the threads arrived.
+ // primary thread of the team accesses these fields: the first one is
+ // increased by 1 when the primary thread arrives to a barrier, the second
+ // one is increased by one when all the threads arrived.
kmp_uint b_master_arrived;
kmp_uint b_team_arrived;
#endif
@@ -2217,6 +2233,7 @@ typedef struct kmp_taskgroup {
// Block of data to perform task reduction
void *reduce_data; // reduction related info
kmp_int32 reduce_num_data; // number of data items to reduce
+ uintptr_t *gomp_data; // gomp reduction data
} kmp_taskgroup_t;
// forward declarations
@@ -2224,15 +2241,24 @@ typedef union kmp_depnode kmp_depnode_t;
typedef struct kmp_depnode_list kmp_depnode_list_t;
typedef struct kmp_dephash_entry kmp_dephash_entry_t;
+#define KMP_DEP_IN 0x1
+#define KMP_DEP_OUT 0x2
+#define KMP_DEP_INOUT 0x3
+#define KMP_DEP_MTX 0x4
+#define KMP_DEP_SET 0x8
// Compiler sends us this info:
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
size_t len;
- struct {
- bool in : 1;
- bool out : 1;
- bool mtx : 1;
- } flags;
+ union {
+ kmp_uint8 flag;
+ struct {
+ unsigned in : 1;
+ unsigned out : 1;
+ unsigned mtx : 1;
+ unsigned set : 1;
+ } flags;
+ };
} kmp_depend_info_t;
// Internal structures to work with task dependencies:
@@ -2266,9 +2292,9 @@ union KMP_ALIGN_CACHE kmp_depnode {
struct kmp_dephash_entry {
kmp_intptr_t addr;
kmp_depnode_t *last_out;
- kmp_depnode_list_t *last_ins;
- kmp_depnode_list_t *last_mtxs;
- kmp_int32 last_flag;
+ kmp_depnode_list_t *last_set;
+ kmp_depnode_list_t *prev_set;
+ kmp_uint8 last_flag;
kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */
kmp_dephash_entry_t *next_in_bucket;
};
@@ -2501,7 +2527,7 @@ typedef struct kmp_teams_size {
// This struct stores a thread that acts as a "root" for a contention
// group. Contention groups are rooted at kmp_root threads, but also at
-// each master thread of each team created in the teams construct.
+// each primary thread of each team created in the teams construct.
// This struct therefore also stores a thread_limit associated with
// that contention group, and a counter to track the number of threads
// active in that contention group. Each thread has a list of these: CG
@@ -2513,7 +2539,7 @@ typedef struct kmp_teams_size {
typedef struct kmp_cg_root {
kmp_info_p *cg_root; // "root" thread for a contention group
// The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or
- // thread_limit clause for teams masters
+ // thread_limit clause for teams primary threads
kmp_int32 cg_thread_limit;
kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root
struct kmp_cg_root *up; // pointer to higher level CG root in list
@@ -2523,8 +2549,9 @@ typedef struct kmp_cg_root {
typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* Start with the readonly data which is cache aligned and padded. This is
- written before the thread starts working by the master. Uber masters may
- update themselves later. Usage does not consider serialized regions. */
+ written before the thread starts working by the primary thread. Uber
+ masters may update themselves later. Usage does not consider serialized
+ regions. */
kmp_desc_t th_info;
kmp_team_p *th_team; /* team we belong to */
kmp_root_p *th_root; /* pointer to root of task hierarchy */
@@ -2535,7 +2562,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* The following are cached from the team info structure */
/* TODO use these in more places as determined to be needed via profiling */
int th_team_nproc; /* number of threads in a team */
- kmp_info_p *th_team_master; /* the team's master thread */
+ kmp_info_p *th_team_master; /* the team's primary thread */
int th_team_serialized; /* team is serialized */
microtask_t th_teams_microtask; /* save entry address for teams construct */
int th_teams_level; /* save initial level of teams construct */
@@ -2556,7 +2583,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
#endif
omp_allocator_handle_t th_def_allocator; /* default allocator */
- /* The data set by the master at reinit, then R/W by the worker */
+ /* The data set by the primary thread at reinit, then R/W by the worker */
KMP_ALIGN_CACHE int
th_set_nproc; /* if > 0, then only use this request for the next fork */
#if KMP_NESTED_HOT_TEAMS
@@ -2592,7 +2619,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
ompt_thread_info_t ompt_thread_info;
#endif
- /* The following are also read by the master during reinit */
+ /* The following are also read by the primary thread during reinit */
struct common_table *th_pri_common;
volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */
@@ -2669,7 +2696,9 @@ typedef union KMP_ALIGN_CACHE kmp_info {
// OpenMP thread team data structures
-typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t;
+typedef struct kmp_base_data {
+ volatile kmp_uint32 t_value;
+} kmp_base_data_t;
typedef union KMP_ALIGN_CACHE kmp_sleep_team {
double dt_align; /* use worst case alignment */
@@ -2690,7 +2719,7 @@ typedef int (*launch_t)(int gtid);
// Set up how many argv pointers will fit in cache lines containing
// t_inline_argv. Historically, we have supported at least 96 bytes. Using a
-// larger value for more space between the master write/worker read section and
+// larger value for more space between the primary write/worker read section and
// read/write by all section seems to buy more performance on EPCC PARALLEL.
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_INLINE_ARGV_BYTES \
@@ -2716,11 +2745,11 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions
- // Master only
+ // Primary thread only
// ---------------------------------------------------------------------------
- KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
- int t_master_this_cons; // "this_construct" single counter of master in parent
- // team
+ KMP_ALIGN_CACHE int t_master_tid; // tid of primary thread in parent team
+ int t_master_this_cons; // "this_construct" single counter of primary thread
+ // in parent team
ident_t *t_ident; // if volatile, have to change too much other crud to
// volatile too
kmp_team_p *t_parent; // parent team
@@ -2732,7 +2761,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
kmp_uint64 t_region_time; // region begin timestamp
#endif /* USE_ITT_BUILD */
- // Master write, workers read
+ // Primary thread write, workers read
// --------------------------------------------------------------------------
KMP_ALIGN_CACHE void **t_argv;
int t_argc;
@@ -2768,7 +2797,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
kmp_r_sched_t t_sched; // run-time schedule for the team
#if KMP_AFFINITY_SUPPORTED
int t_first_place; // first & last place in parent thread's partition.
- int t_last_place; // Restore these values to master after par region.
+ int t_last_place; // Restore these values to primary thread after par region.
#endif // KMP_AFFINITY_SUPPORTED
int t_display_affinity;
int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via
@@ -2843,6 +2872,9 @@ typedef struct kmp_base_root {
kmp_lock_t r_begin_lock;
volatile int r_begin;
int r_blocktime; /* blocktime for this root and descendants */
+#if KMP_AFFINITY_SUPPORTED
+ int r_affinity_assigned;
+#endif // KMP_AFFINITY_SUPPORTED
} kmp_base_root_t;
typedef union KMP_ALIGN_CACHE kmp_root {
@@ -2975,6 +3007,7 @@ extern enum sched_type __kmp_static; /* default static scheduling method */
extern enum sched_type __kmp_guided; /* default guided scheduling method */
extern enum sched_type __kmp_auto; /* default auto scheduling method */
extern int __kmp_chunk; /* default runtime chunk size */
+extern int __kmp_force_monotonic; /* whether monotonic scheduling forced */
extern size_t __kmp_stksize; /* stack size per thread */
#if KMP_USE_MONITOR
@@ -3043,9 +3076,8 @@ extern int __kmp_ncores; /* Total number of cores for threads placement */
extern int __kmp_abort_delay;
extern int __kmp_need_register_atfork_specified;
-extern int
- __kmp_need_register_atfork; /* At initialization, call pthread_atfork to
- install fork handler */
+extern int __kmp_need_register_atfork; /* At initialization, call pthread_atfork
+ to install fork handler */
extern int __kmp_gtid_mode; /* Method of getting gtid, values:
0 - not set, will be set at runtime
1 - using stack search
@@ -3123,6 +3155,8 @@ extern const char *__kmp_speculative_statsfile;
extern int __kmp_display_env; /* TRUE or FALSE */
extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
extern int __kmp_omp_cancellation; /* TRUE or FALSE */
+extern int __kmp_nteams;
+extern int __kmp_teams_thread_limit;
/* ------------------------------------------------------------------------- */
@@ -3331,6 +3365,8 @@ extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
kmp_proc_bind_t proc_bind);
extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams,
int num_threads);
+extern void __kmp_push_num_teams_51(ident_t *loc, int gtid, int num_teams_lb,
+ int num_teams_ub, int num_threads);
extern void __kmp_yield();
@@ -3409,7 +3445,7 @@ extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag,
,
void *itt_sync_obj
#endif
- );
+);
extern void __kmp_release_64(kmp_flag_64<> *flag);
extern void __kmp_infinite_loop(void);
@@ -3430,7 +3466,7 @@ extern void __kmp_check_stack_overlap(kmp_info_t *thr);
extern void __kmp_expand_host_name(char *buffer, size_t size);
extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && KMP_ARCH_AARCH64)
extern void
__kmp_initialize_system_tick(void); /* Initialize timer tick value */
#endif
@@ -3460,6 +3496,16 @@ extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
#if KMP_OS_LINUX || KMP_OS_FREEBSD
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
+static inline void __kmp_assign_root_init_mask() {
+ int gtid = __kmp_entry_gtid();
+ kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
+ if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
+ __kmp_affinity_set_init_mask(gtid, TRUE);
+ r->r.r_affinity_assigned = TRUE;
+ }
+}
+#else /* KMP_AFFINITY_SUPPORTED */
+#define __kmp_assign_root_init_mask() /* Nothing */
#endif /* KMP_AFFINITY_SUPPORTED */
// No need for KMP_AFFINITY_SUPPORTED guard as only one field in the
// format string is for affinity, so platforms that do not support
@@ -3667,7 +3713,7 @@ extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc,
,
void **exit_frame_ptr
#endif
- );
+);
/* ------------------------------------------------------------------------ */
@@ -3701,6 +3747,9 @@ KMP_EXPORT void __kmpc_flush(ident_t *);
KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid);
KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT kmp_int32 __kmpc_masked(ident_t *, kmp_int32 global_tid,
+ kmp_int32 filter);
+KMP_EXPORT void __kmpc_end_masked(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid,
@@ -3744,12 +3793,9 @@ KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
size_t sizeof_kmp_task_t,
size_t sizeof_shareds,
kmp_routine_entry_t task_entry);
-KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
- kmp_int32 flags,
- size_t sizeof_kmp_task_t,
- size_t sizeof_shareds,
- kmp_routine_entry_t task_entry,
- kmp_int64 device_id);
+KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(
+ ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t,
+ size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id);
KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task);
KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
@@ -3817,6 +3863,10 @@ KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid,
KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity(
ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins,
kmp_task_affinity_info_t *affin_list);
+KMP_EXPORT void __kmp_set_num_teams(int num_teams);
+KMP_EXPORT int __kmp_get_max_teams(void);
+KMP_EXPORT void __kmp_set_teams_thread_limit(int limit);
+KMP_EXPORT int __kmp_get_teams_thread_limit(void);
/* Lock interface routines (fast versions with gtid passed in) */
KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid,
@@ -3885,6 +3935,11 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams,
kmp_int32 num_threads);
+/* Function for OpenMP 5.1 num_teams clause */
+KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_teams_lb,
+ kmp_int32 num_teams_ub,
+ kmp_int32 num_threads);
KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc,
kmpc_micro microtask, ...);
struct kmp_dim { // loop bounds info casted to kmp_int64
@@ -3955,6 +4010,11 @@ KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t);
KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int);
KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *);
KMP_EXPORT void KMPC_CONVENTION kmpc_set_disp_num_buffers(int);
+void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format);
+size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size);
+void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format);
+size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
+ char const *format);
enum kmp_target_offload_kind {
tgt_disabled = 0,
@@ -4031,11 +4091,33 @@ extern void __kmp_hidden_helper_main_thread_release();
#define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid) \
((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+#define KMP_HIDDEN_HELPER_TEAM(team) \
+ (team->t.t_threads[0] == __kmp_hidden_helper_main_thread)
+
// Map a gtid to a hidden helper thread. The first hidden helper thread,
// a.k.a. the main thread, is skipped.
#define KMP_GTID_TO_SHADOW_GTID(gtid) \
((gtid) % (__kmp_hidden_helper_threads_num - 1) + 2)
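
// A worked example of the mapping above (hypothetical values, not part of
// this patch): with __kmp_hidden_helper_threads_num == 8, gtid 1 is the
// hidden helper main thread and gtids 2..8 are the workers, so incoming
// gtids are spread over the workers round-robin:
//   KMP_GTID_TO_SHADOW_GTID(16) -> 16 % (8 - 1) + 2 == 4
//   KMP_GTID_TO_SHADOW_GTID(17) -> 17 % (8 - 1) + 2 == 5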
+// Return the adjusted gtid value by subtracting from gtid the number
+// of hidden helper threads. This adjusted value is the gtid the thread would
+// have received if there were no hidden helper threads.
+static inline int __kmp_adjust_gtid_for_hidden_helpers(int gtid) {
+ int adjusted_gtid = gtid;
+ if (__kmp_hidden_helper_threads_num > 0 && gtid > 0 &&
+ gtid - __kmp_hidden_helper_threads_num >= 0) {
+ adjusted_gtid -= __kmp_hidden_helper_threads_num;
+ }
+ return adjusted_gtid;
+}
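+
+// Example (same hypothetical setup): with 8 hidden helper threads occupying
+// gtids 1..8, a thread registered as gtid 12 would have been gtid 4 in a
+// helper-free runtime: __kmp_adjust_gtid_for_hidden_helpers(12) == 4.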
+
+// Support for error directive
+typedef enum kmp_severity_t {
+ severity_warning = 1,
+ severity_fatal = 2
+} kmp_severity_t;
+extern void __kmpc_error(ident_t *loc, int severity, const char *message);
+
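+// Sketch of the call a front end might emit when lowering the OpenMP 5.1
+// directive `#pragma omp error severity(warning) message("check input")`
+// (illustrative only; the actual lowering is the compiler's choice):
+//   __kmpc_error(&loc, severity_warning, "check input");
+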
#ifdef __cplusplus
}
#endif
@@ -4082,6 +4164,12 @@ int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
#endif /* USE_ITT_BUILD */
kmp_int32 is_constrained);
+extern int __kmp_nesting_mode;
+extern int __kmp_nesting_mode_nlevels;
+extern int *__kmp_nesting_nth_level;
+extern void __kmp_init_nesting_mode();
+extern void __kmp_set_nesting_mode_threads();
+
/// This class safely opens and closes a C-style FILE* object using RAII
/// semantics. There are also methods which allow using stdout or stderr as
/// the underlying FILE* object. With the implicit conversion operator to
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 103dc269d742..3a092a803276 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -19,6 +19,18 @@
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
+#if KMP_USE_HWLOC
+// Copied from hwloc
+#define HWLOC_GROUP_KIND_INTEL_MODULE 102
+#define HWLOC_GROUP_KIND_INTEL_TILE 103
+#define HWLOC_GROUP_KIND_INTEL_DIE 104
+#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
+#endif
+
+// The machine topology
+kmp_topology_t *__kmp_topology = nullptr;
+// KMP_HW_SUBSET environment variable
+kmp_hw_subset_t *__kmp_hw_subset = nullptr;
// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;
@@ -30,7 +42,7 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
// The test below is true if affinity is available, but set to "none". Need to
// init on first use of hierarchical barrier.
if (TCR_1(machine_hierarchy.uninitialized))
- machine_hierarchy.init(NULL, nproc);
+ machine_hierarchy.init(nproc);
// Adjust the hierarchy in case num threads exceeds original
if (nproc > machine_hierarchy.base_num_threads)
@@ -45,7 +57,770 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
+static int nCoresPerPkg, nPackages;
+static int __kmp_nThreadsPerCore;
+#ifndef KMP_DFLT_NTH_CORES
+static int __kmp_ncores;
+#endif
+
+const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
+ switch (type) {
+ case KMP_HW_SOCKET:
+ return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
+ case KMP_HW_DIE:
+ return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
+ case KMP_HW_MODULE:
+ return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
+ case KMP_HW_TILE:
+ return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
+ case KMP_HW_NUMA:
+ return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
+ case KMP_HW_L3:
+ return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
+ case KMP_HW_L2:
+ return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
+ case KMP_HW_L1:
+ return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
+ case KMP_HW_LLC:
+ return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
+ case KMP_HW_CORE:
+ return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
+ case KMP_HW_THREAD:
+ return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
+ case KMP_HW_PROC_GROUP:
+ return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
+ }
+ return KMP_I18N_STR(Unknown);
+}
+
+const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
+ switch (type) {
+ case KMP_HW_SOCKET:
+ return ((plural) ? "sockets" : "socket");
+ case KMP_HW_DIE:
+ return ((plural) ? "dice" : "die");
+ case KMP_HW_MODULE:
+ return ((plural) ? "modules" : "module");
+ case KMP_HW_TILE:
+ return ((plural) ? "tiles" : "tile");
+ case KMP_HW_NUMA:
+ return ((plural) ? "numa_domains" : "numa_domain");
+ case KMP_HW_L3:
+ return ((plural) ? "l3_caches" : "l3_cache");
+ case KMP_HW_L2:
+ return ((plural) ? "l2_caches" : "l2_cache");
+ case KMP_HW_L1:
+ return ((plural) ? "l1_caches" : "l1_cache");
+ case KMP_HW_LLC:
+ return ((plural) ? "ll_caches" : "ll_cache");
+ case KMP_HW_CORE:
+ return ((plural) ? "cores" : "core");
+ case KMP_HW_THREAD:
+ return ((plural) ? "threads" : "thread");
+ case KMP_HW_PROC_GROUP:
+ return ((plural) ? "proc_groups" : "proc_group");
+ }
+ return ((plural) ? "unknowns" : "unknown");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp_hw_thread_t methods
+int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
+ const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
+ const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
+ int depth = __kmp_topology->get_depth();
+ for (int level = 0; level < depth; ++level) {
+ if (ahwthread->ids[level] < bhwthread->ids[level])
+ return -1;
+ else if (ahwthread->ids[level] > bhwthread->ids[level])
+ return 1;
+ }
+ if (ahwthread->os_id < bhwthread->os_id)
+ return -1;
+ else if (ahwthread->os_id > bhwthread->os_id)
+ return 1;
+ return 0;
+}
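+
+// compare_ids has the standard qsort comparator signature; a sketch of the
+// sort it is intended to drive (assuming hw_threads/num_hw_threads as used
+// throughout this file):
+//   qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
+//         kmp_hw_thread_t::compare_ids);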
+
#if KMP_AFFINITY_SUPPORTED
+int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
+ int i;
+ const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
+ const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
+ int depth = __kmp_topology->get_depth();
+ KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
+ KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
+ for (i = 0; i < __kmp_affinity_compact; i++) {
+ int j = depth - i - 1;
+ if (aa->sub_ids[j] < bb->sub_ids[j])
+ return -1;
+ if (aa->sub_ids[j] > bb->sub_ids[j])
+ return 1;
+ }
+ for (; i < depth; i++) {
+ int j = i - __kmp_affinity_compact;
+ if (aa->sub_ids[j] < bb->sub_ids[j])
+ return -1;
+ if (aa->sub_ids[j] > bb->sub_ids[j])
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+void kmp_hw_thread_t::print() const {
+ int depth = __kmp_topology->get_depth();
+ printf("%4d ", os_id);
+ for (int i = 0; i < depth; ++i) {
+ printf("%4d ", ids[i]);
+ }
+ printf("\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp_topology_t methods
+
+// Remove layers that don't add information to the topology.
+// A layer is radix 1 when each id in an adjacent layer maps onto it
+// one-to-one; such a layer is folded into its neighbor, shrinking the depth.
+void kmp_topology_t::_remove_radix1_layers() {
+ int preference[KMP_HW_LAST];
+ int top_index1, top_index2;
+ // Set up preference associative array
+ preference[KMP_HW_PROC_GROUP] = 110;
+ preference[KMP_HW_SOCKET] = 100;
+ preference[KMP_HW_CORE] = 95;
+ preference[KMP_HW_THREAD] = 90;
+ preference[KMP_HW_NUMA] = 85;
+ preference[KMP_HW_DIE] = 80;
+ preference[KMP_HW_TILE] = 75;
+ preference[KMP_HW_MODULE] = 73;
+ preference[KMP_HW_L3] = 70;
+ preference[KMP_HW_L2] = 65;
+ preference[KMP_HW_L1] = 60;
+ preference[KMP_HW_LLC] = 5;
+ top_index1 = 0;
+ top_index2 = 1;
+ while (top_index1 < depth - 1 && top_index2 < depth) {
+ kmp_hw_t type1 = types[top_index1];
+ kmp_hw_t type2 = types[top_index2];
+ KMP_ASSERT_VALID_HW_TYPE(type1);
+ KMP_ASSERT_VALID_HW_TYPE(type2);
+ // Do not allow the three main topology levels (sockets, cores, threads) to
+ // be compacted down
+ if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
+ type1 == KMP_HW_SOCKET) &&
+ (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
+ type2 == KMP_HW_SOCKET)) {
+ top_index1 = top_index2++;
+ continue;
+ }
+ bool radix1 = true;
+ bool all_same = true;
+ int id1 = hw_threads[0].ids[top_index1];
+ int id2 = hw_threads[0].ids[top_index2];
+ int pref1 = preference[type1];
+ int pref2 = preference[type2];
+ for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
+ if (hw_threads[hwidx].ids[top_index1] == id1 &&
+ hw_threads[hwidx].ids[top_index2] != id2) {
+ radix1 = false;
+ break;
+ }
+ if (hw_threads[hwidx].ids[top_index2] != id2)
+ all_same = false;
+ id1 = hw_threads[hwidx].ids[top_index1];
+ id2 = hw_threads[hwidx].ids[top_index2];
+ }
+ if (radix1) {
+ // Select the layer to remove based on preference
+ kmp_hw_t remove_type, keep_type;
+ int remove_layer, remove_layer_ids;
+ if (pref1 > pref2) {
+ remove_type = type2;
+ remove_layer = remove_layer_ids = top_index2;
+ keep_type = type1;
+ } else {
+ remove_type = type1;
+ remove_layer = remove_layer_ids = top_index1;
+ keep_type = type2;
+ }
+      // If all the indices for the second (deeper) layer are the same
+      // (e.g., all are zero), then make sure to keep the first layer's ids
+ if (all_same)
+ remove_layer_ids = top_index2;
+ // Remove radix one type by setting the equivalence, removing the id from
+ // the hw threads and removing the layer from types and depth
+ set_equivalent_type(remove_type, keep_type);
+ for (int idx = 0; idx < num_hw_threads; ++idx) {
+ kmp_hw_thread_t &hw_thread = hw_threads[idx];
+ for (int d = remove_layer_ids; d < depth - 1; ++d)
+ hw_thread.ids[d] = hw_thread.ids[d + 1];
+ }
+ for (int idx = remove_layer; idx < depth - 1; ++idx)
+ types[idx] = types[idx + 1];
+ depth--;
+ } else {
+ top_index1 = top_index2++;
+ }
+ }
+ KMP_ASSERT(depth > 0);
+}
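+
+// Worked example (hypothetical machine): a detected hierarchy of 2 sockets
+// x 1 die/socket x 4 cores/die x 2 threads/core has a radix-1 die layer.
+// Socket (preference 100) outranks die (80), so the die layer is removed,
+// KMP_HW_DIE is recorded as equivalent to KMP_HW_SOCKET, and the topology
+// becomes 2 sockets x 4 cores/socket x 2 threads/core.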
+
+void kmp_topology_t::_set_last_level_cache() {
+ if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
+ else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
+#if KMP_MIC_SUPPORTED
+ else if (__kmp_mic_type == mic3) {
+ if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
+ else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
+ // L2/Tile wasn't detected so just say L1
+ else
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
+ }
+#endif
+ else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
+ // Fallback is to set last level cache to socket or core
+ if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
+ if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
+ else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
+ set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
+ }
+ KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
+}
+
+// Gather the count of each topology layer and the ratio
+void kmp_topology_t::_gather_enumeration_information() {
+ int previous_id[KMP_HW_LAST];
+ int max[KMP_HW_LAST];
+
+ for (int i = 0; i < depth; ++i) {
+ previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ max[i] = 0;
+ count[i] = 0;
+ ratio[i] = 0;
+ }
+ for (int i = 0; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &hw_thread = hw_threads[i];
+ for (int layer = 0; layer < depth; ++layer) {
+ int id = hw_thread.ids[layer];
+ if (id != previous_id[layer]) {
+ // Add an additional increment to each count
+ for (int l = layer; l < depth; ++l)
+ count[l]++;
+ // Keep track of topology layer ratio statistics
+ max[layer]++;
+ for (int l = layer + 1; l < depth; ++l) {
+ if (max[l] > ratio[l])
+ ratio[l] = max[l];
+ max[l] = 1;
+ }
+ break;
+ }
+ }
+ for (int layer = 0; layer < depth; ++layer) {
+ previous_id[layer] = hw_thread.ids[layer];
+ }
+ }
+ for (int layer = 0; layer < depth; ++layer) {
+ if (max[layer] > ratio[layer])
+ ratio[layer] = max[layer];
+ }
+}
+
+// Find out if the topology is uniform
+void kmp_topology_t::_discover_uniformity() {
+ int num = 1;
+ for (int level = 0; level < depth; ++level)
+ num *= ratio[level];
+ flags.uniform = (num == count[depth - 1]);
+}
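+
+// Example: ratio = {2 sockets, 4 cores/socket, 2 threads/core} gives
+// num = 2 * 4 * 2 = 16; the topology is uniform iff count[depth - 1],
+// the total number of hardware threads, is also 16.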
+
+// Set all the sub_ids for each hardware thread
+void kmp_topology_t::_set_sub_ids() {
+ int previous_id[KMP_HW_LAST];
+ int sub_id[KMP_HW_LAST];
+
+ for (int i = 0; i < depth; ++i) {
+ previous_id[i] = -1;
+ sub_id[i] = -1;
+ }
+ for (int i = 0; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &hw_thread = hw_threads[i];
+ // Setup the sub_id
+ for (int j = 0; j < depth; ++j) {
+ if (hw_thread.ids[j] != previous_id[j]) {
+ sub_id[j]++;
+ for (int k = j + 1; k < depth; ++k) {
+ sub_id[k] = 0;
+ }
+ break;
+ }
+ }
+ // Set previous_id
+ for (int j = 0; j < depth; ++j) {
+ previous_id[j] = hw_thread.ids[j];
+ }
+ // Set the sub_ids field
+ for (int j = 0; j < depth; ++j) {
+ hw_thread.sub_ids[j] = sub_id[j];
+ }
+ }
+}
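+
+// Example: sub_ids renumber raw ids into per-parent ordinals so later sorts
+// are not fooled by arbitrary hardware numbering. If two sockets report ids
+// 601 and 603, their sub_ids become 0 and 1, and the first core under each
+// socket gets sub_id 0 again, and so on down the hierarchy.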
+
+void kmp_topology_t::_set_globals() {
+ // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
+ int core_level, thread_level, package_level;
+ package_level = get_level(KMP_HW_SOCKET);
+#if KMP_GROUP_AFFINITY
+ if (package_level == -1)
+ package_level = get_level(KMP_HW_PROC_GROUP);
+#endif
+ core_level = get_level(KMP_HW_CORE);
+ thread_level = get_level(KMP_HW_THREAD);
+
+ KMP_ASSERT(core_level != -1);
+ KMP_ASSERT(thread_level != -1);
+
+ __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
+ if (package_level != -1) {
+ nCoresPerPkg = calculate_ratio(core_level, package_level);
+ nPackages = get_count(package_level);
+ } else {
+ // assume one socket
+ nCoresPerPkg = get_count(core_level);
+ nPackages = 1;
+ }
+#ifndef KMP_DFLT_NTH_CORES
+ __kmp_ncores = get_count(core_level);
+#endif
+}
+
+kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
+ const kmp_hw_t *types) {
+ kmp_topology_t *retval;
+ // Allocate all data in one large allocation
+ size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
+ sizeof(int) * ndepth * 3;
+ char *bytes = (char *)__kmp_allocate(size);
+ retval = (kmp_topology_t *)bytes;
+ if (nproc > 0) {
+ retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
+ } else {
+ retval->hw_threads = nullptr;
+ }
+ retval->num_hw_threads = nproc;
+ retval->depth = ndepth;
+ int *arr =
+ (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
+ retval->types = (kmp_hw_t *)arr;
+ retval->ratio = arr + ndepth;
+ retval->count = arr + 2 * ndepth;
+ KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
+ for (int i = 0; i < ndepth; ++i) {
+ retval->types[i] = types[i];
+ retval->equivalent[types[i]] = types[i];
+ }
+ return retval;
+}
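+
+// Typical construction, mirroring the __kmp_affinity_create_*_map routines
+// below:
+//   kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
+//   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, 3, types);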
+
+void kmp_topology_t::deallocate(kmp_topology_t *topology) {
+ if (topology)
+ __kmp_free(topology);
+}
+
+bool kmp_topology_t::check_ids() const {
+ // Assume ids have been sorted
+ if (num_hw_threads == 0)
+ return true;
+ for (int i = 1; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &current_thread = hw_threads[i];
+ kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
+ bool unique = false;
+ for (int j = 0; j < depth; ++j) {
+ if (previous_thread.ids[j] != current_thread.ids[j]) {
+ unique = true;
+ break;
+ }
+ }
+ if (unique)
+ continue;
+ return false;
+ }
+ return true;
+}
+
+void kmp_topology_t::dump() const {
+ printf("***********************\n");
+ printf("*** __kmp_topology: ***\n");
+ printf("***********************\n");
+ printf("* depth: %d\n", depth);
+
+ printf("* types: ");
+ for (int i = 0; i < depth; ++i)
+ printf("%15s ", __kmp_hw_get_keyword(types[i]));
+ printf("\n");
+
+ printf("* ratio: ");
+ for (int i = 0; i < depth; ++i) {
+ printf("%15d ", ratio[i]);
+ }
+ printf("\n");
+
+ printf("* count: ");
+ for (int i = 0; i < depth; ++i) {
+ printf("%15d ", count[i]);
+ }
+ printf("\n");
+
+ printf("* equivalent map:\n");
+ KMP_FOREACH_HW_TYPE(i) {
+ const char *key = __kmp_hw_get_keyword(i);
+ const char *value = __kmp_hw_get_keyword(equivalent[i]);
+ printf("%-15s -> %-15s\n", key, value);
+ }
+
+ printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
+
+ printf("* num_hw_threads: %d\n", num_hw_threads);
+ printf("* hw_threads:\n");
+ for (int i = 0; i < num_hw_threads; ++i) {
+ hw_threads[i].print();
+ }
+ printf("***********************\n");
+}
+
+void kmp_topology_t::print(const char *env_var) const {
+ kmp_str_buf_t buf;
+ int print_types_depth;
+ __kmp_str_buf_init(&buf);
+ kmp_hw_t print_types[KMP_HW_LAST + 2];
+
+ // Num Available Threads
+ KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
+
+ // Uniform or not
+ if (is_uniform()) {
+ KMP_INFORM(Uniform, env_var);
+ } else {
+ KMP_INFORM(NonUniform, env_var);
+ }
+
+ // Equivalent types
+ KMP_FOREACH_HW_TYPE(type) {
+ kmp_hw_t eq_type = equivalent[type];
+ if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
+ KMP_INFORM(AffEqualTopologyTypes, env_var,
+ __kmp_hw_get_catalog_string(type),
+ __kmp_hw_get_catalog_string(eq_type));
+ }
+ }
+
+ // Quick topology
+ KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
+ // Create a print types array that always guarantees printing
+ // the core and thread level
+ print_types_depth = 0;
+ for (int level = 0; level < depth; ++level)
+ print_types[print_types_depth++] = types[level];
+ if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
+ // Force in the core level for quick topology
+ if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
+ // Force core before thread e.g., 1 socket X 2 threads/socket
+ // becomes 1 socket X 1 core/socket X 2 threads/socket
+ print_types[print_types_depth - 1] = KMP_HW_CORE;
+ print_types[print_types_depth++] = KMP_HW_THREAD;
+ } else {
+ print_types[print_types_depth++] = KMP_HW_CORE;
+ }
+ }
+ // Always put threads at very end of quick topology
+ if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
+ print_types[print_types_depth++] = KMP_HW_THREAD;
+
+ __kmp_str_buf_clear(&buf);
+ kmp_hw_t numerator_type;
+ kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
+ int core_level = get_level(KMP_HW_CORE);
+ int ncores = get_count(core_level);
+
+ for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
+ int c;
+ bool plural;
+ numerator_type = print_types[plevel];
+ KMP_ASSERT_VALID_HW_TYPE(numerator_type);
+ if (equivalent[numerator_type] != numerator_type)
+ c = 1;
+ else
+ c = get_ratio(level++);
+ plural = (c > 1);
+ if (plevel == 0) {
+ __kmp_str_buf_print(&buf, "%d %s", c,
+ __kmp_hw_get_catalog_string(numerator_type, plural));
+ } else {
+ __kmp_str_buf_print(&buf, " x %d %s/%s", c,
+ __kmp_hw_get_catalog_string(numerator_type, plural),
+ __kmp_hw_get_catalog_string(denominator_type));
+ }
+ denominator_type = numerator_type;
+ }
+ KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
+
+ if (num_hw_threads <= 0) {
+ __kmp_str_buf_free(&buf);
+ return;
+ }
+
+ // Full OS proc to hardware thread map
+ KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
+ for (int i = 0; i < num_hw_threads; i++) {
+ __kmp_str_buf_clear(&buf);
+ for (int level = 0; level < depth; ++level) {
+ kmp_hw_t type = types[level];
+ __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
+ __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
+ }
+ KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
+ }
+
+ __kmp_str_buf_free(&buf);
+}
+
+void kmp_topology_t::canonicalize() {
+ _remove_radix1_layers();
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_sub_ids();
+ _set_globals();
+ _set_last_level_cache();
+
+#if KMP_MIC_SUPPORTED
+  // Manually add L2 = Tile equivalence
+ if (__kmp_mic_type == mic3) {
+ if (get_level(KMP_HW_L2) != -1)
+ set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
+ else if (get_level(KMP_HW_TILE) != -1)
+ set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
+ }
+#endif
+
+ // Perform post canonicalization checking
+ KMP_ASSERT(depth > 0);
+ for (int level = 0; level < depth; ++level) {
+ // All counts, ratios, and types must be valid
+ KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
+ KMP_ASSERT_VALID_HW_TYPE(types[level]);
+ // Detected types must point to themselves
+ KMP_ASSERT(equivalent[types[level]] == types[level]);
+ }
+
+#if KMP_AFFINITY_SUPPORTED
+ // Set the number of affinity granularity levels
+ if (__kmp_affinity_gran_levels < 0) {
+ kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
+ // Check if user's granularity request is valid
+ if (gran_type == KMP_HW_UNKNOWN) {
+ // First try core, then thread, then package
+ kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
+ for (auto g : gran_types) {
+ if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
+ gran_type = g;
+ break;
+ }
+ }
+ KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
+ // Warn user what granularity setting will be used instead
+ KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
+ __kmp_hw_get_catalog_string(__kmp_affinity_gran),
+ __kmp_hw_get_catalog_string(gran_type));
+ __kmp_affinity_gran = gran_type;
+ }
+ __kmp_affinity_gran_levels = 0;
+ for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
+ __kmp_affinity_gran_levels++;
+ }
+#endif // KMP_AFFINITY_SUPPORTED
+}
+
+// Canonicalize an explicit packages X cores/pkg X threads/core topology
+void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
+ int nthreads_per_core, int ncores) {
+ int ndepth = 3;
+ depth = ndepth;
+ KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
+ for (int level = 0; level < depth; ++level) {
+ count[level] = 0;
+ ratio[level] = 0;
+ }
+ count[0] = npackages;
+ count[1] = ncores;
+ count[2] = __kmp_xproc;
+ ratio[0] = npackages;
+ ratio[1] = ncores_per_pkg;
+ ratio[2] = nthreads_per_core;
+ equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
+ equivalent[KMP_HW_CORE] = KMP_HW_CORE;
+ equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
+ types[0] = KMP_HW_SOCKET;
+ types[1] = KMP_HW_CORE;
+ types[2] = KMP_HW_THREAD;
+ //__kmp_avail_proc = __kmp_xproc;
+ _discover_uniformity();
+}
+
+// Apply the KMP_HW_SUBSET environment variable to the topology
+// Returns true if KMP_HW_SUBSET filtered any processors;
+// otherwise, returns false
+bool kmp_topology_t::filter_hw_subset() {
+ // If KMP_HW_SUBSET wasn't requested, then do nothing.
+ if (!__kmp_hw_subset)
+ return false;
+
+ // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
+ int hw_subset_depth = __kmp_hw_subset->get_depth();
+ kmp_hw_t specified[KMP_HW_LAST];
+ KMP_ASSERT(hw_subset_depth > 0);
+ KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
+ for (int i = 0; i < hw_subset_depth; ++i) {
+ int max_count;
+ int num = __kmp_hw_subset->at(i).num;
+ int offset = __kmp_hw_subset->at(i).offset;
+ kmp_hw_t type = __kmp_hw_subset->at(i).type;
+ kmp_hw_t equivalent_type = equivalent[type];
+ int level = get_level(type);
+
+ // Check to see if current layer is in detected machine topology
+ if (equivalent_type != KMP_HW_UNKNOWN) {
+ __kmp_hw_subset->at(i).type = equivalent_type;
+ } else {
+ KMP_WARNING(AffHWSubsetNotExistGeneric,
+ __kmp_hw_get_catalog_string(type));
+ return false;
+ }
+
+ // Check to see if current layer has already been specified
+ // either directly or through an equivalent type
+ if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
+ KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
+ __kmp_hw_get_catalog_string(specified[equivalent_type]));
+ return false;
+ }
+ specified[equivalent_type] = type;
+
+ // Check to see if layers are in order
+ if (i + 1 < hw_subset_depth) {
+ kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
+ if (next_type == KMP_HW_UNKNOWN) {
+ KMP_WARNING(
+ AffHWSubsetNotExistGeneric,
+ __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
+ return false;
+ }
+ int next_topology_level = get_level(next_type);
+ if (level > next_topology_level) {
+ KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
+ __kmp_hw_get_catalog_string(next_type));
+ return false;
+ }
+ }
+
+ // Check to see if each layer's num & offset parameters are valid
+ max_count = get_ratio(level);
+ if (max_count < 0 || num + offset > max_count) {
+ bool plural = (num > 1);
+ KMP_WARNING(AffHWSubsetManyGeneric,
+ __kmp_hw_get_catalog_string(type, plural));
+ return false;
+ }
+ }
+
+ // Apply the filtered hardware subset
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ kmp_hw_thread_t &hw_thread = hw_threads[i];
+ // Check to see if this hardware thread should be filtered
+ bool should_be_filtered = false;
+ for (int level = 0, hw_subset_index = 0;
+ level < depth && hw_subset_index < hw_subset_depth; ++level) {
+ kmp_hw_t topology_type = types[level];
+ auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
+ kmp_hw_t hw_subset_type = hw_subset_item.type;
+ if (topology_type != hw_subset_type)
+ continue;
+ int num = hw_subset_item.num;
+ int offset = hw_subset_item.offset;
+ hw_subset_index++;
+ if (hw_thread.sub_ids[level] < offset ||
+ hw_thread.sub_ids[level] >= offset + num) {
+ should_be_filtered = true;
+ break;
+ }
+ }
+ if (!should_be_filtered) {
+ if (i != new_index)
+ hw_threads[new_index] = hw_thread;
+ new_index++;
+ } else {
+#if KMP_AFFINITY_SUPPORTED
+ KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
+#endif
+ __kmp_avail_proc--;
+ }
+ }
+ KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
+ num_hw_threads = new_index;
+
+ // Post hardware subset canonicalization
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_globals();
+ _set_last_level_cache();
+ return true;
+}
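+
+// Example: on the 2 x 4 x 2 machine above, KMP_HW_SUBSET=1s,2c,1t keeps
+// sub_id 0 at the socket layer, sub_ids 0..1 at the core layer, and sub_id 0
+// at the thread layer, so 2 of the 16 hardware threads survive and
+// __kmp_avail_proc drops accordingly.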
+
+bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
+ if (hw_level >= depth)
+ return true;
+ bool retval = true;
+ const kmp_hw_thread_t &t1 = hw_threads[hwt1];
+ const kmp_hw_thread_t &t2 = hw_threads[hwt2];
+ for (int i = 0; i < (depth - hw_level); ++i) {
+ if (t1.ids[i] != t2.ids[i])
+ return false;
+ }
+ return retval;
+}
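+
+// Example: with types = {socket, core, thread}, is_close(a, b, 1) compares
+// ids[0..1] and so asks whether the two hardware threads share a core;
+// hw_level 2 only requires a shared socket.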
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if KMP_AFFINITY_SUPPORTED
+class kmp_affinity_raii_t {
+ kmp_affin_mask_t *mask;
+ bool restored;
+
+public:
+ kmp_affinity_raii_t() : restored(false) {
+ KMP_CPU_ALLOC(mask);
+ KMP_ASSERT(mask != NULL);
+ __kmp_get_system_affinity(mask, TRUE);
+ }
+ void restore() {
+ __kmp_set_system_affinity(mask, TRUE);
+ KMP_CPU_FREE(mask);
+ restored = true;
+ }
+ ~kmp_affinity_raii_t() {
+ if (!restored) {
+ __kmp_set_system_affinity(mask, TRUE);
+ KMP_CPU_FREE(mask);
+ }
+ }
+};
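+
+// Usage sketch: save the incoming affinity mask on entry, bind temporarily
+// while probing, and let RAII restore the original mask (or call restore()
+// early once probing is done):
+//   {
+//     kmp_affinity_raii_t previous_affinity;
+//     // ... temporary full-mask binding while reading the topology ...
+//   } // destructor re-applies and frees the saved mask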
bool KMPAffinity::picked_api = false;
@@ -238,201 +1013,67 @@ void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
}
}
-// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
-// called to renumber the labels from [0..n] and place them into the child_num
-// vector of the address object. This is done in case the labels used for
-// the children at one node of the hierarchy differ from those used for
-// another node at the same level. Example: suppose the machine has 2 nodes
-// with 2 packages each. The first node contains packages 601 and 602, and
-// second node contains packages 603 and 604. If we try to sort the table
-// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
-// because we are paying attention to the labels themselves, not the ordinal
-// child numbers. By using the child numbers in the sort, the result is
-// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
-static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
- int numAddrs) {
- KMP_DEBUG_ASSERT(numAddrs > 0);
- int depth = address2os->first.depth;
- unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- int labCt;
- for (labCt = 0; labCt < depth; labCt++) {
- address2os[0].first.childNums[labCt] = counts[labCt] = 0;
- lastLabel[labCt] = address2os[0].first.labels[labCt];
- }
- int i;
- for (i = 1; i < numAddrs; i++) {
- for (labCt = 0; labCt < depth; labCt++) {
- if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
- int labCt2;
- for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
- counts[labCt2] = 0;
- lastLabel[labCt2] = address2os[i].first.labels[labCt2];
- }
- counts[labCt]++;
- lastLabel[labCt] = address2os[i].first.labels[labCt];
- break;
- }
- }
- for (labCt = 0; labCt < depth; labCt++) {
- address2os[i].first.childNums[labCt] = counts[labCt];
- }
- for (; labCt < (int)Address::maxDepth; labCt++) {
- address2os[i].first.childNums[labCt] = 0;
- }
- }
- __kmp_free(lastLabel);
- __kmp_free(counts);
-}
-
-// All of the __kmp_affinity_create_*_map() routines should set
-// __kmp_affinity_masks to a vector of affinity mask objects of length
-// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
-// the number of levels in the machine topology tree (zero if
-// __kmp_affinity_type == affinity_none).
-//
-// All of the __kmp_affinity_create_*_map() routines should set
-// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
-// They need to save and restore the mask, and it could be needed later, so
-// saving it is just an optimization to avoid calling kmp_get_system_affinity()
-// again.
+// All of the __kmp_affinity_create_*_map() routines should allocate the
+// internal topology object and set the layer ids for it. Each routine
+// returns a boolean indicating whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
-static int nCoresPerPkg, nPackages;
-static int __kmp_nThreadsPerCore;
-#ifndef KMP_DFLT_NTH_CORES
-static int __kmp_ncores;
-#endif
-static int *__kmp_pu_os_idx = NULL;
-
-// __kmp_affinity_uniform_topology() doesn't work when called from
-// places which support arbitrarily many levels in the machine topology
-// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
-// __kmp_affinity_create_x2apicid_map().
-inline static bool __kmp_affinity_uniform_topology() {
- return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
-}
-
-// Print out the detailed machine topology map, i.e. the physical locations
-// of each OS proc.
-static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
- int depth, int pkgLevel,
- int coreLevel, int threadLevel) {
- int proc;
-
- KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
- for (proc = 0; proc < len; proc++) {
- int level;
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- for (level = 0; level < depth; level++) {
- if (level == threadLevel) {
- __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
- } else if (level == coreLevel) {
- __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
- } else if (level == pkgLevel) {
- __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
- } else if (level > pkgLevel) {
- __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
- level - pkgLevel - 1);
- } else {
- __kmp_str_buf_print(&buf, "L%d ", level);
- }
- __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
- }
- KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
- buf.str);
- __kmp_str_buf_free(&buf);
- }
-}
-
#if KMP_USE_HWLOC
-
-static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
- int depth, int *levels) {
- int proc;
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
- for (proc = 0; proc < len; proc++) {
- __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
- addrP[proc].first.labels[0]);
- if (depth > 1) {
- int level = 1; // iterate over levels
- int label = 1; // iterate over labels
- if (__kmp_numa_detected)
- // node level follows package
- if (levels[level++] > 0)
- __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
- addrP[proc].first.labels[label++]);
- if (__kmp_tile_depth > 0)
- // tile level follows node if any, or package
- if (levels[level++] > 0)
- __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
- addrP[proc].first.labels[label++]);
- if (levels[level++] > 0)
- // core level follows
- __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
- addrP[proc].first.labels[label++]);
- if (levels[level++] > 0)
- // thread level is the latest
- __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
- addrP[proc].first.labels[label++]);
- KMP_DEBUG_ASSERT(label == depth);
- }
- KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
- __kmp_str_buf_clear(&buf);
- }
- __kmp_str_buf_free(&buf);
+static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
+#if HWLOC_API_VERSION >= 0x00020000
+ return hwloc_obj_type_is_cache(obj->type);
+#else
+ return obj->type == HWLOC_OBJ_CACHE;
+#endif
}
-static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
-
-// This function removes the topology levels that are radix 1 and don't offer
-// further information about the topology. The most common example is when you
-// have one thread context per core, we don't want the extra thread context
-// level if it offers no unique labels. So they are removed.
-// return value: the new depth of address2os
-static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
- int depth, int *levels) {
- int level;
- int i;
- int radix1_detected;
- int new_depth = depth;
- for (level = depth - 1; level > 0; --level) {
- // Detect if this level is radix 1
- radix1_detected = 1;
- for (i = 1; i < nTh; ++i) {
- if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
- // There are differing label values for this level so it stays
- radix1_detected = 0;
- break;
- }
- }
- if (!radix1_detected)
- continue;
- // Radix 1 was detected
- --new_depth;
- levels[level] = -1; // mark level as not present in address2os array
- if (level == new_depth) {
- // "turn off" deepest level, just decrement the depth that removes
- // the level from address2os array
- for (i = 0; i < nTh; ++i) {
- addrP[i].first.depth--;
+// Returns KMP_HW_* type derived from HWLOC_* type
+static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
+
+ if (__kmp_hwloc_is_cache_type(obj)) {
+ if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
+ return KMP_HW_UNKNOWN;
+ switch (obj->attr->cache.depth) {
+ case 1:
+ return KMP_HW_L1;
+ case 2:
+#if KMP_MIC_SUPPORTED
+ if (__kmp_mic_type == mic3) {
+ return KMP_HW_TILE;
}
- } else {
- // For other levels, we move labels over and also reduce the depth
- int j;
- for (j = level; j < new_depth; ++j) {
- for (i = 0; i < nTh; ++i) {
- addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
- addrP[i].first.depth--;
- }
- levels[j + 1] -= 1;
- }
- }
+#endif
+ return KMP_HW_L2;
+ case 3:
+ return KMP_HW_L3;
+ }
+ return KMP_HW_UNKNOWN;
+ }
+
+ switch (obj->type) {
+ case HWLOC_OBJ_PACKAGE:
+ return KMP_HW_SOCKET;
+ case HWLOC_OBJ_NUMANODE:
+ return KMP_HW_NUMA;
+ case HWLOC_OBJ_CORE:
+ return KMP_HW_CORE;
+ case HWLOC_OBJ_PU:
+ return KMP_HW_THREAD;
+ case HWLOC_OBJ_GROUP:
+ if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
+ return KMP_HW_DIE;
+ else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
+ return KMP_HW_TILE;
+ else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
+ return KMP_HW_MODULE;
+ else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
+ return KMP_HW_PROC_GROUP;
+ return KMP_HW_UNKNOWN;
+#if HWLOC_API_VERSION >= 0x00020100
+ case HWLOC_OBJ_DIE:
+ return KMP_HW_DIE;
+#endif
}
- return new_depth;
+ return KMP_HW_UNKNOWN;
}
// Returns the number of objects of type 'type' below 'obj' within the topology
@@ -445,9 +1086,8 @@ static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
hwloc_obj_t first;
for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
obj->logical_index, type, 0);
- first != NULL &&
- hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
- obj;
+ first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
+ obj->type, first) == obj;
first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
first)) {
++retval;
@@ -455,126 +1095,48 @@ static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
return retval;
}
-static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
- hwloc_obj_t o,
- kmp_hwloc_depth_t depth,
- hwloc_obj_t *f) {
- if (o->depth == depth) {
- if (*f == NULL)
- *f = o; // output first descendant found
- return 1;
- }
- int sum = 0;
- for (unsigned i = 0; i < o->arity; i++)
- sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
- return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
-static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
- hwloc_obj_type_t type,
- hwloc_obj_t *f) {
- if (!hwloc_compare_types(o->type, type)) {
- if (*f == NULL)
- *f = o; // output first descendant found
- return 1;
- }
- int sum = 0;
- for (unsigned i = 0; i < o->arity; i++)
- sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
- return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
-static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
- int &nActiveThreads,
- int &num_active_cores,
- hwloc_obj_t obj, int depth,
- int *labels) {
- hwloc_obj_t core = NULL;
- hwloc_topology_t &tp = __kmp_hwloc_topology;
- int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
- for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
- hwloc_obj_t pu = NULL;
- KMP_DEBUG_ASSERT(core != NULL);
- int num_active_threads = 0;
- int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
- // int NT = core->arity; pu = core->first_child; // faster?
- for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
- KMP_DEBUG_ASSERT(pu != NULL);
- if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
- continue; // skip inactive (inaccessible) unit
- Address addr(depth + 2);
- KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
- obj->os_index, obj->logical_index, core->os_index,
- core->logical_index, pu->os_index, pu->logical_index));
- for (int i = 0; i < depth; ++i)
- addr.labels[i] = labels[i]; // package, etc.
- addr.labels[depth] = core_id; // core
- addr.labels[depth + 1] = pu_id; // pu
- addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
- __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
- nActiveThreads++;
- ++num_active_threads; // count active threads per core
- }
- if (num_active_threads) { // were there any active threads on the core?
- ++__kmp_ncores; // count total active cores
- ++num_active_cores; // count active cores per socket
- if (num_active_threads > __kmp_nThreadsPerCore)
- __kmp_nThreadsPerCore = num_active_threads; // calc maximum
+// This gets the sub_id for a lower object under a higher object in the
+// topology tree
+static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
+ hwloc_obj_t lower) {
+ hwloc_obj_t obj;
+ hwloc_obj_type_t ltype = lower->type;
+ int lindex = lower->logical_index - 1;
+ int sub_id = 0;
+ // Get the previous lower object
+ obj = hwloc_get_obj_by_type(t, ltype, lindex);
+ while (obj && lindex >= 0 &&
+ hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
+ if (obj->userdata) {
+ sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
+ break;
}
+ sub_id++;
+ lindex--;
+ obj = hwloc_get_obj_by_type(t, ltype, lindex);
}
- return 0;
+  // store sub_id + 1 so that a sub_id of 0 is distinguishable from NULL
+ lower->userdata = RCAST(void *, sub_id + 1);
+ return sub_id;
}
-// Check if NUMA node detected below the package,
-// and if tile object is detected and return its depth
-static int __kmp_hwloc_check_numa() {
- hwloc_topology_t &tp = __kmp_hwloc_topology;
- hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
- int depth, l2cache_depth, package_depth;
-
- // Get some PU
- hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
- if (hT == NULL) // something has gone wrong
- return 1;
+static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
+ kmp_hw_t type;
+ int hw_thread_index, sub_id;
+ int depth;
+ hwloc_obj_t pu, obj, root, prev;
+ kmp_hw_t types[KMP_HW_LAST];
+ hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
- // check NUMA node below PACKAGE
- hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
- hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
- KMP_DEBUG_ASSERT(hS != NULL);
- if (hN != NULL && hN->depth > hS->depth) {
- __kmp_numa_detected = TRUE; // socket includes node(s)
- if (__kmp_affinity_gran == affinity_gran_node) {
- __kmp_affinity_gran = affinity_gran_numa;
- }
- }
-
- package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
- l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
- // check tile, get object by depth because of multiple caches possible
- depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
- hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
- hC = NULL; // not used, but reset it here just in case
- if (hL != NULL &&
- __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
- __kmp_tile_depth = depth; // tile consists of multiple cores
- return 0;
-}
-
-static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
- kmp_i18n_id_t *const msg_id) {
- hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
- *address2os = NULL;
+ hwloc_topology_t tp = __kmp_hwloc_topology;
*msg_id = kmp_i18n_null;
-
- // Save the affinity mask for the current thread.
- kmp_affin_mask_t *oldMask;
- KMP_CPU_ALLOC(oldMask);
- __kmp_get_system_affinity(oldMask, TRUE);
- __kmp_hwloc_check_numa();
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ }
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
- // available from cpuid on the current thread, and __kmp_xproc.
+ // available from hwloc on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity_type == affinity_none);
// hwloc only guarantees existence of PU object, so check PACKAGE and CORE
hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
@@ -591,272 +1153,127 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
if (nCoresPerPkg == 0)
nCoresPerPkg = 1; // to prevent possible division by 0
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (__kmp_affinity_uniform_topology()) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
+ return true;
+ }
+
+ root = hwloc_get_root_obj(tp);
+
+ // Figure out the depth and types in the topology
+ depth = 0;
+ pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
+ KMP_ASSERT(pu);
+ obj = pu;
+ types[depth] = KMP_HW_THREAD;
+ hwloc_types[depth] = obj->type;
+ depth++;
+ while (obj != root && obj != NULL) {
+ obj = obj->parent;
+#if HWLOC_API_VERSION >= 0x00020000
+ if (obj->memory_arity) {
+ hwloc_obj_t memory;
+ for (memory = obj->memory_first_child; memory;
+ memory = hwloc_get_next_child(tp, obj, memory)) {
+ if (memory->type == HWLOC_OBJ_NUMANODE)
+ break;
+ }
+ if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
+ types[depth] = KMP_HW_NUMA;
+ hwloc_types[depth] = memory->type;
+ depth++;
}
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
}
- KMP_CPU_FREE(oldMask);
- return 0;
+#endif
+ type = __kmp_hwloc_type_2_topology_type(obj);
+ if (type != KMP_HW_UNKNOWN) {
+ types[depth] = type;
+ hwloc_types[depth] = obj->type;
+ depth++;
+ }
}
+ KMP_ASSERT(depth > 0);
- int depth = 3;
- int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
- int labels[3] = {0}; // package [,node] [,tile] - head of labels array
- if (__kmp_numa_detected)
- ++depth;
- if (__kmp_tile_depth)
- ++depth;
+  // Reverse the arrays so the types run from outermost to innermost
+ for (int i = 0, j = depth - 1; i < j; ++i, --j) {
+ hwloc_obj_type_t hwloc_temp = hwloc_types[i];
+ kmp_hw_t temp = types[i];
+ types[i] = types[j];
+ types[j] = temp;
+ hwloc_types[i] = hwloc_types[j];
+ hwloc_types[j] = hwloc_temp;
+ }
// Allocate the data structure to be returned.
- AddrUnsPair *retval =
- (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-
- // When affinity is off, this routine will still be called to set
- // __kmp_ncores, as well as __kmp_nThreadsPerCore,
- // nCoresPerPkg, & nPackages. Make sure all these vars are set
- // correctly, and return if affinity is not enabled.
-
- hwloc_obj_t socket, node, tile;
- int nActiveThreads = 0;
- int socket_id = 0;
- // re-calculate globals to count only accessible resources
- __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
- nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
- for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
- socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
- socket_id++) {
- labels[0] = socket_id;
- if (__kmp_numa_detected) {
- int NN;
- int n_active_nodes = 0;
- node = NULL;
- NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
- &node);
- for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
- labels[1] = node_id;
- if (__kmp_tile_depth) {
- // NUMA + tiles
- int NT;
- int n_active_tiles = 0;
- tile = NULL;
- NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
- &tile);
- for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
- labels[2] = tl_id;
- int n_active_cores = 0;
- __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
- n_active_cores, tile, 3, labels);
- if (n_active_cores) { // were there any active cores on the socket?
- ++n_active_tiles; // count active tiles per node
- if (n_active_cores > nCorePerTile)
- nCorePerTile = n_active_cores; // calc maximum
- }
- }
- if (n_active_tiles) { // were there any active tiles on the socket?
- ++n_active_nodes; // count active nodes per package
- if (n_active_tiles > nTilePerNode)
- nTilePerNode = n_active_tiles; // calc maximum
- }
- } else {
- // NUMA, no tiles
- int n_active_cores = 0;
- __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
- n_active_cores, node, 2, labels);
- if (n_active_cores) { // were there any active cores on the socket?
- ++n_active_nodes; // count active nodes per package
- if (n_active_cores > nCorePerNode)
- nCorePerNode = n_active_cores; // calc maximum
- }
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
+
+ hw_thread_index = 0;
+ pu = NULL;
+  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
+ int index = depth - 1;
+ bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
+ if (included) {
+ hw_thread.clear();
+ hw_thread.ids[index] = pu->logical_index;
+ hw_thread.os_id = pu->os_index;
+ index--;
+ }
+ obj = pu;
+ prev = obj;
+ while (obj != root && obj != NULL) {
+ obj = obj->parent;
+#if HWLOC_API_VERSION >= 0x00020000
+ // NUMA Nodes are handled differently since they are not within the
+ // parent/child structure anymore. They are separate children
+ // of obj (memory_first_child points to first memory child)
+ if (obj->memory_arity) {
+ hwloc_obj_t memory;
+ for (memory = obj->memory_first_child; memory;
+ memory = hwloc_get_next_child(tp, obj, memory)) {
+ if (memory->type == HWLOC_OBJ_NUMANODE)
+ break;
}
- }
- if (n_active_nodes) { // were there any active nodes on the socket?
- ++nPackages; // count total active packages
- if (n_active_nodes > nNodePerPkg)
- nNodePerPkg = n_active_nodes; // calc maximum
- }
- } else {
- if (__kmp_tile_depth) {
- // no NUMA, tiles
- int NT;
- int n_active_tiles = 0;
- tile = NULL;
- NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
- &tile);
- for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
- labels[1] = tl_id;
- int n_active_cores = 0;
- __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
- n_active_cores, tile, 2, labels);
- if (n_active_cores) { // were there any active cores on the socket?
- ++n_active_tiles; // count active tiles per package
- if (n_active_cores > nCorePerTile)
- nCorePerTile = n_active_cores; // calc maximum
+ if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
+ sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
+ if (included) {
+ hw_thread.ids[index] = memory->logical_index;
+ hw_thread.ids[index + 1] = sub_id;
+ index--;
}
+ prev = memory;
}
- if (n_active_tiles) { // were there any active tiles on the socket?
- ++nPackages; // count total active packages
- if (n_active_tiles > nTilePerPkg)
- nTilePerPkg = n_active_tiles; // calc maximum
- }
- } else {
- // no NUMA, no tiles
- int n_active_cores = 0;
- __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
- socket, 1, labels);
- if (n_active_cores) { // were there any active cores on the socket?
- ++nPackages; // count total active packages
- if (n_active_cores > nCoresPerPkg)
- nCoresPerPkg = n_active_cores; // calc maximum
- }
+ prev = obj;
}
- }
- }
-
- // If there's only one thread context to bind to, return now.
- KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
- KMP_ASSERT(nActiveThreads > 0);
- if (nActiveThreads == 1) {
- __kmp_ncores = nPackages = 1;
- __kmp_nThreadsPerCore = nCoresPerPkg = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
-
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(retval);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
-
- // Form an Address object which only includes the package level.
- Address addr(1);
- addr.labels[0] = retval[0].first.labels[0];
- retval[0].first = addr;
-
- if (__kmp_affinity_gran_levels < 0) {
- __kmp_affinity_gran_levels = 0;
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
- }
-
- *address2os = retval;
- KMP_CPU_FREE(oldMask);
- return 1;
- }
-
- // Sort the table by physical Id.
- qsort(retval, nActiveThreads, sizeof(*retval),
- __kmp_affinity_cmp_Address_labels);
-
- // Check to see if the machine topology is uniform
- int nPUs = nPackages * __kmp_nThreadsPerCore;
- if (__kmp_numa_detected) {
- if (__kmp_tile_depth) { // NUMA + tiles
- nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
- } else { // NUMA, no tiles
- nPUs *= (nNodePerPkg * nCorePerNode);
- }
- } else {
- if (__kmp_tile_depth) { // no NUMA, tiles
- nPUs *= (nTilePerPkg * nCorePerTile);
- } else { // no NUMA, no tiles
- nPUs *= nCoresPerPkg;
- }
- }
- unsigned uniform = (nPUs == nActiveThreads);
-
- // Print the machine topology summary.
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (uniform) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
- if (__kmp_numa_detected) {
- if (__kmp_tile_depth) { // NUMA + tiles
- KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
- nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
- __kmp_ncores);
- } else { // NUMA, no tiles
- KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
- nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
- nPUs *= (nNodePerPkg * nCorePerNode);
- }
- } else {
- if (__kmp_tile_depth) { // no NUMA, tiles
- KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
- nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
- } else { // no NUMA, no tiles
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- __kmp_str_buf_print(&buf, "%d", nPackages);
- KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- __kmp_str_buf_free(&buf);
- }
- }
- }
-
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(retval);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
-
- int depth_full = depth; // number of levels before compressing
- // Find any levels with radix 1, and remove them from the map
- // (except for the package level).
- depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
- levels);
- KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
- if (__kmp_affinity_gran_levels < 0) {
- // Set the granularity level based on what levels are modeled
- // in the machine topology map.
- __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
- if (__kmp_affinity_gran > affinity_gran_thread) {
- for (int i = 1; i <= depth_full; ++i) {
- if (__kmp_affinity_gran <= i) // only count deeper levels
- break;
- if (levels[depth_full - i] > 0)
- __kmp_affinity_gran_levels++;
+#endif
+ type = __kmp_hwloc_type_2_topology_type(obj);
+ if (type != KMP_HW_UNKNOWN) {
+ sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
+ if (included) {
+ hw_thread.ids[index] = obj->logical_index;
+ hw_thread.ids[index + 1] = sub_id;
+ index--;
+ }
+ prev = obj;
}
}
- if (__kmp_affinity_gran > affinity_gran_package)
- __kmp_affinity_gran_levels++; // e.g. granularity = group
+ if (included)
+ hw_thread_index++;
}
-
- if (__kmp_affinity_verbose)
- __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
-
- KMP_CPU_FREE(oldMask);
- *address2os = retval;
- return depth;
+ __kmp_topology->sort_ids();
+ return true;
}
#endif // KMP_USE_HWLOC
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of OS thread ids <-> processor ids.
-static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
- kmp_i18n_id_t *const msg_id) {
- *address2os = NULL;
+static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
*msg_id = kmp_i18n_null;
+ int depth = 3;
+ kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
+
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
+ }
// Even if __kmp_affinity_type == affinity_none, this routine might still
// be called to set __kmp_ncores, as well as
@@ -865,14 +1282,7 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
KMP_ASSERT(__kmp_affinity_type == affinity_none);
__kmp_ncores = nPackages = __kmp_xproc;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
- return 0;
+ return true;
}
// When affinity is off, this routine will still be called to set
@@ -881,29 +1291,9 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
// not enabled.
__kmp_ncores = nPackages = __kmp_avail_proc;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
- if (__kmp_affinity_type == affinity_none) {
- int avail_ct = 0;
- int i;
- KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
- if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
- continue;
- __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
- }
- return 0;
- }
// Construct the data structure to be returned.
- *address2os =
- (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
int avail_ct = 0;
int i;
KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
@@ -911,50 +1301,47 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
- __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
- Address addr(1);
- addr.labels[0] = i;
- (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
+ hw_thread.clear();
+ hw_thread.os_id = i;
+ hw_thread.ids[0] = i;
+ hw_thread.ids[1] = 0;
+ hw_thread.ids[2] = 0;
+ avail_ct++;
}
if (__kmp_affinity_verbose) {
KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
}
-
- if (__kmp_affinity_gran_levels < 0) {
- // Only the package level is modeled in the machine topology map,
- // so the #levels of granularity is either 0 or 1.
- if (__kmp_affinity_gran > affinity_gran_package) {
- __kmp_affinity_gran_levels = 1;
- } else {
- __kmp_affinity_gran_levels = 0;
- }
- }
- return 1;
+ return true;
}
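
// A minimal standalone sketch (illustrative only; a plain struct stands in
// for kmp_hw_thread_t) of the shape the flat map builds: each OS proc becomes
// its own socket with exactly one core and one hardware thread.
#include <cstdio>
struct flat_hw_thread_t {
  int os_id;
  int ids[3]; // socket, core, thread
};
int main() {
  const int avail = 4; // pretend __kmp_avail_proc == 4
  flat_hw_thread_t t[4];
  for (int i = 0; i < avail; ++i) {
    t[i].os_id = i;
    t[i].ids[0] = i; // socket id == os id
    t[i].ids[1] = 0; // single core per "socket"
    t[i].ids[2] = 0; // single thread per core
    std::printf("proc %d -> socket %d core %d thread %d\n", t[i].os_id,
                t[i].ids[0], t[i].ids[1], t[i].ids[2]);
  }
  return 0;
}
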
#if KMP_GROUP_AFFINITY
-
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
-static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
- kmp_i18n_id_t *const msg_id) {
- *address2os = NULL;
+static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
*msg_id = kmp_i18n_null;
+ int depth = 3;
+ kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
+ const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
- // If we aren't affinity capable, then return now.
- // The flat mapping will be used.
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+ }
+
+ // If we aren't affinity capable, then use flat topology
if (!KMP_AFFINITY_CAPABLE()) {
- // FIXME set *msg_id
- return -1;
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ nPackages = __kmp_num_proc_groups;
+ __kmp_nThreadsPerCore = 1;
+ __kmp_ncores = __kmp_xproc;
+    nCoresPerPkg = __kmp_ncores / nPackages; // cores per package
+ return true;
}
// Construct the data structure to be returned.
- *address2os =
- (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
int avail_ct = 0;
int i;
KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
@@ -962,48 +1349,28 @@ static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
- __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
- Address addr(2);
- addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
- addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
- (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
-
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
- addr.labels[1]);
- }
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
+ hw_thread.clear();
+ hw_thread.os_id = i;
+ hw_thread.ids[0] = i / BITS_PER_GROUP;
+ hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
}
-
- if (__kmp_affinity_gran_levels < 0) {
- if (__kmp_affinity_gran == affinity_gran_group) {
- __kmp_affinity_gran_levels = 1;
- } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
- (__kmp_affinity_gran == affinity_gran_thread)) {
- __kmp_affinity_gran_levels = 0;
- } else {
- const char *gran_str = NULL;
- if (__kmp_affinity_gran == affinity_gran_core) {
- gran_str = "core";
- } else if (__kmp_affinity_gran == affinity_gran_package) {
- gran_str = "package";
- } else if (__kmp_affinity_gran == affinity_gran_node) {
- gran_str = "node";
- } else {
- KMP_ASSERT(0);
- }
-
- // Warning: can't use affinity granularity \"gran\" with group topology
- // method, using "thread"
- __kmp_affinity_gran_levels = 0;
- }
- }
- return 2;
+ return true;
}
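
// Standalone sketch of the group arithmetic above (values are illustrative):
// with a 64-bit DWORD_PTR, BITS_PER_GROUP is 64, so OS proc 130 belongs to
// processor group 2, at bit position 2 within that group's mask.
#include <cassert>
int main() {
  const int bits_per_group = 64; // CHAR_BIT * sizeof(DWORD_PTR) on Win64
  int os_proc = 130;
  assert(os_proc / bits_per_group == 2); // ids[0]: the processor group
  assert(os_proc % bits_per_group == 2); // ids[1] == ids[2]: slot in group
  return 0;
}
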
-
#endif /* KMP_GROUP_AFFINITY */
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+template <kmp_uint32 LSB, kmp_uint32 MSB>
+static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
+ const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
+ const kmp_uint32 SHIFT_RIGHT = LSB;
+ kmp_uint32 retval = v;
+ retval <<= SHIFT_LEFT;
+ retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
+ return retval;
+}
+
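
// Standalone sketch of the bit-extraction template above (mirrors its logic
// with plain uint32_t): shifting the field to the top of the word and back
// down keeps bits LSB..MSB inclusive and discards the rest.
#include <cassert>
#include <cstdint>
template <uint32_t LSB, uint32_t MSB>
static inline unsigned extract_bits(uint32_t v) {
  const uint32_t shift_left = sizeof(uint32_t) * 8 - 1 - MSB;
  return (v << shift_left) >> (shift_left + LSB);
}
int main() {
  // ECX = 0x0201: bits 15:8 hold 0x02, bits 4:0 hold 0x01.
  assert((extract_bits<8, 15>(0x0201u)) == 0x02);
  assert((extract_bits<0, 4>(0x0201u)) == 0x01);
  return 0;
}
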
static int __kmp_cpuid_mask_width(int count) {
int r = 0;
@@ -1042,21 +1409,78 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
return 0;
}
+class kmp_cache_info_t {
+public:
+ struct info_t {
+ unsigned level, mask;
+ };
+ kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
+ size_t get_depth() const { return depth; }
+ info_t &operator[](size_t index) { return table[index]; }
+ const info_t &operator[](size_t index) const { return table[index]; }
+
+ static kmp_hw_t get_topology_type(unsigned level) {
+ KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
+ switch (level) {
+ case 1:
+ return KMP_HW_L1;
+ case 2:
+ return KMP_HW_L2;
+ case 3:
+ return KMP_HW_L3;
+ }
+ return KMP_HW_UNKNOWN;
+ }
+
+private:
+ static const int MAX_CACHE_LEVEL = 3;
+
+ size_t depth;
+ info_t table[MAX_CACHE_LEVEL];
+
+ void get_leaf4_levels() {
+ unsigned level = 0;
+ while (depth < MAX_CACHE_LEVEL) {
+ unsigned cache_type, max_threads_sharing;
+ unsigned cache_level, cache_mask_width;
+ kmp_cpuid buf2;
+ __kmp_x86_cpuid(4, level, &buf2);
+ cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
+ if (!cache_type)
+ break;
+ // Skip instruction caches
+ if (cache_type == 2) {
+ level++;
+ continue;
+ }
+ max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
+ cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
+ cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
+ table[depth].level = cache_level;
+ table[depth].mask = ((-1) << cache_mask_width);
+ depth++;
+ level++;
+ }
+ }
+};
+
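
// Standalone sketch (illustrative values; a local helper stands in for
// __kmp_cpuid_mask_width) of how the cache masks above are used: two logical
// processors share a cache level iff their APIC ids agree in every bit above
// the cache's mask width.
#include <cassert>
static int mask_width(int count) { // smallest w such that (1 << w) >= count
  int w = 0;
  while ((1 << w) < count)
    ++w;
  return w;
}
int main() {
  int max_threads_sharing = 2; // e.g., an L1 shared by two SMT threads
  unsigned mask = (~0u) << mask_width(max_threads_sharing); // 0xfffffffe
  unsigned apic_a = 0x10, apic_b = 0x11, apic_c = 0x12;
  assert((apic_a & mask) == (apic_b & mask)); // same core: share this L1
  assert((apic_a & mask) != (apic_c & mask)); // different core: do not
  return 0;
}
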
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieving
// the Apic Id for each thread context using the cpuid instruction.
-static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
- kmp_i18n_id_t *const msg_id) {
+static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
kmp_cpuid buf;
- *address2os = NULL;
*msg_id = kmp_i18n_null;
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+ }
+
// Check if cpuid leaf 4 is supported.
__kmp_x86_cpuid(0, 0, &buf);
if (buf.eax < 4) {
*msg_id = kmp_i18n_str_NoLeaf4Support;
- return -1;
+ return false;
}
// The algorithm used starts by setting the affinity to each available thread
@@ -1114,18 +1538,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
__kmp_ncores = __kmp_xproc;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
__kmp_nThreadsPerCore = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (__kmp_affinity_uniform_topology()) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
- return 0;
+ return true;
}
// From here on, we can assume that it is safe to call
@@ -1133,10 +1546,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
// __kmp_affinity_type = affinity_none.
// Save the affinity mask for the current thread.
- kmp_affin_mask_t *oldMask;
- KMP_CPU_ALLOC(oldMask);
- KMP_ASSERT(oldMask != NULL);
- __kmp_get_system_affinity(oldMask, TRUE);
+ kmp_affinity_raii_t previous_affinity;
// Run through each of the available contexts, binding the current thread
// to it, and obtaining the pertinent information using the cpuid instr.
@@ -1180,11 +1590,9 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
// The apic id and max threads per pkg come from cpuid(1).
__kmp_x86_cpuid(1, 0, &buf);
if (((buf.edx >> 9) & 1) == 0) {
- __kmp_set_system_affinity(oldMask, TRUE);
__kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_ApicNotPresent;
- return -1;
+ return false;
}
threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
@@ -1216,11 +1624,9 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
// I've never seen this one happen, but I suppose it could, if the cpuid
// instruction on a chip was really screwed up. Make sure to restore the
// affinity mask before the tail call.
- __kmp_set_system_affinity(oldMask, TRUE);
__kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
+ return false;
}
int maskC = (1 << widthC) - 1;
@@ -1234,50 +1640,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
// We've collected all the info we need.
// Restore the old affinity mask for this thread.
- __kmp_set_system_affinity(oldMask, TRUE);
-
- // If there's only one thread context to bind to, form an Address object
- // with depth 1 and return immediately (or, if affinity is off, set
- // address2os to NULL and return).
- //
- // If it is configured to omit the package level when there is only a single
- // package, the logic at the end of this routine won't work if there is only
- // a single thread - it would try to form an Address object with depth 0.
- KMP_ASSERT(nApics > 0);
- if (nApics == 1) {
- __kmp_ncores = nPackages = 1;
- __kmp_nThreadsPerCore = nCoresPerPkg = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
-
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
-
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
- Address addr(1);
- addr.labels[0] = threadInfo[0].pkgId;
- (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
-
- if (__kmp_affinity_gran_levels < 0) {
- __kmp_affinity_gran_levels = 0;
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
- }
-
- __kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
- return 1;
- }
+ previous_affinity.restore();
// Sort the threadInfo table by physical Id.
qsort(threadInfo, nApics, sizeof(*threadInfo),
@@ -1346,9 +1709,8 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
lastThreadId = threadInfo[i].threadId;
} else {
__kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
- return -1;
+ return false;
}
// Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
@@ -1356,228 +1718,295 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
(prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
__kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_InconsistentCpuidInfo;
- return -1;
+ return false;
}
}
+ // When affinity is off, this routine will still be called to set
+ // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+ // Make sure all these vars are set correctly
nPackages = pkgCt;
if ((int)coreCt > nCoresPerPkg)
nCoresPerPkg = coreCt;
if ((int)threadCt > __kmp_nThreadsPerCore)
__kmp_nThreadsPerCore = threadCt;
-
- // When affinity is off, this routine will still be called to set
- // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
- // Make sure all these vars are set correctly, and return now if affinity is
- // not enabled.
__kmp_ncores = nCores;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (__kmp_affinity_uniform_topology()) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
- for (i = 0; i < nApics; ++i) {
- __kmp_pu_os_idx[i] = threadInfo[i].osId;
- }
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
// Now that we've determined the number of packages, the number of cores per
// package, and the number of threads per core, we can construct the data
// structure that is to be returned.
+ int idx = 0;
int pkgLevel = 0;
- int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
- int threadLevel =
- (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
- unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
+ int coreLevel = 1;
+ int threadLevel = 2;
+ int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
+ kmp_hw_t types[3];
+ if (pkgLevel >= 0)
+ types[idx++] = KMP_HW_SOCKET;
+ if (coreLevel >= 0)
+ types[idx++] = KMP_HW_CORE;
+ if (threadLevel >= 0)
+ types[idx++] = KMP_HW_THREAD;
KMP_ASSERT(depth > 0);
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+ __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
for (i = 0; i < nApics; ++i) {
- Address addr(depth);
+ idx = 0;
unsigned os = threadInfo[i].osId;
- int d = 0;
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ hw_thread.clear();
if (pkgLevel >= 0) {
- addr.labels[d++] = threadInfo[i].pkgId;
+ hw_thread.ids[idx++] = threadInfo[i].pkgId;
}
if (coreLevel >= 0) {
- addr.labels[d++] = threadInfo[i].coreId;
+ hw_thread.ids[idx++] = threadInfo[i].coreId;
}
if (threadLevel >= 0) {
- addr.labels[d++] = threadInfo[i].threadId;
+ hw_thread.ids[idx++] = threadInfo[i].threadId;
}
- (*address2os)[i] = AddrUnsPair(addr, os);
- }
-
- if (__kmp_affinity_gran_levels < 0) {
- // Set the granularity level based on what levels are modeled in the machine
- // topology map.
- __kmp_affinity_gran_levels = 0;
- if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
- __kmp_affinity_gran_levels++;
- }
- if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
- __kmp_affinity_gran_levels++;
- }
- if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
- __kmp_affinity_gran_levels++;
- }
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
- coreLevel, threadLevel);
+ hw_thread.os_id = os;
}
__kmp_free(threadInfo);
- KMP_CPU_FREE(oldMask);
- return depth;
+ __kmp_topology->sort_ids();
+ if (!__kmp_topology->check_ids()) {
+ kmp_topology_t::deallocate(__kmp_topology);
+ __kmp_topology = nullptr;
+ *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
+ return false;
+ }
+ return true;
}
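
// Standalone sketch of the legacy APIC id split used above (field widths are
// illustrative; the runtime derives them from maxThreadsPerPkg and
// maxCoresPerPkg): the low widthT bits are the thread id, the next widthC
// bits the core id, and the remaining high bits the package id.
#include <cassert>
int main() {
  int widthT = 1, widthC = 2; // e.g., 2 threads/core, 4 cores/package
  int maskT = (1 << widthT) - 1;
  int maskC = (1 << widthC) - 1;
  unsigned apicId = 0xd; // binary 1101: pkg 1, core 2, thread 1
  int threadId = apicId & maskT;
  int coreId = (apicId >> widthT) & maskC;
  int pkgId = apicId >> (widthT + widthC);
  assert(threadId == 1 && coreId == 2 && pkgId == 1);
  return 0;
}
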
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
-// based on cpuid leaf 11.
-static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
- kmp_i18n_id_t *const msg_id) {
- kmp_cpuid buf;
- *address2os = NULL;
- *msg_id = kmp_i18n_null;
+// based on CPUID.B or CPUID.1F
+/*
+ *   CPUID.B or 1F, Input ECX (sub leaf # aka level number)
+    Bits            Bits            Bits           Bits
+    31-16           15-8            7-5            4-0
+---+-----------+--------------+-------------+-----------------+
+EAX| reserved  |   reserved   |  reserved   |  Bits to Shift  |
+---+-----------|--------------+-------------+-----------------|
+EBX| reserved  |  Num logical processors at level (16 bits)   |
+---+-----------|--------------+-------------------------------|
+ECX| reserved  |  Level Type  |     Level Number (8 bits)     |
+---+-----------+--------------+-------------------------------|
+EDX|                   X2APIC ID (32 bits)                    |
+---+----------------------------------------------------------+
+*/
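
// Standalone decoding sketch for one sub-leaf of the layout above (the raw
// register values are invented for illustration):
#include <cassert>
int main() {
  unsigned eax = 0x00000001; // bits 4:0 -> bits to shift for this level
  unsigned ebx = 0x00000002; // bits 15:0 -> logical processors at this level
  unsigned ecx = 0x00000100; // bits 15:8 -> level type (1 == SMT)
  unsigned shift = eax & 0x1f;
  unsigned nitems = ebx & 0xffff;
  unsigned level_type = (ecx >> 8) & 0xff;
  assert(shift == 1 && nitems == 2 && level_type == 1);
  return 0;
}
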
+
+enum {
+ INTEL_LEVEL_TYPE_INVALID = 0, // Package level
+ INTEL_LEVEL_TYPE_SMT = 1,
+ INTEL_LEVEL_TYPE_CORE = 2,
+ INTEL_LEVEL_TYPE_TILE = 3,
+ INTEL_LEVEL_TYPE_MODULE = 4,
+ INTEL_LEVEL_TYPE_DIE = 5,
+ INTEL_LEVEL_TYPE_LAST = 6,
+};
- // Check to see if cpuid leaf 11 is supported.
- __kmp_x86_cpuid(0, 0, &buf);
- if (buf.eax < 11) {
- *msg_id = kmp_i18n_str_NoLeaf11Support;
- return -1;
- }
- __kmp_x86_cpuid(11, 0, &buf);
- if (buf.ebx == 0) {
- *msg_id = kmp_i18n_str_NoLeaf11Support;
- return -1;
- }
+struct cpuid_level_info_t {
+ unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
- // Find the number of levels in the machine topology. While we're at it, get
- // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
- // get more accurate values later by explicitly counting them, but get
- // reasonable defaults now, in case we return early.
- int level;
- int threadLevel = -1;
- int coreLevel = -1;
- int pkgLevel = -1;
- __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
+ switch (intel_type) {
+ case INTEL_LEVEL_TYPE_INVALID:
+ return KMP_HW_SOCKET;
+ case INTEL_LEVEL_TYPE_SMT:
+ return KMP_HW_THREAD;
+ case INTEL_LEVEL_TYPE_CORE:
+ return KMP_HW_CORE;
+ case INTEL_LEVEL_TYPE_TILE:
+ return KMP_HW_TILE;
+ case INTEL_LEVEL_TYPE_MODULE:
+ return KMP_HW_MODULE;
+ case INTEL_LEVEL_TYPE_DIE:
+ return KMP_HW_DIE;
+ }
+ return KMP_HW_UNKNOWN;
+}
- for (level = 0;; level++) {
- if (level > 31) {
- // FIXME: Hack for DPD200163180
- //
- // If level is big then something went wrong -> exiting
- //
- // There could actually be 32 valid levels in the machine topology, but so
- // far, the only machine we have seen which does not exit this loop before
- // iteration 32 has fubar x2APIC settings.
- //
- // For now, just reject this case based upon loop trip count.
- *msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
- }
- __kmp_x86_cpuid(11, level, &buf);
- if (buf.ebx == 0) {
- if (pkgLevel < 0) {
- // Will infer nPackages from __kmp_xproc
- pkgLevel = level;
- level++;
+// This function takes the topology leaf, a levels array to store the levels
+// detected, and a bitmap of the known levels.
+// Returns the number of levels in the topology.
+static unsigned
+__kmp_x2apicid_get_levels(int leaf,
+ cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
+ kmp_uint64 known_levels) {
+ unsigned level, levels_index;
+ unsigned level_type, mask_width, nitems;
+ kmp_cpuid buf;
+
+  // The new algorithm has known topology layers act as the highest unknown
+  // topology layers when unknown topology layers exist.
+  // e.g., Suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>,
+  // <Y>, and <Z> are unknown topology layers. Then SMT will take on the
+  // characteristics of (SMT x <X>) and CORE will take on the characteristics
+  // of (CORE x <Y> x <Z>). This eliminates unknown portions of the topology
+  // while still keeping the correct structure.
+ level = levels_index = 0;
+ do {
+ __kmp_x86_cpuid(leaf, level, &buf);
+ level_type = __kmp_extract_bits<8, 15>(buf.ecx);
+ mask_width = __kmp_extract_bits<0, 4>(buf.eax);
+ nitems = __kmp_extract_bits<0, 15>(buf.ebx);
+ if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
+ return 0;
+
+ if (known_levels & (1ull << level_type)) {
+ // Add a new level to the topology
+ KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
+ levels[levels_index].level_type = level_type;
+ levels[levels_index].mask_width = mask_width;
+ levels[levels_index].nitems = nitems;
+ levels_index++;
+ } else {
+ // If it is an unknown level, then logically move the previous layer up
+ if (levels_index > 0) {
+ levels[levels_index - 1].mask_width = mask_width;
+ levels[levels_index - 1].nitems = nitems;
}
- break;
}
- int kind = (buf.ecx >> 8) & 0xff;
- if (kind == 1) {
- // SMT level
- threadLevel = level;
- coreLevel = -1;
- pkgLevel = -1;
- __kmp_nThreadsPerCore = buf.ebx & 0xffff;
- if (__kmp_nThreadsPerCore == 0) {
- *msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
- }
- } else if (kind == 2) {
- // core level
- coreLevel = level;
- pkgLevel = -1;
- nCoresPerPkg = buf.ebx & 0xffff;
- if (nCoresPerPkg == 0) {
- *msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
- }
+ level++;
+ } while (level_type != INTEL_LEVEL_TYPE_INVALID);
+
+ // Set the masks to & with apicid
+ for (unsigned i = 0; i < levels_index; ++i) {
+ if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
+ levels[i].mask = ~((-1) << levels[i].mask_width);
+ levels[i].cache_mask = (-1) << levels[i].mask_width;
+ for (unsigned j = 0; j < i; ++j)
+ levels[i].mask ^= levels[j].mask;
} else {
- if (level <= 0) {
- *msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
- }
- if (pkgLevel >= 0) {
- continue;
- }
- pkgLevel = level;
- nPackages = buf.ebx & 0xffff;
- if (nPackages == 0) {
- *msg_id = kmp_i18n_str_InvalidCpuidInfo;
- return -1;
- }
+ KMP_DEBUG_ASSERT(levels_index > 0);
+ levels[i].mask = (-1) << levels[i - 1].mask_width;
+ levels[i].cache_mask = 0;
}
}
- int depth = level;
+ return levels_index;
+}
+
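
// Worked standalone example of the mask folding above (widths are
// illustrative): with an SMT mask width of 1 and a core mask width of 5,
// APIC id 0x2d decodes to package 1, core 6, thread 1 -- exactly what the
// per-thread loop further down computes via ids[idx] = apic_id & mask,
// followed by a right shift by the previous level's mask width.
#include <cassert>
int main() {
  unsigned smt_width = 1, core_width = 5;
  unsigned smt_mask = ~((~0u) << smt_width);              // 0x01
  unsigned core_mask = ~((~0u) << core_width) ^ smt_mask; // 0x1e
  unsigned pkg_mask = (~0u) << core_width;                // ~0x1f
  unsigned apic_id = 0x2d;
  assert((apic_id & smt_mask) == 1);                 // thread within core
  assert(((apic_id & core_mask) >> smt_width) == 6); // core within package
  assert(((apic_id & pkg_mask) >> core_width) == 1); // package id
  return 0;
}
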
+static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
+
+ cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+ kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
+ unsigned levels_index;
+ kmp_cpuid buf;
+ kmp_uint64 known_levels;
+ int topology_leaf, highest_leaf, apic_id;
+ int num_leaves;
+ static int leaves[] = {0, 0};
+
+ kmp_i18n_id_t leaf_message_id;
+
+ KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
+
+ *msg_id = kmp_i18n_null;
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+ }
+
+ // Figure out the known topology levels
+ known_levels = 0ull;
+ for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
+ if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
+ known_levels |= (1ull << i);
+ }
+ }
+
+ // Get the highest cpuid leaf supported
+ __kmp_x86_cpuid(0, 0, &buf);
+ highest_leaf = buf.eax;
+
+  // If a specific topology method was requested, only allow that specific
+  // leaf; otherwise, try leaves 31 and 11 in that order.
+ num_leaves = 0;
+ if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
+ num_leaves = 1;
+ leaves[0] = 11;
+ leaf_message_id = kmp_i18n_str_NoLeaf11Support;
+ } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
+ num_leaves = 1;
+ leaves[0] = 31;
+ leaf_message_id = kmp_i18n_str_NoLeaf31Support;
+ } else {
+ num_leaves = 2;
+ leaves[0] = 31;
+ leaves[1] = 11;
+ leaf_message_id = kmp_i18n_str_NoLeaf11Support;
+ }
- // In the above loop, "level" was counted from the finest level (usually
- // thread) to the coarsest. The caller expects that we will place the labels
- // in (*address2os)[].first.labels[] in the inverse order, so we need to
- // invert the vars saying which level means what.
- if (threadLevel >= 0) {
- threadLevel = depth - threadLevel - 1;
+ // Check to see if cpuid leaf 31 or 11 is supported.
+ __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+ topology_leaf = -1;
+ for (int i = 0; i < num_leaves; ++i) {
+ int leaf = leaves[i];
+ if (highest_leaf < leaf)
+ continue;
+ __kmp_x86_cpuid(leaf, 0, &buf);
+ if (buf.ebx == 0)
+ continue;
+ topology_leaf = leaf;
+ levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
+ if (levels_index == 0)
+ continue;
+ break;
}
- if (coreLevel >= 0) {
- coreLevel = depth - coreLevel - 1;
+ if (topology_leaf == -1 || levels_index == 0) {
+ *msg_id = leaf_message_id;
+ return false;
}
- KMP_DEBUG_ASSERT(pkgLevel >= 0);
- pkgLevel = depth - pkgLevel - 1;
+ KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
// The algorithm used starts by setting the affinity to each available thread
// and retrieving info from the cpuid instruction, so if we are not capable of
- // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we
- // need to do something else - use the defaults that we calculated from
+// calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
+ // we need to do something else - use the defaults that we calculated from
// issuing cpuid without binding to each proc.
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity_type == affinity_none);
-
+ for (unsigned i = 0; i < levels_index; ++i) {
+ if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+ __kmp_nThreadsPerCore = levels[i].nitems;
+ } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+ nCoresPerPkg = levels[i].nitems;
+ }
+ }
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (__kmp_affinity_uniform_topology()) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
+ return true;
+ }
+
+ // Allocate the data structure to be returned.
+ int depth = levels_index;
+ for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
+ types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
+ __kmp_topology =
+ kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
+
+ // Insert equivalent cache types if they exist
+ kmp_cache_info_t cache_info;
+ for (size_t i = 0; i < cache_info.get_depth(); ++i) {
+ const kmp_cache_info_t::info_t &info = cache_info[i];
+ unsigned cache_mask = info.mask;
+ unsigned cache_level = info.level;
+ for (unsigned j = 0; j < levels_index; ++j) {
+ unsigned hw_cache_mask = levels[j].cache_mask;
+ kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
+ if (hw_cache_mask == cache_mask && j < levels_index - 1) {
+ kmp_hw_t type =
+ __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
+ __kmp_topology->set_equivalent_type(cache_type, type);
}
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
}
- return 0;
}
// From here on, we can assume that it is safe to call
@@ -1585,302 +2014,55 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
// __kmp_affinity_type = affinity_none.
// Save the affinity mask for the current thread.
- kmp_affin_mask_t *oldMask;
- KMP_CPU_ALLOC(oldMask);
- __kmp_get_system_affinity(oldMask, TRUE);
-
- // Allocate the data structure to be returned.
- AddrUnsPair *retval =
- (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+ kmp_affinity_raii_t previous_affinity;
// Run through each of the available contexts, binding the current thread
// to it, and obtaining the pertinent information using the cpuid instr.
unsigned int proc;
- int nApics = 0;
+ int hw_thread_index = 0;
KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
+ cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
+ unsigned my_levels_index;
+
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
continue;
}
- KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
+ KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
__kmp_affinity_dispatch->bind_thread(proc);
- // Extract labels for each level in the machine topology map from Apic ID.
- Address addr(depth);
- int prev_shift = 0;
-
- for (level = 0; level < depth; level++) {
- __kmp_x86_cpuid(11, level, &buf);
- unsigned apicId = buf.edx;
- if (buf.ebx == 0) {
- if (level != depth - 1) {
- KMP_CPU_FREE(oldMask);
- *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
- return -1;
- }
- addr.labels[depth - level - 1] = apicId >> prev_shift;
- level++;
- break;
- }
- int shift = buf.eax & 0x1f;
- int mask = (1 << shift) - 1;
- addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
- prev_shift = shift;
- }
- if (level != depth) {
- KMP_CPU_FREE(oldMask);
- *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
- return -1;
- }
-
- retval[nApics] = AddrUnsPair(addr, proc);
- nApics++;
- }
-
- // We've collected all the info we need.
- // Restore the old affinity mask for this thread.
- __kmp_set_system_affinity(oldMask, TRUE);
-
- // If there's only one thread context to bind to, return now.
- KMP_ASSERT(nApics > 0);
- if (nApics == 1) {
- __kmp_ncores = nPackages = 1;
- __kmp_nThreadsPerCore = nCoresPerPkg = 1;
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- }
-
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(retval);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
-
- // Form an Address object which only includes the package level.
- Address addr(1);
- addr.labels[0] = retval[0].first.labels[pkgLevel];
- retval[0].first = addr;
-
- if (__kmp_affinity_gran_levels < 0) {
- __kmp_affinity_gran_levels = 0;
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
- }
-
- *address2os = retval;
- KMP_CPU_FREE(oldMask);
- return 1;
- }
-
- // Sort the table by physical Id.
- qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
-
- // Find the radix at each of the levels.
- unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
- for (level = 0; level < depth; level++) {
- totals[level] = 1;
- maxCt[level] = 1;
- counts[level] = 1;
- last[level] = retval[0].first.labels[level];
- }
-
- // From here on, the iteration variable "level" runs from the finest level to
- // the coarsest, i.e. we iterate forward through
- // (*address2os)[].first.labels[] - in the previous loops, we iterated
- // backwards.
- for (proc = 1; (int)proc < nApics; proc++) {
- int level;
- for (level = 0; level < depth; level++) {
- if (retval[proc].first.labels[level] != last[level]) {
- int j;
- for (j = level + 1; j < depth; j++) {
- totals[j]++;
- counts[j] = 1;
- // The line below causes printing incorrect topology information in
- // case the max value for some level (maxCt[level]) is encountered
- // earlier than some less value while going through the array. For
- // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then
- // maxCt[1] == 2
- // whereas it must be 4.
- // TODO!!! Check if it can be commented safely
- // maxCt[j] = 1;
- last[j] = retval[proc].first.labels[j];
- }
- totals[level]++;
- counts[level]++;
- if (counts[level] > maxCt[level]) {
- maxCt[level] = counts[level];
- }
- last[level] = retval[proc].first.labels[level];
- break;
- } else if (level == depth - 1) {
- __kmp_free(last);
- __kmp_free(maxCt);
- __kmp_free(counts);
- __kmp_free(totals);
- __kmp_free(retval);
- KMP_CPU_FREE(oldMask);
- *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
- return -1;
- }
- }
- }
-
- // When affinity is off, this routine will still be called to set
- // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
- // Make sure all these vars are set correctly, and return if affinity is not
- // enabled.
- if (threadLevel >= 0) {
- __kmp_nThreadsPerCore = maxCt[threadLevel];
- } else {
- __kmp_nThreadsPerCore = 1;
- }
- nPackages = totals[pkgLevel];
-
- if (coreLevel >= 0) {
- __kmp_ncores = totals[coreLevel];
- nCoresPerPkg = maxCt[coreLevel];
- } else {
- __kmp_ncores = nPackages;
- nCoresPerPkg = 1;
- }
-
- // Check to see if the machine topology is uniform
- unsigned prod = maxCt[0];
- for (level = 1; level < depth; level++) {
- prod *= maxCt[level];
- }
- bool uniform = (prod == totals[level - 1]);
-
- // Print the machine topology summary.
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (uniform) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
-
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
-
- __kmp_str_buf_print(&buf, "%d", totals[0]);
- for (level = 1; level <= pkgLevel; level++) {
- __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
- }
- KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
-
- __kmp_str_buf_free(&buf);
- }
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
- KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
- for (proc = 0; (int)proc < nApics; ++proc) {
- __kmp_pu_os_idx[proc] = retval[proc].second;
- }
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(last);
- __kmp_free(maxCt);
- __kmp_free(counts);
- __kmp_free(totals);
- __kmp_free(retval);
- KMP_CPU_FREE(oldMask);
- return 0;
- }
-
- // Find any levels with radix 1, and remove them from the map
- // (except for the package level).
- int new_depth = 0;
- for (level = 0; level < depth; level++) {
- if ((maxCt[level] == 1) && (level != pkgLevel)) {
- continue;
- }
- new_depth++;
- }
-
- // If we are removing any levels, allocate a new vector to return,
- // and copy the relevant information to it.
- if (new_depth != depth) {
- AddrUnsPair *new_retval =
- (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
- for (proc = 0; (int)proc < nApics; proc++) {
- Address addr(new_depth);
- new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
- }
- int new_level = 0;
- int newPkgLevel = -1;
- int newCoreLevel = -1;
- int newThreadLevel = -1;
- for (level = 0; level < depth; level++) {
- if ((maxCt[level] == 1) && (level != pkgLevel)) {
- // Remove this level. Never remove the package level
- continue;
- }
- if (level == pkgLevel) {
- newPkgLevel = new_level;
- }
- if (level == coreLevel) {
- newCoreLevel = new_level;
- }
- if (level == threadLevel) {
- newThreadLevel = new_level;
- }
- for (proc = 0; (int)proc < nApics; proc++) {
- new_retval[proc].first.labels[new_level] =
- retval[proc].first.labels[level];
+    // Decode this thread's APIC id into one sub id per topology level
+ __kmp_x86_cpuid(topology_leaf, 0, &buf);
+ apic_id = buf.edx;
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
+ my_levels_index =
+ __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
+ if (my_levels_index == 0 || my_levels_index != levels_index) {
+ *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+ return false;
+ }
+ hw_thread.clear();
+ hw_thread.os_id = proc;
+ // Put in topology information
+ for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
+ hw_thread.ids[idx] = apic_id & my_levels[j].mask;
+ if (j > 0) {
+ hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
}
- new_level++;
}
-
- __kmp_free(retval);
- retval = new_retval;
- depth = new_depth;
- pkgLevel = newPkgLevel;
- coreLevel = newCoreLevel;
- threadLevel = newThreadLevel;
+ hw_thread_index++;
}
-
- if (__kmp_affinity_gran_levels < 0) {
- // Set the granularity level based on what levels are modeled
- // in the machine topology map.
- __kmp_affinity_gran_levels = 0;
- if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
- __kmp_affinity_gran_levels++;
- }
- if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
- __kmp_affinity_gran_levels++;
- }
- if (__kmp_affinity_gran > affinity_gran_package) {
- __kmp_affinity_gran_levels++;
- }
+ KMP_ASSERT(hw_thread_index > 0);
+ __kmp_topology->sort_ids();
+ if (!__kmp_topology->check_ids()) {
+ kmp_topology_t::deallocate(__kmp_topology);
+ __kmp_topology = nullptr;
+ *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+ return false;
}
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
- threadLevel);
- }
-
- __kmp_free(last);
- __kmp_free(maxCt);
- __kmp_free(counts);
- __kmp_free(totals);
- KMP_CPU_FREE(oldMask);
- *address2os = retval;
- return depth;
+ return true;
}
-
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#define osIdIndex 0
@@ -1919,7 +2101,7 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
@@ -1934,7 +2116,7 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
__kmp_nThreadsPerCore;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
@@ -1980,15 +2162,36 @@ int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
}
#endif // KMP_USE_HIER_SCHED
+static inline const char *__kmp_cpuinfo_get_filename() {
+ const char *filename;
+ if (__kmp_cpuinfo_file != nullptr)
+ filename = __kmp_cpuinfo_file;
+ else
+ filename = "/proc/cpuinfo";
+ return filename;
+}
+
+static inline const char *__kmp_cpuinfo_get_envvar() {
+ const char *envvar = nullptr;
+ if (__kmp_cpuinfo_file != nullptr)
+ envvar = "KMP_CPUINFO_FILE";
+ return envvar;
+}
+
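
// Standalone sketch of the selection the two helpers above implement (the
// runtime reads KMP_CPUINFO_FILE into __kmp_cpuinfo_file during settings
// parsing; the direct getenv here is just for illustration):
#include <cstdio>
#include <cstdlib>
int main() {
  const char *override_file = std::getenv("KMP_CPUINFO_FILE");
  const char *filename = override_file ? override_file : "/proc/cpuinfo";
  std::printf("would parse: %s\n", filename);
  return 0;
}
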
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
-static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
- int *line,
- kmp_i18n_id_t *const msg_id,
- FILE *f) {
- *address2os = NULL;
+static bool __kmp_affinity_create_cpuinfo_map(int *line,
+ kmp_i18n_id_t *const msg_id) {
+ const char *filename = __kmp_cpuinfo_get_filename();
+ const char *envvar = __kmp_cpuinfo_get_envvar();
*msg_id = kmp_i18n_null;
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
+ }
+
+ kmp_safe_raii_file_t f(filename, "r", envvar);
+
// Scan of the file, and count the number of "processor" (osId) fields,
// and find the highest value of <n> for a node_<n> field.
char buf[256];
@@ -2009,6 +2212,10 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
// FIXME - this will match "node_<n> <garbage>"
unsigned level;
if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
+      // validate the input first:
+ if (level > (unsigned)__kmp_xproc) { // level is too big
+ level = __kmp_xproc;
+ }
if (nodeIdIndex + level >= maxIndex) {
maxIndex = nodeIdIndex + level;
}
@@ -2019,14 +2226,12 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
// Check for empty file / no valid processor records, or too many. The number
// of records can't exceed the number of valid bits in the affinity mask.
if (num_records == 0) {
- *line = 0;
*msg_id = kmp_i18n_str_NoProcRecords;
- return -1;
+ return false;
}
if (num_records > (unsigned)__kmp_xproc) {
- *line = 0;
*msg_id = kmp_i18n_str_TooManyProcRecords;
- return -1;
+ return false;
}
// Set the file pointer back to the beginning, so that we can scan the file
@@ -2035,9 +2240,8 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
// at the end allows us to remove a lot of extra checks for termination
// conditions.
if (fseek(f, 0, SEEK_SET) != 0) {
- *line = 0;
*msg_id = kmp_i18n_str_CantRewindCpuinfo;
- return -1;
+ return false;
}
// Allocate the array of records to store the proc info in. The dummy
@@ -2100,7 +2304,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
if (long_line) { \
CLEANUP_THREAD_INFO; \
*msg_id = kmp_i18n_str_LongLineCpuinfo; \
- return -1; \
+ return false; \
}
}
(*line)++;
@@ -2208,7 +2412,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
if ((int)num_avail == __kmp_xproc) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_TooManyEntries;
- return -1;
+ return false;
}
// Check for missing fields. The osId field must be there, and we
@@ -2216,12 +2420,12 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingProcField;
- return -1;
+ return false;
}
if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingPhysicalIDField;
- return -1;
+ return false;
}
// Skip this proc if it is not included in the machine model.
@@ -2242,12 +2446,12 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
no_val:
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingValCpuinfo;
- return -1;
+ return false;
dup_field:
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
- return -1;
+ return false;
}
*line = 0;
@@ -2257,60 +2461,11 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
// check for num_records == __kmp_xproc ???
- // If there's only one thread context to bind to, form an Address object with
- // depth 1 and return immediately (or, if affinity is off, set address2os to
- // NULL and return).
- //
// If it is configured to omit the package level when there is only a single
// package, the logic at the end of this routine won't work if there is only a
- // single thread - it would try to form an Address object with depth 0.
+  // single thread.
KMP_ASSERT(num_avail > 0);
KMP_ASSERT(num_avail <= num_records);
- if (num_avail == 1) {
- __kmp_ncores = 1;
- __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
- if (__kmp_affinity_verbose) {
- if (!KMP_AFFINITY_CAPABLE()) {
- KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- }
- int index;
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- __kmp_str_buf_print(&buf, "1");
- for (index = maxIndex - 1; index > pkgIdIndex; index--) {
- __kmp_str_buf_print(&buf, " x 1");
- }
- KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
- __kmp_str_buf_free(&buf);
- }
-
- if (__kmp_affinity_type == affinity_none) {
- CLEANUP_THREAD_INFO;
- return 0;
- }
-
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
- Address addr(1);
- addr.labels[0] = threadInfo[0][pkgIdIndex];
- (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
-
- if (__kmp_affinity_gran_levels < 0) {
- __kmp_affinity_gran_levels = 0;
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
- }
-
- CLEANUP_THREAD_INFO;
- return 1;
- }
// Sort the threadInfo table by physical Id.
qsort(threadInfo, num_avail, sizeof(*threadInfo),
@@ -2427,7 +2582,7 @@ restart_radix_check:
__kmp_free(counts);
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
- return -1;
+ return false;
}
// If the thread ids were not specified and we see entries that
@@ -2453,48 +2608,14 @@ restart_radix_check:
nCoresPerPkg = maxCt[coreIdIndex];
nPackages = totals[pkgIdIndex];
- // Check to see if the machine topology is uniform
- unsigned prod = totals[maxIndex];
- for (index = threadIdIndex; index < maxIndex; index++) {
- prod *= maxCt[index];
- }
- bool uniform = (prod == totals[threadIdIndex]);
-
// When affinity is off, this routine will still be called to set
// __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
// Make sure all these vars are set correctly, and return now if affinity is
// not enabled.
__kmp_ncores = totals[coreIdIndex];
-
- if (__kmp_affinity_verbose) {
- if (!KMP_AFFINITY_CAPABLE()) {
- KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (uniform) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
- } else {
- KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
- KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
- if (uniform) {
- KMP_INFORM(Uniform, "KMP_AFFINITY");
- } else {
- KMP_INFORM(NonUniform, "KMP_AFFINITY");
- }
- }
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
-
- __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
- for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
- __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
- }
- KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
- maxCt[threadIdIndex], __kmp_ncores);
-
- __kmp_str_buf_free(&buf);
+ if (!KMP_AFFINITY_CAPABLE()) {
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ return true;
}
#if KMP_MIC && REDUCE_TEAM_SIZE
@@ -2507,21 +2628,7 @@ restart_radix_check:
}
#endif // KMP_MIC && REDUCE_TEAM_SIZE
- KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
- __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
- for (i = 0; i < num_avail; ++i) { // fill the os indices
- __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
- }
-
- if (__kmp_affinity_type == affinity_none) {
- __kmp_free(lastId);
- __kmp_free(totals);
- __kmp_free(maxCt);
- __kmp_free(counts);
- CLEANUP_THREAD_INFO;
- return 0;
- }
// Count the number of levels which have more nodes at that level than at the
// parent's level (with there being an implicit root node of the top level).
@@ -2535,78 +2642,59 @@ restart_radix_check:
}
inMap[maxIndex] = (totals[maxIndex] > 1);
inMap[pkgIdIndex] = true;
+ inMap[coreIdIndex] = true;
+ inMap[threadIdIndex] = true;
int depth = 0;
+ int idx = 0;
+ kmp_hw_t types[KMP_HW_LAST];
+ int pkgLevel = -1;
+ int coreLevel = -1;
+ int threadLevel = -1;
for (index = threadIdIndex; index <= maxIndex; index++) {
if (inMap[index]) {
depth++;
}
}
+ if (inMap[pkgIdIndex]) {
+ pkgLevel = idx;
+ types[idx++] = KMP_HW_SOCKET;
+ }
+ if (inMap[coreIdIndex]) {
+ coreLevel = idx;
+ types[idx++] = KMP_HW_CORE;
+ }
+ if (inMap[threadIdIndex]) {
+ threadLevel = idx;
+ types[idx++] = KMP_HW_THREAD;
+ }
KMP_ASSERT(depth > 0);
// Construct the data structure that is to be returned.
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
- int pkgLevel = -1;
- int coreLevel = -1;
- int threadLevel = -1;
+ __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
for (i = 0; i < num_avail; ++i) {
- Address addr(depth);
unsigned os = threadInfo[i][osIdIndex];
int src_index;
int dst_index = 0;
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ hw_thread.clear();
+ hw_thread.os_id = os;
+ idx = 0;
for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
if (!inMap[src_index]) {
continue;
}
- addr.labels[dst_index] = threadInfo[i][src_index];
if (src_index == pkgIdIndex) {
- pkgLevel = dst_index;
+ hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
} else if (src_index == coreIdIndex) {
- coreLevel = dst_index;
+ hw_thread.ids[coreLevel] = threadInfo[i][src_index];
} else if (src_index == threadIdIndex) {
- threadLevel = dst_index;
+ hw_thread.ids[threadLevel] = threadInfo[i][src_index];
}
dst_index++;
}
- (*address2os)[i] = AddrUnsPair(addr, os);
- }
-
- if (__kmp_affinity_gran_levels < 0) {
- // Set the granularity level based on what levels are modeled
- // in the machine topology map.
- unsigned src_index;
- __kmp_affinity_gran_levels = 0;
- for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
- if (!inMap[src_index]) {
- continue;
- }
- switch (src_index) {
- case threadIdIndex:
- if (__kmp_affinity_gran > affinity_gran_thread) {
- __kmp_affinity_gran_levels++;
- }
-
- break;
- case coreIdIndex:
- if (__kmp_affinity_gran > affinity_gran_core) {
- __kmp_affinity_gran_levels++;
- }
- break;
-
- case pkgIdIndex:
- if (__kmp_affinity_gran > affinity_gran_package) {
- __kmp_affinity_gran_levels++;
- }
- break;
- }
- }
- }
-
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
- coreLevel, threadLevel);
}
__kmp_free(inMap);
@@ -2615,27 +2703,32 @@ restart_radix_check:
__kmp_free(maxCt);
__kmp_free(counts);
CLEANUP_THREAD_INFO;
- return depth;
+ __kmp_topology->sort_ids();
+ if (!__kmp_topology->check_ids()) {
+ kmp_topology_t::deallocate(__kmp_topology);
+ __kmp_topology = nullptr;
+ *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
+ return false;
+ }
+ return true;
}
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
- unsigned *numUnique,
- AddrUnsPair *address2os,
- unsigned numAddrs) {
+ unsigned *numUnique) {
// First form a table of affinity masks in order of OS thread id.
- unsigned depth;
- unsigned maxOsId;
- unsigned i;
-
- KMP_ASSERT(numAddrs > 0);
- depth = address2os[0].first.depth;
+ int maxOsId;
+ int i;
+ int numAddrs = __kmp_topology->get_num_hw_threads();
+ int depth = __kmp_topology->get_depth();
+ KMP_ASSERT(numAddrs);
+ KMP_ASSERT(depth);
maxOsId = 0;
for (i = numAddrs - 1;; --i) {
- unsigned osId = address2os[i].second;
+ int osId = __kmp_topology->at(i).os_id;
if (osId > maxOsId) {
maxOsId = osId;
}
@@ -2644,12 +2737,6 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
}
kmp_affin_mask_t *osId2Mask;
KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
-
- // Sort the address2os table according to physical order. Doing so will put
- // all threads on the same core/package/node in consecutive locations.
- qsort(address2os, numAddrs, sizeof(*address2os),
- __kmp_affinity_cmp_Address_labels);
-
KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
@@ -2662,52 +2749,50 @@ static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
}
// Run through the table, forming the masks for all threads on each core.
- // Threads on the same core will have identical "Address" objects, not
+ // Threads on the same core will have identical kmp_hw_thread_t objects, not
// considering the last level, which must be the thread id. All threads on a
// core will appear consecutively.
- unsigned unique = 0;
- unsigned j = 0; // index of 1st thread on core
- unsigned leader = 0;
- Address *leaderAddr = &(address2os[0].first);
+ int unique = 0;
+ int j = 0; // index of 1st thread on core
+ int leader = 0;
kmp_affin_mask_t *sum;
KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
- KMP_CPU_SET(address2os[0].second, sum);
+ KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
for (i = 1; i < numAddrs; i++) {
// If this thread is sufficiently close to the leader (within the
// granularity setting), then set the bit for this os thread in the
// affinity mask for this group, and go on to the next thread.
- if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
- KMP_CPU_SET(address2os[i].second, sum);
+ if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
+ KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
continue;
}
// For every thread in this group, copy the mask to the thread's entry in
// the osId2Mask table. Mark the first address as a leader.
for (; j < i; j++) {
- unsigned osId = address2os[j].second;
+ int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
KMP_CPU_COPY(mask, sum);
- address2os[j].first.leader = (j == leader);
+ __kmp_topology->at(j).leader = (j == leader);
}
unique++;
// Start a new mask.
leader = i;
- leaderAddr = &(address2os[i].first);
KMP_CPU_ZERO(sum);
- KMP_CPU_SET(address2os[i].second, sum);
+ KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
}
// For every thread in last group, copy the mask to the thread's
// entry in the osId2Mask table.
for (; j < i; j++) {
- unsigned osId = address2os[j].second;
+ int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
KMP_CPU_COPY(mask, sum);
- address2os[j].first.leader = (j == leader);
+ __kmp_topology->at(j).leader = (j == leader);
}
unique++;
KMP_CPU_FREE_FROM_STACK(sum);
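
// Standalone sketch of the grouping loop above (plain structs and a bitmask
// stand in for kmp_hw_thread_t and kmp_affin_mask_t): consecutive hw threads
// whose ids agree above the granularity level are OR'ed into one mask.
#include <cassert>
#include <vector>
struct hw_t {
  int os_id;
  int ids[3]; // socket, core, thread
};
static bool is_close(const hw_t &a, const hw_t &b, int gran_levels) {
  for (int d = 0; d < 3 - gran_levels; ++d) // ignore last gran_levels ids
    if (a.ids[d] != b.ids[d])
      return false;
  return true;
}
int main() {
  // One socket, two cores, two threads per core; granularity=core means
  // gran_levels == 1, so the thread id is ignored when comparing.
  std::vector<hw_t> t = {{0, {0, 0, 0}}, {2, {0, 0, 1}},
                         {1, {0, 1, 0}}, {3, {0, 1, 1}}};
  int gran_levels = 1;
  std::vector<unsigned long> masks;
  size_t leader = 0;
  unsigned long sum = 1ul << t[0].os_id;
  for (size_t i = 1; i < t.size(); ++i) {
    if (is_close(t[leader], t[i], gran_levels)) {
      sum |= 1ul << t[i].os_id;
      continue;
    }
    masks.push_back(sum); // close out the previous group
    leader = i;
    sum = 1ul << t[i].os_id;
  }
  masks.push_back(sum);
  assert(masks.size() == 2);
  assert(masks[0] == 0x5ul); // os ids 0 and 2 share core 0
  assert(masks[1] == 0xaul); // os ids 1 and 3 share core 1
  return 0;
}
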
@@ -3297,647 +3382,15 @@ void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
#undef ADD_MASK
#undef ADD_MASK_OSID
-#if KMP_USE_HWLOC
-static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
- // skip PUs descendants of the object o
- int skipped = 0;
- hwloc_obj_t hT = NULL;
- int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
- for (int i = 0; i < N; ++i) {
- KMP_DEBUG_ASSERT(hT);
- unsigned idx = hT->os_index;
- if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
- KMP_CPU_CLR(idx, __kmp_affin_fullMask);
- KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
- ++skipped;
- }
- hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
- }
- return skipped; // count number of skipped units
-}
-
-static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
- // check if obj has PUs present in fullMask
- hwloc_obj_t hT = NULL;
- int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
- for (int i = 0; i < N; ++i) {
- KMP_DEBUG_ASSERT(hT);
- unsigned idx = hT->os_index;
- if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
- return 1; // found PU
- hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
- }
- return 0; // no PUs found
-}
-#endif // KMP_USE_HWLOC
-
-static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
- AddrUnsPair *newAddr;
- if (__kmp_hws_requested == 0)
- goto _exit; // no topology limiting actions requested, exit
-#if KMP_USE_HWLOC
- if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
- // Number of subobjects calculated dynamically, this works fine for
- // any non-uniform topology.
- // L2 cache objects are determined by depth, other objects - by type.
- hwloc_topology_t tp = __kmp_hwloc_topology;
- int nS = 0, nN = 0, nL = 0, nC = 0,
- nT = 0; // logical index including skipped
- int nCr = 0, nTr = 0; // number of requested units
- int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
- hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
- int L2depth, idx;
-
- // check support of extensions ----------------------------------
- int numa_support = 0, tile_support = 0;
- if (__kmp_pu_os_idx)
- hT = hwloc_get_pu_obj_by_os_index(tp,
- __kmp_pu_os_idx[__kmp_avail_proc - 1]);
- else
- hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
- if (hT == NULL) { // something's gone wrong
- KMP_WARNING(AffHWSubsetUnsupported);
- goto _exit;
- }
- // check NUMA node
- hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
- hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
- if (hN != NULL && hN->depth > hS->depth) {
- numa_support = 1; // 1 in case socket includes node(s)
- } else if (__kmp_hws_node.num > 0) {
- // don't support sockets inside NUMA node (no such HW found for testing)
- KMP_WARNING(AffHWSubsetUnsupported);
- goto _exit;
- }
- // check L2 cahce, get object by depth because of multiple caches
- L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
- hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
- if (hL != NULL &&
- __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
- tile_support = 1; // no sense to count L2 if it includes single core
- } else if (__kmp_hws_tile.num > 0) {
- if (__kmp_hws_core.num == 0) {
- __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
- __kmp_hws_tile.num = 0;
- } else {
- // L2 and core are both requested, but represent same object
- KMP_WARNING(AffHWSubsetInvalid);
- goto _exit;
- }
- }
- // end of check of extensions -----------------------------------
-
- // fill in unset items, validate settings -----------------------
- if (__kmp_hws_socket.num == 0)
- __kmp_hws_socket.num = nPackages; // use all available sockets
- if (__kmp_hws_socket.offset >= nPackages) {
- KMP_WARNING(AffHWSubsetManySockets);
- goto _exit;
- }
- if (numa_support) {
- hN = NULL;
- int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
- &hN); // num nodes in socket
- if (__kmp_hws_node.num == 0)
- __kmp_hws_node.num = NN; // use all available nodes
- if (__kmp_hws_node.offset >= NN) {
- KMP_WARNING(AffHWSubsetManyNodes);
- goto _exit;
- }
- if (tile_support) {
- // get num tiles in node
- int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
- if (__kmp_hws_tile.num == 0) {
- __kmp_hws_tile.num = NL + 1;
- } // use all available tiles, some node may have more tiles, thus +1
- if (__kmp_hws_tile.offset >= NL) {
- KMP_WARNING(AffHWSubsetManyTiles);
- goto _exit;
- }
- int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
- &hC); // num cores in tile
- if (__kmp_hws_core.num == 0)
- __kmp_hws_core.num = NC; // use all available cores
- if (__kmp_hws_core.offset >= NC) {
- KMP_WARNING(AffHWSubsetManyCores);
- goto _exit;
- }
- } else { // tile_support
- int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
- &hC); // num cores in node
- if (__kmp_hws_core.num == 0)
- __kmp_hws_core.num = NC; // use all available cores
- if (__kmp_hws_core.offset >= NC) {
- KMP_WARNING(AffHWSubsetManyCores);
- goto _exit;
- }
- } // tile_support
- } else { // numa_support
- if (tile_support) {
- // get num tiles in socket
- int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
- if (__kmp_hws_tile.num == 0)
- __kmp_hws_tile.num = NL; // use all available tiles
- if (__kmp_hws_tile.offset >= NL) {
- KMP_WARNING(AffHWSubsetManyTiles);
- goto _exit;
- }
- int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
- &hC); // num cores in tile
- if (__kmp_hws_core.num == 0)
- __kmp_hws_core.num = NC; // use all available cores
- if (__kmp_hws_core.offset >= NC) {
- KMP_WARNING(AffHWSubsetManyCores);
- goto _exit;
- }
- } else { // tile_support
- int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
- &hC); // num cores in socket
- if (__kmp_hws_core.num == 0)
- __kmp_hws_core.num = NC; // use all available cores
- if (__kmp_hws_core.offset >= NC) {
- KMP_WARNING(AffHWSubsetManyCores);
- goto _exit;
- }
- } // tile_support
- }
- if (__kmp_hws_proc.num == 0)
- __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
- if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
- KMP_WARNING(AffHWSubsetManyProcs);
- goto _exit;
- }
- // end of validation --------------------------------------------
-
- if (pAddr) // pAddr is NULL in case of affinity_none
- newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
- __kmp_avail_proc); // max size
- // main loop to form HW subset ----------------------------------
- hS = NULL;
- int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
- for (int s = 0; s < NP; ++s) {
- // Check Socket -----------------------------------------------
- hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
- if (!__kmp_hwloc_obj_has_PUs(tp, hS))
- continue; // skip socket if all PUs are out of fullMask
- ++nS; // only count objects those have PUs in affinity mask
- if (nS <= __kmp_hws_socket.offset ||
- nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
- continue; // move to next socket
- }
- nCr = 0; // count number of cores per socket
- // socket requested, go down the topology tree
- // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
- if (numa_support) {
- nN = 0;
- hN = NULL;
- // num nodes in current socket
- int NN =
- __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
- for (int n = 0; n < NN; ++n) {
- // Check NUMA Node ----------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
- hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
- continue; // skip node if all PUs are out of fullMask
- }
- ++nN;
- if (nN <= __kmp_hws_node.offset ||
- nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
- // skip node as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
- hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
- continue; // move to next node
- }
- // node requested, go down the topology tree
- if (tile_support) {
- nL = 0;
- hL = NULL;
- int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
- for (int l = 0; l < NL; ++l) {
- // Check L2 (tile) ------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- continue; // skip tile if all PUs are out of fullMask
- }
- ++nL;
- if (nL <= __kmp_hws_tile.offset ||
- nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
- // skip tile as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- continue; // move to next tile
- }
- // tile requested, go down the topology tree
- nC = 0;
- hC = NULL;
- // num cores in current tile
- int NC = __kmp_hwloc_count_children_by_type(tp, hL,
- HWLOC_OBJ_CORE, &hC);
- for (int c = 0; c < NC; ++c) {
- // Check Core ---------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // skip core if all PUs are out of fullMask
- }
- ++nC;
- if (nC <= __kmp_hws_core.offset ||
- nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
- // skip node as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // move to next node
- }
- // core requested, go down to PUs
- nT = 0;
- nTr = 0;
- hT = NULL;
- // num procs in current core
- int NT = __kmp_hwloc_count_children_by_type(tp, hC,
- HWLOC_OBJ_PU, &hT);
- for (int t = 0; t < NT; ++t) {
- // Check PU ---------------------------------------
- idx = hT->os_index;
- if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // skip PU if not in fullMask
- }
- ++nT;
- if (nT <= __kmp_hws_proc.offset ||
- nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
- // skip PU
- KMP_CPU_CLR(idx, __kmp_affin_fullMask);
- ++n_old;
- KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // move to next node
- }
- ++nTr;
- if (pAddr) // collect requested thread's data
- newAddr[n_new] = (*pAddr)[n_old];
- ++n_new;
- ++n_old;
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- } // threads loop
- if (nTr > 0) {
- ++nCr; // num cores per socket
- ++nCo; // total num cores
- if (nTr > nTpC)
- nTpC = nTr; // calc max threads per core
- }
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- } // cores loop
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- } // tiles loop
- } else { // tile_support
- // no tiles, check cores
- nC = 0;
- hC = NULL;
- // num cores in current node
- int NC =
- __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
- for (int c = 0; c < NC; ++c) {
- // Check Core ---------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // skip core if all PUs are out of fullMask
- }
- ++nC;
- if (nC <= __kmp_hws_core.offset ||
- nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
- // skip node as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // move to next node
- }
- // core requested, go down to PUs
- nT = 0;
- nTr = 0;
- hT = NULL;
- int NT =
- __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
- for (int t = 0; t < NT; ++t) {
- // Check PU ---------------------------------------
- idx = hT->os_index;
- if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // skip PU if not in fullMask
- }
- ++nT;
- if (nT <= __kmp_hws_proc.offset ||
- nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
- // skip PU
- KMP_CPU_CLR(idx, __kmp_affin_fullMask);
- ++n_old;
- KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // move to next node
- }
- ++nTr;
- if (pAddr) // collect requested thread's data
- newAddr[n_new] = (*pAddr)[n_old];
- ++n_new;
- ++n_old;
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- } // threads loop
- if (nTr > 0) {
- ++nCr; // num cores per socket
- ++nCo; // total num cores
- if (nTr > nTpC)
- nTpC = nTr; // calc max threads per core
- }
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- } // cores loop
- } // tiles support
- hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
- } // nodes loop
- } else { // numa_support
- // no NUMA support
- if (tile_support) {
- nL = 0;
- hL = NULL;
- // num tiles in current socket
- int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
- for (int l = 0; l < NL; ++l) {
- // Check L2 (tile) ------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- continue; // skip tile if all PUs are out of fullMask
- }
- ++nL;
- if (nL <= __kmp_hws_tile.offset ||
- nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
- // skip tile as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- continue; // move to next tile
- }
- // tile requested, go down the topology tree
- nC = 0;
- hC = NULL;
- // num cores per tile
- int NC =
- __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
- for (int c = 0; c < NC; ++c) {
- // Check Core ---------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // skip core if all PUs are out of fullMask
- }
- ++nC;
- if (nC <= __kmp_hws_core.offset ||
- nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
- // skip node as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // move to next node
- }
- // core requested, go down to PUs
- nT = 0;
- nTr = 0;
- hT = NULL;
- // num procs per core
- int NT =
- __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
- for (int t = 0; t < NT; ++t) {
- // Check PU ---------------------------------------
- idx = hT->os_index;
- if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // skip PU if not in fullMask
- }
- ++nT;
- if (nT <= __kmp_hws_proc.offset ||
- nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
- // skip PU
- KMP_CPU_CLR(idx, __kmp_affin_fullMask);
- ++n_old;
- KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // move to next node
- }
- ++nTr;
- if (pAddr) // collect requested thread's data
- newAddr[n_new] = (*pAddr)[n_old];
- ++n_new;
- ++n_old;
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- } // threads loop
- if (nTr > 0) {
- ++nCr; // num cores per socket
- ++nCo; // total num cores
- if (nTr > nTpC)
- nTpC = nTr; // calc max threads per core
- }
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- } // cores loop
- hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
- } // tiles loop
- } else { // tile_support
- // no tiles, check cores
- nC = 0;
- hC = NULL;
- // num cores in socket
- int NC =
- __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
- for (int c = 0; c < NC; ++c) {
- // Check Core -------------------------------------------
- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // skip core if all PUs are out of fullMask
- }
- ++nC;
- if (nC <= __kmp_hws_core.offset ||
- nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
- // skip node as not requested
- n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- continue; // move to next node
- }
- // core requested, go down to PUs
- nT = 0;
- nTr = 0;
- hT = NULL;
- // num procs per core
- int NT =
- __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
- for (int t = 0; t < NT; ++t) {
- // Check PU ---------------------------------------
- idx = hT->os_index;
- if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // skip PU if not in fullMask
- }
- ++nT;
- if (nT <= __kmp_hws_proc.offset ||
- nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
- // skip PU
- KMP_CPU_CLR(idx, __kmp_affin_fullMask);
- ++n_old;
- KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- continue; // move to next node
- }
- ++nTr;
- if (pAddr) // collect requested thread's data
- newAddr[n_new] = (*pAddr)[n_old];
- ++n_new;
- ++n_old;
- hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
- } // threads loop
- if (nTr > 0) {
- ++nCr; // num cores per socket
- ++nCo; // total num cores
- if (nTr > nTpC)
- nTpC = nTr; // calc max threads per core
- }
- hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
- } // cores loop
- } // tiles support
- } // numa_support
- if (nCr > 0) { // found cores?
- ++nPkg; // num sockets
- if (nCr > nCpP)
- nCpP = nCr; // calc max cores per socket
- }
- } // sockets loop
-
- // check the subset is valid
- KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
- KMP_DEBUG_ASSERT(nPkg > 0);
- KMP_DEBUG_ASSERT(nCpP > 0);
- KMP_DEBUG_ASSERT(nTpC > 0);
- KMP_DEBUG_ASSERT(nCo > 0);
- KMP_DEBUG_ASSERT(nPkg <= nPackages);
- KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
- KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
- KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
-
- nPackages = nPkg; // correct num sockets
- nCoresPerPkg = nCpP; // correct num cores per socket
- __kmp_nThreadsPerCore = nTpC; // correct num threads per core
- __kmp_avail_proc = n_new; // correct num procs
- __kmp_ncores = nCo; // correct num cores
- // hwloc topology method end
- } else
-#endif // KMP_USE_HWLOC
- {
- int n_old = 0, n_new = 0, proc_num = 0;
- if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
- KMP_WARNING(AffHWSubsetNoHWLOC);
- goto _exit;
- }
- if (__kmp_hws_socket.num == 0)
- __kmp_hws_socket.num = nPackages; // use all available sockets
- if (__kmp_hws_core.num == 0)
- __kmp_hws_core.num = nCoresPerPkg; // use all available cores
- if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
- __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
- if (!__kmp_affinity_uniform_topology()) {
- KMP_WARNING(AffHWSubsetNonUniform);
- goto _exit; // don't support non-uniform topology
- }
- if (depth > 3) {
- KMP_WARNING(AffHWSubsetNonThreeLevel);
- goto _exit; // don't support not-3-level topology
- }
- if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
- KMP_WARNING(AffHWSubsetManySockets);
- goto _exit;
- }
- if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
- KMP_WARNING(AffHWSubsetManyCores);
- goto _exit;
- }
- // Form the requested subset
- if (pAddr) // pAddr is NULL in case of affinity_none
- newAddr = (AddrUnsPair *)__kmp_allocate(
- sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
- __kmp_hws_proc.num);
- for (int i = 0; i < nPackages; ++i) {
- if (i < __kmp_hws_socket.offset ||
- i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
- // skip not-requested socket
- n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
- if (__kmp_pu_os_idx != NULL) {
- // walk through skipped socket
- for (int j = 0; j < nCoresPerPkg; ++j) {
- for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
- KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
- ++proc_num;
- }
- }
- }
- } else {
- // walk through requested socket
- for (int j = 0; j < nCoresPerPkg; ++j) {
- if (j < __kmp_hws_core.offset ||
- j >= __kmp_hws_core.offset +
- __kmp_hws_core.num) { // skip not-requested core
- n_old += __kmp_nThreadsPerCore;
- if (__kmp_pu_os_idx != NULL) {
- for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
- KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
- ++proc_num;
- }
- }
- } else {
- // walk through requested core
- for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
- if (k < __kmp_hws_proc.num) {
- if (pAddr) // collect requested thread's data
- newAddr[n_new] = (*pAddr)[n_old];
- n_new++;
- } else {
- if (__kmp_pu_os_idx != NULL)
- KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
- }
- n_old++;
- ++proc_num;
- }
- }
- }
- }
- }
- KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
- KMP_DEBUG_ASSERT(n_new ==
- __kmp_hws_socket.num * __kmp_hws_core.num *
- __kmp_hws_proc.num);
- nPackages = __kmp_hws_socket.num; // correct nPackages
- nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
- __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
- __kmp_avail_proc = n_new; // correct avail_proc
- __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
- } // non-hwloc topology method
- if (pAddr) {
- __kmp_free(*pAddr);
- *pAddr = newAddr; // replace old topology with new one
- }
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- __kmp_str_buf_print(&buf, "%d", nPackages);
- KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
- __kmp_str_buf_free(&buf);
- }
-_exit:
- if (__kmp_pu_os_idx != NULL) {
- __kmp_free(__kmp_pu_os_idx);
- __kmp_pu_os_idx = NULL;
- }
-}
-
// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
-static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
- int nprocs, int bottom_level) {
+static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
int core_level = 0;
for (int i = 0; i < nprocs; i++) {
+ const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
for (int j = bottom_level; j > 0; j--) {
- if (address2os[i].first.labels[j] > 0) {
+ if (hw_thread.ids[j] > 0) {
if (core_level < (j - 1)) {
core_level = j - 1;
}
@@ -3948,83 +3401,42 @@ static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
}
// This function counts number of clusters/cores at given level.
-static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
- int nprocs, int bottom_level,
+static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
int core_level) {
- int ncores = 0;
- int i, j;
-
- j = bottom_level;
- for (i = 0; i < nprocs; i++) {
- for (j = bottom_level; j > core_level; j--) {
- if ((i + 1) < nprocs) {
- if (address2os[i + 1].first.labels[j] > 0) {
+ return __kmp_topology->get_count(core_level);
+}
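The rewritten __kmp_affinity_compute_ncores now simply reads the per-level count recorded during canonicalization. For instance, assuming count[] = {2, 8, 16} for a [packages | cores | threads] topology:

// get_count(core_level = 1) == 8, the same total that the removed
// label-walking loop would have counted.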
+// This function finds the cluster/core that a given processing unit is bound to.
+static int __kmp_affinity_find_core(int proc, int bottom_level,
+ int core_level) {
+ int core = 0;
+ KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
+ for (int i = 0; i <= proc; ++i) {
+ if (i + 1 <= proc) {
+ for (int j = 0; j <= core_level; ++j) {
+ if (__kmp_topology->at(i + 1).sub_ids[j] !=
+ __kmp_topology->at(i).sub_ids[j]) {
+ core++;
break;
}
}
}
- if (j == core_level) {
- ncores++;
- }
- }
- if (j > core_level) {
- // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one
- // core. May occur when called from __kmp_affinity_find_core().
- ncores++;
}
- return ncores;
-}
-
-// This function finds to which cluster/core given processing unit is bound.
-static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
- int bottom_level, int core_level) {
- return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
- core_level) -
- 1;
+ return core;
}
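The replacement counts how many times any sub_id at or above the core level changes while walking the sorted table up to proc. A worked example, assuming a two-level table with sub_ids = {package, core} and core_level = 1:

// processors:      0      1      2      3
// sub_ids:      {0,0}  {0,0}  {0,1}  {0,1}
// __kmp_affinity_find_core(3, ...): the sub_id prefix changes once,
// between processors 1 and 2, so the result is core index 1.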
// This function finds maximal number of processing units bound to a
// cluster/core at given level.
-static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
- int nprocs, int bottom_level,
+static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
int core_level) {
- int maxprocpercore = 0;
-
- if (core_level < bottom_level) {
- for (int i = 0; i < nprocs; i++) {
- int percore = address2os[i].first.labels[core_level + 1] + 1;
-
- if (percore > maxprocpercore) {
- maxprocpercore = percore;
- }
- }
- } else {
- maxprocpercore = 1;
- }
- return maxprocpercore;
+ if (core_level >= bottom_level)
+ return 1;
+ int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
+ return __kmp_topology->calculate_ratio(thread_level, core_level);
}
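calculate_ratio multiplies the per-level ratios between the two levels, so on a canonicalized table it reproduces the bound that the removed per-thread scan computed. A worked example, assuming a machine described as [2 packages | 4 cores/pkg | 2 threads/core]:

// ratio[] = {2, 4, 2}, thread_level = 2, core_level = 1
// calculate_ratio(2, 1) = ratio[2] = 2, so at most two processing
// units share a core and __kmp_affinity_max_proc_per_core returns 2.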
-static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
-#if KMP_USE_HIER_SCHED
-#define KMP_EXIT_AFF_NONE \
- KMP_ASSERT(__kmp_affinity_type == affinity_none); \
- KMP_ASSERT(address2os == NULL); \
- __kmp_apply_thread_places(NULL, 0); \
- __kmp_create_affinity_none_places(); \
- __kmp_dispatch_set_hierarchy_values(); \
- return;
-#else
-#define KMP_EXIT_AFF_NONE \
- KMP_ASSERT(__kmp_affinity_type == affinity_none); \
- KMP_ASSERT(address2os == NULL); \
- __kmp_apply_thread_places(NULL, 0); \
- __kmp_create_affinity_none_places(); \
- return;
-#endif
-
// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places() {
@@ -4036,31 +3448,6 @@ static void __kmp_create_affinity_none_places() {
KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}
-static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
- const Address *aa = &(((const AddrUnsPair *)a)->first);
- const Address *bb = &(((const AddrUnsPair *)b)->first);
- unsigned depth = aa->depth;
- unsigned i;
- KMP_DEBUG_ASSERT(depth == bb->depth);
- KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
- KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
- for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
- int j = depth - i - 1;
- if (aa->childNums[j] < bb->childNums[j])
- return -1;
- if (aa->childNums[j] > bb->childNums[j])
- return 1;
- }
- for (; i < depth; i++) {
- int j = i - __kmp_affinity_compact;
- if (aa->childNums[j] < bb->childNums[j])
- return -1;
- if (aa->childNums[j] > bb->childNums[j])
- return 1;
- }
- return 0;
-}
-
static void __kmp_aux_affinity_initialize(void) {
if (__kmp_affinity_masks != NULL) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
@@ -4120,14 +3507,6 @@ static void __kmp_aux_affinity_initialize(void) {
}
}
- if (__kmp_affinity_gran == affinity_gran_tile &&
- // check if user's request is valid
- __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
- KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
- __kmp_affinity_gran = affinity_gran_package;
- }
-
- int depth = -1;
kmp_i18n_id_t msg_id = kmp_i18n_null;
// For backward compatibility, setting KMP_CPUINFO_FILE =>
@@ -4137,23 +3516,17 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_top_method = affinity_top_method_cpuinfo;
}
+ bool success = false;
if (__kmp_affinity_top_method == affinity_top_method_all) {
- // In the default code path, errors are not fatal - we just try using
- // another method. We only emit a warning message if affinity is on, or the
- // verbose flag is set, and the nowarnings flag was not set.
- const char *file_name = NULL;
- int line = 0;
+// In the default code path, errors are not fatal - we just try using
+// another method. We only emit a warning message if affinity is on, or the
+// verbose flag is set, and the nowarnings flag was not set.
#if KMP_USE_HWLOC
- if (depth < 0 &&
+ if (!success &&
__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
- }
if (!__kmp_hwloc_error) {
- depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- } else if (depth < 0 && __kmp_affinity_verbose) {
+ success = __kmp_affinity_create_hwloc_map(&msg_id);
+ if (!success && __kmp_affinity_verbose) {
KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
}
} else if (__kmp_affinity_verbose) {
@@ -4163,165 +3536,85 @@ static void __kmp_aux_affinity_initialize(void) {
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
- if (depth < 0) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+ if (!success) {
+ success = __kmp_affinity_create_x2apicid_map(&msg_id);
+ if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
}
-
- file_name = NULL;
- depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
-
- if (depth < 0) {
- if (__kmp_affinity_verbose) {
- if (msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
- __kmp_i18n_catgets(msg_id),
- KMP_I18N_STR(DecodingLegacyAPIC));
- } else {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
- KMP_I18N_STR(DecodingLegacyAPIC));
- }
- }
-
- file_name = NULL;
- depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
+ }
+ if (!success) {
+ success = __kmp_affinity_create_apicid_map(&msg_id);
+ if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
}
}
-
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#if KMP_OS_LINUX
-
- if (depth < 0) {
- if (__kmp_affinity_verbose) {
- if (msg_id != kmp_i18n_null) {
- KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
- __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
- } else {
- KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
- }
- }
-
- kmp_safe_raii_file_t f("/proc/cpuinfo", "r");
- depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
+ if (!success) {
+ int line = 0;
+ success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
+ if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
}
}
-
#endif /* KMP_OS_LINUX */
#if KMP_GROUP_AFFINITY
-
- if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+ if (!success && (__kmp_num_proc_groups > 1)) {
+ success = __kmp_affinity_create_proc_group_map(&msg_id);
+ if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
}
-
- depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
- KMP_ASSERT(depth != 0);
}
-
#endif /* KMP_GROUP_AFFINITY */
- if (depth < 0) {
- if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
- if (file_name == NULL) {
- KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
- } else if (line == 0) {
- KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
- } else {
- KMP_INFORM(UsingFlatOSFileLine, file_name, line,
- __kmp_i18n_catgets(msg_id));
- }
- }
- // FIXME - print msg if msg_id = kmp_i18n_null ???
-
- file_name = "";
- depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
+ if (!success) {
+ success = __kmp_affinity_create_flat_map(&msg_id);
+ if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
}
- KMP_ASSERT(depth > 0);
- KMP_ASSERT(address2os != NULL);
+ KMP_ASSERT(success);
}
}
+// If the user has specified that a particular topology discovery method is to be
+// used, then we abort if that method fails. The exception is group affinity,
+// which might have been implicitly set.
#if KMP_USE_HWLOC
else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
- }
- depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
+ success = __kmp_affinity_create_hwloc_map(&msg_id);
+ if (!success) {
+ KMP_ASSERT(msg_id != kmp_i18n_null);
+ KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
}
#endif // KMP_USE_HWLOC
-// If the user has specified that a particular topology discovery method is to be
-// used, then we abort if that method fails. The exception is group affinity,
-// which might have been implicitly set.
-
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
- else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
- }
-
- depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
- if (depth < 0) {
+ else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
+ __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
+ success = __kmp_affinity_create_x2apicid_map(&msg_id);
+ if (!success) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
} else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
- }
-
- depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
- if (depth < 0) {
+ success = __kmp_affinity_create_apicid_map(&msg_id);
+ if (!success) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
}
-
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
- const char *filename;
- const char *env_var = nullptr;
- if (__kmp_cpuinfo_file != NULL) {
- filename = __kmp_cpuinfo_file;
- env_var = "KMP_CPUINFO_FILE";
- } else {
- filename = "/proc/cpuinfo";
- }
-
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
- }
-
- kmp_safe_raii_file_t f(filename, "r", env_var);
int line = 0;
- depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
- if (depth < 0) {
+ success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
+ if (!success) {
KMP_ASSERT(msg_id != kmp_i18n_null);
+ const char *filename = __kmp_cpuinfo_get_filename();
if (line > 0) {
KMP_FATAL(FileLineMsgExiting, filename, line,
__kmp_i18n_catgets(msg_id));
@@ -4329,84 +3622,80 @@ static void __kmp_aux_affinity_initialize(void) {
KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
}
}
- if (__kmp_affinity_type == affinity_none) {
- KMP_ASSERT(depth == 0);
- KMP_EXIT_AFF_NONE;
- }
}
#if KMP_GROUP_AFFINITY
-
else if (__kmp_affinity_top_method == affinity_top_method_group) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
- }
-
- depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
- KMP_ASSERT(depth != 0);
- if (depth < 0) {
+ success = __kmp_affinity_create_proc_group_map(&msg_id);
+ KMP_ASSERT(success);
+ if (!success) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
}
-
#endif /* KMP_GROUP_AFFINITY */
else if (__kmp_affinity_top_method == affinity_top_method_flat) {
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
- }
-
- depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
+ success = __kmp_affinity_create_flat_map(&msg_id);
// should not fail
- KMP_ASSERT(depth > 0);
- KMP_ASSERT(address2os != NULL);
+ KMP_ASSERT(success);
}
-#if KMP_USE_HIER_SCHED
- __kmp_dispatch_set_hierarchy_values();
-#endif
-
- if (address2os == NULL) {
+ // Early exit if topology could not be created
+ if (!__kmp_topology) {
if (KMP_AFFINITY_CAPABLE() &&
(__kmp_affinity_verbose ||
(__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
KMP_WARNING(ErrorInitializeAffinity);
}
+ if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
+ __kmp_ncores > 0) {
+ __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
+ __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
+ __kmp_nThreadsPerCore, __kmp_ncores);
+ if (__kmp_affinity_verbose) {
+ __kmp_topology->print("KMP_AFFINITY");
+ }
+ }
__kmp_affinity_type = affinity_none;
__kmp_create_affinity_none_places();
+#if KMP_USE_HIER_SCHED
+ __kmp_dispatch_set_hierarchy_values();
+#endif
KMP_AFFINITY_DISABLE();
return;
}
- if (__kmp_affinity_gran == affinity_gran_tile
-#if KMP_USE_HWLOC
- && __kmp_tile_depth == 0
+ // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
+ // initialize other data structures which depend on the topology
+ __kmp_topology->canonicalize();
+ if (__kmp_affinity_verbose)
+ __kmp_topology->print("KMP_AFFINITY");
+ bool filtered = __kmp_topology->filter_hw_subset();
+ if (filtered && __kmp_affinity_verbose)
+ __kmp_topology->print("KMP_HW_SUBSET");
+ machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
+ // If KMP_AFFINITY=none, then only create the single "none" place
+ // which is the process's initial affinity mask or the number of
+ // hardware threads depending on respect,norespect
+ if (__kmp_affinity_type == affinity_none) {
+ __kmp_create_affinity_none_places();
+#if KMP_USE_HIER_SCHED
+ __kmp_dispatch_set_hierarchy_values();
#endif
- ) {
- // tiles requested but not detected, warn user on this
- KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
+ return;
}
-
- __kmp_apply_thread_places(&address2os, depth);
+ int depth = __kmp_topology->get_depth();
// Create the table of masks, indexed by thread Id.
unsigned maxIndex;
unsigned numUnique;
- kmp_affin_mask_t *osId2Mask =
- __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
+ kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
if (__kmp_affinity_gran_levels == 0) {
KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
}
- // Set the childNums vector in all Address objects. This must be done before
- // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
- // account the setting of __kmp_affinity_compact.
- __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
-
switch (__kmp_affinity_type) {
case affinity_explicit:
@@ -4431,18 +3720,17 @@ static void __kmp_aux_affinity_initialize(void) {
}
break;
- // The other affinity types rely on sorting the Addresses according to some
- // permutation of the machine topology tree. Set __kmp_affinity_compact and
- // __kmp_affinity_offset appropriately, then jump to a common code fragment
- // to do the sort and create the array of affinity masks.
-
+ // The other affinity types rely on sorting the hardware threads according to
+ // some permutation of the machine topology tree. Set __kmp_affinity_compact
+ // and __kmp_affinity_offset appropriately, then jump to a common code
+ // fragment to do the sort and create the array of affinity masks.
case affinity_logical:
__kmp_affinity_compact = 0;
if (__kmp_affinity_offset) {
__kmp_affinity_offset =
__kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
}
- goto sortAddresses;
+ goto sortTopology;
case affinity_physical:
if (__kmp_nThreadsPerCore > 1) {
@@ -4457,7 +3745,7 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_offset =
__kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
}
- goto sortAddresses;
+ goto sortTopology;
case affinity_scatter:
if (__kmp_affinity_compact >= depth) {
@@ -4465,13 +3753,13 @@ static void __kmp_aux_affinity_initialize(void) {
} else {
__kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
}
- goto sortAddresses;
+ goto sortTopology;
case affinity_compact:
if (__kmp_affinity_compact >= depth) {
__kmp_affinity_compact = depth - 1;
}
- goto sortAddresses;
+ goto sortTopology;
case affinity_balanced:
if (depth <= 1) {
@@ -4481,16 +3769,16 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_type = affinity_none;
__kmp_create_affinity_none_places();
return;
- } else if (!__kmp_affinity_uniform_topology()) {
+ } else if (!__kmp_topology->is_uniform()) {
// Save the depth for further usage
__kmp_aff_depth = depth;
- int core_level = __kmp_affinity_find_core_level(
- address2os, __kmp_avail_proc, depth - 1);
- int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
- depth - 1, core_level);
+ int core_level =
+ __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
+ int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
+ core_level);
int maxprocpercore = __kmp_affinity_max_proc_per_core(
- address2os, __kmp_avail_proc, depth - 1, core_level);
+ __kmp_avail_proc, depth - 1, core_level);
int nproc = ncores * maxprocpercore;
if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
@@ -4509,9 +3797,8 @@ static void __kmp_aux_affinity_initialize(void) {
int lastcore = -1;
int inlastcore = 0;
for (int i = 0; i < __kmp_avail_proc; i++) {
- int proc = address2os[i].second;
- int core =
- __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
+ int proc = __kmp_topology->at(i).os_id;
+ int core = __kmp_affinity_find_core(i, depth - 1, core_level);
if (core == lastcore) {
inlastcore++;
@@ -4527,7 +3814,7 @@ static void __kmp_aux_affinity_initialize(void) {
__kmp_affinity_compact = depth - 1;
}
- sortAddresses:
+ sortTopology:
// Allocate the gtid->affinity mask table.
if (__kmp_affinity_dups) {
__kmp_affinity_num_masks = __kmp_avail_proc;
@@ -4543,18 +3830,19 @@ static void __kmp_aux_affinity_initialize(void) {
KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
- // Sort the address2os table according to the current setting of
+ // Sort the topology table according to the current setting of
// __kmp_affinity_compact, then fill out __kmp_affinity_masks.
- qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
- __kmp_affinity_cmp_Address_child_num);
+ __kmp_topology->sort_compact();
{
int i;
unsigned j;
- for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
- if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ for (i = 0, j = 0; i < num_hw_threads; i++) {
+ if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
continue;
}
- unsigned osId = address2os[i].second;
+ int osId = __kmp_topology->at(i).os_id;
+
kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
@@ -4565,6 +3853,8 @@ static void __kmp_aux_affinity_initialize(void) {
}
KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
}
+ // Sort the topology back using ids
+ __kmp_topology->sort_ids();
break;
default:
@@ -4572,9 +3862,7 @@ static void __kmp_aux_affinity_initialize(void) {
}
KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
- machine_hierarchy.init(address2os, __kmp_avail_proc);
}
-#undef KMP_EXIT_AFF_NONE
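With KMP_EXIT_AFF_NONE gone, the default discovery path above reduces to a chain of fallible probes over a single success flag; a condensed sketch of the resulting control flow (guards and verbose reporting elided, names as in this patch):

kmp_i18n_id_t msg_id = kmp_i18n_null;
bool success = false;
#if KMP_USE_HWLOC
if (!success)
  success = __kmp_affinity_create_hwloc_map(&msg_id);
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (!success)
  success = __kmp_affinity_create_x2apicid_map(&msg_id);
if (!success)
  success = __kmp_affinity_create_apicid_map(&msg_id);
#endif
#if KMP_OS_LINUX
if (!success) {
  int line = 0;
  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
}
#endif
// ... then the proc-group map where available, and finally the flat
// map, which is asserted to succeed.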
void __kmp_affinity_initialize(void) {
// Much of the code above was written assuming that if a machine was not
@@ -4614,10 +3902,6 @@ void __kmp_affinity_uninitialize(void) {
__kmp_free(__kmp_affinity_proclist);
__kmp_affinity_proclist = NULL;
}
- if (address2os != NULL) {
- __kmp_free(address2os);
- address2os = NULL;
- }
if (procarr != NULL) {
__kmp_free(procarr);
procarr = NULL;
@@ -4628,6 +3912,14 @@ void __kmp_affinity_uninitialize(void) {
__kmp_hwloc_topology = NULL;
}
#endif
+ if (__kmp_hw_subset) {
+ kmp_hw_subset_t::deallocate(__kmp_hw_subset);
+ __kmp_hw_subset = nullptr;
+ }
+ if (__kmp_topology) {
+ kmp_topology_t::deallocate(__kmp_topology);
+ __kmp_topology = nullptr;
+ }
KMPAffinity::destroy_api();
}
@@ -4652,7 +3944,8 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
if (KMP_AFFINITY_NON_PROC_BIND) {
if ((__kmp_affinity_type == affinity_none) ||
- (__kmp_affinity_type == affinity_balanced)) {
+ (__kmp_affinity_type == affinity_balanced) ||
+ KMP_HIDDEN_HELPER_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
@@ -4662,12 +3955,13 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
i = 0;
mask = __kmp_affin_fullMask;
} else {
+ int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+ i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
}
} else {
- if ((!isa_root) ||
+ if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
(__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
@@ -4679,15 +3973,16 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
mask = __kmp_affin_fullMask;
} else {
// int i = some hash function or just a counter that doesn't
- // always start at 0. Use gtid for now.
+ // always start at 0. Use adjusted gtid for now.
+ int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
- i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+ i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
}
}
th->th.th_current_place = i;
- if (isa_root) {
+ if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
th->th.th_new_place = i;
th->th.th_first_place = 0;
th->th.th_last_place = __kmp_affinity_num_masks - 1;
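The gtid-to-mask mapping now goes through __kmp_adjust_gtid_for_hidden_helpers so that hidden helper threads, which occupy their own gtid range, do not shift the place assignment of regular workers. A hedged sketch of the intent; the range arithmetic below is an illustrative assumption, not the helper's actual definition:

// Hypothetical: collapse gtids into a dense index over regular
// (non-hidden-helper) threads so place selection stays stable
// whether or not helper threads exist.
static int adjust_gtid_sketch(int gtid, int num_hidden_helpers) {
  if (gtid > num_hidden_helpers) // assumed: helpers occupy low gtids
    return gtid - num_hidden_helpers;
  return gtid;
}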
@@ -4708,7 +4003,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
KMP_CPU_COPY(th->th.th_affin_mask, mask);
- if (__kmp_affinity_verbose
+ if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
/* to avoid duplicate printing (will be correctly printed on barrier) */
&& (__kmp_affinity_type == affinity_none ||
(i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
@@ -4719,6 +4014,17 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
__kmp_gettid(), gtid, buf);
}
+#if KMP_DEBUG
+ // Hidden helper thread affinity only printed for debug builds
+ if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ th->th.th_affin_mask);
+ KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
+ (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
+ }
+#endif
+
#if KMP_OS_WINDOWS
// On Windows* OS, the process affinity mask might have changed. If the user
// didn't request affinity and this call fails, just continue silently.
@@ -4780,14 +4086,15 @@ int __kmp_aux_set_affinity(void **mask) {
}
gtid = __kmp_entry_gtid();
- KA_TRACE(1000, (""); {
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- (kmp_affin_mask_t *)(*mask));
- __kmp_debug_printf(
- "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
- buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ (kmp_affin_mask_t *)(*mask));
+ __kmp_debug_printf(
+ "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
+ gtid, buf);
+ });
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
@@ -4848,13 +4155,15 @@ int __kmp_aux_get_affinity(void **mask) {
th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
- KA_TRACE(1000, (""); {
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- th->th.th_affin_mask);
- __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
- gtid, buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ th->th.th_affin_mask);
+ __kmp_printf(
+ "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
+ buf);
+ });
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
@@ -4865,16 +4174,19 @@ int __kmp_aux_get_affinity(void **mask) {
#if !KMP_OS_WINDOWS
retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
- KA_TRACE(1000, (""); {
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- (kmp_affin_mask_t *)(*mask));
- __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
- gtid, buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ (kmp_affin_mask_t *)(*mask));
+ __kmp_printf(
+ "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
+ buf);
+ });
return retval;
#else
+ (void)retval;
KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
return 0;
@@ -4899,15 +4211,16 @@ int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
return -1;
}
- KA_TRACE(1000, (""); {
- int gtid = __kmp_entry_gtid();
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- (kmp_affin_mask_t *)(*mask));
- __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
- "affinity mask for thread %d = %s\n",
- proc, gtid, buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ int gtid = __kmp_entry_gtid();
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ (kmp_affin_mask_t *)(*mask));
+ __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
+ "affinity mask for thread %d = %s\n",
+ proc, gtid, buf);
+ });
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
@@ -4931,15 +4244,16 @@ int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
return -1;
}
- KA_TRACE(1000, (""); {
- int gtid = __kmp_entry_gtid();
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- (kmp_affin_mask_t *)(*mask));
- __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
- "affinity mask for thread %d = %s\n",
- proc, gtid, buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ int gtid = __kmp_entry_gtid();
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ (kmp_affin_mask_t *)(*mask));
+ __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
+ "affinity mask for thread %d = %s\n",
+ proc, gtid, buf);
+ });
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
@@ -4963,15 +4277,16 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return -1;
}
- KA_TRACE(1000, (""); {
- int gtid = __kmp_entry_gtid();
- char buf[KMP_AFFIN_MASK_PRINT_LEN];
- __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
- (kmp_affin_mask_t *)(*mask));
- __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
- "affinity mask for thread %d = %s\n",
- proc, gtid, buf);
- });
+ KA_TRACE(
+ 1000, (""); {
+ int gtid = __kmp_entry_gtid();
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+ (kmp_affin_mask_t *)(*mask));
+ __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
+ "affinity mask for thread %d = %s\n",
+ proc, gtid, buf);
+ });
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
@@ -4995,16 +4310,19 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
bool fine_gran = true;
int tid = th->th.th_info.ds.ds_tid;
+ // Do not perform balanced affinity for the hidden helper threads
+ if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
+ return;
+
switch (__kmp_affinity_gran) {
- case affinity_gran_fine:
- case affinity_gran_thread:
+ case KMP_HW_THREAD:
break;
- case affinity_gran_core:
+ case KMP_HW_CORE:
if (__kmp_nThreadsPerCore > 1) {
fine_gran = false;
}
break;
- case affinity_gran_package:
+ case KMP_HW_SOCKET:
if (nCoresPerPkg > 1) {
fine_gran = false;
}
@@ -5013,7 +4331,7 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
fine_gran = false;
}
- if (__kmp_affinity_uniform_topology()) {
+ if (__kmp_topology->is_uniform()) {
int coreID;
int threadID;
// Number of hyper threads per core in HT machine
@@ -5037,7 +4355,6 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
coreID = (tid - big_cores) / chunk;
threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
}
-
KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
@@ -5045,12 +4362,13 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_CPU_ZERO(mask);
if (fine_gran) {
- int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
+ int osID =
+ __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
KMP_CPU_SET(osID, mask);
} else {
for (int i = 0; i < __kmp_nth_per_core; i++) {
int osID;
- osID = address2os[coreID * __kmp_nth_per_core + i].second;
+ osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
KMP_CPU_SET(osID, mask);
}
}
@@ -5066,26 +4384,26 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
kmp_affin_mask_t *mask = th->th.th_affin_mask;
KMP_CPU_ZERO(mask);
- int core_level = __kmp_affinity_find_core_level(
- address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
- int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
+ int core_level =
+ __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
+ int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
__kmp_aff_depth - 1, core_level);
int nth_per_core = __kmp_affinity_max_proc_per_core(
- address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
+ __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
// For performance gain consider the special case nthreads ==
// __kmp_avail_proc
if (nthreads == __kmp_avail_proc) {
if (fine_gran) {
- int osID = address2os[tid].second;
+ int osID = __kmp_topology->at(tid).os_id;
KMP_CPU_SET(osID, mask);
} else {
- int core = __kmp_affinity_find_core(address2os, tid,
- __kmp_aff_depth - 1, core_level);
+ int core =
+ __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
for (int i = 0; i < __kmp_avail_proc; i++) {
- int osID = address2os[i].second;
- if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
- core_level) == core) {
+ int osID = __kmp_topology->at(i).os_id;
+ if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
+ core) {
KMP_CPU_SET(osID, mask);
}
}
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 013080bbc9ea..8e72922d2c6e 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -337,8 +337,8 @@ class KMPNativeAffinity : public KMPAffinity {
long retval =
syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
- int r =
- pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
+ int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
+ reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
#endif
if (retval >= 0) {
@@ -357,8 +357,8 @@ class KMPNativeAffinity : public KMPAffinity {
long retval =
syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
- int r =
- pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
+ int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
+ reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
#endif
if (retval >= 0) {
@@ -598,91 +598,274 @@ class KMPNativeAffinity : public KMPAffinity {
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
-class Address {
+class kmp_hw_thread_t {
public:
- static const unsigned maxDepth = 32;
- unsigned labels[maxDepth];
- unsigned childNums[maxDepth];
- unsigned depth;
- unsigned leader;
- Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
- Address &operator=(const Address &b) {
- depth = b.depth;
- for (unsigned i = 0; i < depth; i++) {
- labels[i] = b.labels[i];
- childNums[i] = b.childNums[i];
- }
- leader = FALSE;
- return *this;
- }
- bool operator==(const Address &b) const {
- if (depth != b.depth)
- return false;
- for (unsigned i = 0; i < depth; i++)
- if (labels[i] != b.labels[i])
- return false;
- return true;
- }
- bool isClose(const Address &b, int level) const {
- if (depth != b.depth)
- return false;
- if ((unsigned)level >= depth)
- return true;
- for (unsigned i = 0; i < (depth - level); i++)
- if (labels[i] != b.labels[i])
- return false;
- return true;
- }
- bool operator!=(const Address &b) const { return !operator==(b); }
- void print() const {
- unsigned i;
- printf("Depth: %u --- ", depth);
- for (i = 0; i < depth; i++) {
- printf("%u ", labels[i]);
- }
+ static const int UNKNOWN_ID = -1;
+ static int compare_ids(const void *a, const void *b);
+ static int compare_compact(const void *a, const void *b);
+ int ids[KMP_HW_LAST];
+ int sub_ids[KMP_HW_LAST];
+ bool leader;
+ int os_id;
+ void print() const;
+ void clear() {
+ for (int i = 0; i < (int)KMP_HW_LAST; ++i)
+ ids[i] = UNKNOWN_ID;
+ leader = false;
}
};
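For orientation, a hypothetical machine with one socket, two cores, and two threads per core (depth 3, types = {SOCKET, CORE, THREAD}) would populate the replacement structure along these lines, where ids holds the labels reported by the discovery method and sub_ids the canonical zero-based indices assigned later by _set_sub_ids():

// os_id   ids{socket,core,thread}   sub_ids{socket,core,thread}
//   0            {0, 0, 0}                  {0, 0, 0}
//   1            {0, 0, 1}                  {0, 0, 1}
//   2            {0, 1, 0}                  {0, 1, 0}
//   3            {0, 1, 1}                  {0, 1, 1}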
-class AddrUnsPair {
+class kmp_topology_t {
+
+ struct flags_t {
+ int uniform : 1;
+ int reserved : 31;
+ };
+
+ int depth;
+
+ // The following arrays are all 'depth' long
+
+ // Ordered array of the types in the topology
+ kmp_hw_t *types;
+
+ // Keep quick topology ratios; for non-uniform topologies,
+ // this ratio holds the max number of itemAs per itemB
+ // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
+ int *ratio;
+
+ // Storage containing the absolute number of each topology layer
+ int *count;
+
+ // The hardware threads array
+ // hw_threads is num_hw_threads long
+ // Each hw_thread's ids and sub_ids are depth deep
+ int num_hw_threads;
+ kmp_hw_thread_t *hw_threads;
+
+ // Equivalence hash where the key is the hardware topology item
+ // and the value is the equivalent hardware topology type in the
+ // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
+ // known equivalence for the topology type
+ kmp_hw_t equivalent[KMP_HW_LAST];
+
+ // Flags describing the topology
+ flags_t flags;
+
+ // Count each item & get the num x's per y
+ // e.g., get the number of cores and the number of threads per core
+ // for each (x, y) in (KMP_HW_* , KMP_HW_*)
+ void _gather_enumeration_information();
+
+ // Remove layers that don't add information to the topology.
+ // This is done by having the layer take on the id = UNKNOWN_ID (-1)
+ void _remove_radix1_layers();
+
+ // Find out if the topology is uniform
+ void _discover_uniformity();
+
+ // Set all the sub_ids for each hardware thread
+ void _set_sub_ids();
+
+ // Set global affinity variables describing the number of threads per
+ // core, the number of packages, the number of cores per package, and
+ // the number of cores.
+ void _set_globals();
+
+ // Set the last level cache equivalent type
+ void _set_last_level_cache();
+
public:
- Address first;
- unsigned second;
- AddrUnsPair(Address _first, unsigned _second)
- : first(_first), second(_second) {}
- AddrUnsPair &operator=(const AddrUnsPair &b) {
- first = b.first;
- second = b.second;
- return *this;
- }
- void print() const {
- printf("first = ");
- first.print();
- printf(" --- second = %u", second);
- }
- bool operator==(const AddrUnsPair &b) const {
- if (first != b.first)
- return false;
- if (second != b.second)
- return false;
- return true;
- }
- bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
-};
+ // Force use of allocate()/deallocate()
+ kmp_topology_t() = delete;
+ kmp_topology_t(const kmp_topology_t &t) = delete;
+ kmp_topology_t(kmp_topology_t &&t) = delete;
+ kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
+ kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
+
+ static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
+ static void deallocate(kmp_topology_t *);
+
+ // Functions used in create_map() routines
+ kmp_hw_thread_t &at(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
+ return hw_threads[index];
+ }
+ const kmp_hw_thread_t &at(int index) const {
+ KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
+ return hw_threads[index];
+ }
+ int get_num_hw_threads() const { return num_hw_threads; }
+ void sort_ids() {
+ qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
+ kmp_hw_thread_t::compare_ids);
+ }
+ // Check if the hardware ids are unique. Return true if they are,
+ // false otherwise
+ bool check_ids() const;
-static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
- const Address *aa = &(((const AddrUnsPair *)a)->first);
- const Address *bb = &(((const AddrUnsPair *)b)->first);
- unsigned depth = aa->depth;
- unsigned i;
- KMP_DEBUG_ASSERT(depth == bb->depth);
- for (i = 0; i < depth; i++) {
- if (aa->labels[i] < bb->labels[i])
+ // Function to call after the create_map() routine
+ void canonicalize();
+ void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
+
+ // Functions used after canonicalize() is called
+ bool filter_hw_subset();
+ bool is_close(int hwt1, int hwt2, int level) const;
+ bool is_uniform() const { return flags.uniform; }
+ // Return the topology's equivalent type for the given type;
+ // returns KMP_HW_UNKNOWN when there is no equivalent type
+ kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
+ // Set type1 = type2
+ void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
+ kmp_hw_t real_type2 = equivalent[type2];
+ if (real_type2 == KMP_HW_UNKNOWN)
+ real_type2 = type2;
+ equivalent[type1] = real_type2;
+ // This loop is required since any of the types may have been set to
+ // be equivalent to type1. They all must be checked and reset to type2.
+ KMP_FOREACH_HW_TYPE(type) {
+ if (equivalent[type] == type1) {
+ equivalent[type] = real_type2;
+ }
+ }
+ }
+ // Calculate the number of items at level1 per item at level2
+ // (e.g., the number of threads per core)
+ int calculate_ratio(int level1, int level2) const {
+ KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
+ KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
+ int r = 1;
+ for (int level = level1; level > level2; --level)
+ r *= ratio[level];
+ return r;
+ }
+ int get_ratio(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return ratio[level];
+ }
+ int get_depth() const { return depth; }
+ kmp_hw_t get_type(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return types[level];
+ }
+ int get_level(kmp_hw_t type) const {
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
+ int eq_type = equivalent[type];
+ if (eq_type == KMP_HW_UNKNOWN)
return -1;
- if (aa->labels[i] > bb->labels[i])
- return 1;
+ for (int i = 0; i < depth; ++i)
+ if (types[i] == eq_type)
+ return i;
+ return -1;
+ }
+ int get_count(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return count[level];
+ }
+#if KMP_AFFINITY_SUPPORTED
+ void sort_compact() {
+ qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
+ kmp_hw_thread_t::compare_compact);
+ }
+#endif
+ void print(const char *env_var = "KMP_AFFINITY") const;
+ void dump() const;
+};
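
The ratio array composes multiplicatively: the number of items at a coarse level per item at a fine level is the product of the per-level ratios in between, which is exactly what calculate_ratio() loops over. A minimal standalone C++ sketch with made-up example values (not the runtime's real data):

#include <cstdio>

int main() {
  // Hypothetical machine: 4 packages, 6 cores per package, 2 threads per core.
  // ratio[level] holds "items at this level per item one level up", mirroring
  // the kmp_topology_t::ratio array described above.
  int ratio[] = {4, 6, 2}; // levels: package, core, thread
  int threads_per_package = 1;
  // Same loop shape as calculate_ratio(level1 = 2, level2 = 0).
  for (int level = 2; level > 0; --level)
    threads_per_package *= ratio[level];
  std::printf("threads per package: %d\n", threads_per_package); // prints 12
  return 0;
}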
+
+class kmp_hw_subset_t {
+public:
+ struct item_t {
+ int num;
+ kmp_hw_t type;
+ int offset;
+ };
+
+private:
+ int depth;
+ int capacity;
+ item_t *items;
+ kmp_uint64 set;
+ bool absolute;
+ // The set must be able to handle up to KMP_HW_LAST layers
+ KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
+
+public:
+ // Force use of allocate()/deallocate()
+ kmp_hw_subset_t() = delete;
+ kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
+ kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
+ kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
+ kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
+
+ static kmp_hw_subset_t *allocate() {
+ int initial_capacity = 5;
+ kmp_hw_subset_t *retval =
+ (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
+ retval->depth = 0;
+ retval->capacity = initial_capacity;
+ retval->set = 0ull;
+ retval->absolute = false;
+ retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
+ return retval;
+ }
+ static void deallocate(kmp_hw_subset_t *subset) {
+ __kmp_free(subset->items);
+ __kmp_free(subset);
+ }
+ void set_absolute() { absolute = true; }
+ bool is_absolute() const { return absolute; }
+ void push_back(int num, kmp_hw_t type, int offset) {
+ if (depth == capacity - 1) {
+ capacity *= 2;
+ item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
+ for (int i = 0; i < depth; ++i)
+ new_items[i] = items[i];
+ __kmp_free(items);
+ items = new_items;
+ }
+ items[depth].num = num;
+ items[depth].type = type;
+ items[depth].offset = offset;
+ depth++;
+ set |= (1ull << type);
+ }
+ int get_depth() const { return depth; }
+ const item_t &at(int index) const {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ return items[index];
+ }
+ item_t &at(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ return items[index];
}
- return 0;
-}
+ void remove(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ set &= ~(1ull << items[index].type);
+ for (int j = index + 1; j < depth; ++j) {
+ items[j - 1] = items[j];
+ }
+ depth--;
+ }
+ bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
+ void dump() const {
+ printf("**********************\n");
+ printf("*** kmp_hw_subset: ***\n");
+ printf("* depth: %d\n", depth);
+ printf("* items:\n");
+ for (int i = 0; i < depth; ++i) {
+ printf("num: %d, type: %s, offset: %d\n", items[i].num,
+ __kmp_hw_get_keyword(items[i].type), items[i].offset);
+ }
+ printf("* set: 0x%llx\n", set);
+ printf("* absolute: %d\n", absolute);
+ printf("**********************\n");
+ }
+};
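
The set field is a plain 64-bit membership mask over layer types: push_back() sets bit `type`, remove() clears it, and specified() tests it; the KMP_BUILD_ASSERT guarantees the mask is wide enough. A self-contained sketch of the same trick (hypothetical type values, illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t set = 0; // one bit per layer type, as in kmp_hw_subset_t::set
  const int socket_type = 1, core_type = 3; // hypothetical enumerator values
  set |= (1ull << socket_type); // push_back() records the layer
  set |= (1ull << core_type);
  bool core_specified = (set & (1ull << core_type)) != 0; // specified()
  set &= ~(1ull << socket_type); // remove() clears the layer's bit
  std::printf("core specified: %d, set: 0x%llx\n", (int)core_specified,
              (unsigned long long)set);
  return 0;
}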
+
+extern kmp_topology_t *__kmp_topology;
+extern kmp_hw_subset_t *__kmp_hw_subset;
/* A structure for holding machine-specific hierarchy info to be computed once
at init. This structure represents a mapping of threads to the actual machine
@@ -721,18 +904,10 @@ public:
kmp_uint32 *numPerLevel;
kmp_uint32 *skipPerLevel;
- void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
- int hier_depth = adr2os[0].first.depth;
- int level = 0;
- for (int i = hier_depth - 1; i >= 0; --i) {
- int max = -1;
- for (int j = 0; j < num_addrs; ++j) {
- int next = adr2os[j].first.childNums[i];
- if (next > max)
- max = next;
- }
- numPerLevel[level] = max + 1;
- ++level;
+ void deriveLevels() {
+ int hier_depth = __kmp_topology->get_depth();
+ for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
+ numPerLevel[level] = __kmp_topology->get_ratio(i);
}
}
@@ -747,7 +922,7 @@ public:
}
}
- void init(AddrUnsPair *adr2os, int num_addrs) {
+ void init(int num_addrs) {
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
&uninitialized, not_initialized, initializing);
if (bool_result == 0) { // Wait for initialization
@@ -774,10 +949,8 @@ public:
}
// Sort table by physical ID
- if (adr2os) {
- qsort(adr2os, num_addrs, sizeof(*adr2os),
- __kmp_affinity_cmp_Address_labels);
- deriveLevels(adr2os, num_addrs);
+ if (__kmp_topology && __kmp_topology->get_depth() > 0) {
+ deriveLevels();
} else {
numPerLevel[0] = maxLeaves;
numPerLevel[1] = num_addrs / maxLeaves;
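
deriveLevels() now reads the per-level ratios straight from the global topology, reversing the index order so that hierarchy level 0 is the machine's finest layer. A short sketch of that reversal with made-up ratios (not the runtime's data structures):

#include <cstdio>

int main() {
  const int depth = 3;
  int topo_ratio[depth] = {4, 6, 2}; // coarse-to-fine, as kmp_topology_t stores it
  int numPerLevel[depth] = {0};
  // Same traversal as deriveLevels(): walk the topology fine-to-coarse.
  for (int i = depth - 1, level = 0; i >= 0; --i, ++level)
    numPerLevel[level] = topo_ratio[i];
  for (int level = 0; level < depth; ++level)
    std::printf("numPerLevel[%d] = %d\n", level, numPerLevel[level]); // 2, 6, 4
  return 0;
}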
diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp
index 31981d5c1d55..857855cf12d6 100644
--- a/openmp/runtime/src/kmp_alloc.cpp
+++ b/openmp/runtime/src/kmp_alloc.cpp
@@ -315,7 +315,7 @@ static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
,
kmp_int32 rel_gtid
#endif
- ) {
+) {
bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
@@ -738,7 +738,7 @@ static void brel(kmp_info_t *th, void *buf) {
,
__kmp_gtid_from_thread(th)
#endif
- );
+ );
return;
}
@@ -1242,6 +1242,20 @@ static void **mk_hbw_preferred_hugetlb;
static void **mk_dax_kmem;
static void **mk_dax_kmem_all;
static void **mk_dax_kmem_preferred;
+// Preview of target memory support
+static void *(*kmp_target_alloc_host)(size_t size, int device);
+static void *(*kmp_target_alloc_shared)(size_t size, int device);
+static void *(*kmp_target_alloc_device)(size_t size, int device);
+static void *(*kmp_target_free)(void *ptr, int device);
+static bool __kmp_target_mem_available;
+#define KMP_IS_TARGET_MEM_SPACE(MS) \
+ (MS == llvm_omp_target_host_mem_space || \
+ MS == llvm_omp_target_shared_mem_space || \
+ MS == llvm_omp_target_device_mem_space)
+#define KMP_IS_TARGET_MEM_ALLOC(MA) \
+ (MA == llvm_omp_target_host_mem_alloc || \
+ MA == llvm_omp_target_shared_mem_alloc || \
+ MA == llvm_omp_target_device_mem_alloc)
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
static inline void chk_kind(void ***pkind) {
@@ -1338,6 +1352,18 @@ void __kmp_fini_memkind() {
mk_dax_kmem_preferred = NULL;
#endif
}
+// Preview of target memory support
+void __kmp_init_target_mem() {
+ *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
+ *(void **)(&kmp_target_alloc_shared) =
+ KMP_DLSYM("llvm_omp_target_alloc_shared");
+ *(void **)(&kmp_target_alloc_device) =
+ KMP_DLSYM("llvm_omp_target_alloc_device");
+ *(void **)(&kmp_target_free) = KMP_DLSYM("omp_target_free");
+ __kmp_target_mem_available = kmp_target_alloc_host &&
+ kmp_target_alloc_shared &&
+ kmp_target_alloc_device && kmp_target_free;
+}
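
The lookup is deliberately lazy: libomp binds the offload entry points by name at runtime, and target memory support is simply "all four symbols resolved". A hedged POSIX sketch of the same pattern with raw dlopen/dlsym (KMP_DLSYM is the runtime's own wrapper; error handling here is minimal and the function-pointer cast is the usual conditionally-supported one):

#include <dlfcn.h> // link with -ldl on glibc
#include <cstddef>

typedef void *(*target_alloc_fn)(size_t size, int device);

int main() {
  // Search the symbols already loaded into this process, as the runtime does.
  void *self = dlopen(nullptr, RTLD_LAZY);
  target_alloc_fn alloc_host =
      self ? (target_alloc_fn)dlsym(self, "llvm_omp_target_alloc_host")
           : nullptr;
  bool target_mem_available = alloc_host != nullptr; // and the other 3 symbols
  return target_mem_available ? 0 : 1;
}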
omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
int ntraits,
@@ -1345,7 +1371,7 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
// OpenMP 5.0 only allows predefined memspaces
KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
- ms == omp_high_bw_mem_space);
+ ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms));
kmp_allocator_t *al;
int i;
al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
@@ -1423,6 +1449,9 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
al->memkind = mk_default;
}
}
+ } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {
+ __kmp_free(al);
+ return omp_null_allocator;
} else {
if (ms == omp_high_bw_mem_space) {
// cannot detect HBW memory presence without memkind library
@@ -1543,6 +1572,22 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
}
}
} else if (allocator < kmp_max_mem_alloc) {
+ if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
+ // Use size input directly as the memory may not be accessible on host.
+ // Use default device for now.
+ if (__kmp_target_mem_available) {
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ if (allocator == llvm_omp_target_host_mem_alloc)
+ ptr = kmp_target_alloc_host(size, device);
+ else if (allocator == llvm_omp_target_shared_mem_alloc)
+ ptr = kmp_target_alloc_shared(size, device);
+ else // allocator == llvm_omp_target_device_mem_alloc
+ ptr = kmp_target_alloc_device(size, device);
+ }
+ return ptr;
+ }
+
// pre-defined allocator
if (allocator == omp_high_bw_mem_alloc) {
// ptr = NULL;
@@ -1551,6 +1596,18 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
} else {
ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
}
+ } else if (KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
+ if (__kmp_target_mem_available) {
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ if (al->memspace == llvm_omp_target_host_mem_space)
+ ptr = kmp_target_alloc_host(size, device);
+ else if (al->memspace == llvm_omp_target_shared_mem_space)
+ ptr = kmp_target_alloc_shared(size, device);
+ else // al->memspace == llvm_omp_target_device_mem_space
+ ptr = kmp_target_alloc_device(size, device);
+ }
+ return ptr;
} else if (al->pool_size > 0) {
// custom allocator with pool size requested
kmp_uint64 used =
@@ -1685,6 +1742,15 @@ void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) {
kmp_mem_desc_t desc;
kmp_uintptr_t addr_align; // address to return to caller
kmp_uintptr_t addr_descr; // address of memory block descriptor
+ if (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
+ (allocator > kmp_max_mem_alloc &&
+ KMP_IS_TARGET_MEM_SPACE(al->memspace))) {
+ KMP_DEBUG_ASSERT(kmp_target_free);
+ kmp_int32 device =
+ __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
+ kmp_target_free(ptr, device);
+ return;
+ }
addr_align = (kmp_uintptr_t)ptr;
addr_descr = addr_align - sizeof(kmp_mem_desc_t);
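
The host free path works because __kmpc_alloc stores a small descriptor immediately below the aligned pointer it hands out; target allocations never get that header, which is why they are intercepted and routed to kmp_target_free first. A toy sketch of the pointer arithmetic under a simplified descriptor (not the runtime's kmp_mem_desc_t layout):

#include <cstdlib>
#include <cstring>

struct mem_desc { void *ptr_alloc; std::size_t size_a; }; // simplified

int main() {
  std::size_t size = 64;
  // Reserve room for the descriptor plus the payload; the caller gets the
  // address just past the descriptor.
  void *raw = std::malloc(sizeof(mem_desc) + size);
  mem_desc d = {raw, size};
  std::memcpy(raw, &d, sizeof(d));
  void *user = (char *)raw + sizeof(mem_desc);
  // Free path: step back sizeof(descriptor) bytes, then release the block --
  // the same "addr_align - sizeof(kmp_mem_desc_t)" computation shown above.
  mem_desc back;
  std::memcpy(&back, (char *)user - sizeof(mem_desc), sizeof(back));
  std::free(back.ptr_alloc);
  return 0;
}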
@@ -1945,10 +2011,9 @@ void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
if (ptr != NULL) {
// pop the head of no-sync free list
this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
- KMP_DEBUG_ASSERT(
- this_thr ==
- ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
- ->ptr_aligned);
+ KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
+ sizeof(kmp_mem_descr_t)))
+ ->ptr_aligned);
goto end;
}
ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
@@ -1964,10 +2029,9 @@ void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
// push the rest of chain into no-sync free list (can be NULL if there was
// the only block)
this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
- KMP_DEBUG_ASSERT(
- this_thr ==
- ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
- ->ptr_aligned);
+ KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
+ sizeof(kmp_mem_descr_t)))
+ ->ptr_aligned);
goto end;
}
diff --git a/openmp/runtime/src/kmp_atomic.cpp b/openmp/runtime/src/kmp_atomic.cpp
index a9d5257ab2aa..fcc06216a4fa 100644
--- a/openmp/runtime/src/kmp_atomic.cpp
+++ b/openmp/runtime/src/kmp_atomic.cpp
@@ -779,7 +779,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
#if KMP_MIC
#define KMP_DO_PAUSE _mm_delay_32(1)
#else
-#define KMP_DO_PAUSE KMP_CPU_PAUSE()
+#define KMP_DO_PAUSE
#endif /* KMP_MIC */
// ------------------------------------------------------------------------
@@ -832,6 +832,39 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
// end of the first part of the workaround for C78287
#endif // USE_CMPXCHG_FIX
+#if KMP_OS_WINDOWS && KMP_ARCH_AARCH64
+// Undo explicit type casts to get MSVC ARM64 to build. Uses
+// OP_CMPXCHG_WORKAROUND definition for OP_CMPXCHG
+#undef OP_CMPXCHG
+#define OP_CMPXCHG(TYPE, BITS, OP) \
+ { \
+ struct _sss { \
+ TYPE cmp; \
+ kmp_int##BITS *vvv; \
+ }; \
+ struct _sss old_value, new_value; \
+ old_value.vvv = (kmp_int##BITS *)&old_value.cmp; \
+ new_value.vvv = (kmp_int##BITS *)&new_value.cmp; \
+ *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \
+ new_value.cmp = old_value.cmp OP rhs; \
+ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
+ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv, \
+ *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) { \
+ KMP_DO_PAUSE; \
+ \
+ *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \
+ new_value.cmp = old_value.cmp OP rhs; \
+ } \
+ }
+
+#undef OP_UPDATE_CRITICAL
+#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \
+ __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \
+ (*lhs) = (*lhs)OP rhs; \
+ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+#endif // KMP_OS_WINDOWS && KMP_ARCH_AARCH64
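
OP_CMPXCHG is the standard read-modify-CAS retry loop; the MSVC ARM64 variant above only reshuffles the casts, not the algorithm. An equivalent sketch with std::atomic, fixing OP to + for concreteness:

#include <atomic>

void atomic_add(std::atomic<int> &lhs, int rhs) {
  int old_value = lhs.load(std::memory_order_relaxed);
  int new_value = old_value + rhs;
  // Retry until no other thread modified lhs between the load and the CAS;
  // on failure, compare_exchange_weak reloads old_value for us.
  while (!lhs.compare_exchange_weak(old_value, new_value))
    new_value = old_value + rhs;
}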
+
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// ------------------------------------------------------------------------
@@ -1132,7 +1165,6 @@ ATOMIC_CMPX_L(fixed8, orl, kmp_int64, 64, ||, 8i, 7,
(kmp_int##BITS *)lhs, \
*VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & rhs)) { \
- KMP_CPU_PAUSE(); \
temp_val = *lhs; \
old_value = temp_val; \
} \
@@ -2087,8 +2119,6 @@ ATOMIC_CRITICAL_READ(cmplx16, a16_rd, kmp_cmplx128_a16_t, +, 32c,
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
- KMP_CPU_PAUSE(); \
- \
temp_val = *lhs; \
old_value = temp_val; \
new_value = rhs; \
@@ -2237,8 +2267,6 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c,
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
- KMP_CPU_PAUSE(); \
- \
temp_val = *lhs; \
old_value = temp_val; \
new_value = (TYPE)(old_value OP rhs); \
@@ -2253,6 +2281,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c,
#define ATOMIC_CMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE new_value; \
+ (void)new_value; \
OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \
OP_CMPXCHG_CPT(TYPE, BITS, OP) \
}
@@ -2261,6 +2290,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c,
#define ATOMIC_FIXED_ADD_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE old_value, new_value; \
+ (void)new_value; \
OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \
/* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \
old_value = KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \
@@ -2561,6 +2591,7 @@ ATOMIC_CRITICAL_CPT_MIX(float10, long double, div_cpt, /, fp, _Quad, 10r,
#define ATOMIC_CMPX_L_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE new_value; \
+ (void)new_value; \
OP_GOMP_CRITICAL_L_CPT(= *lhs OP, GOMP_FLAG) \
OP_CMPXCHG_CPT(TYPE, BITS, OP) \
}
@@ -2630,7 +2661,6 @@ ATOMIC_CMPX_L_CPT(fixed8, orl_cpt, kmp_int64, 64, ||,
(kmp_int##BITS *)lhs, \
*VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & rhs)) { \
- KMP_CPU_PAUSE(); \
temp_val = *lhs; \
old_value = temp_val; \
} \
@@ -2655,6 +2685,7 @@ ATOMIC_CMPX_L_CPT(fixed8, orl_cpt, kmp_int64, 64, ||,
#define MIN_MAX_COMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE new_value, old_value; \
+ (void)new_value; \
if (*lhs OP rhs) { \
GOMP_MIN_MAX_CRITSECT_CPT(OP, GOMP_FLAG) \
MIN_MAX_CMPXCHG_CPT(TYPE, BITS, OP) \
@@ -2713,6 +2744,7 @@ MIN_MAX_CRITICAL_CPT(float16, min_a16_cpt, Quad_a16_t, >, 16r,
#define ATOMIC_CMPX_EQV_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE new_value; \
+ (void)new_value; \
OP_GOMP_CRITICAL_EQV_CPT(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \
OP_CMPXCHG_CPT(TYPE, BITS, OP) \
}
@@ -2925,8 +2957,6 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c,
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
- KMP_CPU_PAUSE(); \
- \
temp_val = *lhs; \
old_value = temp_val; \
new_value = (TYPE)(rhs OP old_value); \
@@ -2941,6 +2971,7 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c,
#define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \
ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \
TYPE new_value; \
+ (void)new_value; \
OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \
OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \
}
@@ -3248,8 +3279,6 @@ ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, div_cpt_rev, /, fp, _Quad,
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
- KMP_CPU_PAUSE(); \
- \
temp_val = *lhs; \
old_value = temp_val; \
new_value = rhs; \
@@ -3261,6 +3290,7 @@ ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, div_cpt_rev, /, fp, _Quad,
#define ATOMIC_CMPXCHG_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG) \
ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \
TYPE old_value; \
+ (void)old_value; \
GOMP_CRITICAL_SWP(GOMP_FLAG) \
CMPXCHG_SWP(TYPE, BITS) \
}
@@ -3374,7 +3404,7 @@ void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#else
TRUE
#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
- ) {
+ ) {
kmp_int8 old_value, new_value;
old_value = *(kmp_int8 *)lhs;
@@ -3391,7 +3421,7 @@ void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs,
return;
} else {
-// All 1-byte data is of integer data type.
+ // All 1-byte data is of integer data type.
#ifdef KMP_GOMP_COMPAT
if (__kmp_atomic_mode == 2) {
@@ -3421,7 +3451,7 @@ void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#else
!((kmp_uintptr_t)lhs & 0x1) /* make sure address is 2-byte aligned */
#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
- ) {
+ ) {
kmp_int16 old_value, new_value;
old_value = *(kmp_int16 *)lhs;
@@ -3438,7 +3468,7 @@ void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs,
return;
} else {
-// All 2-byte data is of integer data type.
+ // All 2-byte data is of integer data type.
#ifdef KMP_GOMP_COMPAT
if (__kmp_atomic_mode == 2) {
@@ -3470,7 +3500,7 @@ void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#else
!((kmp_uintptr_t)lhs & 0x3) /* make sure address is 4-byte aligned */
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
- ) {
+ ) {
kmp_int32 old_value, new_value;
old_value = *(kmp_int32 *)lhs;
@@ -3487,8 +3517,8 @@ void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs,
return;
} else {
-// Use __kmp_atomic_lock_4i for all 4-byte data,
-// even if it isn't of integer data type.
+ // Use __kmp_atomic_lock_4i for all 4-byte data,
+ // even if it isn't of integer data type.
#ifdef KMP_GOMP_COMPAT
if (__kmp_atomic_mode == 2) {
@@ -3520,7 +3550,7 @@ void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs,
#else
!((kmp_uintptr_t)lhs & 0x7) /* make sure address is 8-byte aligned */
#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
- ) {
+ ) {
kmp_int64 old_value, new_value;
old_value = *(kmp_int64 *)lhs;
@@ -3536,8 +3566,8 @@ void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs,
return;
} else {
-// Use __kmp_atomic_lock_8i for all 8-byte data,
-// even if it isn't of integer data type.
+ // Use __kmp_atomic_lock_8i for all 8-byte data,
+ // even if it isn't of integer data type.
#ifdef KMP_GOMP_COMPAT
if (__kmp_atomic_mode == 2) {
diff --git a/openmp/runtime/src/kmp_atomic.h b/openmp/runtime/src/kmp_atomic.h
index 8f70928ca985..6a0827aaf1ea 100644
--- a/openmp/runtime/src/kmp_atomic.h
+++ b/openmp/runtime/src/kmp_atomic.h
@@ -363,8 +363,8 @@ static inline void __kmp_acquire_atomic_lock(kmp_atomic_lock_t *lck,
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_mutex_acquire) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
- ompt_mutex_atomic, 0, kmp_mutex_impl_queuing, (ompt_wait_id_t)(uintptr_t)lck,
- OMPT_GET_RETURN_ADDRESS(0));
+ ompt_mutex_atomic, 0, kmp_mutex_impl_queuing,
+ (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0));
}
#endif
@@ -373,7 +373,8 @@ static inline void __kmp_acquire_atomic_lock(kmp_atomic_lock_t *lck,
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_mutex_acquired) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
- ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0));
+ ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck,
+ OMPT_GET_RETURN_ADDRESS(0));
}
#endif
}
@@ -389,7 +390,8 @@ static inline void __kmp_release_atomic_lock(kmp_atomic_lock_t *lck,
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_mutex_released) {
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
- ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0));
+ ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck,
+ OMPT_GET_RETURN_ADDRESS(0));
}
#endif
}
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index 4da2d0bd5220..93112156a1ef 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -22,8 +22,6 @@
#define USE_NGO_STORES 1
#endif // KMP_MIC
-#include "tsan_annotations.h"
-
#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
@@ -73,11 +71,10 @@ static bool __kmp_linear_barrier_gather_template(
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
- // Mark arrival to master thread
+ // Mark arrival to primary thread
/* After performing this write, a worker thread may not assume that the team
- is valid any more - it could be deallocated by the master thread at any
+ is valid any more - it could be deallocated by the primary thread at any
time. */
- ANNOTATE_BARRIER_BEGIN(this_thr);
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
flag.release();
} else {
@@ -111,7 +108,6 @@ static bool __kmp_linear_barrier_gather_template(
new_state);
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
}
- ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
// Barrier imbalance - write min of the thread time and the other thread
// time to the thread.
@@ -125,14 +121,11 @@ static bool __kmp_linear_barrier_gather_template(
("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
team->t.t_id, i));
- ANNOTATE_REDUCE_AFTER(reduce);
OMPT_REDUCTION_DECL(this_thr, gtid);
OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[i]->th.th_local.reduce_data);
OMPT_REDUCTION_END;
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
}
// Don't have to worry about sleep bit here or atomic since team setting
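
The gather/release protocol rests on monotonically bumped 64-bit flags: a worker publishes arrival by bumping its own b_arrived, and the primary thread releases each worker by bumping that worker's b_go past the generation it is spinning on. A minimal sketch of such a flag with std::atomic (the runtime's kmp_flag_64 layers sleeping, cancellation, and ITT hooks on top):

#include <atomic>

constexpr unsigned long long STATE_BUMP = 4; // stand-in for KMP_BARRIER_STATE_BUMP

struct flag64 {
  std::atomic<unsigned long long> value{0};
  void release() { value.fetch_add(STATE_BUMP, std::memory_order_release); }
  void wait(unsigned long long target) {
    // Spin until the owner bumps the flag to (or past) the target generation.
    while (value.load(std::memory_order_acquire) < target) { /* cpu pause */ }
  }
};

A worker spins in wait(prev_go + STATE_BUMP) on its b_go while the primary thread calls release() on that same flag; arrival works symmetrically on b_arrived with the roles reversed.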
@@ -166,7 +159,7 @@ static bool __kmp_linear_barrier_release_template(
KMP_DEBUG_ASSERT(team != NULL);
other_threads = team->t.t_threads;
- KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
+ KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
"barrier type %d\n",
gtid, team->t.t_id, tid, bt));
@@ -202,13 +195,12 @@ static bool __kmp_linear_barrier_release_template(
team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
other_threads[i]->th.th_bar[bt].bb.b_go,
other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
- ANNOTATE_BARRIER_BEGIN(other_threads[i]);
kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
- other_threads[i]);
+ other_threads[i]);
flag.release();
}
}
- } else { // Wait for the MASTER thread to release us
+ } else { // Wait for the PRIMARY thread to release us
KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
if (cancellable) {
@@ -219,7 +211,6 @@ static bool __kmp_linear_barrier_release_template(
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
}
- ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
// In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
@@ -288,10 +279,9 @@ static bool __kmp_linear_barrier_release_cancellable(
}
// Tree barrier
-static void
-__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
- int tid, void (*reduce)(void *, void *)
- USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+static void __kmp_tree_barrier_gather(
+ enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+ void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
kmp_team_t *team = this_thr->th.th_team;
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
@@ -301,7 +291,7 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
kmp_uint32 branch_factor = 1 << branch_bits;
kmp_uint32 child;
kmp_uint32 child_tid;
- kmp_uint64 new_state;
+ kmp_uint64 new_state = 0;
KA_TRACE(
20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
@@ -339,7 +329,6 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
// Wait for child to arrive
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
// Barrier imbalance - write min of the thread time and a child time to
// the thread.
@@ -353,14 +342,11 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
- ANNOTATE_REDUCE_AFTER(reduce);
OMPT_REDUCTION_DECL(this_thr, gtid);
OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
OMPT_REDUCTION_END;
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
child++;
child_tid++;
@@ -379,13 +365,12 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
// Mark arrival to parent thread
/* After performing this write, a worker thread may not assume that the team
- is valid any more - it could be deallocated by the master thread at any
+ is valid any more - it could be deallocated by the primary thread at any
time. */
- ANNOTATE_BARRIER_BEGIN(this_thr);
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
flag.release();
} else {
- // Need to update the team arrived pointer if we are the master thread
+ // Need to update the team arrived pointer if we are the primary thread
if (nproc > 1) // New value was already computed above
team->t.t_bar[bt].b_arrived = new_state;
else
@@ -420,7 +405,6 @@ static void __kmp_tree_barrier_release(
// Wait for parent thread to release us
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
// In fork barrier where we could not get the object reliably (or
@@ -455,7 +439,7 @@ static void __kmp_tree_barrier_release(
} else {
team = __kmp_threads[gtid]->th.th_team;
KMP_DEBUG_ASSERT(team != NULL);
- KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
+ KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
"barrier type %d\n",
gtid, team->t.t_id, tid, bt));
}
@@ -495,7 +479,6 @@ static void __kmp_tree_barrier_release(
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Release child from barrier
- ANNOTATE_BARRIER_BEGIN(child_thr);
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
flag.release();
child++;
@@ -508,10 +491,9 @@ static void __kmp_tree_barrier_release(
}
// Hyper Barrier
-static void
-__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
- int tid, void (*reduce)(void *, void *)
- USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+static void __kmp_hyper_barrier_gather(
+ enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+ void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
kmp_team_t *team = this_thr->th.th_team;
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
@@ -558,8 +540,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
// Mark arrival to parent thread
/* After performing this write (in the last iteration of the enclosing for
loop), a worker thread may not assume that the team is valid any more
- - it could be deallocated by the master thread at any time. */
- ANNOTATE_BARRIER_BEGIN(this_thr);
+ - it could be deallocated by the primary thread at any time. */
p_flag.set_waiter(other_threads[parent_tid]);
p_flag.release();
break;
@@ -588,7 +569,6 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
// Wait for child to arrive
kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(child_thr);
KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
// Barrier imbalance - write min of the thread time and a child time to
@@ -603,20 +583,17 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
- ANNOTATE_REDUCE_AFTER(reduce);
OMPT_REDUCTION_DECL(this_thr, gtid);
OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
OMPT_REDUCTION_END;
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
}
}
if (KMP_MASTER_TID(tid)) {
- // Need to update the team arrived pointer if we are the master thread
+ // Need to update the team arrived pointer if we are the primary thread
if (new_state == KMP_BARRIER_UNUSED_STATE)
team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
else
@@ -652,14 +629,14 @@ static void __kmp_hyper_barrier_release(
been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
are released in the reverse order of the corresponding gather, otherwise
threads are released in the same order. */
- if (KMP_MASTER_TID(tid)) { // master
+ if (KMP_MASTER_TID(tid)) { // primary thread
team = __kmp_threads[gtid]->th.th_team;
KMP_DEBUG_ASSERT(team != NULL);
- KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
+ KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
"barrier type %d\n",
gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
- if (propagate_icvs) { // master already has ICVs in final destination; copy
+ if (propagate_icvs) { // primary already has ICVs in final destination; copy
copy_icvs(&thr_bar->th_fixed_icvs,
&team->t.t_implicit_task_taskdata[tid].td_icvs);
}
@@ -670,7 +647,6 @@ static void __kmp_hyper_barrier_release(
// Wait for parent thread to release us
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
// In fork barrier where we could not get the object reliably
@@ -769,7 +745,6 @@ static void __kmp_hyper_barrier_release(
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Release child from barrier
- ANNOTATE_BARRIER_BEGIN(child_thr);
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
flag.release();
}
@@ -816,15 +791,15 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
}
if (uninitialized || team_sz_changed || tid_changed) {
- thr_bar->my_level = thr_bar->depth - 1; // default for master
- thr_bar->parent_tid = -1; // default for master
- if (!KMP_MASTER_TID(
- tid)) { // if not master, find parent thread in hierarchy
+ thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
+ thr_bar->parent_tid = -1; // default for primary thread
+ if (!KMP_MASTER_TID(tid)) {
+ // if not primary thread, find parent thread in hierarchy
kmp_uint32 d = 0;
while (d < thr_bar->depth) { // find parent based on level of thread in
// hierarchy, and note level
kmp_uint32 rem;
- if (d == thr_bar->depth - 2) { // reached level right below the master
+ if (d == thr_bar->depth - 2) { // reached level right below the primary
thr_bar->parent_tid = 0;
thr_bar->my_level = d;
break;
@@ -875,7 +850,7 @@ static void __kmp_hierarchical_barrier_gather(
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
kmp_uint32 nproc = this_thr->th.th_team_nproc;
kmp_info_t **other_threads = team->t.t_threads;
- kmp_uint64 new_state;
+ kmp_uint64 new_state = 0;
int level = team->t.t_level;
if (other_threads[0]
@@ -920,7 +895,6 @@ static void __kmp_hierarchical_barrier_gather(
kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
if (reduce) {
- ANNOTATE_REDUCE_AFTER(reduce);
OMPT_REDUCTION_DECL(this_thr, gtid);
OMPT_REDUCTION_BEGIN;
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
@@ -930,13 +904,10 @@ static void __kmp_hierarchical_barrier_gather(
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
child_tid));
- ANNOTATE_BARRIER_END(other_threads[child_tid]);
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[child_tid]->th.th_local.reduce_data);
}
OMPT_REDUCTION_END;
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
// clear leaf_state bits
KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
@@ -959,18 +930,14 @@ static void __kmp_hierarchical_barrier_gather(
child_tid, &child_bar->b_arrived, new_state));
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(child_thr);
if (reduce) {
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
"T#%d(%d:%d)\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
child_tid));
- ANNOTATE_REDUCE_AFTER(reduce);
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
}
}
@@ -992,24 +959,20 @@ static void __kmp_hierarchical_barrier_gather(
child_tid, &child_bar->b_arrived, new_state));
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(child_thr);
if (reduce) {
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
"T#%d(%d:%d)\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
child_tid));
- ANNOTATE_REDUCE_AFTER(reduce);
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
- ANNOTATE_REDUCE_BEFORE(reduce);
- ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
}
}
}
}
- // All subordinates are gathered; now release parent if not master thread
+ // All subordinates are gathered; now release parent if not primary thread
if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
@@ -1020,11 +983,10 @@ static void __kmp_hierarchical_barrier_gather(
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
/* Mark arrival to parent: After performing this write, a worker thread may
not assume that the team is valid any more - it could be deallocated by
- the master thread at any time. */
+ the primary thread at any time. */
if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
!thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
// flag; release it
- ANNOTATE_BARRIER_BEGIN(this_thr);
kmp_flag_64<> flag(&thr_bar->b_arrived,
other_threads[thr_bar->parent_tid]);
flag.release();
@@ -1036,7 +998,7 @@ static void __kmp_hierarchical_barrier_gather(
flag.set_waiter(other_threads[thr_bar->parent_tid]);
flag.release();
}
- } else { // Master thread needs to update the team's b_arrived value
+ } else { // Primary thread needs to update the team's b_arrived value
team->t.t_bar[bt].b_arrived = new_state;
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
"arrived(%p) = %llu\n",
@@ -1061,7 +1023,7 @@ static void __kmp_hierarchical_barrier_release(
if (KMP_MASTER_TID(tid)) {
team = __kmp_threads[gtid]->th.th_team;
KMP_DEBUG_ASSERT(team != NULL);
- KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
+ KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
"entered barrier type %d\n",
gtid, team->t.t_id, tid, bt));
} else { // Worker threads
@@ -1073,7 +1035,6 @@ static void __kmp_hierarchical_barrier_release(
thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
- ANNOTATE_BARRIER_END(this_thr);
TCW_8(thr_bar->b_go,
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
} else { // Thread barrier data is initialized, this is a leaf, blocktime is
@@ -1139,7 +1100,7 @@ static void __kmp_hierarchical_barrier_release(
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
FALSE);
if (KMP_MASTER_TID(
- tid)) { // master already has copy in final destination; copy
+ tid)) { // primary already has copy in final destination; copy
copy_icvs(&thr_bar->th_fixed_icvs,
&team->t.t_implicit_task_taskdata[tid].td_icvs);
} else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
@@ -1219,7 +1180,6 @@ static void __kmp_hierarchical_barrier_release(
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Release child using child's b_go flag
- ANNOTATE_BARRIER_BEGIN(child_thr);
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
flag.release();
}
@@ -1245,7 +1205,6 @@ static void __kmp_hierarchical_barrier_release(
child_tid, &child_bar->b_go, child_bar->b_go,
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Release child using child's b_go flag
- ANNOTATE_BARRIER_BEGIN(child_thr);
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
flag.release();
}
@@ -1289,7 +1248,7 @@ template <> struct is_cancellable<false> {
If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
barrier
When cancellable = false,
- Returns 0 if master thread, 1 if worker thread.
+ Returns 0 if primary thread, 1 if worker thread.
When cancellable = true
Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
@@ -1313,7 +1272,6 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
- ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
@@ -1376,7 +1334,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
// Let the debugger know: the thread arrived to the barrier and waiting.
- if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
+ if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
team->t.t_bar[bt].b_master_arrived += 1;
} else {
this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
@@ -1444,7 +1402,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
}
}
#if USE_ITT_BUILD
- /* TODO: In case of split reduction barrier, master thread may send
+ /* TODO: In case of split reduction barrier, primary thread may send
acquired event early, before the final summation into the shared
variable is done (final summation can be a long operation for array
reductions). */
@@ -1476,7 +1434,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
break;
case 3:
if (__itt_metadata_add_ptr) {
- // Initialize with master's wait time
+ // Initialize with primary thread's wait time
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
// Set arrive time to zero to be able to check it in
// __kmp_invoke_task(); the same is done inside the loop below
@@ -1589,14 +1547,13 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
}
#endif
- ANNOTATE_BARRIER_END(&team->t.t_bar);
if (cancellable)
return (int)cancelled;
return status;
}
-// Returns 0 if master thread, 1 if worker thread.
+// Returns 0 if primary thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
size_t reduce_size, void *reduce_data,
void (*reduce)(void *, void *)) {
@@ -1614,7 +1571,7 @@ int __kmp_barrier_gomp_cancel(int gtid) {
int tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *this_thr = __kmp_threads[gtid];
if (KMP_MASTER_TID(tid)) {
- // Master does not need to revert anything
+ // Primary thread does not need to revert anything
} else {
// Workers need to revert their private b_arrived flag
this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
@@ -1631,11 +1588,11 @@ int __kmp_barrier_gomp_cancel(int gtid) {
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+ KMP_DEBUG_ASSERT(bt < bs_last_barrier);
int tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = this_thr->th.th_team;
- ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
if (!team->t.t_serialized) {
if (KMP_MASTER_GTID(gtid)) {
switch (__kmp_barrier_release_pattern[bt]) {
@@ -1666,12 +1623,14 @@ void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
} // if
}
}
- ANNOTATE_BARRIER_END(&team->t.t_bar);
}
void __kmp_join_barrier(int gtid) {
KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
+
+ KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
+
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team;
kmp_uint nproc;
@@ -1708,14 +1667,12 @@ void __kmp_join_barrier(int gtid) {
KMP_MB();
// Verify state
- KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
gtid, team_id, tid));
- ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
@@ -1809,7 +1766,7 @@ void __kmp_join_barrier(int gtid) {
}
/* From this point on, the team data structure may be deallocated at any time
- by the master thread - it is unsafe to reference it in any of the worker
+ by the primary thread - it is unsafe to reference it in any of the worker
threads. Any per-team data items that need to be referenced before the
end of the barrier should be moved to the kmp_task_team_t structs. */
if (KMP_MASTER_TID(tid)) {
@@ -1820,7 +1777,7 @@ void __kmp_join_barrier(int gtid) {
KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
}
#if KMP_STATS_ENABLED
- // Have master thread flag the workers to indicate they are now waiting for
+ // Have primary thread flag the workers to indicate they are now waiting for
// next parallel region. Also wake them up so they switch their timers to
// idle.
for (int i = 0; i < team->t.t_nproc; ++i) {
@@ -1862,7 +1819,7 @@ void __kmp_join_barrier(int gtid) {
break;
case 3:
if (__itt_metadata_add_ptr) {
- // Initialize with master's wait time
+ // Initialize with primary thread's wait time
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
// Set arrive time to zero to be able to check it in
// __kmp_invoke_task(); the same is done inside the loop below
@@ -1903,7 +1860,6 @@ void __kmp_join_barrier(int gtid) {
KA_TRACE(10,
("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
- ANNOTATE_BARRIER_END(&team->t.t_bar);
}
// TODO release worker threads' fork barriers as we are ready instead of all at
@@ -1917,12 +1873,11 @@ void __kmp_fork_barrier(int gtid, int tid) {
void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
if (team)
- ANNOTATE_BARRIER_END(&team->t.t_bar);
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
(team != NULL) ? team->t.t_id : -1, tid));
- // th_team pointer only valid for master thread here
+ // th_team pointer only valid for primary thread here
if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
@@ -1933,6 +1888,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
#ifdef KMP_DEBUG
+ KMP_DEBUG_ASSERT(team);
kmp_info_t **other_threads = team->t.t_threads;
int i;
@@ -1958,8 +1914,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
__kmp_task_team_setup(this_thr, team, 0);
}
- /* The master thread may have changed its blocktime between the join barrier
- and the fork barrier. Copy the blocktime info to the thread, where
+ /* The primary thread may have changed its blocktime between join barrier
+ and fork barrier. Copy the blocktime info to the thread, where
__kmp_wait_template() can access it when the team struct is not
guaranteed to exist. */
// See note about the corresponding code in __kmp_join_barrier() being
@@ -1974,7 +1930,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
}
- } // master
+ } // primary thread
switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
case bp_hyper_bar: {
@@ -2013,7 +1969,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
if (KMP_MASTER_TID(ds_tid) &&
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
- codeptr = team->t.ompt_team_info.master_return_address;
+ codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
@@ -2027,7 +1983,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+ ompt_scope_end, NULL, task_data, 0, ds_tid,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
}
#endif
@@ -2050,25 +2007,25 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
/* We can now assume that a valid team structure has been allocated by the
- master and propagated to all worker threads. The current thread, however,
- may not be part of the team, so we can't blindly assume that the team
- pointer is non-null. */
+ primary thread and propagated to all worker threads. The current thread,
+ however, may not be part of the team, so we can't blindly assume that the
+ team pointer is non-null. */
team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
KMP_DEBUG_ASSERT(team != NULL);
tid = __kmp_tid_from_gtid(gtid);
#if KMP_BARRIER_ICV_PULL
- /* Master thread's copy of the ICVs was set up on the implicit taskdata in
- __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
+ /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
+ __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
implicit task has this data before this function is called. We cannot
- modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
- struct, because it is not always the case that the threads arrays have
- been allocated when __kmp_fork_call() is executed. */
+ modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
+ thread struct, because it is not always the case that the threads arrays
+ have been allocated when __kmp_fork_call() is executed. */
{
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
- if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
- // Copy the initial ICVs from the master's thread struct to the implicit
- // task for this tid.
+ if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
+ // Copy the initial ICVs from the primary thread's thread struct to the
+ // implicit task for this tid.
KA_TRACE(10,
("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
@@ -2108,7 +2065,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
#if KMP_AFFINITY_SUPPORTED
|| (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
- ) {
+ ) {
// NULL means use the affinity-format-var ICV
__kmp_aux_display_affinity(gtid, NULL);
this_thr->th.th_prev_num_threads = team->t.t_nproc;
@@ -2127,7 +2084,6 @@ void __kmp_fork_barrier(int gtid, int tid) {
} // (prepare called inside barrier_release)
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
- ANNOTATE_BARRIER_END(&team->t.t_bar);
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
team->t.t_id, tid));
}
@@ -2139,13 +2095,13 @@ void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
-/* Master thread's copy of the ICVs was set up on the implicit taskdata in
- __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
+/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
+ __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
- /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
- untouched), where all of the worker threads can access them and make their
- own copies after the barrier. */
+ /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
+ remains untouched), where all of the worker threads can access them and
+ make their own copies after the barrier. */
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
// allocated at this point
copy_icvs(
@@ -2159,12 +2115,12 @@ void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
team->t.t_threads[0], team));
#else
- // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
+ // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
// time.
ngo_load(new_icvs);
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
// allocated at this point
- for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
+ for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
// TODO: GEH - pass in better source location info since usually NULL here
KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
f, team->t.t_threads[f], team));
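
The linear ICV push is just a struct copy into every worker's implicit task, skipping slot 0 because the primary thread already holds the values. A sketch under that assumption, with the control block reduced to a toy struct:

struct icvs_t { int nproc; int dyn; }; // stand-in for kmp_internal_control_t

void push_icvs(icvs_t *slots, int new_nproc, const icvs_t &new_icvs) {
  // Slot 0 is the primary thread's copy, already in place.
  for (int f = 1; f < new_nproc; ++f)
    slots[f] = new_icvs; // O(nthreads) copies, as the comment above notes
}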
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 3d682c690fc7..0b07d115ff7b 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,8 +44,10 @@
#define OMPT_DEBUG LIBOMP_OMPT_DEBUG
#cmakedefine01 LIBOMP_OMPT_SUPPORT
#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
-#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT
-#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT
+#cmakedefine01 LIBOMP_OMPD_SUPPORT
+#define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT
+#cmakedefine01 LIBOMP_PROFILING_SUPPORT
+#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
#cmakedefine01 LIBOMP_OMPT_OPTIONAL
#define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
#cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
@@ -66,10 +68,6 @@
#define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@"
#define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@
#define KMP_VERSION_MINOR @LIBOMP_VERSION_MINOR@
-#cmakedefine01 LIBOMP_TSAN_SUPPORT
-#if LIBOMP_TSAN_SUPPORT
-#define TSAN_SUPPORT
-#endif
#cmakedefine01 MSVC
#define KMP_MSVC_COMPAT MSVC
#cmakedefine01 LIBOMP_HAVE_WAITPKG_INTRINSICS
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 4b188763a58a..2a7c9a8cb2ec 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -39,6 +39,7 @@ void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
__kmp_str_match_true(env)) {
__kmp_middle_initialize();
+ __kmp_assign_root_init_mask();
KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
} else if (__kmp_ignore_mppbeg() == FALSE) {
// By default __kmp_ignore_mppbeg() returns TRUE.
@@ -88,7 +89,7 @@ If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
-construct, since the master thread is necessarily thread zero).
+construct, since the primary thread is necessarily thread zero).
If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
@@ -316,7 +317,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
,
fork_context_intel
#endif
- );
+ );
va_end(ap);
}
@@ -353,6 +354,33 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
/*!
@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_teams_lb lower bound on the number of teams requested for the teams
+construct
+@param num_teams_ub upper bound on the number of teams requested for the teams
+construct
+@param num_threads number of threads per team requested for the teams construct
+
+Set the number of teams to be used by the teams construct. The number of initial
+teams created will be greater than or equal to the lower bound and less than or
+equal to the upper bound.
+This call is only required if the teams construct has a `num_teams` clause
+or a `thread_limit` clause (or both).
+*/
+void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
+ kmp_int32 num_threads) {
+ KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
+ " num_teams_ub=%d num_threads=%d\n",
+ global_tid, num_teams_lb, num_teams_ub, num_threads));
+ __kmp_assert_valid_gtid(global_tid);
+ __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
+ num_threads);
+}
+
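A hedged sketch of the call a compiler might emit for the OpenMP 5.1 range form of the num_teams clause; the outlined function name and argument spelling are illustrative, not literal clang codegen:

/* for: #pragma omp teams num_teams(4 : 8) thread_limit(16) */
__kmpc_push_num_teams_51(&loc, gtid, /*num_teams_lb=*/4,
                         /*num_teams_ub=*/8, /*num_threads=*/16);
__kmpc_fork_teams(&loc, /*argc=*/0, (kmpc_micro)outlined_teams_fn);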
+/*!
+@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
@@ -411,7 +439,7 @@ void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
,
fork_context_intel
#endif
- );
+ );
// Pop current CG root off list
KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
@@ -567,7 +595,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
--serial_team->t.t_serialized;
if (serial_team->t.t_serialized == 0) {
-/* return to the parallel section */
+ /* return to the parallel section */
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
@@ -577,6 +605,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_parallel_end();
+#endif
+
this_thr->th.th_team = serial_team->t.t_parent;
this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
@@ -823,6 +856,92 @@ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
/*!
@ingroup WORK_SHARING
@param loc source location information.
+@param global_tid global thread number.
+@param filter result of evaluating filter clause on thread global_tid, or zero
+if no filter clause present
+@return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
+*/
+kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
+ int status = 0;
+ int tid;
+ KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
+
+ if (!TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+ __kmp_resume_if_soft_paused();
+
+ tid = __kmp_tid_from_gtid(global_tid);
+ if (tid == filter) {
+ KMP_COUNT_BLOCK(OMP_MASKED);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
+ status = 1;
+ }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ if (status) {
+ if (ompt_enabled.ompt_callback_masked) {
+ kmp_info_t *this_thr = __kmp_threads[global_tid];
+ kmp_team_t *team = this_thr->th.th_team;
+ ompt_callbacks.ompt_callback(ompt_callback_masked)(
+ ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
+ &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+ OMPT_GET_RETURN_ADDRESS(0));
+ }
+ }
+#endif
+
+ if (__kmp_env_consistency_check) {
+#if KMP_USE_DYNAMIC_LOCK
+ if (status)
+ __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
+ else
+ __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
+#else
+ if (status)
+ __kmp_push_sync(global_tid, ct_masked, loc, NULL);
+ else
+ __kmp_check_sync(global_tid, ct_masked, loc, NULL);
+#endif
+ }
+
+ return status;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+
+Mark the end of a <tt>masked</tt> region. This should only be called by the
+thread that executes the <tt>masked</tt> region.
+*/
+void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
+ KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
+ __kmp_assert_valid_gtid(global_tid);
+ KMP_POP_PARTITIONED_TIMER();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ kmp_info_t *this_thr = __kmp_threads[global_tid];
+ kmp_team_t *team = this_thr->th.th_team;
+ if (ompt_enabled.ompt_callback_masked) {
+ int tid = __kmp_tid_from_gtid(global_tid);
+ ompt_callbacks.ompt_callback(ompt_callback_masked)(
+ ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
+ &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+ OMPT_GET_RETURN_ADDRESS(0));
+ }
+#endif
+
+ if (__kmp_env_consistency_check) {
+ __kmp_pop_sync(global_tid, ct_masked, loc);
+ }
+}
+
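Taken together, a masked region lowers to a guarded call pair. This sketch illustrates the contract (only the filtered thread runs the body and the end call), not literal compiler output:

/* for: #pragma omp masked filter(expr) { body(); } */
kmp_int32 filter = /* evaluate expr on this thread */ 0;
if (__kmpc_masked(&loc, gtid, filter)) {
  body(); // runs only on the thread whose team-local id equals filter
  __kmpc_end_masked(&loc, gtid); // must be called by the same thread
}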
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
@param gtid global thread number.
Start execution of an <tt>ordered</tt> construct.
@@ -1175,10 +1294,10 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
if (__kmp_env_consistency_check)
__kmp_push_sync(global_tid, ct_critical, loc, lck);
-// since the critical directive binds to all threads, not just the current
-// team we have to check this even if we are in a serialized team.
-// also, even if we are the uber thread, we still have to conduct the lock,
-// as we have to contend with sibling threads.
+ // since the critical directive binds to all threads, not just the current
+ // team we have to check this even if we are in a serialized team.
+ // also, even if we are the uber thread, we still have to conduct the lock,
+ // as we have to contend with sibling threads.
#if USE_ITT_BUILD
__kmp_itt_critical_acquiring(lck);
@@ -1381,13 +1500,13 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
// Check if it is initialized.
KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
+ kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
if (*lk == 0) {
- kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
- if (KMP_IS_D_LOCK(lckseq)) {
+ if (KMP_IS_D_LOCK(lockseq)) {
KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
- KMP_GET_D_TAG(lckseq));
+ KMP_GET_D_TAG(lockseq));
} else {
- __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
+ __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
}
}
// Branch for accessing the actual lock object and set operation. This
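The hint value originates from the user's hint clause on a named critical construct; the pragma below is standard OpenMP, while the surrounding lowering remains compiler-specific:

#include <omp.h>
void bump(int *hist, int bin) {
  // the constant hint is mapped once via __kmp_map_hint_to_lock() and
  // cached in the named critical's lock word on first entry
  #pragma omp critical(update_hist) hint(omp_sync_hint_contended)
  hist[bin]++;
}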
@@ -1420,11 +1539,11 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
}
#endif
#if KMP_USE_INLINED_TAS
- if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+ if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
} else
#elif KMP_USE_INLINED_FUTEX
- if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+ if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
} else
#endif
@@ -1501,7 +1620,8 @@ void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
#if KMP_USE_DYNAMIC_LOCK
- if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+ int locktag = KMP_EXTRACT_D_TAG(crit);
+ if (locktag) {
lck = (kmp_user_lock_p)crit;
KMP_ASSERT(lck != NULL);
if (__kmp_env_consistency_check) {
@@ -1511,11 +1631,11 @@ void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
__kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
- if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+ if (locktag == locktag_tas && !__kmp_env_consistency_check) {
KMP_RELEASE_TAS_LOCK(lck, global_tid);
} else
#elif KMP_USE_INLINED_FUTEX
- if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+ if (locktag == locktag_futex && !__kmp_env_consistency_check) {
KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
} else
#endif
@@ -1877,8 +1997,7 @@ int ompc_get_team_size(int level) {
}
/* OpenMP 5.0 Affinity Format API */
-
-void ompc_set_affinity_format(char const *format) {
+void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
}
@@ -1886,7 +2005,7 @@ void ompc_set_affinity_format(char const *format) {
format, KMP_STRLEN(format) + 1);
}
-size_t ompc_get_affinity_format(char *buffer, size_t size) {
+size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
size_t format_size;
if (!__kmp_init_serial) {
__kmp_serial_initialize();
@@ -1899,23 +2018,25 @@ size_t ompc_get_affinity_format(char *buffer, size_t size) {
return format_size;
}
-void ompc_display_affinity(char const *format) {
+void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
int gtid;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
__kmp_aux_display_affinity(gtid, format);
}
-size_t ompc_capture_affinity(char *buffer, size_t buf_size,
- char const *format) {
+size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
+ char const *format) {
int gtid;
size_t num_required;
kmp_str_buf_t capture_buf;
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
__kmp_str_buf_init(&capture_buf);
num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
@@ -1961,8 +2082,10 @@ void kmpc_set_defaults(char const *str) {
void kmpc_set_disp_num_buffers(int arg) {
// ignore after initialization because some teams have already
// allocated dispatch buffers
- if (__kmp_init_serial == 0 && arg > 0)
+ if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
+ arg <= KMP_MAX_DISP_NUM_BUFF) {
__kmp_dispatch_num_buffers = arg;
+ }
}
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
@@ -1972,6 +2095,7 @@ int kmpc_set_affinity_mask_proc(int proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}
@@ -1983,6 +2107,7 @@ int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}
@@ -1994,6 +2119,7 @@ int kmpc_get_affinity_mask_proc(int proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}
@@ -2083,21 +2209,21 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
if (!didit)
(*cpy_func)(cpy_data, *data_ptr);
-// Consider next barrier a user-visible barrier for barrier region boundaries
-// Nesting checks are already handled by the single construct checks
+ // Consider next barrier a user-visible barrier for barrier region boundaries
+ // Nesting checks are already handled by the single construct checks
{
#if OMPT_SUPPORT
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
#if USE_ITT_NOTIFY
- __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
+ __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
- __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+ __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
- if (ompt_enabled.enabled) {
- ompt_frame->enter_frame = ompt_data_none;
- }
+ if (ompt_enabled.enabled) {
+ ompt_frame->enter_frame = ompt_data_none;
+ }
#endif
}
}
@@ -2393,12 +2519,6 @@ void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
if (!codeptr)
codeptr = OMPT_GET_RETURN_ADDRESS(0);
if (ompt_enabled.ompt_callback_lock_destroy) {
- kmp_user_lock_p lck;
- if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
- lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
- } else {
- lck = (kmp_user_lock_p)user_lock;
- }
ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
}
@@ -2642,7 +2762,7 @@ void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#endif
int acquire_status =
KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
- (void) acquire_status;
+ (void)acquire_status;
#if USE_ITT_BUILD
__kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
@@ -2835,7 +2955,7 @@ void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#endif
int release_status =
KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
- (void) release_status;
+ (void)release_status;
#if OMPT_SUPPORT && OMPT_OPTIONAL
// This is the case, if called from omp_init_lock_with_hint:
@@ -3055,7 +3175,7 @@ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
return (rc ? FTN_TRUE : FTN_FALSE);
-/* Can't use serial interval since not block structured */
+ /* Can't use serial interval since not block structured */
#endif // KMP_USE_DYNAMIC_LOCK
}
@@ -3173,7 +3293,7 @@ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#endif
return rc;
-/* Can't use serial interval since not block structured */
+ /* Can't use serial interval since not block structured */
#endif // KMP_USE_DYNAMIC_LOCK
}
@@ -3346,7 +3466,7 @@ __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
-@result 1 for the master thread, 0 for all other team threads, 2 for all team
+@result 1 for the primary thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed
The nowait version is used for a reduce clause with the nowait argument.
@@ -3442,11 +3562,11 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
tree_reduce_block)) {
// AT: performance issue: a real barrier here
-// AT: (if master goes slow, other threads are blocked here waiting for the
-// master to come and release them)
-// AT: (it's not what a customer might expect specifying NOWAIT clause)
-// AT: (specifying NOWAIT won't result in improvement of performance, it'll
-// be confusing to a customer)
+// AT: (if primary thread is slow, other threads are blocked here waiting for
+// the primary thread to come and release them)
+// AT: (it's not what a customer might expect specifying NOWAIT clause)
+// AT: (specifying NOWAIT won't result in improvement of performance, it'll
+// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with sense of NOWAIT
// AT: TO DO: do epcc test and compare times
@@ -3480,7 +3600,7 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
}
#endif
- // all other workers except master should do this pop here
+ // all other workers except primary thread should do this pop here
// ( none of other workers will get to __kmpc_end_reduce_nowait() )
if (__kmp_env_consistency_check) {
if (retval == 0) {
@@ -3538,7 +3658,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
} else if (packed_reduction_method == atomic_reduce_block) {
- // neither master nor other workers should get here
+ // neither primary thread nor other workers should get here
// (code gen does not generate this call in case 2: atomic reduce block)
// actually it's better to remove this elseif at all;
// after removal this value will checked by the 'else' and will assert
@@ -3546,7 +3666,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
} else if (TEST_REDUCTION_METHOD(packed_reduction_method,
tree_reduce_block)) {
- // only master gets here
+ // only primary thread gets here
// OMPT: tree reduction is annotated in the barrier code
} else {
@@ -3576,7 +3696,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
-@result 1 for the master thread, 0 for all other team threads, 2 for all team
+@result 1 for the primary thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed
A blocking reduce that includes an implicit barrier.
@@ -3670,10 +3790,10 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
}
#endif
- // all other workers except master should do this pop here
- // ( none of other workers except master will enter __kmpc_end_reduce() )
+ // all other workers except primary thread should do this pop here
+ // (none of other workers except primary will enter __kmpc_end_reduce())
if (__kmp_env_consistency_check) {
- if (retval == 0) { // 0: all other workers; 1: master
+ if (retval == 0) { // 0: all other workers; 1: primary thread
__kmp_pop_sync(global_tid, ct_reduce, loc);
}
}
@@ -3799,7 +3919,7 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
} else if (TEST_REDUCTION_METHOD(packed_reduction_method,
tree_reduce_block)) {
- // only master executes here (master releases all other workers)
+ // only primary thread executes here (primary releases all other workers)
__kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
global_tid);
@@ -4242,3 +4362,67 @@ int __kmpc_pause_resource(kmp_pause_status_t level) {
}
return __kmp_pause_resource(level);
}
+
+void __kmpc_error(ident_t *loc, int severity, const char *message) {
+ if (!__kmp_init_serial)
+ __kmp_serial_initialize();
+
+ KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
+ ompt_callbacks.ompt_callback(ompt_callback_error)(
+ (ompt_severity_t)severity, message, KMP_STRLEN(message),
+ OMPT_GET_RETURN_ADDRESS(0));
+ }
+#endif // OMPT_SUPPORT
+
+ char *src_loc;
+ if (loc && loc->psource) {
+ kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
+ src_loc =
+ __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
+ __kmp_str_loc_free(&str_loc);
+ } else {
+ src_loc = __kmp_str_format("unknown");
+ }
+
+ if (severity == severity_warning)
+ KMP_WARNING(UserDirectedWarning, src_loc, message);
+ else
+ KMP_FATAL(UserDirectedError, src_loc, message);
+
+ __kmp_str_free(&src_loc);
+}
+
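__kmpc_error is the entry point for the OpenMP 5.1 error directive with execution-time effect; a sketch of the mapping (the directive is standard, the lowered call shape is illustrative):

/* for: #pragma omp error at(execution) severity(warning) message("bad cfg") */
__kmpc_error(&loc, severity_warning, "bad cfg");
/* severity(fatal) instead routes to KMP_FATAL and terminates the program */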
+#ifdef KMP_USE_VERSION_SYMBOLS
+// For GOMP compatibility there are two versions of each omp_* API.
+// One is the plain C symbol and one is the Fortran symbol with an appended
+// underscore. When we implement a specific ompc_* version of an omp_*
+// function, we want the plain GOMP versioned symbol to alias the ompc_* version
+// instead of the Fortran versions in kmp_ftn_entry.h
+extern "C" {
+// Have to undef these from omp.h so they aren't translated into
+// their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
+#ifdef omp_set_affinity_format
+#undef omp_set_affinity_format
+#endif
+#ifdef omp_get_affinity_format
+#undef omp_get_affinity_format
+#endif
+#ifdef omp_display_affinity
+#undef omp_display_affinity
+#endif
+#ifdef omp_capture_affinity
+#undef omp_capture_affinity
+#endif
+KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
+ "OMP_5.0");
+KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
+ "OMP_5.0");
+KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
+ "OMP_5.0");
+KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
+ "OMP_5.0");
+} // extern "C"
+#endif
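On ELF targets, macros of this kind are conventionally built on GNU symbol versioning; a minimal standalone illustration of the mechanism (an assumption about the macro's shape, not its literal expansion):

/* make the default-versioned omp_display_affinity@@OMP_5.0 resolve to the
   ompc_ implementation instead of the Fortran entry point */
__asm__(".symver ompc_display_affinity, omp_display_affinity@@OMP_5.0");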
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a805ee44d5db..cc2d0012bf38 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -77,10 +77,10 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
// Let default be monotonic for executables
// compiled with OpenMP* 4.5 or less compilers
- if (loc->get_openmp_version() < 50)
+ if (loc != NULL && loc->get_openmp_version() < 50)
monotonicity = SCHEDULE_MONOTONIC;
- if (use_hier)
+ if (use_hier || __kmp_force_monotonic)
monotonicity = SCHEDULE_MONOTONIC;
else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
monotonicity = SCHEDULE_NONMONOTONIC;
@@ -90,6 +90,22 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
return monotonicity;
}
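The monotonicity being computed mirrors the user-visible schedule modifiers; a small compilable reference point (standard OpenMP syntax, function name illustrative):

void scale(int *a, int n) {
  // nonmonotonic permits chunk stealing; monotonic guarantees each thread
  // sees its chunks in increasing iteration order
  #pragma omp parallel for schedule(nonmonotonic : dynamic, 4)
  for (int i = 0; i < n; ++i)
    a[i] *= 2;
}
/* OpenMP 5.0 made nonmonotonic the default for dynamic schedules, which is
   why code objects reporting a pre-5.0 version fall back to MONOTONIC. */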
+#if KMP_STATIC_STEAL_ENABLED
+enum { // values for steal_flag (possible states of private per-loop buffer)
+ UNUSED = 0,
+ CLAIMED = 1, // owner thread started initialization
+ READY = 2, // available for stealing
+ THIEF = 3 // finished by owner, or claimed by thief
+ // possible state changes:
+ // 0 -> 1 owner only, sync
+ // 0 -> 3 thief only, sync
+ // 1 -> 2 owner only, async
+ // 2 -> 3 owner only, async
+ // 3 -> 2 owner only, async
+ // 3 -> 0 last thread finishing the loop, async
+};
+#endif
+
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
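The steal_flag states introduced above are driven by compare-and-swap on first touch; a minimal model of the claim race using std::atomic (a sketch, not the kmp types):

#include <atomic>
enum { UNUSED = 0, CLAIMED = 1, READY = 2, THIEF = 3 };

// owner attempts 0 -> 1; a thief racing it may instead perform 0 -> 3 and
// take the whole range, in which case the owner starts with an empty buffer
bool try_claim_own_buffer(std::atomic<unsigned> &steal_flag) {
  unsigned old = UNUSED;
  return steal_flag.compare_exchange_strong(old, CLAIMED);
}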
@@ -187,6 +203,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
schedule = team->t.t_sched.r_sched_type;
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+ if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
+ monotonicity = SCHEDULE_MONOTONIC;
// Detail the schedule if needed (global controls are differentiated
// appropriately)
if (schedule == kmp_sch_guided_chunked) {
@@ -346,7 +364,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
}
switch (schedule) {
-#if (KMP_STATIC_STEAL_ENABLED)
+#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
T ntc, init;
@@ -359,41 +377,45 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;
T small_chunk, extras;
-
+ kmp_uint32 old = UNUSED;
+ int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
+ if (traits_t<T>::type_size > 4) {
+ // AC: TODO: check if 16-byte CAS available and use it to
+ // improve performance (probably wait for explicit request
+ // before spending time on this).
+ // For now use dynamically allocated per-private-buffer lock,
+ // free memory in __kmp_dispatch_next when status==0.
+ pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
+ __kmp_init_lock(pr->u.p.steal_lock);
+ }
small_chunk = ntc / nproc;
extras = ntc % nproc;
init = id * small_chunk + (id < extras ? id : extras);
pr->u.p.count = init;
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
-
- pr->u.p.parm2 = lb;
+ if (claimed) { // did we succeed in claiming our own buffer?
+ pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ // Other threads will inspect steal_flag when searching for a victim.
+ // READY means other threads may steal from this thread from now on.
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
+ } else {
+ // another thread has stolen our whole range
+ KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
+ pr->u.p.ub = init; // mark that there are no iterations to work on
+ }
+ pr->u.p.parm2 = ntc; // save number of chunks
// parm3 is the number of times to attempt stealing which is
- // proportional to the number of chunks per thread up until
- // the maximum value of nproc.
- pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
+ // nproc (just a heuristic; could be optimized later on).
+ pr->u.p.parm3 = nproc;
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
- pr->u.p.st = st;
- if (traits_t<T>::type_size > 4) {
- // AC: TODO: check if 16-byte CAS available and use it to
- // improve performance (probably wait for explicit request
- // before spending time on this).
- // For now use dynamically allocated per-thread lock,
- // free memory in __kmp_dispatch_next when status==0.
- KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
- pr->u.p.th_steal_lock =
- (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
- __kmp_init_lock(pr->u.p.th_steal_lock);
- }
break;
} else {
/* too few chunks: switching to kmp_sch_dynamic_chunked */
schedule = kmp_sch_dynamic_chunked;
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
"kmp_sch_dynamic_chunked\n",
- gtid));
- if (pr->u.p.parm1 <= 0)
- pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
+ gtid));
+ goto dynamic_init;
break;
} // if
} // case
@@ -490,6 +512,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
if ((2L * chunk + 1) * nproc >= tc) {
/* chunk size too large, switch to dynamic */
schedule = kmp_sch_dynamic_chunked;
+ goto dynamic_init;
} else {
// when remaining iters become less than parm2 - switch to dynamic
pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
@@ -519,6 +542,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
if ((2L * chunk + 1) * nproc >= tc) {
/* chunk size too large, switch to dynamic */
schedule = kmp_sch_dynamic_chunked;
+ goto dynamic_init;
} else {
/* commonly used term: (2 nproc - 1)/(2 nproc) */
DBL x;
@@ -615,8 +639,9 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
/* dynamic-style scheduling offset */
- pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
- tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
+ pr->u.p.count = tc -
+ __kmp_dispatch_guided_remaining(
+ tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
cross * chunk;
#if KMP_USE_X87CONTROL
// restore FPCW
@@ -642,9 +667,14 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
break;
case kmp_sch_static_chunked:
case kmp_sch_dynamic_chunked:
- if (pr->u.p.parm1 <= 0) {
+ dynamic_init:
+ if (pr->u.p.parm1 <= 0)
pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
- }
+ else if (pr->u.p.parm1 > tc)
+ pr->u.p.parm1 = tc;
+ // Store the total number of chunks to prevent integer overflow during
+ // bounds calculations in the get next chunk routine.
+ pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
gtid));
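The new parm2 value is a ceiling division kept in integer arithmetic; e.g. tc = 10 iterations with chunk parm1 = 4 gives 10/4 + 1 = 3 chunks. A sketch of the computation with plain unsigned types for brevity:

static unsigned num_chunks(unsigned tc, unsigned chunk) {
  return tc / chunk + (tc % chunk ? 1 : 0); // ceil(tc / chunk)
}

Bounding the shared chunk counter by this value keeps chunk_index * chunk_size from ever exceeding the induction type's range.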
@@ -702,7 +732,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
KMP_HNT(GetNewerLibrary), // Hint
__kmp_msg_null // Variadic argument list terminator
- );
+ );
} break;
} // switch
pr->schedule = schedule;
@@ -874,6 +904,18 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
my_buffer_index));
+ if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
+ KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
+ " sh->buffer_index:%d\n",
+ gtid, my_buffer_index, sh->buffer_index));
+ __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
+ __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
+ // Note: KMP_WAIT() cannot be used there: buffer index and
+ // my_buffer_index are *always* 32-bit integers.
+ KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
+ "sh->buffer_index:%d\n",
+ gtid, my_buffer_index, sh->buffer_index));
+ }
}
__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
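Moving the wait here makes the buffer-ring discipline explicit: loop instance k of a thread uses slot k % __kmp_dispatch_num_buffers and must block until that slot's generation counter catches up with k. An assumption-level model in plain C (not the kmp types):

/* the generation counter advances by num_buffers each time a loop fully
   completes, so equality means the slot has been recycled up to instance k */
static void wait_for_buffer(volatile unsigned *slot_generation, unsigned k) {
  while (*slot_generation != k) { /* KMP_CPU_PAUSE() in the real runtime */ }
}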
@@ -890,24 +932,6 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
}
- }
-
- if (active) {
- /* The name of this buffer should be my_buffer_index when it's free to use
- * it */
-
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
- "sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
- __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
- __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
- // Note: KMP_WAIT() cannot be used there: buffer index and
- // my_buffer_index are *always* 32-bit integers.
- KMP_MB(); /* is this necessary? */
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
- "sh->buffer_index:%d\n",
- gtid, my_buffer_index, sh->buffer_index));
-
th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
th->th.th_dispatch->th_dispatch_sh_current =
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
@@ -917,7 +941,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
// Report loop metadata
if (itt_need_metadata_reporting) {
- // Only report metadata by master of active team at level 1
+ // Only report metadata by primary thread of active team at level 1
kmp_uint64 schedtype = 0;
switch (schedule) {
case kmp_sch_static_chunked:
@@ -971,21 +995,6 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
__kmp_str_free(&buff);
}
#endif
-#if (KMP_STATIC_STEAL_ENABLED)
- // It cannot be guaranteed that after execution of a loop with some other
- // schedule kind all the parm3 variables will contain the same value. Even if
- // all parm3 will be the same, it still exists a bad case like using 0 and 1
- // rather than program life-time increment. So the dedicated variable is
- // required. The 'static_steal_counter' is used.
- if (pr->schedule == kmp_sch_static_steal) {
- // Other threads will inspect this variable when searching for a victim.
- // This is a flag showing that other threads may steal from this thread
- // since then.
- volatile T *p = &pr->u.p.static_steal_counter;
- *p = *p + 1;
- }
-#endif // ( KMP_STATIC_STEAL_ENABLED )
-
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_work) {
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
@@ -1075,7 +1084,6 @@ static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
if (!th->th.th_team->t.t_serialized) {
- // int cid;
dispatch_private_info_template<UT> *pr =
reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
@@ -1087,7 +1095,6 @@ static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
- // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
UT lower = pr->u.p.ordered_lower;
UT upper = pr->u.p.ordered_upper;
UT inc = upper - lower + 1;
@@ -1193,10 +1200,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
}
switch (pr->schedule) {
-#if (KMP_STATIC_STEAL_ENABLED)
+#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
T chunk = pr->u.p.parm1;
-
+ UT nchunks = pr->u.p.parm2;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
gtid));
@@ -1204,11 +1211,12 @@ int __kmp_dispatch_next_algorithm(int gtid,
trip = pr->u.p.tc - 1;
if (traits_t<T>::type_size > 4) {
- // use lock for 8-byte and CAS for 4-byte induction
- // variable. TODO (optional): check and use 16-byte CAS
- kmp_lock_t *lck = pr->u.p.th_steal_lock;
+ // use lock for 8-byte induction variable.
+ // TODO (optional): check presence and use 16-byte CAS
+ kmp_lock_t *lck = pr->u.p.steal_lock;
KMP_DEBUG_ASSERT(lck != NULL);
if (pr->u.p.count < (UT)pr->u.p.ub) {
+ KMP_DEBUG_ASSERT(pr->steal_flag == READY);
__kmp_acquire_lock(lck, gtid);
// try to get own chunk of iterations
init = (pr->u.p.count)++;
@@ -1218,76 +1226,122 @@ int __kmp_dispatch_next_algorithm(int gtid,
status = 0; // no own chunks
}
if (!status) { // try to steal
- kmp_info_t **other_threads = team->t.t_threads;
+ kmp_lock_t *lckv; // victim buffer's lock
T while_limit = pr->u.p.parm3;
T while_index = 0;
- T id = pr->u.p.static_steal_counter; // loop id
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers; // current loop index
// note: victim thread can potentially execute another loop
- // TODO: algorithm of searching for a victim
- // should be cleaned up and measured
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
while ((!status) && (while_limit != ++while_index)) {
- dispatch_private_info_template<T> *victim;
+ dispatch_private_info_template<T> *v;
T remaining;
- T victimIdx = pr->u.p.parm4;
- T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(victim);
- while ((victim == pr || id != victim->u.p.static_steal_counter) &&
- oldVictimIdx != victimIdx) {
- victimIdx = (victimIdx + 1) % nproc;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(victim);
+ T victimId = pr->u.p.parm4;
+ T oldVictimId = victimId ? victimId - 1 : nproc - 1;
+ v = reinterpret_cast<dispatch_private_info_template<T> *>(
+ &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
+ KMP_DEBUG_ASSERT(v);
+ while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
+ oldVictimId != victimId) {
+ victimId = (victimId + 1) % nproc;
+ v = reinterpret_cast<dispatch_private_info_template<T> *>(
+ &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
+ KMP_DEBUG_ASSERT(v);
}
- if (victim == pr || id != victim->u.p.static_steal_counter) {
+ if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue; // try once more (nproc attempts in total)
- // no victim is ready yet to participate in stealing
- // because no victim passed kmp_init_dispatch yet
}
- if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
- pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
- continue; // not enough chunks to steal, goto next victim
+ if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
+ kmp_uint32 old = UNUSED;
+ // try to steal whole range from inactive victim
+ status = v->steal_flag.compare_exchange_strong(old, THIEF);
+ if (status) {
+ // initialize self buffer with victim's whole range of chunks
+ T id = victimId;
+ T small_chunk, extras;
+ small_chunk = nchunks / nproc; // chunks per thread
+ extras = nchunks % nproc;
+ init = id * small_chunk + (id < extras ? id : extras);
+ __kmp_acquire_lock(lck, gtid);
+ pr->u.p.count = init + 1; // exclude the chunk we execute immediately
+ pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ __kmp_release_lock(lck, gtid);
+ pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
+ // no need to reinitialize other thread invariants: lb, st, etc.
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format(
+ "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
+ KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
+ __kmp_str_free(&buff);
+ }
+#endif
+ // activate non-empty buffer and let others steal from us
+ if (pr->u.p.count < (UT)pr->u.p.ub)
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
+ break;
+ }
}
-
- lck = victim->u.p.th_steal_lock;
- KMP_ASSERT(lck != NULL);
- __kmp_acquire_lock(lck, gtid);
- limit = victim->u.p.ub; // keep initial ub
- if (victim->u.p.count >= limit ||
- (remaining = limit - victim->u.p.count) < 2) {
- __kmp_release_lock(lck, gtid);
- pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
- continue; // not enough chunks to steal
+ if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
+ v->u.p.count >= (UT)v->u.p.ub) {
+ pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
+ continue; // no chunks to steal, try next victim
}
- // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
- // by 1
- if (remaining > 3) {
+ lckv = v->u.p.steal_lock;
+ KMP_ASSERT(lckv != NULL);
+ __kmp_acquire_lock(lckv, gtid);
+ limit = v->u.p.ub; // keep initial ub
+ if (v->u.p.count >= limit) {
+ __kmp_release_lock(lckv, gtid);
+ pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
+ continue; // no chunks to steal, try next victim
+ }
+
+ // stealing succeeded: reduce victim's ub by 1/4 of remaining undone chunks
+ // TODO: is this heuristic good enough?
+ remaining = limit - v->u.p.count;
+ if (remaining > 7) {
// steal 1/4 of remaining
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
- init = (victim->u.p.ub -= (remaining >> 2));
+ init = (v->u.p.ub -= (remaining >> 2));
} else {
- // steal 1 chunk of 2 or 3 remaining
+ // steal 1 chunk of 1..7 remaining
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
- init = (victim->u.p.ub -= 1);
+ init = (v->u.p.ub -= 1);
}
- __kmp_release_lock(lck, gtid);
-
+ __kmp_release_lock(lckv, gtid);
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format(
+ "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<UT>::spec);
+ KD_TRACE(10, (buff, gtid, victimId, init, limit));
+ __kmp_str_free(&buff);
+ }
+#endif
KMP_DEBUG_ASSERT(init + 1 <= limit);
- pr->u.p.parm4 = victimIdx; // remember victim to steal from
+ pr->u.p.parm4 = victimId; // remember victim to steal from
status = 1;
- while_index = 0;
- // now update own count and ub with stolen range but init chunk
- __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
+ // now update own count and ub with stolen range excluding init chunk
+ __kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1;
pr->u.p.ub = limit;
- __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
+ __kmp_release_lock(lck, gtid);
+ // activate non-empty buffer and let others steal from us
+ if (init + 1 < limit)
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
} // while (search for victim)
} // if (try to find victim and steal)
} else {
// 4-byte induction variable, use 8-byte CAS for pair (count, ub)
+ // as all operations on pair (count, ub) must be done atomically
typedef union {
struct {
UT count;
@@ -1295,86 +1349,129 @@ int __kmp_dispatch_next_algorithm(int gtid,
} p;
kmp_int64 b;
} union_i4;
- // All operations on 'count' or 'ub' must be combined atomically
- // together.
- {
- union_i4 vold, vnew;
+ union_i4 vold, vnew;
+ if (pr->u.p.count < (UT)pr->u.p.ub) {
+ KMP_DEBUG_ASSERT(pr->steal_flag == READY);
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew = vold;
- vnew.p.count++;
- while (!KMP_COMPARE_AND_STORE_ACQ64(
+ vnew.b = vold.b;
+ vnew.p.count++; // get chunk from head of self range
+ while (!KMP_COMPARE_AND_STORE_REL64(
(volatile kmp_int64 *)&pr->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
KMP_CPU_PAUSE();
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew = vold;
+ vnew.b = vold.b;
vnew.p.count++;
}
- vnew = vold;
- init = vnew.p.count;
- status = (init < (UT)vnew.p.ub);
+ init = vold.p.count;
+ status = (init < (UT)vold.p.ub);
+ } else {
+ status = 0; // no own chunks
}
-
- if (!status) {
- kmp_info_t **other_threads = team->t.t_threads;
+ if (!status) { // try to steal
T while_limit = pr->u.p.parm3;
T while_index = 0;
- T id = pr->u.p.static_steal_counter; // loop id
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers; // current loop index
// note: victim thread can potentially execute another loop
- // TODO: algorithm of searching for a victim
- // should be cleaned up and measured
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
while ((!status) && (while_limit != ++while_index)) {
- dispatch_private_info_template<T> *victim;
- union_i4 vold, vnew;
+ dispatch_private_info_template<T> *v;
T remaining;
- T victimIdx = pr->u.p.parm4;
- T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(victim);
- while ((victim == pr || id != victim->u.p.static_steal_counter) &&
- oldVictimIdx != victimIdx) {
- victimIdx = (victimIdx + 1) % nproc;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
- KMP_DEBUG_ASSERT(victim);
+ T victimId = pr->u.p.parm4;
+ T oldVictimId = victimId ? victimId - 1 : nproc - 1;
+ v = reinterpret_cast<dispatch_private_info_template<T> *>(
+ &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
+ KMP_DEBUG_ASSERT(v);
+ while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
+ oldVictimId != victimId) {
+ victimId = (victimId + 1) % nproc;
+ v = reinterpret_cast<dispatch_private_info_template<T> *>(
+ &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
+ KMP_DEBUG_ASSERT(v);
}
- if (victim == pr || id != victim->u.p.static_steal_counter) {
+ if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue; // try once more (nproc attempts in total)
- // no victim is ready yet to participate in stealing
- // because no victim passed kmp_init_dispatch yet
}
- pr->u.p.parm4 = victimIdx; // new victim found
- while (1) { // CAS loop if victim has enough chunks to steal
- vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
- vnew = vold;
-
- KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
- if (vnew.p.count >= (UT)vnew.p.ub ||
- (remaining = vnew.p.ub - vnew.p.count) < 2) {
- pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
- break; // not enough chunks to steal, goto next victim
+ if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
+ kmp_uint32 old = UNUSED;
+ // try to steal whole range from inactive victim
+ status = v->steal_flag.compare_exchange_strong(old, THIEF);
+ if (status) {
+ // initialize self buffer with victim's whole range of chunks
+ T id = victimId;
+ T small_chunk, extras;
+ small_chunk = nchunks / nproc; // chunks per thread
+ extras = nchunks % nproc;
+ init = id * small_chunk + (id < extras ? id : extras);
+ vnew.p.count = init + 1;
+ vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ // write pair (count, ub) at once atomically
+#if KMP_ARCH_X86
+ KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
+#else
+ *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
+#endif
+ pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
+ // no need to initialize other thread invariants: lb, st, etc.
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format(
+ "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
+ KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
+ __kmp_str_free(&buff);
+ }
+#endif
+ // activate non-empty buffer and let others steal from us
+ if (pr->u.p.count < (UT)pr->u.p.ub)
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
+ break;
}
- if (remaining > 3) {
- // try to steal 1/4 of remaining
- vnew.p.ub -= remaining >> 2;
+ }
+ while (1) { // CAS loop with check if victim still has enough chunks
+ // many threads may be stealing concurrently from same victim
+ vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
+ if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
+ vold.p.count >= (UT)vold.p.ub) {
+ pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
+ break; // no chunks to steal, try next victim
+ }
+ vnew.b = vold.b;
+ remaining = vold.p.ub - vold.p.count;
+ // try to steal 1/4 of remaining
+ // TODO: is this heuristic good enough?
+ if (remaining > 7) {
+ vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
} else {
- vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
+ vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
}
- KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
- // TODO: Should this be acquire or release?
- if (KMP_COMPARE_AND_STORE_ACQ64(
- (volatile kmp_int64 *)&victim->u.p.count,
+ KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
+ if (KMP_COMPARE_AND_STORE_REL64(
+ (volatile kmp_int64 *)&v->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
- // stealing succeeded
+ // stealing succeeded
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format(
+ "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<T>::spec, traits_t<T>::spec);
+ KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
+ __kmp_str_free(&buff);
+ }
+#endif
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
vold.p.ub - vnew.p.ub);
status = 1;
- while_index = 0;
+ pr->u.p.parm4 = victimId; // keep victim id
// now update own count and ub
init = vnew.p.ub;
vold.p.count = init + 1;
@@ -1383,6 +1480,9 @@ int __kmp_dispatch_next_algorithm(int gtid,
#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
+ // activate non-empty buffer and let others steal from us
+ if (vold.p.count < (UT)vold.p.ub)
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
} // if (check CAS result)
KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
@@ -1396,13 +1496,16 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (p_st != NULL)
*p_st = 0;
} else {
- start = pr->u.p.parm2;
+ start = pr->u.p.lb;
init *= chunk;
limit = chunk + init - 1;
incr = pr->u.p.st;
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
KMP_DEBUG_ASSERT(init <= trip);
+ // keep track of done chunks for possible early exit from stealing
+ // TODO: count executed chunks locally with rare update of shared location
+ // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
@@ -1415,15 +1518,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
-
- if (pr->flags.ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
- } // if
} // if
break;
} // case
-#endif // ( KMP_STATIC_STEAL_ENABLED )
+#endif // KMP_STATIC_STEAL_ENABLED
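For the 4-byte induction path above, (count, ub) are packed into one 64-bit word so both halves change atomically under a single CAS. A self-contained model of the pattern, with std::atomic standing in for the KMP primitives:

#include <atomic>
#include <cstdint>

union pair64 { // mirrors union_i4: two 32-bit fields viewed as one word
  struct { uint32_t count, ub; } p;
  uint64_t b;
};

// pop one chunk from the head of our own range; false means range empty
static bool take_own_chunk(std::atomic<uint64_t> &word, uint32_t &chunk_idx) {
  pair64 vold, vnew;
  vold.b = word.load();
  do {
    if (vold.p.count >= vold.p.ub)
      return false; // nothing left, caller falls through to stealing
    vnew.b = vold.b;
    vnew.p.count++;
  } while (!word.compare_exchange_weak(vold.b, vnew.b));
  chunk_idx = vold.p.count; // the chunk we just claimed
  return true;
}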
case kmp_sch_static_balanced: {
KD_TRACE(
10,
@@ -1485,28 +1583,32 @@ int __kmp_dispatch_next_algorithm(int gtid,
break;
case kmp_sch_dynamic_chunked: {
- T chunk = pr->u.p.parm1;
+ UT chunk_number;
+ UT chunk_size = pr->u.p.parm1;
+ UT nchunks = pr->u.p.parm2;
KD_TRACE(
100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
gtid));
- init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
- trip = pr->u.p.tc - 1;
-
- if ((status = (init <= trip)) == 0) {
+ chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+ status = (chunk_number < nchunks);
+ if (!status) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
+ init = chunk_size * chunk_number;
+ trip = pr->u.p.tc - 1;
start = pr->u.p.lb;
- limit = chunk + init - 1;
incr = pr->u.p.st;
- if ((last = (limit >= trip)) != 0)
+ if ((last = (trip - init < (UT)chunk_size)))
limit = trip;
+ else
+ limit = chunk_size + init - 1;
if (p_st != NULL)
*p_st = incr;
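The reordering above is what prevents overflow: the chunk number is validated against the precomputed chunk count before any multiplication, so chunk_size * chunk_number is only evaluated when it is provably below the trip count. A sketch with plain unsigned types, stride and lower bound omitted:

static int next_chunk(unsigned chunk_number, unsigned nchunks,
                      unsigned chunk_size, unsigned tc,
                      unsigned *lb, unsigned *ub) {
  if (chunk_number >= nchunks)
    return 0; // reject before multiplying: no overflow possible
  unsigned init = chunk_size * chunk_number; // < tc by construction
  unsigned trip = tc - 1;
  *lb = init;
  *ub = (trip - init < chunk_size) ? trip : init + chunk_size - 1;
  return 1;
}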
@@ -1814,7 +1916,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
KMP_HNT(GetNewerLibrary), // Hint
__kmp_msg_null // Variadic argument list terminator
- );
+ );
} break;
} // switch
if (p_last)
@@ -1836,6 +1938,8 @@ int __kmp_dispatch_next_algorithm(int gtid,
"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+ KMP_DEBUG_ASSERT(p_last);
+ KMP_DEBUG_ASSERT(p_st);
KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
__kmp_str_free(&buff);
}
@@ -1902,7 +2006,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
,
void *codeptr
#endif
- ) {
+) {
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
@@ -2062,16 +2166,15 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
th->th.th_info.ds.ds_tid);
// status == 0: no more iterations to execute
if (status == 0) {
- UT num_done;
-
- num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
+ ST num_done;
+ num_done = test_then_inc<ST>(&sh->u.s.num_done);
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
- traits_t<UT>::spec);
+ traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
__kmp_str_free(&buff);
}
@@ -2080,28 +2183,31 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
#if KMP_USE_HIER_SCHED
pr->flags.use_hier = FALSE;
#endif
- if ((ST)num_done == th->th.th_team_nproc - 1) {
-#if (KMP_STATIC_STEAL_ENABLED)
- if (pr->schedule == kmp_sch_static_steal &&
- traits_t<T>::type_size > 4) {
+ if (num_done == th->th.th_team_nproc - 1) {
+#if KMP_STATIC_STEAL_ENABLED
+ if (pr->schedule == kmp_sch_static_steal) {
int i;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers; // current loop index
- kmp_info_t **other_threads = team->t.t_threads;
// loop complete, safe to destroy locks used for stealing
for (i = 0; i < th->th.th_team_nproc; ++i) {
dispatch_private_info_template<T> *buf =
reinterpret_cast<dispatch_private_info_template<T> *>(
- &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
- kmp_lock_t *lck = buf->u.p.th_steal_lock;
- KMP_ASSERT(lck != NULL);
- __kmp_destroy_lock(lck);
- __kmp_free(lck);
- buf->u.p.th_steal_lock = NULL;
+ &team->t.t_dispatch[i].th_disp_buffer[idx]);
+ KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
+ KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
+ if (traits_t<T>::type_size > 4) {
+ // destroy locks used for stealing
+ kmp_lock_t *lck = buf->u.p.steal_lock;
+ KMP_ASSERT(lck != NULL);
+ __kmp_destroy_lock(lck);
+ __kmp_free(lck);
+ buf->u.p.steal_lock = NULL;
+ }
}
}
#endif
- /* NOTE: release this buffer to be reused */
+ /* NOTE: release shared buffer to be reused */
KMP_MB(); /* Flush all pending memory write invalidates. */
@@ -2113,8 +2219,6 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
sh->u.s.ordered_iteration = 0;
}
- KMP_MB(); /* Flush all pending memory write invalidates. */
-
sh->buffer_index += __kmp_dispatch_num_buffers;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
gtid, sh->buffer_index));
@@ -2429,7 +2533,7 @@ int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
,
OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
- );
+ );
}
/*!
@@ -2446,7 +2550,7 @@ int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
,
OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
- );
+ );
}
/*!
@@ -2462,7 +2566,7 @@ int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
,
OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
- );
+ );
}
/*!
@@ -2479,7 +2583,7 @@ int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
,
OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
- );
+ );
}
/*!
@@ -2541,7 +2645,7 @@ kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
void *obj // Higher-level synchronization object, or NULL.
- ) {
+) {
// note: we may not belong to a team at this point
volatile kmp_uint32 *spin = spinner;
kmp_uint32 check = checker;
@@ -2567,7 +2671,7 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
kmp_uint32 (*pred)(void *, kmp_uint32),
void *obj // Higher-level synchronization object, or NULL.
- ) {
+) {
// note: we may not belong to a team at this point
void *spin = spinner;
kmp_uint32 check = checker;
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index 1f98e4b80a79..ae11361ca512 100644
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -74,8 +74,7 @@ template <typename T> struct dispatch_private_infoXX_template {
T lb;
ST st; // signed
UT tc; // unsigned
- T static_steal_counter; // for static_steal only; maybe better to put after ub
- kmp_lock_t *th_steal_lock; // lock used for chunk stealing
+ kmp_lock_t *steal_lock; // lock used for chunk stealing
/* parm[1-4] are used in different ways by different scheduling algorithms */
// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
@@ -134,9 +133,8 @@ template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
} u;
enum sched_type schedule; /* scheduling algorithm */
kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+ std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
kmp_uint32 ordered_bumped;
- // to retain the structure size after making order
- kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
dispatch_private_info *next; /* stack of buffers for nest of serial regions */
kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
@@ -153,10 +151,11 @@ template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
/* chunk index under dynamic, number of idle threads under static-steal;
iteration index otherwise */
volatile UT iteration;
- volatile UT num_done;
+ volatile ST num_done;
volatile UT ordered_iteration;
// to retain the structure size making ordered_iteration scalar
UT ordered_dummy[KMP_MAX_ORDERED - 3];
diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h
index 721c7f678e70..dbea088ffb35 100644
--- a/openmp/runtime/src/kmp_dispatch_hier.h
+++ b/openmp/runtime/src/kmp_dispatch_hier.h
@@ -496,7 +496,7 @@ private:
T hier_id = (T)current->get_hier_id();
// Attempt to grab next iteration range for this level
if (previous_id == 0) {
- KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
+ KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
gtid, hier_level));
kmp_int32 contains_last;
T my_lb, my_ub;
@@ -590,7 +590,7 @@ private:
}
if (p_last)
*p_last = contains_last;
- } // if master thread of this unit
+ } // if primary thread of this unit
if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
KD_TRACE(10,
("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
@@ -740,7 +740,7 @@ public:
gtid));
if (unit_id == 0) {
// For hand threading, the sh buffer on the lowest level is only ever
- // modified and read by the master thread on that level. Because of
+ // modified and read by the primary thread on that level. Because of
// this, we can always use the first sh buffer.
auto sh = &(parent->hier_barrier.sh[0]);
KMP_DEBUG_ASSERT(sh);
@@ -784,7 +784,7 @@ public:
}
}
parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
- } // if master thread of lowest unit level
+ } // if primary thread of lowest unit level
parent->barrier(pr->get_hier_id(), tdata);
if (unit_id != 0) {
*p_lb = parent->get_curr_lb(tdata->index);
@@ -924,7 +924,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
T lb, T ub,
typename traits_t<T>::signed_t st) {
int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
- int my_buffer_index;
+ unsigned int my_buffer_index;
kmp_info_t *th;
kmp_team_t *team;
dispatch_private_info_template<T> *pr;
@@ -975,7 +975,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
KMP_DEBUG_ASSERT(sh);
pr->flags.use_hier = TRUE;
pr->u.p.tc = 0;
- // Have master allocate the hierarchy
+ // Have primary thread allocate the hierarchy
if (__kmp_tid_from_gtid(gtid) == 0) {
KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
"hierarchy\n",
@@ -1071,7 +1071,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
break;
int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
- // Only master threads of this unit within the hierarchy do initialization
+ // Only primary threads of this unit within the hierarchy do initialization
KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
gtid, i));
my_unit->reset_shared_barrier();
diff --git a/openmp/runtime/src/kmp_environment.cpp b/openmp/runtime/src/kmp_environment.cpp
index 19c59be6cf23..b35027b57f03 100644
--- a/openmp/runtime/src/kmp_environment.cpp
+++ b/openmp/runtime/src/kmp_environment.cpp
@@ -236,7 +236,7 @@ void __kmp_env_unset(char const *name) {
static void
___kmp_env_blk_parse_string(kmp_env_blk_t *block, // M: Env block to fill.
char const *env // I: String to parse.
- ) {
+) {
char const chr_delimiter = '|';
char const str_delimiter[] = {chr_delimiter, 0};
@@ -305,7 +305,7 @@ ___kmp_env_blk_parse_string(kmp_env_blk_t *block, // M: Env block to fill.
static void ___kmp_env_blk_parse_windows(
kmp_env_blk_t *block, // M: Env block to fill.
char const *env // I: Pointer to Windows* OS (DOS) environment block.
- ) {
+) {
char *bulk = NULL;
kmp_env_var_t *vars = NULL;
@@ -376,10 +376,11 @@ static void ___kmp_env_blk_parse_windows(
{ "HOME=/home/lev", "TERM=xterm", NULL }
*/
+#if KMP_OS_UNIX
static void
___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill.
char **env // I: Unix environment to parse.
- ) {
+) {
char *bulk = NULL;
kmp_env_var_t *vars = NULL;
int count = 0;
@@ -423,10 +424,11 @@ ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill.
block->vars = vars;
block->count = count;
}
+#endif
void __kmp_env_blk_init(kmp_env_blk_t *block, // M: Block to initialize.
char const *bulk // I: Initialization string, or NULL.
- ) {
+) {
if (bulk != NULL) {
___kmp_env_blk_parse_string(block, bulk);
@@ -458,7 +460,7 @@ static int ___kmp_env_var_cmp( // Comparison function for qsort().
void __kmp_env_blk_sort(
kmp_env_blk_t *block // M: Block of environment variables to sort.
- ) {
+) {
qsort(CCAST(kmp_env_var_t *, block->vars), block->count,
sizeof(kmp_env_var_t),
@@ -468,7 +470,7 @@ void __kmp_env_blk_sort(
void __kmp_env_blk_free(
kmp_env_blk_t *block // M: Block of environment variables to free.
- ) {
+) {
KMP_INTERNAL_FREE(CCAST(kmp_env_var_t *, block->vars));
__kmp_str_free(&(block->bulk));
@@ -479,10 +481,9 @@ void __kmp_env_blk_free(
} // __kmp_env_blk_free
char const * // R: Value of variable or NULL if variable does not exist.
- __kmp_env_blk_var(
- kmp_env_blk_t *block, // I: Block of environment variables.
- char const *name // I: Name of variable to find.
- ) {
+__kmp_env_blk_var(kmp_env_blk_t *block, // I: Block of environment variables.
+ char const *name // I: Name of variable to find.
+) {
int i;
for (i = 0; i < block->count; ++i) {
diff --git a/openmp/runtime/src/kmp_error.cpp b/openmp/runtime/src/kmp_error.cpp
index 7fc0ce17a05c..cf5749dfd9fb 100644
--- a/openmp/runtime/src/kmp_error.cpp
+++ b/openmp/runtime/src/kmp_error.cpp
@@ -20,17 +20,23 @@
#define MIN_STACK 100
static char const *cons_text_c[] = {
- "(none)", "\"parallel\"", "work-sharing", /* this is not called "for"
- because of lowering of
- "sections" pragmas */
+ "(none)",
+ "\"parallel\"",
+ "work-sharing", /* this is not called "for"
+ because of lowering of
+ "sections" pragmas */
"\"ordered\" work-sharing", /* this is not called "for ordered" because of
lowering of "sections" pragmas */
"\"sections\"",
"work-sharing", /* this is not called "single" because of lowering of
"sections" pragmas */
- "\"critical\"", "\"ordered\"", /* in PARALLEL */
+ "\"critical\"",
+ "\"ordered\"", /* in PARALLEL */
"\"ordered\"", /* in PDO */
- "\"master\"", "\"reduce\"", "\"barrier\""};
+ "\"master\"",
+ "\"reduce\"",
+ "\"barrier\"",
+ "\"masked\""};
#define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource)
@@ -106,7 +112,7 @@ static char *__kmp_pragma(int ct, ident_t const *ident) {
void __kmp_error_construct(kmp_i18n_id_t id, // Message identifier.
enum cons_type ct, // Construct type.
ident_t const *ident // Construct ident.
- ) {
+) {
char *construct = __kmp_pragma(ct, ident);
__kmp_fatal(__kmp_msg_format(id, construct), __kmp_msg_null);
KMP_INTERNAL_FREE(construct);
@@ -116,7 +122,7 @@ void __kmp_error_construct2(kmp_i18n_id_t id, // Message identifier.
enum cons_type ct, // First construct type.
ident_t const *ident, // First construct ident.
struct cons_data const *cons // Second construct.
- ) {
+) {
char *construct1 = __kmp_pragma(ct, ident);
char *construct2 = __kmp_pragma(cons->type, cons->ident);
__kmp_fatal(__kmp_msg_format(id, construct1, construct2), __kmp_msg_null);
@@ -311,7 +317,7 @@ __kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_l
/* we are in CRITICAL which is inside a CRITICAL construct of same name */
__kmp_error_construct2(kmp_i18n_msg_CnsNestingSameName, ct, ident, &cons);
}
- } else if (ct == ct_master || ct == ct_reduce) {
+ } else if (ct == ct_master || ct == ct_masked || ct == ct_reduce) {
if (p->w_top > p->p_top) {
/* inside a WORKSHARING construct for this PARALLEL region */
__kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index 1eb1a0dc9813..0786ed3c119a 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -58,6 +58,16 @@ extern "C" {
#define KMP_DEREF *
#endif
+// For APIs with distinct C and Fortran interfaces (an ompc_* version exists
+// in kmp_csupport.cpp), create GOMP versioned symbols only for the APPEND
+// Fortran entries in this file. The GOMP versioned symbols of the C API are
+// created where the ompc_* functions are defined.
+#if KMP_FTN_ENTRIES == KMP_FTN_APPEND
+#define KMP_EXPAND_NAME_IF_APPEND(name) KMP_EXPAND_NAME(name)
+#else
+#define KMP_EXPAND_NAME_IF_APPEND(name) name
+#endif
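
A note on the two expansion paths: KMP_EXPAND_NAME_IF_APPEND lets one
definition serve both build flavors. Under KMP_FTN_APPEND the entry goes
through KMP_EXPAND_NAME and picks up the versioned-symbol alias; otherwise
the plain name is kept and GOMP versioning happens alongside the ompc_*
definitions. Assuming the usual __kmp_api_ aliasing performed by
KMP_EXPAND_NAME (the real macros live in this file and kmp_ftn_os.h), the
effect is roughly:

    /* Illustrative only, not the literal expansion:
       KMP_FTN_ENTRIES == KMP_FTN_APPEND (Fortran "name_" entries):
         KMP_EXPAND_NAME_IF_APPEND(omp_set_affinity_format_)
           -> __kmp_api_omp_set_affinity_format_   (versioned below)
       any other flavor (e.g. KMP_FTN_PLAIN, covered by ompc_*):
         KMP_EXPAND_NAME_IF_APPEND(omp_set_affinity_format)
           -> omp_set_affinity_format              (plain symbol) */
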
+
void FTN_STDCALL FTN_SET_STACKSIZE(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_stacksize(KMP_DEREF arg);
@@ -118,12 +128,10 @@ int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
return __kmps_get_blocktime();
#else
int gtid, tid;
- kmp_info_t *thread;
kmp_team_p *team;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
- thread = __kmp_thread_from_gtid(gtid);
team = __kmp_threads[gtid]->th.th_team;
/* These must match the settings used in __kmp_wait_sleep() */
@@ -202,8 +210,11 @@ void FTN_STDCALL FTN_SET_DISP_NUM_BUFFERS(int KMP_DEREF arg) {
#else
// ignore after initialization because some teams have already
// allocated dispatch buffers
- if (__kmp_init_serial == 0 && (KMP_DEREF arg) > 0)
- __kmp_dispatch_num_buffers = KMP_DEREF arg;
+ int num_buffers = KMP_DEREF arg;
+ if (__kmp_init_serial == FALSE && num_buffers >= KMP_MIN_DISP_NUM_BUFF &&
+ num_buffers <= KMP_MAX_DISP_NUM_BUFF) {
+ __kmp_dispatch_num_buffers = num_buffers;
+ }
#endif
}
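
The setter now validates the request instead of accepting any positive
value: it only takes effect before serial initialization and only within the
documented buffer bounds. A caller sketch, assuming the
kmp_set_disp_num_buffers C entry declared in omp.h:

    #include <omp.h>

    /* Sketch: the request must precede serial initialization and must fall
       within [KMP_MIN_DISP_NUM_BUFF, KMP_MAX_DISP_NUM_BUFF]; otherwise it
       is silently ignored. */
    int main(void) {
      kmp_set_disp_num_buffers(16); /* before the first parallel region */
    #pragma omp parallel
      { /* dispatch buffers are allocated per team from here on */ }
      return 0;
    }
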
@@ -214,6 +225,7 @@ int FTN_STDCALL FTN_SET_AFFINITY(void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_set_affinity(mask);
#endif
}
@@ -225,6 +237,7 @@ int FTN_STDCALL FTN_GET_AFFINITY(void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_get_affinity(mask);
#endif
}
@@ -237,6 +250,7 @@ int FTN_STDCALL FTN_GET_AFFINITY_MAX_PROC(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_get_affinity_max_proc();
#endif
}
@@ -250,6 +264,7 @@ void FTN_STDCALL FTN_CREATE_AFFINITY_MASK(void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
mask_internals = __kmp_affinity_dispatch->allocate_mask();
KMP_CPU_ZERO(mask_internals);
*mask = mask_internals;
@@ -265,6 +280,7 @@ void FTN_STDCALL FTN_DESTROY_AFFINITY_MASK(void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (__kmp_env_consistency_check) {
if (*mask == NULL) {
KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask");
@@ -283,6 +299,7 @@ int FTN_STDCALL FTN_SET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
@@ -294,6 +311,7 @@ int FTN_STDCALL FTN_UNSET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
@@ -305,6 +323,7 @@ int FTN_STDCALL FTN_GET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask);
#endif
}
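
Each of these entries now calls __kmp_assign_root_init_mask() first, so the
root thread's initial affinity mask is settled before user code reads or
edits it. For reference, a typical sequence through the kmp_* mask API these
Fortran wrappers mirror (sketch):

    #include <omp.h>

    /* Sketch: pin the calling thread to OS proc 0 via the kmp_* mask API. */
    void pin_to_proc0(void) {
      kmp_affinity_mask_t mask;
      kmp_create_affinity_mask(&mask);
      kmp_set_affinity_mask_proc(0, &mask);
      if (kmp_set_affinity(&mask) != 0) {
        /* runtime is not affinity-capable, or proc 0 is unavailable */
      }
      kmp_destroy_affinity_mask(&mask);
    }
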
@@ -339,6 +358,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
// return thread -> th.th_team -> t.t_current_task[
@@ -432,8 +452,9 @@ public:
/*
* Set the value of the affinity-format-var ICV on the current device to the
* format specified in the argument.
-*/
-void FTN_STDCALL FTN_SET_AFFINITY_FORMAT(char const *format, size_t size) {
+ */
+void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_SET_AFFINITY_FORMAT)(
+ char const *format, size_t size) {
#ifdef KMP_STUB
return;
#else
@@ -453,8 +474,9 @@ void FTN_STDCALL FTN_SET_AFFINITY_FORMAT(char const *format, size_t size) {
* specification (not including null byte character) and writes the value of the
* affinity-format-var ICV on the current device to buffer. If the return value
* is larger than size, the affinity format specification is truncated.
-*/
-size_t FTN_STDCALL FTN_GET_AFFINITY_FORMAT(char *buffer, size_t size) {
+ */
+size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_GET_AFFINITY_FORMAT)(
+ char *buffer, size_t size) {
#ifdef KMP_STUB
return 0;
#else
@@ -475,8 +497,9 @@ size_t FTN_STDCALL FTN_GET_AFFINITY_FORMAT(char *buffer, size_t size) {
* Prints the thread affinity information of the current thread in the format
* specified by the format argument. If the format is NULL or a zero-length
* string, the value of the affinity-format-var ICV is used.
-*/
-void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
+ */
+void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_DISPLAY_AFFINITY)(
+ char const *format, size_t size) {
#ifdef KMP_STUB
return;
#else
@@ -484,6 +507,7 @@ void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
ConvertedString cformat(format, size);
__kmp_aux_display_affinity(gtid, cformat.get());
@@ -499,9 +523,9 @@ void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
* used. The buffer must be allocated prior to calling the routine. If the
* return value is larger than size, the affinity format specification is
* truncated.
-*/
-size_t FTN_STDCALL FTN_CAPTURE_AFFINITY(char *buffer, char const *format,
- size_t buf_size, size_t for_size) {
+ */
+size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_CAPTURE_AFFINITY)(
+ char *buffer, char const *format, size_t buf_size, size_t for_size) {
#if defined(KMP_STUB)
return 0;
#else
@@ -511,6 +535,7 @@ size_t FTN_STDCALL FTN_CAPTURE_AFFINITY(char *buffer, char const *format,
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
gtid = __kmp_get_gtid();
__kmp_str_buf_init(&capture_buf);
ConvertedString cformat(format, for_size);
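
The four wrappers above expose the OpenMP 5.0 affinity-format API under
versioned names when building APPEND Fortran entries. A usage sketch; %P,
%n, and %A are the standard format fields for process id, thread number, and
affinity set:

    #include <omp.h>

    /* Sketch of the affinity-format API behind these wrappers. */
    void show_binding(void) {
      omp_set_affinity_format("pid %P thread %n bound to {%A}");
    #pragma omp parallel
      {
        char buf[256];
        omp_display_affinity(NULL); /* NULL => use affinity-format-var */
        omp_capture_affinity(buf, sizeof(buf), NULL); /* same text into buf */
      }
    }
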
@@ -531,7 +556,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
int gtid;
#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD|| KMP_OS_OPENBSD
+ KMP_OS_HURD || KMP_OS_OPENBSD
gtid = __kmp_entry_gtid();
#elif KMP_OS_WINDOWS
if (!__kmp_init_parallel ||
@@ -587,18 +612,19 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
return __kmp_avail_proc;
#endif
}
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NESTED)(int KMP_DEREF flag) {
- KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels");
#ifdef KMP_STUB
__kmps_set_nested(KMP_DEREF flag);
#else
kmp_info_t *thread;
/* For the thread-private internal controls implementation */
thread = __kmp_entry_thread();
+ KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels");
__kmp_save_internal_controls(thread);
// Somewhat arbitrarily decide where to get a value for max_active_levels
int max_active_levels = get__max_active_levels(thread);
@@ -609,12 +635,12 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NESTED)(int KMP_DEREF flag) {
}
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NESTED)(void) {
- KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels");
#ifdef KMP_STUB
return __kmps_get_nested();
#else
kmp_info_t *thread;
thread = __kmp_entry_thread();
+ KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels");
return get__max_active_levels(thread) > 1;
#endif
}
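
Moving KMP_INFORM after __kmp_entry_thread() guarantees the runtime (and its
message catalog) is initialized before the deprecation warning is issued.
Code migrating off the deprecated pair would look roughly like this (sketch
using the documented replacements):

    #include <omp.h>

    void enable_nesting(void) {
      /* omp_set_nested(1)  -> allow more than one active level */
      omp_set_max_active_levels(omp_get_supported_active_levels());
      /* omp_get_nested()   -> nesting is on iff >1 active level allowed */
      int nested = omp_get_max_active_levels() > 1;
      (void)nested;
    }
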
@@ -692,6 +718,9 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_ACTIVE_LEVELS)(void) {
return 0;
#else
/* TO DO: We want per-task implementation of this internal control */
+ if (!TCR_4(__kmp_init_middle)) {
+ __kmp_middle_initialize();
+ }
return __kmp_get_max_active_levels(__kmp_entry_gtid());
#endif
}
@@ -715,7 +744,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_LEVEL)(void) {
}
int FTN_STDCALL
- KMP_EXPAND_NAME(FTN_GET_ANCESTOR_THREAD_NUM)(int KMP_DEREF level) {
+KMP_EXPAND_NAME(FTN_GET_ANCESTOR_THREAD_NUM)(int KMP_DEREF level) {
#ifdef KMP_STUB
return (KMP_DEREF level) ? (-1) : (0);
#else
@@ -773,6 +802,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return 0;
return __kmp_affinity_num_masks;
@@ -788,6 +818,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return 0;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
@@ -813,6 +844,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
@@ -838,6 +870,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return -1;
gtid = __kmp_entry_gtid();
@@ -857,6 +890,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return 0;
gtid = __kmp_entry_gtid();
@@ -873,8 +907,8 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
#endif
}
-void
- FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
+void FTN_STDCALL
+KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
// Nothing.
#else
@@ -883,6 +917,7 @@ void
if (!TCR_4(__kmp_init_middle)) {
__kmp_middle_initialize();
}
+ __kmp_assign_root_init_mask();
if (!KMP_AFFINITY_CAPABLE())
return;
gtid = __kmp_entry_gtid();
@@ -939,15 +974,18 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) {
// Get number of NON-HOST devices.
// libomptarget, if loaded, provides this function in api.cpp.
-int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void)
+ KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) {
-#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
return 0;
#else
int (*fptr)();
- if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "_Offload_number_of_devices"))) {
+ if ((*(void **)(&fptr) = KMP_DLSYM("__tgt_get_num_devices"))) {
+ return (*fptr)();
+ } else if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_devices"))) {
return (*fptr)();
- } else if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_devices"))) {
+ } else if ((*(void **)(&fptr) = KMP_DLSYM("_Offload_number_of_devices"))) {
return (*fptr)();
} else { // liboffload & libomptarget don't exist
return 0;
@@ -957,26 +995,18 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) {
// This function always returns true when called on host device.
// Compiler/libomptarget should handle when it is called inside target region.
-int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void)
+ KMP_WEAK_ATTRIBUTE_EXTERNAL;
int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) {
return 1; // This is the host
}
// libomptarget, if loaded, provides this function
-int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
-int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) {
-#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)(void)
+ KMP_WEAK_ATTRIBUTE_EXTERNAL;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)(void) {
// same as omp_get_num_devices()
- return 0;
-#else
- int (*fptr)();
- if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_initial_device"))) {
- return (*fptr)();
- } else { // liboffload & libomptarget don't exist
- // same as omp_get_num_devices()
- return 0;
- }
-#endif
+ return KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)();
}
#if defined(KMP_STUB)
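
The raw dlsym calls are replaced by KMP_DLSYM/KMP_DLSYM_NEXT so the lookups
can be stubbed on platforms without RTLD_NEXT, which is why the
KMP_OS_WINDOWS exclusion above could be dropped. The lookup order now
prefers libomptarget's __tgt_get_num_devices, then a next-object
omp_get_num_devices, then liboffload's _Offload_number_of_devices. On POSIX
the wrappers presumably reduce to something like the following (assumed; the
real definitions live in kmp_os.h):

    #include <dlfcn.h>
    /* Assumed POSIX shape of the wrappers; Windows differs. */
    #define KMP_DLSYM(name) dlsym(RTLD_DEFAULT, name)
    #define KMP_DLSYM_NEXT(name) dlsym(RTLD_NEXT, name)
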
@@ -1273,7 +1303,7 @@ void FTN_STDCALL FTN_SET_DEFAULTS(char const *str
,
int len
#endif
- ) {
+) {
#ifndef KMP_STUB
#ifdef PASS_ARGS_BY_VALUE
int len = (int)KMP_STRLEN(str);
@@ -1321,38 +1351,38 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
// Compiler/libomptarget will handle this if called inside target.
int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
-int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return FTN_GET_INITIAL_DEVICE(); }
+int FTN_STDCALL FTN_GET_DEVICE_NUM(void) {
+ return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)();
+}
// Compiler will ensure that this is only called from host in sequential region
-int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) {
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE)(kmp_pause_status_t kind,
+ int device_num) {
#ifdef KMP_STUB
return 1; // just fail
#else
- if (device_num == FTN_GET_INITIAL_DEVICE())
+ if (device_num == KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)())
return __kmpc_pause_resource(kind);
else {
-#if !KMP_OS_WINDOWS
int (*fptr)(kmp_pause_status_t, int);
- if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
+ if ((*(void **)(&fptr) = KMP_DLSYM("tgt_pause_resource")))
return (*fptr)(kind, device_num);
else
-#endif
return 1; // just fail if there is no libomptarget
}
#endif
}
// Compiler will ensure that this is only called from host in sequential region
-int FTN_STDCALL FTN_PAUSE_RESOURCE_ALL(kmp_pause_status_t kind) {
+int FTN_STDCALL
+ KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE_ALL)(kmp_pause_status_t kind) {
#ifdef KMP_STUB
return 1; // just fail
#else
int fails = 0;
-#if !KMP_OS_WINDOWS
int (*fptr)(kmp_pause_status_t, int);
- if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
+ if ((*(void **)(&fptr) = KMP_DLSYM("tgt_pause_resource")))
fails = (*fptr)(kind, KMP_DEVICE_ALL); // pause devices
-#endif
fails += __kmpc_pause_resource(kind); // pause host
return fails;
#endif
@@ -1373,6 +1403,49 @@ void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) {
#endif
}
+// nteams-var per-device ICV
+void FTN_STDCALL FTN_SET_NUM_TEAMS(int KMP_DEREF num_teams) {
+#ifdef KMP_STUB
+// Nothing.
+#else
+ if (!__kmp_init_serial) {
+ __kmp_serial_initialize();
+ }
+ __kmp_set_num_teams(KMP_DEREF num_teams);
+#endif
+}
+int FTN_STDCALL FTN_GET_MAX_TEAMS(void) {
+#ifdef KMP_STUB
+ return 1;
+#else
+ if (!__kmp_init_serial) {
+ __kmp_serial_initialize();
+ }
+ return __kmp_get_max_teams();
+#endif
+}
+// teams-thread-limit-var per-device ICV
+void FTN_STDCALL FTN_SET_TEAMS_THREAD_LIMIT(int KMP_DEREF limit) {
+#ifdef KMP_STUB
+// Nothing.
+#else
+ if (!__kmp_init_serial) {
+ __kmp_serial_initialize();
+ }
+ __kmp_set_teams_thread_limit(KMP_DEREF limit);
+#endif
+}
+int FTN_STDCALL FTN_GET_TEAMS_THREAD_LIMIT(void) {
+#ifdef KMP_STUB
+ return 1;
+#else
+ if (!__kmp_init_serial) {
+ __kmp_serial_initialize();
+ }
+ return __kmp_get_teams_thread_limit();
+#endif
+}
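
These four entries implement the OpenMP 5.1 nteams-var and
teams-thread-limit-var ICVs, backed by the new __kmp_nteams and
__kmp_teams_thread_limit globals in kmp_global.cpp. Usage sketch:

    #include <omp.h>

    /* Sketch: OpenMP 5.1 teams ICV controls exposed by these entries. */
    void configure_teams(void) {
      omp_set_num_teams(4);          /* nteams-var: default league size */
      omp_set_teams_thread_limit(8); /* teams-thread-limit-var: per-team cap */
    #pragma omp teams
      {
        /* at most omp_get_max_teams() teams, each limited to
           omp_get_teams_thread_limit() threads */
      }
    }
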
+
// display environment variables when requested
void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) {
#ifndef KMP_STUB
@@ -1472,12 +1545,19 @@ KMP_VERSION_SYMBOL(FTN_GET_PLACE_PROC_IDS, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PARTITION_NUM_PLACES, 45, "OMP_4.5");
KMP_VERSION_SYMBOL(FTN_GET_PARTITION_PLACE_NUMS, 45, "OMP_4.5");
-// KMP_VERSION_SYMBOL(FTN_GET_INITIAL_DEVICE, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_INITIAL_DEVICE, 45, "OMP_4.5");
// OMP_5.0 versioned symbols
// KMP_VERSION_SYMBOL(FTN_GET_DEVICE_NUM, 50, "OMP_5.0");
-// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE, 50, "OMP_5.0");
-// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE_ALL, 50, "OMP_5.0");
+KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE, 50, "OMP_5.0");
+KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE_ALL, 50, "OMP_5.0");
+// The C versions (KMP_FTN_PLAIN) of these symbols are in kmp_csupport.cpp
+#if KMP_FTN_ENTRIES == KMP_FTN_APPEND
+KMP_VERSION_SYMBOL(FTN_CAPTURE_AFFINITY, 50, "OMP_5.0");
+KMP_VERSION_SYMBOL(FTN_DISPLAY_AFFINITY, 50, "OMP_5.0");
+KMP_VERSION_SYMBOL(FTN_GET_AFFINITY_FORMAT, 50, "OMP_5.0");
+KMP_VERSION_SYMBOL(FTN_SET_AFFINITY_FORMAT, 50, "OMP_5.0");
+#endif
// KMP_VERSION_SYMBOL(FTN_GET_SUPPORTED_ACTIVE_LEVELS, 50, "OMP_5.0");
// KMP_VERSION_SYMBOL(FTN_FULFILL_EVENT, 50, "OMP_5.0");
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index 39958e2dbc3a..5b9e396e3dd9 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -135,6 +135,10 @@
#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels
#define FTN_DISPLAY_ENV omp_display_env
#define FTN_FULFILL_EVENT omp_fulfill_event
+#define FTN_SET_NUM_TEAMS omp_set_num_teams
+#define FTN_GET_MAX_TEAMS omp_get_max_teams
+#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit
+#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit
#endif /* KMP_FTN_PLAIN */
@@ -259,6 +263,10 @@
#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_
#define FTN_DISPLAY_ENV omp_display_env_
#define FTN_FULFILL_EVENT omp_fulfill_event_
+#define FTN_SET_NUM_TEAMS omp_set_num_teams_
+#define FTN_GET_MAX_TEAMS omp_get_max_teams_
+#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit_
+#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit_
#endif /* KMP_FTN_APPEND */
@@ -381,6 +389,10 @@
#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS
#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV
#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT
+#define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS
+#define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS
+#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT
+#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT
#endif /* KMP_FTN_UPPER */
@@ -505,6 +517,10 @@
#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_
#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_
#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_
+#define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS_
+#define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS_
+#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT_
+#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT_
#endif /* KMP_FTN_UAPPEND */
@@ -681,5 +697,20 @@
GOMP_parallel_loop_maybe_nonmonotonic_runtime
#define KMP_API_NAME_GOMP_TEAMS_REG GOMP_teams_reg
#define KMP_API_NAME_GOMP_TASKWAIT_DEPEND GOMP_taskwait_depend
+#define KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER \
+ GOMP_taskgroup_reduction_register
+#define KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER \
+ GOMP_taskgroup_reduction_unregister
+#define KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP GOMP_task_reduction_remap
+#define KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS GOMP_parallel_reductions
+#define KMP_API_NAME_GOMP_LOOP_START GOMP_loop_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_START GOMP_loop_ull_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_START GOMP_loop_doacross_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START GOMP_loop_ull_doacross_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_START GOMP_loop_ordered_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START GOMP_loop_ull_ordered_start
+#define KMP_API_NAME_GOMP_SECTIONS2_START GOMP_sections2_start
+#define KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER \
+ GOMP_workshare_task_reduction_unregister
#endif /* KMP_FTN_OS_H */
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 4e0035e0e066..24de14fe8c33 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -166,6 +166,7 @@ int __kmp_zero_bt = FALSE;
int __kmp_ncores = 0;
#endif
int __kmp_chunk = 0;
+int __kmp_force_monotonic = 0;
int __kmp_abort_delay = 0;
#if KMP_OS_LINUX && defined(KMP_TDATA_GTID)
int __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */
@@ -208,6 +209,8 @@ const char *__kmp_speculative_statsfile = "-";
int __kmp_display_env = FALSE;
int __kmp_display_env_verbose = FALSE;
int __kmp_omp_cancellation = FALSE;
+int __kmp_nteams = 0;
+int __kmp_teams_thread_limit = 0;
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
int __kmp_user_level_mwait = FALSE;
@@ -244,8 +247,6 @@ KMPAffinity *__kmp_affinity_dispatch = NULL;
#if KMP_USE_HWLOC
int __kmp_hwloc_error = FALSE;
hwloc_topology_t __kmp_hwloc_topology = NULL;
-int __kmp_numa_detected = FALSE;
-int __kmp_tile_depth = 0;
#endif
#if KMP_OS_WINDOWS
@@ -260,7 +261,7 @@ kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL;
size_t __kmp_affin_mask_size = 0;
enum affinity_type __kmp_affinity_type = affinity_default;
-enum affinity_gran __kmp_affinity_gran = affinity_gran_default;
+kmp_hw_t __kmp_affinity_gran = KMP_HW_UNKNOWN;
int __kmp_affinity_gran_levels = -1;
int __kmp_affinity_dups = TRUE;
enum affinity_top_method __kmp_affinity_top_method =
@@ -283,14 +284,6 @@ int __kmp_affinity_num_places = 0;
int __kmp_display_affinity = FALSE;
char *__kmp_affinity_format = NULL;
-kmp_hws_item_t __kmp_hws_socket = {0, 0};
-kmp_hws_item_t __kmp_hws_node = {0, 0};
-kmp_hws_item_t __kmp_hws_tile = {0, 0};
-kmp_hws_item_t __kmp_hws_core = {0, 0};
-kmp_hws_item_t __kmp_hws_proc = {0, 0};
-int __kmp_hws_requested = 0;
-int __kmp_hws_abs_flag = 0; // absolute or per-item number requested
-
kmp_int32 __kmp_default_device = 0;
kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
@@ -315,6 +308,13 @@ omp_allocator_handle_t const omp_pteam_mem_alloc =
(omp_allocator_handle_t const)7;
omp_allocator_handle_t const omp_thread_mem_alloc =
(omp_allocator_handle_t const)8;
+// Preview of target memory support
+omp_allocator_handle_t const llvm_omp_target_host_mem_alloc =
+ (omp_allocator_handle_t const)100;
+omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc =
+ (omp_allocator_handle_t const)101;
+omp_allocator_handle_t const llvm_omp_target_device_mem_alloc =
+ (omp_allocator_handle_t const)102;
omp_allocator_handle_t const kmp_max_mem_alloc =
(omp_allocator_handle_t const)1024;
omp_allocator_handle_t __kmp_def_allocator = omp_default_mem_alloc;
@@ -329,6 +329,13 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+// Preview of target memory support
+omp_memspace_handle_t const llvm_omp_target_host_mem_space =
+ (omp_memspace_handle_t const)100;
+omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
+ (omp_memspace_handle_t const)101;
+omp_memspace_handle_t const llvm_omp_target_device_mem_space =
+ (omp_memspace_handle_t const)102;
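
The llvm_omp_target_* handles are a preview extension; the values 100-102
keep them clear of the standard predefined handles and of kmp_max_mem_alloc.
They plug into the ordinary allocator entry points, e.g. (sketch):

    #include <omp.h>

    /* Sketch: routing an allocation through the preview allocator declared
       above. llvm_omp_target_host_mem_alloc is the LLVM extension; the
       omp_alloc/omp_free entry points are standard OpenMP 5.0. */
    void use_pinned_host_buffer(void) {
      double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                        llvm_omp_target_host_mem_alloc);
      /* ... stage host<->device transfers through buf ... */
      omp_free(buf, llvm_omp_target_host_mem_alloc);
    }
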
/* This check ensures that the compiler is passing the correct data type for the
flags formal parameter of the function kmpc_omp_task_alloc(). If the type is
@@ -420,7 +427,7 @@ kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
/* ------------------------------------------------------ */
/* STATE mostly synchronized with global lock */
-/* data written to rarely by masters, read often by workers */
+/* data written to rarely by primary threads, read often by workers */
/* TODO: None of this global padding stuff works consistently because the order
of declaration is not necessarily correlated to storage order. To fix this,
all the important globals must be put in a big structure instead. */
@@ -428,7 +435,7 @@ KMP_ALIGN_CACHE
kmp_info_t **__kmp_threads = NULL;
kmp_root_t **__kmp_root = NULL;
-/* data read/written to often by masters */
+/* data read/written to often by primary threads */
KMP_ALIGN_CACHE
volatile int __kmp_nth = 0;
volatile int __kmp_all_nth = 0;
@@ -541,4 +548,9 @@ kmp_target_offload_kind_t __kmp_target_offload = tgt_default;
// OMP Pause Resources
kmp_pause_status_t __kmp_pause_status = kmp_not_paused;
+// Nesting mode
+int __kmp_nesting_mode = 0;
+int __kmp_nesting_mode_nlevels = 1;
+int *__kmp_nesting_nth_level;
+
// end of file //
diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp
index 11a35873f366..61a3199f1a03 100644
--- a/openmp/runtime/src/kmp_gsupport.cpp
+++ b/openmp/runtime/src/kmp_gsupport.cpp
@@ -242,8 +242,8 @@ void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) {
if (__kmp_enter_single(gtid, &loc, FALSE))
return NULL;
-// Wait for the first thread to set the copyprivate data pointer,
-// and for all other threads to reach this point.
+ // Wait for the first thread to set the copyprivate data pointer,
+ // and for all other threads to reach this point.
#if OMPT_SUPPORT && OMPT_OPTIONAL
ompt_frame_t *ompt_frame;
@@ -463,7 +463,8 @@ static void __kmp_GOMP_fork_call(ident_t *loc, int gtid, unsigned num_threads,
ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+ &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
task_info->thread_num = __kmp_tid_from_gtid(gtid);
}
thr->th.ompt_thread_info.state = ompt_state_work_parallel;
@@ -497,6 +498,10 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *),
frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
}
#endif
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_parallel_begin();
+#endif
}
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) {
@@ -526,7 +531,11 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) {
,
fork_context_gnu
#endif
- );
+ );
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_parallel_end();
+#endif
}
// Loop worksharing constructs
@@ -960,12 +969,12 @@ LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START),
LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT), {})
LOOP_START_ULL(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START),
- kmp_sch_dynamic_chunked)
+ kmp_sch_dynamic_chunked)
LOOP_NEXT_ULL(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT), {})
LOOP_START_ULL(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START),
- kmp_sch_guided_chunked)
+ kmp_sch_guided_chunked)
LOOP_NEXT_ULL(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT), {})
LOOP_RUNTIME_START_ULL(
@@ -1232,7 +1241,8 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data,
kmp_taskdata_t *current_task;
if (ompt_enabled.enabled) {
current_task = __kmp_threads[gtid]->th.th_current_task;
- current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ current_task->ompt_task_info.frame.enter_frame.ptr =
+ OMPT_GET_FRAME_ADDRESS(0);
}
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
@@ -1494,10 +1504,10 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *),
{
#if OMPT_SUPPORT
- OMPT_STORE_RETURN_ADDRESS(gtid);
+ OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
- KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+ KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
}
task(data);
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();
@@ -1541,10 +1551,10 @@ PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC),
kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
PARALLEL_LOOP(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED),
- kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+ kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
PARALLEL_LOOP(
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC),
- kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+ kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED),
kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME),
@@ -1686,6 +1696,9 @@ static void __kmp_gomp_task_dup(kmp_task_t *dest, kmp_task_t *src,
}
}
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)(
+ uintptr_t *);
+
#ifdef __cplusplus
} // extern "C"
#endif
@@ -1705,6 +1718,7 @@ void __GOMP_taskloop(void (*func)(void *), void *data,
int if_val = gomp_flags & (1u << 10);
int nogroup = gomp_flags & (1u << 11);
int up = gomp_flags & (1u << 8);
+ int reductions = gomp_flags & (1u << 12);
p_task_dup_t task_dup = NULL;
kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
#ifdef KMP_DEBUG
@@ -1776,9 +1790,31 @@ void __GOMP_taskloop(void (*func)(void *), void *data,
loop_bounds = (T *)task->shareds;
loop_bounds[0] = start;
loop_bounds[1] = end + (up ? -1 : 1);
+
+ if (!nogroup) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+ __kmpc_taskgroup(&loc, gtid);
+ if (reductions) {
+ // The data pointer points to lb, ub, then reduction data
+ struct data_t {
+ T a, b;
+ uintptr_t *d;
+ };
+ uintptr_t *d = ((data_t *)data)->d;
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)(d);
+ }
+ }
__kmpc_taskloop(&loc, gtid, task, if_val, (kmp_uint64 *)&(loop_bounds[0]),
- (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, nogroup,
- sched, (kmp_uint64)num_tasks, (void *)task_dup);
+ (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, 1, sched,
+ (kmp_uint64)num_tasks, (void *)task_dup);
+ if (!nogroup) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+ __kmpc_end_taskgroup(&loc, gtid);
+ }
}
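
With the taskgroup now opened and closed in this wrapper, __kmpc_taskloop is
always invoked with its nogroup argument fixed to 1. The gomp_flags bits
decoded above, as assumed from the masks in this function (libgomp's own
encoding is authoritative):

    /* bit 8   iteration direction is upward (up)
       bit 10  value of the if() clause (if_val)
       bit 11  nogroup was specified
       bit 12  the enclosing taskgroup carries task reductions */
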
// 4 byte version of GOMP_doacross_post
@@ -1875,7 +1911,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)(
va_end(args);
}
-// fn: the function each master thread of new team will call
+// fn: the function each primary thread of new team will call
// data: argument to fn
// num_teams, thread_limit: max bounds on respective ICV
// flags: unused
@@ -1910,6 +1946,488 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT_DEPEND)(void **depend) {
KA_TRACE(20, ("GOMP_taskwait_depend exit: T#%d\n", gtid));
}
+static inline void
+__kmp_GOMP_taskgroup_reduction_register(uintptr_t *data, kmp_taskgroup_t *tg,
+ int nthreads,
+ uintptr_t *allocated = nullptr) {
+ KMP_ASSERT(data);
+ KMP_ASSERT(nthreads > 0);
+ // Have private copy pointers point to previously allocated
+ // reduction data or allocate new data here
+ if (allocated) {
+ data[2] = allocated[2];
+ data[6] = allocated[6];
+ } else {
+ data[2] = (uintptr_t)__kmp_allocate(nthreads * data[1]);
+ data[6] = data[2] + (nthreads * data[1]);
+ }
+ if (tg)
+ tg->gomp_data = data;
+}
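
The magic indices 2 and 6 come from the libgomp task-reduction descriptor,
whose layout can be inferred from this helper and GOMP_task_reduction_remap
below (assumed; uintptr_t words, words 3 through 5 are not touched in this
file):

    /* data[0]            number of reduction variables
       data[1]            size of one per-thread privatized block
       data[2]            base address of the privatized copies
       data[6]            one-past-the-end of the privatized copies
       data[7 + 3*j + 0]  original (shared) address of variable j
       data[7 + 3*j + 1]  offset of variable j within a privatized block */
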
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)(
+ uintptr_t *data) {
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_taskgroup_reduction_register: T#%d\n", gtid));
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
+ int nthreads = thread->th.th_team_nproc;
+ __kmp_GOMP_taskgroup_reduction_register(data, tg, nthreads);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER)(
+ uintptr_t *data) {
+ KA_TRACE(20,
+ ("GOMP_taskgroup_reduction_unregister: T#%d\n", __kmp_get_gtid()));
+ KMP_ASSERT(data && data[2]);
+ __kmp_free((void *)data[2]);
+}
+
+// Search through reduction data and set ptrs[] elements
+// to proper privatized copy address
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP)(size_t cnt,
+ size_t cntorig,
+ void **ptrs) {
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_task_reduction_remap: T#%d\n", gtid));
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_int32 tid = __kmp_get_tid();
+ for (size_t i = 0; i < cnt; ++i) {
+ uintptr_t address = (uintptr_t)ptrs[i];
+ void *propagated_address = NULL;
+ void *mapped_address = NULL;
+ // Check the taskgroup's reduce data
+ kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
+ while (tg) {
+ uintptr_t *gomp_data = tg->gomp_data;
+ if (!gomp_data) {
+ tg = tg->parent;
+ continue;
+ }
+ // Check the shared addresses list
+ size_t num_vars = (size_t)gomp_data[0];
+ uintptr_t per_thread_size = gomp_data[1];
+ uintptr_t reduce_data = gomp_data[2];
+ uintptr_t end_reduce_data = gomp_data[6];
+ for (size_t j = 0; j < num_vars; ++j) {
+ uintptr_t *entry = gomp_data + 7 + 3 * j;
+ if (entry[0] == address) {
+ uintptr_t offset = entry[1];
+ mapped_address =
+ (void *)(reduce_data + tid * per_thread_size + offset);
+ if (i < cntorig)
+ propagated_address = (void *)entry[0];
+ break;
+ }
+ }
+ if (mapped_address)
+ break;
+ // Check if address is within privatized copies range
+ if (!mapped_address && address >= reduce_data &&
+ address < end_reduce_data) {
+ uintptr_t offset = (address - reduce_data) % per_thread_size;
+ mapped_address = (void *)(reduce_data + tid * per_thread_size + offset);
+ if (i < cntorig) {
+ for (size_t j = 0; j < num_vars; ++j) {
+ uintptr_t *entry = gomp_data + 7 + 3 * j;
+ if (entry[1] == offset) {
+ propagated_address = (void *)entry[0];
+ break;
+ }
+ }
+ }
+ }
+ if (mapped_address)
+ break;
+ tg = tg->parent;
+ }
+ KMP_ASSERT(mapped_address);
+ ptrs[i] = mapped_address;
+ if (i < cntorig) {
+ KMP_ASSERT(propagated_address);
+ ptrs[cnt + i] = propagated_address;
+ }
+ }
+}
+
+static void __kmp_GOMP_init_reductions(int gtid, uintptr_t *data, int is_ws) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ kmp_team_t *team = thr->th.th_team;
+ // First start a taskgroup
+ __kmpc_taskgroup(NULL, gtid);
+ // Then set up reduction data
+ void *reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
+ if (reduce_data == NULL &&
+ __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
+ (void *)1)) {
+ // Single thread enters this block to initialize common reduction data
+ KMP_DEBUG_ASSERT(reduce_data == NULL);
+ __kmp_GOMP_taskgroup_reduction_register(data, NULL, thr->th.th_team_nproc);
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[is_ws], 0);
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], (void *)data);
+ } else {
+ // Wait for task reduction initialization
+ while ((reduce_data = KMP_ATOMIC_LD_ACQ(
+ &team->t.t_tg_reduce_data[is_ws])) == (void *)1) {
+ KMP_CPU_PAUSE();
+ }
+ KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
+ }
+ // For worksharing constructs, each thread has its own reduction structure.
+ // Have each reduction structure point to the same privatized copies of the
+ // variables. For parallel, every thread points to the same reduction
+ // structure and the same privatized copies.
+ if (is_ws) {
+ __kmp_GOMP_taskgroup_reduction_register(
+ data, NULL, thr->th.th_team_nproc,
+ (uintptr_t *)KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws]));
+ }
+ kmp_taskgroup_t *tg = thr->th.th_current_task->td_taskgroup;
+ tg->gomp_data = data;
+}
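
The (void *)1 stored by the compare-and-store acts as an
initialization-in-progress sentinel: one thread wins the race, allocates,
and publishes the real pointer with a release store; the others spin on an
acquire load until the sentinel is gone. A minimal standalone rendering of
the same pattern in C11 atomics (a sketch, not the runtime's code):

    #include <stdatomic.h>

    static _Atomic(void *) slot; /* NULL -> empty, (void *)1 -> busy */

    void *claim_or_wait(void *(*make)(void)) {
      void *expected = NULL;
      if (atomic_compare_exchange_strong(&slot, &expected, (void *)1)) {
        void *p = make(); /* single initializer */
        atomic_store_explicit(&slot, p, memory_order_release);
        return p;
      }
      void *p;
      while ((p = atomic_load_explicit(&slot, memory_order_acquire)) ==
             (void *)1)
        ; /* spin until the real pointer is published */
      return p;
    }
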
+
+static unsigned
+__kmp_GOMP_par_reductions_microtask_wrapper(int *gtid, int *npr,
+ void (*task)(void *), void *data) {
+ kmp_info_t *thr = __kmp_threads[*gtid];
+ kmp_team_t *team = thr->th.th_team;
+ uintptr_t *reduce_data = *(uintptr_t **)data;
+ __kmp_GOMP_init_reductions(*gtid, reduce_data, 0);
+
+#if OMPT_SUPPORT
+ ompt_frame_t *ompt_frame;
+ ompt_state_t enclosing_state;
+
+ if (ompt_enabled.enabled) {
+ // save enclosing task state; set current state for task
+ enclosing_state = thr->th.ompt_thread_info.state;
+ thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+ // set task frame
+ __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+ ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ }
+#endif
+
+ task(data);
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+ // clear task frame
+ ompt_frame->exit_frame = ompt_data_none;
+
+ // restore enclosing state
+ thr->th.ompt_thread_info.state = enclosing_state;
+ }
+#endif
+ __kmpc_end_taskgroup(NULL, *gtid);
+ // If the last thread is out, reset the team's reduce data;
+ // the GOMP_taskgroup_reduction_unregister() function will deallocate
+ // private copies after the reduction calculations take place.
+ int count = KMP_ATOMIC_INC(&team->t.t_tg_fini_counter[0]);
+ if (count == thr->th.th_team_nproc - 1) {
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[0], NULL);
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[0], 0);
+ }
+ return (unsigned)thr->th.th_team_nproc;
+}
+
+unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS)(
+ void (*task)(void *), void *data, unsigned num_threads,
+ unsigned int flags) {
+ MKLOC(loc, "GOMP_parallel_reductions");
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_parallel_reductions: T#%d\n", gtid));
+ __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task,
+ (microtask_t)__kmp_GOMP_par_reductions_microtask_wrapper,
+ 2, task, data);
+ unsigned retval =
+ __kmp_GOMP_par_reductions_microtask_wrapper(&gtid, NULL, task, data);
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();
+ KA_TRACE(20, ("GOMP_parallel_reductions exit: T#%d\n", gtid));
+ return retval;
+}
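
GCC emits GOMP_parallel_reductions for parallel constructs carrying the
OpenMP 5.0 task reduction modifier; the returned team size is what libgomp
expects when combining the per-thread copies. A hypothetical source pattern
that lowers to this entry (the exact lowering depends on the GCC version):

    #include <omp.h>

    long sum_first_100(void) {
      long sum = 0;
    #pragma omp parallel reduction(task, + : sum)
      {
    #pragma omp single
        {
          for (int i = 0; i < 100; ++i) {
    #pragma omp task in_reduction(+ : sum)
            sum += i;
          }
        }
      }
      return sum;
    }
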
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_START)(
+ long start, long end, long incr, long sched, long chunk_size, long *istart,
+ long *iend, uintptr_t *reductions, void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_loop_start: T#%d, reductions: %p\n", gtid, reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ long monotonic = sched & MONOTONIC_FLAG;
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START)(
+ start, end, incr, istart, iend);
+ else
+ status = KMP_EXPAND_NAME(
+ KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START)(
+ start, end, incr, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START)(
+ start, end, incr, chunk_size, istart, iend);
+ else
+ status =
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_START)(
+ start, end, incr, chunk_size, istart, iend);
+ else
+ status =
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 4) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START)(
+ start, end, incr, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
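
GOMP_loop_start is the consolidated GOMP_5.0 loop entry: it registers
worksharing task reductions, rejects the scan buffer (mem), and then
dispatches on a packed schedule word. The decoding above, summarized
(assumed from the branches in this function; libgomp is authoritative):

    /* sched, after masking out kmp_sched_monotonic:
         0  runtime  (monotonic ? runtime : maybe-nonmonotonic runtime)
         1  static   (monotonicity irrelevant)
         2  dynamic  (monotonic or nonmonotonic variant)
         3  guided   (monotonic or nonmonotonic variant)
         4  nonmonotonic runtime */
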
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_START)(
+ bool up, unsigned long long start, unsigned long long end,
+ unsigned long long incr, long sched, unsigned long long chunk_size,
+ unsigned long long *istart, unsigned long long *iend, uintptr_t *reductions,
+ void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20,
+ ("GOMP_loop_ull_start: T#%d, reductions: %p\n", gtid, reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ long monotonic = sched & MONOTONIC_FLAG;
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START)(
+ up, start, end, incr, istart, iend);
+ else
+ status = KMP_EXPAND_NAME(
+ KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START)(
+ up, start, end, incr, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ else
+ status = KMP_EXPAND_NAME(
+ KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ if (monotonic)
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ else
+ status =
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 4) {
+ status =
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START)(
+ up, start, end, incr, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_START)(
+ unsigned ncounts, long *counts, long sched, long chunk_size, long *istart,
+ long *iend, uintptr_t *reductions, void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_loop_doacross_start: T#%d, reductions: %p\n", gtid,
+ reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ // Ignore any monotonic flag
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START)(
+ ncounts, counts, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START)(
+ unsigned ncounts, unsigned long long *counts, long sched,
+ unsigned long long chunk_size, unsigned long long *istart,
+ unsigned long long *iend, uintptr_t *reductions, void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_loop_ull_doacross_start: T#%d, reductions: %p\n", gtid,
+ reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ // Ignore any monotonic flag
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START)(
+ ncounts, counts, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START)(
+ ncounts, counts, chunk_size, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_START)(
+ long start, long end, long incr, long sched, long chunk_size, long *istart,
+ long *iend, uintptr_t *reductions, void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_loop_ordered_start: T#%d, reductions: %p\n", gtid,
+ reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ // Ignore any monotonic flag
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START)(
+ start, end, incr, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START)(
+ start, end, incr, chunk_size, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
+
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START)(
+ bool up, unsigned long long start, unsigned long long end,
+ unsigned long long incr, long sched, unsigned long long chunk_size,
+ unsigned long long *istart, unsigned long long *iend, uintptr_t *reductions,
+ void **mem) {
+ int status = 0;
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20, ("GOMP_loop_ull_ordered_start: T#%d, reductions: %p\n", gtid,
+ reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ if (istart == NULL)
+ return true;
+ // Ignore any monotonic flag
+ const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic);
+ sched &= ~MONOTONIC_FLAG;
+ if (sched == 0) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START)(
+ up, start, end, incr, istart, iend);
+ } else if (sched == 1) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 2) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else if (sched == 3) {
+ status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START)(
+ up, start, end, incr, chunk_size, istart, iend);
+ } else {
+ KMP_ASSERT(0);
+ }
+ return status;
+}
+
+unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS2_START)(
+ unsigned count, uintptr_t *reductions, void **mem) {
+ int gtid = __kmp_entry_gtid();
+ KA_TRACE(20,
+ ("GOMP_sections2_start: T#%d, reductions: %p\n", gtid, reductions));
+ if (reductions)
+ __kmp_GOMP_init_reductions(gtid, reductions, 1);
+ if (mem)
+ KMP_FATAL(GompFeatureNotSupported, "scan");
+ return KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_START)(count);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER)(
+ bool cancelled) {
+ int gtid = __kmp_get_gtid();
+ MKLOC(loc, "GOMP_workshare_task_reduction_unregister");
+ KA_TRACE(20, ("GOMP_workshare_task_reduction_unregister: T#%d\n", gtid));
+ kmp_info_t *thr = __kmp_threads[gtid];
+ kmp_team_t *team = thr->th.th_team;
+ __kmpc_end_taskgroup(NULL, gtid);
+ // If the last thread is out of the workshare, reset the team's reduce data;
+ // the GOMP_taskgroup_reduction_unregister() function will deallocate
+ // private copies after the reduction calculations take place.
+ int count = KMP_ATOMIC_INC(&team->t.t_tg_fini_counter[1]);
+ if (count == thr->th.th_team_nproc - 1) {
+ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER)
+ ((uintptr_t *)KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[1]));
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[1], NULL);
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[1], 0);
+ }
+ if (!cancelled) {
+ __kmpc_barrier(&loc, gtid);
+ }
+}
+
/* The following sections of code create aliases for the GOMP_* functions, then
create versioned symbols using the assembler directive .symver. This is only
 pertinent for an ELF .so library. The KMP_VERSION_SYMBOL macro is defined in
@@ -2083,7 +2601,21 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME,
50, "GOMP_5.0");
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS_REG, 50, "GOMP_5.0");
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT_DEPEND, 50, "GOMP_5.0");
-
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER, 50,
+ "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER, 50,
+ "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS2_START, 50, "GOMP_5.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER, 50,
+ "GOMP_5.0");
#endif // KMP_USE_VERSION_SYMBOLS
#ifdef __cplusplus
diff --git a/openmp/runtime/src/kmp_i18n.cpp b/openmp/runtime/src/kmp_i18n.cpp
index d2651cfabdf3..a164aa180dd4 100644
--- a/openmp/runtime/src/kmp_i18n.cpp
+++ b/openmp/runtime/src/kmp_i18n.cpp
@@ -364,9 +364,10 @@ void __kmp_i18n_do_catopen() {
 Issue a hint in this case so the cause of the trouble is more understandable. */
kmp_msg_t err_code = KMP_SYSERRCODE(error);
__kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, path.str),
- err_code, (error == ERROR_BAD_EXE_FORMAT
- ? KMP_HNT(BadExeFormat, path.str, KMP_ARCH_STR)
- : __kmp_msg_null),
+ err_code,
+ (error == ERROR_BAD_EXE_FORMAT
+ ? KMP_HNT(BadExeFormat, path.str, KMP_ARCH_STR)
+ : __kmp_msg_null),
__kmp_msg_null);
if (__kmp_generate_warnings == kmp_warnings_off) {
__kmp_str_free(&err_code.str);
@@ -488,7 +489,7 @@ static char const *___catgets(kmp_i18n_id_t id) {
wmsg, wlen, // Wide buffer and size.
NULL, 0, // Buffer and size.
NULL, NULL // Default char and used default char.
- );
+ );
if (len <= 0) {
goto end;
}
@@ -502,7 +503,7 @@ static char const *___catgets(kmp_i18n_id_t id) {
wmsg, wlen, // Wide buffer and size.
msg, len, // Buffer and size.
NULL, NULL // Default char and used default char.
- );
+ );
if (rc <= 0 || rc > len) {
goto end;
}
@@ -701,11 +702,11 @@ static char *sys_error(int err) {
#else // Non-Windows* OS: Linux* OS or OS X*
-/* There are 2 incompatible versions of strerror_r:
+ /* There are 2 incompatible versions of strerror_r:
- char * strerror_r( int, char *, size_t ); // GNU version
- int strerror_r( int, char *, size_t ); // XSI version
-*/
+ char * strerror_r( int, char *, size_t ); // GNU version
+ int strerror_r( int, char *, size_t ); // XSI version
+ */
#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || \
(defined(__BIONIC__) && defined(_GNU_SOURCE) && \
@@ -806,7 +807,9 @@ void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, va_list args) {
case kmp_ms_fatal: {
format = kmp_i18n_fmt_Fatal;
} break;
- default: { KMP_DEBUG_ASSERT(0); }
+ default: {
+ KMP_DEBUG_ASSERT(0);
+ }
}
fmsg = __kmp_msg_format(format, message.num, message.str);
__kmp_str_free(&message.str);
@@ -830,7 +833,9 @@ void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, va_list args) {
format = kmp_i18n_fmt_SysErr;
fmsg = __kmp_msg_format(format, message.num, message.str);
} break;
- default: { KMP_DEBUG_ASSERT(0); }
+ default: {
+ KMP_DEBUG_ASSERT(0);
+ }
}
__kmp_str_free(&message.str);
__kmp_str_buf_cat(&buffer, fmsg.str, fmsg.len);
diff --git a/openmp/runtime/src/kmp_i18n.h b/openmp/runtime/src/kmp_i18n.h
index c3f21d6a58cc..23f6f20bd6ec 100644
--- a/openmp/runtime/src/kmp_i18n.h
+++ b/openmp/runtime/src/kmp_i18n.h
@@ -170,7 +170,7 @@ void __kmp_i18n_dump_catalog(kmp_str_buf_t *buffer);
#endif // KMP_DEBUG
#ifdef __cplusplus
-}; // extern "C"
+} // extern "C"
#endif // __cplusplus
#endif // KMP_I18N_H
diff --git a/openmp/runtime/src/kmp_io.cpp b/openmp/runtime/src/kmp_io.cpp
index 4e6ea6a38a33..578e6e671cdf 100644
--- a/openmp/runtime/src/kmp_io.cpp
+++ b/openmp/runtime/src/kmp_io.cpp
@@ -84,6 +84,7 @@ void __kmp_close_console(void) {
static void __kmp_redirect_output(void) {
__kmp_acquire_bootstrap_lock(&__kmp_console_lock);
+ (void)is_console;
if (!__kmp_console_exists) {
HANDLE ho;
HANDLE he;
@@ -101,6 +102,7 @@ static void __kmp_redirect_output(void) {
DWORD err = GetLastError();
// TODO: output error somehow (maybe message box)
+ (void)err;
__kmp_stdout = NULL;
} else {
@@ -112,6 +114,7 @@ static void __kmp_redirect_output(void) {
DWORD err = GetLastError();
// TODO: output error somehow (maybe message box)
+ (void)err;
__kmp_stderr = NULL;
} else {
@@ -149,8 +152,8 @@ void __kmp_vprintf(enum kmp_io out_stream, char const *format, va_list ap) {
int chars = 0;
#ifdef KMP_DEBUG_PIDS
- chars = KMP_SNPRINTF(db, __kmp_debug_buf_chars, "pid=%d: ",
- (kmp_int32)getpid());
+ chars = KMP_SNPRINTF(db, __kmp_debug_buf_chars,
+ "pid=%d: ", (kmp_int32)getpid());
#endif
chars += KMP_VSNPRINTF(db, __kmp_debug_buf_chars, format, ap);
@@ -158,16 +161,18 @@ void __kmp_vprintf(enum kmp_io out_stream, char const *format, va_list ap) {
if (chars + 1 > __kmp_debug_buf_warn_chars) {
#if KMP_OS_WINDOWS
DWORD count;
- __kmp_str_buf_print(&__kmp_console_buf, "OMP warning: Debugging buffer "
- "overflow; increase "
- "KMP_DEBUG_BUF_CHARS to %d\n",
+ __kmp_str_buf_print(&__kmp_console_buf,
+ "OMP warning: Debugging buffer "
+ "overflow; increase "
+ "KMP_DEBUG_BUF_CHARS to %d\n",
chars + 1);
WriteFile(stream, __kmp_console_buf.str, __kmp_console_buf.used, &count,
NULL);
__kmp_str_buf_clear(&__kmp_console_buf);
#else
- fprintf(stream, "OMP warning: Debugging buffer overflow; "
- "increase KMP_DEBUG_BUF_CHARS to %d\n",
+ fprintf(stream,
+ "OMP warning: Debugging buffer overflow; "
+ "increase KMP_DEBUG_BUF_CHARS to %d\n",
chars + 1);
fflush(stream);
#endif
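The (void)is_console and (void)err casts added above are the usual idiom for keeping a variable referenced until its TODO is implemented, so unused-variable warnings stay quiet without deleting the diagnostic plumbing:

    DWORD err = GetLastError();
    // TODO: output error somehow (maybe message box)
    (void)err; // no-op use; suppresses unused-variable warnings for now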
diff --git a/openmp/runtime/src/kmp_itt.cpp b/openmp/runtime/src/kmp_itt.cpp
index fa286eccad22..0aa8a70fc70f 100644
--- a/openmp/runtime/src/kmp_itt.cpp
+++ b/openmp/runtime/src/kmp_itt.cpp
@@ -63,9 +63,9 @@ void __kmp_itt_reset() {
void __kmp_itt_initialize() {
-// ITTNotify library is loaded and initialized at first call to any ittnotify
-// function, so we do not need to explicitly load it any more. Just report OMP
-// RTL version to ITTNotify.
+ // ITTNotify library is loaded and initialized at first call to any ittnotify
+ // function, so we do not need to explicitly load it any more. Just report OMP
+ // RTL version to ITTNotify.
#if USE_ITT_NOTIFY
// Backup a clean global state
@@ -153,7 +153,9 @@ extern "C" void __itt_error_handler(__itt_error_code err, va_list args) {
__kmp_str_free(&err_code.str);
}
} break;
- default: { KMP_WARNING(IttUnknownError, err); }
+ default: {
+ KMP_WARNING(IttUnknownError, err);
+ }
}
} // __itt_error_handler
diff --git a/openmp/runtime/src/kmp_itt.h b/openmp/runtime/src/kmp_itt.h
index b14a19301459..9872764a375c 100644
--- a/openmp/runtime/src/kmp_itt.h
+++ b/openmp/runtime/src/kmp_itt.h
@@ -53,9 +53,9 @@ void __kmp_itt_reset();
// --- Parallel region reporting ---
__kmp_inline void
__kmp_itt_region_forking(int gtid, int team_size,
- int barriers); // Master only, before forking threads.
+ int barriers); // Primary only, before forking threads.
__kmp_inline void
-__kmp_itt_region_joined(int gtid); // Master only, after joining threads.
+__kmp_itt_region_joined(int gtid); // Primary only, after joining threads.
// (*) Note: A thread may execute tasks after this point, though.
// --- Frame reporting ---
@@ -90,6 +90,16 @@ __kmp_inline void __kmp_itt_barrier_finished(int gtid, void *object);
__kmp_inline void *__kmp_itt_taskwait_object(int gtid);
__kmp_inline void __kmp_itt_taskwait_starting(int gtid, void *object);
__kmp_inline void __kmp_itt_taskwait_finished(int gtid, void *object);
+#define KMP_ITT_TASKWAIT_STARTING(obj) \
+ if (UNLIKELY(__itt_sync_create_ptr)) { \
+ obj = __kmp_itt_taskwait_object(gtid); \
+ if (obj != NULL) { \
+ __kmp_itt_taskwait_starting(gtid, obj); \
+ } \
+ }
+#define KMP_ITT_TASKWAIT_FINISHED(obj) \
+ if (UNLIKELY(obj != NULL)) \
+ __kmp_itt_taskwait_finished(gtid, obj);
// --- Task reporting ---
__kmp_inline void __kmp_itt_task_starting(void *object);
@@ -181,7 +191,7 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
#define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377)
// Markers for architecture simulation.
-// FORKING : Before the master thread forks.
+// FORKING : Before the primary thread forks.
// JOINING : At the start of the join.
// INVOKING : Before the threads invoke microtasks.
// DISPATCH_INIT: At the start of dynamically scheduled loop.
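The two new taskwait macros bundle the object-create/start/finish triple declared just above them and expand against a gtid that must already be in scope. A hypothetical call site (the wrapper function is illustrative, not from the patch):

    void wait_for_children(int gtid) {
      void *itt_sync_obj = NULL;
      KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); // create object, report start
      // ... wait until all child tasks of the current task complete ...
      KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); // report end if an object exists
    }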
diff --git a/openmp/runtime/src/kmp_itt.inl b/openmp/runtime/src/kmp_itt.inl
index e7c6041d619c..ecfcb966bb79 100644
--- a/openmp/runtime/src/kmp_itt.inl
+++ b/openmp/runtime/src/kmp_itt.inl
@@ -64,14 +64,14 @@ static kmp_bootstrap_lock_t metadata_lock =
KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock);
/* Parallel region reporting.
- * __kmp_itt_region_forking should be called by master thread of a team.
+ * __kmp_itt_region_forking should be called by primary thread of a team.
Exact moment of call does not matter, but it should be completed before any
thread of this team calls __kmp_itt_region_starting.
* __kmp_itt_region_starting should be called by each thread of a team just
before entering parallel region body.
* __kmp_itt_region_finished should be called by each thread of a team right
after returning from parallel region body.
- * __kmp_itt_region_joined should be called by master thread of a team, after
+ * __kmp_itt_region_joined should be called by primary thread of a team, after
all threads called __kmp_itt_region_finished.
Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can
@@ -448,10 +448,10 @@ LINKAGE void __kmp_itt_region_joined(int gtid) {
/* Barriers reporting.
A barrier consists of two phases:
- 1. Gather -- master waits for arriving of all the worker threads; each
+ 1. Gather -- primary thread waits for all worker threads to arrive; each
worker thread registers arrival and goes further.
- 2. Release -- each worker threads waits until master lets it go; master lets
- worker threads go.
+ 2. Release -- each worker thread waits until primary thread lets it go;
+ primary thread lets worker threads go.
Function should be called by each thread:
* __kmp_itt_barrier_starting() -- before arriving to the gather phase.
@@ -487,7 +487,7 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
// solution, and reporting fork/join barriers to ITT should be revisited.
if (team != NULL) {
- // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time.
+ // Primary thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time.
// Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter.
kmp_uint64 counter =
team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta;
@@ -550,12 +550,13 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
case bs_forkjoin_barrier: {
// In case of fork/join barrier we can read thr->th.th_ident, because it
// contains location of last passed construct (while join barrier is not
- // such one). Use th_ident of master thread instead -- __kmp_join_call()
- // called by the master thread saves location.
+ // such one). Use th_ident of primary thread instead --
+ // __kmp_join_call() called by the primary thread saves location.
//
- // AC: cannot read from master because __kmp_join_call may be not called
- // yet, so we read the location from team. This is the same location.
- // And team is valid at the enter to join barrier where this happens.
+ // AC: cannot read from primary thread because __kmp_join_call may not
+ // be called yet, so we read the location from team. This is the
+ // same location. Team is valid on entry to join barrier where this
+ // happens.
loc = team->t.t_ident;
if (loc != NULL) {
src = loc->psource;
@@ -958,7 +959,7 @@ void __kmp_itt_thread_name(int gtid) {
kmp_str_buf_t name;
__kmp_str_buf_init(&name);
if (KMP_MASTER_GTID(gtid)) {
- __kmp_str_buf_print(&name, "OMP Master Thread #%d", gtid);
+ __kmp_str_buf_print(&name, "OMP Primary Thread #%d", gtid);
} else {
__kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid);
}
@@ -986,9 +987,9 @@ void __kmp_itt_system_object_created(void *object, char const *name) {
} // __kmp_itt_system_object_created
/* Stack stitching api.
- Master calls "create" and put the stitching id into team structure.
+ Primary thread calls "create" and put the stitching id into team structure.
Workers read the stitching id and call "enter" / "leave" api.
- Master calls "destroy" at the end of the parallel region. */
+ Primary thread calls "destroy" at the end of the parallel region. */
__itt_caller __kmp_itt_stack_caller_create() {
#if USE_ITT_NOTIFY
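Pulling the renamed protocol comments above together, the per-region reporting order is, in sketch form (gtid, team_size, barriers as declared in kmp_itt.h):

    __kmp_itt_region_forking(gtid, team_size, barriers); // primary, before fork
    // in every thread of the team:
    __kmp_itt_region_starting(gtid);  // just before the region body
    /* ... parallel region body ... */
    __kmp_itt_region_finished(gtid);  // right after the region body
    // primary thread, once all threads have reported finished:
    __kmp_itt_region_joined(gtid);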
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index 05e879dbd59f..59726f2b9f21 100644
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -21,8 +21,6 @@
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
-#include "tsan_annotations.h"
-
#if KMP_USE_FUTEX
#include <sys/syscall.h>
#include <unistd.h>
@@ -112,7 +110,6 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid);
- ANNOTATE_TAS_ACQUIRED(lck);
return retval;
}
@@ -154,7 +151,6 @@ int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
KMP_MB(); /* Flush all pending memory write invalidates. */
KMP_FSYNC_RELEASING(lck);
- ANNOTATE_TAS_RELEASED(lck);
KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
KMP_MB(); /* Flush all pending memory write invalidates. */
@@ -208,7 +204,6 @@ int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
return KMP_LOCK_ACQUIRED_NEXT;
} else {
__kmp_acquire_tas_lock_timed_template(lck, gtid);
- ANNOTATE_TAS_ACQUIRED(lck);
lck->lk.depth_locked = 1;
return KMP_LOCK_ACQUIRED_FIRST;
}
@@ -398,7 +393,6 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
- ANNOTATE_FUTEX_ACQUIRED(lck);
return retval;
}
@@ -441,7 +435,6 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
lck, lck->lk.poll, gtid));
KMP_FSYNC_RELEASING(lck);
- ANNOTATE_FUTEX_RELEASED(lck);
kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
@@ -512,7 +505,6 @@ int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
return KMP_LOCK_ACQUIRED_NEXT;
} else {
__kmp_acquire_futex_lock_timed_template(lck, gtid);
- ANNOTATE_FUTEX_ACQUIRED(lck);
lck->lk.depth_locked = 1;
return KMP_LOCK_ACQUIRED_FIRST;
}
@@ -644,7 +636,6 @@ __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
- ANNOTATE_TICKET_ACQUIRED(lck);
return retval;
}
@@ -719,7 +710,6 @@ int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
std::atomic_load_explicit(&lck->lk.now_serving,
std::memory_order_relaxed);
- ANNOTATE_TICKET_RELEASED(lck);
std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
std::memory_order_release);
@@ -814,7 +804,6 @@ int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
return KMP_LOCK_ACQUIRED_NEXT;
} else {
__kmp_acquire_ticket_lock_timed_template(lck, gtid);
- ANNOTATE_TICKET_ACQUIRED(lck);
std::atomic_store_explicit(&lck->lk.depth_locked, 1,
std::memory_order_relaxed);
std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
@@ -1091,7 +1080,6 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
volatile kmp_int32 *head_id_p = &lck->lk.head_id;
volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
volatile kmp_uint32 *spin_here_p;
- kmp_int32 need_mf = 1;
#if OMPT_SUPPORT
ompt_state_t prev_state = ompt_state_undefined;
@@ -1142,7 +1130,6 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
if ( t > 0 ) condition in the enqueued case below, which is not
necessary for this state transition */
- need_mf = 0;
/* try (-1,0)->(tid,tid) */
enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
KMP_PACK_64(-1, 0),
@@ -1164,7 +1151,6 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
if (tail == 0) {
enqueued = FALSE;
} else {
- need_mf = 0;
/* try (h,t) or (h,h)->(h,tid) */
enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
@@ -1285,7 +1271,6 @@ int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
KMP_DEBUG_ASSERT(gtid >= 0);
int retval = __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
- ANNOTATE_QUEUING_ACQUIRED(lck);
return retval;
}
@@ -1331,7 +1316,6 @@ int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
KA_TRACE(1000,
("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
KMP_FSYNC_ACQUIRED(lck);
- ANNOTATE_QUEUING_ACQUIRED(lck);
return TRUE;
}
}
@@ -1381,7 +1365,6 @@ int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
KMP_FSYNC_RELEASING(lck);
- ANNOTATE_QUEUING_RELEASED(lck);
while (1) {
kmp_int32 dequeued;
@@ -1490,8 +1473,8 @@ int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
#endif
return KMP_LOCK_RELEASED;
}
-/* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
- threads */
+ /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
+ threads */
#ifdef DEBUG_QUEUING_LOCKS
TRACE_LOCK(gtid + 1, "rel retry");
@@ -1570,7 +1553,6 @@ int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
return KMP_LOCK_ACQUIRED_NEXT;
} else {
__kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
- ANNOTATE_QUEUING_ACQUIRED(lck);
KMP_MB();
lck->lk.depth_locked = 1;
KMP_MB();
@@ -1919,9 +1901,10 @@ void __kmp_print_speculative_stats() {
}
fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
- fprintf(statsFile, " Lock parameters: \n"
- " max_soft_retries : %10d\n"
- " max_badness : %10d\n",
+ fprintf(statsFile,
+ " Lock parameters: \n"
+ " max_soft_retries : %10d\n"
+ " max_badness : %10d\n",
__kmp_adaptive_backoff_params.max_soft_retries,
__kmp_adaptive_backoff_params.max_badness);
fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
@@ -2126,7 +2109,6 @@ static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
__kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
// We have acquired the base lock, so count that.
KMP_INC_STAT(lck, nonSpeculativeAcquires);
- ANNOTATE_QUEUING_ACQUIRED(lck);
}
static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
@@ -2359,7 +2341,6 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
- ANNOTATE_DRDPA_ACQUIRED(lck);
return retval;
}
@@ -2436,7 +2417,6 @@ int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
ticket - 1, lck));
KMP_FSYNC_RELEASING(lck);
- ANNOTATE_DRDPA_RELEASED(lck);
polls[ticket & mask] = ticket; // atomic store
return KMP_LOCK_RELEASED;
}
@@ -2523,7 +2503,6 @@ int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
return KMP_LOCK_ACQUIRED_NEXT;
} else {
__kmp_acquire_drdpa_lock_timed_template(lck, gtid);
- ANNOTATE_DRDPA_ACQUIRED(lck);
KMP_MB();
lck->lk.depth_locked = 1;
KMP_MB();
@@ -3846,15 +3825,11 @@ kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
if (__kmp_lock_pool == NULL) {
// Lock pool is empty. Allocate new memory.
- // ANNOTATION: Found no good way to express the syncronisation
- // between allocation and usage, so ignore the allocation
- ANNOTATE_IGNORE_WRITES_BEGIN();
if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point.
lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
} else {
lck = __kmp_lock_block_allocate();
}
- ANNOTATE_IGNORE_WRITES_END();
// Insert lock in the table so that it can be freed in __kmp_cleanup,
// and debugger has info on all allocated locks.
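The ANNOTATE_* removals above delete TSan bookkeeping only; every nested acquire keeps the same shape across the TAS, futex, ticket, queuing, and DRDPA locks. Paraphrased with stand-in names (lock_owner and acquire_timed_template are not real runtime symbols):

    int acquire_nested(lock_t *lck, int gtid) {
      if (lock_owner(lck) == gtid) {     // recursive acquire by current owner
        ++lck->depth_locked;             // just bump the recursion depth
        return KMP_LOCK_ACQUIRED_NEXT;
      }
      acquire_timed_template(lck, gtid); // contend for the base lock
      lck->depth_locked = 1;             // first acquisition starts at depth 1
      return KMP_LOCK_ACQUIRED_FIRST;
    }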
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index 3b70f95c7c56..4f6ad6414e53 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -655,9 +655,8 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
KMP_INIT_YIELD(spins); \
do { \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
- } while ( \
- lck->tas.lk.poll != 0 || \
- !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
+ } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
+ &lck->tas.lk.poll, 0, gtid + 1)); \
} \
KMP_FSYNC_ACQUIRED(lck); \
} else { \
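The reflowed macro above is a test-and-test-and-set acquire; unrolled, the slow path it shows plus the first-attempt check it implies read roughly as (sketch using the same runtime helpers):

    if (lck->tas.lk.poll != 0 || // fast path failed: lock held or CAS lost
        !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {
      kmp_uint32 spins;
      KMP_INIT_YIELD(spins);
      do {
        KMP_YIELD_OVERSUB_ELSE_SPIN(spins); // yield when oversubscribed
      } while (lck->tas.lk.poll != 0 ||     // re-check before each CAS attempt
               !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));
    }
    KMP_FSYNC_ACQUIRED(lck);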
diff --git a/openmp/runtime/src/kmp_omp.h b/openmp/runtime/src/kmp_omp.h
index c7ba32a14338..995241ff65cd 100644
--- a/openmp/runtime/src/kmp_omp.h
+++ b/openmp/runtime/src/kmp_omp.h
@@ -123,7 +123,7 @@ typedef struct {
/* team structure information */
kmp_int32 t_sizeof_struct;
- offset_and_size_t t_master_tid; // tid of master in parent team
+ offset_and_size_t t_master_tid; // tid of primary thread in parent team
offset_and_size_t t_ident; // location of parallel region
offset_and_size_t t_parent; // parent team
offset_and_size_t t_nproc; // # team threads
@@ -136,7 +136,7 @@ typedef struct {
offset_and_size_t t_cancel_request;
offset_and_size_t t_bar;
offset_and_size_t
- t_b_master_arrived; // increased by 1 when master arrives to a barrier
+ t_b_master_arrived; // incremented when primary thread reaches barrier
offset_and_size_t
t_b_team_arrived; // increased by one when all the threads arrived
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index 5ceae9857b32..4437cf251892 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -139,7 +139,7 @@ typedef struct kmp_struct64 kmp_uint64;
#undef KMP_USE_X87CONTROL
#define KMP_USE_X87CONTROL 1
#endif
-#if KMP_ARCH_X86_64
+#if KMP_ARCH_X86_64 || KMP_ARCH_AARCH64
#define KMP_INTPTR 1
typedef __int64 kmp_intptr_t;
typedef unsigned __int64 kmp_uintptr_t;
@@ -333,13 +333,13 @@ extern "C" {
// Code from libcxx/include/__config
// Use a function like macro to imply that it must be followed by a semicolon
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
-# define KMP_FALLTHROUGH() [[fallthrough]]
+#define KMP_FALLTHROUGH() [[fallthrough]]
#elif __has_cpp_attribute(clang::fallthrough)
-# define KMP_FALLTHROUGH() [[clang::fallthrough]]
+#define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
-# define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
+#define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
#else
-# define KMP_FALLTHROUGH() ((void)0)
+#define KMP_FALLTHROUGH() ((void)0)
#endif
#if KMP_HAVE_ATTRIBUTE_WAITPKG
@@ -406,9 +406,24 @@ extern "C" {
api_name) "@" ver_str "\n\t"); \
__asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR( \
api_name) "@@" default_ver "\n\t")
+
+#define KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str) \
+ _KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str, "VERSION")
+#define _KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str, \
+ default_ver) \
+ __typeof__(__kmp_api_##apic_name) __kmp_api_##apic_name##_##ver_num##_alias \
+ __attribute__((alias(KMP_STR(__kmp_api_##apic_name)))); \
+ __asm__(".symver " KMP_STR(__kmp_api_##apic_name) "," KMP_STR( \
+ apic_name) "@@" default_ver "\n\t"); \
+ __asm__( \
+ ".symver " KMP_STR(__kmp_api_##apic_name##_##ver_num##_alias) "," KMP_STR( \
+ api_name) "@" ver_str "\n\t")
+
#else // KMP_USE_VERSION_SYMBOLS
#define KMP_EXPAND_NAME(api_name) api_name
#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */
+#define KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, \
+ ver_str) /* Nothing */
#endif // KMP_USE_VERSION_SYMBOLS
/* Temporary note: if performance testing of this passes, we can remove
@@ -463,8 +478,13 @@ inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
return *(kmp_real32 *)&tmp;
}
-// Routines that we still need to implement in assembly.
-extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
+#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
+#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
+#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
+#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
+#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
+
extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
@@ -474,6 +494,119 @@ extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
+#if KMP_ARCH_AARCH64 && KMP_COMPILER_MSVC && !KMP_COMPILER_CLANG
+#define KMP_TEST_THEN_INC64(p) _InterlockedExchangeAdd64((p), 1LL)
+#define KMP_TEST_THEN_INC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 1LL)
+#define KMP_TEST_THEN_ADD4_64(p) _InterlockedExchangeAdd64((p), 4LL)
+// #define KMP_TEST_THEN_ADD4_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 4LL)
+// #define KMP_TEST_THEN_DEC64(p) _InterlockedExchangeAdd64((p), -1LL)
+// #define KMP_TEST_THEN_DEC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), -1LL)
+// #define KMP_TEST_THEN_ADD8(p, v) _InterlockedExchangeAdd8((p), (v))
+#define KMP_TEST_THEN_ADD64(p, v) _InterlockedExchangeAdd64((p), (v))
+
+#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
+ __kmp_compare_and_store_acq8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
+ __kmp_compare_and_store_rel8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
+ __kmp_compare_and_store_acq16((p), (cv), (sv))
+/*
+#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
+ __kmp_compare_and_store_rel16((p), (cv), (sv))
+*/
+#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
+ __kmp_compare_and_store_acq32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
+ (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
+ __kmp_compare_and_store_rel32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
+ (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
+ __kmp_compare_and_store_acq64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
+ (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
+ __kmp_compare_and_store_rel64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
+ (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
+ __kmp_compare_and_store_ptr((void *volatile *)(p), (void *)(cv), (void *)(sv))
+
+// KMP_COMPARE_AND_STORE expects this order: pointer, compare, exchange
+// _InterlockedCompareExchange expects this order: pointer, exchange, compare
+// KMP_COMPARE_AND_STORE also returns a bool indicating a successful write. A
+// write is successful if the return value of _InterlockedCompareExchange is the
+// same as the compare value.
+inline kmp_int8 __kmp_compare_and_store_acq8(volatile kmp_int8 *p, kmp_int8 cv,
+ kmp_int8 sv) {
+ return _InterlockedCompareExchange8_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int8 __kmp_compare_and_store_rel8(volatile kmp_int8 *p, kmp_int8 cv,
+ kmp_int8 sv) {
+ return _InterlockedCompareExchange8_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int16 __kmp_compare_and_store_acq16(volatile kmp_int16 *p,
+ kmp_int16 cv, kmp_int16 sv) {
+ return _InterlockedCompareExchange16_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int16 __kmp_compare_and_store_rel16(volatile kmp_int16 *p,
+ kmp_int16 cv, kmp_int16 sv) {
+ return _InterlockedCompareExchange16_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_acq32(volatile kmp_int32 *p,
+ kmp_int32 cv, kmp_int32 sv) {
+ return _InterlockedCompareExchange_acq((volatile long *)p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_rel32(volatile kmp_int32 *p,
+ kmp_int32 cv, kmp_int32 sv) {
+ return _InterlockedCompareExchange_rel((volatile long *)p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_acq64(volatile kmp_int64 *p,
+ kmp_int64 cv, kmp_int64 sv) {
+ return _InterlockedCompareExchange64_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_rel64(volatile kmp_int64 *p,
+ kmp_int64 cv, kmp_int64 sv) {
+ return _InterlockedCompareExchange64_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_ptr(void *volatile *p, void *cv,
+ void *sv) {
+ return _InterlockedCompareExchangePointer(p, sv, cv) == cv;
+}
+
+// The _RET versions return the value instead of a bool
+/*
+#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
+ _InterlockedCompareExchange8((p), (sv), (cv))
+#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
+ _InterlockedCompareExchange16((p), (sv), (cv))
+*/
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
+ _InterlockedCompareExchange64((volatile kmp_int64 *)(p), (kmp_int64)(sv), \
+ (kmp_int64)(cv))
+
+/*
+#define KMP_XCHG_FIXED8(p, v) \
+ _InterlockedExchange8((volatile kmp_int8 *)(p), (kmp_int8)(v));
+*/
+// #define KMP_XCHG_FIXED16(p, v) _InterlockedExchange16((p), (v));
+// #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
+
+// inline kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v) {
+// kmp_int64 tmp = _InterlockedExchange64((volatile kmp_int64 *)p, *(kmp_int64
+// *)&v); return *(kmp_real64 *)&tmp;
+// }
+
+#else // !KMP_ARCH_AARCH64
+
+// Routines that we still need to implement in assembly.
+extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+
extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
@@ -514,12 +647,6 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v))
#define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v))
-#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
-#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
-#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
-#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
-#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
-#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__kmp_compare_and_store8((p), (cv), (sv))
@@ -567,6 +694,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
//#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v));
//#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v));
#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
+#endif
#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
@@ -680,26 +808,28 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
__sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
(kmp_uint32)(sv))
#if KMP_ARCH_MIPS
-static inline bool mips_sync_bool_compare_and_swap(
- volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
+static inline bool mips_sync_bool_compare_and_swap(volatile kmp_uint64 *p,
+ kmp_uint64 cv,
+ kmp_uint64 sv) {
return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
- __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST);
}
-static inline bool mips_sync_val_compare_and_swap(
- volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
+static inline bool mips_sync_val_compare_and_swap(volatile kmp_uint64 *p,
+ kmp_uint64 cv,
+ kmp_uint64 sv) {
__atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
- __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST);
return cv;
}
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
- mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
- (kmp_uint64)(sv))
+ mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), \
+ (kmp_uint64)(cv), (kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
- mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
- (kmp_uint64)(sv))
+ mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), \
+ (kmp_uint64)(cv), (kmp_uint64)(sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
- (kmp_uint64)(sv))
+ (kmp_uint64)(sv))
#else
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
@@ -883,8 +1013,13 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+#if KMP_OS_WINDOWS
+#undef KMP_MB
+#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
+#else /* !KMP_OS_WINDOWS */
#define KMP_MB() __sync_synchronize()
#endif
+#endif
#ifndef KMP_MB
#define KMP_MB() /* nothing to do */
@@ -1038,6 +1173,9 @@ enum kmp_warnings_level {
} // extern "C"
#endif // __cplusplus
+// Safe C API
+#include "kmp_safe_c_api.h"
+
// Macros for C++11 atomic functions
#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order)
#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order)
@@ -1077,6 +1215,14 @@ bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
expected, desired, std::memory_order_release, std::memory_order_relaxed);
}
+// Symbol lookup on Linux/Windows
+#if KMP_OS_WINDOWS
+extern void *__kmp_lookup_symbol(const char *name);
+#define KMP_DLSYM(name) __kmp_lookup_symbol(name)
+#define KMP_DLSYM_NEXT(name) nullptr
+#else
+#define KMP_DLSYM(name) dlsym(RTLD_DEFAULT, name)
+#define KMP_DLSYM_NEXT(name) dlsym(RTLD_NEXT, name)
+#endif
+
#endif /* KMP_OS_H */
-// Safe C API
-#include "kmp_safe_c_api.h"
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index 4296ca31d67d..bbbd72dd6951 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -70,14 +70,13 @@
#define KMP_OS_HURD 1
#endif
-#if (1 != \
- KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \
- KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD)
+#if (1 != KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \
+ KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD)
#error Unknown OS
#endif
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD
#undef KMP_OS_UNIX
#define KMP_OS_UNIX 1
#endif
@@ -98,6 +97,9 @@
#if defined(_M_AMD64) || defined(__x86_64)
#undef KMP_ARCH_X86_64
#define KMP_ARCH_X86_64 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#undef KMP_ARCH_AARCH64
+#define KMP_ARCH_AARCH64 1
#else
#undef KMP_ARCH_X86
#define KMP_ARCH_X86 1
@@ -195,9 +197,9 @@
((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS))
// TODO: Fixme - This is clever, but really fugly
-#if (1 != \
- KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
- KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + KMP_ARCH_RISCV64)
+#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
+ KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \
+ KMP_ARCH_RISCV64)
#error Unknown or unsupported architecture
#endif
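The reflowed guards above depend on every KMP_OS_* / KMP_ARCH_* macro being defined to 0 or 1, so a preprocessor sum equal to 1 proves exactly one platform was detected. The idiom in miniature:

    #define OPT_A 0
    #define OPT_B 1
    #define OPT_C 0
    #if (1 != OPT_A + OPT_B + OPT_C)
    #error exactly one of OPT_A / OPT_B / OPT_C must be 1
    #endif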
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 4a0634d59cff..16d415dc8a74 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -31,8 +31,11 @@
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
+#if OMPD_SUPPORT
+#include "ompd-specific.h"
+#endif
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif
@@ -44,8 +47,6 @@ static char *ProfileTraceFile = nullptr;
#include <process.h>
#endif
-#include "tsan_annotations.h"
-
#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
@@ -540,7 +541,10 @@ static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
"%s_%d.t_disp_buffer", header, team_id);
}
-static void __kmp_init_allocator() { __kmp_init_memkind(); }
+static void __kmp_init_allocator() {
+ __kmp_init_memkind();
+ __kmp_init_target_mem();
+}
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
/* ------------------------------------------------------------------------ */
@@ -548,58 +552,6 @@ static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
-static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
- // TODO: Change to __kmp_break_bootstrap_lock().
- __kmp_init_bootstrap_lock(lck); // make the lock released
-}
-
-static void __kmp_reset_locks_on_process_detach(int gtid_req) {
- int i;
- int thread_count;
-
- // PROCESS_DETACH is expected to be called by a thread that executes
- // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
- // calling ProcessExit or FreeLibrary). So, it might be safe to access the
- // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
- // threads can be still alive here, although being about to be terminated. The
- // threads in the array with ds_thread==0 are most suspicious. Actually, it
- // can be not safe to access the __kmp_threads[].
-
- // TODO: does it make sense to check __kmp_roots[] ?
-
- // Let's check that there are no other alive threads registered with the OMP
- // lib.
- while (1) {
- thread_count = 0;
- for (i = 0; i < __kmp_threads_capacity; ++i) {
- if (!__kmp_threads)
- continue;
- kmp_info_t *th = __kmp_threads[i];
- if (th == NULL)
- continue;
- int gtid = th->th.th_info.ds.ds_gtid;
- if (gtid == gtid_req)
- continue;
- if (gtid < 0)
- continue;
- DWORD exit_val;
- int alive = __kmp_is_thread_alive(th, &exit_val);
- if (alive) {
- ++thread_count;
- }
- }
- if (thread_count == 0)
- break; // success
- }
-
- // Assume that I'm alone. Now it might be safe to check and reset locks.
- // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
- __kmp_reset_lock(&__kmp_forkjoin_lock);
-#ifdef KMP_DEBUG
- __kmp_reset_lock(&__kmp_stdio_lock);
-#endif // KMP_DEBUG
-}
-
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
//__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
@@ -613,36 +565,19 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
case DLL_PROCESS_DETACH:
KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
- if (lpReserved != NULL) {
- // lpReserved is used for telling the difference:
- // lpReserved == NULL when FreeLibrary() was called,
- // lpReserved != NULL when the process terminates.
- // When FreeLibrary() is called, worker threads remain alive. So they will
- // release the forkjoin lock by themselves. When the process terminates,
- // worker threads disappear triggering the problem of unreleased forkjoin
- // lock as described below.
-
- // A worker thread can take the forkjoin lock. The problem comes up if
- // that worker thread becomes dead before it releases the forkjoin lock.
- // The forkjoin lock remains taken, while the thread executing
- // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
- // to take the forkjoin lock and will always fail, so that the application
- // will never finish [normally]. This scenario is possible if
- // __kmpc_end() has not been executed. It looks like it's not a corner
- // case, but common cases:
- // - the main function was compiled by an alternative compiler;
- // - the main function was compiled by icl but without /Qopenmp
- // (application with plugins);
- // - application terminates by calling C exit(), Fortran CALL EXIT() or
- // Fortran STOP.
- // - alive foreign thread prevented __kmpc_end from doing cleanup.
- //
- // This is a hack to work around the problem.
- // TODO: !!! figure out something better.
- __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
- }
-
- __kmp_internal_end_library(__kmp_gtid_get_specific());
+ // According to the Windows* documentation for the DllMain entry point,
+ // lpReserved distinguishes the two DLL_PROCESS_DETACH cases:
+ // lpReserved == NULL when FreeLibrary() is called,
+ // lpReserved != NULL when the process is terminated.
+ // When FreeLibrary() is called, worker threads remain alive. So the
+ // runtime's state is consistent and executing proper shutdown is OK.
+ // When the process is terminated, worker threads have exited or been
+ // forcefully terminated by the OS and only the shutdown thread remains.
+ // This can leave the runtime in an inconsistent state.
+ // Hence, only attempt proper cleanup when FreeLibrary() is called.
+ // Otherwise, rely on OS to reclaim resources.
+ if (lpReserved == NULL)
+ __kmp_internal_end_library(__kmp_gtid_get_specific());
return TRUE;
@@ -750,8 +685,8 @@ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
#if USE_ITT_BUILD
if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
- team->t.t_active_level ==
- 1) { // Only report metadata by master of active team at level 1
+ team->t.t_active_level == 1) {
+ // Only report metadata by primary thread of active team at level 1
__kmp_itt_metadata_single(id_ref);
}
#endif /* USE_ITT_BUILD */
@@ -920,6 +855,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
if (TCR_PTR(__kmp_threads[0]) == NULL) {
--capacity;
}
+ // If it is not for initializing the hidden helper team, we need to take
+ // __kmp_hidden_helper_threads_num out of the capacity because it is included
+ // in __kmp_threads_capacity.
+ if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+ capacity -= __kmp_hidden_helper_threads_num;
+ }
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
capacity) {
@@ -977,7 +918,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
KMP_MB();
- /* first, let's setup the master thread */
+ /* first, let's setup the primary thread */
master_th->th.th_info.ds.ds_tid = 0;
master_th->th.th_team = team;
master_th->th.th_team_nproc = team->t.t_nproc;
@@ -1022,7 +963,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
#endif
if (!use_hot_team) {
- /* install the master thread */
+ /* install the primary thread */
team->t.t_threads[0] = master_th;
__kmp_initialize_info(master_th, team, 0, master_gtid);
@@ -1085,7 +1026,7 @@ inline static void propagateFPControl(kmp_team_t *team) {
kmp_int16 x87_fpu_control_word;
kmp_uint32 mxcsr;
- // Get master values of FPU control flags (both X87 and vector)
+ // Get primary thread's values of FPU control flags (both X87 and vector)
__kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
__kmp_store_mxcsr(&mxcsr);
mxcsr &= KMP_X86_MXCSR_MASK;
@@ -1142,7 +1083,7 @@ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
int realloc); // forward declaration
/* Run a parallel region that has been serialized, so runs only in a team of the
- single master thread. */
+ single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
kmp_info_t *this_thr;
kmp_team_t *serial_team;
@@ -1190,7 +1131,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
- ompt_data_t *implicit_task_data;
void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
@@ -1372,7 +1312,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.ompt_team_info.master_return_address = codeptr;
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
- OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
+ OMPT_GET_FRAME_ADDRESS(0);
ompt_lw_taskteam_t lw_taskteam;
__ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
@@ -1382,18 +1323,19 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// don't use lw_taskteam after linking. content was swaped
/* OMPT implicit task begin */
- implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
- OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
- OMPT_CUR_TASK_INFO(this_thr)
- ->thread_num = __kmp_tid_from_gtid(global_tid);
+ OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+ OMPT_CUR_TASK_INFO(this_thr)->thread_num =
+ __kmp_tid_from_gtid(global_tid);
}
/* OMPT state */
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
- OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
+ OMPT_GET_FRAME_ADDRESS(0);
}
#endif
}
@@ -1466,6 +1408,9 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ // Assign affinity to root thread if it hasn't happened yet
+ __kmp_assign_root_init_mask();
+
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
// used to launch non-serial teams even if nested is not allowed
@@ -1527,6 +1472,10 @@ int __kmp_fork_call(ident_t *loc, int gtid,
return TRUE;
}
+#if OMPD_SUPPORT
+ parent_team->t.t_pkfn = microtask;
+#endif
+
#if OMPT_SUPPORT
void *dummy;
void **exit_frame_p;
@@ -1544,8 +1493,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
- OMPT_CUR_TASK_INFO(master_th)
- ->thread_num = __kmp_tid_from_gtid(gtid);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num =
+ __kmp_tid_from_gtid(gtid);
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1,
@@ -1570,7 +1519,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
,
exit_frame_p
#endif
- );
+ );
}
#if OMPT_SUPPORT
@@ -1646,6 +1595,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
parent_team->t.t_region_time = tmp_time;
}
if (__itt_stack_caller_create_ptr) {
+ KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
// create new stack stitching id before entering fork barrier
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
@@ -1662,12 +1612,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (call_context == fork_context_gnu)
return TRUE;
- /* Invoke microtask for MASTER thread */
+ /* Invoke microtask for PRIMARY thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
if (!parent_team->t.t_invoke(gtid)) {
- KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+ KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
@@ -1685,12 +1635,13 @@ int __kmp_fork_call(ident_t *loc, int gtid,
}
#endif
+ int enter_teams = 0;
if (parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels) {
nthreads = 1;
} else {
- int enter_teams = ((ap == NULL && active_level == 0) ||
- (ap && teams_level > 0 && teams_level == level));
+ enter_teams = ((ap == NULL && active_level == 0) ||
+ (ap && teams_level > 0 && teams_level == level));
nthreads =
master_set_numthreads
? master_set_numthreads
@@ -1749,13 +1700,17 @@ int __kmp_fork_call(ident_t *loc, int gtid,
__kmpc_serialized_parallel(loc, gtid);
+#if OMPD_SUPPORT
+ master_th->th.th_serial_team->t.t_pkfn = microtask;
+#endif
+
if (call_context == fork_context_intel) {
/* TODO this sucks, use the compiler itself to pass args! :) */
master_th->th.th_serial_team->t.t_ident = loc;
if (!ap) {
// revert change made in __kmpc_serialized_parallel()
master_th->th.th_serial_team->t.t_level--;
-// Get args from parent team for teams construct
+ // Get args from parent team for teams construct
#if OMPT_SUPPORT
void *dummy;
@@ -1774,8 +1729,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
task_info = OMPT_CUR_TASK_INFO(master_th);
exit_frame_p = &(task_info->frame.exit_frame.ptr);
if (ompt_enabled.ompt_callback_implicit_task) {
- OMPT_CUR_TASK_INFO(master_th)
- ->thread_num = __kmp_tid_from_gtid(gtid);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num =
+ __kmp_tid_from_gtid(gtid);
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
&(task_info->task_data), 1,
@@ -1799,7 +1754,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
,
exit_frame_p
#endif
- );
+ );
}
#if OMPT_SUPPORT
@@ -1889,8 +1844,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
ompt_task_implicit);
- OMPT_CUR_TASK_INFO(master_th)
- ->thread_num = __kmp_tid_from_gtid(gtid);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num =
+ __kmp_tid_from_gtid(gtid);
}
/* OMPT state */
@@ -1908,7 +1863,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
,
exit_frame_p
#endif
- );
+ );
}
#if OMPT_SUPPORT
@@ -2067,7 +2022,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
}
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
- // set master's schedule as new run-time schedule
+ // set primary thread's schedule as new run-time schedule
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
@@ -2075,20 +2030,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// Update the floating point rounding in the team if required.
propagateFPControl(team);
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_parallel_begin();
+#endif
if (__kmp_tasking_mode != tskm_immediate_exec) {
- // Set master's task team to team's task team. Unless this is hot team, it
- // should be NULL.
+ // Set primary thread's task team to team's task team. Unless this is hot
+ // team, it should be NULL.
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
parent_team->t.t_task_team[master_th->th.th_task_state]);
- KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
+ KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
"%p, new task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th),
master_th->th.th_task_team, parent_team,
team->t.t_task_team[master_th->th.th_task_state], team));
if (active_level || master_th->th.th_task_team) {
- // Take a memo of master's task_state
+ // Take a memo of primary thread's task_state
KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
if (master_th->th.th_task_state_top >=
master_th->th.th_task_state_stack_sz) { // increase size
@@ -2108,7 +2067,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
master_th->th.th_task_state_stack_sz = new_size;
__kmp_free(old_stack);
}
- // Store master's task_state on stack
+ // Store primary thread's task_state on stack
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top] =
master_th->th.th_task_state;
@@ -2117,7 +2076,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (master_th->th.th_hot_teams &&
active_level < __kmp_hot_teams_max_level &&
team == master_th->th.th_hot_teams[active_level].hot_team) {
- // Restore master's nested state if nested hot team
+ // Restore primary thread's nested state if nested hot team
master_th->th.th_task_state =
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top];
@@ -2209,13 +2168,22 @@ int __kmp_fork_call(ident_t *loc, int gtid,
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
- team->t.t_stack_id =
- __kmp_itt_stack_caller_create(); // create new stack stitching id
- // before entering fork barrier
+ // create new stack stitching id before entering fork barrier
+ if (!enter_teams) {
+ KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
+ team->t.t_stack_id = __kmp_itt_stack_caller_create();
+ } else if (parent_team->t.t_serialized) {
+ // keep stack stitching id in the serialized parent_team;
+ // current team will be used for parallel inside the teams;
+ // if parent_team is active, then it already keeps stack stitching id
+ // for the league of teams
+ KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
+ parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
+ }
}
#endif /* USE_ITT_BUILD */
- // AC: skip __kmp_internal_fork at teams construct, let only master
+ // AC: skip __kmp_internal_fork at teams construct, let only primary
// threads execute
if (ap) {
__kmp_internal_fork(loc, gtid, team);
@@ -2229,7 +2197,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
return TRUE;
}
- /* Invoke microtask for MASTER thread */
+ /* Invoke microtask for PRIMARY thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
} // END of timer KMP_fork_call block
@@ -2243,7 +2211,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
#endif
if (!team->t.t_invoke(gtid)) {
- KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+ KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
}
#if KMP_STATS_ENABLED
@@ -2258,7 +2226,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
-
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
@@ -2371,9 +2338,27 @@ void __kmp_join_call(ident_t *loc, int gtid
// AC: No barrier for internal teams at exit from teams construct.
// But there is barrier for external team (league).
__kmp_internal_join(loc, gtid, team);
+#if USE_ITT_BUILD
+ if (__itt_stack_caller_create_ptr) {
+ KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
+ // destroy the stack stitching id after join barrier
+ __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
+ team->t.t_stack_id = NULL;
+ }
+#endif
} else {
master_th->th.th_task_state =
0; // AC: no tasking in teams (out of any parallel)
+#if USE_ITT_BUILD
+ if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
+ KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
+ // destroy the stack stitching id on exit from the teams construct
+ // if parent_team is active, then the id will be destroyed later on
+ // by the primary thread of the league of teams
+ __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
+ parent_team->t.t_stack_id = NULL;
+ }
+#endif
}
KMP_MB();
@@ -2384,10 +2369,6 @@ void __kmp_join_call(ident_t *loc, int gtid
#endif
#if USE_ITT_BUILD
- if (__itt_stack_caller_create_ptr) {
- // destroy the stack stitching id after join barrier
- __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
- }
// Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
if (team->t.t_active_level == 1 &&
(!master_th->th.th_teams_microtask || /* not in teams construct */
@@ -2435,7 +2416,7 @@ void __kmp_join_call(ident_t *loc, int gtid
// Restore number of threads in the team if needed. This code relies on
// the proper adjustment of th_teams_size.nth after the fork in
- // __kmp_teams_master on each teams master in the case that
+ // __kmp_teams_master on each teams primary thread in the case that
// __kmp_reserve_threads reduced it.
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
int old_num = master_th->th.th_team_nproc;
@@ -2520,6 +2501,10 @@ void __kmp_join_call(ident_t *loc, int gtid
#endif // KMP_AFFINITY_SUPPORTED
master_th->th.th_def_allocator = team->t.t_def_allocator;
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_parallel_end();
+#endif
updateHWFPControl(team);
if (root->r.r_active != master_active)
@@ -2551,7 +2536,7 @@ void __kmp_join_call(ident_t *loc, int gtid
if (master_th->th.th_task_state_top >
0) { // Restore task state from memo stack
KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- // Remember master's state if we re-use this nested hot team
+ // Remember primary thread's state if we re-use this nested hot team
master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
master_th->th.th_task_state;
--master_th->th.th_task_state_top; // pop
@@ -2560,11 +2545,11 @@ void __kmp_join_call(ident_t *loc, int gtid
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top];
}
- // Copy the task team from the parent team to the master thread
+ // Copy the task team from the parent team to the primary thread
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
KA_TRACE(20,
- ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
+ ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
parent_team));
}
@@ -2655,7 +2640,7 @@ void __kmp_set_num_threads(int new_nth, int gtid) {
#if KMP_NESTED_HOT_TEAMS
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
- ) {
+ ) {
kmp_team_t *hot_team = root->r.r_hot_team;
int f;
@@ -2753,6 +2738,19 @@ int __kmp_get_max_active_levels(int gtid) {
return thread->th.th_current_task->td_icvs.max_active_levels;
}
+// nteams-var per-device ICV
+void __kmp_set_num_teams(int num_teams) {
+ if (num_teams > 0)
+ __kmp_nteams = num_teams;
+}
+int __kmp_get_max_teams(void) { return __kmp_nteams; }
+// teams-thread-limit-var per-device ICV
+void __kmp_set_teams_thread_limit(int limit) {
+ if (limit > 0)
+ __kmp_teams_thread_limit = limit;
+}
+int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
+
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
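The four new functions above back the OpenMP 5.1 per-device ICVs nteams-var and teams-thread-limit-var. Assuming the standard omp.h entry points (omp_set_num_teams and friends) forward to them, user code would exercise them like this:

    #include <omp.h>

    int main() {
      omp_set_num_teams(4);          // nteams-var: upper bound on league size
      omp_set_teams_thread_limit(8); // teams-thread-limit-var: per-team cap
    #pragma omp teams
      {
        // at most omp_get_max_teams() teams, each limited to
        // omp_get_teams_thread_limit() threads
      }
      return 0;
    }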
@@ -3172,6 +3170,9 @@ static void __kmp_initialize_root(kmp_root_t *root) {
root->r.r_active = FALSE;
root->r.r_in_parallel = 0;
root->r.r_blocktime = __kmp_dflt_blocktime;
+#if KMP_AFFINITY_SUPPORTED
+ root->r.r_affinity_assigned = FALSE;
+#endif
/* setup the root team for this task */
/* allocate the root team structure */
@@ -3186,7 +3187,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
#endif
__kmp_nested_proc_bind.bind_types[0], &r_icvs,
0 // argc
- USE_NESTED_HOT_ARG(NULL) // master thread is unknown
+ USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
);
#if USE_DEBUGGER
// Non-NULL value should be assigned to make the debugger display the root
@@ -3223,7 +3224,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
#endif
__kmp_nested_proc_bind.bind_types[0], &r_icvs,
0 // argc
- USE_NESTED_HOT_ARG(NULL) // master thread is unknown
+ USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
);
KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
@@ -3255,7 +3256,7 @@ typedef kmp_team_list_item_t *kmp_team_list_t;
static void __kmp_print_structure_team_accum( // Add team to list of teams.
kmp_team_list_t list, // List of teams.
kmp_team_p const *team // Team to add.
- ) {
+) {
// List must terminate with item where both entry and next are NULL.
// Team is added to the list only once.
@@ -3299,7 +3300,7 @@ static void __kmp_print_structure_team_accum( // Add team to list of teams.
static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
- ) {
+) {
__kmp_printf("%s", title);
if (team != NULL) {
__kmp_printf("%2x %p\n", team->t.t_id, team);
@@ -3358,7 +3359,7 @@ void __kmp_print_structure(void) {
__kmp_print_structure_team(" Serial Team: ",
thread->th.th_serial_team);
__kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
- __kmp_print_structure_thread(" Master: ",
+ __kmp_print_structure_thread(" Primary: ",
thread->th.th_team_master);
__kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
__kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
@@ -3406,7 +3407,7 @@ void __kmp_print_structure(void) {
int i;
__kmp_printf("Team %2x %p:\n", team->t.t_id, team);
__kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
- __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
+ __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
__kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
__kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
__kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
@@ -3522,9 +3523,9 @@ static int __kmp_expand_threads(int nNeed) {
kmp_info_t **newThreads;
kmp_root_t **newRoot;
-// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
-// resizing __kmp_threads does not need additional protection if foreign
-// threads are present
+ // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
+ // resizing __kmp_threads does not need additional protection if foreign
+ // threads are present
#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
/* only for Windows static library */
@@ -3632,6 +3633,13 @@ int __kmp_register_root(int initial_thread) {
--capacity;
}
+ // If it is not for initializing the hidden helper team, we need to take
+ // __kmp_hidden_helper_threads_num out of the capacity because it is included
+ // in __kmp_threads_capacity.
+ if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+ capacity -= __kmp_hidden_helper_threads_num;
+ }
+
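// Worked example with assumed numbers (not from this patch): if
// __kmp_threads_capacity == 64 and __kmp_hidden_helper_threads_num == 8,
// a regular root registration sees an effective capacity of 56, since the
// eight helper slots are already counted inside the total.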
/* see if there are too many threads */
if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
if (__kmp_tp_cached) {
@@ -3664,7 +3672,7 @@ int __kmp_register_root(int initial_thread) {
/* find an available thread slot */
// Don't reassign the zero slot since we need that to only be used by
// initial thread. Slots for hidden helper threads should also be skipped.
- if (initial_thread && __kmp_threads[0] == NULL) {
+ if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
gtid = 0;
} else {
for (gtid = __kmp_hidden_helper_threads_num + 1;
@@ -3775,7 +3783,7 @@ int __kmp_register_root(int initial_thread) {
__kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
TCW_4(__kmp_init_gtid, TRUE);
- /* prepare the master thread for get_gtid() */
+ /* prepare the primary thread for get_gtid() */
__kmp_gtid_set_specific(gtid);
#if USE_ITT_BUILD
@@ -3810,9 +3818,6 @@ int __kmp_register_root(int initial_thread) {
root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
- if (TCR_4(__kmp_init_middle)) {
- __kmp_affinity_set_init_mask(gtid, TRUE);
- }
#endif /* KMP_AFFINITY_SUPPORTED */
root_thread->th.th_def_allocator = __kmp_def_allocator;
root_thread->th.th_prev_level = 0;
@@ -3843,7 +3848,8 @@ int __kmp_register_root(int initial_thread) {
}
ompt_data_t *task_data;
ompt_data_t *parallel_data;
- __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+ __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
+ NULL);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
@@ -3852,6 +3858,10 @@ int __kmp_register_root(int initial_thread) {
ompt_set_thread_state(root_thread, ompt_state_work_serial);
}
#endif
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_thread_begin();
+#endif
KMP_MB();
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
@@ -3870,7 +3880,7 @@ static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
KMP_DEBUG_ASSERT(level < max_level);
kmp_team_t *team = hot_teams[level].hot_team;
nth = hot_teams[level].hot_team_nth;
- n = nth - 1; // master is not freed
+ n = nth - 1; // primary thread is not freed
if (level < max_level - 1) {
for (i = 0; i < nth; ++i) {
kmp_info_t *th = team->t.t_threads[i];
@@ -3935,10 +3945,16 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) {
__kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_thread_end();
+#endif
+
#if OMPT_SUPPORT
ompt_data_t *task_data;
ompt_data_t *parallel_data;
- __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+ __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
+ NULL);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
@@ -4065,12 +4081,12 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
/* this_thr->th.th_info.ds.ds_gtid is setup in
kmp_allocate_thread/create_worker.
this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
- kmp_info_t *master = team->t.t_threads[0];
KMP_DEBUG_ASSERT(this_thr != NULL);
KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
KMP_DEBUG_ASSERT(team);
KMP_DEBUG_ASSERT(team->t.t_threads);
KMP_DEBUG_ASSERT(team->t.t_dispatch);
+ kmp_info_t *master = team->t.t_threads[0];
KMP_DEBUG_ASSERT(master);
KMP_DEBUG_ASSERT(master->th.th_root);
@@ -4127,9 +4143,9 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_pri_head = NULL;
}
- if (this_thr != master && // Master's CG root is initialized elsewhere
+ if (this_thr != master && // Primary thread's CG root is initialized elsewhere
this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
- // Make new thread's CG root same as master's
+ // Make new thread's CG root same as primary thread's
KMP_DEBUG_ASSERT(master->th.th_cg_roots);
kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
if (tmp) {
@@ -4179,8 +4195,9 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
&dispatch->th_disp_buffer[team->t.t_max_nproc == 1
? 1
: __kmp_dispatch_num_buffers],
- disp_size, "th_%d.th_dispatch.th_disp_buffer "
- "(team_%d.t_dispatch[%d].th_disp_buffer)",
+ disp_size,
+ "th_%d.th_dispatch.th_disp_buffer "
+ "(team_%d.t_dispatch[%d].th_disp_buffer)",
gtid, team->t.t_id, gtid);
}
} else {
@@ -4309,11 +4326,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
// The reason is that if the library is loaded/unloaded in a loop with
// small (parallel) work in between, then there is high probability that
// monitor thread started after the library shutdown. At shutdown it is
- // too late to cope with the problem, because when the master is in
- // DllMain (process detach) the monitor has no chances to start (it is
- // blocked), and master has no means to inform the monitor that the
- // library has gone, because all the memory which the monitor can access
- // is going to be released/reset.
+ // too late to cope with the problem, because when the primary thread is
+ // in DllMain (process detach) the monitor has no chances to start (it is
+ // blocked), and primary thread has no means to inform the monitor that
+ // the library has gone, because all the memory which the monitor can
+ // access is going to be released/reset.
while (TCR_4(__kmp_init_monitor) < 2) {
KMP_YIELD(TRUE);
}
@@ -4383,7 +4400,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
__kmp_print_thread_storage_map(new_thr, new_gtid);
}
- // add the reserve serialized team, initialized from the team's master thread
+ // add the reserve serialized team, initialized from the team's primary thread
{
kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
@@ -4507,7 +4524,7 @@ static void __kmp_reinitialize_team(kmp_team_t *team,
KMP_CHECK_UPDATE(team->t.t_ident, loc);
KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
- // Copy ICVs to the master thread's implicit taskdata
+ // Copy ICVs to the primary thread's implicit taskdata
__kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
@@ -4593,11 +4610,14 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
#if KMP_AFFINITY_SUPPORTED
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
-// It calculates the worker + master thread's partition based upon the parent
+// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a thread in their partition.
-// The master thread's partition should already include its current binding.
+// The primary thread's partition should already include its current binding.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
- // Copy the master thread's place partition to the team struct
+ // Do not partition places for the hidden helper team
+ if (KMP_HIDDEN_HELPER_TEAM(team))
+ return;
+ // Copy the primary thread's place partition to the team struct
kmp_info_t *master_th = team->t.t_threads[0];
KMP_DEBUG_ASSERT(master_th != NULL);
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
@@ -4615,12 +4635,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
switch (proc_bind) {
case proc_bind_default:
- // serial teams might have the proc_bind policy set to proc_bind_default. It
- // doesn't matter, as we don't rebind master thread for any proc_bind policy
+ // Serial teams might have the proc_bind policy set to proc_bind_default.
+ // Not an issue -- we don't rebind primary thread for any proc_bind policy.
KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
break;
- case proc_bind_master: {
+ case proc_bind_primary: {
int f;
int n_th = team->t.t_nproc;
for (f = 1; f < n_th; f++) {
@@ -4634,7 +4654,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
team->t.t_display_affinity = 1;
}
- KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
+ KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
"partition = [%d,%d]\n",
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
f, masters_place, first_place, last_place));
@@ -5022,7 +5042,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
kmp_r_sched_t new_sched = new_icvs->sched;
- // set master's schedule as new run-time schedule
+ // set primary thread's schedule as new run-time schedule
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
__kmp_reinitialize_team(team, new_icvs,
@@ -5102,7 +5122,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
team->t.t_threads[f]->th.th_team_nproc = new_nproc;
}
- // restore the current task state of the master thread: should be the
+ // restore the current task state of the primary thread: should be the
// implicit task
KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
team->t.t_threads[0], team));
@@ -5172,10 +5192,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
- /* Temporarily set full mask for master thread before creation of
- workers. The reason is that workers inherit the affinity from master,
- so if a lot of workers are created on the single core quickly, they
- don't get a chance to set their own affinity for a long time. */
+ /* Temporarily set full mask for primary thread before creation of
+ workers. The reason is that workers inherit the affinity from the
+ primary thread, so if a lot of workers are created on the single
+ core quickly, they don't get a chance to set their own affinity for
+ a long time. */
__kmp_set_thread_affinity_mask_full_tmp(old_mask);
#endif
@@ -5208,7 +5229,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
if (KMP_AFFINITY_CAPABLE()) {
- /* Restore initial master thread's affinity mask */
+ /* Restore initial primary thread's affinity mask */
__kmp_set_system_affinity(old_mask, TRUE);
KMP_CPU_FREE(old_mask);
}
@@ -5231,15 +5252,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
if (level) { // set th_task_state for new threads in nested hot team
// __kmp_initialize_info() no longer zeroes th_task_state, so we should
// only need to set the th_task_state for the new threads. th_task_state
- // for master thread will not be accurate until after this in
- // __kmp_fork_call(), so we look to the master's memo_stack to get the
- // correct value.
+ // for primary thread will not be accurate until after this in
+ // __kmp_fork_call(), so we look to the primary thread's memo_stack to
+ // get the correct value.
for (f = old_nproc; f < team->t.t_nproc; ++f)
team->t.t_threads[f]->th.th_task_state =
team->t.t_threads[0]->th.th_task_state_memo_stack[level];
} else { // set th_task_state for new threads in non-nested hot team
- kmp_uint8 old_state =
- team->t.t_threads[0]->th.th_task_state; // copy master's state
+ // copy primary thread's state
+ kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
for (f = old_nproc; f < team->t.t_nproc; ++f)
team->t.t_threads[f]->th.th_task_state = old_state;
}
@@ -5540,7 +5561,7 @@ void __kmp_free_team(kmp_root_t *root,
/* TODO limit size of team pool, call reap_team if pool too large */
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
__kmp_team_pool = (volatile kmp_team_t *)team;
- } else { // Check if team was created for the masters in a teams construct
+ } else { // Check if team was created for primary threads in teams construct
// See if first worker is a CG root
KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
team->t.t_threads[1]->th.th_cg_roots);
@@ -5740,7 +5761,7 @@ void __kmp_free_thread(kmp_info_t *this_th) {
/* ------------------------------------------------------------------------ */
void *__kmp_launch_thread(kmp_info_t *this_thr) {
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
// TODO: add a configuration option for time granularity
if (ProfileTraceFile)
@@ -5758,8 +5779,13 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
}
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_thread_begin();
+#endif
+
#if OMPT_SUPPORT
- ompt_data_t *thread_data;
+ ompt_data_t *thread_data = nullptr;
if (ompt_enabled.enabled) {
thread_data = &(this_thr->th.ompt_thread_info.thread_data);
*thread_data = ompt_data_none;
@@ -5835,6 +5861,11 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
}
TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_thread_end();
+#endif
+
#if OMPT_SUPPORT
if (ompt_enabled.ompt_callback_thread_end) {
ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
@@ -5848,7 +5879,7 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
KMP_MB();
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
llvm::timeTraceProfilerFinishThread();
#endif
return this_thr;
@@ -5926,7 +5957,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
gtid));
/* Need release fence here to prevent seg faults for tree forkjoin barrier
* (GEH) */
- ANNOTATE_HAPPENS_BEFORE(thread);
kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
thread);
__kmp_release_64(&flag);
@@ -5965,7 +5995,7 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
--__kmp_all_nth;
-// __kmp_nth was decremented when thread is added to the pool.
+ // __kmp_nth was decremented when thread is added to the pool.
#ifdef KMP_ADJUST_BLOCKTIME
/* Adjust blocktime back to user setting or default if necessary */
@@ -6171,6 +6201,16 @@ void __kmp_internal_end_library(int gtid_req) {
return;
}
+ // If hidden helper team has been initialized, we need to deinit it
+ if (TCR_4(__kmp_init_hidden_helper) &&
+ !TCR_4(__kmp_hidden_helper_team_done)) {
+ TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
+ // First release the main thread to let it continue its work
+ __kmp_hidden_helper_main_thread_release();
+ // Wait until the hidden helper team has been destroyed
+ __kmp_hidden_helper_threads_deinitz_wait();
+ }
+
KMP_MB(); /* Flush all pending memory write invalidates. */
/* find out who we are and what we should do */
{
@@ -6284,7 +6324,8 @@ void __kmp_internal_end_thread(int gtid_req) {
}
// If hidden helper team has been initialized, we need to deinit it
- if (TCR_4(__kmp_init_hidden_helper)) {
+ if (TCR_4(__kmp_init_hidden_helper) &&
+ !TCR_4(__kmp_hidden_helper_team_done)) {
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
// First release the main thread to let it continue its work
__kmp_hidden_helper_main_thread_release();
@@ -6511,9 +6552,9 @@ void __kmp_register_library_startup(void) {
__kmp_str_split(tail, '-', &flag_val_str, &tail);
file_name = tail;
if (tail != NULL) {
- long *flag_addr = 0;
- long flag_val = 0;
- KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
+ unsigned long *flag_addr = 0;
+ unsigned long flag_val = 0;
+ KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
KMP_SSCANF(flag_val_str, "%lx", &flag_val);
if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
// First, check whether environment-encoded address is mapped into
@@ -6558,7 +6599,9 @@ void __kmp_register_library_startup(void) {
__kmp_env_unset(name);
#endif
} break;
- default: { KMP_DEBUG_ASSERT(0); } break;
+ default: {
+ KMP_DEBUG_ASSERT(0);
+ } break;
}
}
KMP_INTERNAL_FREE((void *)value);
@@ -6694,6 +6737,10 @@ static void __kmp_do_serial_initialize(void) {
#if OMPT_SUPPORT
ompt_pre_init();
#endif
+#if OMPD_SUPPORT
+ __kmp_env_dump();
+ ompd_init();
+#endif
__kmp_validate_locks();
@@ -6851,6 +6898,8 @@ static void __kmp_do_serial_initialize(void) {
__kmp_global.g.g_dynamic = FALSE;
__kmp_global.g.g_dynamic_mode = dynamic_default;
+ __kmp_init_nesting_mode();
+
__kmp_env_initialize(NULL);
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
@@ -6997,13 +7046,6 @@ static void __kmp_do_middle_initialize(void) {
// number of cores on the machine.
__kmp_affinity_initialize();
- // Run through the __kmp_threads array and set the affinity mask
- // for each root thread that is currently registered with the RTL.
- for (i = 0; i < __kmp_threads_capacity; i++) {
- if (TCR_PTR(__kmp_threads[i]) != NULL) {
- __kmp_affinity_set_init_mask(i, TRUE);
- }
- }
#endif /* KMP_AFFINITY_SUPPORTED */
KMP_ASSERT(__kmp_xproc > 0);
@@ -7043,6 +7085,9 @@ static void __kmp_do_middle_initialize(void) {
__kmp_dflt_team_nth = __kmp_sys_max_nth;
}
+ if (__kmp_nesting_mode > 0)
+ __kmp_set_nesting_mode_threads();
+
// There's no harm in continuing if the following check fails,
// but it indicates an error in the previous logic.
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
@@ -7122,6 +7167,7 @@ void __kmp_parallel_initialize(void) {
if (!__kmp_init_middle) {
__kmp_do_middle_initialize();
}
+ __kmp_assign_root_init_mask();
__kmp_resume_if_hard_paused();
/* begin initialization */
@@ -7247,9 +7293,14 @@ int __kmp_invoke_task_func(int gtid) {
__kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
- __kmp_itt_stack_callee_enter(
- (__itt_caller)
- team->t.t_stack_id); // inform ittnotify about entering user's code
+ // inform ittnotify about entering user's code
+ if (team->t.t_stack_id != NULL) {
+ __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
+ } else {
+ KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
+ __kmp_itt_stack_callee_enter(
+ (__itt_caller)team->t.t_parent->t.t_stack_id);
+ }
}
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
@@ -7264,8 +7315,8 @@ int __kmp_invoke_task_func(int gtid) {
int ompt_team_size;
if (ompt_enabled.enabled) {
- exit_frame_p = &(
- team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
+ exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
+ .ompt_task_info.frame.exit_frame.ptr);
} else {
exit_frame_p = &dummy;
}
@@ -7298,10 +7349,10 @@ int __kmp_invoke_task_func(int gtid) {
,
exit_frame_p
#endif
- );
+ );
#if OMPT_SUPPORT
*exit_frame_p = NULL;
- this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
+ this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
#endif
#if KMP_STATS_ENABLED
@@ -7313,9 +7364,14 @@ int __kmp_invoke_task_func(int gtid) {
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
- __kmp_itt_stack_callee_leave(
- (__itt_caller)
- team->t.t_stack_id); // inform ittnotify about leaving user's code
+ // inform ittnotify about leaving user's code
+ if (team->t.t_stack_id != NULL) {
+ __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
+ } else {
+ KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
+ __kmp_itt_stack_callee_leave(
+ (__itt_caller)team->t.t_parent->t.t_stack_id);
+ }
}
#endif /* USE_ITT_BUILD */
__kmp_run_after_invoked_task(gtid, tid, this_thr, team);
@@ -7324,7 +7380,7 @@ int __kmp_invoke_task_func(int gtid) {
}
void __kmp_teams_master(int gtid) {
- // This routine is called by all master threads in teams construct
+ // This routine is called by all primary threads in teams construct
kmp_info_t *thr = __kmp_threads[gtid];
kmp_team_t *team = thr->th.th_team;
ident_t *loc = team->t.t_ident;
@@ -7337,7 +7393,7 @@ void __kmp_teams_master(int gtid) {
// This thread is a new CG root. Set up the proper variables.
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
tmp->cg_root = thr; // Make thr the CG root
- // Init to thread limit that was stored when league masters were forked
+ // Init to thread limit stored when league primary threads were forked
tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
tmp->cg_nthreads = 1; // Init counter to one active thread, this one
KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
@@ -7412,36 +7468,22 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
thr->th.th_set_nproc = num_threads;
}
-/* this sets the requested number of teams for the teams region and/or
- the number of threads for the next parallel region encountered */
-void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
- int num_threads) {
- kmp_info_t *thr = __kmp_threads[gtid];
- KMP_DEBUG_ASSERT(num_teams >= 0);
- KMP_DEBUG_ASSERT(num_threads >= 0);
-
- if (num_teams == 0)
- num_teams = 1; // default number of teams is 1.
- if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
- if (!__kmp_reserve_warn) {
- __kmp_reserve_warn = 1;
- __kmp_msg(kmp_ms_warning,
- KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
- KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
- }
- num_teams = __kmp_teams_max_nth;
- }
- // Set number of teams (number of threads in the outer "parallel" of the
- // teams)
- thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
-
+static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
+ int num_threads) {
+ KMP_DEBUG_ASSERT(thr);
// Remember the number of threads for inner parallel regions
if (!TCR_4(__kmp_init_middle))
__kmp_middle_initialize(); // get internal globals calculated
+ __kmp_assign_root_init_mask();
KMP_DEBUG_ASSERT(__kmp_avail_proc);
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
+
if (num_threads == 0) {
- num_threads = __kmp_avail_proc / num_teams;
+ if (__kmp_teams_thread_limit > 0) {
+ num_threads = __kmp_teams_thread_limit;
+ } else {
+ num_threads = __kmp_avail_proc / num_teams;
+ }
// adjust num_threads w/o warning as it is not user setting
// num_threads = min(num_threads, nthreads-var, thread-limit-var)
// no thread_limit clause specified - do not change thread-limit-var ICV
@@ -7454,8 +7496,11 @@ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
if (num_teams * num_threads > __kmp_teams_max_nth) {
num_threads = __kmp_teams_max_nth / num_teams;
}
+ if (num_threads == 0) {
+ num_threads = 1;
+ }
} else {
- // This thread will be the master of the league masters
+ // This thread will be the primary thread of the league primary threads
// Store new thread limit; old limit is saved in th_cg_roots list
thr->th.th_current_task->td_icvs.thread_limit = num_threads;
// num_threads = min(num_threads, nthreads-var)
@@ -7464,11 +7509,16 @@ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
}
if (num_teams * num_threads > __kmp_teams_max_nth) {
int new_threads = __kmp_teams_max_nth / num_teams;
- if (!__kmp_reserve_warn) { // user asked for too many threads
- __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
- __kmp_msg(kmp_ms_warning,
- KMP_MSG(CantFormThrTeam, num_threads, new_threads),
- KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+ if (new_threads == 0) {
+ new_threads = 1;
+ }
+ if (new_threads != num_threads) {
+ if (!__kmp_reserve_warn) { // user asked for too many threads
+ __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
+ __kmp_msg(kmp_ms_warning,
+ KMP_MSG(CantFormThrTeam, num_threads, new_threads),
+ KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+ }
}
num_threads = new_threads;
}
@@ -7476,6 +7526,94 @@ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
thr->th.th_teams_size.nth = num_threads;
}
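// Condensed standalone sketch of the default computed above (hypothetical
// helper; the real path also folds in nthreads-var and thread-limit-var):
static int example_default_team_threads(int teams_thread_limit, int avail_proc,
                                        int num_teams, int teams_max_nth) {
  int num_threads = (teams_thread_limit > 0) ? teams_thread_limit
                                             : avail_proc / num_teams;
  if (num_teams * num_threads > teams_max_nth)
    num_threads = teams_max_nth / num_teams;
  if (num_threads == 0)
    num_threads = 1;
  return num_threads;
}
// e.g. example_default_team_threads(0, 64, 16, 256) == 4: with no explicit
// limit, each of the 16 teams gets 64 / 16 = 4 threads.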
+/* this sets the requested number of teams for the teams region and/or
+ the number of threads for the next parallel region encountered */
+void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
+ int num_threads) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ KMP_DEBUG_ASSERT(num_teams >= 0);
+ KMP_DEBUG_ASSERT(num_threads >= 0);
+
+ if (num_teams == 0) {
+ if (__kmp_nteams > 0) {
+ num_teams = __kmp_nteams;
+ } else {
+ num_teams = 1; // default number of teams is 1.
+ }
+ }
+ if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
+ if (!__kmp_reserve_warn) {
+ __kmp_reserve_warn = 1;
+ __kmp_msg(kmp_ms_warning,
+ KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
+ KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+ }
+ num_teams = __kmp_teams_max_nth;
+ }
+ // Set number of teams (number of threads in the outer "parallel" of the
+ // teams)
+ thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
+
+ __kmp_push_thread_limit(thr, num_teams, num_threads);
+}
+
+/* This sets the requested number of teams for the teams region and/or
+ the number of threads for the next parallel region encountered */
+void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
+ int num_teams_ub, int num_threads) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
+ KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
+ KMP_DEBUG_ASSERT(num_threads >= 0);
+
+ if (num_teams_lb > num_teams_ub) {
+ __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
+ KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
+ }
+
+  int num_teams = 1; // default number of teams is 1.
+
+ if (num_teams_lb == 0 && num_teams_ub > 0)
+ num_teams_lb = num_teams_ub;
+
+ if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
+ num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
+ if (num_teams > __kmp_teams_max_nth) {
+ if (!__kmp_reserve_warn) {
+ __kmp_reserve_warn = 1;
+ __kmp_msg(kmp_ms_warning,
+ KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
+ KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+ }
+ num_teams = __kmp_teams_max_nth;
+ }
+ } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
+ num_teams = num_teams_ub;
+ } else { // num_teams_lb <= num_teams <= num_teams_ub
+ if (num_threads == 0) {
+ if (num_teams_ub > __kmp_teams_max_nth) {
+ num_teams = num_teams_lb;
+ } else {
+ num_teams = num_teams_ub;
+ }
+ } else {
+ num_teams = (num_threads > __kmp_teams_max_nth)
+ ? num_teams
+ : __kmp_teams_max_nth / num_threads;
+ if (num_teams < num_teams_lb) {
+ num_teams = num_teams_lb;
+ } else if (num_teams > num_teams_ub) {
+ num_teams = num_teams_ub;
+ }
+ }
+ }
+ // Set number of teams (number of threads in the outer "parallel" of the
+ // teams)
+ thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
+
+ __kmp_push_thread_limit(thr, num_teams, num_threads);
+}
+
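// Standalone sketch of the bounds selection above (assumed inputs; the
// warning paths and nteams-var lookup are elided, so this is illustrative
// rather than authoritative):
static int example_select_num_teams(int lb, int ub, int num_threads,
                                    int teams_max_nth) {
  if (lb == 0 && ub == 0)
    return 1; // no num_teams clause; nteams-var/default handling elided
  if (lb == 0)
    lb = ub; // a sole upper bound acts as an exact request
  if (lb == ub)
    return ub;
  if (num_threads == 0) // no thread_limit: take ub unless it cannot fit
    return (ub > teams_max_nth) ? lb : ub;
  int teams = (num_threads > teams_max_nth) ? 1 : teams_max_nth / num_threads;
  if (teams < lb)
    teams = lb;
  else if (teams > ub)
    teams = ub;
  return teams;
}
// e.g. lb=2, ub=16, num_threads=8, teams_max_nth=64 -> 64 / 8 = 8 teams,
// already inside [2,16].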
// Set the proc_bind var to use in the following parallel region.
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
kmp_info_t *thr = __kmp_threads[gtid];
@@ -7535,7 +7673,7 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
KMP_ASSERT(KMP_MASTER_GTID(gtid));
KMP_MB(); /* Flush all pending memory write invalidates. */
-/* Join barrier after fork */
+ /* Join barrier after fork */
#ifdef KMP_DEBUG
if (__kmp_threads[gtid] &&
@@ -7579,7 +7717,8 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+ ompt_scope_end, NULL, task_data, 0, ds_tid,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
}
#endif
@@ -7604,10 +7743,10 @@ static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
}
hot_team = root->r.r_hot_team;
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
- return hot_team->t.t_nproc - 1; // Don't count master thread
+ return hot_team->t.t_nproc - 1; // Don't count primary thread
}
- // Skip the master thread - it is accounted for elsewhere.
+ // Skip the primary thread - it is accounted for elsewhere.
retval = 0;
for (i = 1; i < hot_team->t.t_nproc; i++) {
if (hot_team->t.t_threads[i]->th.th_active) {
@@ -7640,8 +7779,8 @@ static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
// Threads that are active in the thread pool, active in the hot team for this
// particular root (if we are at the outer par level), and the currently
- // executing thread (to become the master) are available to add to the new
- // team, but are currently contributing to the system load, and must be
+ // executing thread (to become the primary thread) are available to add to the
+ // new team, but are currently contributing to the system load, and must be
// accounted for.
pool_active = __kmp_thread_pool_active_nth;
hot_team_active = __kmp_active_hot_team_nproc(root);
@@ -7745,6 +7884,13 @@ void __kmp_cleanup(void) {
#else
__kmp_cleanup_user_locks();
#endif
+#if OMPD_SUPPORT
+ if (ompd_state) {
+ __kmp_free(ompd_env_block);
+ ompd_env_block = NULL;
+ ompd_env_block_size = 0;
+ }
+#endif
#if KMP_AFFINITY_SUPPORTED
KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
@@ -8009,7 +8155,7 @@ int __kmp_aux_get_num_teams() {
*
* Implementation-specific field types can be added
* If a type is unknown, print "undefined"
-*/
+ */
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
@@ -8204,7 +8350,7 @@ static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
* (not including null byte character)
* The resultant string is printed to buffer, which the caller can then
* handle afterwards
-*/
+ */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
kmp_str_buf_t *buffer) {
const char *parse_ptr;
@@ -8559,11 +8705,12 @@ void __kmp_omp_display_env(int verbose) {
// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
-kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
+kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
+kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
@@ -8611,3 +8758,89 @@ void __kmp_hidden_helper_threads_initz_routine() {
__kmp_hidden_helper_threads_deinitz_release();
}
+
+/* Nesting Mode:
+ Set via KMP_NESTING_MODE, which takes an integer.
+ Note: we skip duplicate topology levels, and skip levels with only
+ one entity.
+ KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
+ KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
+ in the topology, and initializes the number of threads at each of those
+ levels to the number of entities at each level, respectively, below the
+ entity at the parent level.
+ KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
+ but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
+   the user to turn nesting on explicitly. This option is even more
+   experimental than the feature itself, and may change or be removed in the
+   future.
+*/
+
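// Illustrative walk-through of the level-skipping rule (hypothetical
// 2-socket x 8-core x 2-thread machine; not taken from this patch):
static void example_nesting_levels() {
  int ratios[] = {2, 8, 2}; // entities per parent: sockets, cores, hw threads
  int nth_level[3];
  int loc = 0;
  for (int hw = 0; hw < 3; ++hw) {
    nth_level[loc] = ratios[hw];
    if (nth_level[loc] == 1)
      continue; // skip singleton levels, mirroring the loc-- further below
    ++loc;
  }
  // nth_level == {2, 8, 2}: with KMP_NESTING_MODE=1, nested parallel
  // regions default to 2, 8, and 2 threads at depths 0, 1, and 2.
  (void)nth_level;
}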
+// Allocate space to store nesting levels
+void __kmp_init_nesting_mode() {
+ int levels = KMP_HW_LAST;
+ __kmp_nesting_mode_nlevels = levels;
+ __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
+ for (int i = 0; i < levels; ++i)
+ __kmp_nesting_nth_level[i] = 0;
+ if (__kmp_nested_nth.size < levels) {
+ __kmp_nested_nth.nth =
+ (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
+ __kmp_nested_nth.size = levels;
+ }
+}
+
+// Set # threads for top levels of nesting; must be called after topology set
+void __kmp_set_nesting_mode_threads() {
+ kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
+
+ if (__kmp_nesting_mode == 1)
+ __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+ else if (__kmp_nesting_mode > 1)
+ __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
+
+ if (__kmp_topology) { // use topology info
+ int loc, hw_level;
+ for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
+ loc < __kmp_nesting_mode_nlevels;
+ loc++, hw_level++) {
+ __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
+ if (__kmp_nesting_nth_level[loc] == 1)
+ loc--;
+ }
+ // Make sure all cores are used
+ if (__kmp_nesting_mode > 1 && loc > 1) {
+ int core_level = __kmp_topology->get_level(KMP_HW_CORE);
+ int num_cores = __kmp_topology->get_count(core_level);
+ int upper_levels = 1;
+ for (int level = 0; level < loc - 1; ++level)
+ upper_levels *= __kmp_nesting_nth_level[level];
+ if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
+ __kmp_nesting_nth_level[loc - 1] =
+ num_cores / __kmp_nesting_nth_level[loc - 2];
+ }
+ __kmp_nesting_mode_nlevels = loc;
+ __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
} else { // no topology info available; provide a reasonable guesstimate
+ if (__kmp_avail_proc >= 4) {
+ __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
+ __kmp_nesting_nth_level[1] = 2;
+ __kmp_nesting_mode_nlevels = 2;
+ } else {
+ __kmp_nesting_nth_level[0] = __kmp_avail_proc;
+ __kmp_nesting_mode_nlevels = 1;
+ }
+ __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
+ }
+ for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
+ __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
+ }
+ set__nproc(thread, __kmp_nesting_nth_level[0]);
+ if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
+ __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
+ if (get__max_active_levels(thread) > 1) {
+ // if max levels was set, set nesting mode levels to same
+ __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
+ }
+ if (__kmp_nesting_mode == 1) // turn on nesting for this case only
+ set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
+}
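// Fallback example with assumed numbers (not from this patch): with no
// topology object and __kmp_avail_proc == 12, the guess is two levels
// {6, 2}; with fewer than 4 procs it collapses to a single level holding
// all of __kmp_avail_proc.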
diff --git a/openmp/runtime/src/kmp_safe_c_api.h b/openmp/runtime/src/kmp_safe_c_api.h
index abc0a16f87cf..3db1ada37b07 100644
--- a/openmp/runtime/src/kmp_safe_c_api.h
+++ b/openmp/runtime/src/kmp_safe_c_api.h
@@ -10,6 +10,7 @@
#ifndef KMP_SAFE_C_API_H
#define KMP_SAFE_C_API_H
+#include <type_traits>
#include "kmp_platform.h"
#include <string.h>
@@ -33,7 +34,15 @@
// Use this only when buffer size is unknown
#define KMP_MEMCPY(dst, src, cnt) memcpy_s(dst, cnt, src, cnt)
-#define KMP_STRLEN(str) strnlen_s(str, RSIZE_MAX_STR)
+template <typename T, bool B = std::is_array<T>::value>
+struct kmp_get_rmax_t {};
+template <typename T> struct kmp_get_rmax_t<T, false> {
+ static const size_t value = RSIZE_MAX_STR;
+};
+template <typename T> struct kmp_get_rmax_t<T, true> {
+ static const size_t value = sizeof(T);
+};
+#define KMP_STRLEN(str) strnlen_s(str, kmp_get_rmax_t<decltype(str)>::value)
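// Illustrative check of the trait above (assumes the secure-CRT build where
// RSIZE_MAX_STR is defined; a sketch, not part of the change itself):
static inline void example_kmp_strlen_bounds() {
  char buf[16];
  const char *p = "hello";
  static_assert(kmp_get_rmax_t<decltype(buf)>::value == sizeof(buf),
                "arrays are bounded by their own size");
  static_assert(kmp_get_rmax_t<decltype(p)>::value == RSIZE_MAX_STR,
                "pointers keep the RSIZE_MAX_STR bound");
  (void)buf;
  (void)p;
}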
// Use this only when buffer size is unknown
#define KMP_STRNCPY(dst, src, cnt) strncpy_s(dst, cnt, src, cnt)
diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
index 2d8f644c88b2..09e497e02914 100644
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -78,7 +78,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
,
void *codeptr
#endif
- ) {
+) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
@@ -167,6 +167,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
"signed?<%s>, loc = %%s\n",
traits_t<T>::spec, traits_t<T>::spec,
traits_t<ST>::spec, traits_t<T>::spec);
+ check_loc(loc);
KD_TRACE(100,
(buff, *plastiter, *plower, *pupper, *pstride, loc->psource));
__kmp_str_free(&buff);
@@ -301,7 +302,8 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
if (tid < trip_count) {
*pupper = *plower = *plower + tid * incr;
} else {
- *plower = *pupper + incr;
+ // set bounds so non-active threads execute no iterations
+ *plower = *pupper + (incr > 0 ? 1 : -1);
}
if (plastiter != NULL)
*plastiter = (tid == trip_count - 1);
@@ -345,15 +347,28 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
}
case kmp_sch_static_chunked: {
ST span;
- if (chunk < 1) {
+ UT nchunks;
+ if (chunk < 1)
chunk = 1;
- }
+ else if ((UT)chunk > trip_count)
+ chunk = trip_count;
+ nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0);
span = chunk * incr;
- *pstride = span * nth;
- *plower = *plower + (span * tid);
- *pupper = *plower + span - incr;
+ if (nchunks < nth) {
+ *pstride = span * nchunks;
+ if (tid < nchunks) {
+ *plower = *plower + (span * tid);
+ *pupper = *plower + span - incr;
+ } else {
+ *plower = *pupper + (incr > 0 ? 1 : -1);
+ }
+ } else {
+ *pstride = span * nth;
+ *plower = *plower + (span * tid);
+ *pupper = *plower + span - incr;
+ }
if (plastiter != NULL)
- *plastiter = (tid == ((trip_count - 1) / (UT)chunk) % nth);
+ *plastiter = (tid == (nchunks - 1) % nth);
break;
}
case kmp_sch_static_balanced_chunked: {
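// Standalone sketch of the new chunk clamping (hypothetical helper mirroring
// the arithmetic above; not the runtime's actual entry point):
static int example_nchunks(int trip_count, int chunk) {
  if (chunk < 1)
    chunk = 1;
  else if (chunk > trip_count)
    chunk = trip_count;
  return trip_count / chunk + (trip_count % chunk ? 1 : 0);
}
// example_nchunks(10, 4) == 3: with nth == 8 only tids 0..2 own a chunk,
// tids 3..7 get *plower > *pupper and run no iterations, and the last
// iteration lands on tid (3 - 1) % 8 == 2.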
@@ -508,8 +523,8 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
__kmp_static == kmp_sch_static_greedy ||
__kmp_static ==
kmp_sch_static_balanced); // Unknown static scheduling type.
- // only masters of some teams get single iteration, other threads get
- // nothing
+ // only primary threads of some teams get single iteration, other threads
+ // get nothing
if (team_id < trip_count && tid == 0) {
*pupper = *pupperDist = *plower = *plower + team_id * incr;
} else {
@@ -811,7 +826,7 @@ void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
,
OMPT_GET_RETURN_ADDRESS(0)
#endif
- );
+ );
}
/*!
@@ -828,7 +843,7 @@ void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
,
OMPT_GET_RETURN_ADDRESS(0)
#endif
- );
+ );
}
/*!
@@ -844,7 +859,7 @@ void __kmpc_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
,
OMPT_GET_RETURN_ADDRESS(0)
#endif
- );
+ );
}
/*!
@@ -861,7 +876,7 @@ void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
,
OMPT_GET_RETURN_ADDRESS(0)
#endif
- );
+ );
}
/*!
@}
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index a8522130f972..f287c27f29a5 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -25,6 +25,9 @@
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include <ctype.h> // toupper()
+#if OMPD_SUPPORT
+#include "ompd-specific.h"
+#endif
static int __kmp_env_toPrint(char const *name, int flag);
@@ -223,11 +226,11 @@ static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b,
++a;
++b;
}
- return *a
- ? (*b && *b != sentinel)
- ? (int)(unsigned char)*a - (int)(unsigned char)*b
- : 1
- : (*b && *b != sentinel) ? -1 : 0;
+ return *a ? (*b && *b != sentinel)
+ ? (int)(unsigned char)*a - (int)(unsigned char)*b
+ : 1
+ : (*b && *b != sentinel) ? -1
+ : 0;
}
// =============================================================================
@@ -272,7 +275,7 @@ static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found.
char const *name, // Name of variable.
char const *value, // Value of the variable.
kmp_setting_t **rivals // List of rival settings (must include current one).
- );
+);
// -----------------------------------------------------------------------------
// Helper parse functions.
@@ -367,7 +370,7 @@ static void __kmp_stg_parse_int(
int min, // I: Minimum allowed value.
int max, // I: Maximum allowed value.
int *out // O: Output (parsed) value.
- ) {
+) {
char const *msg = NULL;
kmp_uint64 uint = *out;
__kmp_str_to_uint(value, &uint, &msg);
@@ -504,9 +507,10 @@ int __kmp_initial_threads_capacity(int req_nproc) {
nth = (4 * __kmp_xproc);
// If hidden helper task is enabled, we initialize the thread capacity with
- // extra
- // __kmp_hidden_helper_threads_num.
- nth += __kmp_hidden_helper_threads_num;
+ // extra __kmp_hidden_helper_threads_num.
+ if (__kmp_enable_hidden_helper) {
+ nth += __kmp_hidden_helper_threads_num;
+ }
if (nth > __kmp_max_nth)
nth = __kmp_max_nth;
@@ -634,6 +638,33 @@ static void __kmp_stg_print_thread_limit(kmp_str_buf_t *buffer,
} // __kmp_stg_print_thread_limit
// -----------------------------------------------------------------------------
+// OMP_NUM_TEAMS
+static void __kmp_stg_parse_nteams(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_nteams);
+ K_DIAG(1, ("__kmp_nteams == %d\n", __kmp_nteams));
+} // __kmp_stg_parse_nteams
+
+static void __kmp_stg_print_nteams(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_nteams);
+} // __kmp_stg_print_nteams
+
+// -----------------------------------------------------------------------------
+// OMP_TEAMS_THREAD_LIMIT
+static void __kmp_stg_parse_teams_th_limit(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth,
+ &__kmp_teams_thread_limit);
+ K_DIAG(1, ("__kmp_teams_thread_limit == %d\n", __kmp_teams_thread_limit));
+} // __kmp_stg_parse_teams_th_limit
+
+static void __kmp_stg_print_teams_th_limit(kmp_str_buf_t *buffer,
+ char const *name, void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_teams_thread_limit);
+} // __kmp_stg_print_teams_th_limit
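+// Example environment (hypothetical values, not from this patch):
+//   OMP_NUM_TEAMS=4 OMP_TEAMS_THREAD_LIMIT=2 ./app
+// yields __kmp_nteams == 4 and __kmp_teams_thread_limit == 2, which
+// __kmp_push_num_teams() then consumes when no num_teams clause is given.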
+
+// -----------------------------------------------------------------------------
// KMP_TEAMS_THREAD_LIMIT
static void __kmp_stg_parse_teams_thread_limit(char const *name,
char const *value, void *data) {
@@ -883,7 +914,7 @@ static void __kmp_stg_parse_stackpad(char const *name, char const *value,
KMP_MIN_STKPADDING, // Min value
KMP_MAX_STKPADDING, // Max value
&__kmp_stkpadding // Var to initialize
- );
+ );
} // __kmp_stg_parse_stackpad
static void __kmp_stg_print_stackpad(kmp_str_buf_t *buffer, char const *name,
@@ -988,6 +1019,28 @@ static void __kmp_stg_print_warnings(kmp_str_buf_t *buffer, char const *name,
} // __kmp_stg_print_warnings
// -----------------------------------------------------------------------------
+// KMP_NESTING_MODE
+
+static void __kmp_stg_parse_nesting_mode(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_nesting_mode);
+#if KMP_AFFINITY_SUPPORTED && KMP_USE_HWLOC
+ if (__kmp_nesting_mode > 0)
+ __kmp_affinity_top_method = affinity_top_method_hwloc;
+#endif
+} // __kmp_stg_parse_nesting_mode
+
+static void __kmp_stg_print_nesting_mode(kmp_str_buf_t *buffer,
+ char const *name, void *data) {
+ if (__kmp_env_format) {
+ KMP_STR_BUF_PRINT_NAME;
+ } else {
+ __kmp_str_buf_print(buffer, " %s", name);
+ }
+ __kmp_str_buf_print(buffer, "=%d\n", __kmp_nesting_mode);
+} // __kmp_stg_print_nesting_mode
+
+// -----------------------------------------------------------------------------
// OMP_NESTED, OMP_NUM_THREADS
static void __kmp_stg_parse_nested(char const *name, char const *value,
@@ -1362,7 +1415,8 @@ static void __kmp_stg_parse_disp_buffers(char const *name, char const *value,
KMP_WARNING(EnvSerialWarn, name);
return;
} // read value before serial initialization only
- __kmp_stg_parse_int(name, value, 1, KMP_MAX_NTH, &__kmp_dispatch_num_buffers);
+ __kmp_stg_parse_int(name, value, KMP_MIN_DISP_NUM_BUFF, KMP_MAX_DISP_NUM_BUFF,
+ &__kmp_dispatch_num_buffers);
} // __kmp_stg_parse_disp_buffers
static void __kmp_stg_print_disp_buffers(kmp_str_buf_t *buffer,
@@ -1685,6 +1739,7 @@ static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer,
__kmp_str_buf_print(buffer, " %s='",
__kmp_barrier_pattern_env_name[i]);
}
+ KMP_DEBUG_ASSERT(j < bs_last_barrier && k < bs_last_barrier);
__kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[j],
__kmp_barrier_pattern_name[k]);
}
@@ -2039,9 +2094,9 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
enum affinity_type *out_type,
char **out_proclist, int *out_verbose,
int *out_warn, int *out_respect,
- enum affinity_gran *out_gran,
- int *out_gran_levels, int *out_dups,
- int *out_compact, int *out_offset) {
+ kmp_hw_t *out_gran, int *out_gran_levels,
+ int *out_dups, int *out_compact,
+ int *out_offset) {
char *buffer = NULL; // Copy of env var value.
char *buf = NULL; // Buffer for strtok_r() function.
char *next = NULL; // end of token / start of next.
@@ -2057,6 +2112,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
int respect = 0;
int gran = 0;
int dups = 0;
+ bool set = false;
KMP_ASSERT(value != NULL);
@@ -2202,42 +2258,51 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
SKIP_WS(next);
buf = next;
- if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_fine, -1);
- buf = next;
- } else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_thread, -1);
- buf = next;
- } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_core, -1);
- buf = next;
-#if KMP_USE_HWLOC
- } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_tile, -1);
- buf = next;
-#endif
- } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_package, -1);
- buf = next;
- } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_node, -1);
- buf = next;
+
+ // Try any hardware topology type for granularity
+ KMP_FOREACH_HW_TYPE(type) {
+ const char *name = __kmp_hw_get_keyword(type);
+ if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
+ set_gran(type, -1);
+ buf = next;
+ set = true;
+ break;
+ }
+ }
+ if (!set) {
+ // Support older names for different granularity layers
+ if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) {
+ set_gran(KMP_HW_THREAD, -1);
+ buf = next;
+ set = true;
+ } else if (__kmp_match_str("package", buf,
+ CCAST(const char **, &next))) {
+ set_gran(KMP_HW_SOCKET, -1);
+ buf = next;
+ set = true;
+ } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) {
+ set_gran(KMP_HW_NUMA, -1);
+ buf = next;
+ set = true;
#if KMP_GROUP_AFFINITY
- } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) {
- set_gran(affinity_gran_group, -1);
- buf = next;
+ } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) {
+ set_gran(KMP_HW_PROC_GROUP, -1);
+ buf = next;
+ set = true;
#endif /* KMP_GROUP_AFFINITY */
- } else if ((*buf >= '0') && (*buf <= '9')) {
- int n;
- next = buf;
- SKIP_DIGITS(next);
- n = __kmp_str_to_int(buf, *next);
- KMP_ASSERT(n >= 0);
- buf = next;
- set_gran(affinity_gran_default, n);
- } else {
- EMIT_WARN(TRUE, (AffInvalidParam, name, start));
- continue;
+ } else if ((*buf >= '0') && (*buf <= '9')) {
+ int n;
+ next = buf;
+ SKIP_DIGITS(next);
+ n = __kmp_str_to_int(buf, *next);
+ KMP_ASSERT(n >= 0);
+ buf = next;
+ set_gran(KMP_HW_UNKNOWN, n);
+ set = true;
+ } else {
+ EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+ continue;
+ }
}
} else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) {
char *temp_proclist;
@@ -2344,20 +2409,20 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
*out_offset = number[1];
}
- if (__kmp_affinity_gran == affinity_gran_default) {
+ if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine");
}
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
} else
#endif
{
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core");
}
- __kmp_affinity_gran = affinity_gran_core;
+ __kmp_affinity_gran = KMP_HW_CORE;
}
}
} break;
@@ -2394,7 +2459,9 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
KMP_WARNING(AffNoParam, name, "default");
}
} break;
- default: { KMP_ASSERT(0); }
+ default: {
+ KMP_ASSERT(0);
+ }
}
} // __kmp_parse_affinity_env
@@ -2440,31 +2507,8 @@ static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
} else {
__kmp_str_buf_print(buffer, "%s,", "norespect");
}
- switch (__kmp_affinity_gran) {
- case affinity_gran_default:
- __kmp_str_buf_print(buffer, "%s", "granularity=default,");
- break;
- case affinity_gran_fine:
- __kmp_str_buf_print(buffer, "%s", "granularity=fine,");
- break;
- case affinity_gran_thread:
- __kmp_str_buf_print(buffer, "%s", "granularity=thread,");
- break;
- case affinity_gran_core:
- __kmp_str_buf_print(buffer, "%s", "granularity=core,");
- break;
- case affinity_gran_package:
- __kmp_str_buf_print(buffer, "%s", "granularity=package,");
- break;
- case affinity_gran_node:
- __kmp_str_buf_print(buffer, "%s", "granularity=node,");
- break;
-#if KMP_GROUP_AFFINITY
- case affinity_gran_group:
- __kmp_str_buf_print(buffer, "%s", "granularity=group,");
- break;
-#endif /* KMP_GROUP_AFFINITY */
- }
+ __kmp_str_buf_print(buffer, "granularity=%s,",
+ __kmp_hw_get_keyword(__kmp_affinity_gran, false));
}
if (!KMP_AFFINITY_CAPABLE()) {
__kmp_str_buf_print(buffer, "%s", "disabled");
@@ -2536,7 +2580,7 @@ static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
// GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=...
__kmp_affinity_proclist = temp_proclist;
__kmp_affinity_type = affinity_explicit;
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
} else {
KMP_WARNING(AffSyntaxError, name);
@@ -2573,6 +2617,11 @@ signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
+// Warning to issue for syntax error during parsing of OMP_PLACES
+static inline void __kmp_omp_places_syntax_warn(const char *var) {
+ KMP_WARNING(SyntaxErrorUsing, var, "\"cores\"");
+}
+
static int __kmp_parse_subplace_list(const char *var, const char **scan) {
const char *next;
@@ -2584,7 +2633,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
//
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2603,7 +2652,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2611,7 +2660,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
// Read count parameter
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2630,7 +2679,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2652,7 +2701,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
}
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2672,7 +2721,7 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
@@ -2689,7 +2738,7 @@ static int __kmp_parse_place(const char *var, const char **scan) {
return FALSE;
}
if (**scan != '}') {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip '}'
@@ -2703,7 +2752,7 @@ static int __kmp_parse_place(const char *var, const char **scan) {
KMP_ASSERT(proc >= 0);
*scan = next;
} else {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
@@ -2731,7 +2780,7 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2739,7 +2788,7 @@ static int __kmp_parse_place_list(const char *var, const char *env,
// Read count parameter
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2758,7 +2807,7 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2780,7 +2829,7 @@ static int __kmp_parse_place_list(const char *var, const char *env,
}
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2800,7 +2849,7 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
- KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+ __kmp_omp_places_syntax_warn(var);
return FALSE;
}
@@ -2816,10 +2865,20 @@ static int __kmp_parse_place_list(const char *var, const char *env,
static void __kmp_stg_parse_places(char const *name, char const *value,
void *data) {
+ struct kmp_place_t {
+ const char *name;
+ kmp_hw_t type;
+ };
int count;
+ bool set = false;
const char *scan = value;
const char *next = scan;
const char *kind = "\"threads\"";
+ kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD},
+ {"cores", KMP_HW_CORE},
+ {"numa_domains", KMP_HW_NUMA},
+ {"ll_caches", KMP_HW_LLC},
+ {"sockets", KMP_HW_SOCKET}};
kmp_setting_t **rivals = (kmp_setting_t **)data;
int rc;
@@ -2828,55 +2887,57 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
return;
}
- // If OMP_PROC_BIND is not specified but OMP_PLACES is,
- // then let OMP_PROC_BIND default to true.
- if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
- __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+ // Standard choices
+ for (size_t i = 0; i < sizeof(std_places) / sizeof(std_places[0]); ++i) {
+ const kmp_place_t &place = std_places[i];
+ if (__kmp_match_str(place.name, scan, &next)) {
+ scan = next;
+ __kmp_affinity_type = affinity_compact;
+ __kmp_affinity_gran = place.type;
+ __kmp_affinity_dups = FALSE;
+ set = true;
+ break;
+ }
}
-
- //__kmp_affinity_num_places = 0;
-
- if (__kmp_match_str("threads", scan, &next)) {
- scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = affinity_gran_thread;
- __kmp_affinity_dups = FALSE;
- kind = "\"threads\"";
- } else if (__kmp_match_str("cores", scan, &next)) {
- scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = affinity_gran_core;
- __kmp_affinity_dups = FALSE;
- kind = "\"cores\"";
-#if KMP_USE_HWLOC
- } else if (__kmp_match_str("tiles", scan, &next)) {
- scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = affinity_gran_tile;
- __kmp_affinity_dups = FALSE;
- kind = "\"tiles\"";
-#endif
- } else if (__kmp_match_str("sockets", scan, &next)) {
- scan = next;
- __kmp_affinity_type = affinity_compact;
- __kmp_affinity_gran = affinity_gran_package;
- __kmp_affinity_dups = FALSE;
- kind = "\"sockets\"";
- } else {
+ // Implementation choices for OMP_PLACES based on internal types
+ if (!set) {
+ KMP_FOREACH_HW_TYPE(type) {
+ const char *name = __kmp_hw_get_keyword(type, true);
+ if (__kmp_match_str("unknowns", scan, &next))
+ continue;
+ if (__kmp_match_str(name, scan, &next)) {
+ scan = next;
+ __kmp_affinity_type = affinity_compact;
+ __kmp_affinity_gran = type;
+ __kmp_affinity_dups = FALSE;
+ set = true;
+ break;
+ }
+ }
+ }
+ if (!set) {
if (__kmp_affinity_proclist != NULL) {
KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist);
__kmp_affinity_proclist = NULL;
}
if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) {
__kmp_affinity_type = affinity_explicit;
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
+ __kmp_affinity_dups = FALSE;
+ } else {
+ // Syntax error fallback
+ __kmp_affinity_type = affinity_compact;
+ __kmp_affinity_gran = KMP_HW_CORE;
__kmp_affinity_dups = FALSE;
- if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
- __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
- }
+ }
+ if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+ __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
}
return;
}
+ if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
+ kind = __kmp_hw_get_keyword(__kmp_affinity_gran);
+ }
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
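The rewritten parser above makes OMP_PLACES table-driven: the five standard keywords map straight to internal topology types, and any other keyword the runtime knows (via KMP_FOREACH_HW_TYPE) is accepted as an extension. A minimal sketch of the same table-driven matching, with illustrative names rather than the runtime's:

#include <cstdio>
#include <cstring>

enum hw_t { HW_THREAD, HW_CORE, HW_NUMA, HW_LLC, HW_SOCKET, HW_UNKNOWN };
struct place_t {
  const char *name;
  hw_t type;
};

// Table-driven matching of a leading keyword, as in the std_places loop.
static hw_t match_place(const char *s) {
  static const place_t places[] = {{"threads", HW_THREAD},
                                   {"cores", HW_CORE},
                                   {"numa_domains", HW_NUMA},
                                   {"ll_caches", HW_LLC},
                                   {"sockets", HW_SOCKET}};
  for (const place_t &p : places)
    if (strncmp(s, p.name, strlen(p.name)) == 0)
      return p.type;
  return HW_UNKNOWN;
}

int main() { printf("%d\n", (int)match_place("ll_caches(2)")); }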
@@ -2941,31 +3002,12 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
} else {
num = 0;
}
- if (__kmp_affinity_gran == affinity_gran_thread) {
- if (num > 0) {
- __kmp_str_buf_print(buffer, "='threads(%d)'\n", num);
- } else {
- __kmp_str_buf_print(buffer, "='threads'\n");
- }
- } else if (__kmp_affinity_gran == affinity_gran_core) {
- if (num > 0) {
- __kmp_str_buf_print(buffer, "='cores(%d)' \n", num);
- } else {
- __kmp_str_buf_print(buffer, "='cores'\n");
- }
-#if KMP_USE_HWLOC
- } else if (__kmp_affinity_gran == affinity_gran_tile) {
+ if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
+ const char *name = __kmp_hw_get_keyword(__kmp_affinity_gran, true);
if (num > 0) {
- __kmp_str_buf_print(buffer, "='tiles(%d)' \n", num);
+ __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num);
} else {
- __kmp_str_buf_print(buffer, "='tiles'\n");
- }
-#endif
- } else if (__kmp_affinity_gran == affinity_gran_package) {
- if (num > 0) {
- __kmp_str_buf_print(buffer, "='sockets(%d)'\n", num);
- } else {
- __kmp_str_buf_print(buffer, "='sockets'\n");
+ __kmp_str_buf_print(buffer, "='%s'\n", name);
}
} else {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
@@ -2986,28 +3028,38 @@ static void __kmp_stg_parse_topology_method(char const *name, char const *value,
}
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- else if (__kmp_str_match("x2apic id", 9, value) ||
- __kmp_str_match("x2apic_id", 9, value) ||
- __kmp_str_match("x2apic-id", 9, value) ||
- __kmp_str_match("x2apicid", 8, value) ||
- __kmp_str_match("cpuid leaf 11", 13, value) ||
- __kmp_str_match("cpuid_leaf_11", 13, value) ||
- __kmp_str_match("cpuid-leaf-11", 13, value) ||
- __kmp_str_match("cpuid leaf11", 12, value) ||
- __kmp_str_match("cpuid_leaf11", 12, value) ||
- __kmp_str_match("cpuid-leaf11", 12, value) ||
- __kmp_str_match("cpuidleaf 11", 12, value) ||
- __kmp_str_match("cpuidleaf_11", 12, value) ||
- __kmp_str_match("cpuidleaf-11", 12, value) ||
- __kmp_str_match("cpuidleaf11", 11, value) ||
- __kmp_str_match("cpuid 11", 8, value) ||
- __kmp_str_match("cpuid_11", 8, value) ||
- __kmp_str_match("cpuid-11", 8, value) ||
- __kmp_str_match("cpuid11", 7, value) ||
- __kmp_str_match("leaf 11", 7, value) ||
- __kmp_str_match("leaf_11", 7, value) ||
- __kmp_str_match("leaf-11", 7, value) ||
- __kmp_str_match("leaf11", 6, value)) {
+ else if (__kmp_str_match("cpuid_leaf31", 12, value) ||
+ __kmp_str_match("cpuid 1f", 8, value) ||
+ __kmp_str_match("cpuid 31", 8, value) ||
+ __kmp_str_match("cpuid1f", 7, value) ||
+ __kmp_str_match("cpuid31", 7, value) ||
+ __kmp_str_match("leaf 1f", 7, value) ||
+ __kmp_str_match("leaf 31", 7, value) ||
+ __kmp_str_match("leaf1f", 6, value) ||
+ __kmp_str_match("leaf31", 6, value)) {
+ __kmp_affinity_top_method = affinity_top_method_x2apicid_1f;
+ } else if (__kmp_str_match("x2apic id", 9, value) ||
+ __kmp_str_match("x2apic_id", 9, value) ||
+ __kmp_str_match("x2apic-id", 9, value) ||
+ __kmp_str_match("x2apicid", 8, value) ||
+ __kmp_str_match("cpuid leaf 11", 13, value) ||
+ __kmp_str_match("cpuid_leaf_11", 13, value) ||
+ __kmp_str_match("cpuid-leaf-11", 13, value) ||
+ __kmp_str_match("cpuid leaf11", 12, value) ||
+ __kmp_str_match("cpuid_leaf11", 12, value) ||
+ __kmp_str_match("cpuid-leaf11", 12, value) ||
+ __kmp_str_match("cpuidleaf 11", 12, value) ||
+ __kmp_str_match("cpuidleaf_11", 12, value) ||
+ __kmp_str_match("cpuidleaf-11", 12, value) ||
+ __kmp_str_match("cpuidleaf11", 11, value) ||
+ __kmp_str_match("cpuid 11", 8, value) ||
+ __kmp_str_match("cpuid_11", 8, value) ||
+ __kmp_str_match("cpuid-11", 8, value) ||
+ __kmp_str_match("cpuid11", 7, value) ||
+ __kmp_str_match("leaf 11", 7, value) ||
+ __kmp_str_match("leaf_11", 7, value) ||
+ __kmp_str_match("leaf-11", 7, value) ||
+ __kmp_str_match("leaf11", 6, value)) {
__kmp_affinity_top_method = affinity_top_method_x2apicid;
} else if (__kmp_str_match("apic id", 7, value) ||
__kmp_str_match("apic_id", 7, value) ||
@@ -3064,8 +3116,12 @@ static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer,
break;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case affinity_top_method_x2apicid_1f:
+ value = "x2APIC id leaf 0x1f";
+ break;
+
case affinity_top_method_x2apicid:
- value = "x2APIC id";
+ value = "x2APIC id leaf 0xb";
break;
case affinity_top_method_apicid:
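Leaf 0x1f is the extended topology enumeration that supersedes leaf 0xb on newer processors, which is why the new affinity_top_method_x2apicid_1f spellings are matched before the 0xb ones. As background, a sketch of probing for the leaf with GCC/Clang's cpuid.h (this is not the patch's code; the EBX-at-subleaf-0 validity convention is from the Intel SDM):

#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
    return 1;
  unsigned max_leaf = eax; // leaf 0 reports the highest standard leaf
  if (max_leaf >= 0x1f &&
      __get_cpuid_count(0x1f, 0, &eax, &ebx, &ecx, &edx) && ebx != 0)
    printf("use x2APIC topology leaf 0x1f\n");
  else if (max_leaf >= 0xb)
    printf("fall back to x2APIC leaf 0xb\n");
  return 0;
}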
@@ -3187,11 +3243,12 @@ static void __kmp_stg_parse_proc_bind(char const *name, char const *value,
for (;;) {
enum kmp_proc_bind_t bind;
- if ((num == (int)proc_bind_master) ||
- __kmp_match_str("master", buf, &next)) {
+ if ((num == (int)proc_bind_primary) ||
+ __kmp_match_str("master", buf, &next) ||
+ __kmp_match_str("primary", buf, &next)) {
buf = next;
SKIP_WS(buf);
- bind = proc_bind_master;
+ bind = proc_bind_primary;
} else if ((num == (int)proc_bind_close) ||
__kmp_match_str("close", buf, &next)) {
buf = next;
@@ -3259,8 +3316,8 @@ static void __kmp_stg_print_proc_bind(kmp_str_buf_t *buffer, char const *name,
__kmp_str_buf_print(buffer, "true");
break;
- case proc_bind_master:
- __kmp_str_buf_print(buffer, "master");
+ case proc_bind_primary:
+ __kmp_str_buf_print(buffer, "primary");
break;
case proc_bind_close:
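Both spellings now parse to proc_bind_primary, and printing reports "primary"; OMP_PROC_BIND=master keeps working as a deprecated alias. A quick way to observe the parsed policy (illustrative test, compile with -fopenmp):

#include <omp.h>
#include <cstdio>

int main() {
  // Run as: OMP_PROC_BIND=primary ./a.out (or the deprecated =master);
  // both report the same omp_proc_bind_t value.
  printf("proc_bind = %d\n", (int)omp_get_proc_bind());
}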
@@ -3355,7 +3412,8 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
ntraits++;
}
}
- omp_alloctrait_t traits[ntraits];
+ omp_alloctrait_t *traits =
+ (omp_alloctrait_t *)KMP_ALLOCA(ntraits * sizeof(omp_alloctrait_t));
// Helper macros
#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
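The replaced declaration `omp_alloctrait_t traits[ntraits]` was a variable-length array, a C99 feature that C++ compilers accept only as an extension; KMP_ALLOCA presumably wraps alloca/_alloca. A sketch of the equivalent pattern under that assumption (MY_ALLOCA is a stand-in name):

#include <cstddef>
#if defined(_WIN32)
#include <malloc.h>
#define MY_ALLOCA _alloca
#else
#include <alloca.h>
#define MY_ALLOCA alloca
#endif

struct alloctrait_t {
  int key;
  long value;
};

void parse_traits(size_t ntraits) {
  // Runtime-sized stack allocation without a VLA; released automatically
  // on return. MY_ALLOCA stands in for the runtime's KMP_ALLOCA wrapper.
  alloctrait_t *traits =
      (alloctrait_t *)MY_ALLOCA(ntraits * sizeof(alloctrait_t));
  (void)traits;
}

int main() { parse_traits(8); }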
@@ -3930,8 +3988,11 @@ static const char *__kmp_parse_single_omp_schedule(const char *name,
else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim))
sched = kmp_sch_static;
#if KMP_STATIC_STEAL_ENABLED
- else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim))
- sched = kmp_sch_static_steal;
+ else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) {
+ // replace static_steal with dynamic to better cope with ordered loops
+ sched = kmp_sch_dynamic_chunked;
+ sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic;
+ }
#endif
else {
// If there is no proper schedule kind, then this schedule is invalid
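So a user-visible OMP_SCHEDULE="static_steal,64" now behaves like "nonmonotonic:dynamic,64": the steal kind is gone and the nonmonotonic modifier is OR'ed onto the dynamic kind. A sketch of that kind-plus-modifier encoding (the numeric values here are illustrative, not kmp.h's):

#include <cstdio>

// Illustrative encoding of "schedule kind | modifier", mirroring
// sched | sched_modifier in the parser above.
enum sched_t : unsigned {
  sch_dynamic_chunked = 35,
  sch_modifier_nonmonotonic = 1u << 30
};

int main() {
  unsigned sched = sch_dynamic_chunked | sch_modifier_nonmonotonic;
  printf("encoded schedule: 0x%x\n", sched);
}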
@@ -4114,6 +4175,18 @@ static void __kmp_stg_print_kmp_hand_thread(kmp_str_buf_t *buffer,
#endif
// -----------------------------------------------------------------------------
+// KMP_FORCE_MONOTONIC_DYNAMIC_SCHEDULE
+static void __kmp_stg_parse_kmp_force_monotonic(char const *name,
+ char const *value, void *data) {
+ __kmp_stg_parse_bool(name, value, &(__kmp_force_monotonic));
+} // __kmp_stg_parse_kmp_force_monotonic
+
+static void __kmp_stg_print_kmp_force_monotonic(kmp_str_buf_t *buffer,
+ char const *name, void *data) {
+ __kmp_stg_print_bool(buffer, name, __kmp_force_monotonic);
+} // __kmp_stg_print_kmp_force_monotonic
+
+// -----------------------------------------------------------------------------
// KMP_ATOMIC_MODE
static void __kmp_stg_parse_atomic_mode(char const *name, char const *value,
@@ -4638,7 +4711,8 @@ static void __kmp_stg_print_adaptive_lock_props(kmp_str_buf_t *buffer,
static void __kmp_stg_parse_speculative_statsfile(char const *name,
char const *value,
void *data) {
- __kmp_stg_parse_file(name, value, "", CCAST(char**, &__kmp_speculative_statsfile));
+ __kmp_stg_parse_file(name, value, "",
+ CCAST(char **, &__kmp_speculative_statsfile));
} // __kmp_stg_parse_speculative_statsfile
static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer,
@@ -4658,12 +4732,92 @@ static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer,
// -----------------------------------------------------------------------------
// KMP_HW_SUBSET (was KMP_PLACE_THREADS)
+// 2s16c,2t => 2S16C,2T => 2S16C \0 2T
+
+// Return KMP_HW_SUBSET preferred hardware type in case a token is ambiguously
+// short. The original KMP_HW_SUBSET environment variable had single letters:
+// s, c, t for sockets, cores, threads respectively.
+static kmp_hw_t __kmp_hw_subset_break_tie(const kmp_hw_t *possible,
+ size_t num_possible) {
+ for (size_t i = 0; i < num_possible; ++i) {
+ if (possible[i] == KMP_HW_THREAD)
+ return KMP_HW_THREAD;
+ else if (possible[i] == KMP_HW_CORE)
+ return KMP_HW_CORE;
+ else if (possible[i] == KMP_HW_SOCKET)
+ return KMP_HW_SOCKET;
+ }
+ return KMP_HW_UNKNOWN;
+}
+
+// Return hardware type from string, or KMP_HW_UNKNOWN if the string cannot be
+// parsed. This algorithm is very forgiving: the instant it can reduce the
+// search space to one, it assumes that is the topology level the user wanted,
+// even if the token is misspelled later on.
+static kmp_hw_t __kmp_stg_parse_hw_subset_name(char const *token) {
+ size_t index, num_possible, token_length;
+ kmp_hw_t possible[KMP_HW_LAST];
+ const char *end;
+
+ // Find the end of the hardware token string
+ end = token;
+ token_length = 0;
+ while (isalnum(*end) || *end == '_') {
+ token_length++;
+ end++;
+ }
+
+ // Set the possibilities to all hardware types
+ num_possible = 0;
+ KMP_FOREACH_HW_TYPE(type) { possible[num_possible++] = type; }
+
+ // Eliminate hardware types by comparing the front of the token
+ // with hardware names
+ // In most cases, the first letter in the token will indicate exactly
+ // which hardware type is parsed, e.g., 'C' = Core
+ index = 0;
+ while (num_possible > 1 && index < token_length) {
+ size_t n = num_possible;
+ char token_char = (char)toupper(token[index]);
+ for (size_t i = 0; i < n; ++i) {
+ const char *s;
+ kmp_hw_t type = possible[i];
+ s = __kmp_hw_get_keyword(type, false);
+ if (index < KMP_STRLEN(s)) {
+ char c = (char)toupper(s[index]);
+ // Mark hardware types for removal when the characters do not match
+ if (c != token_char) {
+ possible[i] = KMP_HW_UNKNOWN;
+ num_possible--;
+ }
+ }
+ }
+ // Remove hardware types that this token cannot be
+ size_t start = 0;
+ for (size_t i = 0; i < n; ++i) {
+ if (possible[i] != KMP_HW_UNKNOWN) {
+ kmp_hw_t temp = possible[i];
+ possible[i] = possible[start];
+ possible[start] = temp;
+ start++;
+ }
+ }
+ KMP_ASSERT(start == num_possible);
+ index++;
+ }
-// The longest observable sequence of items is
-// Socket-Node-Tile-Core-Thread
-// So, let's limit to 5 levels for now
+ // Attempt to break a tie if user has very short token
+ // (e.g., is 'T' tile or thread?)
+ if (num_possible > 1)
+ return __kmp_hw_subset_break_tie(possible, num_possible);
+ if (num_possible == 1)
+ return possible[0];
+ return KMP_HW_UNKNOWN;
+}
+
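The function above is effectively longest-unambiguous-prefix matching: "so" can only be sockets, while a bare "t" survives as both tile and thread and falls to the tie-breaker, which prefers threads. A condensed, self-contained model of the elimination loop (keyword set and names illustrative):

#include <cctype>
#include <cstdio>
#include <cstring>

// Narrow a candidate keyword set character by character, as
// __kmp_stg_parse_hw_subset_name does.
static const char *match_prefix(const char *token) {
  const char *cand[] = {"SOCKET", "NUMA", "TILE", "CORE", "THREAD"};
  const size_t n = sizeof(cand) / sizeof(cand[0]);
  size_t alive = n;
  for (size_t i = 0; token[i] && alive > 1; ++i) {
    for (size_t c = 0; c < n; ++c) {
      if (!cand[c])
        continue;
      if (i < strlen(cand[c]) &&
          toupper((unsigned char)token[i]) != cand[c][i]) {
        cand[c] = nullptr; // eliminate on first mismatch
        --alive;
      }
    }
  }
  for (const char *c : cand)
    if (c)
      return c; // first survivor; the real tie-breaker prefers THREAD
  return "UNKNOWN";
}

int main() { printf("%s\n", match_prefix("so")); } // prints SOCKET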
+// The longest observable sequence of items is at most KMP_HW_LAST levels long
// The input string is usually short enough, let's use 512 limit for now
-#define MAX_T_LEVEL 5
+#define MAX_T_LEVEL KMP_HW_LAST
#define MAX_STR_LEN 512
static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
void *data) {
@@ -4682,12 +4836,13 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
char input[MAX_STR_LEN];
size_t len = 0, mlen = MAX_STR_LEN;
int level = 0;
- // Canonize the string (remove spaces, unify delimiters, etc.)
+ bool absolute = false;
+ // Canonicalize the string (remove spaces, unify delimiters, etc.)
char *pos = CCAST(char *, value);
while (*pos && mlen) {
if (*pos != ' ') { // skip spaces
if (len == 0 && *pos == ':') {
- __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it
+ absolute = true;
} else {
input[len] = (char)(toupper(*pos));
if (input[len] == 'X')
@@ -4700,10 +4855,10 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
mlen--;
pos++;
}
- if (len == 0 || mlen == 0)
+ if (len == 0 || mlen == 0) {
goto err; // contents is either empty or too long
+ }
input[len] = '\0';
- __kmp_hws_requested = 1; // mark that subset requested
// Split by delimiter
pos = input;
components[level++] = pos;
@@ -4713,133 +4868,68 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
*pos = '\0'; // modify input and avoid more copying
components[level++] = ++pos; // expect something after ","
}
+
+ __kmp_hw_subset = kmp_hw_subset_t::allocate();
+ if (absolute)
+ __kmp_hw_subset->set_absolute();
+
// Check each component
for (int i = 0; i < level; ++i) {
int offset = 0;
int num = atoi(components[i]); // each component should start with a number
+ if (num <= 0) {
+ goto err; // only positive integers are valid for count
+ }
if ((pos = strchr(components[i], '@'))) {
offset = atoi(pos + 1); // save offset
*pos = '\0'; // cut the offset from the component
}
pos = components[i] + strspn(components[i], digits);
- if (pos == components[i])
+ if (pos == components[i]) {
goto err;
+ }
// detect the component type
- switch (*pos) {
- case 'S': // Socket
- if (__kmp_hws_socket.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_socket.num = num;
- __kmp_hws_socket.offset = offset;
- break;
- case 'N': // NUMA Node
- if (__kmp_hws_node.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_node.num = num;
- __kmp_hws_node.offset = offset;
- break;
- case 'L': // Cache
- if (*(pos + 1) == '2') { // L2 - Tile
- if (__kmp_hws_tile.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_tile.num = num;
- __kmp_hws_tile.offset = offset;
- } else if (*(pos + 1) == '3') { // L3 - Socket
- if (__kmp_hws_socket.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_socket.num = num;
- __kmp_hws_socket.offset = offset;
- } else if (*(pos + 1) == '1') { // L1 - Core
- if (__kmp_hws_core.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_core.num = num;
- __kmp_hws_core.offset = offset;
- }
- break;
- case 'C': // Core (or Cache?)
- if (*(pos + 1) != 'A') {
- if (__kmp_hws_core.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_core.num = num;
- __kmp_hws_core.offset = offset;
- } else { // Cache
- char *d = pos + strcspn(pos, digits); // find digit
- if (*d == '2') { // L2 - Tile
- if (__kmp_hws_tile.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_tile.num = num;
- __kmp_hws_tile.offset = offset;
- } else if (*d == '3') { // L3 - Socket
- if (__kmp_hws_socket.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_socket.num = num;
- __kmp_hws_socket.offset = offset;
- } else if (*d == '1') { // L1 - Core
- if (__kmp_hws_core.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_core.num = num;
- __kmp_hws_core.offset = offset;
- } else {
- goto err;
- }
- }
- break;
- case 'T': // Thread
- if (__kmp_hws_proc.num > 0)
- goto err; // duplicate is not allowed
- __kmp_hws_proc.num = num;
- __kmp_hws_proc.offset = offset;
- break;
- default:
+ kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos);
+ if (type == KMP_HW_UNKNOWN) {
+ goto err;
+ }
+ if (__kmp_hw_subset->specified(type)) {
goto err;
}
+ __kmp_hw_subset->push_back(num, type, offset);
}
return;
err:
KMP_WARNING(AffHWSubsetInvalid, name, value);
- __kmp_hws_requested = 0; // mark that subset not requested
+ if (__kmp_hw_subset) {
+ kmp_hw_subset_t::deallocate(__kmp_hw_subset);
+ __kmp_hw_subset = nullptr;
+ }
return;
}
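Concretely, KMP_HW_SUBSET=2s,16c@4,2t now yields a kmp_hw_subset_t holding one (num, type, offset) item per component, and a leading ':' sets the absolute flag; duplicate types and non-positive counts fall through to err. A toy model of the container (names illustrative):

#include <cstdio>
#include <vector>

enum hw_t { HW_SOCKET, HW_CORE, HW_THREAD };

// Toy stand-in for kmp_hw_subset_t: (count, type, offset) items plus an
// "absolute" flag for values that begin with ':'.
struct hw_subset_t {
  struct item_t {
    int num;
    hw_t type;
    int offset;
  };
  std::vector<item_t> items;
  bool absolute = false;
  bool specified(hw_t t) const {
    for (const item_t &i : items)
      if (i.type == t)
        return true; // the parser rejects duplicates via this check
    return false;
  }
  void push_back(int num, hw_t type, int offset) {
    items.push_back({num, type, offset});
  }
};

int main() {
  hw_subset_t s; // models KMP_HW_SUBSET=2s,16c@4,2t
  s.push_back(2, HW_SOCKET, 0);
  s.push_back(16, HW_CORE, 4);
  s.push_back(2, HW_THREAD, 0);
  printf("depth=%zu core_specified=%d\n", s.items.size(),
         (int)s.specified(HW_CORE));
}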
static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
void *data) {
- if (__kmp_hws_requested) {
- int comma = 0;
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
- if (__kmp_env_format)
- KMP_STR_BUF_PRINT_NAME_EX(name);
- else
- __kmp_str_buf_print(buffer, " %s='", name);
- if (__kmp_hws_socket.num) {
- __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num);
- if (__kmp_hws_socket.offset)
- __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset);
- comma = 1;
- }
- if (__kmp_hws_node.num) {
- __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num);
- if (__kmp_hws_node.offset)
- __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset);
- comma = 1;
- }
- if (__kmp_hws_tile.num) {
- __kmp_str_buf_print(&buf, "%s%dL2", comma ? "," : "", __kmp_hws_tile.num);
- if (__kmp_hws_tile.offset)
- __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset);
- comma = 1;
- }
- if (__kmp_hws_core.num) {
- __kmp_str_buf_print(&buf, "%s%dc", comma ? "," : "", __kmp_hws_core.num);
- if (__kmp_hws_core.offset)
- __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset);
- comma = 1;
- }
- if (__kmp_hws_proc.num)
- __kmp_str_buf_print(&buf, "%s%dt", comma ? "," : "", __kmp_hws_proc.num);
- __kmp_str_buf_print(buffer, "%s'\n", buf.str);
- __kmp_str_buf_free(&buf);
+ kmp_str_buf_t buf;
+ int depth;
+ if (!__kmp_hw_subset)
+ return;
+ __kmp_str_buf_init(&buf);
+ if (__kmp_env_format)
+ KMP_STR_BUF_PRINT_NAME_EX(name);
+ else
+ __kmp_str_buf_print(buffer, " %s='", name);
+
+ depth = __kmp_hw_subset->get_depth();
+ for (int i = 0; i < depth; ++i) {
+ const auto &item = __kmp_hw_subset->at(i);
+ __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num,
+ __kmp_hw_get_keyword(item.type));
+ if (item.offset)
+ __kmp_str_buf_print(&buf, "@%d", item.offset);
}
+ __kmp_str_buf_print(buffer, "%s'\n", buf.str);
+ __kmp_str_buf_free(&buf);
}
#if USE_ITT_BUILD
@@ -4874,12 +4964,11 @@ static void __kmp_stg_print_forkjoin_frames_mode(kmp_str_buf_t *buffer,
// -----------------------------------------------------------------------------
// KMP_ENABLE_TASK_THROTTLING
-static void __kmp_stg_parse_task_throttling(char const *name,
- char const *value, void *data) {
+static void __kmp_stg_parse_task_throttling(char const *name, char const *value,
+ void *data) {
__kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling);
} // __kmp_stg_parse_task_throttling
-
static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
char const *name, void *data) {
__kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling);
@@ -4950,7 +5039,7 @@ static void __kmp_stg_print_omp_cancellation(kmp_str_buf_t *buffer,
} // __kmp_stg_print_omp_cancellation
#if OMPT_SUPPORT
-static int __kmp_tool = 1;
+int __kmp_tool = 1;
static void __kmp_stg_parse_omp_tool(char const *name, char const *value,
void *data) {
@@ -4967,7 +5056,7 @@ static void __kmp_stg_print_omp_tool(kmp_str_buf_t *buffer, char const *name,
}
} // __kmp_stg_print_omp_tool
-static char *__kmp_tool_libraries = NULL;
+char *__kmp_tool_libraries = NULL;
static void __kmp_stg_parse_omp_tool_libraries(char const *name,
char const *value, void *data) {
@@ -4988,17 +5077,19 @@ static void __kmp_stg_print_omp_tool_libraries(kmp_str_buf_t *buffer,
}
} // __kmp_stg_print_omp_tool_libraries
-static char *__kmp_tool_verbose_init = NULL;
+char *__kmp_tool_verbose_init = NULL;
static void __kmp_stg_parse_omp_tool_verbose_init(char const *name,
- char const *value, void *data) {
+ char const *value,
+ void *data) {
__kmp_stg_parse_str(name, value, &__kmp_tool_verbose_init);
} // __kmp_stg_parse_omp_tool_libraries
static void __kmp_stg_print_omp_tool_verbose_init(kmp_str_buf_t *buffer,
- char const *name, void *data) {
+ char const *name,
+ void *data) {
if (__kmp_tool_verbose_init)
- __kmp_stg_print_str(buffer, name, __kmp_tool_libraries);
+ __kmp_stg_print_str(buffer, name, __kmp_tool_verbose_init);
else {
if (__kmp_env_format) {
KMP_STR_BUF_PRINT_NAME;
@@ -5043,6 +5134,8 @@ static kmp_setting_t __kmp_stg_table[] = {
{"KMP_WARNINGS", __kmp_stg_parse_warnings, __kmp_stg_print_warnings, NULL,
0, 0},
+ {"KMP_NESTING_MODE", __kmp_stg_parse_nesting_mode,
+ __kmp_stg_print_nesting_mode, NULL, 0, 0},
{"OMP_NESTED", __kmp_stg_parse_nested, __kmp_stg_print_nested, NULL, 0, 0},
{"OMP_NUM_THREADS", __kmp_stg_parse_num_threads,
__kmp_stg_print_num_threads, NULL, 0, 0},
@@ -5067,6 +5160,10 @@ static kmp_setting_t __kmp_stg_table[] = {
__kmp_stg_print_thread_limit, NULL, 0, 0},
{"KMP_TEAMS_THREAD_LIMIT", __kmp_stg_parse_teams_thread_limit,
__kmp_stg_print_teams_thread_limit, NULL, 0, 0},
+ {"OMP_NUM_TEAMS", __kmp_stg_parse_nteams, __kmp_stg_print_nteams, NULL, 0,
+ 0},
+ {"OMP_TEAMS_THREAD_LIMIT", __kmp_stg_parse_teams_th_limit,
+ __kmp_stg_print_teams_th_limit, NULL, 0, 0},
{"OMP_WAIT_POLICY", __kmp_stg_parse_wait_policy,
__kmp_stg_print_wait_policy, NULL, 0, 0},
{"KMP_DISP_NUM_BUFFERS", __kmp_stg_parse_disp_buffers,
@@ -5189,6 +5286,9 @@ static kmp_setting_t __kmp_stg_table[] = {
{"KMP_DISP_HAND_THREAD", __kmp_stg_parse_kmp_hand_thread,
__kmp_stg_print_kmp_hand_thread, NULL, 0, 0},
#endif
+ {"KMP_FORCE_MONOTONIC_DYNAMIC_SCHEDULE",
+ __kmp_stg_parse_kmp_force_monotonic, __kmp_stg_print_kmp_force_monotonic,
+ NULL, 0, 0},
{"KMP_ATOMIC_MODE", __kmp_stg_parse_atomic_mode,
__kmp_stg_print_atomic_mode, NULL, 0, 0},
{"KMP_CONSISTENCY_CHECK", __kmp_stg_parse_consistency_check,
@@ -5523,7 +5623,7 @@ static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found.
char const *name, // Name of variable.
char const *value, // Value of the variable.
kmp_setting_t **rivals // List of rival settings (must include current one).
- ) {
+) {
if (rivals == NULL) {
return 0;
@@ -5643,15 +5743,15 @@ void __kmp_env_initialize(char const *string) {
__kmp_affinity_notype = NULL;
char const *aff_str = __kmp_env_blk_var(&block, "KMP_AFFINITY");
if (aff_str != NULL) {
-// Check if the KMP_AFFINITY type is specified in the string.
-// We just search the string for "compact", "scatter", etc.
-// without really parsing the string. The syntax of the
-// KMP_AFFINITY env var is such that none of the affinity
-// type names can appear anywhere other that the type
-// specifier, even as substrings.
-//
-// I can't find a case-insensitive version of strstr on Windows* OS.
-// Use the case-sensitive version for now.
+ // Check if the KMP_AFFINITY type is specified in the string.
+ // We just search the string for "compact", "scatter", etc.
+ // without really parsing the string. The syntax of the
+ // KMP_AFFINITY env var is such that none of the affinity
+ // type names can appear anywhere other than the type
+ // specifier, even as substrings.
+ //
+ // I can't find a case-insensitive version of strstr on Windows* OS.
+ // Use the case-sensitive version for now.
#if KMP_OS_WINDOWS
#define FIND strstr
@@ -5673,7 +5773,7 @@ void __kmp_env_initialize(char const *string) {
// Reset the affinity flags to their default values,
// in case this is called from kmp_set_defaults().
__kmp_affinity_type = affinity_default;
- __kmp_affinity_gran = affinity_gran_default;
+ __kmp_affinity_gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
__kmp_affinity_respect_mask = affinity_respect_mask_default;
}
@@ -5683,7 +5783,7 @@ void __kmp_env_initialize(char const *string) {
aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND");
if (aff_str != NULL) {
__kmp_affinity_type = affinity_default;
- __kmp_affinity_gran = affinity_gran_default;
+ __kmp_affinity_gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
__kmp_affinity_respect_mask = affinity_respect_mask_default;
}
@@ -5755,12 +5855,19 @@ void __kmp_env_initialize(char const *string) {
if (!TCR_4(__kmp_init_middle)) {
#if KMP_USE_HWLOC
// Force using hwloc when either tiles or numa nodes requested within
- // KMP_HW_SUBSET and no other topology method is requested
- if ((__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0 ||
- __kmp_affinity_gran == affinity_gran_tile) &&
- (__kmp_affinity_top_method == affinity_top_method_default)) {
+ // KMP_HW_SUBSET or granularity setting and no other topology method
+ // is requested
+ if (__kmp_hw_subset &&
+ __kmp_affinity_top_method == affinity_top_method_default)
+ if (__kmp_hw_subset->specified(KMP_HW_NUMA) ||
+ __kmp_hw_subset->specified(KMP_HW_TILE) ||
+ __kmp_affinity_gran == KMP_HW_TILE ||
+ __kmp_affinity_gran == KMP_HW_NUMA)
+ __kmp_affinity_top_method = affinity_top_method_hwloc;
+ // Force using hwloc when tiles or numa nodes requested for OMP_PLACES
+ if (__kmp_affinity_gran == KMP_HW_NUMA ||
+ __kmp_affinity_gran == KMP_HW_TILE)
__kmp_affinity_top_method = affinity_top_method_hwloc;
- }
#endif
// Determine if the machine/OS is actually capable of supporting
// affinity.
@@ -5790,7 +5897,7 @@ void __kmp_env_initialize(char const *string) {
}
__kmp_affinity_type = affinity_disabled;
__kmp_affinity_respect_mask = 0;
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
}
}
@@ -5848,44 +5955,27 @@ void __kmp_env_initialize(char const *string) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
}
if (__kmp_affinity_top_method == affinity_top_method_default) {
- if (__kmp_affinity_gran == affinity_gran_default) {
+ if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
__kmp_affinity_top_method = affinity_top_method_group;
- __kmp_affinity_gran = affinity_gran_group;
- } else if (__kmp_affinity_gran == affinity_gran_group) {
+ __kmp_affinity_gran = KMP_HW_PROC_GROUP;
+ } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
__kmp_affinity_top_method = affinity_top_method_group;
} else {
__kmp_affinity_top_method = affinity_top_method_all;
}
} else if (__kmp_affinity_top_method == affinity_top_method_group) {
- if (__kmp_affinity_gran == affinity_gran_default) {
- __kmp_affinity_gran = affinity_gran_group;
- } else if ((__kmp_affinity_gran != affinity_gran_group) &&
- (__kmp_affinity_gran != affinity_gran_fine) &&
- (__kmp_affinity_gran != affinity_gran_thread)) {
- const char *str = NULL;
- switch (__kmp_affinity_gran) {
- case affinity_gran_core:
- str = "core";
- break;
- case affinity_gran_package:
- str = "package";
- break;
- case affinity_gran_node:
- str = "node";
- break;
- case affinity_gran_tile:
- str = "tile";
- break;
- default:
- KMP_DEBUG_ASSERT(0);
- }
+ if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
+ __kmp_affinity_gran = KMP_HW_PROC_GROUP;
+ } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) &&
+ (__kmp_affinity_gran != KMP_HW_THREAD)) {
+ const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran);
KMP_WARNING(AffGranTopGroup, var, str);
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
}
} else {
- if (__kmp_affinity_gran == affinity_gran_default) {
- __kmp_affinity_gran = affinity_gran_core;
- } else if (__kmp_affinity_gran == affinity_gran_group) {
+ if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
+ __kmp_affinity_gran = KMP_HW_CORE;
+ } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
const char *str = NULL;
switch (__kmp_affinity_type) {
case affinity_physical:
@@ -5908,7 +5998,7 @@ void __kmp_env_initialize(char const *string) {
KMP_DEBUG_ASSERT(0);
}
KMP_WARNING(AffGranGroupType, var, str);
- __kmp_affinity_gran = affinity_gran_core;
+ __kmp_affinity_gran = KMP_HW_CORE;
}
}
} else
@@ -5950,15 +6040,15 @@ void __kmp_env_initialize(char const *string) {
__kmp_affinity_type = affinity_none;
}
}
- if ((__kmp_affinity_gran == affinity_gran_default) &&
+ if ((__kmp_affinity_gran == KMP_HW_UNKNOWN) &&
(__kmp_affinity_gran_levels < 0)) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
- __kmp_affinity_gran = affinity_gran_fine;
+ __kmp_affinity_gran = KMP_HW_THREAD;
} else
#endif
{
- __kmp_affinity_gran = affinity_gran_core;
+ __kmp_affinity_gran = KMP_HW_CORE;
}
}
if (__kmp_affinity_top_method == affinity_top_method_default) {
@@ -6022,7 +6112,7 @@ void __kmp_env_print() {
#ifdef KMP_GOMP_COMPAT
|| strncmp(name, "GOMP_", 5) == 0
#endif // KMP_GOMP_COMPAT
- ) {
+ ) {
__kmp_str_buf_print(&buffer, " %s=%s\n", name, value);
}
}
@@ -6050,7 +6140,6 @@ void __kmp_env_print_2() {
__kmp_display_env_impl(__kmp_display_env, __kmp_display_env_verbose);
} // __kmp_env_print_2
-
void __kmp_display_env_impl(int display_env, int display_env_verbose) {
kmp_env_blk_t block;
kmp_str_buf_t buffer;
@@ -6068,8 +6157,7 @@ void __kmp_display_env_impl(int display_env, int display_env_verbose) {
for (int i = 0; i < __kmp_stg_count; ++i) {
if (__kmp_stg_table[i].print != NULL &&
- ((display_env &&
- strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) ||
+ ((display_env && strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) ||
display_env_verbose)) {
__kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name,
__kmp_stg_table[i].data);
@@ -6087,4 +6175,47 @@ void __kmp_display_env_impl(int display_env, int display_env_verbose) {
__kmp_printf("\n");
}
+#if OMPD_SUPPORT
+// Dump environment variables for OMPD
+void __kmp_env_dump() {
+
+ kmp_env_blk_t block;
+ kmp_str_buf_t buffer, env, notdefined;
+
+ __kmp_stg_init();
+ __kmp_str_buf_init(&buffer);
+ __kmp_str_buf_init(&env);
+ __kmp_str_buf_init(&notdefined);
+
+ __kmp_env_blk_init(&block, NULL);
+ __kmp_env_blk_sort(&block);
+
+ __kmp_str_buf_print(&notdefined, ": %s", KMP_I18N_STR(NotDefined));
+
+ for (int i = 0; i < __kmp_stg_count; ++i) {
+ if (__kmp_stg_table[i].print == NULL)
+ continue;
+ __kmp_str_buf_clear(&env);
+ __kmp_stg_table[i].print(&env, __kmp_stg_table[i].name,
+ __kmp_stg_table[i].data);
+ if (env.used < 4) // a valid definition has the 3-space indent and a newline
+ continue;
+ if (strstr(env.str, notdefined.str))
+ // normalize the string
+ __kmp_str_buf_print(&buffer, "%s=undefined\n", __kmp_stg_table[i].name);
+ else
+ __kmp_str_buf_cat(&buffer, env.str + 3, env.used - 3);
+ }
+
+ ompd_env_block = (char *)__kmp_allocate(buffer.used + 1);
+ KMP_MEMCPY(ompd_env_block, buffer.str, buffer.used + 1);
+ ompd_env_block_size = (ompd_size_t)KMP_STRLEN(ompd_env_block);
+
+ __kmp_env_blk_free(&block);
+ __kmp_str_buf_free(&buffer);
+ __kmp_str_buf_free(&env);
+ __kmp_str_buf_free(&notdefined);
+}
+#endif // OMPD_SUPPORT
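The resulting ompd_env_block is a flat, newline-separated list in which unset variables are normalized to an explicit "undefined" rather than omitted, e.g. (illustrative):

OMP_WAIT_POLICY='PASSIVE'
KMP_SETTINGS=undefined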
+
// end of file
diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h
index d61c40694cf6..f63f105940ef 100644
--- a/openmp/runtime/src/kmp_settings.h
+++ b/openmp/runtime/src/kmp_settings.h
@@ -18,6 +18,9 @@ void __kmp_env_initialize(char const *);
void __kmp_env_print();
void __kmp_env_print_2();
void __kmp_display_env_impl(int display_env, int display_env_verbose);
+#if OMPD_SUPPORT
+void __kmp_env_dump();
+#endif
int __kmp_initial_threads_capacity(int req_nproc);
void __kmp_init_dflt_team_nth();
diff --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp
index 280c4738c595..8657bfe18c44 100644
--- a/openmp/runtime/src/kmp_stats.cpp
+++ b/openmp/runtime/src/kmp_stats.cpp
@@ -700,16 +700,18 @@ void kmp_stats_output_module::printPloticusFile() {
" pagesize: 15 10\n"
" scale: 1.0\n\n");
- fprintf(plotOut, "#proc getdata\n"
- " file: %s\n\n",
+ fprintf(plotOut,
+ "#proc getdata\n"
+ " file: %s\n\n",
eventsFileName);
- fprintf(plotOut, "#proc areadef\n"
- " title: OpenMP Sampling Timeline\n"
- " titledetails: align=center size=16\n"
- " rectangle: 1 1 13 9\n"
- " xautorange: datafield=2,3\n"
- " yautorange: -1 %d\n\n",
+ fprintf(plotOut,
+ "#proc areadef\n"
+ " title: OpenMP Sampling Timeline\n"
+ " titledetails: align=center size=16\n"
+ " rectangle: 1 1 13 9\n"
+ " xautorange: datafield=2,3\n"
+ " yautorange: -1 %d\n\n",
size);
fprintf(plotOut, "#proc xaxis\n"
@@ -718,12 +720,13 @@ void kmp_stats_output_module::printPloticusFile() {
" label: Time (ticks)\n"
" labeldetails: size=14\n\n");
- fprintf(plotOut, "#proc yaxis\n"
- " stubs: inc 1\n"
- " stubrange: 0 %d\n"
- " stubdetails: size=12\n"
- " label: Thread #\n"
- " labeldetails: size=14\n\n",
+ fprintf(plotOut,
+ "#proc yaxis\n"
+ " stubs: inc 1\n"
+ " stubrange: 0 %d\n"
+ " stubdetails: size=12\n"
+ " label: Thread #\n"
+ " labeldetails: size=14\n\n",
size - 1);
fprintf(plotOut, "#proc bars\n"
@@ -737,10 +740,11 @@ void kmp_stats_output_module::printPloticusFile() {
for (i = 0; i < TIMER_LAST; i++) {
if (timeStat::logEvent((timer_e)i)) {
rgb_color c = getEventColor((timer_e)i);
- fprintf(plotOut, "#proc legendentry\n"
- " sampletype: color\n"
- " label: %s\n"
- " details: rgb(%1.1f,%1.1f,%1.1f)\n\n",
+ fprintf(plotOut,
+ "#proc legendentry\n"
+ " sampletype: color\n"
+ " label: %s\n"
+ " details: rgb(%1.1f,%1.1f,%1.1f)\n\n",
timeStat::name((timer_e)i), c.r, c.g, c.b);
}
}
@@ -832,10 +836,10 @@ void kmp_stats_output_module::outputStats(const char *heading) {
// Accumulate timers.
for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
// See if we should ignore this timer when aggregating
- if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master
- // and this thread is worker
+ if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on
+ // primary thread and this thread is worker
(timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker
- // and this thread is the master
+ // and this thread is the primary thread
) {
continue;
}
diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h
index 7f4a9492b24f..4c5053df3fef 100644
--- a/openmp/runtime/src/kmp_stats.h
+++ b/openmp/runtime/src/kmp_stats.h
@@ -48,9 +48,9 @@
*/
enum stats_flags_e {
noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
- onlyInMaster = 1 << 1, //!< statistic is valid only for master
+ onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
- notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
+ notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
//! KMP_STATS_EVENTS is on (valid only for timers)
};
@@ -103,6 +103,7 @@ enum stats_state_e {
macro(OMP_CRITICAL, 0, arg) \
macro(OMP_SINGLE, 0, arg) \
macro(OMP_MASTER, 0, arg) \
+ macro(OMP_MASKED, 0, arg) \
macro(OMP_TEAMS, 0, arg) \
macro(OMP_set_lock, 0, arg) \
macro(OMP_test_lock, 0, arg) \
@@ -150,6 +151,7 @@ enum stats_state_e {
macro (OMP_critical_wait, 0, arg) \
macro (OMP_single, 0, arg) \
macro (OMP_master, 0, arg) \
+ macro (OMP_masked, 0, arg) \
macro (OMP_task_immediate, 0, arg) \
macro (OMP_task_taskwait, 0, arg) \
macro (OMP_task_taskyield, 0, arg) \
@@ -180,8 +182,8 @@ enum stats_state_e {
// clang-format on
// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
-// initializing OpenMP or being created by a master)
-// until the thread is destroyed
+// initializing OpenMP or being created by a primary
+// thread) until the thread is destroyed
// OMP_parallel -- Time thread spends executing work directly
// within a #pragma omp parallel
// OMP_parallel_overhead -- Time thread spends setting up a parallel region
@@ -198,6 +200,7 @@ enum stats_state_e {
// a critical section
// OMP_single -- Time spent executing a "single" region
// OMP_master -- Time spent executing a "master" region
+// OMP_masked -- Time spent executing a "masked" region
// OMP_task_immediate -- Time spent executing non-deferred tasks
// OMP_task_taskwait -- Time spent executing tasks inside a taskwait
// construct
@@ -289,7 +292,7 @@ enum stats_state_e {
* same as that of a timer above.
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
#define ENUMERATE(name, ignore, prefix) prefix##name,
@@ -710,7 +713,7 @@ public:
to the bar width in the timeline graph.
Every thread will have a thread local pointer to its node in
- the list. The sentinel node is used by the master thread to
+ the list. The sentinel node is used by the primary thread to
store "dummy" statistics before __kmp_create_worker() is called.
**************************************************************** */
class kmp_stats_list {
@@ -884,7 +887,7 @@ extern kmp_stats_output_module __kmp_stats_output;
* a timer statistics.
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_COUNT_VALUE(name, value) \
__kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
@@ -897,7 +900,7 @@ extern kmp_stats_output_module __kmp_stats_output;
* counter for the executing thread.
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_COUNT_BLOCK(name) \
__kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
@@ -917,7 +920,7 @@ extern kmp_stats_output_module __kmp_stats_output;
* macro is called.
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
/*!
@@ -926,7 +929,7 @@ extern kmp_stats_output_module __kmp_stats_output;
* @param name timer which you want this thread to begin with
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_INIT_PARTITIONED_TIMERS(name) \
__kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
__kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
@@ -963,7 +966,7 @@ extern kmp_stats_output_module __kmp_stats_output;
* \details Reset all stats for all threads.
*
* @ingroup STATS_GATHERING
-*/
+ */
#define KMP_RESET_STATS() __kmp_reset_stats()
#if (KMP_DEVELOPER_STATS)
diff --git a/openmp/runtime/src/kmp_str.cpp b/openmp/runtime/src/kmp_str.cpp
index 6838bffeca4e..ffce2b88ab35 100644
--- a/openmp/runtime/src/kmp_str.cpp
+++ b/openmp/runtime/src/kmp_str.cpp
@@ -169,14 +169,15 @@ int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format,
// Try to format string.
{
-/* On Linux* OS Intel(R) 64, vsnprintf() modifies args argument, so vsnprintf()
- crashes if it is called for the second time with the same args. To prevent
- the crash, we have to pass a fresh intact copy of args to vsnprintf() on each
- iteration.
+ /* On Linux* OS Intel(R) 64, vsnprintf() modifies args argument, so
+ vsnprintf() crashes if it is called for the second time with the same
+ args. To prevent the crash, we have to pass a fresh intact copy of args
+ to vsnprintf() on each iteration.
- Unfortunately, standard va_copy() macro is not available on Windows* OS.
- However, it seems vsnprintf() does not modify args argument on Windows* OS.
-*/
+ Unfortunately, standard va_copy() macro is not available on Windows*
+ OS. However, it seems vsnprintf() does not modify args argument on
+ Windows* OS.
+ */
#if !KMP_OS_WINDOWS
va_list _args;
@@ -403,7 +404,7 @@ void __kmp_str_loc_free(kmp_str_loc_t *loc) {
int __kmp_str_eqf( // True, if strings are equal, false otherwise.
char const *lhs, // First string.
char const *rhs // Second string.
- ) {
+) {
int result;
#if KMP_OS_WINDOWS
result = (_stricmp(lhs, rhs) == 0);
@@ -447,7 +448,7 @@ int __kmp_str_eqf( // True, if strings are equal, false otherwise.
char *__kmp_str_format( // Allocated string.
char const *format, // Format string.
... // Other parameters.
- ) {
+) {
va_list args;
int size = 512;
char *buffer = NULL;
@@ -546,7 +547,7 @@ void __kmp_str_split(char *str, // I: String to split.
char delim, // I: Character to split on.
char **head, // O: Pointer to head (may be NULL).
char **tail // O: Pointer to tail (may be NULL).
- ) {
+) {
char *h = str;
char *t = NULL;
if (str != NULL) {
@@ -570,7 +571,7 @@ char *__kmp_str_token(
char *str, // String to split into tokens. Note: String *is* modified!
char const *delim, // Delimiters.
char **buf // Internal buffer.
- ) {
+) {
char *token = NULL;
#if KMP_OS_WINDOWS
// On Windows* OS there is no strtok_r() function. Let us implement it.
@@ -652,7 +653,7 @@ void __kmp_str_to_size( // R: Error code.
size_t *out, // O: Parsed number.
size_t dfactor, // I: The factor if none of the letters specified.
char const **error // O: Null if everything is ok, error message otherwise.
- ) {
+) {
size_t value = 0;
size_t factor = 0;
@@ -751,7 +752,7 @@ void __kmp_str_to_uint( // R: Error code.
char const *str, // I: String of characters, unsigned number.
kmp_uint64 *out, // O: Parsed number.
char const **error // O: Null if everything is ok, error message otherwise.
- ) {
+) {
size_t value = 0;
int overflow = 0;
int i = 0;
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 58add6b6ae3c..87e5388ca9db 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -350,6 +350,13 @@ omp_allocator_handle_t const omp_pteam_mem_alloc =
(omp_allocator_handle_t const)7;
omp_allocator_handle_t const omp_thread_mem_alloc =
(omp_allocator_handle_t const)8;
+// Preview of target memory support
+omp_allocator_handle_t const llvm_omp_target_host_mem_alloc =
+ (omp_allocator_handle_t const)100;
+omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc =
+ (omp_allocator_handle_t const)101;
+omp_allocator_handle_t const llvm_omp_target_device_mem_alloc =
+ (omp_allocator_handle_t const)102;
omp_memspace_handle_t const omp_default_mem_space =
(omp_memspace_handle_t const)0;
@@ -361,6 +368,13 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+// Preview of target memory support
+omp_memspace_handle_t const llvm_omp_target_host_mem_space =
+ (omp_memspace_handle_t const)100;
+omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
+ (omp_memspace_handle_t const)101;
+omp_memspace_handle_t const llvm_omp_target_device_mem_space =
+ (omp_memspace_handle_t const)102;
#endif /* KMP_OS_WINDOWS */
void *omp_alloc(size_t size, const omp_allocator_handle_t allocator) {
i;
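The llvm_omp_target_* handles give the stub library the same preview target-memory constants as the real runtime; the stub's omp_alloc ignores the handle and forwards to plain allocation. Intended usage, sketched (the allocator name is the one declared just above):

#include <omp.h>
#include <cstdio>

int main() {
  // Preview target-memory allocator handle; the stub library ignores it
  // and simply allocates host memory.
  void *p = omp_alloc(1024, llvm_omp_target_host_mem_alloc);
  printf("%p\n", p);
  omp_free(p, llvm_omp_target_host_mem_alloc);
  return 0;
}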
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index ca7e593fd07f..162fb38e1eed 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -54,7 +54,7 @@ static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 };
-size_t sizes[] = { 997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029 };
+size_t sizes[] = {997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029};
const size_t MAX_GEN = 8;
static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) {
@@ -149,14 +149,11 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
return h;
}
-#define ENTRY_LAST_INS 0
-#define ENTRY_LAST_MTXS 1
-
-static kmp_dephash_entry *
-__kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t **hash, kmp_intptr_t addr) {
+static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
+ kmp_dephash_t **hash,
+ kmp_intptr_t addr) {
kmp_dephash_t *h = *hash;
- if (h->nelements != 0
- && h->nconflicts/h->size >= 1) {
+ if (h->nelements != 0 && h->nconflicts / h->size >= 1) {
*hash = __kmp_dephash_extend(thread, h);
h = *hash;
}
@@ -178,9 +175,9 @@ __kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t **hash, kmp_intptr_t addr)
#endif
entry->addr = addr;
entry->last_out = NULL;
- entry->last_ins = NULL;
- entry->last_mtxs = NULL;
- entry->last_flag = ENTRY_LAST_INS;
+ entry->last_set = NULL;
+ entry->prev_set = NULL;
+ entry->last_flag = 0;
entry->mtx_lock = NULL;
entry->next_in_bucket = h->buckets[bucket];
h->buckets[bucket] = entry;
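The hash now grows as soon as the conflict count reaches the bucket count (nconflicts / size >= 1), stepping through the prime table sizes[] for up to MAX_GEN generations. A minimal model of that growth policy:

#include <cstdio>

// Grow when conflicts/size >= 1, walking a fixed prime table, as in
// __kmp_dephash_find / __kmp_dephash_extend above.
static const size_t sizes[] = {997,   2003,  4001,   8191, 16001,
                               32003, 64007, 131071, 270029};
static const size_t MAX_GEN = 8;

size_t next_size(size_t gen) {
  return gen < MAX_GEN ? sizes[gen + 1] : sizes[MAX_GEN];
}

struct dephash {
  size_t size = sizes[0], gen = 0, nconflicts = 0;
};

bool needs_extend(const dephash &h) { return h.nconflicts >= h.size; }

int main() {
  dephash h;
  h.nconflicts = 1200; // more colliding entries than buckets
  if (needs_extend(h))
    printf("extend to %zu buckets\n", next_size(h.gen));
}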
@@ -215,7 +212,7 @@ static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
kmp_task_t *sink_task) {
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
- // do not use sink->dn.task as that is only filled after the dependencies
+ // do not use sink->dn.task as that is only filled after the dependences
// are already processed!
kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
@@ -298,7 +295,7 @@ static inline kmp_int32
__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
bool dep_barrier, kmp_int32 ndeps,
kmp_depend_info_t *dep_list, kmp_task_t *task) {
- KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependencies : "
+ KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependences : "
"dep_barrier = %d\n",
filter, gtid, ndeps, dep_barrier));
@@ -313,96 +310,81 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
kmp_dephash_entry_t *info =
__kmp_dephash_find(thread, hash, dep->base_addr);
kmp_depnode_t *last_out = info->last_out;
- kmp_depnode_list_t *last_ins = info->last_ins;
- kmp_depnode_list_t *last_mtxs = info->last_mtxs;
-
- if (dep->flags.out) { // out --> clean lists of ins and mtxs if any
- if (last_ins || last_mtxs) {
- if (info->last_flag == ENTRY_LAST_INS) { // INS were last
- npredecessors +=
- __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
- } else { // MTXS were last
- npredecessors +=
- __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
- }
- __kmp_depnode_list_free(thread, last_ins);
- __kmp_depnode_list_free(thread, last_mtxs);
- info->last_ins = NULL;
- info->last_mtxs = NULL;
+ kmp_depnode_list_t *last_set = info->last_set;
+ kmp_depnode_list_t *prev_set = info->prev_set;
+
+ if (dep->flags.out) { // out or inout --> clean lists if any
+ if (last_set) {
+ npredecessors +=
+ __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
+ __kmp_depnode_list_free(thread, last_set);
+ __kmp_depnode_list_free(thread, prev_set);
+ info->last_set = NULL;
+ info->prev_set = NULL;
+ info->last_flag = 0; // no sets in this dephash entry
} else {
npredecessors +=
__kmp_depnode_link_successor(gtid, thread, task, node, last_out);
}
__kmp_node_deref(thread, last_out);
- if (dep_barrier) {
+ if (!dep_barrier) {
+ info->last_out = __kmp_node_ref(node);
+ } else {
// if this is a sync point in the serial sequence, then the previous
// outputs are guaranteed to be completed after the execution of this
// task so the previous output nodes can be cleared.
info->last_out = NULL;
- } else {
- info->last_out = __kmp_node_ref(node);
}
- } else if (dep->flags.in) {
- // in --> link node to either last_out or last_mtxs, clean earlier deps
- if (last_mtxs) {
- npredecessors +=
- __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
- __kmp_node_deref(thread, last_out);
- info->last_out = NULL;
- if (info->last_flag == ENTRY_LAST_MTXS && last_ins) { // MTXS were last
- // clean old INS before creating new list
- __kmp_depnode_list_free(thread, last_ins);
- info->last_ins = NULL;
- }
- } else {
+ } else { // either IN or MTX or SET
+ if (info->last_flag == 0 || info->last_flag == dep->flag) {
+ // last_set either didn't exist or is of the same dep kind
// link node as successor of the last_out if any
npredecessors +=
__kmp_depnode_link_successor(gtid, thread, task, node, last_out);
- }
- info->last_flag = ENTRY_LAST_INS;
- info->last_ins = __kmp_add_node(thread, info->last_ins, node);
- } else {
- KMP_DEBUG_ASSERT(dep->flags.mtx == 1);
- // mtx --> link node to either last_out or last_ins, clean earlier deps
- if (last_ins) {
+ // link node as successor of all nodes in the prev_set if any
+ npredecessors +=
+ __kmp_depnode_link_successor(gtid, thread, task, node, prev_set);
+ } else { // last_set is of different dep kind, make it prev_set
+ // link node as successor of all nodes in the last_set
npredecessors +=
- __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
+ __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
+ // clean last_out if any
__kmp_node_deref(thread, last_out);
info->last_out = NULL;
- if (info->last_flag == ENTRY_LAST_INS && last_mtxs) { // INS were last
- // clean old MTXS before creating new list
- __kmp_depnode_list_free(thread, last_mtxs);
- info->last_mtxs = NULL;
- }
- } else {
- // link node as successor of the last_out if any
- npredecessors +=
- __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
- }
- info->last_flag = ENTRY_LAST_MTXS;
- info->last_mtxs = __kmp_add_node(thread, info->last_mtxs, node);
- if (info->mtx_lock == NULL) {
- info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
- __kmp_init_lock(info->mtx_lock);
+ // clean prev_set if any
+ __kmp_depnode_list_free(thread, prev_set);
+ // move last_set to prev_set, new last_set will be allocated
+ info->prev_set = last_set;
+ info->last_set = NULL;
}
- KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
- kmp_int32 m;
- // Save lock in node's array
- for (m = 0; m < MAX_MTX_DEPS; ++m) {
- // sort pointers in decreasing order to avoid potential livelock
- if (node->dn.mtx_locks[m] < info->mtx_lock) {
- KMP_DEBUG_ASSERT(node->dn.mtx_locks[node->dn.mtx_num_locks] == NULL);
- for (int n = node->dn.mtx_num_locks; n > m; --n) {
- // shift right all lesser non-NULL pointers
- KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
- node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
+ info->last_flag = dep->flag; // store dep kind of the last_set
+ info->last_set = __kmp_add_node(thread, info->last_set, node);
+
+ // check if we are processing MTX dependency
+ if (dep->flag == KMP_DEP_MTX) {
+ if (info->mtx_lock == NULL) {
+ info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
+ __kmp_init_lock(info->mtx_lock);
+ }
+ KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
+ kmp_int32 m;
+ // Save lock in node's array
+ for (m = 0; m < MAX_MTX_DEPS; ++m) {
+ // sort pointers in decreasing order to avoid potential livelock
+ if (node->dn.mtx_locks[m] < info->mtx_lock) {
+ KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]);
+ for (int n = node->dn.mtx_num_locks; n > m; --n) {
+ // shift right all lesser non-NULL pointers
+ KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
+ node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
+ }
+ node->dn.mtx_locks[m] = info->mtx_lock;
+ break;
}
- node->dn.mtx_locks[m] = info->mtx_lock;
- break;
}
+ KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
+ node->dn.mtx_num_locks++;
}
- KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
- node->dn.mtx_num_locks++;
}
}
KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
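The rewrite collapses the old last_ins/last_mtxs pair into a single last_set/prev_set keyed by the dep kind stored in last_flag, which is what lets the new OpenMP 5.1 inoutset kind reuse the same machinery as in and mutexinoutset. At the source level the kinds look like this (illustrative user code; inoutset needs an OpenMP 5.1 compiler):

#include <cstdio>

int main() {
  int x = 0;
#pragma omp parallel
#pragma omp single
  {
    // OUT/INOUT order against everything; IN tasks may run concurrently
    // with each other; MTX (mutexinoutset) tasks are mutually exclusive
    // but unordered; SET (inoutset) tasks commute with one another.
#pragma omp task depend(out : x)
    x = 1;
#pragma omp task depend(inoutset : x)
    x += 2;
#pragma omp task depend(inoutset : x)
    x += 3;
#pragma omp task depend(in : x)
    printf("x=%d\n", x); // 6 in every legal execution
  }
  return 0;
}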
@@ -424,8 +406,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
#if KMP_DEBUG
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
#endif
- KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d "
- "possibly aliased dependencies, %d non-aliased dependencies : "
+ KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d "
+ "possibly aliased dependences, %d non-aliased dependences : "
"dep_barrier=%d .\n",
gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));
@@ -433,27 +415,25 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
// TODO: Different algorithm for large dep_list ( > 10 ? )
for (i = 0; i < ndeps; i++) {
if (dep_list[i].base_addr != 0) {
+ KMP_DEBUG_ASSERT(
+ dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT ||
+ dep_list[i].flag == KMP_DEP_INOUT ||
+ dep_list[i].flag == KMP_DEP_MTX || dep_list[i].flag == KMP_DEP_SET);
for (int j = i + 1; j < ndeps; j++) {
if (dep_list[i].base_addr == dep_list[j].base_addr) {
- dep_list[i].flags.in |= dep_list[j].flags.in;
- dep_list[i].flags.out |=
- (dep_list[j].flags.out ||
- (dep_list[i].flags.in && dep_list[j].flags.mtx) ||
- (dep_list[i].flags.mtx && dep_list[j].flags.in));
- dep_list[i].flags.mtx =
- dep_list[i].flags.mtx | dep_list[j].flags.mtx &&
- !dep_list[i].flags.out;
+ if (dep_list[i].flag != dep_list[j].flag) {
+ // two different dependences on the same address work identically to OUT
+ dep_list[i].flag = KMP_DEP_OUT;
+ }
dep_list[j].base_addr = 0; // Mark j element as void
}
}
- if (dep_list[i].flags.mtx) {
+ if (dep_list[i].flag == KMP_DEP_MTX) {
// limit number of mtx deps to MAX_MTX_DEPS per node
if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
++n_mtxs;
} else {
- dep_list[i].flags.in = 1; // downgrade mutexinoutset to inout
- dep_list[i].flags.out = 1;
- dep_list[i].flags.mtx = 0;
+ dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout
}
}
}
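When one task names the same address twice with different kinds, the merged dependence is conservatively treated as OUT, and a mutexinoutset beyond the MAX_MTX_DEPS limit is downgraded the same way. A compact model of the merge rule (flag values illustrative):

#include <cstdio>

enum dep_kind { DEP_IN = 1, DEP_OUT = 2, DEP_MTX = 4, DEP_SET = 8 };

// Mirror of the merge in __kmp_check_deps: two different kinds on the
// same address behave like OUT.
dep_kind merge(dep_kind a, dep_kind b) { return a == b ? a : DEP_OUT; }

int main() {
  printf("%d\n", merge(DEP_IN, DEP_MTX)); // 2 == DEP_OUT
}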
@@ -462,7 +442,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
// doesn't need to be atomic as no other thread is going to be accessing this
// node just yet.
// npredecessors is set -1 to ensure that none of the releasing tasks queues
- // this task before we have finished processing all the dependencies
+ // this task before we have finished processing all the dependences
node->dn.npredecessors = -1;
// used to pack all npredecessors additions into a single atomic operation at
@@ -537,13 +517,13 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
}
- new_taskdata->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ new_taskdata->ompt_task_info.frame.enter_frame.ptr =
+ OMPT_GET_FRAME_ADDRESS(0);
}
#if OMPT_OPTIONAL
/* OMPT grab all dependences if requested by the tool */
- if (ndeps + ndeps_noalias > 0 &&
- ompt_enabled.ompt_callback_dependences) {
+ if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
kmp_int32 i;
int ompt_ndeps = ndeps + ndeps_noalias;
@@ -562,6 +542,8 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
ompt_deps[i].dependence_type = ompt_dependence_type_in;
else if (dep_list[i].flags.mtx)
ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
+ else if (dep_list[i].flags.set)
+ ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
}
for (i = 0; i < ndeps_noalias; i++) {
ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
@@ -574,10 +556,12 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
else if (noalias_dep_list[i].flags.mtx)
ompt_deps[ndeps + i].dependence_type =
ompt_dependence_type_mutexinoutset;
+ else if (noalias_dep_list[i].flags.set)
+ ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
}
ompt_callbacks.ompt_callback(ompt_callback_dependences)(
&(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps);
- /* We can now free the allocated memory for the dependencies */
+ /* We can now free the allocated memory for the dependences */
/* For OMPD we might want to delay the free until end of this function */
KMP_OMPT_DEPS_FREE(thread, ompt_deps);
}
@@ -593,7 +577,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
task_team->tt.tt_hidden_helper_task_encountered));
if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
- /* if no dependencies have been tracked yet, create the dependence hash */
+ /* if no dependences have been tracked yet, create the dependence hash */
if (current_task->td_dephash == NULL)
current_task->td_dephash = __kmp_dephash_create(thread, current_task);
@@ -612,7 +596,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
noalias_dep_list)) {
KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking "
- "dependencies: "
+ "dependences: "
"loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
@@ -623,14 +607,13 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
return TASK_CURRENT_NOT_QUEUED;
}
} else {
- KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies "
- "for task (serialized)"
- "loc=%p task=%p\n",
+ KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependences "
+ "for task (serialized) loc=%p task=%p\n",
gtid, loc_ref, new_taskdata));
}
KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking "
- "dependencies : "
+ "dependences : "
"loc=%p task=%p, transferring to __kmp_omp_task\n",
gtid, loc_ref, new_taskdata));
@@ -648,11 +631,7 @@ void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task,
ompt_data_t *taskwait_task_data) {
if (ompt_enabled.ompt_callback_task_schedule) {
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
- &(current_task->ompt_task_info.task_data), ompt_task_switch,
- taskwait_task_data);
- ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
- taskwait_task_data, ompt_task_complete,
- &(current_task->ompt_task_info.task_data));
+ taskwait_task_data, ompt_taskwait_complete, NULL);
}
current_task->ompt_task_info.frame.enter_frame.ptr = NULL;
*taskwait_task_data = ompt_data_none;
@@ -668,7 +647,7 @@ void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task,
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing
-Blocks the current task until all specifies dependencies have been fulfilled.
+Blocks the current task until all specified dependences have been fulfilled.
*/
void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
@@ -676,7 +655,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref));
if (ndeps == 0 && ndeps_noalias == 0) {
- KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to "
+ KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependences to "
"wait upon : loc=%p\n",
gtid, loc_ref));
return;
@@ -701,7 +680,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(current_task->ompt_task_info.task_data),
&(current_task->ompt_task_info.frame), taskwait_task_data,
- ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1,
+ ompt_task_taskwait | ompt_task_undeferred | ompt_task_mergeable, 1,
OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
}
}
@@ -728,6 +707,8 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
else if (dep_list[i].flags.mtx)
ompt_deps[ndeps + i].dependence_type =
ompt_dependence_type_mutexinoutset;
+ else if (dep_list[i].flags.set)
+ ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
}
for (i = 0; i < ndeps_noalias; i++) {
ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
@@ -740,10 +721,12 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
else if (noalias_dep_list[i].flags.mtx)
ompt_deps[ndeps + i].dependence_type =
ompt_dependence_type_mutexinoutset;
+ else if (noalias_dep_list[i].flags.set)
+ ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
}
ompt_callbacks.ompt_callback(ompt_callback_dependences)(
taskwait_task_data, ompt_deps, ompt_ndeps);
- /* We can now free the allocated memory for the dependencies */
+ /* We can now free the allocated memory for the dependences */
/* For OMPD we might want to delay the free until end of this function */
KMP_OMPT_DEPS_FREE(thread, ompt_deps);
ompt_deps = NULL;
@@ -763,7 +746,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
if (ignore) {
KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
- "dependencies : loc=%p\n",
+ "dependences : loc=%p\n",
gtid, loc_ref));
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
@@ -780,7 +763,7 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
noalias_dep_list)) {
KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
- "dependencies : loc=%p\n",
+ "dependences : loc=%p\n",
gtid, loc_ref));
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index a1ddf3638433..d1576dd5b791 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -2,7 +2,6 @@
* kmp_taskdeps.h
*/
-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -11,7 +10,6 @@
//
//===----------------------------------------------------------------------===//
-
#ifndef KMP_TASKDEPS_H
#define KMP_TASKDEPS_H
@@ -25,6 +23,8 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
return;
kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
+ // TODO: temporarily disable assertion until the bug with dependences is fixed
+ // KMP_DEBUG_ASSERT(n >= 0);
if (n == 0) {
KMP_ASSERT(node->dn.nrefs == 0);
#if USE_FAST_MEMORY
@@ -58,8 +58,8 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
kmp_dephash_entry_t *next;
for (kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next) {
next = entry->next_in_bucket;
- __kmp_depnode_list_free(thread, entry->last_ins);
- __kmp_depnode_list_free(thread, entry->last_mtxs);
+ __kmp_depnode_list_free(thread, entry->last_set);
+ __kmp_depnode_list_free(thread, entry->prev_set);
__kmp_node_deref(thread, entry->last_out);
if (entry->mtx_lock) {
__kmp_destroy_lock(entry->mtx_lock);
@@ -85,6 +85,8 @@ static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
#endif
}
+extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start);
+
static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
kmp_info_t *thread = __kmp_threads[gtid];
kmp_depnode_t *node = task->td_depnode;
@@ -143,7 +145,9 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
// encountering thread's queue; otherwise, it can be pushed to its own
// queue.
if (!next_taskdata->td_flags.hidden_helper) {
- __kmp_omp_task(task->encountering_gtid, successor->dn.task, false);
+ __kmpc_give_task(
+ successor->dn.task,
+ __kmp_tid_from_gtid(next_taskdata->encountering_gtid));
} else {
__kmp_omp_task(gtid, successor->dn.task, false);
}
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 3d7021128dbd..6c3e2c95cb5a 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -21,8 +21,6 @@
#include "ompt-specific.h"
#endif
-#include "tsan_annotations.h"
-
/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
kmp_info_t *this_thr);
@@ -326,7 +324,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
- if (taskdata->td_flags.hidden_helper) {
+ // We don't need to map to the shadow gtid if this is already a hidden helper thread
+ if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
thread = __kmp_threads[gtid];
}
@@ -435,10 +434,12 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
gtid, taskdata, thread_data->td.td_deque_ntasks,
thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+ auto hidden_helper = taskdata->td_flags.hidden_helper;
+
__kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
// Signal one worker thread to execute the task
- if (taskdata->td_flags.hidden_helper) {
+ if (UNLIKELY(hidden_helper)) {
// Wake hidden helper threads up if they're sleeping
__kmp_hidden_helper_worker_thread_signal();
}
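
This hunk also caches td_flags.hidden_helper into a local before the deque lock is dropped, since the enqueued task may be stolen and freed once the lock is released, and then wraps the rare hidden-helper case in UNLIKELY. Several later hunks apply the same branch hint to untied, destructor-thunk, detachable, and proxy paths. A sketch of the usual __builtin_expect idiom behind such a macro (illustrative; not necessarily the runtime's exact definition):

    #if defined(__GNUC__) || defined(__clang__)
    #define UNLIKELY(x) __builtin_expect(!!(x), 0) // condition is rarely true
    #else
    #define UNLIKELY(x) (x) // no-op fallback for other compilers
    #endif

    // The compiler moves the cold branch out of line so the common case
    // (an ordinary task, no helper wakeup) stays on the fall-through path.
    void maybe_signal(bool hidden_helper) {
      if (UNLIKELY(hidden_helper)) {
        /* rare slow path: wake the hidden helper threads */
      }
    }
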
@@ -564,8 +565,10 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
task->ompt_task_info.task_data.value = 0;
task->ompt_task_info.frame.exit_frame = ompt_data_none;
task->ompt_task_info.frame.enter_frame = ompt_data_none;
- task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
- task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
+ task->ompt_task_info.frame.exit_frame_flags =
+ ompt_frame_runtime | ompt_frame_framepointer;
+ task->ompt_task_info.frame.enter_frame_flags =
+ ompt_frame_runtime | ompt_frame_framepointer;
}
// __ompt_task_start:
@@ -620,7 +623,7 @@ static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
"current_task=%p\n",
gtid, loc_ref, taskdata, current_task));
- if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+ if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
// untied task needs to increment counter so that the task structure is not
// freed prematurely
kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
@@ -640,7 +643,8 @@ static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
current_task->ompt_task_info.frame.enter_frame.ptr =
taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
current_task->ompt_task_info.frame.enter_frame_flags =
- taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
+ taskdata->ompt_task_info.frame.exit_frame_flags =
+ ompt_frame_application | ompt_frame_framepointer;
}
if (ompt_enabled.ompt_callback_task_create) {
ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
@@ -728,7 +732,6 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
taskdata->td_flags.freed = 1;
- ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
__kmp_fast_free(thread, taskdata);
@@ -883,7 +886,7 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
hence overlapping the destructor invocations with some other work in the
released tasks. The OpenMP spec is not specific on when the destructors
are invoked, so we should be free to choose. */
- if (taskdata->td_flags.destructors_thunk) {
+ if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
kmp_routine_entry_t destr_thunk = task->data1.destructors;
KMP_ASSERT(destr_thunk);
destr_thunk(gtid, task);
@@ -894,7 +897,7 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
bool detach = false;
- if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
+ if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
if (taskdata->td_allow_completion_event.type ==
KMP_EVENT_ALLOW_COMPLETION) {
// event hasn't been fulfilled yet. Try to detach task.
@@ -937,16 +940,17 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
taskdata->td_flags.detachable == TASK_DETACHABLE ||
taskdata->td_flags.hidden_helper) {
+ __kmp_release_deps(gtid, taskdata);
// Predecrement simulated by "- 1" calculation
children =
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
KMP_DEBUG_ASSERT(children >= 0);
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
- __kmp_release_deps(gtid, taskdata);
- } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
- // if we found proxy tasks there could exist a dependency chain
- // with the proxy task as origin
+ } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
+ task_team->tt.tt_hidden_helper_task_encountered)) {
+ // if we found proxy or hidden helper tasks there could exist a dependency
+ // chain with the proxy task as origin
__kmp_release_deps(gtid, taskdata);
}
// td_flags.executing must be marked as 0 after __kmp_release_deps has been
@@ -957,7 +961,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
taskdata->td_flags.executing = 0; // suspend the finishing task
}
-
KA_TRACE(
20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
gtid, taskdata, children));
@@ -987,7 +990,7 @@ static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
kmp_task_t *task) {
KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
- __kmp_assert_valid_gtid(gtid);
+ KMP_DEBUG_ASSERT(gtid >= 0);
// this routine will provide task to resume
__kmp_task_finish<ompt>(gtid, task, NULL);
@@ -999,7 +1002,8 @@ static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
ompt_frame_t *ompt_frame;
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
ompt_frame->enter_frame = ompt_data_none;
- ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
+ ompt_frame->enter_frame_flags =
+ ompt_frame_runtime | ompt_frame_framepointer;
}
#endif
@@ -1217,6 +1221,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
sizeof_shareds, task_entry));
+ KMP_DEBUG_ASSERT(parent_task);
if (parent_task->td_flags.final) {
if (flags->merged_if0) {
}
@@ -1234,8 +1239,8 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
// Detachable tasks are not proxy tasks yet but could be in the future. Doing
// the tasking setup
// when that happens is too late.
- if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
- flags->hidden_helper) {
+ if (UNLIKELY(flags->proxy == TASK_PROXY ||
+ flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
if (flags->proxy == TASK_PROXY) {
flags->tiedness = TASK_UNTIED;
flags->merged_if0 = 1;
@@ -1271,7 +1276,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
}
}
- if (flags->proxy == TASK_PROXY &&
+ if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
task_team->tt.tt_found_proxy_tasks == FALSE)
TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
if (flags->hidden_helper &&
@@ -1298,7 +1303,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
encountering_thread, shareds_offset + sizeof_shareds);
#endif /* USE_FAST_MEMORY */
- ANNOTATE_HAPPENS_AFTER(taskdata);
task = KMP_TASKDATA_TO_TASK(taskdata);
@@ -1337,13 +1341,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
if (flags->proxy == TASK_FULL)
copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
- taskdata->td_flags.tiedness = flags->tiedness;
- taskdata->td_flags.final = flags->final;
- taskdata->td_flags.merged_if0 = flags->merged_if0;
- taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
- taskdata->td_flags.proxy = flags->proxy;
- taskdata->td_flags.detachable = flags->detachable;
- taskdata->td_flags.hidden_helper = flags->hidden_helper;
+ taskdata->td_flags = *flags;
taskdata->encountering_gtid = gtid;
taskdata->td_task_team = thread->th.th_task_team;
taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
@@ -1368,8 +1366,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_flags.complete = 0;
taskdata->td_flags.freed = 0;
- taskdata->td_flags.native = flags->native;
-
KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
// start at one because counts current task and children
KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
@@ -1399,17 +1395,15 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
}
- }
-
- if (flags->hidden_helper) {
- taskdata->td_flags.task_serial = FALSE;
- // Increment the number of hidden helper tasks to be executed
- KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
+ if (flags->hidden_helper) {
+ taskdata->td_flags.task_serial = FALSE;
+ // Increment the number of hidden helper tasks to be executed
+ KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
+ }
}
KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
gtid, taskdata, taskdata->td_parent));
- ANNOTATE_HAPPENS_BEFORE(task);
return task;
}
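
Above, seven field-by-field copies collapse into the single struct assignment taskdata->td_flags = *flags, which is also why the separate native-flag copy disappears a few hunks later: plain assignment copies every bitfield member at once, including any added in the future. A minimal illustration with a hypothetical flags struct standing in for kmp_tasking_flags_t:

    #include <cassert>

    // Hypothetical stand-in for kmp_tasking_flags_t: packed bitfields.
    struct task_flags {
      unsigned tiedness : 1;
      unsigned final : 1;
      unsigned proxy : 1;
      unsigned native : 1;
    };

    int main() {
      task_flags in{}; // zero-initialize
      in.tiedness = 1;
      in.native = 1;
      task_flags td = in; // one assignment copies all bitfields at once
      assert(td.tiedness == 1 && td.native == 1 && td.proxy == 0);
      return 0;
    }
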
@@ -1422,7 +1416,7 @@ kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
__kmp_assert_valid_gtid(gtid);
input_flags->native = FALSE;
-// __kmp_task_alloc() sets up all other runtime flags
+ // __kmp_task_alloc() sets up all other runtime flags
KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
@@ -1530,7 +1524,6 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
// Proxy tasks are not handled by the runtime
if (taskdata->td_flags.proxy != TASK_PROXY) {
- ANNOTATE_HAPPENS_AFTER(task);
__kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
}
@@ -1598,6 +1591,11 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
__ompt_task_start(task, current_task, gtid);
#endif
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_task_begin();
+#endif
+
#if USE_ITT_BUILD && USE_ITT_NOTIFY
kmp_uint64 cur_time;
kmp_int32 kmp_itt_count_task =
@@ -1632,12 +1630,15 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
-
}
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_task_end();
+#endif
+
// Proxy tasks are not handled by the runtime
if (taskdata->td_flags.proxy != TASK_PROXY) {
- ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
thread->th.ompt_thread_info = oldInfo;
@@ -1679,10 +1680,8 @@ kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
if (UNLIKELY(ompt_enabled.enabled)) {
parent = new_taskdata->td_parent;
if (ompt_enabled.ompt_callback_task_create) {
- ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
- parent ? &(parent->ompt_task_info.task_data) : &task_data,
- parent ? &(parent->ompt_task_info.frame) : NULL,
+ &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
OMPT_GET_RETURN_ADDRESS(0));
}
@@ -1705,7 +1704,6 @@ kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
"loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
gtid, loc_ref, new_taskdata));
- ANNOTATE_HAPPENS_BEFORE(new_task);
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
parent->ompt_task_info.frame.enter_frame = ompt_data_none;
@@ -1740,7 +1738,6 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
__kmp_invoke_task(gtid, new_task, current_task);
}
- ANNOTATE_HAPPENS_BEFORE(new_task);
return TASK_CURRENT_NOT_QUEUED;
}
@@ -1775,13 +1772,13 @@ kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
OMPT_STORE_RETURN_ADDRESS(gtid);
parent = new_taskdata->td_parent;
if (!parent->ompt_task_info.frame.enter_frame.ptr) {
- parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+ parent->ompt_task_info.frame.enter_frame.ptr =
+ OMPT_GET_FRAME_ADDRESS(0);
}
if (ompt_enabled.ompt_callback_task_create) {
- ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
- parent ? &(parent->ompt_task_info.task_data) : &task_data,
- parent ? &(parent->ompt_task_info.frame) : NULL,
+ &(parent->ompt_task_info.task_data),
+ &(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_LOAD_RETURN_ADDRESS(gtid));
@@ -1841,10 +1838,8 @@ kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
if (!parent->ompt_task_info.frame.enter_frame.ptr)
parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
if (ompt_enabled.ompt_callback_task_create) {
- ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
- parent ? &(parent->ompt_task_info.task_data) : &task_data,
- parent ? &(parent->ompt_task_info.frame) : NULL,
+ &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
codeptr_ra);
@@ -1869,13 +1864,13 @@ template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
void *frame_address,
void *return_address) {
- kmp_taskdata_t *taskdata;
+ kmp_taskdata_t *taskdata = nullptr;
kmp_info_t *thread;
int thread_finished = FALSE;
KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
- __kmp_assert_valid_gtid(gtid);
+ KMP_DEBUG_ASSERT(gtid >= 0);
if (__kmp_tasking_mode != tskm_immediate_exec) {
thread = __kmp_threads[gtid];
@@ -1915,9 +1910,10 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
- void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+ void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+ KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
bool must_wait =
@@ -1943,8 +1939,7 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
}
}
#if USE_ITT_BUILD
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+ KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */
@@ -1968,7 +1963,6 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
- ANNOTATE_HAPPENS_AFTER(taskdata);
}
KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
@@ -2003,7 +1997,7 @@ kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
- kmp_taskdata_t *taskdata;
+ kmp_taskdata_t *taskdata = NULL;
kmp_info_t *thread;
int thread_finished = FALSE;
@@ -2028,9 +2022,10 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
- void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+ void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+ KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
if (!taskdata->td_flags.team_serial) {
kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -2052,8 +2047,7 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
}
}
#if USE_ITT_BUILD
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+ KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
#endif /* USE_ITT_BUILD */
// Debugger: The taskwait is completed. Location remains, but thread is
@@ -2497,6 +2491,7 @@ void __kmpc_taskgroup(ident_t *loc, int gtid) {
tg_new->parent = taskdata->td_taskgroup;
tg_new->reduce_data = NULL;
tg_new->reduce_num_data = 0;
+ tg_new->gomp_data = NULL;
taskdata->td_taskgroup = tg_new;
#if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -2529,7 +2524,7 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
kmp_team_t *team;
ompt_data_t my_task_data;
ompt_data_t my_parallel_data;
- void *codeptr;
+ void *codeptr = nullptr;
if (UNLIKELY(ompt_enabled.enabled)) {
team = thread->th.th_team;
my_task_data = taskdata->ompt_task_info.task_data;
@@ -2553,9 +2548,10 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
#if USE_ITT_BUILD
// For ITT the taskgroup wait is similar to taskwait until we need to
// distinguish them
- void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+ void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+ KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
+#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -2568,7 +2564,8 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
if (!taskdata->td_flags.team_serial ||
(thread->th.th_task_team != NULL &&
- thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
+ (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
+ thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
kmp_flag_32<false, false> flag(
RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
@@ -2588,14 +2585,14 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
#endif
#if USE_ITT_BUILD
- if (UNLIKELY(itt_sync_obj != NULL))
- __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+ KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
}
KMP_DEBUG_ASSERT(taskgroup->count == 0);
- if (taskgroup->reduce_data != NULL) { // need to reduce?
+ if (taskgroup->reduce_data != NULL &&
+ !taskgroup->gomp_data) { // need to reduce?
int cnt;
void *reduce_data;
kmp_team_t *t = thread->th.th_team;
@@ -2649,7 +2646,6 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
gtid, taskdata));
- ANNOTATE_HAPPENS_AFTER(taskdata);
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
@@ -2835,7 +2831,7 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
if (*thread_finished) {
// We need to un-mark this victim as a finished victim. This must be done
// before releasing the lock, or else other threads (starting with the
- // master victim) might be prematurely released from the barrier!!!
+ // primary thread victim) might be prematurely released from the barrier!!!
kmp_int32 count;
count = KMP_ATOMIC_INC(unfinished_threads);
@@ -3047,7 +3043,7 @@ static inline int __kmp_execute_tasks_template(
}
// It is now unsafe to reference thread->th.th_team !!!
- // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+ // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
// thread to pass through the barrier, where it might reset each thread's
// th.th_team field for the next parallel region. If we can steal more
// work, we know that this has not happened yet.
@@ -3060,8 +3056,8 @@ static inline int __kmp_execute_tasks_template(
}
}
- // If this thread's task team is NULL, master has recognized that there are
- // no more tasks; bail out
+ // If this thread's task team is NULL, primary thread has recognized that
+ // there are no more tasks; bail out
if (thread->th.th_task_team == NULL) {
KA_TRACE(15,
("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
@@ -3201,7 +3197,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
* After a child thread checks into a barrier and calls __kmp_release() from
* the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
* longer assume that the kmp_team_t structure is intact (at any moment, the
- * master thread may exit the barrier code and free the team data structure,
+ * primary thread may exit the barrier code and free the team data structure,
* and return the threads to the thread pool).
*
* This does not work with the tasking code, as the thread is still
@@ -3210,11 +3206,11 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
* to each thread in the team, so that it can steal work from it.
*
* Enter the existence of the kmp_task_team_t struct. It employs a reference
- * counting mechanism, and is allocated by the master thread before calling
+ * counting mechanism, and is allocated by the primary thread before calling
* __kmp_<barrier_kind>_release, and then is release by the last thread to
* exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
* of the kmp_task_team_t structs for consecutive barriers can overlap
- * (and will, unless the master thread is the last thread to exit the barrier
+ * (and will, unless the primary thread is the last thread to exit the barrier
* release phase, which is not typical). The existence of such a struct is
* useful outside the context of tasking.
*
@@ -3341,7 +3337,7 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
__kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
}
#endif // BUILD_TIED_TASK_STACK
- // Install the new data and free the old data
+ // Install the new data and free the old data
(*threads_data_p) = new_data;
__kmp_free(old_data);
} else {
@@ -3351,10 +3347,8 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
// Make the initial allocate for threads_data array, and zero entries
// Cannot use __kmp_thread_calloc() because threads not around for
// kmp_reap_task_team( ).
- ANNOTATE_IGNORE_WRITES_BEGIN();
*threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
nthreads * sizeof(kmp_thread_data_t));
- ANNOTATE_IGNORE_WRITES_END();
#ifdef BUILD_TIED_TASK_STACK
// GEH: Figure out if this is the right thing to do
for (i = 0; i < nthreads; i++) {
@@ -3586,11 +3580,10 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
(always || team->t.t_nproc > 1)) {
team->t.t_task_team[this_thr->th.th_task_state] =
__kmp_allocate_task_team(this_thr, team);
- KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
- "for team %d at parity=%d\n",
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+ " for team %d at parity=%d\n",
__kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[this_thr->th.th_task_state],
- ((team != NULL) ? team->t.t_id : -1),
+ team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
this_thr->th.th_task_state));
}
@@ -3599,18 +3592,18 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
// threads spin in the barrier release phase, they will continue to use the
// previous task_team struct(above), until they receive the signal to stop
// checking for tasks (they can't safely reference the kmp_team_t struct,
- // which could be reallocated by the master thread). No task teams are formed
+ // which could be reallocated by the primary thread). No task teams are formed
// for serialized teams.
if (team->t.t_nproc > 1) {
int other_team = 1 - this_thr->th.th_task_state;
+ KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
team->t.t_task_team[other_team] =
__kmp_allocate_task_team(this_thr, team);
- KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
"task_team %p for team %d at parity=%d\n",
__kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team],
- ((team != NULL) ? team->t.t_id : -1), other_team));
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
} else { // Leave the old task team struct in place for the upcoming region;
// adjust as needed
kmp_task_team_t *task_team = team->t.t_task_team[other_team];
@@ -3625,11 +3618,10 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
}
// if team size has changed, the first thread to enable tasking will
// realloc threads_data if necessary
- KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
"%p for team %d at parity=%d\n",
__kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team],
- ((team != NULL) ? team->t.t_id : -1), other_team));
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
}
}
@@ -3672,15 +3664,15 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
"%p from Team #%d (parity=%d)\n",
__kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
- ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
+ team->t.t_id, this_thr->th.th_task_state));
}
-// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
-// barrier gather phase. Only called by master thread if #threads in team > 1 or
-// if proxy tasks were created.
+// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
+// barrier gather phase. Only called by primary thread if #threads in team > 1
+// or if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
-// by passing in 0 optionally as the last argument. When wait is zero, master
+// by passing in 0 optionally as the last argument. When wait is zero, primary
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
kmp_info_t *this_thr,
@@ -3692,12 +3684,12 @@ void __kmp_task_team_wait(
if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
if (wait) {
- KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
+ KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
"(for unfinished_threads to reach 0) on task_team = %p\n",
__kmp_gtid_from_thread(this_thr), task_team));
// Worker threads may have dropped through to release phase, but could
// still be executing tasks. Wait here for tasks to complete. To avoid
- // memory contention, only master thread checks termination condition.
+ // memory contention, only primary thread checks termination condition.
kmp_flag_32<false, false> flag(
RCAST(std::atomic<kmp_uint32> *,
&task_team->tt.tt_unfinished_threads),
@@ -3708,7 +3700,7 @@ void __kmp_task_team_wait(
// referencing it while spinning.
KA_TRACE(
20,
- ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
+ ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
"setting active to false, setting local and team's pointer to NULL\n",
__kmp_gtid_from_thread(this_thr), task_team));
KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
@@ -3842,6 +3834,7 @@ release_and_exit:
return result;
}
+#define PROXY_TASK_FLAG 0x40000000
/* The finish of the proxy tasks is divided in two pieces:
- the top half is the one that can be done from a thread outside the team
- the bottom half must be run from a thread within the team
@@ -3871,7 +3864,7 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
// Create an imaginary child for this task so the bottom half cannot
// release the task before we have completed the second top half
- KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
+ KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
}
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
@@ -3883,7 +3876,7 @@ static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
KMP_DEBUG_ASSERT(children >= 0);
// Remove the imaginary children
- KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
+ KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
}
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
@@ -3896,7 +3889,8 @@ static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
// We need to wait to make sure the top half is finished
// Spinning here should be ok as this should happen quickly
- while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
+ while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
+ PROXY_TASK_FLAG) > 0)
;
__kmp_release_deps(gtid, taskdata);
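
Rather than faking an extra child by incrementing td_incomplete_child_tasks, the proxy protocol now reserves one high bit (PROXY_TASK_FLAG = 0x40000000): the first top half ORs it in, the second top half ANDs it out, and the bottom half spins only on that bit, leaving real child counts in the low bits untouched. The same pattern in isolation with std::atomic, assuming the reserved bit can never collide with a legitimate count:

    #include <atomic>

    constexpr int PROXY_TASK_FLAG = 0x40000000; // reserved bit, never a count

    std::atomic<int> incomplete_children{0};

    void first_top_half()  { incomplete_children.fetch_or(PROXY_TASK_FLAG); }
    void second_top_half() { incomplete_children.fetch_and(~PROXY_TASK_FLAG); }

    void bottom_half_wait() {
      // Spin until the reserved bit clears; real children in the low bits
      // are ignored here, unlike the old "> 0" test on the whole counter.
      while (incomplete_children.load(std::memory_order_acquire) &
             PROXY_TASK_FLAG)
        ;
    }
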
@@ -3929,26 +3923,10 @@ void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
gtid, taskdata));
}
-/*!
-@ingroup TASKING
-@param ptask Task which execution is completed
-
-Execute the completion of a proxy task from a thread that could not belong to
-the team.
-*/
-void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
+void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
KMP_DEBUG_ASSERT(ptask != NULL);
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
- KA_TRACE(
- 10,
- ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
- taskdata));
-
- KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
-
- __kmp_first_top_half_finish_proxy(taskdata);
-
// Enqueue task to complete bottom half completion from a thread within the
// corresponding team
kmp_team_t *team = taskdata->td_team;
@@ -3957,7 +3935,7 @@ void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
// This should be similar to start_k = __kmp_get_random( thread ) % nthreads
// but we cannot use __kmp_get_random here
- kmp_int32 start_k = 0;
+ kmp_int32 start_k = start;
kmp_int32 pass = 1;
kmp_int32 k = start_k;
@@ -3971,6 +3949,29 @@ void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
pass = pass << 1;
} while (!__kmp_give_task(thread, k, ptask, pass));
+}
+
+/*!
+@ingroup TASKING
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that need not belong to
+the team.
+*/
+void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
+ KMP_DEBUG_ASSERT(ptask != NULL);
+ kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
+
+ KA_TRACE(
+ 10,
+ ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
+ taskdata));
+
+ KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
+
+ __kmp_first_top_half_finish_proxy(taskdata);
+
+ __kmpc_give_task(ptask);
__kmp_second_top_half_finish_proxy(taskdata);
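
The refactor extracts the placement loop into __kmpc_give_task so __kmp_release_deps can hand a hidden-helper successor to its encountering thread's slot, while __kmpc_proxy_task_completed_ooo keeps its old behavior by starting at 0. The loop walks the team round-robin from start_k and doubles pass after each full sweep; pass loosens a deque-size threshold inside __kmp_give_task, so each failed sweep relaxes the acceptance criterion. A sketch of that shape, with give_to as a hypothetical stand-in for __kmp_give_task:

    #include <vector>

    static std::vector<int> deque_len(8, 0); // per-thread queue lengths (illustrative)

    // Hypothetical acceptance test standing in for __kmp_give_task:
    // accept the task if thread k's deque is below the relaxed bound.
    static bool give_to(int k, int pass) { return deque_len[k] < pass * 4; }

    static int give_task_round_robin(int start_k, int nthreads) {
      int pass = 1;
      int k = start_k;
      do {
        k = (k + 1) % nthreads; // next candidate, wrapping around the team
        if (k == start_k)
          pass <<= 1;           // full sweep failed: relax the criterion
      } while (!give_to(k, pass));
      return k; // thread that accepted the task
    }
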
@@ -4157,6 +4158,7 @@ public:
}
}
#else
+ (void)taskdata;
retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
return retval;
@@ -4260,8 +4262,8 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
kmp_task_t *next_task;
kmp_int32 lastpriv = 0;
- KMP_DEBUG_ASSERT(
- tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras));
+ KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
+ (last_chunk < 0 ? last_chunk : extras));
KMP_DEBUG_ASSERT(num_tasks > extras);
KMP_DEBUG_ASSERT(num_tasks > 0);
KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
@@ -4321,7 +4323,7 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
__kmp_omp_taskloop_task(NULL, gtid, next_task,
- codeptr_ra); // schedule new task
+ codeptr_ra); // schedule new task
#else
__kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
@@ -4457,8 +4459,8 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
size_t upper_offset =
(char *)ub - (char *)task; // remember offset of ub in the task structure
- KMP_DEBUG_ASSERT(
- tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras));
+ KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
+ (last_chunk < 0 ? last_chunk : extras));
KMP_DEBUG_ASSERT(num_tasks > extras);
KMP_DEBUG_ASSERT(num_tasks > 0);
@@ -4652,8 +4654,8 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
KMP_ASSERT2(0, "unknown scheduling of taskloop");
}
- KMP_DEBUG_ASSERT(
- tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras));
+ KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
+ (last_chunk < 0 ? last_chunk : extras));
KMP_DEBUG_ASSERT(num_tasks > extras);
KMP_DEBUG_ASSERT(num_tasks > 0);
// =========================================================================
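
The reflowed assertion, repeated in all three taskloop paths, pins down the decomposition arithmetic: tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras). In the ordinary split the remainder is distributed as extras = tc % grainsize; with a strict grainsize the final chunk instead comes up short, which the runtime records as a negative last_chunk. A worked check of both cases under that reading:

    #include <cassert>

    int main() {
      // Ordinary split: tc = 10, grainsize = 3 -> 3 tasks plus 1 extra.
      long long tc = 10, grainsize = 3;
      long long num_tasks = tc / grainsize; // 3
      long long extras = tc % grainsize;    // 1
      assert(tc == num_tasks * grainsize + extras);

      // Strict grainsize: 4 chunks of 3 would overshoot by 2, so the last
      // chunk is short and last_chunk = tc - num_tasks * grainsize < 0.
      num_tasks = (tc + grainsize - 1) / grainsize;      // 4
      long long last_chunk = tc - num_tasks * grainsize; // -2
      assert(tc == num_tasks * grainsize + last_chunk);
      return 0;
    }
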
diff --git a/openmp/runtime/src/kmp_threadprivate.cpp b/openmp/runtime/src/kmp_threadprivate.cpp
index 270c90abfde2..b79ac7d6d2b2 100644
--- a/openmp/runtime/src/kmp_threadprivate.cpp
+++ b/openmp/runtime/src/kmp_threadprivate.cpp
@@ -169,7 +169,7 @@ void __kmp_common_destroy(void) {
struct shared_common *d_tn;
/* C++ destructors need to be called once per thread before exiting.
- Don't call destructors for master thread though unless we used copy
+ Don't call destructors for primary thread though unless we used copy
constructor */
for (d_tn = __kmp_threadprivate_d_table.data[q]; d_tn;
@@ -410,7 +410,7 @@ struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
}
__kmp_release_lock(&__kmp_global_lock, gtid);
-/* +++++++++ END OF CRITICAL SECTION +++++++++ */
+ /* +++++++++ END OF CRITICAL SECTION +++++++++ */
#ifdef USE_CHECKS_COMMON
if (pc_size > d_tn->cmn_size) {
@@ -451,15 +451,16 @@ struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
return tn;
/* if C++ object with copy constructor, use it;
- * else if C++ object with constructor, use it for the non-master copies only;
+ * else if C++ object with constructor, use it for the non-primary thread
+ copies only;
* else use pod_init and memcpy
*
- * C++ constructors need to be called once for each non-master thread on
+ * C++ constructors need to be called once for each non-primary thread on
* allocate
* C++ copy constructors need to be called once for each thread on allocate */
/* C++ object with constructors/destructors; don't call constructors for
- master thread though */
+ primary thread though */
if (d_tn->is_vec) {
if (d_tn->ct.ctorv != 0) {
(void)(*d_tn->ct.ctorv)(tn->par_addr, d_tn->vec_len);
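
The reworded comment states the construction rules the runtime follows for C++ threadprivate data: the plain constructor runs once per non-primary thread on allocation, the copy constructor once per thread when copies must start from an existing instance, and the primary thread's original is never re-constructed. From the user's side this surfaces with threadprivate and copyin; a small example, noting that whether propagation uses the copy constructor or copy assignment is an implementation detail the comment above resolves for this runtime:

    #include <omp.h>
    #include <cstdio>

    struct Tool {
      int id;
      Tool() : id(0) {}                 // plain ctor: non-primary copies
      Tool(const Tool &o) : id(o.id) {} // copy ctor: copies seeded from an
                                        // existing instance
    };

    static Tool t;
    #pragma omp threadprivate(t)

    int main() {
      t.id = 42;                     // primary thread's instance
    #pragma omp parallel copyin(t)   // propagate primary's value at entry
      std::printf("T%d: id=%d\n", omp_get_thread_num(), t.id);
      return 0;
    }
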
diff --git a/openmp/runtime/src/kmp_utility.cpp b/openmp/runtime/src/kmp_utility.cpp
index f163f0551aa5..c4bfead9d0d6 100644
--- a/openmp/runtime/src/kmp_utility.cpp
+++ b/openmp/runtime/src/kmp_utility.cpp
@@ -92,7 +92,7 @@ int __kmp_get_logical_id(int log_per_phy, int apic_id) {
static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
char const *frequency // I: Float number and unit: MHz, GHz, or TGz.
- ) {
+) {
double value = 0.0;
char *unit = NULL;
@@ -230,16 +230,6 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
log_per_phy = data[2];
p->apic_id = data[3]; /* Bits 31-24: Processor Initial APIC ID (X) */
KA_TRACE(trace_level, (" HT(%d TPUs)", log_per_phy));
-
- if (log_per_phy > 1) {
-/* default to 1k FOR JT-enabled processors (4k on OS X*) */
-#if KMP_OS_DARWIN
- p->cpu_stackoffset = 4 * 1024;
-#else
- p->cpu_stackoffset = 1 * 1024;
-#endif
- }
-
p->physical_id = __kmp_get_physical_id(log_per_phy, p->apic_id);
p->logical_id = __kmp_get_logical_id(log_per_phy, p->apic_id);
}
diff --git a/openmp/runtime/src/kmp_version.cpp b/openmp/runtime/src/kmp_version.cpp
index 7464d1972684..db2454c0f4de 100644
--- a/openmp/runtime/src/kmp_version.cpp
+++ b/openmp/runtime/src/kmp_version.cpp
@@ -51,6 +51,8 @@
#define KMP_COMPILER "Intel(R) C++ Compiler 18.0"
#elif __INTEL_COMPILER == 1900
#define KMP_COMPILER "Intel(R) C++ Compiler 19.0"
+#elif __INTEL_COMPILER == 1910
+#define KMP_COMPILER "Intel(R) C++ Compiler 19.1"
#elif __INTEL_COMPILER >= 9900
#define KMP_COMPILER "Intel(R) C++ Compiler mainline"
#endif
@@ -178,7 +180,7 @@ void __kmp_print_version_1(void) {
#else
"no"
#endif
- );
+ );
__kmp_printf("%s", buffer.str);
__kmp_str_buf_free(&buffer);
K_DIAG(1, ("KMP_VERSION is true\n"));
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index bb7a3e0d483d..d528ce9f1801 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -179,7 +179,6 @@ __kmp_wait_template(kmp_info_t *this_thr,
kmp_uint32 spins;
int th_gtid;
int tasks_completed = FALSE;
- int oversubscribed;
#if !KMP_USE_MONITOR
kmp_uint64 poll_count;
kmp_uint64 hibernate_goal;
@@ -321,10 +320,10 @@ final_spin=FALSE)
} else
hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
poll_count = 0;
+ (void)poll_count;
#endif // KMP_USE_MONITOR
}
- oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
KMP_MB();
// Main wait spin loop
@@ -983,7 +982,7 @@ public:
else if (flag_switch) {
this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go,
- (kmp_uint64)KMP_BARRIER_STATE_BUMP);
+ (kmp_uint64)KMP_BARRIER_STATE_BUMP);
__kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
}
return false;
@@ -1026,9 +1025,18 @@ public:
int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
kmp_int32 is_constrained) {
+#if OMPD_SUPPORT
+ int ret = __kmp_execute_tasks_oncore(
+ this_thr, gtid, this, final_spin,
+ thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+ if (ompd_state & OMPD_ENABLE_BP)
+ ompd_bp_task_end();
+ return ret;
+#else
return __kmp_execute_tasks_oncore(
this_thr, gtid, this, final_spin,
thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+#endif
}
kmp_uint8 *get_stolen() { return NULL; }
enum barrier_type get_bt() { return bt; }
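
Here, as in __kmp_invoke_task earlier, the new OMPD hooks cost one load-and-test on the global ompd_state bitmask when no debugger is attached; only with OMP_DEBUG=enabled does OMPD_ENABLE_BP route execution through the empty anchor functions (defined below in ompd-specific.cpp) that a debugger can break on. The pattern in isolation:

    #include <cstdint>

    constexpr uint64_t OMPD_ENABLE_BP = 0x1;
    uint64_t ompd_state = 0; // set at init when OMP_DEBUG=enabled

    // Empty, non-inlinable anchor; a debugger plants a breakpoint here.
    __attribute__((noinline)) void ompd_bp_task_end(void) { asm(""); }

    void task_end_hook() {
      if (ompd_state & OMPD_ENABLE_BP) // cheap test when debugging is off
        ompd_bp_task_end();
    }
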
diff --git a/openmp/runtime/src/ompd-specific.cpp b/openmp/runtime/src/ompd-specific.cpp
new file mode 100644
index 000000000000..c4018789eb5b
--- /dev/null
+++ b/openmp/runtime/src/ompd-specific.cpp
@@ -0,0 +1,154 @@
+/*
+ * ompd-specific.cpp -- OpenMP debug support
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ompd-specific.h"
+
+#if OMPD_SUPPORT
+
+/**
+ * Declaration of symbols to hold struct size and member offset information
+ */
+
+#define ompd_declare_access(t, m) uint64_t ompd_access__##t##__##m;
+OMPD_FOREACH_ACCESS(ompd_declare_access)
+#undef ompd_declare_access
+
+#define ompd_declare_sizeof_member(t, m) uint64_t ompd_sizeof__##t##__##m;
+OMPD_FOREACH_ACCESS(ompd_declare_sizeof_member)
+#undef ompd_declare_sizeof_member
+
+#define ompd_declare_bitfield(t, m) uint64_t ompd_bitfield__##t##__##m;
+OMPD_FOREACH_BITFIELD(ompd_declare_bitfield)
+#undef ompd_declare_bitfield
+
+#define ompd_declare_sizeof(t) uint64_t ompd_sizeof__##t;
+OMPD_FOREACH_SIZEOF(ompd_declare_sizeof)
+#undef ompd_declare_sizeof
+
+volatile const char **ompd_dll_locations = NULL;
+uint64_t ompd_state = 0;
+
+char *ompd_env_block = NULL;
+ompd_size_t ompd_env_block_size = 0;
+
+void ompd_init() {
+
+ static int ompd_initialized = 0;
+
+ if (ompd_initialized)
+ return;
+
+ /**
+ * Calculate member offsets for structs and unions
+ */
+
+#define ompd_init_access(t, m) \
+ ompd_access__##t##__##m = (uint64_t) & (((t *)0)->m);
+ OMPD_FOREACH_ACCESS(ompd_init_access)
+#undef ompd_init_access
+
+ /**
+ * Create bit mask for bitfield access
+ */
+
+#define ompd_init_bitfield(t, m) \
+ ompd_bitfield__##t##__##m = 0; \
+ ((t *)(&ompd_bitfield__##t##__##m))->m = 1;
+ OMPD_FOREACH_BITFIELD(ompd_init_bitfield)
+#undef ompd_init_bitfield
+
+ /**
+ * Calculate type size information
+ */
+
+#define ompd_init_sizeof_member(t, m) \
+ ompd_sizeof__##t##__##m = sizeof(((t *)0)->m);
+ OMPD_FOREACH_ACCESS(ompd_init_sizeof_member)
+#undef ompd_init_sizeof_member
+
+#define ompd_init_sizeof(t) ompd_sizeof__##t = sizeof(t);
+ OMPD_FOREACH_SIZEOF(ompd_init_sizeof)
+#undef ompd_init_sizeof
+
+ char *libname = NULL;
+
+#if KMP_OS_UNIX
+ // Find the location of libomp.so through dladdr, then replace "libomp" with
+ // "libompd" to get the full path of libompd
+ Dl_info dl_info;
+ int ret = dladdr((void *)ompd_init, &dl_info);
+ if (!ret) {
+ fprintf(stderr, "%s\n", dlerror());
+ }
+ int lib_path_length;
+ if (strrchr(dl_info.dli_fname, '/')) {
+ lib_path_length = strrchr(dl_info.dli_fname, '/') - dl_info.dli_fname;
+ libname =
+ (char *)malloc(lib_path_length + 12 /*for '/libompd.so' and '\0'*/);
+ strncpy(libname, dl_info.dli_fname, lib_path_length);
+ memcpy(libname + lib_path_length, "/libompd.so\0", 12);
+ }
+#endif
+
+ const char *ompd_env_var = getenv("OMP_DEBUG");
+ if (ompd_env_var && !strcmp(ompd_env_var, "enabled")) {
+ fprintf(stderr, "OMP_OMPD active\n");
+ ompt_enabled.enabled = 1;
+ ompd_state |= OMPD_ENABLE_BP;
+ }
+
+ ompd_initialized = 1;
+ ompd_dll_locations = (volatile const char **)malloc(3 * sizeof(const char *));
+ ompd_dll_locations[0] = "libompd.so";
+ ompd_dll_locations[1] = libname;
+ ompd_dll_locations[2] = NULL;
+ ompd_dll_locations_valid();
+}
+
+void __attribute__((noinline)) ompd_dll_locations_valid(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+
+void ompd_bp_parallel_begin(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+void ompd_bp_parallel_end(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+void ompd_bp_task_begin(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+void ompd_bp_task_end(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+void ompd_bp_thread_begin(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+void ompd_bp_thread_end(void) {
+ /* naive way of implementing an empty function that is hard to optimize out;
+ we might want to use a separate object file? */
+ asm("");
+}
+
+#endif /* OMPD_SUPPORT */
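
ompd_init publishes the runtime's ABI as plain global variables: member offsets via (uint64_t)&(((t *)0)->m) (the classic offsetof expansion), member and type sizes via sizeof, and bitfield positions by setting exactly one member to 1 inside a zeroed word and reading the raw bits back, so the out-of-process libompd can decode structures whose headers it cannot include. The two tricks in isolation (struct names here are illustrative):

    #include <cstdint>
    #include <cstdio>

    struct flags { // illustrative stand-in for kmp_tasking_flags_t
      unsigned tiedness : 1;
      unsigned final : 1;
      unsigned complete : 1;
    };

    int main() {
      // Mask for `final`: set only that member in a zeroed word and read
      // the raw storage back; the 1-bits mark the field's position.
      uint64_t mask = 0;
      ((flags *)&mask)->final = 1; // mirrors ompd_init_bitfield
      std::printf("bitfield mask for final: 0x%llx\n",
                  (unsigned long long)mask);

      // Offset of an ordinary member, computed the way ompd_init_access
      // does; offsetof(S, b) is the portable spelling.
      struct S { int a; long b; };
      uint64_t off_b = (uint64_t) & (((S *)0)->b);
      std::printf("offset of S::b = %llu\n", (unsigned long long)off_b);
      return 0;
    }
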
diff --git a/openmp/runtime/src/ompd-specific.h b/openmp/runtime/src/ompd-specific.h
new file mode 100644
index 000000000000..21809ef52f53
--- /dev/null
+++ b/openmp/runtime/src/ompd-specific.h
@@ -0,0 +1,154 @@
+/*
+ * ompd-specific.h -- OpenMP debug support
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "omp-tools.h"
+#include <stdint.h>
+
+#ifndef __OMPD_SPECIFIC_H__
+#define __OMPD_SPECIFIC_H__
+
+#if OMPD_SUPPORT
+
+void ompd_init();
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern char *ompd_env_block;
+extern ompd_size_t ompd_env_block_size;
+extern char *__kmp_tool_verbose_init;
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+extern uint64_t ompd_state;
+#define OMPD_ENABLE_BP 0x1
+
+#define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \
+ OMPD_ACCESS(kmp_base_info_t, th_current_task) \
+ OMPD_ACCESS(kmp_base_info_t, th_team) \
+ OMPD_ACCESS(kmp_base_info_t, th_info) \
+ OMPD_ACCESS(kmp_base_info_t, ompt_thread_info) \
+ \
+ OMPD_ACCESS(kmp_base_root_t, r_in_parallel) \
+ \
+ OMPD_ACCESS(kmp_base_team_t, ompt_team_info) \
+ OMPD_ACCESS(kmp_base_team_t, ompt_serialized_team_info) \
+ OMPD_ACCESS(kmp_base_team_t, t_active_level) \
+ OMPD_ACCESS(kmp_base_team_t, t_implicit_task_taskdata) \
+ OMPD_ACCESS(kmp_base_team_t, t_master_tid) \
+ OMPD_ACCESS(kmp_base_team_t, t_nproc) \
+ OMPD_ACCESS(kmp_base_team_t, t_level) \
+ OMPD_ACCESS(kmp_base_team_t, t_parent) \
+ OMPD_ACCESS(kmp_base_team_t, t_pkfn) \
+ OMPD_ACCESS(kmp_base_team_t, t_threads) \
+ \
+ OMPD_ACCESS(kmp_desc_t, ds) \
+ \
+ OMPD_ACCESS(kmp_desc_base_t, ds_thread) \
+ OMPD_ACCESS(kmp_desc_base_t, ds_tid) \
+ \
+ OMPD_ACCESS(kmp_info_t, th) \
+ \
+ OMPD_ACCESS(kmp_r_sched_t, r_sched_type) \
+ OMPD_ACCESS(kmp_r_sched_t, chunk) \
+ \
+ OMPD_ACCESS(kmp_root_t, r) \
+ \
+ OMPD_ACCESS(kmp_internal_control_t, dynamic) \
+ OMPD_ACCESS(kmp_internal_control_t, max_active_levels) \
+ OMPD_ACCESS(kmp_internal_control_t, nproc) \
+ OMPD_ACCESS(kmp_internal_control_t, proc_bind) \
+ OMPD_ACCESS(kmp_internal_control_t, sched) \
+ OMPD_ACCESS(kmp_internal_control_t, default_device) \
+ OMPD_ACCESS(kmp_internal_control_t, thread_limit) \
+ \
+ OMPD_ACCESS(kmp_taskdata_t, ompt_task_info) \
+ OMPD_ACCESS(kmp_taskdata_t, td_flags) \
+ OMPD_ACCESS(kmp_taskdata_t, td_icvs) \
+ OMPD_ACCESS(kmp_taskdata_t, td_parent) \
+ OMPD_ACCESS(kmp_taskdata_t, td_team) \
+ \
+ OMPD_ACCESS(kmp_task_t, routine) \
+ \
+ OMPD_ACCESS(kmp_team_p, t) \
+ \
+ OMPD_ACCESS(kmp_nested_nthreads_t, used) \
+ OMPD_ACCESS(kmp_nested_nthreads_t, nth) \
+ \
+ OMPD_ACCESS(kmp_nested_proc_bind_t, used) \
+ OMPD_ACCESS(kmp_nested_proc_bind_t, bind_types) \
+ \
+ OMPD_ACCESS(ompt_task_info_t, frame) \
+ OMPD_ACCESS(ompt_task_info_t, scheduling_parent) \
+ OMPD_ACCESS(ompt_task_info_t, task_data) \
+ \
+ OMPD_ACCESS(ompt_team_info_t, parallel_data) \
+ \
+ OMPD_ACCESS(ompt_thread_info_t, state) \
+ OMPD_ACCESS(ompt_thread_info_t, wait_id) \
+ OMPD_ACCESS(ompt_thread_info_t, thread_data) \
+ \
+ OMPD_ACCESS(ompt_data_t, value) \
+ OMPD_ACCESS(ompt_data_t, ptr) \
+ \
+ OMPD_ACCESS(ompt_frame_t, exit_frame) \
+ OMPD_ACCESS(ompt_frame_t, enter_frame) \
+ \
+ OMPD_ACCESS(ompt_lw_taskteam_t, parent) \
+ OMPD_ACCESS(ompt_lw_taskteam_t, ompt_team_info) \
+ OMPD_ACCESS(ompt_lw_taskteam_t, ompt_task_info)
+
+#define OMPD_FOREACH_BITFIELD(OMPD_BITFIELD) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, final) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, tiedness) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, tasktype) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, task_serial) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, tasking_ser) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, team_serial) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, started) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, executing) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, complete) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, freed) \
+ OMPD_BITFIELD(kmp_tasking_flags_t, native)
+
+#define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \
+ OMPD_SIZEOF(kmp_info_t) \
+ OMPD_SIZEOF(kmp_taskdata_t) \
+ OMPD_SIZEOF(kmp_task_t) \
+ OMPD_SIZEOF(kmp_tasking_flags_t) \
+ OMPD_SIZEOF(kmp_thread_t) \
+ OMPD_SIZEOF(ompt_data_t) \
+ OMPD_SIZEOF(ompt_id_t) \
+ OMPD_SIZEOF(__kmp_avail_proc) \
+ OMPD_SIZEOF(__kmp_max_nth) \
+ OMPD_SIZEOF(__kmp_stksize) \
+ OMPD_SIZEOF(__kmp_omp_cancellation) \
+ OMPD_SIZEOF(__kmp_max_task_priority) \
+ OMPD_SIZEOF(__kmp_display_affinity) \
+ OMPD_SIZEOF(__kmp_affinity_format) \
+ OMPD_SIZEOF(__kmp_tool_libraries) \
+ OMPD_SIZEOF(__kmp_tool_verbose_init) \
+ OMPD_SIZEOF(__kmp_tool) \
+ OMPD_SIZEOF(ompd_state) \
+ OMPD_SIZEOF(kmp_nested_nthreads_t) \
+ OMPD_SIZEOF(__kmp_nested_nth) \
+ OMPD_SIZEOF(kmp_nested_proc_bind_t) \
+ OMPD_SIZEOF(__kmp_nested_proc_bind) \
+ OMPD_SIZEOF(int) \
+ OMPD_SIZEOF(char) \
+ OMPD_SIZEOF(__kmp_gtid) \
+ OMPD_SIZEOF(__kmp_nth)
+
+#endif /* OMPD_SUPPORT */
+#endif
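
The header drives everything off X-macros: each OMPD_FOREACH_* takes a macro name and expands it once per entry, so the declarations above and the definitions and initializations in ompd-specific.cpp stay in lockstep from a single list. The idiom reduced to a minimal sketch:

    #include <cstdio>

    // Single source of truth: one entry per exported quantity.
    #define FOREACH_FIELD(OP) \
      OP(alpha)               \
      OP(beta)

    // Expansion 1: declare a variable per entry.
    #define DECLARE(name) int exported_##name;
    FOREACH_FIELD(DECLARE)
    #undef DECLARE

    // Expansion 2: initialize each one, reusing the same list.
    void init_exports() {
    #define INIT(name) exported_##name = (int)sizeof(#name);
      FOREACH_FIELD(INIT)
    #undef INIT
    }

    int main() {
      init_exports();
      std::printf("%d %d\n", exported_alpha, exported_beta);
      return 0;
    }
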
diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h
index 9b780f5b221d..875d6921b7b7 100644
--- a/openmp/runtime/src/ompt-event-specific.h
+++ b/openmp/runtime/src/ompt-event-specific.h
@@ -79,8 +79,7 @@
#define ompt_callback_mutex_released_implemented ompt_event_MAY_ALWAYS_OPTIONAL
-#define ompt_callback_dependences_implemented \
- ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_dependences_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#define ompt_callback_task_dependence_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#define ompt_callback_work_implemented ompt_event_MAY_ALWAYS_OPTIONAL
@@ -107,6 +106,6 @@
#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_error_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_error_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#endif
diff --git a/openmp/runtime/src/ompt-general.cpp b/openmp/runtime/src/ompt-general.cpp
index c52a3f28c2e7..3d8ef041f724 100644
--- a/openmp/runtime/src/ompt-general.cpp
+++ b/openmp/runtime/src/ompt-general.cpp
@@ -102,6 +102,14 @@ ompt_callbacks_internal_t ompt_callbacks;
static ompt_start_tool_result_t *ompt_start_tool_result = NULL;
+#if KMP_OS_WINDOWS
+static HMODULE ompt_tool_module = NULL;
+#define OMPT_DLCLOSE(Lib) FreeLibrary(Lib)
+#else
+static void *ompt_tool_module = NULL;
+#define OMPT_DLCLOSE(Lib) dlclose(Lib)
+#endif
+
/*****************************************************************************
* forward declarations
****************************************************************************/
@@ -258,7 +266,7 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
#error Activation of OMPT is not supported on this platform.
#endif
if (ret) {
- OMPT_VERBOSE_INIT_CONTINUED_PRINT("Sucess.\n");
+ OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n");
OMPT_VERBOSE_INIT_PRINT(
"Tool was started and is using the OMPT interface.\n");
OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n");
@@ -302,24 +310,26 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
fname);
start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool");
if (!start_tool) {
- OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %s\n",
+ OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n",
GetLastError());
} else
#else
#error Activation of OMPT is not supported on this platform.
#endif
- {// if (start_tool)
+ { // if (start_tool)
ret = (*start_tool)(omp_version, runtime_version);
if (ret) {
OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n");
OMPT_VERBOSE_INIT_PRINT(
"Tool was started and is using the OMPT interface.\n");
+ ompt_tool_module = h;
break;
}
OMPT_VERBOSE_INIT_CONTINUED_PRINT(
"Found but not using the OMPT interface.\n");
OMPT_VERBOSE_INIT_PRINT("Continuing search...\n");
}
+ OMPT_DLCLOSE(h);
}
fname = __kmp_str_token(NULL, sep, &buf);
}
@@ -428,9 +438,10 @@ void ompt_pre_init() {
break;
case omp_tool_error:
- fprintf(stderr, "Warning: OMP_TOOL has invalid value \"%s\".\n"
- " legal values are (NULL,\"\",\"disabled\","
- "\"enabled\").\n",
+ fprintf(stderr,
+ "Warning: OMP_TOOL has invalid value \"%s\".\n"
+ " legal values are (NULL,\"\",\"disabled\","
+ "\"enabled\").\n",
ompt_env_var);
break;
}
@@ -459,7 +470,8 @@ void ompt_post_init() {
//--------------------------------------------------
if (ompt_start_tool_result) {
ompt_enabled.enabled = !!ompt_start_tool_result->initialize(
- ompt_fn_lookup, omp_get_initial_device(), &(ompt_start_tool_result->tool_data));
+ ompt_fn_lookup, omp_get_initial_device(),
+ &(ompt_start_tool_result->tool_data));
if (!ompt_enabled.enabled) {
// tool not enabled, zero out the bitmap, and done
@@ -477,7 +489,8 @@ void ompt_post_init() {
}
ompt_data_t *task_data;
ompt_data_t *parallel_data;
- __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+ __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
+ NULL);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
@@ -488,10 +501,16 @@ void ompt_post_init() {
}
void ompt_fini() {
- if (ompt_enabled.enabled) {
+ if (ompt_enabled.enabled
+#if OMPD_SUPPORT
+ && ompt_start_tool_result && ompt_start_tool_result->finalize
+#endif
+ ) {
ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data));
}
+ if (ompt_tool_module)
+ OMPT_DLCLOSE(ompt_tool_module);
memset(&ompt_enabled, 0, sizeof(ompt_enabled));
}
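When OMPD support is compiled in, ompt_fini now also checks that a tool result exists and actually provides a finalizer before calling through it, and the module handle saved during startup is then released. The guard reduces to this shape (illustrative types, not the runtime's own):

  typedef struct {
    void (*finalize)(void *tool_data);
    void *tool_data;
  } tool_result_t;

  static void tool_shutdown(int enabled, tool_result_t *res) {
    /* Call finalize only when every link in the chain is non-NULL. */
    if (enabled && res && res->finalize)
      res->finalize(&res->tool_data);
  }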
@@ -540,7 +559,7 @@ OMPT_API_ROUTINE int ompt_enumerate_mutex_impls(int current_impl,
****************************************************************************/
OMPT_API_ROUTINE ompt_set_result_t ompt_set_callback(ompt_callbacks_t which,
- ompt_callback_t callback) {
+ ompt_callback_t callback) {
switch (which) {
#define ompt_event_macro(event_name, callback_type, event_id) \
@@ -672,6 +691,8 @@ OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size,
#else
int i, count;
int tmp_ids[ids_size];
+ for (int j = 0; j < ids_size; j++)
+ tmp_ids[j] = 0;
if (!KMP_AFFINITY_CAPABLE())
return 0;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
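The added loop in ompt_get_place_proc_ids zeroes tmp_ids element by element because a variable-length array cannot take a brace initializer (C23 relaxes this with the empty initializer). A tiny equivalent in plain C, with memset shown as the common alternative:

  #include <string.h>

  void zero_vla(int n) {
    int tmp[n];                  /* "int tmp[n] = {0};" is ill-formed */
    memset(tmp, 0, sizeof(tmp)); /* or the explicit loop used above */
  }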
@@ -782,7 +803,7 @@ OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; }
*/
/*****************************************************************************
-* application-facing API
+ * application-facing API
****************************************************************************/
/*----------------------------------------------------------------------------
diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h
index f753ab4ebc6d..6665bb5e83eb 100644
--- a/openmp/runtime/src/ompt-internal.h
+++ b/openmp/runtime/src/ompt-internal.h
@@ -119,7 +119,7 @@ extern ompt_callbacks_active_t ompt_enabled;
#endif
#ifdef __cplusplus
-};
+}
#endif
#endif
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index c74426c3012c..1ad0e17ed408 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -292,10 +292,20 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
thr->th.th_team->t.ompt_serialized_team_info;
link_lwt->parent = my_parent;
thr->th.th_team->t.ompt_serialized_team_info = link_lwt;
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP) {
+ ompd_bp_parallel_begin();
+ }
+#endif
} else {
// this is the first serialized team, so we just store the values in the
// team and drop the taskteam-object
*OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info;
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP) {
+ ompd_bp_parallel_begin();
+ }
+#endif
*OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info;
}
}
@@ -303,6 +313,11 @@ void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
if (lwtask) {
+#if OMPD_SUPPORT
+ if (ompd_state & OMPD_ENABLE_BP) {
+ ompd_bp_parallel_end();
+ }
+#endif
thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent;
ompt_team_info_t tmp_team = lwtask->ompt_team_info;
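The ompd_bp_parallel_begin/end calls inserted above act as breakpoint anchors: the runtime invokes them at serialized-region boundaries so an OMPD-aware debugger can plant breakpoints on their symbols. A hedged standalone sketch of the idiom (names illustrative; noinline is a GCC/Clang attribute):

  /* Deliberately empty; kept out of line so a debugger can set
     breakpoints on these symbols to observe region transitions. */
  __attribute__((noinline)) void bp_region_begin(void) {}
  __attribute__((noinline)) void bp_region_end(void) {}

  void enter_region(int bp_enabled) {
    if (bp_enabled) /* mirrors the (ompd_state & OMPD_ENABLE_BP) test */
      bp_region_begin();
    /* ... region body ... */
    if (bp_enabled)
      bp_region_end();
  }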
diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h
index 18816e7337c1..2fc7ee1c35bc 100644
--- a/openmp/runtime/src/ompt-specific.h
+++ b/openmp/runtime/src/ompt-specific.h
@@ -23,8 +23,8 @@
void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid);
void __ompt_thread_assign_wait_id(void *variable);
-void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
- int gtid, ompt_data_t *ompt_pid, void *codeptr);
+void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid,
+ ompt_data_t *ompt_pid, void *codeptr);
void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
int on_heap, bool always = false);
@@ -85,9 +85,9 @@ inline void *__ompt_load_return_address(int gtid) {
#define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid)
#define OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid) \
((ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \
- __kmp_threads[gtid]->th.ompt_thread_info.return_address)? \
- __ompt_load_return_address(gtid): \
- __builtin_return_address(0))
+ __kmp_threads[gtid]->th.ompt_thread_info.return_address) \
+ ? __ompt_load_return_address(gtid) \
+ : __builtin_return_address(0))
//******************************************************************************
// inline functions
@@ -103,7 +103,8 @@ inline kmp_info_t *ompt_get_thread() {
}
inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) {
- thread->th.ompt_thread_info.state = state;
+ if (thread)
+ thread->th.ompt_thread_info.state = state;
}
inline const char *ompt_get_runtime_version() {
diff --git a/openmp/runtime/src/test-touch.c b/openmp/runtime/src/test-touch.c
index 71e05e76683d..62e81fe0ed49 100644
--- a/openmp/runtime/src/test-touch.c
+++ b/openmp/runtime/src/test-touch.c
@@ -1,6 +1,5 @@
// test-touch.c //
-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -9,22 +8,21 @@
//
//===----------------------------------------------------------------------===//
-
#ifdef __cplusplus
extern "C" {
#endif
extern double omp_get_wtime();
-extern int omp_get_num_threads();
-extern int omp_get_max_threads();
+extern int omp_get_num_threads();
+extern int omp_get_max_threads();
#ifdef __cplusplus
}
#endif
int main() {
- omp_get_wtime();
- omp_get_num_threads();
- omp_get_max_threads();
- return 0;
+ omp_get_wtime();
+ omp_get_num_threads();
+ omp_get_max_threads();
+ return 0;
}
// end of file //
diff --git a/openmp/runtime/src/thirdparty/ittnotify/disable_warnings.h b/openmp/runtime/src/thirdparty/ittnotify/disable_warnings.h
index 6b06035b41fb..e331ffe72950 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/disable_warnings.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/disable_warnings.h
@@ -1,4 +1,4 @@
-
+// clang-format off
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -7,23 +7,24 @@
//
//===----------------------------------------------------------------------===//
-
#include "ittnotify_config.h"
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
-#pragma warning (disable: 593) /* parameter "XXXX" was set but never used */
-#pragma warning (disable: 344) /* typedef name has already been declared (with same type) */
-#pragma warning (disable: 174) /* expression has no effect */
-#pragma warning (disable: 4127) /* conditional expression is constant */
-#pragma warning (disable: 4306) /* conversion from '?' to '?' of greater size */
+#pragma warning(disable: 593) /* parameter "XXXX" was set but never used */
+#pragma warning(disable: 344) /* typedef name has already been declared (with
+ same type) */
+#pragma warning(disable: 174) /* expression has no effect */
+#pragma warning(disable: 4127) /* conditional expression is constant */
+#pragma warning(disable: 4306) /* conversion from '?' to '?' of greater size */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if defined __INTEL_COMPILER
-#pragma warning (disable: 869) /* parameter "XXXXX" was never referenced */
-#pragma warning (disable: 1418) /* external function definition with no prior declaration */
-#pragma warning (disable: 1419) /* external declaration in primary source file */
+#pragma warning(disable: 869) /* parameter "XXXXX" was never referenced */
+#pragma warning(disable: 1418) /* external function definition with no prior
+ declaration */
+#pragma warning(disable: 1419) /* external declaration in primary source file */
#endif /* __INTEL_COMPILER */
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h
index db1c0d0d9d21..e1eee8cde613 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h
@@ -78,59 +78,59 @@ The same ID may not be reused for different instances, unless a previous
/** @cond exclude_from_documentation */
#ifndef ITT_OS_WIN
-# define ITT_OS_WIN 1
+#define ITT_OS_WIN 1
#endif /* ITT_OS_WIN */
#ifndef ITT_OS_LINUX
-# define ITT_OS_LINUX 2
+#define ITT_OS_LINUX 2
#endif /* ITT_OS_LINUX */
#ifndef ITT_OS_MAC
-# define ITT_OS_MAC 3
+#define ITT_OS_MAC 3
#endif /* ITT_OS_MAC */
#ifndef ITT_OS_FREEBSD
-# define ITT_OS_FREEBSD 4
+#define ITT_OS_FREEBSD 4
#endif /* ITT_OS_FREEBSD */
#ifndef ITT_OS
-# if defined WIN32 || defined _WIN32
-# define ITT_OS ITT_OS_WIN
-# elif defined( __APPLE__ ) && defined( __MACH__ )
-# define ITT_OS ITT_OS_MAC
-# elif defined( __FreeBSD__ )
-# define ITT_OS ITT_OS_FREEBSD
-# else
-# define ITT_OS ITT_OS_LINUX
-# endif
+#if defined WIN32 || defined _WIN32
+#define ITT_OS ITT_OS_WIN
+#elif defined(__APPLE__) && defined(__MACH__)
+#define ITT_OS ITT_OS_MAC
+#elif defined(__FreeBSD__)
+#define ITT_OS ITT_OS_FREEBSD
+#else
+#define ITT_OS ITT_OS_LINUX
+#endif
#endif /* ITT_OS */
#ifndef ITT_PLATFORM_WIN
-# define ITT_PLATFORM_WIN 1
+#define ITT_PLATFORM_WIN 1
#endif /* ITT_PLATFORM_WIN */
#ifndef ITT_PLATFORM_POSIX
-# define ITT_PLATFORM_POSIX 2
+#define ITT_PLATFORM_POSIX 2
#endif /* ITT_PLATFORM_POSIX */
#ifndef ITT_PLATFORM_MAC
-# define ITT_PLATFORM_MAC 3
+#define ITT_PLATFORM_MAC 3
#endif /* ITT_PLATFORM_MAC */
#ifndef ITT_PLATFORM_FREEBSD
-# define ITT_PLATFORM_FREEBSD 4
+#define ITT_PLATFORM_FREEBSD 4
#endif /* ITT_PLATFORM_FREEBSD */
#ifndef ITT_PLATFORM
-# if ITT_OS==ITT_OS_WIN
-# define ITT_PLATFORM ITT_PLATFORM_WIN
-# elif ITT_OS==ITT_OS_MAC
-# define ITT_PLATFORM ITT_PLATFORM_MAC
-# elif ITT_OS==ITT_OS_FREEBSD
-# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
-# else
-# define ITT_PLATFORM ITT_PLATFORM_POSIX
-# endif
+#if ITT_OS == ITT_OS_WIN
+#define ITT_PLATFORM ITT_PLATFORM_WIN
+#elif ITT_OS == ITT_OS_MAC
+#define ITT_PLATFORM ITT_PLATFORM_MAC
+#elif ITT_OS == ITT_OS_FREEBSD
+#define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#else
+#define ITT_PLATFORM ITT_PLATFORM_POSIX
+#endif
#endif /* ITT_PLATFORM */
#if defined(_UNICODE) && !defined(UNICODE)
@@ -138,9 +138,9 @@ The same ID may not be reused for different instances, unless a previous
#endif
#include <stddef.h>
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#include <tchar.h>
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include <stdint.h>
#if defined(UNICODE) || defined(_UNICODE)
#include <wchar.h>
@@ -148,69 +148,71 @@ The same ID may not be reused for different instances, unless a previous
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#ifndef ITTAPI_CDECL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define ITTAPI_CDECL __cdecl
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define ITTAPI_CDECL __attribute__ ((cdecl))
-# else /* _M_IX86 || __i386__ */
-# define ITTAPI_CDECL /* actual only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define ITTAPI_CDECL __cdecl
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define ITTAPI_CDECL __attribute__((cdecl))
+#else /* _M_IX86 || __i386__ */
+#define ITTAPI_CDECL /* actual only on x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* ITTAPI_CDECL */
#ifndef STDCALL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define STDCALL __stdcall
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define STDCALL __attribute__ ((stdcall))
-# else /* _M_IX86 || __i386__ */
-# define STDCALL /* supported only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define STDCALL __stdcall
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define STDCALL __attribute__((stdcall))
+#else /* _M_IX86 || __i386__ */
+#define STDCALL /* supported only on x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* STDCALL */
-#define ITTAPI ITTAPI_CDECL
+#define ITTAPI ITTAPI_CDECL
#define LIBITTAPI ITTAPI_CDECL
/* TODO: Temporary for compatibility! */
-#define ITTAPI_CALL ITTAPI_CDECL
+#define ITTAPI_CALL ITTAPI_CDECL
#define LIBITTAPI_CALL ITTAPI_CDECL
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
/* use __forceinline (VC++ specific) */
-#define ITT_INLINE __forceinline
+#define ITT_INLINE __forceinline
#define ITT_INLINE_ATTRIBUTE /* nothing */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/*
* Generally, functions are not inlined unless optimization is specified.
* For functions declared inline, this attribute inlines the function even
* if no optimization level was specified.
*/
#ifdef __STRICT_ANSI__
-#define ITT_INLINE static
+#define ITT_INLINE static
#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
-#else /* __STRICT_ANSI__ */
-#define ITT_INLINE static inline
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
#endif /* __STRICT_ANSI__ */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @endcond */
#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro")
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro"
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# include "legacy/ittnotify.h"
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#pragma message( \
+ "WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#warning \
+ "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro"
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include "legacy/ittnotify.h"
#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */
/** @cond exclude_from_documentation */
/* Helper macro for joining tokens */
-#define ITT_JOIN_AUX(p,n) p##n
-#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n)
+#define ITT_JOIN_AUX(p, n) p##n
+#define ITT_JOIN(p, n) ITT_JOIN_AUX(p, n)
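ITT_JOIN is the standard two-level token-pasting indirection: routing through ITT_JOIN_AUX forces macro arguments to expand before ## glues them. A minimal demonstration:

  #define MAJOR 3
  #define PASTE_DIRECT(p, n) p##n
  #define PASTE(p, n) PASTE_DIRECT(p, n)
  /* PASTE_DIRECT(v_, MAJOR) yields the token v_MAJOR (unexpanded),    */
  /* while PASTE(v_, MAJOR) yields v_3, which is what versioning needs. */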
#ifdef ITT_MAJOR
#undef ITT_MAJOR
@@ -218,43 +220,75 @@ The same ID may not be reused for different instances, unless a previous
#ifdef ITT_MINOR
#undef ITT_MINOR
#endif
-#define ITT_MAJOR 3
-#define ITT_MINOR 0
+#define ITT_MAJOR 3
+#define ITT_MINOR 0
/* Standard versioning of a token with major and minor version numbers */
-#define ITT_VERSIONIZE(x) \
- ITT_JOIN(x, \
- ITT_JOIN(_, \
- ITT_JOIN(ITT_MAJOR, \
- ITT_JOIN(_, ITT_MINOR))))
+#define ITT_VERSIONIZE(x) \
+ ITT_JOIN(x, ITT_JOIN(_, ITT_JOIN(ITT_MAJOR, ITT_JOIN(_, ITT_MINOR))))
#ifndef INTEL_ITTNOTIFY_PREFIX
-# define INTEL_ITTNOTIFY_PREFIX __itt_
+#define INTEL_ITTNOTIFY_PREFIX __itt_
#endif /* INTEL_ITTNOTIFY_PREFIX */
#ifndef INTEL_ITTNOTIFY_POSTFIX
-# define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#define INTEL_ITTNOTIFY_POSTFIX _ptr_
#endif /* INTEL_ITTNOTIFY_POSTFIX */
-#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
-#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, n)
+#define ITTNOTIFY_NAME(n) \
+ ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n, INTEL_ITTNOTIFY_POSTFIX)))
#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
-#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
-
-#define ITTNOTIFY_VOID_D0(n,d) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_VOID_D1(n,d,x) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_VOID_D2(n,d,x,y) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
-#define ITTNOTIFY_DATA_D0(n,d) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_DATA_D1(n,d,x) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_DATA_D2(n,d,x,y) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n, d) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n, d, x) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x)
+#define ITTNOTIFY_VOID_D2(n, d, x, y) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y)
+#define ITTNOTIFY_VOID_D3(n, d, x, y, z) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z)
+#define ITTNOTIFY_VOID_D4(n, d, x, y, z, a) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a)
+#define ITTNOTIFY_VOID_D5(n, d, x, y, z, a, b) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b)
+#define ITTNOTIFY_VOID_D6(n, d, x, y, z, a, b, c) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b, c)
+#define ITTNOTIFY_DATA_D0(n, d) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n, d, x) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x)
+#define ITTNOTIFY_DATA_D2(n, d, x, y) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x, y)
+#define ITTNOTIFY_DATA_D3(n, d, x, y, z) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x, y, z)
+#define ITTNOTIFY_DATA_D4(n, d, x, y, z, a) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a)
+#define ITTNOTIFY_DATA_D5(n, d, x, y, z, a, b) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b)
+#define ITTNOTIFY_DATA_D6(n, d, x, y, z, a, b, c) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b, c)
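Each ITTNOTIFY_*_D* macro is a chained ternary that degrades to a no-op when either the domain's flags word or the weak collector pointer is unset, so instrumented code pays only two tests when no tool is attached. A compilable miniature of the same shape:

  typedef struct { int flags; } domain_t;
  static void (*hook)(const domain_t *); /* filled in by a collector */

  #define NOTIFY_D0(d) \
    (!(d)->flags) ? (void)0 : (!hook) ? (void)0 : hook(d)

  void demo(void) {
    domain_t d = {1};
    NOTIFY_D0(&d); /* no-op here: hook is still NULL */
  }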
#ifdef ITT_STUB
#undef ITT_STUB
@@ -262,9 +296,9 @@ The same ID may not be reused for different instances, unless a previous
#ifdef ITT_STUBV
#undef ITT_STUBV
#endif
-#define ITT_STUBV(api,type,name,args) \
- typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \
- extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUBV(api, type, name, args) \
+ typedef type(api *ITT_JOIN(ITTNOTIFY_NAME(name), _t)) args; \
+ extern ITT_JOIN(ITTNOTIFY_NAME(name), _t) ITTNOTIFY_NAME(name);
#define ITT_STUB ITT_STUBV
/** @endcond */
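With the 3.0 version tokens, one ITT_STUBV line therefore declares a function-pointer type and an extern pointer under the versioned name; for example, ITT_STUBV(ITTAPI, void, pause, (void)) comes out roughly as:

  typedef void (*__itt_pause_ptr__3_0_t)(void);
  extern __itt_pause_ptr__3_0_t __itt_pause_ptr__3_0;

(the ITTAPI calling-convention token is dropped here for brevity).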
@@ -282,9 +316,11 @@ extern "C" {
/**
* @defgroup control Collection Control
* @ingroup public
- * General behavior: application continues to run, but no profiling information is being collected
+ * General behavior: application continues to run, but no profiling information
+ * is being collected
*
- * Pausing occurs not only for the current thread but for all process as well as spawned processes
+ * Pausing occurs not only for the current thread but for the whole process as
+ * well as spawned processes
* - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
* - Does not analyze or report errors that involve memory access.
* - Other errors are reported as usual. Pausing data collection in
@@ -310,25 +346,25 @@ void ITTAPI __itt_detach(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, pause, (void))
+ITT_STUBV(ITTAPI, void, pause, (void))
ITT_STUBV(ITTAPI, void, resume, (void))
ITT_STUBV(ITTAPI, void, detach, (void))
-#define __itt_pause ITTNOTIFY_VOID(pause)
-#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
-#define __itt_resume ITTNOTIFY_VOID(resume)
+#define __itt_pause ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
+#define __itt_resume ITTNOTIFY_VOID(resume)
#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
-#define __itt_detach ITTNOTIFY_VOID(detach)
+#define __itt_detach ITTNOTIFY_VOID(detach)
#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_pause()
-#define __itt_pause_ptr 0
+#define __itt_pause_ptr 0
#define __itt_resume()
#define __itt_resume_ptr 0
#define __itt_detach()
#define __itt_detach_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_pause_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr 0
#define __itt_resume_ptr 0
#define __itt_detach_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
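A hedged usage sketch of the collection-control entry points above; the calls resolve to no-ops whenever no collector has filled in the pointers:

  #include "ittnotify.h" /* this header */

  void workload(void) {
    __itt_pause();  /* warm-up below is excluded from collection */
    /* ... cache warm-up, input parsing ... */
    __itt_resume(); /* measured region starts here */
    /* ... hot loop ... */
  }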
@@ -346,40 +382,40 @@ ITT_STUBV(ITTAPI, void, detach, (void))
* @brief Sets thread name of calling thread
* @param[in] name - name of thread
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_thread_set_nameA(const char *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_thread_set_nameA(const char *name);
void ITTAPI __itt_thread_set_nameW(const wchar_t *name);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_thread_set_name __itt_thread_set_nameW
-# define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr
+#define __itt_thread_set_name __itt_thread_set_nameW
+#define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr
#else /* UNICODE */
-# define __itt_thread_set_name __itt_thread_set_nameA
-# define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr
+#define __itt_thread_set_name __itt_thread_set_nameA
+#define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
void ITTAPI __itt_thread_set_name(const char *name);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name))
ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA)
#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA)
-#define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW)
+#define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW)
#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name)
+#define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name)
#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_thread_set_nameA(name)
#define __itt_thread_set_nameA_ptr 0
#define __itt_thread_set_nameW(name)
@@ -389,8 +425,8 @@ ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name))
#define __itt_thread_set_name_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_thread_set_nameA_ptr 0
#define __itt_thread_set_nameW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -402,7 +438,8 @@ ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name))
/** @cond exclude_from_gpa_documentation */
/**
- * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ * @brief Mark current thread as ignored from this point on, for the duration of
+ * its existence.
*/
void ITTAPI __itt_thread_ignore(void);
@@ -410,13 +447,13 @@ void ITTAPI __itt_thread_ignore(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, thread_ignore, (void))
-#define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore)
+#define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore)
#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_thread_ignore()
#define __itt_thread_ignore_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_thread_ignore_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -430,9 +467,11 @@ ITT_STUBV(ITTAPI, void, thread_ignore, (void))
* @{
*/
+// clang-format off
/*****************************************************************//**
* @name group of functions used for error suppression in correctness tools
*********************************************************************/
+// clang-format on
/** @{ */
/**
* @hideinitializer
@@ -442,13 +481,15 @@ ITT_STUBV(ITTAPI, void, thread_ignore, (void))
/**
* @hideinitializer
- * @brief possible value for suppression mask (suppresses errors from threading analysis)
+ * @brief possible value for suppression mask (suppresses errors from threading
+ * analysis)
*/
#define __itt_suppress_threading_errors 0x000000ff
/**
* @hideinitializer
- * @brief possible value for suppression mask (suppresses errors from memory analysis)
+ * @brief possible value for suppression mask (suppresses errors from memory
+ * analysis)
*/
#define __itt_suppress_memory_errors 0x0000ff00
@@ -461,13 +502,13 @@ void ITTAPI __itt_suppress_push(unsigned int mask);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask))
-#define __itt_suppress_push ITTNOTIFY_VOID(suppress_push)
+#define __itt_suppress_push ITTNOTIFY_VOID(suppress_push)
#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_suppress_push(mask)
#define __itt_suppress_push_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_suppress_push_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -481,13 +522,13 @@ void ITTAPI __itt_suppress_pop(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, suppress_pop, (void))
-#define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop)
+#define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop)
#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_suppress_pop()
#define __itt_suppress_pop_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_suppress_pop_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -497,47 +538,56 @@ ITT_STUBV(ITTAPI, void, suppress_pop, (void))
* @brief Enumerator for the disable methods
*/
typedef enum __itt_suppress_mode {
- __itt_unsuppress_range,
- __itt_suppress_range
+ __itt_unsuppress_range,
+ __itt_suppress_range
} __itt_suppress_mode_t;
/**
- * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask
+ * @brief Mark a range of memory for error suppression or unsuppression for
+ * error types included in mask
*/
-void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode,
+ unsigned int mask, void *address,
+ size_t size);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
-#define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range)
+ITT_STUBV(ITTAPI, void, suppress_mark_range,
+ (__itt_suppress_mode_t mode, unsigned int mask, void *address,
+ size_t size))
+#define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range)
#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_suppress_mark_range(mask)
#define __itt_suppress_mark_range_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_suppress_mark_range_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If not matching
- * call is found, nothing is changed.
+ * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If
+ * no matching call is found, nothing is changed.
*/
-void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode,
+ unsigned int mask, void *address,
+ size_t size);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
-#define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range)
+ITT_STUBV(ITTAPI, void, suppress_clear_range,
+ (__itt_suppress_mode_t mode, unsigned int mask, void *address,
+ size_t size))
+#define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range)
#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_suppress_clear_range(mask)
#define __itt_suppress_clear_range_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_suppress_clear_range_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
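Taken together, the push/pop and range calls bracket code whose reports a correctness tool should ignore. A hedged sketch (assumes this header is included):

  void publish_ready(volatile int *flag) {
    /* Hide the intentional unsynchronized store from threading
       analysis, then restore the previous suppression mask. */
    __itt_suppress_push(__itt_suppress_threading_errors);
    *flag = 1;
    __itt_suppress_pop();
  }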
@@ -560,7 +610,7 @@ ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsig
* @hideinitializer
* @brief possible value of attribute argument for sync object type
*/
-#define __itt_attr_mutex 2
+#define __itt_attr_mutex 2
/**
@brief Name a synchronization object
@@ -573,40 +623,47 @@ no name will be assigned to the object.
@param[in] attribute one of [#__itt_attr_barrier, #__itt_attr_mutex]
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_sync_createA(void *addr, const char *objtype, const char *objname, int attribute);
-void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_createA(void *addr, const char *objtype,
+ const char *objname, int attribute);
+void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype,
+ const wchar_t *objname, int attribute);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_sync_create __itt_sync_createW
-# define __itt_sync_create_ptr __itt_sync_createW_ptr
+#define __itt_sync_create __itt_sync_createW
+#define __itt_sync_create_ptr __itt_sync_createW_ptr
#else /* UNICODE */
-# define __itt_sync_create __itt_sync_createA
-# define __itt_sync_create_ptr __itt_sync_createA_ptr
+#define __itt_sync_create __itt_sync_createA
+#define __itt_sync_create_ptr __itt_sync_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute);
+void ITTAPI __itt_sync_create(void *addr, const char *objtype,
+ const char *objname, int attribute);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute))
-ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const char* objname, int attribute))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA,
+ (void *addr, const char *objtype, const char *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_createW,
+ (void *addr, const wchar_t *objtype, const wchar_t *objname,
+ int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,
+ (void *addr, const char *objtype, const char *objname, int attribute))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_sync_createA ITTNOTIFY_VOID(sync_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_sync_createA ITTNOTIFY_VOID(sync_createA)
#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA)
-#define __itt_sync_createW ITTNOTIFY_VOID(sync_createW)
+#define __itt_sync_createW ITTNOTIFY_VOID(sync_createW)
#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_sync_create ITTNOTIFY_VOID(sync_create)
+#define __itt_sync_create ITTNOTIFY_VOID(sync_create)
#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_createA(addr, objtype, objname, attribute)
#define __itt_sync_createA_ptr 0
#define __itt_sync_createW(addr, objtype, objname, attribute)
@@ -616,8 +673,8 @@ ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const
#define __itt_sync_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_createA_ptr 0
#define __itt_sync_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -634,15 +691,15 @@ synchronization object.
@param[in] addr handle for the synchronization object.
@param[in] name null-terminated object name string.
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_sync_renameA(void *addr, const char *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_renameA(void *addr, const char *name);
void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_sync_rename __itt_sync_renameW
-# define __itt_sync_rename_ptr __itt_sync_renameW_ptr
+#define __itt_sync_rename __itt_sync_renameW
+#define __itt_sync_rename_ptr __itt_sync_renameW_ptr
#else /* UNICODE */
-# define __itt_sync_rename __itt_sync_renameA
-# define __itt_sync_rename_ptr __itt_sync_renameA_ptr
+#define __itt_sync_rename __itt_sync_renameA
+#define __itt_sync_rename_ptr __itt_sync_renameA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
void ITTAPI __itt_sync_rename(void *addr, const char *name);
@@ -651,23 +708,23 @@ void ITTAPI __itt_sync_rename(void *addr, const char *name);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name))
ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA)
#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA)
-#define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW)
+#define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW)
#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_sync_rename ITTNOTIFY_VOID(sync_rename)
+#define __itt_sync_rename ITTNOTIFY_VOID(sync_rename)
#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_renameA(addr, name)
#define __itt_sync_renameA_ptr 0
#define __itt_sync_renameW(addr, name)
@@ -677,8 +734,8 @@ ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name))
#define __itt_sync_rename_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_renameA_ptr 0
#define __itt_sync_renameW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -697,37 +754,39 @@ void ITTAPI __itt_sync_destroy(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr))
-#define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy)
+#define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy)
#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_sync_destroy(addr)
#define __itt_sync_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_sync_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
+// clang-format off
/*****************************************************************//**
* @name group of functions is used for performance measurement tools
*********************************************************************/
+// clang-format on
/** @{ */
/**
* @brief Enter spin loop on user-defined sync object
*/
-void ITTAPI __itt_sync_prepare(void* addr);
+void ITTAPI __itt_sync_prepare(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr))
-#define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare)
+#define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare)
#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_sync_prepare(addr)
#define __itt_sync_prepare_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_sync_prepare_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -741,13 +800,13 @@ void ITTAPI __itt_sync_cancel(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr))
-#define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel)
+#define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel)
#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_sync_cancel(addr)
#define __itt_sync_cancel_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_sync_cancel_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -761,33 +820,34 @@ void ITTAPI __itt_sync_acquired(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr))
-#define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired)
+#define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired)
#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_sync_acquired(addr)
#define __itt_sync_acquired_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_sync_acquired_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Start sync object releasing code. Is called before the lock release call.
+ * @brief Start sync object releasing code. Is called before the lock release
+ * call.
*/
-void ITTAPI __itt_sync_releasing(void* addr);
+void ITTAPI __itt_sync_releasing(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr))
-#define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing)
+#define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing)
#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_sync_releasing(addr)
#define __itt_sync_releasing_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_sync_releasing_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
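The create/prepare/acquired/releasing calls above map naturally onto a hand-written lock; a hedged sketch using a GCC/Clang builtin for the actual atomics:

  typedef struct { volatile long word; } spin_t;

  void spin_init(spin_t *s) {
    s->word = 0;
    __itt_sync_create(s, "spin_t", "demo lock", __itt_attr_mutex);
  }
  void spin_lock(spin_t *s) {
    __itt_sync_prepare(s); /* about to spin */
    while (__sync_lock_test_and_set(&s->word, 1))
      ;
    __itt_sync_acquired(s); /* lock obtained */
  }
  void spin_unlock(spin_t *s) {
    __itt_sync_releasing(s); /* about to release */
    __sync_lock_release(&s->word);
  }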
@@ -795,33 +855,36 @@ ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr))
/** @} sync group */
+// clang-format off
/**************************************************************//**
* @name group of functions is used for correctness checking tools
******************************************************************/
+// clang-format on
/** @{ */
/**
* @ingroup legacy
* @deprecated Legacy API
 * @brief Fast synchronization which does not require spinning.
- * - This special function is to be used by TBB and OpenMP libraries only when they know
- * there is no spin but they need to suppress TC warnings about shared variable modifications.
- * - It only has corresponding pointers in static library and does not have corresponding function
- * in dynamic library.
+ * - This special function is to be used by TBB and OpenMP libraries only when
+ * they know there is no spin but they need to suppress TC warnings about shared
+ * variable modifications.
+ * - It only has corresponding pointers in static library and does not have
+ * corresponding function in dynamic library.
* @see void __itt_sync_prepare(void* addr);
*/
-void ITTAPI __itt_fsync_prepare(void* addr);
+void ITTAPI __itt_fsync_prepare(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr))
-#define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare)
+#define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare)
#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_fsync_prepare(addr)
#define __itt_fsync_prepare_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_fsync_prepare_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -830,10 +893,11 @@ ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr))
* @ingroup legacy
* @deprecated Legacy API
 * @brief Fast synchronization which does not require spinning.
- * - This special function is to be used by TBB and OpenMP libraries only when they know
- * there is no spin but they need to suppress TC warnings about shared variable modifications.
- * - It only has corresponding pointers in static library and does not have corresponding function
- * in dynamic library.
+ * - This special function is to be used by TBB and OpenMP libraries only when
+ * they know there is no spin but they need to suppress TC warnings about shared
+ * variable modifications.
+ * - It only has corresponding pointers in static library and does not have
+ * corresponding function in dynamic library.
* @see void __itt_sync_cancel(void *addr);
*/
void ITTAPI __itt_fsync_cancel(void *addr);
@@ -842,13 +906,13 @@ void ITTAPI __itt_fsync_cancel(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr))
-#define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel)
+#define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel)
#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_fsync_cancel(addr)
#define __itt_fsync_cancel_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_fsync_cancel_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -857,10 +921,11 @@ ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr))
* @ingroup legacy
* @deprecated Legacy API
 * @brief Fast synchronization which does not require spinning.
- * - This special function is to be used by TBB and OpenMP libraries only when they know
- * there is no spin but they need to suppress TC warnings about shared variable modifications.
- * - It only has corresponding pointers in static library and does not have corresponding function
- * in dynamic library.
+ * - This special function is to be used by TBB and OpenMP libraries only when
+ * they know there is no spin but they need to suppress TC warnings about shared
+ * variable modifications.
+ * - It only has corresponding pointers in static library and does not have
+ * corresponding function in dynamic library.
* @see void __itt_sync_acquired(void *addr);
*/
void ITTAPI __itt_fsync_acquired(void *addr);
@@ -869,13 +934,13 @@ void ITTAPI __itt_fsync_acquired(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr))
-#define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired)
+#define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired)
#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_fsync_acquired(addr)
#define __itt_fsync_acquired_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_fsync_acquired_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -884,25 +949,26 @@ ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr))
* @ingroup legacy
* @deprecated Legacy API
 * @brief Fast synchronization which does not require spinning.
- * - This special function is to be used by TBB and OpenMP libraries only when they know
- * there is no spin but they need to suppress TC warnings about shared variable modifications.
- * - It only has corresponding pointers in static library and does not have corresponding function
- * in dynamic library.
+ * - This special function is to be used by TBB and OpenMP libraries only when
+ * they know there is no spin but they need to suppress TC warnings about shared
+ * variable modifications.
+ * - It only has corresponding pointers in static library and does not have
+ * corresponding function in dynamic library.
* @see void __itt_sync_releasing(void* addr);
*/
-void ITTAPI __itt_fsync_releasing(void* addr);
+void ITTAPI __itt_fsync_releasing(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr))
-#define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing)
+#define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing)
#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_fsync_releasing(addr)
#define __itt_fsync_releasing_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_fsync_releasing_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -935,18 +1001,20 @@ ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr))
*/
#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL)
-typedef void* __itt_model_site; /*!< @brief handle for lexical site */
-typedef void* __itt_model_site_instance; /*!< @brief handle for dynamic instance */
-typedef void* __itt_model_task; /*!< @brief handle for lexical site */
-typedef void* __itt_model_task_instance; /*!< @brief handle for dynamic instance */
+typedef void *__itt_model_site; /*!< @brief handle for lexical site */
+typedef void
+ *__itt_model_site_instance; /*!< @brief handle for dynamic instance */
+typedef void *__itt_model_task; /*!< @brief handle for lexical site */
+typedef void
+ *__itt_model_task_instance; /*!< @brief handle for dynamic instance */
/**
* @enum __itt_model_disable
* @brief Enumerator for the disable methods
*/
typedef enum {
- __itt_model_disable_observation,
- __itt_model_disable_collection
+ __itt_model_disable_observation,
+ __itt_model_disable_collection
} __itt_model_disable;
#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */
@@ -963,65 +1031,72 @@ typedef enum {
* lexical sites match, it is unspecified whether they are treated as the
* same or different for data presentation.
*/
-void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name);
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_site_begin(__itt_model_site *site,
+ __itt_model_site_instance *instance,
+ const char *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
void ITTAPI __itt_model_site_beginW(const wchar_t *name);
#endif
void ITTAPI __itt_model_site_beginA(const char *name);
void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen);
-void ITTAPI __itt_model_site_end (__itt_model_site *site, __itt_model_site_instance *instance);
+void ITTAPI __itt_model_site_end(__itt_model_site *site,
+ __itt_model_site_instance *instance);
void ITTAPI __itt_model_site_end_2(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name))
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name))
+ITT_STUBV(ITTAPI, void, model_site_begin,
+ (__itt_model_site * site, __itt_model_site_instance *instance,
+ const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name))
#endif
-ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name))
-ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t siteNameLen))
-ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance))
-ITT_STUBV(ITTAPI, void, model_site_end_2, (void))
-#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin)
-#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin)
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW)
-#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW)
+ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_site_beginAL,
+ (const char *name, size_t siteNameLen))
+ITT_STUBV(ITTAPI, void, model_site_end,
+ (__itt_model_site * site, __itt_model_site_instance *instance))
+ITT_STUBV(ITTAPI, void, model_site_end_2, (void))
+#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin)
+#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW)
+#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW)
#endif
-#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA)
-#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA)
-#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL)
-#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL)
-#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end)
-#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end)
-#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2)
-#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA)
+#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA)
+#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL)
+#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL)
+#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end)
+#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end)
+#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2)
+#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_site_begin(site, instance, name)
-#define __itt_model_site_begin_ptr 0
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_begin_ptr 0
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_model_site_beginW(name)
-#define __itt_model_site_beginW_ptr 0
+#define __itt_model_site_beginW_ptr 0
#endif
#define __itt_model_site_beginA(name)
-#define __itt_model_site_beginA_ptr 0
+#define __itt_model_site_beginA_ptr 0
#define __itt_model_site_beginAL(name, siteNameLen)
-#define __itt_model_site_beginAL_ptr 0
+#define __itt_model_site_beginAL_ptr 0
#define __itt_model_site_end(site, instance)
-#define __itt_model_site_end_ptr 0
+#define __itt_model_site_end_ptr 0
#define __itt_model_site_end_2()
-#define __itt_model_site_end_2_ptr 0
+#define __itt_model_site_end_2_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_site_begin_ptr 0
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_model_site_beginW_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_site_begin_ptr 0
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_model_site_beginW_ptr 0
#endif
-#define __itt_model_site_beginA_ptr 0
-#define __itt_model_site_beginAL_ptr 0
-#define __itt_model_site_end_ptr 0
-#define __itt_model_site_end_2_ptr 0
+#define __itt_model_site_beginA_ptr 0
+#define __itt_model_site_beginAL_ptr 0
+#define __itt_model_site_end_ptr 0
+#define __itt_model_site_end_2_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
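A hedged usage sketch pairing the site entry points above with the task variants declared just below (the _2 forms take no handle arguments):

  void candidate_region(void) {
    __itt_model_site_beginA("my-site");
    for (int i = 0; i < 8; i++) {
      __itt_model_task_beginA("my-task");
      /* ... body being evaluated for parallelization ... */
      __itt_model_task_end_2();
    }
    __itt_model_site_end_2();
  }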
@@ -1036,8 +1111,10 @@ ITT_STUBV(ITTAPI, void, model_site_end_2, (void))
* should not fail due to construct nesting issues, nor attempt to directly
* indicate the problem.
*/
-void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name);
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_task_begin(__itt_model_task *task,
+ __itt_model_task_instance *instance,
+ const char *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
void ITTAPI __itt_model_task_beginW(const wchar_t *name);
void ITTAPI __itt_model_iteration_taskW(const wchar_t *name);
#endif
@@ -1045,74 +1122,80 @@ void ITTAPI __itt_model_task_beginA(const char *name);
void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen);
void ITTAPI __itt_model_iteration_taskA(const char *name);
void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen);
-void ITTAPI __itt_model_task_end (__itt_model_task *task, __itt_model_task_instance *instance);
+void ITTAPI __itt_model_task_end(__itt_model_task *task,
+ __itt_model_task_instance *instance);
void ITTAPI __itt_model_task_end_2(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name))
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name))
+ITT_STUBV(ITTAPI, void, model_task_begin,
+ (__itt_model_task * task, __itt_model_task_instance *instance,
+ const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name))
ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name))
#endif
-ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name))
-ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t taskNameLen))
-ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name))
-ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t taskNameLen))
-ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance))
-ITT_STUBV(ITTAPI, void, model_task_end_2, (void))
-#define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin)
-#define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin)
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW)
+ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_task_beginAL,
+ (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL,
+ (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_task_end,
+ (__itt_model_task * task, __itt_model_task_instance *instance))
+ITT_STUBV(ITTAPI, void, model_task_end_2, (void))
+#define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin)
+#define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW)
#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW)
-#define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW)
+#define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW)
#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW)
#endif
-#define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA)
+#define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA)
#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA)
-#define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL)
+#define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL)
#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL)
-#define __itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA)
+#define __itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA)
#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA)
-#define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL)
+#define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL)
#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL)
-#define __itt_model_task_end ITTNOTIFY_VOID(model_task_end)
-#define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end)
-#define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2)
-#define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_task_end ITTNOTIFY_VOID(model_task_end)
+#define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end)
+#define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2)
+#define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_task_begin(task, instance, name)
-#define __itt_model_task_begin_ptr 0
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_begin_ptr 0
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_model_task_beginW(name)
-#define __itt_model_task_beginW_ptr 0
+#define __itt_model_task_beginW_ptr 0
#endif
#define __itt_model_task_beginA(name)
-#define __itt_model_task_beginA_ptr 0
+#define __itt_model_task_beginA_ptr 0
#define __itt_model_task_beginAL(name, siteNameLen)
-#define __itt_model_task_beginAL_ptr 0
+#define __itt_model_task_beginAL_ptr 0
#define __itt_model_iteration_taskA(name)
-#define __itt_model_iteration_taskA_ptr 0
+#define __itt_model_iteration_taskA_ptr 0
#define __itt_model_iteration_taskAL(name, siteNameLen)
-#define __itt_model_iteration_taskAL_ptr 0
+#define __itt_model_iteration_taskAL_ptr 0
#define __itt_model_task_end(task, instance)
-#define __itt_model_task_end_ptr 0
+#define __itt_model_task_end_ptr 0
#define __itt_model_task_end_2()
-#define __itt_model_task_end_2_ptr 0
+#define __itt_model_task_end_2_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_task_begin_ptr 0
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_task_begin_ptr 0
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_model_task_beginW_ptr 0
#endif
-#define __itt_model_task_beginA_ptr 0
-#define __itt_model_task_beginAL_ptr 0
-#define __itt_model_iteration_taskA_ptr 0
-#define __itt_model_iteration_taskAL_ptr 0
-#define __itt_model_task_end_ptr 0
-#define __itt_model_task_end_2_ptr 0
+#define __itt_model_task_beginA_ptr 0
+#define __itt_model_task_beginAL_ptr 0
+#define __itt_model_iteration_taskA_ptr 0
+#define __itt_model_iteration_taskAL_ptr 0
+#define __itt_model_task_end_ptr 0
+#define __itt_model_task_end_2_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
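
The site/task annotations above bracket code you intend to parallelize so the
modeling tool can evaluate it. A minimal caller-side sketch, assuming the
ANSI __itt_model_site_beginA/__itt_model_site_end_2 pair whose macros appear
earlier in this hunk (the function and loop below are illustrative):

#include "ittnotify.h"

void scale_all(int *data, int n) {
  __itt_model_site_beginA("scale_site");   /* would-be parallel region */
  for (int i = 0; i < n; ++i) {
    __itt_model_task_beginA("scale_task"); /* would-be parallel task */
    data[i] *= 2;
    __itt_model_task_end_2();
  }
  __itt_model_site_end_2();
}
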
@@ -1140,15 +1223,15 @@ ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock))
ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock))
ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock))
ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
-#define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire)
+#define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire)
#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire)
-#define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2)
+#define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2)
#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2)
-#define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release)
+#define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release)
#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release)
-#define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2)
+#define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2)
#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_lock_acquire(lock)
#define __itt_model_lock_acquire_ptr 0
#define __itt_model_lock_acquire_2(lock)
@@ -1158,7 +1241,7 @@ ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
#define __itt_model_lock_release_2(lock)
#define __itt_model_lock_release_2_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_lock_acquire_ptr 0
#define __itt_model_lock_acquire_2_ptr 0
#define __itt_model_lock_release_ptr 0
@@ -1173,26 +1256,29 @@ ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
* behavior, which may be required for correctness modeling to understand
* when storage is not expected to be actually reused across threads.
*/
-void ITTAPI __itt_model_record_allocation (void *addr, size_t size);
+void ITTAPI __itt_model_record_allocation(void *addr, size_t size);
void ITTAPI __itt_model_record_deallocation(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size))
+ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size))
ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr))
-#define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation)
-#define __itt_model_record_allocation_ptr ITTNOTIFY_NAME(model_record_allocation)
-#define __itt_model_record_deallocation ITTNOTIFY_VOID(model_record_deallocation)
-#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation)
+#define __itt_model_record_allocation_ptr \
+ ITTNOTIFY_NAME(model_record_allocation)
+#define __itt_model_record_deallocation \
+ ITTNOTIFY_VOID(model_record_deallocation)
+#define __itt_model_record_deallocation_ptr \
+ ITTNOTIFY_NAME(model_record_deallocation)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_record_allocation(addr, size)
-#define __itt_model_record_allocation_ptr 0
+#define __itt_model_record_allocation_ptr 0
#define __itt_model_record_deallocation(addr)
#define __itt_model_record_deallocation_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_record_allocation_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_record_allocation_ptr 0
#define __itt_model_record_deallocation_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
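
__itt_model_record_allocation/__itt_model_record_deallocation describe
user-level allocators, so the natural call sites are inside the allocator
itself. A sketch under that assumption (the bump allocator is illustrative
and deliberately unchecked):

#include <stddef.h>
#include "ittnotify.h"

static char arena[4096];
static size_t arena_top;

void *arena_alloc(size_t size) {
  void *addr = arena + arena_top;
  arena_top += size; /* sketch only: no overflow handling */
  __itt_model_record_allocation(addr, size); /* block is logically new */
  return addr;
}

void arena_free(void *addr) {
  __itt_model_record_deallocation(addr); /* storage may be reused later */
}
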
@@ -1202,20 +1288,20 @@ ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr))
*
* Note particular storage is inductive through the end of the current site
*/
-void ITTAPI __itt_model_induction_uses(void* addr, size_t size);
+void ITTAPI __itt_model_induction_uses(void *addr, size_t size);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
-#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses)
+#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses)
#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_induction_uses(addr, size)
-#define __itt_model_induction_uses_ptr 0
+#define __itt_model_induction_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_induction_uses_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_induction_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1225,20 +1311,20 @@ ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
* Note particular storage is used for reduction through the end
* of the current site
*/
-void ITTAPI __itt_model_reduction_uses(void* addr, size_t size);
+void ITTAPI __itt_model_reduction_uses(void *addr, size_t size);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
-#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses)
+#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses)
#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_reduction_uses(addr, size)
-#define __itt_model_reduction_uses_ptr 0
+#define __itt_model_reduction_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_reduction_uses_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_reduction_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1248,20 +1334,20 @@ ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
* Have correctness modeling record observations about uses of storage
* through the end of the current site
*/
-void ITTAPI __itt_model_observe_uses(void* addr, size_t size);
+void ITTAPI __itt_model_observe_uses(void *addr, size_t size);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
-#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses)
+#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses)
#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_observe_uses(addr, size)
-#define __itt_model_observe_uses_ptr 0
+#define __itt_model_observe_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_model_observe_uses_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_model_observe_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1271,19 +1357,19 @@ ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
* Clear the special handling of a piece of storage related to induction,
* reduction or observe_uses
*/
-void ITTAPI __itt_model_clear_uses(void* addr);
+void ITTAPI __itt_model_clear_uses(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr))
-#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses)
+#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses)
#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_clear_uses(addr)
#define __itt_model_clear_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_clear_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
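
The *_uses hints above (induction, reduction, observe, clear) all apply
through the end of the current site, so they are typically issued right after
the site begins. A combined sketch reusing the illustrative site annotations
from earlier:

#include "ittnotify.h"

long sum_hinted(const long *v, int n) {
  long sum = 0;
  int i;
  __itt_model_site_beginA("sum_site");
  __itt_model_induction_uses(&i, sizeof i);     /* i advances predictably */
  __itt_model_reduction_uses(&sum, sizeof sum); /* sum is accumulated */
  __itt_model_observe_uses((void *)v, (size_t)n * sizeof *v);
  for (i = 0; i < n; ++i)
    sum += v[i];
  __itt_model_clear_uses(&sum); /* restore normal handling early */
  __itt_model_site_end_2();
  return sum;
}
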
@@ -1317,15 +1403,15 @@ void ITTAPI __itt_model_aggregate_task(size_t x);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x))
-ITT_STUBV(ITTAPI, void, model_disable_pop, (void))
+ITT_STUBV(ITTAPI, void, model_disable_pop, (void))
ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
-#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push)
+#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push)
#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push)
-#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop)
-#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop)
-#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task)
-#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop)
+#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop)
+#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task)
+#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_disable_push(x)
#define __itt_model_disable_push_ptr 0
#define __itt_model_disable_pop()
@@ -1333,7 +1419,7 @@ ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
#define __itt_model_aggregate_task(x)
#define __itt_model_aggregate_task_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_disable_push_ptr 0
#define __itt_model_disable_pop_ptr 0
#define __itt_model_aggregate_task_ptr 0
@@ -1348,61 +1434,67 @@ ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
* @{
*/
-typedef void* __itt_heap_function;
+typedef void *__itt_heap_function;
/**
* @brief Create an identification for heap function
* @return non-zero identifier or NULL
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_heap_function ITTAPI __itt_heap_function_createA(const char* name, const char* domain);
-__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_heap_function ITTAPI __itt_heap_function_createA(const char *name,
+ const char *domain);
+__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t *name,
+ const wchar_t *domain);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_heap_function_create __itt_heap_function_createW
-# define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
+#define __itt_heap_function_create __itt_heap_function_createW
+#define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
#else
-# define __itt_heap_function_create __itt_heap_function_createA
-# define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
+#define __itt_heap_function_create __itt_heap_function_createA
+#define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_heap_function ITTAPI __itt_heap_function_create(const char *name,
+ const char *domain);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char* name, const char* domain))
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* name, const char* domain))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA,
+ (const char *name, const char *domain))
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW,
+ (const wchar_t *name, const wchar_t *domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,
+ (const char *name, const char *domain))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA)
#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA)
-#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW)
+#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW)
#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create)
-#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create)
+#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create)
+#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0
#define __itt_heap_function_createA_ptr 0
#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0
#define __itt_heap_function_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_heap_function_create(name, domain) (__itt_heap_function)0
-#define __itt_heap_function_create_ptr 0
+#define __itt_heap_function_create(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_heap_function_createA_ptr 0
#define __itt_heap_function_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_heap_function_create_ptr 0
+#define __itt_heap_function_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
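
Heap function handles are typically created once, one per allocator entry
point, and cached. A sketch using the non-Windows __itt_heap_function_create
shown above (the names are illustrative):

#include "ittnotify.h"

static __itt_heap_function g_malloc_fn, g_free_fn, g_realloc_fn;

void heap_tracking_init(void) {
  g_malloc_fn = __itt_heap_function_create("my_malloc", "example_allocator");
  g_free_fn = __itt_heap_function_create("my_free", "example_allocator");
  g_realloc_fn = __itt_heap_function_create("my_realloc", "example_allocator");
}
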
@@ -1410,120 +1502,130 @@ ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* nam
/**
* @brief Record an allocation begin occurrence.
*/
-void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized);
+void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size,
+ int initialized);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized))
-#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin)
+ITT_STUBV(ITTAPI, void, heap_allocate_begin,
+ (__itt_heap_function h, size_t size, int initialized))
+#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin)
#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_allocate_begin(h, size, initialized)
-#define __itt_heap_allocate_begin_ptr 0
+#define __itt_heap_allocate_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_allocate_begin_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @brief Record an allocation end occurrence.
*/
-void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized);
+void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void **addr,
+ size_t size, int initialized);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized))
-#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end)
+ITT_STUBV(ITTAPI, void, heap_allocate_end,
+ (__itt_heap_function h, void **addr, size_t size, int initialized))
+#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end)
#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_allocate_end(h, addr, size, initialized)
-#define __itt_heap_allocate_end_ptr 0
+#define __itt_heap_allocate_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_allocate_end_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
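
The begin/end pair wraps the real allocation, and the end call takes the
address of the result so the tool can see where the block landed. A sketch
wrapping malloc with the hypothetical g_malloc_fn handle from the init
sketch above:

#include <stdlib.h>
#include "ittnotify.h"

extern __itt_heap_function g_malloc_fn;

void *my_malloc(size_t size) {
  void *p;
  __itt_heap_allocate_begin(g_malloc_fn, size, 0 /* not initialized */);
  p = malloc(size);
  __itt_heap_allocate_end(g_malloc_fn, &p, size, 0);
  return p;
}
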
/**
* @brief Record a free begin occurrence.
*/
-void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);
+void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
-#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin)
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void *addr))
+#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin)
#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_free_begin(h, addr)
-#define __itt_heap_free_begin_ptr 0
+#define __itt_heap_free_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_free_begin_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @brief Record a free end occurrence.
*/
-void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);
+void ITTAPI __itt_heap_free_end(__itt_heap_function h, void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
-#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end)
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void *addr))
+#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end)
#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_free_end(h, addr)
-#define __itt_heap_free_end_ptr 0
+#define __itt_heap_free_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_free_end_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
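
Frees follow the same bracketing pattern, reporting the same address on both
sides (g_free_fn is the hypothetical handle from above):

#include <stdlib.h>
#include "ittnotify.h"

extern __itt_heap_function g_free_fn;

void my_free(void *p) {
  __itt_heap_free_begin(g_free_fn, p);
  free(p);
  __itt_heap_free_end(g_free_fn, p);
}
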
/**
* @brief Record a reallocation begin occurrence.
*/
-void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);
+void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void *addr,
+ size_t new_size, int initialized);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized))
-#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin)
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin,
+ (__itt_heap_function h, void *addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin)
#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_reallocate_begin(h, addr, new_size, initialized)
-#define __itt_heap_reallocate_begin_ptr 0
+#define __itt_heap_reallocate_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_reallocate_begin_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @brief Record a reallocation end occurrence.
*/
-void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);
+void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void *addr,
+ void **new_addr, size_t new_size,
+ int initialized);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized))
-#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end)
+ITT_STUBV(ITTAPI, void, heap_reallocate_end,
+ (__itt_heap_function h, void *addr, void **new_addr, size_t new_size,
+ int initialized))
+#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end)
#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized)
-#define __itt_heap_reallocate_end_ptr 0
+#define __itt_heap_reallocate_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_reallocate_end_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
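
Reallocation reports the old address on begin, then the old address plus the
address of the new pointer on end, since the block may move. Sketch
(g_realloc_fn as above):

#include <stdlib.h>
#include "ittnotify.h"

extern __itt_heap_function g_realloc_fn;

void *my_realloc(void *p, size_t new_size) {
  void *q;
  __itt_heap_reallocate_begin(g_realloc_fn, p, new_size, 0);
  q = realloc(p, new_size);
  __itt_heap_reallocate_end(g_realloc_fn, p, &q, new_size, 0);
  return q;
}
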
@@ -1533,15 +1635,17 @@ void ITTAPI __itt_heap_internal_access_begin(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void))
-#define __itt_heap_internal_access_begin ITTNOTIFY_VOID(heap_internal_access_begin)
-#define __itt_heap_internal_access_begin_ptr ITTNOTIFY_NAME(heap_internal_access_begin)
-#else /* INTEL_NO_ITTNOTIFY_API */
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void))
+#define __itt_heap_internal_access_begin \
+ ITTNOTIFY_VOID(heap_internal_access_begin)
+#define __itt_heap_internal_access_begin_ptr \
+ ITTNOTIFY_NAME(heap_internal_access_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_internal_access_begin()
-#define __itt_heap_internal_access_begin_ptr 0
+#define __itt_heap_internal_access_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_internal_access_begin_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1552,13 +1656,14 @@ void ITTAPI __itt_heap_internal_access_end(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void))
-#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end)
-#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end)
+#define __itt_heap_internal_access_end_ptr \
+ ITTNOTIFY_NAME(heap_internal_access_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_internal_access_end()
#define __itt_heap_internal_access_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_internal_access_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
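
The internal-access pair lets an allocator touch its own bookkeeping without
a tool mistaking those reads for user accesses to heap blocks. A sketch with
a hypothetical free-list node type:

#include <stddef.h>
#include "ittnotify.h"

struct fl_node { struct fl_node *next; }; /* hypothetical free-list node */

size_t count_free(struct fl_node *head) {
  size_t n = 0;
  __itt_heap_internal_access_begin();
  for (struct fl_node *b = head; b != NULL; b = b->next)
    ++n; /* metadata walk, not a user access */
  __itt_heap_internal_access_end();
  return n;
}
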
@@ -1569,15 +1674,17 @@ void ITTAPI __itt_heap_record_memory_growth_begin(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void))
-#define __itt_heap_record_memory_growth_begin ITTNOTIFY_VOID(heap_record_memory_growth_begin)
-#define __itt_heap_record_memory_growth_begin_ptr ITTNOTIFY_NAME(heap_record_memory_growth_begin)
-#else /* INTEL_NO_ITTNOTIFY_API */
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void))
+#define __itt_heap_record_memory_growth_begin \
+ ITTNOTIFY_VOID(heap_record_memory_growth_begin)
+#define __itt_heap_record_memory_growth_begin_ptr \
+ ITTNOTIFY_NAME(heap_record_memory_growth_begin)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_record_memory_growth_begin()
-#define __itt_heap_record_memory_growth_begin_ptr 0
+#define __itt_heap_record_memory_growth_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_record_memory_growth_begin_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_memory_growth_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1588,13 +1695,15 @@ void ITTAPI __itt_heap_record_memory_growth_end(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
-#define __itt_heap_record_memory_growth_end ITTNOTIFY_VOID(heap_record_memory_growth_end)
-#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record_memory_growth_end \
+ ITTNOTIFY_VOID(heap_record_memory_growth_end)
+#define __itt_heap_record_memory_growth_end_ptr \
+ ITTNOTIFY_NAME(heap_record_memory_growth_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_record_memory_growth_end()
#define __itt_heap_record_memory_growth_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_record_memory_growth_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
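
The growth pair brackets a phase whose net heap growth is of interest. A
minimal sketch:

#include "ittnotify.h"

void measure_phase(void (*phase)(void)) {
  __itt_heap_record_memory_growth_begin();
  phase(); /* allocations surviving this call count as growth */
  __itt_heap_record_memory_growth_end();
}
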
@@ -1614,22 +1723,21 @@ ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
*/
#define __itt_heap_growth 0x00000002
-
/** @brief heap reset detection */
void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask))
-#define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection)
-#define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection)
-#else /* INTEL_NO_ITTNOTIFY_API */
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask))
+#define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection)
+#define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_reset_detection()
-#define __itt_heap_reset_detection_ptr 0
+#define __itt_heap_reset_detection_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_reset_detection_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reset_detection_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1640,13 +1748,13 @@ void ITTAPI __itt_heap_record(unsigned int record_mask);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
-#define __itt_heap_record ITTNOTIFY_VOID(heap_record)
+#define __itt_heap_record ITTNOTIFY_VOID(heap_record)
#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_record()
#define __itt_heap_record_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_record_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
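
Both calls take a bit mask; __itt_heap_growth is the only flag visible in
this hunk, so the sketch sticks to that bit. Resetting before the workload
and recording after it is one reading of the intended sequence, not a
documented recipe:

#include "ittnotify.h"

extern void workload(void); /* illustrative */

void growth_snapshot(void) {
  __itt_heap_reset_detection(__itt_heap_growth); /* discard earlier state */
  workload();
  __itt_heap_record(__itt_heap_growth); /* record the growth just seen */
}
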
@@ -1665,18 +1773,18 @@ ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_domain
-{
- volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */
- const char* nameA; /*!< Copy of original name in ASCII. */
+typedef struct ___itt_domain {
+ volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of
+ different non-zero values is reserved to the runtime */
+ const char *nameA; /*!< Copy of original name in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* nameW;
+ const wchar_t *nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void *nameW;
#endif /* UNICODE || _UNICODE */
- int extra1; /*!< Reserved to the runtime */
- void* extra2; /*!< Reserved to the runtime */
- struct ___itt_domain* next;
+ int extra1; /*!< Reserved to the runtime */
+ void *extra2; /*!< Reserved to the runtime */
+ struct ___itt_domain *next;
} __itt_domain;
#pragma pack(pop)
@@ -1692,55 +1800,55 @@ typedef struct ___itt_domain
* which thread created the domain. This call is thread-safe.
* @param[in] name name of domain
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_domain* ITTAPI __itt_domain_createA(const char *name);
-__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_domain *ITTAPI __itt_domain_createA(const char *name);
+__itt_domain *ITTAPI __itt_domain_createW(const wchar_t *name);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_domain_create __itt_domain_createW
-# define __itt_domain_create_ptr __itt_domain_createW_ptr
+#define __itt_domain_create __itt_domain_createW
+#define __itt_domain_create_ptr __itt_domain_createW_ptr
#else /* UNICODE */
-# define __itt_domain_create __itt_domain_createA
-# define __itt_domain_create_ptr __itt_domain_createA_ptr
+#define __itt_domain_create __itt_domain_createA
+#define __itt_domain_create_ptr __itt_domain_createA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_domain* ITTAPI __itt_domain_create(const char *name);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_domain *ITTAPI __itt_domain_create(const char *name);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name))
-ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain *, domain_createA, (const char *name))
+ITT_STUB(ITTAPI, __itt_domain *, domain_createW, (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain *, domain_create, (const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA)
#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
-#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW)
+#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW)
#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create ITTNOTIFY_DATA(domain_create)
+#define __itt_domain_create ITTNOTIFY_DATA(domain_create)
#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_domain_createA(name) (__itt_domain*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_domain_createA(name) (__itt_domain *)0
#define __itt_domain_createA_ptr 0
-#define __itt_domain_createW(name) (__itt_domain*)0
+#define __itt_domain_createW(name) (__itt_domain *)0
#define __itt_domain_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create(name) (__itt_domain*)0
+#define __itt_domain_create(name) (__itt_domain *)0
#define __itt_domain_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_domain_createA_ptr 0
#define __itt_domain_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create_ptr 0
+#define __itt_domain_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
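
Because domain creation is thread-safe and repeated calls with one name
return the same handle, the usual pattern is to create the domain once and
cache it; the collector controls d->flags afterwards. Sketch with an
illustrative name:

#include "ittnotify.h"

static __itt_domain *get_domain(void) {
  static __itt_domain *d;
  if (d == NULL) /* benign race: create returns the same handle anyway */
    d = __itt_domain_create("com.example.mylib");
  return d;
}
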
@@ -1756,35 +1864,36 @@ ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name))
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_id
-{
- unsigned long long d1, d2, d3;
+typedef struct ___itt_id {
+ unsigned long long d1, d2, d3;
} __itt_id;
#pragma pack(pop)
/** @endcond */
-static const __itt_id __itt_null = { 0, 0, 0 };
+static const __itt_id __itt_null = {0, 0, 0};
/**
* @ingroup ids
- * @brief A convenience function is provided to create an ID without domain control.
- * @brief This is a convenience function to initialize an __itt_id structure. This function
- * does not affect the collector runtime in any way. After you make the ID with this
- * function, you still must create it with the __itt_id_create function before using the ID
- * to identify a named entity.
+ * @brief A convenience function is provided to create an ID without domain
+ * control.
+ * @brief This is a convenience function to initialize an __itt_id structure.
+ * This function does not affect the collector runtime in any way. After you
+ * make the ID with this function, you still must create it with the
+ * __itt_id_create function before using the ID to identify a named entity.
* @param[in] addr The address of object; high QWORD of the ID value.
- * @param[in] extra The extra data to unique identify object; low QWORD of the ID value.
- */
-
-ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE;
-ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra)
-{
- __itt_id id = __itt_null;
- id.d1 = (unsigned long long)((uintptr_t)addr);
- id.d2 = (unsigned long long)extra;
- id.d3 = (unsigned long long)0; /* Reserved. Must be zero */
- return id;
+ * @param[in] extra The extra data to unique identify object; low QWORD of the
+ * ID value.
+ */
+
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void *addr, unsigned long long extra)
+ ITT_INLINE_ATTRIBUTE;
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void *addr, unsigned long long extra) {
+ __itt_id id = __itt_null;
+ id.d1 = (unsigned long long)((uintptr_t)addr);
+ id.d2 = (unsigned long long)extra;
+ id.d3 = (unsigned long long)0; /* Reserved. Must be zero */
+ return id;
}
/**
@@ -1805,13 +1914,13 @@ void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
-#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x)
-#define __itt_id_create_ptr ITTNOTIFY_NAME(id_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_id_create(domain,id)
+#define __itt_id_create(d, x) ITTNOTIFY_VOID_D1(id_create, d, x)
+#define __itt_id_create_ptr ITTNOTIFY_NAME(id_create)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create(domain, id)
#define __itt_id_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_id_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -1819,10 +1928,10 @@ ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
/**
* @ingroup ids
* @brief Destroy an instance of identifier.
- * This ends the lifetime of the current instance of the given ID value in the trace.
- * Any relationships that are established after this lifetime ends are invalid.
- * This call must be performed before the given ID value can be reused for a different
- * named entity instance.
+ * This ends the lifetime of the current instance of the given ID value in the
+ * trace. Any relationships that are established after this lifetime ends are
+ * invalid. This call must be performed before the given ID value can be reused
+ * for a different named entity instance.
* @param[in] domain The domain controlling the execution of this call.
* @param[in] id The ID to destroy.
*/
@@ -1832,13 +1941,13 @@ void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
-#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
-#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_id_destroy(domain,id)
+#define __itt_id_destroy(d, x) ITTNOTIFY_VOID_D1(id_destroy, d, x)
+#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_destroy(domain, id)
#define __itt_id_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_id_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
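
Putting the three ID calls together: derive a value with __itt_id_make,
register the instance with __itt_id_create before first use, and retire it
with __itt_id_destroy before the value can be recycled. Sketch:

#include "ittnotify.h"

void track_object(const __itt_domain *d, void *obj) {
  __itt_id id = __itt_id_make(obj, 0); /* address + 0 as the extra qword */
  __itt_id_create(d, id);
  /* ... pass the same id to region/task calls for this object ... */
  __itt_id_destroy(d, id); /* required before this id value is reused */
}
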
@@ -1854,17 +1963,16 @@ ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_string_handle
-{
- const char* strA; /*!< Copy of original string in ASCII. */
+typedef struct ___itt_string_handle {
+ const char *strA; /*!< Copy of original string in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* strW; /*!< Copy of original string in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* strW;
+ const wchar_t *strW; /*!< Copy of original string in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void *strW;
#endif /* UNICODE || _UNICODE */
- int extra1; /*!< Reserved. Must be zero */
- void* extra2; /*!< Reserved. Must be zero */
- struct ___itt_string_handle* next;
+ int extra1; /*!< Reserved. Must be zero */
+ void *extra2; /*!< Reserved. Must be zero */
+ struct ___itt_string_handle *next;
} __itt_string_handle;
#pragma pack(pop)
@@ -1875,61 +1983,65 @@ typedef struct ___itt_string_handle
* @brief Create a string handle.
* Create and return handle value that can be associated with a string.
* Consecutive calls to __itt_string_handle_create with the same name
- * return the same value. Because the set of string handles is expected to remain
- * static during the application's execution time, there is no mechanism to destroy a string handle.
- * Any string handle can be accessed by any thread in the process, regardless of which thread created
- * the string handle. This call is thread-safe.
+ * return the same value. Because the set of string handles is expected to
+ * remain static during the application's execution time, there is no mechanism
+ * to destroy a string handle. Any string handle can be accessed by any thread
+ * in the process, regardless of which thread created the string handle. This
+ * call is thread-safe.
* @param[in] name The input string
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_string_handle* ITTAPI __itt_string_handle_createA(const char *name);
-__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_string_handle *ITTAPI __itt_string_handle_createA(const char *name);
+__itt_string_handle *ITTAPI __itt_string_handle_createW(const wchar_t *name);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_string_handle_create __itt_string_handle_createW
-# define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr
+#define __itt_string_handle_create __itt_string_handle_createW
+#define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr
#else /* UNICODE */
-# define __itt_string_handle_create __itt_string_handle_createA
-# define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr
+#define __itt_string_handle_create __itt_string_handle_createA
+#define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_string_handle *ITTAPI __itt_string_handle_create(const char *name);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name))
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_createA,
+ (const char *name))
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_createW,
+ (const wchar_t *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_create,
+ (const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA)
#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA)
-#define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW)
+#define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW)
#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create)
+#define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create)
#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_string_handle_createA(name) (__itt_string_handle*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_string_handle_createA(name) (__itt_string_handle *)0
#define __itt_string_handle_createA_ptr 0
-#define __itt_string_handle_createW(name) (__itt_string_handle*)0
+#define __itt_string_handle_createW(name) (__itt_string_handle *)0
#define __itt_string_handle_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_string_handle_create(name) (__itt_string_handle*)0
+#define __itt_string_handle_create(name) (__itt_string_handle *)0
#define __itt_string_handle_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_string_handle_createA_ptr 0
#define __itt_string_handle_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_string_handle_create_ptr 0
+#define __itt_string_handle_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
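
String handles are interned (same name, same handle) and never destroyed, so
the cheap pattern is to create them once and reuse the pointer on hot paths.
Sketch:

#include "ittnotify.h"

static __itt_string_handle *sh_compute;

void names_init(void) {
  sh_compute = __itt_string_handle_create("compute"); /* illustrative name */
}
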
@@ -1946,9 +2058,9 @@ typedef unsigned long long __itt_timestamp;
/**
* @ingroup timestamps
* @brief Return timestamp corresponding to the current moment.
- * This returns the timestamp in the format that is the most relevant for the current
- * host or platform (RDTSC, QPC, and others). You can use the "<" operator to
- * compare __itt_timestamp values.
+ * This returns the timestamp in the format that is the most relevant for the
+ * current host or platform (RDTSC, QPC, and others). You can use the "<"
+ * operator to compare __itt_timestamp values.
*/
__itt_timestamp ITTAPI __itt_get_timestamp(void);
@@ -1956,13 +2068,13 @@ __itt_timestamp ITTAPI __itt_get_timestamp(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
-#define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp)
-#define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp)
+#define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_get_timestamp()
#define __itt_get_timestamp_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_get_timestamp_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
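
Timestamps are opaque, platform-specific ticks; the supported uses are
ordering comparisons and handing pairs back to the API. A sketch that
captures an interval now for the frame submission shown below (interval_t
and the callback are illustrative):

#include "ittnotify.h"

typedef struct { __itt_timestamp begin, end; } interval_t;

interval_t capture(void (*work)(void)) {
  interval_t iv;
  iv.begin = __itt_get_timestamp();
  work();
  iv.end = __itt_get_timestamp(); /* compare with "<", don't convert */
  return iv;
}
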
@@ -1983,11 +2095,14 @@ ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
* Successive calls to __itt_region_begin with the same ID are ignored
* until a call to __itt_region_end with the same ID
* @param[in] domain The domain for this region instance
- * @param[in] id The instance ID for this region instance. Must not be __itt_null
- * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
+ * @param[in] id The instance ID for this region instance. Must not be
+ * __itt_null
+ * @param[in] parentid The instance ID for the parent of this region instance,
+ * or __itt_null
* @param[in] name The name of this region
*/
-void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id,
+ __itt_id parentid, __itt_string_handle *name);
/**
* @ingroup regions
@@ -2003,21 +2118,24 @@ void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
-#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
-#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin)
-#define __itt_region_end(d,x) ITTNOTIFY_VOID_D1(region_end,d,x)
-#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_region_begin(d,x,y,z)
+ITT_STUBV(ITTAPI, void, region_begin,
+ (const __itt_domain *domain, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
+#define __itt_region_begin(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(region_begin, d, x, y, z)
+#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin)
+#define __itt_region_end(d, x) ITTNOTIFY_VOID_D1(region_end, d, x)
+#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_region_begin(d, x, y, z)
#define __itt_region_begin_ptr 0
-#define __itt_region_end(d,x)
-#define __itt_region_end_ptr 0
+#define __itt_region_end(d, x)
+#define __itt_region_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_region_begin_ptr 0
-#define __itt_region_end_ptr 0
+#define __itt_region_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} regions group */
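
A region instance is identified by its ID rather than by the calling thread,
so begin and end only need to agree on the id. A sketch tying together the
domain, string-handle, and ID helpers above (names illustrative):

#include "ittnotify.h"

void serve_request(const __itt_domain *d, void *conn) {
  __itt_string_handle *name = __itt_string_handle_create("request");
  __itt_id id = __itt_id_make(conn, 0);
  __itt_region_begin(d, id, __itt_null, name);
  /* ... handle the request ... */
  __itt_region_end(d, id);
}
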
@@ -2025,8 +2143,8 @@ ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
/**
* @defgroup frames Frames
* @ingroup public
- * Frames are similar to regions, but are intended to be easier to use and to implement.
- * In particular:
+ * Frames are similar to regions, but are intended to be easier to use and to
+ * implement. In particular:
* - Frames always represent periods of elapsed time
* - By default, frames have no nesting relationships
* @{
@@ -2067,32 +2185,37 @@ void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id);
* @param[in] end Timestamp of the end of the frame
*/
void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id,
- __itt_timestamp begin, __itt_timestamp end);
+ __itt_timestamp begin, __itt_timestamp end);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id))
-ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id))
-ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end))
-#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x)
-#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3)
-#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x)
-#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3)
-#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e)
-#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_frame_begin_v3(domain,id)
+ITT_STUBV(ITTAPI, void, frame_begin_v3,
+ (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_end_v3,
+ (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_submit_v3,
+ (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin,
+ __itt_timestamp end))
+#define __itt_frame_begin_v3(d, x) ITTNOTIFY_VOID_D1(frame_begin_v3, d, x)
+#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3)
+#define __itt_frame_end_v3(d, x) ITTNOTIFY_VOID_D1(frame_end_v3, d, x)
+#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3)
+#define __itt_frame_submit_v3(d, x, b, e) \
+ ITTNOTIFY_VOID_D3(frame_submit_v3, d, x, b, e)
+#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin_v3(domain, id)
#define __itt_frame_begin_v3_ptr 0
-#define __itt_frame_end_v3(domain,id)
-#define __itt_frame_end_v3_ptr 0
-#define __itt_frame_submit_v3(domain,id,begin,end)
-#define __itt_frame_submit_v3_ptr 0
+#define __itt_frame_end_v3(domain, id)
+#define __itt_frame_end_v3_ptr 0
+#define __itt_frame_submit_v3(domain, id, begin, end)
+#define __itt_frame_submit_v3_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_frame_begin_v3_ptr 0
-#define __itt_frame_end_v3_ptr 0
-#define __itt_frame_submit_v3_ptr 0
+#define __itt_frame_end_v3_ptr 0
+#define __itt_frame_submit_v3_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} frames group */
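
Two ways to mark a frame: bracket it live with begin/end, or compute the
boundaries first and submit them afterwards. A sketch passing a null
instance pointer, taken here to mean "no specific instance" for these v3
entry points:

#include <stddef.h>
#include "ittnotify.h"

void render(const __itt_domain *d) {
  __itt_frame_begin_v3(d, NULL);
  /* ... draw one frame ... */
  __itt_frame_end_v3(d, NULL);
}

void render_deferred(const __itt_domain *d) {
  __itt_timestamp b = __itt_get_timestamp();
  /* ... draw ... */
  __itt_timestamp e = __itt_get_timestamp();
  __itt_frame_submit_v3(d, NULL, b, e);
}
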
@@ -2109,23 +2232,28 @@ ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *
* @brief Denotes a task_group instance.
* Successive calls to __itt_task_group with the same ID are ignored.
* @param[in] domain The domain for this task_group instance
- * @param[in] id The instance ID for this task_group instance. Must not be __itt_null.
- * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null.
+ * @param[in] id The instance ID for this task_group instance. Must not be
+ * __itt_null.
+ * @param[in] parentid The instance ID for the parent of this task_group
+ * instance, or __itt_null.
* @param[in] name The name of this task_group
*/
-void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id,
+ __itt_id parentid, __itt_string_handle *name);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
-#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_group(d,x,y,z)
+ITT_STUBV(ITTAPI, void, task_group,
+ (const __itt_domain *domain, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name))
+#define __itt_task_group(d, x, y, z) ITTNOTIFY_VOID_D3(task_group, d, x, y, z)
+#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_group(d, x, y, z)
#define __itt_task_group_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_task_group_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
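
A sketch of the task-group call above, assuming __itt_id_make, __itt_id_create, and __itt_string_handle_create from earlier in this header; the address of a static token seeds the instance ID.

static int group_token; /* address used only as a unique id seed */

static void start_group(__itt_domain *d) {
  __itt_id gid = __itt_id_make(&group_token, 0);
  __itt_id_create(d, gid); /* begin the id's lifetime */
  __itt_task_group(d, gid, __itt_null,
                   __itt_string_handle_create("worker pool"));
}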
@@ -2156,10 +2284,12 @@ ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __
* @brief Begin a task instance.
* @param[in] domain The domain for this task
* @param[in] taskid The instance ID for this task instance, or __itt_null
- * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs,
+ * or __itt_null
* @param[in] name The name of this task
*/
-void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);
+void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid,
+ __itt_id parentid, __itt_string_handle *name);
/**
* @ingroup tasks
@@ -2169,7 +2299,8 @@ void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_
* @param[in] parentid The parent of this task (may be 0)
* @param[in] fn The pointer to the function you are tracing
*/
-void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);
+void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid,
+ __itt_id parentid, void *fn);
/**
* @ingroup tasks
@@ -2182,11 +2313,14 @@ void ITTAPI __itt_task_end(const __itt_domain *domain);
* @ingroup tasks
* @brief Begin an overlapped task instance.
* @param[in] domain The domain for this task.
- * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] taskid The identifier for this task instance, *cannot* be
+ * __itt_null.
* @param[in] parentid The parent of this task, or __itt_null.
* @param[in] name The name of this task.
*/
-void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+void ITTAPI __itt_task_begin_overlapped(const __itt_domain *domain,
+ __itt_id taskid, __itt_id parentid,
+ __itt_string_handle *name);
/**
* @ingroup tasks
@@ -2194,49 +2328,59 @@ void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id tas
* @param[in] domain The domain for this task
* @param[in] taskid Explicit ID of finished task
*/
-void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
-ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
-ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id taskid))
-#define __itt_task_begin(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
-#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
-#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
-#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
-#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end,d)
-#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
-#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
-#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
-#define __itt_task_end_overlapped(d,x) ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
-#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_begin(domain,id,parentid,name)
-#define __itt_task_begin_ptr 0
-#define __itt_task_begin_fn(domain,id,parentid,fn)
+void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain,
+ __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin,
+ (const __itt_domain *domain, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn,
+ (const __itt_domain *domain, __itt_id id, __itt_id parentid,
+ void *fn))
+ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
+ITT_STUBV(ITTAPI, void, task_begin_overlapped,
+ (const __itt_domain *domain, __itt_id taskid, __itt_id parentid,
+ __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped,
+ (const __itt_domain *domain, __itt_id taskid))
+#define __itt_task_begin(d, x, y, z) ITTNOTIFY_VOID_D3(task_begin, d, x, y, z)
+#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
+#define __itt_task_begin_fn(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(task_begin_fn, d, x, y, z)
+#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
+#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end, d)
+#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
+#define __itt_task_begin_overlapped(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(task_begin_overlapped, d, x, y, z)
+#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
+#define __itt_task_end_overlapped(d, x) \
+ ITTNOTIFY_VOID_D1(task_end_overlapped, d, x)
+#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin(domain, id, parentid, name)
+#define __itt_task_begin_ptr 0
+#define __itt_task_begin_fn(domain, id, parentid, fn)
#define __itt_task_begin_fn_ptr 0
#define __itt_task_end(domain)
-#define __itt_task_end_ptr 0
-#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
-#define __itt_task_begin_overlapped_ptr 0
-#define __itt_task_end_overlapped(domain,taskid)
-#define __itt_task_end_overlapped_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_begin_ptr 0
+#define __itt_task_end_ptr 0
+#define __itt_task_begin_overlapped(domain, taskid, parentid, name)
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped(domain, taskid)
+#define __itt_task_end_overlapped_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ptr 0
#define __itt_task_begin_fn_ptr 0
-#define __itt_task_end_ptr 0
+#define __itt_task_end_ptr 0
#define __itt_task_begin_overlapped_ptr 0
-#define __itt_task_end_overlapped_ptr 0
+#define __itt_task_end_overlapped_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} tasks group */
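
For example, a minimal bracket around a traced unit of work, using only declarations visible above and __itt_null for an anonymous instance:

static void process_item(__itt_domain *d, __itt_string_handle *name) {
  __itt_task_begin(d, __itt_null, __itt_null, name);
  /* ... the traced work ... */
  __itt_task_end(d); /* ends the innermost open task on this thread */
}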
-
/**
* @defgroup markers Markers
 * Markers represent a single discrete event in time. Markers have a scope,
@@ -2249,22 +2393,21 @@ ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __it
/**
* @brief Describes the scope of an event object in the trace.
*/
-typedef enum
-{
- __itt_scope_unknown = 0,
- __itt_scope_global,
- __itt_scope_track_group,
- __itt_scope_track,
- __itt_scope_task,
- __itt_scope_marker
+typedef enum {
+ __itt_scope_unknown = 0,
+ __itt_scope_global,
+ __itt_scope_track_group,
+ __itt_scope_track,
+ __itt_scope_task,
+ __itt_scope_marker
} __itt_scope;
/** @cond exclude_from_documentation */
-#define __itt_marker_scope_unknown __itt_scope_unknown
-#define __itt_marker_scope_global __itt_scope_global
-#define __itt_marker_scope_process __itt_scope_track_group
-#define __itt_marker_scope_thread __itt_scope_track
-#define __itt_marker_scope_task __itt_scope_task
+#define __itt_marker_scope_unknown __itt_scope_unknown
+#define __itt_marker_scope_global __itt_scope_global
+#define __itt_marker_scope_process __itt_scope_track_group
+#define __itt_marker_scope_thread __itt_scope_track
+#define __itt_marker_scope_task __itt_scope_task
/** @endcond */
/**
@@ -2275,19 +2418,22 @@ typedef enum
* @param[in] name The name for this marker
* @param[in] scope The scope for this marker
*/
-void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id,
+ __itt_string_handle *name, __itt_scope scope);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
-#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
-#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_marker(domain,id,name,scope)
+ITT_STUBV(ITTAPI, void, marker,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *name,
+ __itt_scope scope))
+#define __itt_marker(d, x, y, z) ITTNOTIFY_VOID_D3(marker, d, x, y, z)
+#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker(domain, id, name, scope)
#define __itt_marker_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_marker_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
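
A one-line sketch: dropping a process-wide marker at a phase boundary (the __itt_marker_scope_process alias above maps to __itt_scope_track_group):

static void mark_phase(__itt_domain *d, __itt_string_handle *name) {
  /* __itt_null: no instance id is needed for a fire-and-forget marker */
  __itt_marker(d, __itt_null, name, __itt_marker_scope_process);
}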
@@ -2312,40 +2458,48 @@ ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_
* @brief describes the type of metadata
*/
typedef enum {
- __itt_metadata_unknown = 0,
- __itt_metadata_u64, /**< Unsigned 64-bit integer */
- __itt_metadata_s64, /**< Signed 64-bit integer */
- __itt_metadata_u32, /**< Unsigned 32-bit integer */
- __itt_metadata_s32, /**< Signed 32-bit integer */
- __itt_metadata_u16, /**< Unsigned 16-bit integer */
- __itt_metadata_s16, /**< Signed 16-bit integer */
- __itt_metadata_float, /**< Signed 32-bit floating-point */
- __itt_metadata_double /**< SIgned 64-bit floating-point */
+ __itt_metadata_unknown = 0,
+ __itt_metadata_u64, /**< Unsigned 64-bit integer */
+ __itt_metadata_s64, /**< Signed 64-bit integer */
+ __itt_metadata_u32, /**< Unsigned 32-bit integer */
+ __itt_metadata_s32, /**< Signed 32-bit integer */
+ __itt_metadata_u16, /**< Unsigned 16-bit integer */
+ __itt_metadata_s16, /**< Signed 16-bit integer */
+ __itt_metadata_float, /**< Signed 32-bit floating-point */
+  __itt_metadata_double /**< Signed 64-bit floating-point */
} __itt_metadata_type;
/**
* @ingroup parameters
* @brief Add metadata to an instance of a named entity.
* @param[in] domain The domain controlling the call
- * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] id The identifier of the instance to which the metadata is to be
+ * added, or __itt_null to add to the current task
* @param[in] key The name of the metadata
* @param[in] type The type of the metadata
- * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] count The number of elements of the given type. If count == 0, no
+ * metadata will be added.
* @param[in] data The metadata itself
-*/
-void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+ */
+void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id,
+ __itt_string_handle *key,
+ __itt_metadata_type type, size_t count,
+ void *data);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
-#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b)
-#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_metadata_add(d,x,y,z,a,b)
+ITT_STUBV(ITTAPI, void, metadata_add,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(metadata_add, d, x, y, z, a, b)
+#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add(d, x, y, z, a, b)
#define __itt_metadata_add_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_metadata_add_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
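
For instance, attaching a small numeric array to the current task (id == __itt_null), where count gives the number of elements rather than bytes:

static void record_sizes(__itt_domain *d, __itt_string_handle *key) {
  unsigned long long sizes[3] = {64, 128, 256};
  __itt_metadata_add(d, __itt_null, key, __itt_metadata_u64, 3, sizes);
}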
@@ -2354,60 +2508,77 @@ ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id,
* @ingroup parameters
* @brief Add string metadata to an instance of a named entity.
* @param[in] domain The domain controlling the call
- * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] id The identifier of the instance to which the metadata is to be
+ * added, or __itt_null to add to the current task
* @param[in] key The name of the metadata
* @param[in] data The metadata itself
- * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
-*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
-void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length);
+ * @param[in] length The number of characters in the string, or -1 if the length
+ * is unknown but the string is null-terminated
+ */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id,
+ __itt_string_handle *key, const char *data,
+ size_t length);
+void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id,
+ __itt_string_handle *key,
+ const wchar_t *data, size_t length);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_metadata_str_add __itt_metadata_str_addW
-# define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
+#define __itt_metadata_str_add __itt_metadata_str_addW
+#define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
#else /* UNICODE */
-# define __itt_metadata_str_add __itt_metadata_str_addA
-# define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
+#define __itt_metadata_str_add __itt_metadata_str_addA
+#define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id,
+ __itt_string_handle *key, const char *data,
+ size_t length);
#endif
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
-ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_addW,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const wchar_t *data, size_t length))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const char *data, size_t length))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a)
-#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA)
-#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a)
-#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_addA, d, x, y, z, a)
+#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA)
+#define __itt_metadata_str_addW(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_addW, d, x, y, z, a)
+#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a)
-#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add)
+#define __itt_metadata_str_add(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_add, d, x, y, z, a)
+#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_addA(d,x,y,z,a)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d, x, y, z, a)
#define __itt_metadata_str_addA_ptr 0
-#define __itt_metadata_str_addW(d,x,y,z,a)
+#define __itt_metadata_str_addW(d, x, y, z, a)
#define __itt_metadata_str_addW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add(d,x,y,z,a)
+#define __itt_metadata_str_add(d, x, y, z, a)
#define __itt_metadata_str_add_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_metadata_str_addA_ptr 0
#define __itt_metadata_str_addW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_ptr 0
+#define __itt_metadata_str_add_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
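
A sketch of the non-Windows path (on Windows the A/W macros above select the variant); passing the actual length avoids the -1 null-terminated scan:

#include <string.h> /* strlen */

static void record_label(__itt_domain *d, __itt_string_handle *key,
                         const char *label) {
  __itt_metadata_str_add(d, __itt_null, key, label, strlen(label));
}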
@@ -2416,28 +2587,40 @@ ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id
* @ingroup parameters
* @brief Add metadata to an instance of a named entity.
* @param[in] domain The domain controlling the call
- * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] scope The scope of the instance to which the metadata is to be
+ * added
- * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] id The identifier of the instance to which the metadata is to be
+ * added, or __itt_null to add to the current task
* @param[in] key The name of the metadata
* @param[in] type The type of the metadata
- * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] count The number of elements of the given type. If count == 0, no
+ * metadata will be added.
* @param[in] data The metadata itself
*/
-void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain,
+ __itt_scope scope,
+ __itt_string_handle *key,
+ __itt_metadata_type type,
+ size_t count, void *data);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
-#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
-#define __itt_metadata_add_with_scope_ptr ITTNOTIFY_NAME(metadata_add_with_scope)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, __itt_metadata_type type, size_t count,
+ void *data))
+#define __itt_metadata_add_with_scope(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(metadata_add_with_scope, d, x, y, z, a, b)
+#define __itt_metadata_add_with_scope_ptr \
+ ITTNOTIFY_NAME(metadata_add_with_scope)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add_with_scope(d, x, y, z, a, b)
#define __itt_metadata_add_with_scope_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_metadata_add_with_scope_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -2446,63 +2629,90 @@ ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __
* @ingroup parameters
* @brief Add string metadata to an instance of a named entity.
* @param[in] domain The domain controlling the call
- * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] scope The scope of the instance to which the metadata is to be
+ * added
- * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] id The identifier of the instance to which the metadata is to be
+ * added, or __itt_null to add to the current task
* @param[in] key The name of the metadata
* @param[in] data The metadata itself
- * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
+ * @param[in] length The number of characters in the string, or -1 if the length
+ * is unknown but the string is null-terminated
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
-void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain,
+ __itt_scope scope,
+ __itt_string_handle *key,
+ const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain,
+ __itt_scope scope,
+ __itt_string_handle *key,
+ const wchar_t *data,
+ size_t length);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW
-# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
+#define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW
+#define __itt_metadata_str_add_with_scope_ptr \
+ __itt_metadata_str_add_with_scopeW_ptr
#else /* UNICODE */
-# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA
-# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
+#define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA
+#define __itt_metadata_str_add_with_scope_ptr \
+ __itt_metadata_str_add_with_scopeA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain,
+ __itt_scope scope,
+ __itt_string_handle *key,
+ const char *data, size_t length);
#endif
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const wchar_t *data, size_t length))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const char *data, size_t length))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeA_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
-#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeW_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA, d, x, y, z, a)
+#define __itt_metadata_str_add_with_scopeA_ptr \
+ ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
+#define __itt_metadata_str_add_with_scopeW(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW, d, x, y, z, a)
+#define __itt_metadata_str_add_with_scopeW_ptr \
+ ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_with_scope(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scope_ptr ITTNOTIFY_NAME(metadata_str_add_with_scope)
+#define __itt_metadata_str_add_with_scope(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(metadata_str_add_with_scope, d, x, y, z, a)
+#define __itt_metadata_str_add_with_scope_ptr \
+ ITTNOTIFY_NAME(metadata_str_add_with_scope)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeA_ptr 0
-#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeW_ptr 0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d, x, y, z, a)
+#define __itt_metadata_str_add_with_scopeA_ptr 0
+#define __itt_metadata_str_add_with_scopeW(d, x, y, z, a)
+#define __itt_metadata_str_add_with_scopeW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scope_ptr 0
+#define __itt_metadata_str_add_with_scope(d, x, y, z, a)
+#define __itt_metadata_str_add_with_scope_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA_ptr 0
-#define __itt_metadata_str_add_with_scopeW_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA_ptr 0
+#define __itt_metadata_str_add_with_scopeW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_with_scope_ptr 0
+#define __itt_metadata_str_add_with_scope_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
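
The scope-based variants avoid the need for an instance id; a sketch tagging whatever task is current:

static void tag_current_task(__itt_domain *d, __itt_string_handle *key) {
  double ratio = 0.75;
  __itt_metadata_add_with_scope(d, __itt_scope_task, key,
                                __itt_metadata_double, 1, &ratio);
}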
@@ -2519,23 +2729,30 @@ ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain
/**
* @ingroup relations
- * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation.
- * Relations between instances can be added with an API call. The relation
- * API uses instance IDs. Relations can be added before or after the actual
- * instances are created and persist independently of the instances. This
- * is the motivation for having different lifetimes for instance IDs and
- * the actual instances.
- */
-typedef enum
-{
- __itt_relation_is_unknown = 0,
- __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */
- __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */
- __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */
- __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */
- __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
- __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
- __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
+ * @brief The kind of relation between two instances is specified by the
+ * enumerated type __itt_relation. Relations between instances can be added with
+ * an API call. The relation API uses instance IDs. Relations can be added
+ * before or after the actual instances are created and persist independently of
+ * the instances. This is the motivation for having different lifetimes for
+ * instance IDs and the actual instances.
+ */
+typedef enum {
+ __itt_relation_is_unknown = 0,
+ __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot
+ start until B completes */
+ __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were
+ created as a group */
+ __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */
+ __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A
+ assumes the dependencies of B */
+ __itt_relation_is_child_of, /**< "A is child of B" means that A was created by
+ B (inverse of is_parent_of) */
+ __itt_relation_is_continued_by, /**< "A is continued by B" means that B
+ assumes the dependencies of A (inverse of
+ is_continuation_of) */
+ __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B
+ cannot start until A completes (inverse of
+ is_dependent_on) */
} __itt_relation;
/**
@@ -2546,7 +2763,9 @@ typedef enum
* @param[in] relation The kind of relation
* @param[in] tail The ID for the tail of the relation
*/
-void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail);
+void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain,
+ __itt_relation relation,
+ __itt_id tail);
/**
* @ingroup relations
@@ -2556,24 +2775,31 @@ void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_rela
* @param[in] relation The kind of relation
* @param[in] tail The ID for the tail of the relation
*/
-void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail);
+void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head,
+ __itt_relation relation, __itt_id tail);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
-ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail))
-#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y)
-#define __itt_relation_add_to_current_ptr ITTNOTIFY_NAME(relation_add_to_current)
-#define __itt_relation_add(d,x,y,z) ITTNOTIFY_VOID_D3(relation_add,d,x,y,z)
-#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_relation_add_to_current(d,x,y)
+ITT_STUBV(ITTAPI, void, relation_add_to_current,
+ (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add,
+ (const __itt_domain *domain, __itt_id head, __itt_relation relation,
+ __itt_id tail))
+#define __itt_relation_add_to_current(d, x, y) \
+ ITTNOTIFY_VOID_D2(relation_add_to_current, d, x, y)
+#define __itt_relation_add_to_current_ptr \
+ ITTNOTIFY_NAME(relation_add_to_current)
+#define __itt_relation_add(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(relation_add, d, x, y, z)
+#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current(d, x, y)
#define __itt_relation_add_to_current_ptr 0
-#define __itt_relation_add(d,x,y,z)
+#define __itt_relation_add(d, x, y, z)
#define __itt_relation_add_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_relation_add_to_current_ptr 0
#define __itt_relation_add_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
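
A sketch recording the dependency described by the enum above; both ids are assumed to have live lifetimes established via __itt_id_create:

/* Record "b cannot start until a completes". */
static void link_tasks(__itt_domain *d, __itt_id a, __itt_id b) {
  __itt_relation_add(d, b, __itt_relation_is_dependent_on, a);
  /* inverse form:
     __itt_relation_add(d, a, __itt_relation_is_predecessor_to, b); */
}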
@@ -2583,30 +2809,29 @@ ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_clock_info
-{
- unsigned long long clock_freq; /*!< Clock domain frequency */
- unsigned long long clock_base; /*!< Clock domain base timestamp */
+typedef struct ___itt_clock_info {
+ unsigned long long clock_freq; /*!< Clock domain frequency */
+ unsigned long long clock_base; /*!< Clock domain base timestamp */
} __itt_clock_info;
#pragma pack(pop)
/** @endcond */
/** @cond exclude_from_documentation */
-typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data);
+typedef void(ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info *clock_info,
+ void *data);
/** @endcond */
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_clock_domain
-{
- __itt_clock_info info; /*!< Most recent clock domain info */
- __itt_get_clock_info_fn fn; /*!< Callback function pointer */
- void* fn_data; /*!< Input argument for the callback function */
- int extra1; /*!< Reserved. Must be zero */
- void* extra2; /*!< Reserved. Must be zero */
- struct ___itt_clock_domain* next;
+typedef struct ___itt_clock_domain {
+ __itt_clock_info info; /*!< Most recent clock domain info */
+ __itt_get_clock_info_fn fn; /*!< Callback function pointer */
+ void *fn_data; /*!< Input argument for the callback function */
+ int extra1; /*!< Reserved. Must be zero */
+ void *extra2; /*!< Reserved. Must be zero */
+ struct ___itt_clock_domain *next;
} __itt_clock_domain;
#pragma pack(pop)
@@ -2616,28 +2841,30 @@ typedef struct ___itt_clock_domain
* @ingroup clockdomains
* @brief Create a clock domain.
* Certain applications require the capability to trace their application using
- * a clock domain different than the CPU, for instance the instrumentation of events
- * that occur on a GPU.
- * Because the set of domains is expected to be static over the application's execution time,
- * there is no mechanism to destroy a domain.
- * Any domain can be accessed by any thread in the process, regardless of which thread created
- * the domain. This call is thread-safe.
- * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
+ * a clock domain different than the CPU, for instance the instrumentation of
+ * events that occur on a GPU. Because the set of domains is expected to be
+ * static over the application's execution time, there is no mechanism to
+ * destroy a domain. Any domain can be accessed by any thread in the process,
+ * regardless of which thread created the domain. This call is thread-safe.
+ * @param[in] fn A pointer to a callback function which retrieves alternative
+ * CPU timestamps
* @param[in] fn_data Argument for a callback function; may be NULL
*/
-__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
+__itt_clock_domain *ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn,
+ void *fn_data);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
-#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
+ITT_STUB(ITTAPI, __itt_clock_domain *, clock_domain_create,
+ (__itt_get_clock_info_fn fn, void *fn_data))
+#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_create(fn, fn_data) (__itt_clock_domain *)0
#define __itt_clock_domain_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_clock_domain_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -2652,60 +2879,74 @@ void ITTAPI __itt_clock_domain_reset(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
-#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
+#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_clock_domain_reset()
#define __itt_clock_domain_reset_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_clock_domain_reset_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
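
A sketch of a clock domain backed by a device timer. gpu_read_ticks and GPU_TICKS_PER_SEC are hypothetical stand-ins for a real GPU query; the callback signature matches __itt_get_clock_info_fn above.

/* Hypothetical device queries, placeholders for a real GPU API. */
#define GPU_TICKS_PER_SEC 1000000000ULL
static unsigned long long gpu_read_ticks(void) { return 0; /* placeholder */ }

static void ITTAPI gpu_clock_info(__itt_clock_info *info, void *data) {
  (void)data;                           /* fn_data unused in this sketch */
  info->clock_freq = GPU_TICKS_PER_SEC; /* ticks per second */
  info->clock_base = gpu_read_ticks();  /* "now" on the device clock */
}

static __itt_clock_domain *make_gpu_domain(void) {
  return __itt_clock_domain_create(gpu_clock_info, NULL);
}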
/**
* @ingroup clockdomain
- * @brief Create an instance of identifier. This establishes the beginning of the lifetime of
- * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
- * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
- * identified named entity instances, using the \ref relations APIs.
+ * @brief Create an instance of an identifier. This establishes the beginning
+ * the lifetime of an instance of the given ID in the trace. Once this lifetime
+ * starts, the ID can be used to tag named entity instances in calls such as
+ * __itt_task_begin, and to specify relationships among identified named entity
+ * instances, using the \ref relations APIs.
* @param[in] domain The domain controlling the execution of this call.
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] id The ID to create.
*/
-void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+void ITTAPI __itt_id_create_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id);
/**
* @ingroup clockdomain
- * @brief Destroy an instance of identifier. This ends the lifetime of the current instance of the
- * given ID value in the trace. Any relationships that are established after this lifetime ends are
- * invalid. This call must be performed before the given ID value can be reused for a different
- * named entity instance.
+ * @brief Destroy an instance of an identifier. This ends the lifetime of the
+ * current instance of the given ID value in the trace. Any relationships that
+ * are established after this lifetime ends are invalid. This call must be
+ * performed before the given ID value can be reused for a different named
+ * entity instance.
* @param[in] domain The domain controlling the execution of this call.
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] id The ID to destroy.
*/
-void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+void ITTAPI __itt_id_destroy_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
-ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
-#define __itt_id_create_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z)
-#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex)
-#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z)
-#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_id_create_ex(domain,clock_domain,timestamp,id)
-#define __itt_id_create_ex_ptr 0
-#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id)
+ITT_STUBV(ITTAPI, void, id_create_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id))
+ITT_STUBV(ITTAPI, void, id_destroy_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id))
+#define __itt_id_create_ex(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(id_create_ex, d, x, y, z)
+#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex)
+#define __itt_id_destroy_ex(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(id_destroy_ex, d, x, y, z)
+#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create_ex(domain, clock_domain, timestamp, id)
+#define __itt_id_create_ex_ptr 0
+#define __itt_id_destroy_ex(domain, clock_domain, timestamp, id)
#define __itt_id_destroy_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_id_create_ex_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ex_ptr 0
#define __itt_id_destroy_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
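
A sketch pairing the two calls, with __itt_id_make (declared earlier in this header) seeding the id from an object address; `now` is assumed to be a timestamp on the given clock domain:

static void track_object(__itt_domain *d, __itt_clock_domain *cd,
                         unsigned long long now, void *obj) {
  __itt_id oid = __itt_id_make(obj, 0);
  __itt_id_create_ex(d, cd, now, oid);
  /* ... oid may now tag tasks or appear in relations ... */
  __itt_id_destroy_ex(d, cd, now + 1, oid); /* must precede id reuse */
}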
@@ -2714,59 +2955,83 @@ ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_
* @ingroup clockdomain
* @brief Begin a task instance.
* @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] taskid The instance ID for this task instance, or __itt_null
- * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs,
+ * or __itt_null
* @param[in] name The name of this task
*/
-void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+void ITTAPI __itt_task_begin_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id taskid,
+ __itt_id parentid, __itt_string_handle *name);
/**
* @ingroup clockdomain
* @brief Begin a task instance.
* @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] taskid The identifier for this task instance, or __itt_null
* @param[in] parentid The parent of this task, or __itt_null
* @param[in] fn The pointer to the function you are tracing
*/
-void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn);
+void ITTAPI __itt_task_begin_fn_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp,
+ __itt_id taskid, __itt_id parentid,
+ void *fn);
/**
* @ingroup clockdomain
* @brief End the current task instance.
* @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
*/
-void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp);
+void ITTAPI __itt_task_end_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn))
-ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp))
-#define __itt_task_begin_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b)
-#define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex)
-#define __itt_task_begin_fn_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b)
-#define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex)
-#define __itt_task_end_ex(d,x,y) ITTNOTIFY_VOID_D2(task_end_ex,d,x,y)
-#define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name)
-#define __itt_task_begin_ex_ptr 0
-#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn)
-#define __itt_task_begin_fn_ex_ptr 0
-#define __itt_task_end_ex(domain,clock_domain,timestamp)
-#define __itt_task_end_ex_ptr 0
+ITT_STUBV(ITTAPI, void, task_begin_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_id parentid,
+ void *fn))
+ITT_STUBV(ITTAPI, void, task_end_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp))
+#define __itt_task_begin_ex(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(task_begin_ex, d, x, y, z, a, b)
+#define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex)
+#define __itt_task_begin_fn_ex(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(task_begin_fn_ex, d, x, y, z, a, b)
+#define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex)
+#define __itt_task_end_ex(d, x, y) ITTNOTIFY_VOID_D2(task_end_ex, d, x, y)
+#define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_ex(domain, clock_domain, timestamp, id, parentid, name)
+#define __itt_task_begin_ex_ptr 0
+#define __itt_task_begin_fn_ex(domain, clock_domain, timestamp, id, parentid, \
+ fn)
+#define __itt_task_begin_fn_ex_ptr 0
+#define __itt_task_end_ex(domain, clock_domain, timestamp)
+#define __itt_task_end_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_begin_ex_ptr 0
-#define __itt_task_begin_fn_ex_ptr 0
-#define __itt_task_end_ex_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ex_ptr 0
+#define __itt_task_begin_fn_ex_ptr 0
+#define __itt_task_end_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
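
For example, replaying a device-side span into the trace after the fact, with t0/t1 being timestamps captured around a (hypothetical) kernel launch:

static void replay_kernel(__itt_domain *d, __itt_clock_domain *cd,
                          unsigned long long t0, unsigned long long t1,
                          __itt_string_handle *name) {
  __itt_task_begin_ex(d, cd, t0, __itt_null, __itt_null, name);
  __itt_task_end_ex(d, cd, t1);
}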
@@ -2785,26 +3050,31 @@ ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt
*/
/** @cond exclude_from_documentation */
-typedef struct ___itt_counter* __itt_counter;
+typedef struct ___itt_counter *__itt_counter;
/**
 * @brief Create an unsigned 64-bit integer counter with given name/domain
*
- * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
- * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
- * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64 bits integer
+ * After __itt_counter_create() is called, __itt_counter_inc(id),
+ * __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id,
+ * clock_domain, timestamp, value_ptr) can be used to change the value of the
+ * counter, where value_ptr is a pointer to an unsigned 64-bit integer
*
- * The call is equal to __itt_counter_create_typed(name, domain, __itt_metadata_u64)
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_counter ITTAPI __itt_counter_createA(const char *name, const char *domain);
-__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain);
+ * The call is equivalent to __itt_counter_create_typed(name, domain,
+ * __itt_metadata_u64)
+ */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_createA(const char *name,
+ const char *domain);
+__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name,
+ const wchar_t *domain);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_counter_create __itt_counter_createW
-# define __itt_counter_create_ptr __itt_counter_createW_ptr
+#define __itt_counter_create __itt_counter_createW
+#define __itt_counter_create_ptr __itt_counter_createW_ptr
#else /* UNICODE */
-# define __itt_counter_create __itt_counter_createA
-# define __itt_counter_create_ptr __itt_counter_createA_ptr
+#define __itt_counter_create __itt_counter_createA
+#define __itt_counter_create_ptr __itt_counter_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
@@ -2812,38 +3082,41 @@ __itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain))
-ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA,
+ (const char *name, const char *domain))
+ITT_STUB(ITTAPI, __itt_counter, counter_createW,
+ (const wchar_t *name, const wchar_t *domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,
+ (const char *name, const char *domain))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_counter_createA ITTNOTIFY_DATA(counter_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_counter_createA ITTNOTIFY_DATA(counter_createA)
#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA)
-#define __itt_counter_createW ITTNOTIFY_DATA(counter_createW)
+#define __itt_counter_createW ITTNOTIFY_DATA(counter_createW)
#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create ITTNOTIFY_DATA(counter_create)
+#define __itt_counter_create ITTNOTIFY_DATA(counter_create)
#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_counter_createA(name, domain)
#define __itt_counter_createA_ptr 0
#define __itt_counter_createW(name, domain)
#define __itt_counter_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_counter_create(name, domain)
-#define __itt_counter_create_ptr 0
+#define __itt_counter_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_counter_createA_ptr 0
#define __itt_counter_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create_ptr 0
+#define __itt_counter_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
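
A sketch of the non-Windows path (on Windows the UNICODE macros above pick the A/W variant); the counter name and domain strings here are illustrative:

static __itt_counter g_inflight; /* unsigned 64-bit counter handle */

static void init_counters(void) {
  g_inflight = __itt_counter_create("inflight-requests", "Example.Server");
}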
@@ -2858,13 +3131,13 @@ void ITTAPI __itt_counter_inc(__itt_counter id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
-#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc)
+#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc)
#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_inc(id)
#define __itt_counter_inc_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_inc_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -2877,14 +3150,15 @@ void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value))
-#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta)
+ITT_STUBV(ITTAPI, void, counter_inc_delta,
+ (__itt_counter id, unsigned long long value))
+#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta)
#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_inc_delta(id, value)
#define __itt_counter_inc_delta_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_inc_delta_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -2899,13 +3173,13 @@ void ITTAPI __itt_counter_dec(__itt_counter id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id))
-#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec)
+#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec)
#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_dec(id)
#define __itt_counter_dec_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_dec_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -2918,14 +3192,15 @@ void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value))
-#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta)
+ITT_STUBV(ITTAPI, void, counter_dec_delta,
+ (__itt_counter id, unsigned long long value))
+#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta)
#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_dec_delta(id, value)
#define __itt_counter_dec_delta_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_dec_delta_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
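
Taken together, the four entry points above are the whole plain-counter API: create a handle once, then adjust it from any thread. A minimal usage sketch, assuming a non-UNICODE build (so __itt_counter_create takes narrow strings) and a collector attached at run time; the counter and domain names are made up:

#include "ittnotify.h"

/* Track the depth of a work queue (illustrative names). */
static __itt_counter queue_depth;

void queue_init(void) {
  queue_depth = __itt_counter_create("queue.depth", "MyApp");
}
void queue_push(void) { __itt_counter_inc(queue_depth); }
void queue_pop(void) { __itt_counter_dec(queue_depth); }
void queue_push_batch(unsigned long long n) {
  __itt_counter_inc_delta(queue_depth, n);
}
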
@@ -2935,87 +3210,103 @@ ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long
* @brief Increment a counter by one.
* The first call with a given name creates a counter by that name and sets its
* value to zero. Successive calls increment the counter value.
- * @param[in] domain The domain controlling the call. Counter names are not domain specific.
- * The domain argument is used only to enable or disable the API calls.
+ * @param[in] domain The domain controlling the call. Counter names are not
+ * domain specific. The domain argument is used only to enable or disable the
+ * API calls.
* @param[in] name The name of the counter
*/
-void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name);
+void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain,
+ __itt_string_handle *name);
/**
* @ingroup counters
* @brief Increment a counter by the value specified in delta.
- * @param[in] domain The domain controlling the call. Counter names are not domain specific.
- * The domain argument is used only to enable or disable the API calls.
+ * @param[in] domain The domain controlling the call. Counter names are not
+ * domain specific. The domain argument is used only to enable or disable the
+ * API calls.
* @param[in] name The name of the counter
* @param[in] delta The amount by which to increment the counter
*/
-void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain,
+ __itt_string_handle *name,
+ unsigned long long delta);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
-#define __itt_counter_inc_v3(d,x) ITTNOTIFY_VOID_D1(counter_inc_v3,d,x)
-#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3)
-#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y)
-#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_counter_inc_v3(domain,name)
-#define __itt_counter_inc_v3_ptr 0
-#define __itt_counter_inc_delta_v3(domain,name,delta)
+ITT_STUBV(ITTAPI, void, counter_inc_v3,
+ (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3,
+ (const __itt_domain *domain, __itt_string_handle *name,
+ unsigned long long delta))
+#define __itt_counter_inc_v3(d, x) ITTNOTIFY_VOID_D1(counter_inc_v3, d, x)
+#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3)
+#define __itt_counter_inc_delta_v3(d, x, y) \
+ ITTNOTIFY_VOID_D2(counter_inc_delta_v3, d, x, y)
+#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc_v3(domain, name)
+#define __itt_counter_inc_v3_ptr 0
+#define __itt_counter_inc_delta_v3(domain, name, delta)
#define __itt_counter_inc_delta_v3_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_counter_inc_v3_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_v3_ptr 0
#define __itt_counter_inc_delta_v3_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
-
/**
* @ingroup counters
* @brief Decrement a counter by one.
* The first call with a given name creates a counter by that name and sets its
* value to zero. Successive calls decrement the counter value.
- * @param[in] domain The domain controlling the call. Counter names are not domain specific.
- * The domain argument is used only to enable or disable the API calls.
+ * @param[in] domain The domain controlling the call. Counter names are not
+ * domain specific. The domain argument is used only to enable or disable the
+ * API calls.
* @param[in] name The name of the counter
*/
-void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name);
+void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain,
+ __itt_string_handle *name);
/**
* @ingroup counters
* @brief Decrement a counter by the value specified in delta.
- * @param[in] domain The domain controlling the call. Counter names are not domain specific.
- * The domain argument is used only to enable or disable the API calls.
+ * @param[in] domain The domain controlling the call. Counter names are not
+ * domain specific. The domain argument is used only to enable or disable the
+ * API calls.
* @param[in] name The name of the counter
* @param[in] delta The amount by which to decrement the counter
*/
-void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain,
+ __itt_string_handle *name,
+ unsigned long long delta);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
-#define __itt_counter_dec_v3(d,x) ITTNOTIFY_VOID_D1(counter_dec_v3,d,x)
-#define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3)
-#define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y)
-#define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_counter_dec_v3(domain,name)
-#define __itt_counter_dec_v3_ptr 0
-#define __itt_counter_dec_delta_v3(domain,name,delta)
+ITT_STUBV(ITTAPI, void, counter_dec_v3,
+ (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3,
+ (const __itt_domain *domain, __itt_string_handle *name,
+ unsigned long long delta))
+#define __itt_counter_dec_v3(d, x) ITTNOTIFY_VOID_D1(counter_dec_v3, d, x)
+#define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3)
+#define __itt_counter_dec_delta_v3(d, x, y) \
+ ITTNOTIFY_VOID_D2(counter_dec_delta_v3, d, x, y)
+#define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_dec_v3(domain, name)
+#define __itt_counter_dec_v3_ptr 0
+#define __itt_counter_dec_delta_v3(domain, name, delta)
#define __itt_counter_dec_delta_v3_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_counter_dec_v3_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_dec_v3_ptr 0
#define __itt_counter_dec_delta_v3_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} counters group */
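
The _v3 variants swap the opaque handle for a domain plus string handle, so enabling and disabling follows the domain. A sketch assuming __itt_domain_create and __itt_string_handle_create as declared earlier in this header; all names are illustrative:

#include "ittnotify.h"

static const __itt_domain *io_domain;
static __itt_string_handle *in_flight;

void io_init(void) {
  io_domain = __itt_domain_create("MyApp.IO");
  in_flight = __itt_string_handle_create("bytes_in_flight");
}
void on_send(unsigned long long n) {
  __itt_counter_inc_delta_v3(io_domain, in_flight, n); /* count up */
}
void on_ack(unsigned long long n) {
  __itt_counter_dec_delta_v3(io_domain, in_flight, n); /* count down */
}
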
-
/**
* @brief Set the counter value
*/
@@ -3024,13 +3315,13 @@ void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr))
-#define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value)
+#define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value)
#define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_set_value(id, value_ptr)
#define __itt_counter_set_value_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_set_value_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
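
__itt_counter_set_value writes an absolute value through a type-erased pointer, so the pointee must match the counter's data type; since the delta entry points above take unsigned long long, an unsigned 64-bit pointee is assumed in this sketch (names illustrative):

#include "ittnotify.h"

static __itt_counter heap_bytes;

void heap_counter_init(void) {
  heap_bytes = __itt_counter_create("heap.bytes", "MyApp");
}
void heap_counter_sample(unsigned long long current) {
  /* Absolute update: replaces the counter value instead of adjusting it. */
  __itt_counter_set_value(heap_bytes, &current);
}
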
@@ -3038,19 +3329,24 @@ ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr))
/**
* @brief Set the counter value
*/
-void ITTAPI __itt_counter_set_value_ex(__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr);
+void ITTAPI __itt_counter_set_value_ex(__itt_counter id,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp,
+ void *value_ptr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr))
-#define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex)
+ITT_STUBV(ITTAPI, void, counter_set_value_ex,
+ (__itt_counter id, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, void *value_ptr))
+#define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex)
#define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
#define __itt_counter_set_value_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_set_value_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
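
The _ex form additionally stamps the update with a caller-supplied tick in a user clock domain; __itt_clock_domain_create and __itt_clock_info are declared elsewhere in this header and are used here under that assumption, with an invented microsecond clock:

#include <stddef.h>
#include "ittnotify.h"

/* Describe the virtual clock: tick frequency plus the base tick value. */
static void usec_clock(__itt_clock_info *info, void *data) {
  (void)data;
  info->clock_freq = 1000000ULL; /* one tick per microsecond */
  info->clock_base = 0;
}

static __itt_clock_domain *usec_domain;

void sample_at(__itt_counter id, unsigned long long now_us,
               unsigned long long value) {
  if (!usec_domain) /* create once; not thread-safe as written */
    usec_domain = __itt_clock_domain_create(usec_clock, NULL);
  __itt_counter_set_value_ex(id, usec_domain, now_us, &value);
}
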
@@ -3058,64 +3354,76 @@ ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_dom
/**
* @brief Create a typed counter with given name/domain
*
- * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
- * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
- * can be used to change the value of the counter
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_counter ITTAPI __itt_counter_create_typedA(const char *name, const char *domain, __itt_metadata_type type);
-__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type);
+ * After __itt_counter_create_typed() is called, __itt_counter_inc(id),
+ * __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id,
+ * clock_domain, timestamp, value_ptr) can be used to change the value of the
+ * counter.
+ */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_create_typedA(const char *name,
+ const char *domain,
+ __itt_metadata_type type);
+__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name,
+ const wchar_t *domain,
+ __itt_metadata_type type);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_counter_create_typed __itt_counter_create_typedW
-# define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr
+#define __itt_counter_create_typed __itt_counter_create_typedW
+#define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr
#else /* UNICODE */
-# define __itt_counter_create_typed __itt_counter_create_typedA
-# define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr
+#define __itt_counter_create_typed __itt_counter_create_typedA
+#define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type);
+__itt_counter ITTAPI __itt_counter_create_typed(const char *name,
+ const char *domain,
+ __itt_metadata_type type);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type))
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA,
+ (const char *name, const char *domain, __itt_metadata_type type))
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW,
+ (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed,
+ (const char *name, const char *domain, __itt_metadata_type type))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA)
#define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA)
-#define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW)
+#define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW)
#define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed)
+#define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed)
#define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_counter_create_typedA(name, domain, type)
#define __itt_counter_create_typedA_ptr 0
#define __itt_counter_create_typedW(name, domain, type)
#define __itt_counter_create_typedW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_counter_create_typed(name, domain, type)
-#define __itt_counter_create_typed_ptr 0
+#define __itt_counter_create_typed_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_counter_create_typedA_ptr 0
#define __itt_counter_create_typedW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create_typed_ptr 0
+#define __itt_counter_create_typed_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or
+ * @brief Destroy the counter identified by the pointer previously returned by
+ * __itt_counter_create() or
* __itt_counter_create_typed()
*/
void ITTAPI __itt_counter_destroy(__itt_counter id);
@@ -3123,13 +3431,13 @@ void ITTAPI __itt_counter_destroy(__itt_counter id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
-#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy)
+#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy)
#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_destroy(id)
#define __itt_counter_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
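
A complete typed-counter lifecycle: the __itt_metadata_type chosen at creation fixes what value_ptr must point at in later set_value calls, and destroy releases the handle. read_sensor is a hypothetical stand-in for a real data source:

#include "ittnotify.h"

extern double read_sensor(void); /* hypothetical */

void profile_temperature(int samples) {
  __itt_counter c =
      __itt_counter_create_typed("cpu.temp", "MyApp", __itt_metadata_double);
  for (int i = 0; i < samples; ++i) {
    double celsius = read_sensor();
    __itt_counter_set_value(c, &celsius); /* pointee matches creation type */
  }
  __itt_counter_destroy(c);
}
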
@@ -3139,26 +3447,34 @@ ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
* @ingroup markers
* @brief Create a marker instance.
* @param[in] domain The domain for this marker
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] id The instance ID for this marker, or __itt_null
* @param[in] name The name for this marker
* @param[in] scope The scope for this marker
*/
-void ITTAPI __itt_marker_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+void ITTAPI __itt_marker_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id,
+ __itt_string_handle *name, __itt_scope scope);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope))
-#define __itt_marker_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b)
-#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope)
-#define __itt_marker_ex_ptr 0
+ITT_STUBV(ITTAPI, void, marker_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_string_handle *name,
+ __itt_scope scope))
+#define __itt_marker_ex(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(marker_ex, d, x, y, z, a, b)
+#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker_ex(domain, clock_domain, timestamp, id, name, scope)
+#define __itt_marker_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_marker_ex_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
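
A marker_ex sketch; clock-domain creation is shown inline for brevity, though real code would create it once. The domain, handle name, and timestamp are illustrative, and __itt_scope_task is one of the scope values defined earlier in the header:

#include <stddef.h>
#include "ittnotify.h"

static void ns_clock(__itt_clock_info *info, void *data) {
  (void)data;
  info->clock_freq = 1000000000ULL; /* nanosecond ticks */
  info->clock_base = 0;
}

void emit_checkpoint(unsigned long long now_ns) {
  const __itt_domain *d = __itt_domain_create("MyApp");
  __itt_clock_domain *cd = __itt_clock_domain_create(ns_clock, NULL);
  __itt_string_handle *name = __itt_string_handle_create("checkpoint");
  /* __itt_null: no instance ID; the scope limits it to the current task. */
  __itt_marker_ex(d, cd, now_ns, __itt_null, name, __itt_scope_task);
}
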
@@ -3167,64 +3483,82 @@ ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_
* @brief Add a relation to the current task instance.
* The current task instance is the head of the relation.
* @param[in] domain The domain controlling this call
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] relation The kind of relation
* @param[in] tail The ID for the tail of the relation
*/
-void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail);
+void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp,
+ __itt_relation relation,
+ __itt_id tail);
/**
* @ingroup clockdomain
* @brief Add a relation between two instance identifiers.
* @param[in] domain The domain controlling this call
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] head The ID for the head of the relation
* @param[in] relation The kind of relation
* @param[in] tail The ID for the tail of the relation
*/
-void ITTAPI __itt_relation_add_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail);
+void ITTAPI __itt_relation_add_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id head,
+ __itt_relation relation, __itt_id tail);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail))
-ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail))
-#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a)
-#define __itt_relation_add_to_current_ex_ptr ITTNOTIFY_NAME(relation_add_to_current_ex)
-#define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b)
-#define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail)
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_relation relation,
+ __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id head, __itt_relation relation,
+ __itt_id tail))
+#define __itt_relation_add_to_current_ex(d, x, y, z, a) \
+ ITTNOTIFY_VOID_D4(relation_add_to_current_ex, d, x, y, z, a)
+#define __itt_relation_add_to_current_ex_ptr \
+ ITTNOTIFY_NAME(relation_add_to_current_ex)
+#define __itt_relation_add_ex(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(relation_add_ex, d, x, y, z, a, b)
+#define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current_ex(domain, clock_domain, timestamp, \
+ relation, tail)
#define __itt_relation_add_to_current_ex_ptr 0
-#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail)
+#define __itt_relation_add_ex(domain, clock_domain, timestamp, head, relation, \
+ tail)
#define __itt_relation_add_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_relation_add_to_current_ex_ptr 0
#define __itt_relation_add_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
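
For the relation calls, head and tail are instance IDs already introduced by earlier task or marker calls. A sketch recording that one task instance depends on another, using a relation value from the __itt_relation enum defined earlier in the header:

#include "ittnotify.h"

/* Head depends on tail: the consumer must wait for the producer. */
void link_tasks(const __itt_domain *d, __itt_clock_domain *cd,
                unsigned long long now, __itt_id consumer, __itt_id producer) {
  __itt_relation_add_ex(d, cd, now, consumer,
                        __itt_relation_is_dependent_on, producer);
}
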
/** @cond exclude_from_documentation */
-typedef enum ___itt_track_group_type
-{
- __itt_track_group_type_normal = 0
+typedef enum ___itt_track_group_type {
+ __itt_track_group_type_normal = 0
} __itt_track_group_type;
/** @endcond */
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_track_group
-{
- __itt_string_handle* name; /*!< Name of the track group */
- struct ___itt_track* track; /*!< List of child tracks */
- __itt_track_group_type tgtype; /*!< Type of the track group */
- int extra1; /*!< Reserved. Must be zero */
- void* extra2; /*!< Reserved. Must be zero */
- struct ___itt_track_group* next;
+typedef struct ___itt_track_group {
+ __itt_string_handle *name; /*!< Name of the track group */
+ struct ___itt_track *track; /*!< List of child tracks */
+ __itt_track_group_type tgtype; /*!< Type of the track group */
+ int extra1; /*!< Reserved. Must be zero */
+ void *extra2; /*!< Reserved. Must be zero */
+ struct ___itt_track_group *next;
} __itt_track_group;
#pragma pack(pop)
@@ -3234,25 +3568,24 @@ typedef struct ___itt_track_group
* @brief Placeholder for custom track types. Currently, "normal" custom track
* is the only available track type.
*/
-typedef enum ___itt_track_type
-{
- __itt_track_type_normal = 0
+typedef enum ___itt_track_type {
+ __itt_track_type_normal = 0
#ifdef INTEL_ITTNOTIFY_API_PRIVATE
- , __itt_track_type_queue
+ ,
+ __itt_track_type_queue
#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
} __itt_track_type;
/** @cond exclude_from_documentation */
#pragma pack(push, 8)
-typedef struct ___itt_track
-{
- __itt_string_handle* name; /*!< Name of the track group */
- __itt_track_group* group; /*!< Parent group to a track */
- __itt_track_type ttype; /*!< Type of the track */
- int extra1; /*!< Reserved. Must be zero */
- void* extra2; /*!< Reserved. Must be zero */
- struct ___itt_track* next;
+typedef struct ___itt_track {
+ __itt_string_handle *name; /*!< Name of the track group */
+ __itt_track_group *group; /*!< Parent group to a track */
+ __itt_track_type ttype; /*!< Type of the track */
+ int extra1; /*!< Reserved. Must be zero */
+ void *extra2; /*!< Reserved. Must be zero */
+ struct ___itt_track *next;
} __itt_track;
#pragma pack(pop)
@@ -3261,19 +3594,21 @@ typedef struct ___itt_track
/**
* @brief Create logical track group.
*/
-__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type);
+__itt_track_group *ITTAPI __itt_track_group_create(
+ __itt_string_handle *name, __itt_track_group_type track_group_type);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type))
-#define __itt_track_group_create ITTNOTIFY_DATA(track_group_create)
+ITT_STUB(ITTAPI, __itt_track_group *, track_group_create,
+ (__itt_string_handle * name, __itt_track_group_type track_group_type))
+#define __itt_track_group_create ITTNOTIFY_DATA(track_group_create)
#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_track_group_create(name) (__itt_track_group*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_group_create(name) (__itt_track_group *)0
#define __itt_track_group_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_track_group_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3281,19 +3616,23 @@ ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* n
/**
* @brief Create logical track.
*/
-__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type);
+__itt_track *ITTAPI __itt_track_create(__itt_track_group *track_group,
+ __itt_string_handle *name,
+ __itt_track_type track_type);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type))
-#define __itt_track_create ITTNOTIFY_DATA(track_create)
+ITT_STUB(ITTAPI, __itt_track *, track_create,
+ (__itt_track_group * track_group, __itt_string_handle *name,
+ __itt_track_type track_type))
+#define __itt_track_create ITTNOTIFY_DATA(track_create)
#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_track_create(track_group,name,track_type) (__itt_track*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_create(track_group, name, track_type) (__itt_track *)0
#define __itt_track_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_track_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3301,19 +3640,19 @@ ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__i
/**
* @brief Set the logical track.
*/
-void ITTAPI __itt_set_track(__itt_track* track);
+void ITTAPI __itt_set_track(__itt_track *track);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
-#define __itt_set_track ITTNOTIFY_VOID(set_track)
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track * track))
+#define __itt_set_track ITTNOTIFY_VOID(set_track)
#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_set_track(track)
#define __itt_set_track_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_set_track_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
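
The three track calls compose: create a group, create a track inside it, then bind the track so that later events land on it. A sketch with illustrative names, assuming __itt_string_handle_create as declared earlier in the header:

#include "ittnotify.h"

void attach_to_device_track(void) {
  __itt_track_group *g = __itt_track_group_create(
      __itt_string_handle_create("Devices"), __itt_track_group_type_normal);
  __itt_track *t = __itt_track_create(
      g, __itt_string_handle_create("GPU queue 0"), __itt_track_type_normal);
  __itt_set_track(t); /* subsequent events are placed on this track */
}
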
@@ -3331,65 +3670,68 @@ typedef int __itt_event;
/**
* @brief Create an event notification
- * @note name or namelen being null/name and namelen not matching, user event feature not enabled
+ * @note Fails if name or namelen is null, if name and namelen do not match,
+ * or if the user event feature is not enabled
* @return non-zero event identifier upon success and __itt_err otherwise
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_event_create __itt_event_createW
-# define __itt_event_create_ptr __itt_event_createW_ptr
+#define __itt_event_create __itt_event_createW
+#define __itt_event_create_ptr __itt_event_createW_ptr
#else
-# define __itt_event_create __itt_event_createA
-# define __itt_event_create_ptr __itt_event_createA_ptr
+#define __itt_event_create __itt_event_createA
+#define __itt_event_create_ptr __itt_event_createA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
-ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW,
+ (const wchar_t *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
-#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create ITTNOTIFY_DATA(event_create)
-#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
+#define __itt_event_create ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_event_createA(name, namelen) (__itt_event)0
#define __itt_event_createA_ptr 0
#define __itt_event_createW(name, namelen) (__itt_event)0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create(name, namelen) (__itt_event)0
-#define __itt_event_create_ptr 0
+#define __itt_event_create(name, namelen) (__itt_event)0
+#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_event_createA_ptr 0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create_ptr 0
+#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @brief Record an event occurrence.
- * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ * @return __itt_err upon failure (invalid event id or user event feature not
+ * enabled)
*/
int LIBITTAPI __itt_event_start(__itt_event event);
@@ -3397,13 +3739,13 @@ int LIBITTAPI __itt_event_start(__itt_event event);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
-#define __itt_event_start ITTNOTIFY_DATA(event_start)
+#define __itt_event_start ITTNOTIFY_DATA(event_start)
#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_event_start(event) (int)0
#define __itt_event_start_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_event_start_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3411,7 +3753,8 @@ ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
/**
* @brief Record an event end occurrence.
* @note It is optional if events do not have durations.
- * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ * @return __itt_err upon failure (invalid event id or user event feature not
+ * enabled)
*/
int LIBITTAPI __itt_event_end(__itt_event event);
@@ -3419,19 +3762,18 @@ int LIBITTAPI __itt_event_end(__itt_event event);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
-#define __itt_event_end ITTNOTIFY_DATA(event_end)
+#define __itt_event_end ITTNOTIFY_DATA(event_end)
#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_event_end(event) (int)0
#define __itt_event_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_event_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} events group */
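
The legacy event API in one sketch: namelen must match the name, creation returns __itt_err on failure (assumed zero here, per the non-zero-on-success contract above), and the end call is optional for events without duration. The event name is made up:

#include <string.h>
#include "ittnotify.h"

void timed_phase(void) {
  __itt_event e = __itt_event_create("compress", (int)strlen("compress"));
  if (!e)
    return; /* user event feature not enabled, or bad arguments */
  __itt_event_start(e);
  /* ... work attributed to the event ... */
  __itt_event_end(e); /* optional if the event has no duration */
}
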
-
/**
* @defgroup arrays Arrays Visualizer
* @ingroup public
@@ -3443,69 +3785,81 @@ ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
* @enum __itt_av_data_type
* @brief Defines types of arrays data (for C/C++ intrinsic types)
*/
-typedef enum
-{
- __itt_e_first = 0,
- __itt_e_char = 0, /* 1-byte integer */
- __itt_e_uchar, /* 1-byte unsigned integer */
- __itt_e_int16, /* 2-byte integer */
- __itt_e_uint16, /* 2-byte unsigned integer */
- __itt_e_int32, /* 4-byte integer */
- __itt_e_uint32, /* 4-byte unsigned integer */
- __itt_e_int64, /* 8-byte integer */
- __itt_e_uint64, /* 8-byte unsigned integer */
- __itt_e_float, /* 4-byte floating */
- __itt_e_double, /* 8-byte floating */
- __itt_e_last = __itt_e_double
+typedef enum {
+ __itt_e_first = 0,
+ __itt_e_char = 0, /* 1-byte integer */
+ __itt_e_uchar, /* 1-byte unsigned integer */
+ __itt_e_int16, /* 2-byte integer */
+ __itt_e_uint16, /* 2-byte unsigned integer */
+ __itt_e_int32, /* 4-byte integer */
+ __itt_e_uint32, /* 4-byte unsigned integer */
+ __itt_e_int64, /* 8-byte integer */
+ __itt_e_uint64, /* 8-byte unsigned integer */
+ __itt_e_float, /* 4-byte floating */
+ __itt_e_double, /* 8-byte floating */
+ __itt_e_last = __itt_e_double
} __itt_av_data_type;
/**
* @brief Save an array data to a file.
- * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only).
+ * Output format is defined by the file extension. The csv and bmp formats are
+ * supported (bmp - for 2-dimensional array only).
* @param[in] data - pointer to the array data
* @param[in] rank - the rank of the array
- * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions.
- * The size of dimensions must be equal to the rank
- * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types)
- * @param[in] filePath - the file path; the output format is defined by the file extension
- * @param[in] columnOrder - defines how the array is stored in the linear memory.
- * It should be 1 for column-major order (e.g. in FORTRAN) or 0 - for row-major order (e.g. in C).
- */
-
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
-int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder);
+ * @param[in] dimensions - pointer to an array of integers, which specifies the
+ * array dimensions. The size of dimensions must be equal to the rank
+ * @param[in] type - the type of the array, specified as one of the
+ * __itt_av_data_type values (for intrinsic types)
+ * @param[in] filePath - the file path; the output format is defined by the file
+ * extension
+ * @param[in] columnOrder - defines how the array is stored in the linear
+ * memory. It should be 1 for column-major order (e.g. in FORTRAN) or 0 for
+ * row-major order (e.g. in C).
+ */
+
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder);
+int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type,
+ const wchar_t *filePath, int columnOrder);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_av_save __itt_av_saveW
-# define __itt_av_save_ptr __itt_av_saveW_ptr
+#define __itt_av_save __itt_av_saveW
+#define __itt_av_save_ptr __itt_av_saveW_ptr
#else /* UNICODE */
-# define __itt_av_save __itt_av_saveA
-# define __itt_av_save_ptr __itt_av_saveA_ptr
+#define __itt_av_save __itt_av_saveA
+#define __itt_av_save_ptr __itt_av_saveA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
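
A sketch saving a small row-major C array as CSV, assuming a non-UNICODE build; the path is illustrative and the output format follows from the extension, as described above:

#include "ittnotify.h"

void dump_matrix(void) {
  float m[4][8] = {{0}};
  const int dims[2] = {4, 8};
  /* rank 2; columnOrder 0 selects row-major, matching C array layout */
  __itt_av_save(m, 2, dims, __itt_e_float, "matrix.csv", 0);
}
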
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
-ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA,
+ (void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder))
+ITT_STUB(ITTAPI, int, av_saveW,
+ (void *data, int rank, const int *dimensions, int type,
+ const wchar_t *filePath, int columnOrder))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,
+ (void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_av_saveA ITTNOTIFY_DATA(av_saveA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_av_saveA ITTNOTIFY_DATA(av_saveA)
#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA)
-#define __itt_av_saveW ITTNOTIFY_DATA(av_saveW)
+#define __itt_av_saveW ITTNOTIFY_DATA(av_saveW)
#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_av_save ITTNOTIFY_DATA(av_save)
+#define __itt_av_save ITTNOTIFY_DATA(av_save)
#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_av_saveA(name)
#define __itt_av_saveA_ptr 0
#define __itt_av_saveW(name)
@@ -3515,8 +3869,8 @@ ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, in
#define __itt_av_save_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_av_saveA_ptr 0
#define __itt_av_saveW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -3531,13 +3885,13 @@ void ITTAPI __itt_enable_attach(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, enable_attach, (void))
-#define __itt_enable_attach ITTNOTIFY_VOID(enable_attach)
+#define __itt_enable_attach ITTNOTIFY_VOID(enable_attach)
#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_enable_attach()
#define __itt_enable_attach_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_enable_attach_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3555,40 +3909,46 @@ ITT_STUBV(ITTAPI, void, enable_attach, (void))
* @param[in] end_addr - relocated module end address
* @param[in] path - file system path to the module
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path);
-void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr,
+ const char *path);
+void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr,
+ const wchar_t *path);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_module_load __itt_module_loadW
-# define __itt_module_load_ptr __itt_module_loadW_ptr
+#define __itt_module_load __itt_module_loadW
+#define __itt_module_load_ptr __itt_module_loadW_ptr
#else /* UNICODE */
-# define __itt_module_load __itt_module_loadA
-# define __itt_module_load_ptr __itt_module_loadA_ptr
+#define __itt_module_load __itt_module_loadA
+#define __itt_module_load_ptr __itt_module_loadA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_module_load(void *start_addr, void *end_addr,
+ const char *path);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path))
-ITT_STUB(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, void, module_loadA,
+ (void *start_addr, void *end_addr, const char *path))
+ITT_STUB(ITTAPI, void, module_loadW,
+ (void *start_addr, void *end_addr, const wchar_t *path))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, void, module_load,
+ (void *start_addr, void *end_addr, const char *path))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA)
#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA)
-#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW)
+#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW)
#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_module_load ITTNOTIFY_VOID(module_load)
+#define __itt_module_load ITTNOTIFY_VOID(module_load)
#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_module_loadA(start_addr, end_addr, path)
#define __itt_module_loadA_ptr 0
#define __itt_module_loadW(start_addr, end_addr, path)
@@ -3598,18 +3958,16 @@ ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const ch
#define __itt_module_load_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_module_loadA_ptr 0
#define __itt_module_loadW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_module_load_ptr 0
+#define __itt_module_load_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
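
module_load exists for code the system loader never saw, such as JIT output; a sketch with an invented path, assuming the caller knows the emitted region's bounds:

#include <stddef.h>
#include "ittnotify.h"

void announce_jit_region(void *code, size_t size) {
  /* Report the region so the collector can resolve samples inside it. */
  __itt_module_load(code, (char *)code + size, "/tmp/jit-region.bin");
}
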
-
-
#ifdef __cplusplus
}
#endif /* __cplusplus */
@@ -3629,43 +3987,62 @@ extern "C" {
* @ingroup clockdomain
* @brief Begin an overlapped task instance.
* @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
- * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] taskid The identifier for this task instance, *cannot* be
+ * __itt_null.
* @param[in] parentid The parent of this task, or __itt_null.
* @param[in] name The name of this task.
*/
-void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp,
+ __itt_id taskid, __itt_id parentid,
+ __itt_string_handle *name);
/**
* @ingroup clockdomain
* @brief End an overlapped task instance.
* @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this
+ * call.
* @param[in] timestamp The user defined timestamp.
* @param[in] taskid Explicit ID of finished task
*/
-void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid);
+void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain *domain,
+ __itt_clock_domain *clock_domain,
+ unsigned long long timestamp,
+ __itt_id taskid);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name))
-ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid))
-#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b)
-#define __itt_task_begin_overlapped_ex_ptr ITTNOTIFY_NAME(task_begin_overlapped_ex)
-#define __itt_task_end_overlapped_ex(d,x,y,z) ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z)
-#define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name)
-#define __itt_task_begin_overlapped_ex_ptr 0
-#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid)
-#define __itt_task_end_overlapped_ex_ptr 0
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id taskid, __itt_id parentid,
+ __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id taskid))
+#define __itt_task_begin_overlapped_ex(d, x, y, z, a, b) \
+ ITTNOTIFY_VOID_D5(task_begin_overlapped_ex, d, x, y, z, a, b)
+#define __itt_task_begin_overlapped_ex_ptr \
+ ITTNOTIFY_NAME(task_begin_overlapped_ex)
+#define __itt_task_end_overlapped_ex(d, x, y, z) \
+ ITTNOTIFY_VOID_D3(task_end_overlapped_ex, d, x, y, z)
+#define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_overlapped_ex(domain, clock_domain, timestamp, \
+ taskid, parentid, name)
+#define __itt_task_begin_overlapped_ex_ptr 0
+#define __itt_task_end_overlapped_ex(domain, clock_domain, timestamp, taskid)
+#define __itt_task_end_overlapped_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_begin_overlapped_ex_ptr 0
-#define __itt_task_end_overlapped_ptr 0
-#define __itt_task_end_overlapped_ex_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_overlapped_ex_ptr 0
+#define __itt_task_end_overlapped_ptr 0
+#define __itt_task_end_overlapped_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
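
Overlapped tasks exist precisely because begin/end pairs may interleave, so each instance needs its own non-null ID, built here with the __itt_id_make helper declared earlier in the header. Clock setup, timestamps, and names are illustrative:

#include <stddef.h>
#include "ittnotify.h"

static void usec_ticks(__itt_clock_info *info, void *data) {
  (void)data;
  info->clock_freq = 1000000ULL; /* microsecond ticks */
  info->clock_base = 0;
}

void trace_interleaved(unsigned long long t0) {
  const __itt_domain *d = __itt_domain_create("MyApp");
  __itt_clock_domain *cd = __itt_clock_domain_create(usec_ticks, NULL);
  __itt_string_handle *req = __itt_string_handle_create("request");
  static int anchor; /* any stable address works for building IDs */
  __itt_id a = __itt_id_make(&anchor, 1);
  __itt_id b = __itt_id_make(&anchor, 2);
  __itt_task_begin_overlapped_ex(d, cd, t0, a, __itt_null, req);
  __itt_task_begin_overlapped_ex(d, cd, t0 + 5, b, __itt_null, req);
  __itt_task_end_overlapped_ex(d, cd, t0 + 9, a); /* a ends while b is open */
  __itt_task_end_overlapped_ex(d, cd, t0 + 12, b);
}
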
@@ -3682,19 +4059,20 @@ ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* dom
typedef int __itt_mark_type;
/**
- * @brief Creates a user mark type with the specified name using char or Unicode string.
+ * @brief Creates a user mark type with the specified name using char or Unicode
+ * string.
* @param[in] name - name of mark to create
* @return Returns a handle to the mark type
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_mark_type ITTAPI __itt_mark_createA(const char *name);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_mark_type ITTAPI __itt_mark_createA(const char *name);
__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_mark_create __itt_mark_createW
-# define __itt_mark_create_ptr __itt_mark_createW_ptr
+#define __itt_mark_create __itt_mark_createW
+#define __itt_mark_create_ptr __itt_mark_createW_ptr
#else /* UNICODE */
-# define __itt_mark_create __itt_mark_createA
-# define __itt_mark_create_ptr __itt_mark_createA_ptr
+#define __itt_mark_create __itt_mark_createA
+#define __itt_mark_create_ptr __itt_mark_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_mark_type ITTAPI __itt_mark_create(const char *name);
@@ -3703,63 +4081,68 @@ __itt_mark_type ITTAPI __itt_mark_create(const char *name);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name))
ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_createA ITTNOTIFY_DATA(mark_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_mark_createA ITTNOTIFY_DATA(mark_createA)
#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
-#define __itt_mark_createW ITTNOTIFY_DATA(mark_createW)
+#define __itt_mark_createW ITTNOTIFY_DATA(mark_createW)
#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create ITTNOTIFY_DATA(mark_create)
-#define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create)
+#define __itt_mark_create ITTNOTIFY_DATA(mark_create)
+#define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_mark_createA(name) (__itt_mark_type)0
#define __itt_mark_createA_ptr 0
#define __itt_mark_createW(name) (__itt_mark_type)0
#define __itt_mark_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create(name) (__itt_mark_type)0
-#define __itt_mark_create_ptr 0
+#define __itt_mark_create(name) (__itt_mark_type)0
+#define __itt_mark_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_mark_createA_ptr 0
#define __itt_mark_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create_ptr 0
+#define __itt_mark_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Creates a "discrete" user mark type of the specified type and an optional parameter using char or Unicode string.
+ * @brief Creates a "discrete" user mark type of the specified type and an
+ * optional parameter using char or Unicode string.
*
- * - The mark of "discrete" type is placed to collection results in case of success. It appears in overtime view(s) as a special tick sign.
- * - The call is "synchronous" - function returns after mark is actually added to results.
- * - This function is useful, for example, to mark different phases of application
- * (beginning of the next mark automatically meand end of current region).
- * - Can be used together with "continuous" marks (see below) at the same collection session
+ * - The mark of "discrete" type is placed to collection results in case of
+ * success. It appears in overtime view(s) as a special tick sign.
+ * - The call is "synchronous" - function returns after mark is actually added
+ * to results.
+ * - This function is useful, for example, to mark different phases of
+ * application (beginning of the next mark automatically meand end of current
+ * region).
+ * - Can be used together with "continuous" marks (see below) at the same
+ * collection session
* @param[in] mt - mark, created by __itt_mark_create(const char* name) function
* @param[in] parameter - string parameter of mark
* @return Returns zero value in case of success, non-zero value otherwise.
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter);
int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_mark __itt_markW
-# define __itt_mark_ptr __itt_markW_ptr
+#define __itt_mark __itt_markW
+#define __itt_mark_ptr __itt_markW_ptr
#else /* UNICODE */
-# define __itt_mark __itt_markA
-# define __itt_mark_ptr __itt_markA_ptr
+#define __itt_mark __itt_markA
+#define __itt_mark_ptr __itt_markA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter);
@@ -3768,56 +4151,56 @@ int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter))
ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_markA ITTNOTIFY_DATA(markA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_markA ITTNOTIFY_DATA(markA)
#define __itt_markA_ptr ITTNOTIFY_NAME(markA)
-#define __itt_markW ITTNOTIFY_DATA(markW)
+#define __itt_markW ITTNOTIFY_DATA(markW)
#define __itt_markW_ptr ITTNOTIFY_NAME(markW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark ITTNOTIFY_DATA(mark)
-#define __itt_mark_ptr ITTNOTIFY_NAME(mark)
+#define __itt_mark ITTNOTIFY_DATA(mark)
+#define __itt_mark_ptr ITTNOTIFY_NAME(mark)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_markA(mt, parameter) (int)0
#define __itt_markA_ptr 0
#define __itt_markW(mt, parameter) (int)0
#define __itt_markW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark(mt, parameter) (int)0
-#define __itt_mark_ptr 0
+#define __itt_mark(mt, parameter) (int)0
+#define __itt_mark_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_markA_ptr 0
#define __itt_markW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_ptr 0
+#define __itt_mark_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Use this if necessary to create a "discrete" user event type (mark) for process
- * rather then for one thread
+ * @brief Use this if necessary to create a "discrete" user event type (mark)
+ * for a process rather than for one thread
* @see int __itt_mark(__itt_mark_type mt, const char* parameter);
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter);
int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_mark_global __itt_mark_globalW
-# define __itt_mark_global_ptr __itt_mark_globalW_ptr
+#define __itt_mark_global __itt_mark_globalW
+#define __itt_mark_global_ptr __itt_mark_globalW_ptr
#else /* UNICODE */
-# define __itt_mark_global __itt_mark_globalA
-# define __itt_mark_global_ptr __itt_mark_globalA_ptr
+#define __itt_mark_global __itt_mark_globalA
+#define __itt_mark_global_ptr __itt_mark_globalA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter);
@@ -3826,38 +4209,39 @@ int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter))
-ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter))
+ITT_STUB(ITTAPI, int, mark_globalW,
+ (__itt_mark_type mt, const wchar_t *parameter))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA)
#define __itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA)
-#define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW)
+#define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW)
#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global ITTNOTIFY_DATA(mark_global)
-#define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global)
+#define __itt_mark_global ITTNOTIFY_DATA(mark_global)
+#define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_mark_globalA(mt, parameter) (int)0
#define __itt_mark_globalA_ptr 0
#define __itt_mark_globalW(mt, parameter) (int)0
#define __itt_mark_globalW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global(mt, parameter) (int)0
-#define __itt_mark_global_ptr 0
+#define __itt_mark_global(mt, parameter) (int)0
+#define __itt_mark_global_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_mark_globalA_ptr 0
#define __itt_mark_globalW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global_ptr 0
+#define __itt_mark_global_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3883,13 +4267,13 @@ int ITTAPI __itt_mark_off(__itt_mark_type mt);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt))
-#define __itt_mark_off ITTNOTIFY_DATA(mark_off)
+#define __itt_mark_off ITTNOTIFY_DATA(mark_off)
#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_mark_off(mt) (int)0
#define __itt_mark_off_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_mark_off_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -3904,13 +4288,13 @@ int ITTAPI __itt_mark_global_off(__itt_mark_type mt);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
-#define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off)
+#define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off)
#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_mark_global_off(mt) (int)0
#define __itt_mark_global_off_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_mark_global_off_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
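For reference, the mark APIs above compose into a simple phase-marking pattern. A minimal usage sketch (illustrative only; assumes a non-UNICODE build so the char-based entry points resolve, and that a collector is attached):

#include "ittnotify.h"

static void trace_phases(void) {
  /* A "discrete" mark type is created once and then reused. */
  __itt_mark_type phase = __itt_mark_create("app.phase");
  __itt_mark(phase, "init");        /* the next mark implicitly ends "init" */
  __itt_mark(phase, "compute");     /* shown as tick signs in overtime views */
  __itt_mark_global(phase, "sync"); /* process-wide rather than per-thread */
  __itt_mark_off(phase);            /* explicit end of the current region */
}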
@@ -3923,7 +4307,6 @@ ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
* @{
*/
-
/**
* @defgroup stitch Stack Stitching
* @ingroup internal
@@ -3936,8 +4319,9 @@ ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
typedef struct ___itt_caller *__itt_caller;
/**
- * @brief Create the stitch point e.g. a point in call stack where other stacks should be stitched to.
- * The function returns a unique identifier which is used to match the cut points with corresponding stitch points.
+ * @brief Create a stitch point, i.e. a point in the call stack to which other
+ * stacks should be stitched. The function returns a unique identifier that is
+ * used to match cut points with their corresponding stitch points.
*/
__itt_caller ITTAPI __itt_stack_caller_create(void);
@@ -3945,19 +4329,20 @@ __itt_caller ITTAPI __itt_stack_caller_create(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
-#define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create)
+#define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create)
#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_stack_caller_create() (__itt_caller)0
#define __itt_stack_caller_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_stack_caller_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Destroy the information about stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ * @brief Destroy the information about the stitch point identified by the
+ * pointer previously returned by __itt_stack_caller_create()
*/
void ITTAPI __itt_stack_caller_destroy(__itt_caller id);
@@ -3965,20 +4350,21 @@ void ITTAPI __itt_stack_caller_destroy(__itt_caller id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id))
-#define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy)
+#define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy)
#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_stack_caller_destroy(id)
#define __itt_stack_caller_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_stack_caller_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief Sets the cut point. Stack from each event which occurs after this call will be cut
- * at the same stack level the function was called and stitched to the corresponding stitch point.
+ * @brief Sets the cut point. The stack of each event that occurs after this
+ * call will be cut at the stack level at which the function was called, and
+ * stitched to the corresponding stitch point.
*/
void ITTAPI __itt_stack_callee_enter(__itt_caller id);
@@ -3986,19 +4372,20 @@ void ITTAPI __itt_stack_callee_enter(__itt_caller id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id))
-#define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter)
+#define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter)
#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_stack_callee_enter(id)
#define __itt_stack_callee_enter_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_stack_callee_enter_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
- * @brief This function eliminates the cut point which was set by latest __itt_stack_callee_enter().
+ * @brief This function eliminates the cut point that was set by the latest
+ * __itt_stack_callee_enter().
*/
void ITTAPI __itt_stack_callee_leave(__itt_caller id);
@@ -4006,45 +4393,48 @@ void ITTAPI __itt_stack_callee_leave(__itt_caller id);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id))
-#define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave)
+#define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave)
#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_stack_callee_leave(id)
#define __itt_stack_callee_leave_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_stack_callee_leave_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} stitch group */
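A hedged sketch of the stitching flow documented above (run_deferred_task is a hypothetical task body; in practice the __itt_caller handle travels with the queued work item):

extern void run_deferred_task(void); /* hypothetical */

static void submit_and_run(void) {
  __itt_caller ctx = __itt_stack_caller_create(); /* stitch point at submit */
  /* ... later, on the worker thread, bracket the deferred body: */
  __itt_stack_callee_enter(ctx); /* cut the stack; stitch to the submit site */
  run_deferred_task();
  __itt_stack_callee_leave(ctx); /* remove the most recent cut point */
  __itt_stack_caller_destroy(ctx); /* once nothing more will be stitched */
}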
-/* ***************************************************************************************************************************** */
+/* ************************************************************************ */
#include <stdarg.h>
/** @cond exclude_from_documentation */
-typedef enum __itt_error_code
-{
- __itt_error_success = 0, /*!< no error */
- __itt_error_no_module = 1, /*!< module can't be loaded */
- /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */
- __itt_error_no_symbol = 2, /*!< symbol not found */
- /* %1$s -- library name, %2$s -- symbol name. */
- __itt_error_unknown_group = 3, /*!< unknown group specified */
- /* %1$s -- env var name, %2$s -- group name. */
- __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
- /* %1$s -- env var name, %2$d -- system error. */
- __itt_error_env_too_long = 5, /*!< variable value too long */
- /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */
- __itt_error_system = 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */
- /* %1$s -- function name, %2$d -- errno. */
+typedef enum __itt_error_code {
+ __itt_error_success = 0, /*!< no error */
+ __itt_error_no_module = 1, /*!< module can't be loaded */
+ /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system
+ error message. */
+ __itt_error_no_symbol = 2, /*!< symbol not found */
+ /* %1$s -- library name, %2$s -- symbol name. */
+ __itt_error_unknown_group = 3, /*!< unknown group specified */
+ /* %1$s -- env var name, %2$s -- group name. */
+ __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
+ /* %1$s -- env var name, %2$d -- system error. */
+ __itt_error_env_too_long = 5, /*!< variable value too long */
+ /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed
+ length. */
+ __itt_error_system =
+ 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */
+ /* %1$s -- function name, %2$d -- errno. */
} __itt_error_code;
-typedef void (__itt_error_handler_t)(__itt_error_code code, va_list);
-__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*);
+typedef void(__itt_error_handler_t)(__itt_error_code code, va_list);
+__itt_error_handler_t *__itt_set_error_handler(__itt_error_handler_t *);
-const char* ITTAPI __itt_api_version(void);
+const char *ITTAPI __itt_api_version(void);
/** @endcond */
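A hedged sketch of hooking the error callback declared above (illustrative; each code carries the printf-style arguments listed in the enum comments, which a fuller handler would consume with va_arg):

#include <stdarg.h>
#include <stdio.h>

static void demo_handler(__itt_error_code code, va_list args) {
  (void)args; /* per-code arguments; see the enum comments above */
  fprintf(stderr, "ittnotify error: %d\n", (int)code);
}

static void install_demo_handler(void) {
  __itt_error_handler_t *prev = __itt_set_error_handler(demo_handler);
  (void)prev; /* could be kept to chain to the previous handler */
}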
/** @cond exclude_from_documentation */
@@ -4054,14 +4444,14 @@ const char* ITTAPI __itt_api_version(void);
void __itt_error_handler(__itt_error_code code, va_list args);
extern const int ITTNOTIFY_NAME(err);
#define __itt_err ITTNOTIFY_NAME(err)
-ITT_STUB(ITTAPI, const char*, api_version, (void))
-#define __itt_api_version ITTNOTIFY_DATA(api_version)
+ITT_STUB(ITTAPI, const char *, api_version, (void))
+#define __itt_api_version ITTNOTIFY_DATA(api_version)
#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_api_version() (const char*)0
+#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_api_version() (const char *)0
#define __itt_api_version_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_api_version_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
index f231e70d181f..a49236b14885 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
@@ -12,59 +12,59 @@
/** @cond exclude_from_documentation */
#ifndef ITT_OS_WIN
-# define ITT_OS_WIN 1
+#define ITT_OS_WIN 1
#endif /* ITT_OS_WIN */
#ifndef ITT_OS_LINUX
-# define ITT_OS_LINUX 2
+#define ITT_OS_LINUX 2
#endif /* ITT_OS_LINUX */
#ifndef ITT_OS_MAC
-# define ITT_OS_MAC 3
+#define ITT_OS_MAC 3
#endif /* ITT_OS_MAC */
#ifndef ITT_OS_FREEBSD
-# define ITT_OS_FREEBSD 4
+#define ITT_OS_FREEBSD 4
#endif /* ITT_OS_FREEBSD */
#ifndef ITT_OS
-# if defined WIN32 || defined _WIN32
-# define ITT_OS ITT_OS_WIN
-# elif defined( __APPLE__ ) && defined( __MACH__ )
-# define ITT_OS ITT_OS_MAC
-# elif defined( __FreeBSD__ )
-# define ITT_OS ITT_OS_FREEBSD
-# else
-# define ITT_OS ITT_OS_LINUX
-# endif
+#if defined WIN32 || defined _WIN32
+#define ITT_OS ITT_OS_WIN
+#elif defined(__APPLE__) && defined(__MACH__)
+#define ITT_OS ITT_OS_MAC
+#elif defined(__FreeBSD__)
+#define ITT_OS ITT_OS_FREEBSD
+#else
+#define ITT_OS ITT_OS_LINUX
+#endif
#endif /* ITT_OS */
#ifndef ITT_PLATFORM_WIN
-# define ITT_PLATFORM_WIN 1
+#define ITT_PLATFORM_WIN 1
#endif /* ITT_PLATFORM_WIN */
#ifndef ITT_PLATFORM_POSIX
-# define ITT_PLATFORM_POSIX 2
+#define ITT_PLATFORM_POSIX 2
#endif /* ITT_PLATFORM_POSIX */
#ifndef ITT_PLATFORM_MAC
-# define ITT_PLATFORM_MAC 3
+#define ITT_PLATFORM_MAC 3
#endif /* ITT_PLATFORM_MAC */
#ifndef ITT_PLATFORM_FREEBSD
-# define ITT_PLATFORM_FREEBSD 4
+#define ITT_PLATFORM_FREEBSD 4
#endif /* ITT_PLATFORM_FREEBSD */
#ifndef ITT_PLATFORM
-# if ITT_OS==ITT_OS_WIN
-# define ITT_PLATFORM ITT_PLATFORM_WIN
-# elif ITT_OS==ITT_OS_MAC
-# define ITT_PLATFORM ITT_PLATFORM_MAC
-# elif ITT_OS==ITT_OS_FREEBSD
-# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
-# else
-# define ITT_PLATFORM ITT_PLATFORM_POSIX
-# endif
+#if ITT_OS == ITT_OS_WIN
+#define ITT_PLATFORM ITT_PLATFORM_WIN
+#elif ITT_OS == ITT_OS_MAC
+#define ITT_PLATFORM ITT_PLATFORM_MAC
+#elif ITT_OS == ITT_OS_FREEBSD
+#define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#else
+#define ITT_PLATFORM ITT_PLATFORM_POSIX
+#endif
#endif /* ITT_PLATFORM */
#if defined(_UNICODE) && !defined(UNICODE)
@@ -72,9 +72,9 @@
#endif
#include <stddef.h>
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#include <tchar.h>
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include <stdint.h>
#if defined(UNICODE) || defined(_UNICODE)
#include <wchar.h>
@@ -82,152 +82,156 @@
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#ifndef ITTAPI_CDECL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define ITTAPI_CDECL __cdecl
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define ITTAPI_CDECL __attribute__ ((cdecl))
-# else /* _M_IX86 || __i386__ */
-# define ITTAPI_CDECL /* actual only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define ITTAPI_CDECL __cdecl
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define ITTAPI_CDECL __attribute__((cdecl))
+#else /* _M_IX86 || __i386__ */
+#define ITTAPI_CDECL /* meaningful only on the x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* ITTAPI_CDECL */
#ifndef STDCALL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define STDCALL __stdcall
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define STDCALL __attribute__ ((stdcall))
-# else /* _M_IX86 || __i386__ */
-# define STDCALL /* supported only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define STDCALL __stdcall
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define STDCALL __attribute__((stdcall))
+#else /* _M_IX86 || __i386__ */
+#define STDCALL /* supported only on x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* STDCALL */
-#define ITTAPI ITTAPI_CDECL
+#define ITTAPI ITTAPI_CDECL
#define LIBITTAPI ITTAPI_CDECL
/* TODO: Temporary for compatibility! */
-#define ITTAPI_CALL ITTAPI_CDECL
+#define ITTAPI_CALL ITTAPI_CDECL
#define LIBITTAPI_CALL ITTAPI_CDECL
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
/* use __forceinline (VC++ specific) */
-#define ITT_INLINE __forceinline
+#define ITT_INLINE __forceinline
#define ITT_INLINE_ATTRIBUTE /* nothing */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/*
* Generally, functions are not inlined unless optimization is specified.
* For functions declared inline, this attribute inlines the function even
* if no optimization level was specified.
*/
#ifdef __STRICT_ANSI__
-#define ITT_INLINE static
+#define ITT_INLINE static
#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
-#else /* __STRICT_ANSI__ */
-#define ITT_INLINE static inline
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
#endif /* __STRICT_ANSI__ */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @endcond */
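A hedged sketch of the declare-then-define pattern these macros support (it mirrors the __itt_interlocked_increment helpers later in this header; demo_twice is purely illustrative):

ITT_INLINE long demo_twice(long x) ITT_INLINE_ATTRIBUTE;
ITT_INLINE long demo_twice(long x) { return 2 * x; }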
#ifndef ITT_ARCH_IA32
-# define ITT_ARCH_IA32 1
+#define ITT_ARCH_IA32 1
#endif /* ITT_ARCH_IA32 */
#ifndef ITT_ARCH_IA32E
-# define ITT_ARCH_IA32E 2
+#define ITT_ARCH_IA32E 2
#endif /* ITT_ARCH_IA32E */
/* Was there a magical reason we didn't have 3 here before? */
#ifndef ITT_ARCH_AARCH64
-# define ITT_ARCH_AARCH64 3
+#define ITT_ARCH_AARCH64 3
#endif /* ITT_ARCH_AARCH64 */
#ifndef ITT_ARCH_ARM
-# define ITT_ARCH_ARM 4
+#define ITT_ARCH_ARM 4
#endif /* ITT_ARCH_ARM */
#ifndef ITT_ARCH_PPC64
-# define ITT_ARCH_PPC64 5
+#define ITT_ARCH_PPC64 5
#endif /* ITT_ARCH_PPC64 */
#ifndef ITT_ARCH_MIPS
-# define ITT_ARCH_MIPS 6
+#define ITT_ARCH_MIPS 6
#endif /* ITT_ARCH_MIPS */
#ifndef ITT_ARCH_MIPS64
-# define ITT_ARCH_MIPS64 6
+#define ITT_ARCH_MIPS64 6
#endif /* ITT_ARCH_MIPS64 */
#ifndef ITT_ARCH_RISCV64
-# define ITT_ARCH_RISCV64 7
+#define ITT_ARCH_RISCV64 7
#endif /* ITT_ARCH_RISCV64 */
#ifndef ITT_ARCH
-# if defined _M_IX86 || defined __i386__
-# define ITT_ARCH ITT_ARCH_IA32
-# elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__
-# define ITT_ARCH ITT_ARCH_IA32E
-# elif defined _M_IA64 || defined __ia64__
-# define ITT_ARCH ITT_ARCH_IA64
-# elif defined _M_ARM || defined __arm__
-# define ITT_ARCH ITT_ARCH_ARM
-# elif defined __powerpc64__
-# define ITT_ARCH ITT_ARCH_PPC64
-# elif defined __aarch64__
-# define ITT_ARCH ITT_ARCH_AARCH64
-# elif defined __mips__ && !defined __mips64
-# define ITT_ARCH ITT_ARCH_MIPS
-# elif defined __mips__ && defined __mips64
-# define ITT_ARCH ITT_ARCH_MIPS64
-# elif defined __riscv && __riscv_xlen == 64
-# define ITT_ARCH ITT_ARCH_RISCV64
-# endif
+#if defined _M_IX86 || defined __i386__
+#define ITT_ARCH ITT_ARCH_IA32
+#elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#define ITT_ARCH ITT_ARCH_IA32E
+#elif defined _M_IA64 || defined __ia64__
+#define ITT_ARCH ITT_ARCH_IA64
+#elif defined _M_ARM || defined __arm__
+#define ITT_ARCH ITT_ARCH_ARM
+#elif defined __powerpc64__
+#define ITT_ARCH ITT_ARCH_PPC64
+#elif defined __aarch64__
+#define ITT_ARCH ITT_ARCH_AARCH64
+#elif defined __mips__ && !defined __mips64
+#define ITT_ARCH ITT_ARCH_MIPS
+#elif defined __mips__ && defined __mips64
+#define ITT_ARCH ITT_ARCH_MIPS64
+#elif defined __riscv && __riscv_xlen == 64
+#define ITT_ARCH ITT_ARCH_RISCV64
+#endif
#endif
#ifdef __cplusplus
-# define ITT_EXTERN_C extern "C"
-# define ITT_EXTERN_C_BEGIN extern "C" {
-# define ITT_EXTERN_C_END }
+#define ITT_EXTERN_C extern "C"
+#define ITT_EXTERN_C_BEGIN extern "C" {
+#define ITT_EXTERN_C_END }
#else
-# define ITT_EXTERN_C /* nothing */
-# define ITT_EXTERN_C_BEGIN /* nothing */
-# define ITT_EXTERN_C_END /* nothing */
+#define ITT_EXTERN_C /* nothing */
+#define ITT_EXTERN_C_BEGIN /* nothing */
+#define ITT_EXTERN_C_END /* nothing */
#endif /* __cplusplus */
#define ITT_TO_STR_AUX(x) #x
-#define ITT_TO_STR(x) ITT_TO_STR_AUX(x)
+#define ITT_TO_STR(x) ITT_TO_STR_AUX(x)
-#define __ITT_BUILD_ASSERT(expr, suffix) do { \
- static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
- __itt_build_check_##suffix[0] = 0; \
-} while(0)
-#define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix)
-#define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__)
+#define __ITT_BUILD_ASSERT(expr, suffix) \
+ do { \
+ static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+ __itt_build_check_##suffix[0] = 0; \
+ } while (0)
+#define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix)
+#define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__)
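The reflowed macro above is the classic negative-array-size compile-time assert; a hedged usage sketch (the do/while wrapper means it must sit inside a function):

static void sanity_checks(void) {
  /* A false condition declares char[-1], which fails to compile. */
  ITT_BUILD_ASSERT(sizeof(void *) >= 4);
}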
-#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
+#define ITT_MAGIC \
+ { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
/* Replace with snapshot date YYYYMMDD for promotion build. */
-#define API_VERSION_BUILD 20151119
+#define API_VERSION_BUILD 20151119
#ifndef API_VERSION_NUM
#define API_VERSION_NUM 0.0.0
#endif /* API_VERSION_NUM */
-#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
- " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+#define API_VERSION \
+ "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) " (" ITT_TO_STR( \
+ API_VERSION_BUILD) ")"
/* OS communication functions */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#include <windows.h>
-typedef HMODULE lib_t;
-typedef DWORD TIDT;
-typedef CRITICAL_SECTION mutex_t;
-#define MUTEX_INITIALIZER { 0 }
+typedef HMODULE lib_t;
+typedef DWORD TIDT;
+typedef CRITICAL_SECTION mutex_t;
+#define MUTEX_INITIALIZER \
+ { 0 }
#define strong_alias(name, aliasname) /* empty for Windows */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include <dlfcn.h>
#if defined(UNICODE) || defined(_UNICODE)
#include <wchar.h>
@@ -236,38 +240,39 @@ typedef CRITICAL_SECTION mutex_t;
#define _GNU_SOURCE 1 /* needed for PTHREAD_MUTEX_RECURSIVE */
#endif /* _GNU_SOURCE */
#ifndef __USE_UNIX98
-#define __USE_UNIX98 1 /* need for PTHREAD_MUTEX_RECURSIVE, on SLES11.1 with gcc 4.3.4 wherein pthread.h missing dependency on __USE_XOPEN2K8 */
+#define __USE_UNIX98 \
+  1 /* needed for PTHREAD_MUTEX_RECURSIVE; on SLES11.1 with gcc 4.3.4, \
+       pthread.h is missing a dependency on __USE_XOPEN2K8 */
#endif /*__USE_UNIX98*/
#include <pthread.h>
-typedef void* lib_t;
-typedef pthread_t TIDT;
-typedef pthread_mutex_t mutex_t;
+typedef void *lib_t;
+typedef pthread_t TIDT;
+typedef pthread_mutex_t mutex_t;
#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
-#define _strong_alias(name, aliasname) \
- extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define _strong_alias(name, aliasname) \
+ extern __typeof(name) aliasname __attribute__((alias(#name)));
#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_get_proc(lib, name) GetProcAddress(lib, name)
-#define __itt_mutex_init(mutex) InitializeCriticalSection(mutex)
-#define __itt_mutex_lock(mutex) EnterCriticalSection(mutex)
+#define __itt_mutex_init(mutex) InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex) EnterCriticalSection(mutex)
#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
-#define __itt_load_lib(name) LoadLibraryA(name)
-#define __itt_unload_lib(handle) FreeLibrary(handle)
-#define __itt_system_error() (int)GetLastError()
-#define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2)
-#define __itt_fstrnlen(s, l) strnlen_s(s, l)
+#define __itt_load_lib(name) LoadLibraryA(name)
+#define __itt_unload_lib(handle) FreeLibrary(handle)
+#define __itt_system_error() (int)GetLastError()
+#define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2)
+#define __itt_fstrnlen(s, l) strnlen_s(s, l)
#define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l)
-#define __itt_fstrdup(s) _strdup(s)
-#define __itt_thread_id() GetCurrentThreadId()
-#define __itt_thread_yield() SwitchToThread()
+#define __itt_fstrdup(s) _strdup(s)
+#define __itt_thread_id() GetCurrentThreadId()
+#define __itt_thread_yield() SwitchToThread()
#ifndef ITT_SIMPLE_INIT
ITT_INLINE long
-__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
-ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
-{
- return InterlockedIncrement(ptr);
+__itt_interlocked_increment(volatile long *ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long *ptr) {
+ return InterlockedIncrement(ptr);
}
#endif /* ITT_SIMPLE_INIT */
@@ -276,38 +281,39 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
#define __itt_get_proc(lib, name) dlsym(lib, name)
-#define __itt_mutex_init(mutex) {\
- pthread_mutexattr_t mutex_attr; \
- int error_code = pthread_mutexattr_init(&mutex_attr); \
- if (error_code) \
- __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \
- error_code); \
- error_code = pthread_mutexattr_settype(&mutex_attr, \
- PTHREAD_MUTEX_RECURSIVE); \
- if (error_code) \
- __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
- error_code); \
- error_code = pthread_mutex_init(mutex, &mutex_attr); \
- if (error_code) \
- __itt_report_error(__itt_error_system, "pthread_mutex_init", \
- error_code); \
- error_code = pthread_mutexattr_destroy(&mutex_attr); \
- if (error_code) \
- __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
- error_code); \
-}
-#define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex)
+#define __itt_mutex_init(mutex) \
+ { \
+ pthread_mutexattr_t mutex_attr; \
+ int error_code = pthread_mutexattr_init(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \
+ error_code); \
+ error_code = \
+ pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+ error_code); \
+ error_code = pthread_mutex_init(mutex, &mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutex_init", \
+ error_code); \
+ error_code = pthread_mutexattr_destroy(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+ error_code); \
+ }
+#define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex)
#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
-#define __itt_load_lib(name) dlopen(name, RTLD_LAZY)
-#define __itt_unload_lib(handle) dlclose(handle)
-#define __itt_system_error() errno
-#define __itt_fstrcmp(s1, s2) strcmp(s1, s2)
+#define __itt_load_lib(name) dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle) dlclose(handle)
+#define __itt_system_error() errno
+#define __itt_fstrcmp(s1, s2) strcmp(s1, s2)
/* lets customer code supply safe string APIs via SDL_STRNLEN_S and SDL_STRNCPY_S */
#ifdef SDL_STRNLEN_S
-#define __itt_fstrnlen(s, l) SDL_STRNLEN_S(s, l)
+#define __itt_fstrnlen(s, l) SDL_STRNLEN_S(s, l)
#else
-#define __itt_fstrnlen(s, l) strlen(s)
+#define __itt_fstrnlen(s, l) strlen(s)
#endif /* SDL_STRNLEN_S */
#ifdef SDL_STRNCPY_S
#define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l)
@@ -315,26 +321,26 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, l)
#endif /* SDL_STRNCPY_S */
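Per the comment above, consumers can route these string helpers to bounded implementations before including the header; a hedged sketch assuming the optional C11 Annex K functions are available:

#define SDL_STRNLEN_S(s, l) strnlen_s((s), (l))
#define SDL_STRNCPY_S(s1, b, s2, l) strncpy_s((s1), (b), (s2), (l))
#include "ittnotify_config.h"
/* __itt_fstrnlen/__itt_fstrcpyn now expand to the bounded variants. */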
-#define __itt_fstrdup(s) strdup(s)
-#define __itt_thread_id() pthread_self()
-#define __itt_thread_yield() sched_yield()
-#if ITT_ARCH==ITT_ARCH_IA64
+#define __itt_fstrdup(s) strdup(s)
+#define __itt_thread_id() pthread_self()
+#define __itt_thread_yield() sched_yield()
+#if ITT_ARCH == ITT_ARCH_IA64
#ifdef __INTEL_COMPILER
#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
-#else /* __INTEL_COMPILER */
+#else /* __INTEL_COMPILER */
/* TODO: Add support for non-Intel compilers for the IA-64 architecture */
#endif /* __INTEL_COMPILER */
-#elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */
-ITT_INLINE long
-__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
-ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
-{
- long result;
- __asm__ __volatile__("lock\nxadd %0,%1"
- : "=r"(result),"=m"(*(volatile int*)ptr)
- : "0"(addend), "m"(*(volatile int*)ptr)
- : "memory");
- return result;
+#elif ITT_ARCH == ITT_ARCH_IA32 || \
+ ITT_ARCH == ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void *ptr,
+ long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void *ptr, long addend) {
+ long result;
+ __asm__ __volatile__("lock\nxadd %0,%1"
+ : "=r"(result), "=m"(*(volatile int *)ptr)
+ : "0"(addend), "m"(*(volatile int *)ptr)
+ : "memory");
+ return result;
}
#elif ITT_ARCH == ITT_ARCH_ARM || ITT_ARCH == ITT_ARCH_PPC64 || \
ITT_ARCH == ITT_ARCH_AARCH64 || ITT_ARCH == ITT_ARCH_MIPS || \
@@ -343,253 +349,259 @@ ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
#endif /* ITT_ARCH==ITT_ARCH_IA64 */
#ifndef ITT_SIMPLE_INIT
ITT_INLINE long
-__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
-ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
-{
- return __TBB_machine_fetchadd4(ptr, 1) + 1L;
+__itt_interlocked_increment(volatile long *ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long *ptr) {
+ return __TBB_machine_fetchadd4(ptr, 1) + 1L;
}
#endif /* ITT_SIMPLE_INIT */
-void* dlopen(const char*, int) __attribute__((weak));
-void* dlsym(void*, const char*) __attribute__((weak));
-int dlclose(void*) __attribute__((weak));
+void *dlopen(const char *, int) __attribute__((weak));
+void *dlsym(void *, const char *) __attribute__((weak));
+int dlclose(void *) __attribute__((weak));
#define DL_SYMBOLS (dlopen && dlsym && dlclose)
-int pthread_mutex_init(pthread_mutex_t*, const pthread_mutexattr_t*) __attribute__((weak));
-int pthread_mutex_lock(pthread_mutex_t*) __attribute__((weak));
-int pthread_mutex_unlock(pthread_mutex_t*) __attribute__((weak));
-int pthread_mutex_destroy(pthread_mutex_t*) __attribute__((weak));
-int pthread_mutexattr_init(pthread_mutexattr_t*) __attribute__((weak));
-int pthread_mutexattr_settype(pthread_mutexattr_t*, int) __attribute__((weak));
-int pthread_mutexattr_destroy(pthread_mutexattr_t*) __attribute__((weak));
+int pthread_mutex_init(pthread_mutex_t *, const pthread_mutexattr_t *)
+ __attribute__((weak));
+int pthread_mutex_lock(pthread_mutex_t *) __attribute__((weak));
+int pthread_mutex_unlock(pthread_mutex_t *) __attribute__((weak));
+int pthread_mutex_destroy(pthread_mutex_t *) __attribute__((weak));
+int pthread_mutexattr_init(pthread_mutexattr_t *) __attribute__((weak));
+int pthread_mutexattr_settype(pthread_mutexattr_t *, int) __attribute__((weak));
+int pthread_mutexattr_destroy(pthread_mutexattr_t *) __attribute__((weak));
pthread_t pthread_self(void) __attribute__((weak));
-#define PTHREAD_SYMBOLS (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && pthread_mutex_destroy && pthread_mutexattr_init && pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self)
+#define PTHREAD_SYMBOLS \
+ (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && \
+ pthread_mutex_destroy && pthread_mutexattr_init && \
+ pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
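The weak declarations in the POSIX branch above let a static build link even when -lpthread or -ldl is absent; callers are expected to gate on the guard macros. A hedged sketch of that pattern:

static mutex_t demo_mutex = MUTEX_INITIALIZER;

static void guarded_section(void) {
  if (PTHREAD_SYMBOLS) { /* true only if the weak pthread refs resolved */
    __itt_mutex_lock(&demo_mutex);
    /* ... critical section ... */
    __itt_mutex_unlock(&demo_mutex);
  }
}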
typedef enum {
- __itt_collection_normal = 0,
- __itt_collection_paused = 1
+ __itt_collection_normal = 0,
+ __itt_collection_paused = 1
} __itt_collection_state;
typedef enum {
- __itt_thread_normal = 0,
- __itt_thread_ignored = 1
+ __itt_thread_normal = 0,
+ __itt_thread_ignored = 1
} __itt_thread_state;
#pragma pack(push, 8)
-typedef struct ___itt_thread_info
-{
- const char* nameA; /*!< Copy of original name in ASCII. */
+typedef struct ___itt_thread_info {
+ const char *nameA; /*!< Copy of original name in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* nameW;
+ const wchar_t *nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void *nameW;
#endif /* UNICODE || _UNICODE */
- TIDT tid;
- __itt_thread_state state; /*!< Thread state (paused or normal) */
- int extra1; /*!< Reserved to the runtime */
- void* extra2; /*!< Reserved to the runtime */
- struct ___itt_thread_info* next;
+ TIDT tid;
+ __itt_thread_state state; /*!< Thread state (paused or normal) */
+ int extra1; /*!< Reserved to the runtime */
+ void *extra2; /*!< Reserved to the runtime */
+ struct ___itt_thread_info *next;
} __itt_thread_info;
#include "ittnotify_types.h" /* For __itt_group_id definition */
-typedef struct ___itt_api_info_20101001
-{
- const char* name;
- void** func_ptr;
- void* init_func;
- __itt_group_id group;
-} __itt_api_info_20101001;
-
-typedef struct ___itt_api_info
-{
- const char* name;
- void** func_ptr;
- void* init_func;
- void* null_func;
- __itt_group_id group;
-} __itt_api_info;
-
-typedef struct __itt_counter_info
-{
- const char* nameA; /*!< Copy of original name in ASCII. */
+typedef struct ___itt_api_info_20101001 {
+ const char *name;
+ void **func_ptr;
+ void *init_func;
+ __itt_group_id group;
+} __itt_api_info_20101001;
+
+typedef struct ___itt_api_info {
+ const char *name;
+ void **func_ptr;
+ void *init_func;
+ void *null_func;
+ __itt_group_id group;
+} __itt_api_info;
+
+typedef struct __itt_counter_info {
+ const char *nameA; /*!< Copy of original name in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* nameW;
+ const wchar_t *nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void *nameW;
#endif /* UNICODE || _UNICODE */
- const char* domainA; /*!< Copy of original name in ASCII. */
+ const char *domainA; /*!< Copy of original name in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* domainW; /*!< Copy of original name in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* domainW;
+ const wchar_t *domainW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void *domainW;
#endif /* UNICODE || _UNICODE */
- int type;
- long index;
- int extra1; /*!< Reserved to the runtime */
- void* extra2; /*!< Reserved to the runtime */
- struct __itt_counter_info* next;
-} __itt_counter_info_t;
+ int type;
+ long index;
+ int extra1; /*!< Reserved to the runtime */
+ void *extra2; /*!< Reserved to the runtime */
+ struct __itt_counter_info *next;
+} __itt_counter_info_t;
struct ___itt_domain;
struct ___itt_string_handle;
-typedef struct ___itt_global
-{
- unsigned char magic[8];
- unsigned long version_major;
- unsigned long version_minor;
- unsigned long version_build;
- volatile long api_initialized;
- volatile long mutex_initialized;
- volatile long atomic_counter;
- mutex_t mutex;
- lib_t lib;
- void* error_handler;
- const char** dll_path_ptr;
- __itt_api_info* api_list_ptr;
- struct ___itt_global* next;
- /* Joinable structures below */
- __itt_thread_info* thread_list;
- struct ___itt_domain* domain_list;
- struct ___itt_string_handle* string_list;
- __itt_collection_state state;
- __itt_counter_info_t* counter_list;
+typedef struct ___itt_global {
+ unsigned char magic[8];
+ unsigned long version_major;
+ unsigned long version_minor;
+ unsigned long version_build;
+ volatile long api_initialized;
+ volatile long mutex_initialized;
+ volatile long atomic_counter;
+ mutex_t mutex;
+ lib_t lib;
+ void *error_handler;
+ const char **dll_path_ptr;
+ __itt_api_info *api_list_ptr;
+ struct ___itt_global *next;
+ /* Joinable structures below */
+ __itt_thread_info *thread_list;
+ struct ___itt_domain *domain_list;
+ struct ___itt_string_handle *string_list;
+ __itt_collection_state state;
+ __itt_counter_info_t *counter_list;
} __itt_global;
#pragma pack(pop)
-#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \
- h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
- if (h != NULL) { \
- h->tid = t; \
- h->nameA = NULL; \
- h->nameW = n ? _wcsdup(n) : NULL; \
- h->state = s; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->thread_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \
- h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
- if (h != NULL) { \
- h->tid = t; \
- h->nameA = n ? __itt_fstrdup(n) : NULL; \
- h->nameW = NULL; \
- h->state = s; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->thread_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_DOMAIN_W(gptr,h,h_tail,name) { \
- h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
- if (h != NULL) { \
- h->flags = 1; /* domain is enabled by default */ \
- h->nameA = NULL; \
- h->nameW = name ? _wcsdup(name) : NULL; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->domain_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_DOMAIN_A(gptr,h,h_tail,name) { \
- h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
- if (h != NULL) { \
- h->flags = 1; /* domain is enabled by default */ \
- h->nameA = name ? __itt_fstrdup(name) : NULL; \
- h->nameW = NULL; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->domain_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \
- h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
- if (h != NULL) { \
- h->strA = NULL; \
- h->strW = name ? _wcsdup(name) : NULL; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->string_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
- h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
- if (h != NULL) { \
- h->strA = name ? __itt_fstrdup(name) : NULL; \
- h->strW = NULL; \
- h->extra1 = 0; /* reserved */ \
- h->extra2 = NULL; /* reserved */ \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->string_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_COUNTER_W(gptr,h,h_tail,name,domain,type) { \
- h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
- if (h != NULL) { \
- h->nameA = NULL; \
- h->nameW = name ? _wcsdup(name) : NULL; \
- h->domainA = NULL; \
- h->domainW = name ? _wcsdup(domain) : NULL; \
- h->type = type; \
- h->index = 0; \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->counter_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
-
-#define NEW_COUNTER_A(gptr,h,h_tail,name,domain,type) { \
- h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
- if (h != NULL) { \
- h->nameA = name ? __itt_fstrdup(name) : NULL; \
- h->nameW = NULL; \
- h->domainA = domain ? __itt_fstrdup(domain) : NULL; \
- h->domainW = NULL; \
- h->type = type; \
- h->index = 0; \
- h->next = NULL; \
- if (h_tail == NULL) \
- (gptr)->counter_list = h; \
- else \
- h_tail->next = h; \
- } \
-}
+#define NEW_THREAD_INFO_W(gptr, h, h_tail, t, s, n) \
+ { \
+ h = (__itt_thread_info *)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = NULL; \
+ h->nameW = n ? _wcsdup(n) : NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_THREAD_INFO_A(gptr, h, h_tail, t, s, n) \
+ { \
+ h = (__itt_thread_info *)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = n ? __itt_fstrdup(n) : NULL; \
+ h->nameW = NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_DOMAIN_W(gptr, h, h_tail, name) \
+ { \
+ h = (__itt_domain *)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 1; /* domain is enabled by default */ \
+ h->nameA = NULL; \
+ h->nameW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_DOMAIN_A(gptr, h, h_tail, name) \
+ { \
+ h = (__itt_domain *)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 1; /* domain is enabled by default */ \
+ h->nameA = name ? __itt_fstrdup(name) : NULL; \
+ h->nameW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_STRING_HANDLE_W(gptr, h, h_tail, name) \
+ { \
+ h = (__itt_string_handle *)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = NULL; \
+ h->strW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_STRING_HANDLE_A(gptr, h, h_tail, name) \
+ { \
+ h = (__itt_string_handle *)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = name ? __itt_fstrdup(name) : NULL; \
+ h->strW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_COUNTER_W(gptr, h, h_tail, name, domain, type) \
+ { \
+ h = (__itt_counter_info_t *)malloc(sizeof(__itt_counter_info_t)); \
+ if (h != NULL) { \
+ h->nameA = NULL; \
+ h->nameW = name ? _wcsdup(name) : NULL; \
+ h->domainA = NULL; \
+      h->domainW = domain ? _wcsdup(domain) : NULL;                          \
+ h->type = type; \
+ h->index = 0; \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->counter_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
+
+#define NEW_COUNTER_A(gptr, h, h_tail, name, domain, type) \
+ { \
+ h = (__itt_counter_info_t *)malloc(sizeof(__itt_counter_info_t)); \
+ if (h != NULL) { \
+ h->nameA = name ? __itt_fstrdup(name) : NULL; \
+ h->nameW = NULL; \
+ h->domainA = domain ? __itt_fstrdup(domain) : NULL; \
+ h->domainW = NULL; \
+ h->type = type; \
+ h->index = 0; \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->counter_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+ }
#endif /* _ITTNOTIFY_CONFIG_H_ */
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp
index 4936b9baaf80..eae3c7615cd7 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp
@@ -11,7 +11,7 @@
#include "kmp_os.h"
#include "ittnotify_config.h"
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#if defined(__MINGW32__)
#include <limits.h>
#else
@@ -38,14 +38,14 @@
static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 481659 $\n";
-#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, n)
-#if ITT_OS==ITT_OS_WIN
-static const char* ittnotify_lib_name = "libittnotify.dll";
-#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD
-static const char* ittnotify_lib_name = "libittnotify.so";
-#elif ITT_OS==ITT_OS_MAC
-static const char* ittnotify_lib_name = "libittnotify.dylib";
+#if ITT_OS == ITT_OS_WIN
+static const char *ittnotify_lib_name = "libittnotify.dll";
+#elif ITT_OS == ITT_OS_LINUX || ITT_OS == ITT_OS_FREEBSD
+static const char *ittnotify_lib_name = "libittnotify.so";
+#elif ITT_OS == ITT_OS_MAC
+static const char *ittnotify_lib_name = "libittnotify.dylib";
#else
#error Unsupported or unknown OS.
#endif
@@ -60,26 +60,34 @@ static const char* ittnotify_lib_name = "libittnotify.dylib";
#include <linux/limits.h>
#ifdef ITT_ANDROID_LOG
- #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI"
- #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
- #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
- #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
- #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+#define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI"
+#define ITT_ANDROID_LOGI(...) \
+ ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, \
+ __VA_ARGS__))
+#define ITT_ANDROID_LOGW(...) \
+ ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, \
+ __VA_ARGS__))
+#define ITT_ANDROID_LOGE(...) \
+ ((void)__android_log_print(ANDROID_LOG_ERROR, ITT_ANDROID_LOG_TAG, \
+ __VA_ARGS__))
+#define ITT_ANDROID_LOGD(...) \
+ ((void)__android_log_print(ANDROID_LOG_DEBUG, ITT_ANDROID_LOG_TAG, \
+ __VA_ARGS__))
#else
- #define ITT_ANDROID_LOGI(...)
- #define ITT_ANDROID_LOGW(...)
- #define ITT_ANDROID_LOGE(...)
- #define ITT_ANDROID_LOGD(...)
+#define ITT_ANDROID_LOGI(...)
+#define ITT_ANDROID_LOGW(...)
+#define ITT_ANDROID_LOGE(...)
+#define ITT_ANDROID_LOGD(...)
#endif
/* default location of userapi collector on Android */
-#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) "/data/data/com.intel.vtune/perfrun/lib" \
- #x "/runtime/libittnotify.so"
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) \
+ "/data/data/com.intel.vtune/perfrun/lib" #x "/runtime/libittnotify.so"
-#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
-#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32)
+#if ITT_ARCH == ITT_ARCH_IA32 || ITT_ARCH == ITT_ARCH_ARM
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32)
#else
-#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64)
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64)
#endif
#endif
@@ -88,94 +96,99 @@ static const char* ittnotify_lib_name = "libittnotify.dylib";
#define PATH_MAX 4096
#endif
-
#ifndef LIB_VAR_NAME
-#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_MIPS
+#if ITT_ARCH == ITT_ARCH_IA32 || ITT_ARCH == ITT_ARCH_ARM || \
+ ITT_ARCH == ITT_ARCH_MIPS
#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32
#else
#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64
#endif
#endif /* LIB_VAR_NAME */
-#define ITT_MUTEX_INIT_AND_LOCK(p) { \
- if (PTHREAD_SYMBOLS) \
- { \
- if (!p.mutex_initialized) \
- { \
- if (__itt_interlocked_increment(&p.atomic_counter) == 1) \
- { \
- __itt_mutex_init(&p.mutex); \
- p.mutex_initialized = 1; \
- } \
- else \
- while (!p.mutex_initialized) \
- __itt_thread_yield(); \
- } \
- __itt_mutex_lock(&p.mutex); \
- } \
-}
-
-typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id);
+#define ITT_MUTEX_INIT_AND_LOCK(p) \
+ { \
+ if (PTHREAD_SYMBOLS) { \
+ if (!p.mutex_initialized) { \
+ if (__itt_interlocked_increment(&p.atomic_counter) == 1) { \
+ __itt_mutex_init(&p.mutex); \
+ p.mutex_initialized = 1; \
+ } else \
+ while (!p.mutex_initialized) \
+ __itt_thread_yield(); \
+ } \
+ __itt_mutex_lock(&p.mutex); \
+ } \
+ }
+
+typedef int(__itt_init_ittlib_t)(const char *, __itt_group_id);
/* this define is used to control the initialization function name. */
#ifndef __itt_init_ittlib_name
-ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id);
-static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib);
+ITT_EXTERN_C int _N_(init_ittlib)(const char *, __itt_group_id);
+static __itt_init_ittlib_t *__itt_init_ittlib_ptr = _N_(init_ittlib);
#define __itt_init_ittlib_name __itt_init_ittlib_ptr
#endif /* __itt_init_ittlib_name */
-typedef void (__itt_fini_ittlib_t)(void);
+typedef void(__itt_fini_ittlib_t)(void);
/* this define is used to control the finalization function name. */
#ifndef __itt_fini_ittlib_name
ITT_EXTERN_C void _N_(fini_ittlib)(void);
-static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib);
+static __itt_fini_ittlib_t *__itt_fini_ittlib_ptr = _N_(fini_ittlib);
#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr
#endif /* __itt_fini_ittlib_name */
/* building pointers to imported funcs */
#undef ITT_STUBV
#undef ITT_STUB
-#define ITT_STUB(api,type,name,args,params,ptr,group,format) \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
-typedef type api ITT_JOIN(_N_(name),_t) args; \
-ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
-{ \
- __itt_init_ittlib_name(NULL, __itt_group_all); \
- if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
- return ITTNOTIFY_NAME(name) params; \
- else \
- return (type)0; \
-}
-
-#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
-typedef type api ITT_JOIN(_N_(name),_t) args; \
-ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
-{ \
- __itt_init_ittlib_name(NULL, __itt_group_all); \
- if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
- ITTNOTIFY_NAME(name) params; \
- else \
- return; \
-}
+#define ITT_STUB(api, type, name, args, params, ptr, group, format) \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args; \
+ typedef type api ITT_JOIN(_N_(name), _t) args; \
+ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name), _t) * ITTNOTIFY_NAME(name) = \
+ ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)); \
+ ITT_EXTERN_C_END \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args { \
+ __itt_init_ittlib_name(NULL, __itt_group_all); \
+ if (ITTNOTIFY_NAME(name) && \
+ ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init))) \
+ return ITTNOTIFY_NAME(name) params; \
+ else \
+ return (type)0; \
+ }
+
+#define ITT_STUBV(api, type, name, args, params, ptr, group, format) \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args; \
+ typedef type api ITT_JOIN(_N_(name), _t) args; \
+ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name), _t) * ITTNOTIFY_NAME(name) = \
+ ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)); \
+ ITT_EXTERN_C_END \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args { \
+ __itt_init_ittlib_name(NULL, __itt_group_all); \
+ if (ITTNOTIFY_NAME(name) && \
+ ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init))) \
+ ITTNOTIFY_NAME(name) params; \
+ else \
+ return; \
+ }
#undef __ITT_INTERNAL_INIT
#include "ittnotify_static.h"
#undef ITT_STUB
#undef ITT_STUBV
-#define ITT_STUB(api,type,name,args,params,ptr,group,format) \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
-typedef type api ITT_JOIN(_N_(name),_t) args; \
-ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
-
-#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \
-static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
-typedef type api ITT_JOIN(_N_(name),_t) args; \
-ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+#define ITT_STUB(api, type, name, args, params, ptr, group, format) \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args; \
+ typedef type api ITT_JOIN(_N_(name), _t) args; \
+ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name), _t) * ITTNOTIFY_NAME(name) = \
+ ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)); \
+ ITT_EXTERN_C_END
+
+#define ITT_STUBV(api, type, name, args, params, ptr, group, format) \
+ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)) args; \
+ typedef type api ITT_JOIN(_N_(name), _t) args; \
+ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name), _t) * ITTNOTIFY_NAME(name) = \
+ ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)); \
+ ITT_EXTERN_C_END
#define __ITT_INTERNAL_INIT
#include "ittnotify_static.h"
@@ -185,31 +198,44 @@ ITT_GROUP_LIST(group_list);
#pragma pack(push, 8)
-typedef struct ___itt_group_alias
-{
- const char* env_var;
- __itt_group_id groups;
+typedef struct ___itt_group_alias {
+ const char *env_var;
+ __itt_group_id groups;
} __itt_group_alias;
static __itt_group_alias group_alias[] = {
- { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_mark) },
- { "KMP_FOR_TCHECK", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) },
- { NULL, (__itt_group_none) },
- { api_version, (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! */
+ {"KMP_FOR_TPROFILE",
+ (__itt_group_id)(__itt_group_control | __itt_group_thread |
+ __itt_group_sync | __itt_group_mark)},
+ {"KMP_FOR_TCHECK",
+ (__itt_group_id)(__itt_group_control | __itt_group_thread |
+ __itt_group_sync | __itt_group_fsync | __itt_group_mark |
+ __itt_group_suppress)},
+ {NULL, (__itt_group_none)},
+ {api_version,
+ (__itt_group_none)} /* !!! Just to avoid unused code elimination !!! */
};
#pragma pack(pop)
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+// clang-format off
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(push)
-#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function
+ pointer 'XXX' to data pointer 'void *' */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+// clang-format on
static __itt_api_info api_list[] = {
/* Define functions with static implementation */
#undef ITT_STUB
#undef ITT_STUBV
-#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)},
+#define ITT_STUB(api, type, name, args, params, nameindll, group, format) \
+ {ITT_TO_STR(ITT_JOIN(__itt_, nameindll)), \
+ (void **)(void *)&ITTNOTIFY_NAME(name), \
+ (void *)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)), \
+ (void *)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)), \
+ (__itt_group_id)(group)},
#define ITT_STUBV ITT_STUB
#define __ITT_INTERNAL_INIT
#include "ittnotify_static.h"
@@ -217,38 +243,43 @@ static __itt_api_info api_list[] = {
/* Define functions without static implementation */
#undef ITT_STUB
#undef ITT_STUBV
-#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)},
+#define ITT_STUB(api, type, name, args, params, nameindll, group, format) \
+ {ITT_TO_STR(ITT_JOIN(__itt_, nameindll)), \
+ (void **)(void *)&ITTNOTIFY_NAME(name), \
+ (void *)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name), _init)), NULL, \
+ (__itt_group_id)(group)},
#define ITT_STUBV ITT_STUB
#include "ittnotify_static.h"
- {NULL, NULL, NULL, NULL, __itt_group_none}
-};
+ {NULL, NULL, NULL, NULL, __itt_group_none}};
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(pop)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/* static part descriptor which handles all notification api attributes. */
__itt_global _N_(_ittapi_global) = {
- ITT_MAGIC, /* identification info */
- ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD, /* version info */
- 0, /* api_initialized */
- 0, /* mutex_initialized */
- 0, /* atomic_counter */
- MUTEX_INITIALIZER, /* mutex */
- NULL, /* dynamic library handle */
- NULL, /* error_handler */
- NULL, /* dll_path_ptr */
- (__itt_api_info*)&api_list, /* api_list_ptr */
- NULL, /* next __itt_global */
- NULL, /* thread_list */
- NULL, /* domain_list */
- NULL, /* string_list */
- __itt_collection_normal, /* collection state */
- NULL /* counter_list */
+ ITT_MAGIC, /* identification info */
+ ITT_MAJOR,
+ ITT_MINOR,
+ API_VERSION_BUILD, /* version info */
+ 0, /* api_initialized */
+ 0, /* mutex_initialized */
+ 0, /* atomic_counter */
+ MUTEX_INITIALIZER, /* mutex */
+ NULL, /* dynamic library handle */
+ NULL, /* error_handler */
+ NULL, /* dll_path_ptr */
+ (__itt_api_info *)&api_list, /* api_list_ptr */
+ NULL, /* next __itt_global */
+ NULL, /* thread_list */
+ NULL, /* domain_list */
+ NULL, /* string_list */
+ __itt_collection_normal, /* collection state */
+ NULL /* counter_list */
};
-typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id);
-typedef void (__itt_api_fini_t)(__itt_global*);
+typedef void(__itt_api_init_t)(__itt_global *, __itt_group_id);
+typedef void(__itt_api_fini_t)(__itt_global *);
/* ========================================================================= */
@@ -256,508 +287,526 @@ typedef void (__itt_api_fini_t)(__itt_global*);
ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args);
#endif /* ITT_NOTIFY_EXT_REPORT */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+// clang-format off
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(push)
-#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer
+ 'void *' to function pointer 'XXX' */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-static void __itt_report_error(unsigned code_arg, ...)
-{
- va_list args;
- va_start(args, code_arg);
-
- // We use unsigned for the code argument and explicitly cast it here to the
- // right enumerator because variadic functions are not compatible with
- // default promotions.
- __itt_error_code code = (__itt_error_code)code_arg;
-
- if (_N_(_ittapi_global).error_handler != NULL)
- {
- __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
- handler(code, args);
- }
+// clang-format on
+
+static void __itt_report_error(unsigned code_arg, ...) {
+ va_list args;
+ va_start(args, code_arg);
+
+ // We use unsigned for the code argument and explicitly cast it here to the
+ // right enumerator because variadic functions are not compatible with
+ // default promotions.
+ __itt_error_code code = (__itt_error_code)code_arg;
+
+ if (_N_(_ittapi_global).error_handler != NULL) {
+ __itt_error_handler_t *handler =
+ (__itt_error_handler_t *)(size_t)_N_(_ittapi_global).error_handler;
+ handler(code, args);
+ }
#ifdef ITT_NOTIFY_EXT_REPORT
- _N_(error_handler)(code, args);
+ _N_(error_handler)(code, args);
#endif /* ITT_NOTIFY_EXT_REPORT */
- va_end(args);
+ va_end(args);
}
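The promotion comment above deserves a concrete illustration: C requires that the parameter named in va_start() have a type that the default argument promotions leave unchanged, which an enum (like char, short, or float) may not be; hence the unsigned parameter plus an explicit cast back to __itt_error_code. A minimal sketch with a hypothetical err_code enum:

    #include <stdarg.h>

    typedef enum { ERR_OK, ERR_BAD } err_code;

    /* Taking `unsigned` (a promotion-stable type) as the parameter that
     * va_start() names, then casting back, mirrors __itt_report_error. */
    static void report(unsigned code_arg, ...) {
      va_list ap;
      va_start(ap, code_arg);
      err_code code = (err_code)code_arg; /* recover the enumerator */
      (void)code; /* a real handler would dispatch on `code` */
      va_end(ap);
    }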
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(pop)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name)
-{
- __itt_domain *h_tail = NULL, *h = NULL;
-
- if (name == NULL)
- {
- return NULL;
- }
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static __itt_domain *ITTAPI
+ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW), _init))(const wchar_t *name) {
+ __itt_domain *h_tail = NULL, *h = NULL;
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
- if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(domain_createW)(name);
- }
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameW != NULL && !wcscmp(h->nameW, name)) break;
- }
- if (h == NULL)
- {
- NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name);
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+ if (ITTNOTIFY_NAME(domain_createW) &&
+ ITTNOTIFY_NAME(domain_createW) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_createW)(name);
}
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return h;
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameW != NULL && !wcscmp(h->nameW, name))
+ break;
+ }
+ if (h == NULL) {
+ NEW_DOMAIN_W(&_N_(_ittapi_global), h, h_tail, name);
+ }
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
}
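The body above follows a pattern repeated throughout this file: forward to the collector's implementation if one is bound, otherwise do a mutex-protected find-or-create over an intrusive singly-linked list keyed by name. A condensed sketch of just the list walk, using pthreads and hypothetical types for brevity (the real code uses the __itt_mutex_* wrappers, tracks a tail pointer, and allocates through NEW_DOMAIN_W):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct node {
      const char *name;
      struct node *next;
    } node;

    static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;
    static node *list_head = NULL;

    static node *find_or_create(const char *name) {
      node *n;
      if (name == NULL)
        return NULL;
      pthread_mutex_lock(&list_mutex);
      for (n = list_head; n != NULL; n = n->next)
        if (!strcmp(n->name, name))
          break; /* a duplicate create returns the existing handle */
      if (n == NULL) { /* not found: prepend a new entry */
        n = (node *)malloc(sizeof(node));
        if (n != NULL) {
          n->name = strdup(name);
          n->next = list_head;
          list_head = n;
        }
      }
      pthread_mutex_unlock(&list_mutex);
      return n;
    }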
-static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name)
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name)
+static __itt_domain *ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),
+ _init))(const char *name)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_domain *ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),
+ _init))(const char *name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
- __itt_domain *h_tail = NULL, *h = NULL;
+ __itt_domain *h_tail = NULL, *h = NULL;
- if (name == NULL)
- {
- return NULL;
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(domain_createA) &&
+ ITTNOTIFY_NAME(domain_createA) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_createA)(name);
}
-
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(domain_createA)(name);
- }
#else
- if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init)))
- {
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(domain_create)(name);
- }
-#endif
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break;
- }
- if (h == NULL)
- {
- NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name);
+ if (ITTNOTIFY_NAME(domain_create) &&
+ ITTNOTIFY_NAME(domain_create) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create), _init))) {
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(domain_create)(name);
}
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return h;
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name))
+ break;
+ }
+ if (h == NULL) {
+ NEW_DOMAIN_A(&_N_(_ittapi_global), h, h_tail, name);
+ }
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name)
-{
- __itt_string_handle *h_tail = NULL, *h = NULL;
-
- if (name == NULL)
- {
- return NULL;
- }
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static __itt_string_handle *ITTAPI ITT_VERSIONIZE(
+ ITT_JOIN(_N_(string_handle_createW), _init))(const wchar_t *name) {
+ __itt_string_handle *h_tail = NULL, *h = NULL;
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
- if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(string_handle_createW)(name);
- }
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->strW != NULL && !wcscmp(h->strW, name)) break;
- }
- if (h == NULL)
- {
- NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name);
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+ if (ITTNOTIFY_NAME(string_handle_createW) &&
+ ITTNOTIFY_NAME(string_handle_createW) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_createW)(name);
}
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return h;
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->strW != NULL && !wcscmp(h->strW, name))
+ break;
+ }
+ if (h == NULL) {
+ NEW_STRING_HANDLE_W(&_N_(_ittapi_global), h, h_tail, name);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
}
-static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name)
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name)
+static __itt_string_handle *ITTAPI
+ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA), _init))(const char *name)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_string_handle *ITTAPI
+ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create), _init))(const char *name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
- __itt_string_handle *h_tail = NULL, *h = NULL;
+ __itt_string_handle *h_tail = NULL, *h = NULL;
- if (name == NULL)
- {
- return NULL;
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(string_handle_createA) &&
+ ITTNOTIFY_NAME(string_handle_createA) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_createA)(name);
}
-
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(string_handle_createA)(name);
- }
#else
- if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init)))
- {
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(string_handle_create)(name);
- }
-#endif
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break;
+ if (ITTNOTIFY_NAME(string_handle_create) &&
+ ITTNOTIFY_NAME(string_handle_create) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create), _init))) {
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(string_handle_create)(name);
}
- if (h == NULL)
- {
- NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name);
- }
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return h;
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->strA != NULL && !__itt_fstrcmp(h->strA, name))
+ break;
+ }
+ if (h == NULL) {
+ NEW_STRING_HANDLE_A(&_N_(_ittapi_global), h, h_tail, name);
+ }
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return h;
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain)
-{
- __itt_counter_info_t *h_tail = NULL, *h = NULL;
- __itt_metadata_type type = __itt_metadata_u64;
-
- if (name == NULL)
- {
- return NULL;
- }
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(
+ _N_(counter_createW), _init))(const wchar_t *name, const wchar_t *domain) {
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ __itt_metadata_type type = __itt_metadata_u64;
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
- if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_createW)(name, domain);
- }
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
- (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
-
- }
- if (h == NULL)
- {
- NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+ if (ITTNOTIFY_NAME(counter_createW) &&
+ ITTNOTIFY_NAME(counter_createW) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_createW)(name, domain);
}
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return (__itt_counter)h;
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) &&
+ ((h->domainW == NULL && domain == NULL) ||
+ (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain))))
+ break;
+ }
+ if (h == NULL) {
+ NEW_COUNTER_W(&_N_(_ittapi_global), h, h_tail, name, domain, type);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
}
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain)
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain)
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),
+ _init))(const char *name,
+ const char *domain)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),
+ _init))(const char *name,
+ const char *domain)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
- __itt_counter_info_t *h_tail = NULL, *h = NULL;
- __itt_metadata_type type = __itt_metadata_u64;
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ __itt_metadata_type type = __itt_metadata_u64;
- if (name == NULL)
- {
- return NULL;
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(counter_createA) &&
+ ITTNOTIFY_NAME(counter_createA) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_createA)(name, domain);
}
-
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_createA)(name, domain);
- }
#else
- if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init)))
- {
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_create)(name, domain);
- }
-#endif
+ if (ITTNOTIFY_NAME(counter_create) &&
+ ITTNOTIFY_NAME(counter_create) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create), _init))) {
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create)(name, domain);
}
- for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
- (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
- }
- if (h == NULL)
- {
- NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
- }
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return (__itt_counter)h;
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) &&
+ ((h->domainA == NULL && domain == NULL) ||
+ (h->domainA != NULL && domain != NULL &&
+ !__itt_fstrcmp(h->domainA, domain))))
+ break;
+ }
+ if (h == NULL) {
+ NEW_COUNTER_A(&_N_(_ittapi_global), h, h_tail, name, domain, type);
+ }
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)
-{
- __itt_counter_info_t *h_tail = NULL, *h = NULL;
-
- if (name == NULL)
- {
- return NULL;
- }
-
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
- if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type);
- }
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
- (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),
+ _init))(
+ const wchar_t *name, const wchar_t *domain, __itt_metadata_type type) {
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+ if (ITTNOTIFY_NAME(counter_create_typedW) &&
+ ITTNOTIFY_NAME(counter_create_typedW) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type);
}
- if (h == NULL)
- {
- NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
- }
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return (__itt_counter)h;
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) &&
+ ((h->domainW == NULL && domain == NULL) ||
+ (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain))))
+ break;
+ }
+ if (h == NULL) {
+ NEW_COUNTER_W(&_N_(_ittapi_global), h, h_tail, name, domain, type);
+ }
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
}
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type)
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type)
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(
+ _N_(counter_create_typedA), _init))(const char *name, const char *domain,
+ __itt_metadata_type type)
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(
+ _N_(counter_create_typed), _init))(const char *name, const char *domain,
+ __itt_metadata_type type)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
- __itt_counter_info_t *h_tail = NULL, *h = NULL;
+ __itt_counter_info_t *h_tail = NULL, *h = NULL;
- if (name == NULL)
- {
- return NULL;
+ if (name == NULL) {
+ return NULL;
+ }
+
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(counter_create_typedA) &&
+ ITTNOTIFY_NAME(counter_create_typedA) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA), _init))) {
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type);
}
-
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init)))
- {
- __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type);
- }
#else
- if (ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init)))
- {
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type);
- }
-#endif
- }
- for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
- {
- if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
- (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
+ if (ITTNOTIFY_NAME(counter_create_typed) &&
+ ITTNOTIFY_NAME(counter_create_typed) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed), _init))) {
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type);
}
- if (h == NULL)
- {
- NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
- }
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
- return (__itt_counter)h;
+#endif
+ }
+ for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL;
+ h_tail = h, h = h->next) {
+ if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) &&
+ ((h->domainA == NULL && domain == NULL) ||
+ (h->domainA != NULL && domain != NULL &&
+ !__itt_fstrcmp(h->domainA, domain))))
+ break;
+ }
+ if (h == NULL) {
+ NEW_COUNTER_A(&_N_(_ittapi_global), h, h_tail, name, domain, type);
+ }
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ return (__itt_counter)h;
}
/* -------------------------------------------------------------------------- */
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void)
-{
- if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
- {
- __itt_init_ittlib_name(NULL, __itt_group_all);
- }
- if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init)))
- {
- ITTNOTIFY_NAME(pause)();
- }
- else
- {
- _N_(_ittapi_global).state = __itt_collection_paused;
- }
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause), _init))(void) {
+ if (!_N_(_ittapi_global).api_initialized &&
+ _N_(_ittapi_global).thread_list == NULL) {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(pause) &&
+ ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause), _init))) {
+ ITTNOTIFY_NAME(pause)();
+ } else {
+ _N_(_ittapi_global).state = __itt_collection_paused;
+ }
}
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void)
-{
- if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
- {
- __itt_init_ittlib_name(NULL, __itt_group_all);
- }
- if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init)))
- {
- ITTNOTIFY_NAME(resume)();
- }
- else
- {
- _N_(_ittapi_global).state = __itt_collection_normal;
- }
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume), _init))(void) {
+ if (!_N_(_ittapi_global).api_initialized &&
+ _N_(_ittapi_global).thread_list == NULL) {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(resume) &&
+ ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume), _init))) {
+ ITTNOTIFY_NAME(resume)();
+ } else {
+ _N_(_ittapi_global).state = __itt_collection_normal;
+ }
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name)
-{
- if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
- {
- __itt_init_ittlib_name(NULL, __itt_group_all);
- }
- if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init)))
- {
- ITTNOTIFY_NAME(thread_set_nameW)(name);
- }
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),
+ _init))(const wchar_t *name) {
+ if (!_N_(_ittapi_global).api_initialized &&
+ _N_(_ittapi_global).thread_list == NULL) {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(thread_set_nameW) &&
+ ITTNOTIFY_NAME(thread_set_nameW) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW), _init))) {
+ ITTNOTIFY_NAME(thread_set_nameW)(name);
+ }
}
-static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen)
-{
- (void)namelen;
- ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name);
- return 0;
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),
+ _init))(const wchar_t *name,
+ int namelen) {
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW), _init))(name);
+ return 0;
}
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name)
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),
+ _init))(const char *name)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),
+ _init))(const char *name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
- if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
- {
- __itt_init_ittlib_name(NULL, __itt_group_all);
- }
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init)))
- {
- ITTNOTIFY_NAME(thread_set_nameA)(name);
- }
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
- if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init)))
- {
- ITTNOTIFY_NAME(thread_set_name)(name);
- }
+ if (!_N_(_ittapi_global).api_initialized &&
+ _N_(_ittapi_global).thread_list == NULL) {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ if (ITTNOTIFY_NAME(thread_set_nameA) &&
+ ITTNOTIFY_NAME(thread_set_nameA) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA), _init))) {
+ ITTNOTIFY_NAME(thread_set_nameA)(name);
+ }
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ if (ITTNOTIFY_NAME(thread_set_name) &&
+ ITTNOTIFY_NAME(thread_set_name) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name), _init))) {
+ ITTNOTIFY_NAME(thread_set_name)(name);
+ }
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen)
-{
- (void)namelen;
- ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name);
- return 0;
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),
+ _init))(const char *name,
+ int namelen) {
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA), _init))(name);
+ return 0;
}
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen)
-{
- (void)namelen;
- ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name);
- return 0;
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),
+ _init))(const char *name,
+ int namelen) {
+ (void)namelen;
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name), _init))(name);
+ return 0;
}
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void)
-{
- if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL)
- {
- __itt_init_ittlib_name(NULL, __itt_group_all);
- }
- if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init)))
- {
- ITTNOTIFY_NAME(thread_ignore)();
- }
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore), _init))(void) {
+ if (!_N_(_ittapi_global).api_initialized &&
+ _N_(_ittapi_global).thread_list == NULL) {
+ __itt_init_ittlib_name(NULL, __itt_group_all);
+ }
+ if (ITTNOTIFY_NAME(thread_ignore) &&
+ ITTNOTIFY_NAME(thread_ignore) !=
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore), _init))) {
+ ITTNOTIFY_NAME(thread_ignore)();
+ }
}
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void)
-{
- ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))();
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore), _init))(void) {
+ ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore), _init))();
}
-static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void)
-{
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach), _init))(void) {
#ifdef __ANDROID__
- /*
- * if LIB_VAR_NAME env variable were set before then stay previous value
- * else set default path
- */
- setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0);
+ /*
+   * If the LIB_VAR_NAME env variable was already set, keep the previous
+   * value; otherwise set the default path.
+ */
+ setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0);
#endif
}
/* -------------------------------------------------------------------------- */
-static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len)
-{
- int i;
- int j;
-
- if (!s || !sep || !out || !len)
- return NULL;
+static const char *__itt_fsplit(const char *s, const char *sep,
+ const char **out, int *len) {
+ int i;
+ int j;
- for (i = 0; s[i]; i++)
- {
- int b = 0;
- for (j = 0; sep[j]; j++)
- if (s[i] == sep[j])
- {
- b = 1;
- break;
- }
- if (!b)
- break;
- }
-
- if (!s[i])
- return NULL;
-
- *len = 0;
- *out = &s[i];
-
- for (; s[i]; i++, (*len)++)
- {
- int b = 0;
- for (j = 0; sep[j]; j++)
- if (s[i] == sep[j])
- {
- b = 1;
- break;
- }
- if (b)
- break;
- }
+ if (!s || !sep || !out || !len)
+ return NULL;
- for (; s[i]; i++)
- {
- int b = 0;
- for (j = 0; sep[j]; j++)
- if (s[i] == sep[j])
- {
- b = 1;
- break;
- }
- if (!b)
- break;
- }
+ for (i = 0; s[i]; i++) {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j]) {
+ b = 1;
+ break;
+ }
+ if (!b)
+ break;
+ }
+
+ if (!s[i])
+ return NULL;
- return &s[i];
+ *len = 0;
+ *out = &s[i];
+
+ for (; s[i]; i++, (*len)++) {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j]) {
+ b = 1;
+ break;
+ }
+ if (b)
+ break;
+ }
+
+ for (; s[i]; i++) {
+ int b = 0;
+ for (j = 0; sep[j]; j++)
+ if (s[i] == sep[j]) {
+ b = 1;
+ break;
+ }
+ if (!b)
+ break;
+ }
+
+ return &s[i];
}
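__itt_fsplit is a strtok-like scanner that never modifies its input: it skips leading separators, reports the token through (*out, *len), and returns a cursor positioned at the next token (or NULL when the string is exhausted). A hedged usage sketch, valid only inside this translation unit since the function is static:

    #include <stdio.h>

    static void list_tokens(void) {
      const char *cursor = "control,thread; mark";
      const char *tok;
      int len;
      /* Each iteration yields one token as a (pointer, length) pair. */
      while ((cursor = __itt_fsplit(cursor, ",; ", &tok, &len)) != NULL)
        printf("token: %.*s\n", len, tok);
    }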
/* This function returns the value of an env variable placed into a static buffer.
@@ -765,238 +814,213 @@ static const char* __itt_fsplit(const char* s, const char* sep, const char** out
 * This was done to avoid dynamic allocation for a few calls.
* Actually we need this function only four times.
*/
-static const char* __itt_get_env_var(const char* name)
-{
+static const char *__itt_get_env_var(const char *name) {
#define MAX_ENV_VALUE_SIZE 4086
- static char env_buff[MAX_ENV_VALUE_SIZE];
- static char* env_value = (char*)env_buff;
-
- if (name != NULL)
- {
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
- DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len);
- if (rc >= max_len)
- __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1));
- else if (rc > 0)
- {
- const char* ret = (const char*)env_value;
- env_value += rc + 1;
- return ret;
- }
- else
- {
- /* If environment variable is empty, GetEnvironmentVariables()
- * returns zero (number of characters (not including terminating null),
- * and GetLastError() returns ERROR_SUCCESS. */
- DWORD err = GetLastError();
- if (err == ERROR_SUCCESS)
- return env_value;
-
- if (err != ERROR_ENVVAR_NOT_FOUND)
- __itt_report_error(__itt_error_cant_read_env, name, (int)err);
- }
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
- char* env = getenv(name);
- if (env != NULL)
- {
- size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE);
- size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
- if (len < max_len)
- {
- const char* ret = (const char*)env_value;
- __itt_fstrcpyn(env_value, max_len, env, len + 1);
- env_value += len + 1;
- return ret;
- } else
- __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1));
- }
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ static char env_buff[MAX_ENV_VALUE_SIZE];
+ static char *env_value = (char *)env_buff;
+
+ if (name != NULL) {
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+ DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len);
+ if (rc >= max_len)
+ __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1,
+ (size_t)(max_len - 1));
+ else if (rc > 0) {
+ const char *ret = (const char *)env_value;
+ env_value += rc + 1;
+ return ret;
+ } else {
+      /* If the environment variable is empty, GetEnvironmentVariableA()
+       * returns zero (the number of characters, not including the
+       * terminating null) and GetLastError() returns ERROR_SUCCESS. */
+ DWORD err = GetLastError();
+ if (err == ERROR_SUCCESS)
+ return env_value;
+
+ if (err != ERROR_ENVVAR_NOT_FOUND)
+ __itt_report_error(__itt_error_cant_read_env, name, (int)err);
}
- return NULL;
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ char *env = getenv(name);
+ if (env != NULL) {
+ size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE);
+ size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+ if (len < max_len) {
+ const char *ret = (const char *)env_value;
+ __itt_fstrcpyn(env_value, max_len, env, len + 1);
+ env_value += len + 1;
+ return ret;
+ } else
+ __itt_report_error(__itt_error_env_too_long, name, (size_t)len,
+ (size_t)(max_len - 1));
+ }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+ return NULL;
}
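As the comment above says, returned values are carved out of one static 4086-byte buffer: each successful lookup copies the value and bumps env_value past it, so results stay valid for the life of the process but the space is never reclaimed, which is acceptable only because the function runs a handful of times. The same bump-arena idea in isolation, as a hypothetical helper:

    #include <string.h>

    #define ARENA_SIZE 4086

    /* Hand out copies from a fixed static arena; nothing is ever freed,
     * so this is safe only for a small, bounded number of calls. */
    static const char *arena_strdup(const char *s) {
      static char arena[ARENA_SIZE];
      static size_t used = 0;
      size_t len = strlen(s) + 1;
      if (len > ARENA_SIZE - used)
        return NULL; /* exhausted: the real code reports env_too_long */
      char *ret = arena + used;
      memcpy(ret, s, len);
      used += len;
      return ret;
    }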
-static const char* __itt_get_lib_name(void)
-{
- const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+static const char *__itt_get_lib_name(void) {
+ const char *lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
#ifdef __ANDROID__
- if (lib_name == NULL)
- {
+ if (lib_name == NULL) {
-#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
- const char* const marker_filename = "com.intel.itt.collector_lib_32";
+#if ITT_ARCH == ITT_ARCH_IA32 || ITT_ARCH == ITT_ARCH_ARM
+ const char *const marker_filename = "com.intel.itt.collector_lib_32";
#else
- const char* const marker_filename = "com.intel.itt.collector_lib_64";
+ const char *const marker_filename = "com.intel.itt.collector_lib_64";
#endif
- char system_wide_marker_filename[PATH_MAX] = {0};
- int itt_marker_file_fd = -1;
- ssize_t res = 0;
-
- res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename);
- if (res < 0)
- {
- ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
- return lib_name;
- }
- itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY);
-
- if (itt_marker_file_fd == -1)
- {
- const pid_t my_pid = getpid();
- char cmdline_path[PATH_MAX] = {0};
- char package_name[PATH_MAX] = {0};
- char app_sandbox_file[PATH_MAX] = {0};
- int cmdline_fd = 0;
-
- ITT_ANDROID_LOGI("Unable to open system-wide marker file.");
- res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid);
- if (res < 0)
- {
- ITT_ANDROID_LOGE("Unable to get cmdline path string.");
- return lib_name;
- }
+ char system_wide_marker_filename[PATH_MAX] = {0};
+ int itt_marker_file_fd = -1;
+ ssize_t res = 0;
- ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path);
- cmdline_fd = open(cmdline_path, O_RDONLY);
- if (cmdline_fd == -1)
- {
- ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path);
- return lib_name;
- }
- res = read(cmdline_fd, package_name, PATH_MAX - 1);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path);
- res = close(cmdline_fd);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
- }
- return lib_name;
- }
- res = close(cmdline_fd);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
- return lib_name;
- }
- ITT_ANDROID_LOGI("Package name: %s\n", package_name);
- res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename);
- if (res < 0)
- {
- ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
- return lib_name;
- }
-
- ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file);
- itt_marker_file_fd = open(app_sandbox_file, O_RDONLY);
- if (itt_marker_file_fd == -1)
- {
- ITT_ANDROID_LOGE("Unable to open app marker file!");
- return lib_name;
- }
+ res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s",
+ "/data/local/tmp/", marker_filename);
+ if (res < 0) {
+ ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+ return lib_name;
+ }
+ itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY);
+
+ if (itt_marker_file_fd == -1) {
+ const pid_t my_pid = getpid();
+ char cmdline_path[PATH_MAX] = {0};
+ char package_name[PATH_MAX] = {0};
+ char app_sandbox_file[PATH_MAX] = {0};
+ int cmdline_fd = 0;
+
+ ITT_ANDROID_LOGI("Unable to open system-wide marker file.");
+ res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid);
+ if (res < 0) {
+ ITT_ANDROID_LOGE("Unable to get cmdline path string.");
+ return lib_name;
+ }
+
+ ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path);
+ cmdline_fd = open(cmdline_path, O_RDONLY);
+ if (cmdline_fd == -1) {
+ ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path);
+ return lib_name;
+ }
+ res = read(cmdline_fd, package_name, PATH_MAX - 1);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path);
+ res = close(cmdline_fd);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
}
+ return lib_name;
+ }
+ res = close(cmdline_fd);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("Package name: %s\n", package_name);
+ res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s",
+ package_name, marker_filename);
+ if (res < 0) {
+ ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+ return lib_name;
+ }
+
+ ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file);
+ itt_marker_file_fd = open(app_sandbox_file, O_RDONLY);
+ if (itt_marker_file_fd == -1) {
+ ITT_ANDROID_LOGE("Unable to open app marker file!");
+ return lib_name;
+ }
+ }
- {
- char itt_lib_name[PATH_MAX] = {0};
-
- res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd);
- res = close(itt_marker_file_fd);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
- }
- return lib_name;
- }
- ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name);
- res = close(itt_marker_file_fd);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
- return lib_name;
- }
- ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name);
- res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0);
- if (res == -1)
- {
- ITT_ANDROID_LOGE("Unable to set env var!");
- return lib_name;
- }
- lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
- ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name);
+ {
+ char itt_lib_name[PATH_MAX] = {0};
+
+ res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd);
+ res = close(itt_marker_file_fd);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
}
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name);
+ res = close(itt_marker_file_fd);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
+ return lib_name;
+ }
+ ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME),
+ itt_lib_name);
+ res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0);
+ if (res == -1) {
+ ITT_ANDROID_LOGE("Unable to set env var!");
+ return lib_name;
+ }
+ lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+ ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name);
}
+ }
#endif
- return lib_name;
+ return lib_name;
}
/* Avoid clashes with std::min, reported by tbb team */
-#define __itt_min(a,b) (a) < (b) ? (a) : (b)
-
-static __itt_group_id __itt_get_groups(void)
-{
- int i;
- __itt_group_id res = __itt_group_none;
- const char* var_name = "INTEL_ITTNOTIFY_GROUPS";
- const char* group_str = __itt_get_env_var(var_name);
-
- if (group_str != NULL)
- {
- int len;
- char gr[255];
- const char* chunk;
- while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL)
- {
- int min_len = __itt_min(len, (int)(sizeof(gr) - 1));
- __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len);
- gr[min_len] = 0;
-
- for (i = 0; group_list[i].name != NULL; i++)
- {
- if (!__itt_fstrcmp(gr, group_list[i].name))
- {
- res = (__itt_group_id)(res | group_list[i].id);
- break;
- }
- }
+#define __itt_min(a, b) (a) < (b) ? (a) : (b)
+
+static __itt_group_id __itt_get_groups(void) {
+ int i;
+ __itt_group_id res = __itt_group_none;
+ const char *var_name = "INTEL_ITTNOTIFY_GROUPS";
+ const char *group_str = __itt_get_env_var(var_name);
+
+ if (group_str != NULL) {
+ int len;
+ char gr[255];
+ const char *chunk;
+ while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL) {
+ int min_len = __itt_min(len, (int)(sizeof(gr) - 1));
+ __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len);
+ gr[min_len] = 0;
+
+ for (i = 0; group_list[i].name != NULL; i++) {
+ if (!__itt_fstrcmp(gr, group_list[i].name)) {
+ res = (__itt_group_id)(res | group_list[i].id);
+ break;
}
- /* TODO: !!! Workaround for bug with warning for unknown group !!!
- * Should be fixed in new initialization scheme.
- * Now the following groups should be set always. */
- for (i = 0; group_list[i].id != __itt_group_none; i++)
- if (group_list[i].id != __itt_group_all &&
- group_list[i].id > __itt_group_splitter_min &&
- group_list[i].id < __itt_group_splitter_max)
- res = (__itt_group_id)(res | group_list[i].id);
- return res;
+ }
}
- else
- {
- for (i = 0; group_alias[i].env_var != NULL; i++)
- if (__itt_get_env_var(group_alias[i].env_var) != NULL)
- return group_alias[i].groups;
- }
-
+ /* TODO: !!! Workaround for bug with warning for unknown group !!!
+ * Should be fixed in new initialization scheme.
+   * For now, the following groups should always be set. */
+ for (i = 0; group_list[i].id != __itt_group_none; i++)
+ if (group_list[i].id != __itt_group_all &&
+ group_list[i].id > __itt_group_splitter_min &&
+ group_list[i].id < __itt_group_splitter_max)
+ res = (__itt_group_id)(res | group_list[i].id);
return res;
+ } else {
+ for (i = 0; group_alias[i].env_var != NULL; i++)
+ if (__itt_get_env_var(group_alias[i].env_var) != NULL)
+ return group_alias[i].groups;
+ }
+
+ return res;
}
#undef __itt_min
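__itt_get_groups() ORs together one group bit per token of INTEL_ITTNOTIFY_GROUPS, then unconditionally adds the splitter-range groups mentioned in the TODO; when the variable is unset it falls back to the group_alias table (KMP_FOR_TPROFILE / KMP_FOR_TCHECK). A hedged driver, POSIX-only, necessarily in this translation unit because the helpers are static, and assuming "sync" and "mark" appear in group_list:

    #include <stdlib.h>

    static void demo_groups(void) {
      /* With this setting, the result should include __itt_group_sync and
       * __itt_group_mark (plus the always-on splitter-range groups). */
      setenv("INTEL_ITTNOTIFY_GROUPS", "sync,mark", 1);
      __itt_group_id g = __itt_get_groups();
      (void)g;
    }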
-static int __itt_lib_version(lib_t lib)
-{
- if (lib == NULL)
- return 0;
- if (__itt_get_proc(lib, "__itt_api_init"))
- return 2;
- if (__itt_get_proc(lib, "__itt_api_version"))
- return 1;
+static int __itt_lib_version(lib_t lib) {
+ if (lib == NULL)
return 0;
+ if (__itt_get_proc(lib, "__itt_api_init"))
+ return 2;
+ if (__itt_get_proc(lib, "__itt_api_version"))
+ return 1;
+ return 0;
}
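__itt_lib_version() picks the binding strategy by probing exports: a collector exporting __itt_api_init (version 2) initializes the pointer table itself, one exporting only __itt_api_version (version 1) has its symbols looked up by name, and anything else is treated as a legacy collector. Roughly the same probe with raw dlsym(), as a sketch (__itt_get_proc wraps dlsym()/GetProcAddress()):

    #include <dlfcn.h>
    #include <stddef.h>

    static int probe_version(void *lib) {
      if (lib == NULL)
        return 0;
      if (dlsym(lib, "__itt_api_init"))
        return 2; /* collector binds the table itself */
      if (dlsym(lib, "__itt_api_version"))
        return 1; /* host fills pointers one by one */
      return 0;   /* legacy collector */
    }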
/* It's not used right now! Comment it out to avoid warnings.
@@ -1005,197 +1029,206 @@ static void __itt_reinit_all_pointers(void)
int i;
// Fill all pointers with initial stubs
for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func;
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+_N_(_ittapi_global).api_list_ptr[i].init_func;
}
*/
-static void __itt_nullify_all_pointers(void)
-{
- int i;
- /* Nullify all pointers except domain_create, string_handle_create and counter_create */
- for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+static void __itt_nullify_all_pointers(void) {
+ int i;
+ /* Nullify all pointers except domain_create, string_handle_create and
+ * counter_create */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+ _N_(_ittapi_global).api_list_ptr[i].null_func;
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+// clang-format off
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(push)
-#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
-#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function
+ pointer 'XXX' to data pointer 'void *' */
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer
+ 'void *' to function pointer 'XXX' */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+// clang-format on
-ITT_EXTERN_C void _N_(fini_ittlib)(void)
-{
- __itt_api_fini_t* __itt_api_fini_ptr = NULL;
- static volatile TIDT current_thread = 0;
+ITT_EXTERN_C void _N_(fini_ittlib)(void) {
+ __itt_api_fini_t *__itt_api_fini_ptr = NULL;
+ static volatile TIDT current_thread = 0;
- if (_N_(_ittapi_global).api_initialized)
- {
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
- if (_N_(_ittapi_global).api_initialized)
- {
- if (current_thread == 0)
- {
- if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
- if (_N_(_ittapi_global).lib != NULL)
- {
- __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini");
- }
- if (__itt_api_fini_ptr)
- {
- __itt_api_fini_ptr(&_N_(_ittapi_global));
- }
-
- __itt_nullify_all_pointers();
-
- /* TODO: !!! not safe !!! don't support unload so far.
- * if (_N_(_ittapi_global).lib != NULL)
- * __itt_unload_lib(_N_(_ittapi_global).lib);
- * _N_(_ittapi_global).lib = NULL;
- */
- _N_(_ittapi_global).api_initialized = 0;
- current_thread = 0;
- }
+ if (_N_(_ittapi_global).api_initialized) {
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ if (_N_(_ittapi_global).api_initialized) {
+ if (current_thread == 0) {
+ if (PTHREAD_SYMBOLS)
+ current_thread = __itt_thread_id();
+ if (_N_(_ittapi_global).lib != NULL) {
+ __itt_api_fini_ptr = (__itt_api_fini_t *)(size_t)__itt_get_proc(
+ _N_(_ittapi_global).lib, "__itt_api_fini");
}
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ if (__itt_api_fini_ptr) {
+ __itt_api_fini_ptr(&_N_(_ittapi_global));
+ }
+
+ __itt_nullify_all_pointers();
+
+ /* TODO: !!! not safe !!! don't support unload so far.
+ * if (_N_(_ittapi_global).lib != NULL)
+ * __itt_unload_lib(_N_(_ittapi_global).lib);
+ * _N_(_ittapi_global).lib = NULL;
+ */
+ _N_(_ittapi_global).api_initialized = 0;
+ current_thread = 0;
+ }
}
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ }
}
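Both fini_ittlib above and init_ittlib below use the same check-lock-recheck shape, plus a current_thread guard against re-entry. The locking skeleton in isolation, with a hypothetical flag and a plain pthread mutex (the unlocked pre-check is only an optimization; the recheck under the lock is what makes the body safe):

    #include <pthread.h>

    static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
    static volatile int g_initialized = 0;

    static void shutdown_once(void) {
      if (g_initialized) {            /* cheap unlocked pre-check */
        pthread_mutex_lock(&g_mutex);
        if (g_initialized) {          /* recheck under the lock */
          /* ... release resources here ... */
          g_initialized = 0;
        }
        pthread_mutex_unlock(&g_mutex);
      }
    }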
-ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups)
-{
- int i;
- __itt_group_id groups;
+ITT_EXTERN_C int _N_(init_ittlib)(const char *lib_name,
+ __itt_group_id init_groups) {
+ int i;
+ __itt_group_id groups;
#ifdef ITT_COMPLETE_GROUP
- __itt_group_id zero_group = __itt_group_none;
+ __itt_group_id zero_group = __itt_group_none;
#endif /* ITT_COMPLETE_GROUP */
- static volatile TIDT current_thread = 0;
+ static volatile TIDT current_thread = 0;
- if (!_N_(_ittapi_global).api_initialized)
- {
+ if (!_N_(_ittapi_global).api_initialized) {
#ifndef ITT_SIMPLE_INIT
- ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
#endif /* ITT_SIMPLE_INIT */
- if (!_N_(_ittapi_global).api_initialized)
- {
- if (current_thread == 0)
- {
- if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
- if (lib_name == NULL)
- {
- lib_name = __itt_get_lib_name();
- }
- groups = __itt_get_groups();
- if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL))
- {
- _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? ittnotify_lib_name : lib_name);
-
- if (_N_(_ittapi_global).lib != NULL)
- {
- __itt_api_init_t* __itt_api_init_ptr;
- int lib_version = __itt_lib_version(_N_(_ittapi_global).lib);
-
- switch (lib_version) {
- case 0:
- groups = __itt_group_legacy;
- KMP_FALLTHROUGH();
- case 1:
- /* Fill all pointers from dynamic library */
- for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
- {
- if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups)
- {
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name);
- if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL)
- {
- /* Restore pointers for function with static implementation */
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
- __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name);
+ if (!_N_(_ittapi_global).api_initialized) {
+ if (current_thread == 0) {
+ if (PTHREAD_SYMBOLS)
+ current_thread = __itt_thread_id();
+ if (lib_name == NULL) {
+ lib_name = __itt_get_lib_name();
+ }
+ groups = __itt_get_groups();
+ if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL)) {
+ _N_(_ittapi_global).lib = __itt_load_lib(
+ (lib_name == NULL) ? ittnotify_lib_name : lib_name);
+
+ if (_N_(_ittapi_global).lib != NULL) {
+ __itt_api_init_t *__itt_api_init_ptr;
+ int lib_version = __itt_lib_version(_N_(_ittapi_global).lib);
+
+ switch (lib_version) {
+ case 0:
+ groups = __itt_group_legacy;
+ KMP_FALLTHROUGH();
+ case 1:
+ /* Fill all pointers from dynamic library */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL;
+ i++) {
+ if (_N_(_ittapi_global).api_list_ptr[i].group & groups &
+ init_groups) {
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+ (void *)__itt_get_proc(
+ _N_(_ittapi_global).lib,
+ _N_(_ittapi_global).api_list_ptr[i].name);
+ if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL) {
+ /* Restore pointers for function with static implementation
+ */
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+ _N_(_ittapi_global).api_list_ptr[i].null_func;
+ __itt_report_error(
+ __itt_error_no_symbol, lib_name,
+ _N_(_ittapi_global).api_list_ptr[i].name);
#ifdef ITT_COMPLETE_GROUP
- zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group);
+ zero_group = (__itt_group_id)(
+ zero_group | _N_(_ittapi_global).api_list_ptr[i].group);
#endif /* ITT_COMPLETE_GROUP */
- }
- }
- else
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
- }
-
- if (groups == __itt_group_legacy)
- {
- /* Compatibility with legacy tools */
- ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore);
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA);
- ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW);
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
- ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name);
+ }
+ } else
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+ _N_(_ittapi_global).api_list_ptr[i].null_func;
+ }
+
+ if (groups == __itt_group_legacy) {
+ /* Compatibility with legacy tools */
+ ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA);
+ ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW);
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
- ITTNOTIFY_NAME(sync_prepare) = ITTNOTIFY_NAME(notify_sync_prepare);
- ITTNOTIFY_NAME(sync_cancel) = ITTNOTIFY_NAME(notify_sync_cancel);
- ITTNOTIFY_NAME(sync_acquired) = ITTNOTIFY_NAME(notify_sync_acquired);
- ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing);
- }
+ ITTNOTIFY_NAME(sync_prepare) =
+ ITTNOTIFY_NAME(notify_sync_prepare);
+ ITTNOTIFY_NAME(sync_cancel) =
+ ITTNOTIFY_NAME(notify_sync_cancel);
+ ITTNOTIFY_NAME(sync_acquired) =
+ ITTNOTIFY_NAME(notify_sync_acquired);
+ ITTNOTIFY_NAME(sync_releasing) =
+ ITTNOTIFY_NAME(notify_sync_releasing);
+ }
#ifdef ITT_COMPLETE_GROUP
- for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
- if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group)
- *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+ if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group)
+ *_N_(_ittapi_global).api_list_ptr[i].func_ptr =
+ _N_(_ittapi_global).api_list_ptr[i].null_func;
#endif /* ITT_COMPLETE_GROUP */
- break;
- case 2:
- __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init");
- if (__itt_api_init_ptr)
- __itt_api_init_ptr(&_N_(_ittapi_global), init_groups);
- break;
- }
- }
- else
- {
- __itt_nullify_all_pointers();
-
- __itt_report_error(__itt_error_no_module, lib_name,
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
- __itt_system_error()
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
- dlerror()
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
- );
- }
- }
- else
- {
- __itt_nullify_all_pointers();
- }
- _N_(_ittapi_global).api_initialized = 1;
- current_thread = 0;
- /* !!! Just to avoid unused code elimination !!! */
- if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0;
+ break;
+ case 2:
+ __itt_api_init_ptr = (__itt_api_init_t *)(size_t)__itt_get_proc(
+ _N_(_ittapi_global).lib, "__itt_api_init");
+ if (__itt_api_init_ptr)
+ __itt_api_init_ptr(&_N_(_ittapi_global), init_groups);
+ break;
}
+ } else {
+ __itt_nullify_all_pointers();
+
+ __itt_report_error(__itt_error_no_module, lib_name,
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ __itt_system_error()
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ dlerror()
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ );
+ }
+ } else {
+ __itt_nullify_all_pointers();
}
+ _N_(_ittapi_global).api_initialized = 1;
+ current_thread = 0;
+ /* !!! Just to avoid unused code elimination !!! */
+ if (__itt_fini_ittlib_ptr == _N_(fini_ittlib))
+ current_thread = 0;
+ }
+ }
#ifndef ITT_SIMPLE_INIT
- if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ if (PTHREAD_SYMBOLS)
+ __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#endif /* ITT_SIMPLE_INIT */
+ }
+
+  /* Check whether any function pointer is non-empty and in init_groups */
+ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) {
+ if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr !=
+ _N_(_ittapi_global).api_list_ptr[i].null_func &&
+ _N_(_ittapi_global).api_list_ptr[i].group & init_groups) {
+ return 1;
}
-
- /* Evaluating if any function ptr is non empty and it's in init_groups */
- for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
- {
- if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func &&
- _N_(_ittapi_global).api_list_ptr[i].group & init_groups)
- {
- return 1;
- }
- }
- return 0;
+ }
+ return 0;
}
-ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler)
-{
- __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
- _N_(_ittapi_global).error_handler = (void*)(size_t)handler;
- return prev;
+ITT_EXTERN_C __itt_error_handler_t *
+_N_(set_error_handler)(__itt_error_handler_t *handler) {
+ __itt_error_handler_t *prev =
+ (__itt_error_handler_t *)(size_t)_N_(_ittapi_global).error_handler;
+ _N_(_ittapi_global).error_handler = (void *)(size_t)handler;
+ return prev;
}
-#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#if ITT_PLATFORM == ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
#pragma warning(pop)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
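
For context on the hunk above: init_ittlib resolves every entry of the
api_list table from a collector library loaded at run time, and falls back
to the entry's static no-op stub whenever the library or one of its symbols
is missing. Below is a minimal, self-contained sketch of that dispatch
pattern, using hypothetical names and a hypothetical library (not the real
ittnotify table); build with "cc demo.c -ldl".

#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

/* No-op fallbacks used when the collector library or a symbol is absent. */
static void pause_null(void) {}
static void resume_null(void) {}

/* Live dispatch pointers; they start out pointing at the stubs. */
static void (*pause_ptr)(void) = pause_null;
static void (*resume_ptr)(void) = resume_null;

/* One slot per API entry: symbol name, live pointer, no-op fallback. */
typedef struct {
  const char *name;
  void (**func_ptr)(void);
  void (*null_func)(void);
} api_entry;

static api_entry api_list[] = {
    {"__example_pause", &pause_ptr, pause_null}, /* hypothetical symbols */
    {"__example_resume", &resume_ptr, resume_null},
    {NULL, NULL, NULL},
};

/* Resolve every entry from lib_name; restore the stub on any failure,
   like the "Restore pointers ..." branch in the hunk above. */
static void load_api(const char *lib_name) {
  void *lib = dlopen(lib_name, RTLD_LAZY);
  for (size_t i = 0; api_list[i].name != NULL; i++) {
    void *sym = lib ? dlsym(lib, api_list[i].name) : NULL;
    *api_list[i].func_ptr =
        sym ? (void (*)(void))(size_t)sym : api_list[i].null_func;
  }
  if (lib == NULL)
    fprintf(stderr, "no collector loaded; using no-op stubs\n");
}

int main(void) {
  load_api("libexample_collector.so"); /* hypothetical library name */
  pause_ptr(); /* dispatches to the real symbol or to pause_null */
  resume_ptr();
  return 0;
}

The (size_t) round-trip in the cast mirrors the idiom the file itself uses
to turn the object pointer returned by the symbol lookup into a function
pointer without a direct object-to-function conversion.
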
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h
index a2022263003d..cb884a8b3a1c 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h
@@ -7,335 +7,752 @@
//
//===----------------------------------------------------------------------===//
-
#include "ittnotify_config.h"
#ifndef ITT_FORMAT_DEFINED
-# ifndef ITT_FORMAT
-# define ITT_FORMAT
-# endif /* ITT_FORMAT */
-# ifndef ITT_NO_PARAMS
-# define ITT_NO_PARAMS
-# endif /* ITT_NO_PARAMS */
+#ifndef ITT_FORMAT
+#define ITT_FORMAT
+#endif /* ITT_FORMAT */
+#ifndef ITT_NO_PARAMS
+#define ITT_NO_PARAMS
+#endif /* ITT_NO_PARAMS */
#endif /* ITT_FORMAT_DEFINED */
/*
* parameters for macro expected:
- * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group, printf_fmt)
+ * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group,
+ * printf_fmt)
*/
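
The ITT_STUB/ITT_STUBV lines that make up the rest of this header form an
X-macro list: each including translation unit defines the two macros to
whatever expansion it needs (pointer declarations, no-op stub bodies, or the
rows of the api_list table that init_ittlib walks), includes this header,
then redefines them for the next pass. A minimal sketch of the technique,
with hypothetical names rather than the real ittnotify macros:

#include <stdio.h>

/* The list: one X(...) invocation per API entry. */
#define EXAMPLE_API_LIST                                                       \
  X(pause, "no args")                                                          \
  X(resume, "no args")

/* First expansion: generate a no-op stub body per entry. */
#define X(name, fmt) static void name##_stub(void) { puts(#name ": " fmt); }
EXAMPLE_API_LIST
#undef X

int main(void) {
/* Second expansion of the same list: call every generated stub. */
#define X(name, fmt) name##_stub();
  EXAMPLE_API_LIST
#undef X
  return 0;
}

ittnotify_static.cpp relies on this trick, including the header repeatedly
under the __ITT_INTERNAL_INIT and __ITT_INTERNAL_BODY guards seen below with
different ITT_STUB definitions.
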
#ifdef __ITT_INTERNAL_INIT
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name), (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"")
-ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name), (ITT_FORMAT name), domain_create, __itt_group_structure, "\"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain *, domain_createA, (const char *name),
+ (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_domain *, domain_createW, (const wchar_t *name),
+ (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain *, domain_create, (const char *name),
+ (ITT_FORMAT name), domain_create, __itt_group_structure, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"")
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, __itt_group_structure, "\"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name), (ITT_FORMAT name), string_handle_create, __itt_group_structure, "\"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_createA,
+ (const char *name), (ITT_FORMAT name), string_handle_createA,
+ __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_createW,
+ (const wchar_t *name), (ITT_FORMAT name), string_handle_createW,
+ __itt_group_structure, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle *, string_handle_create,
+ (const char *name), (ITT_FORMAT name), string_handle_create,
+ __itt_group_structure, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_createA, __itt_group_counter, "\"%s\", \"%s\"")
-ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), counter_createW, __itt_group_counter, "\"%s\", \"%s\"")
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_create, __itt_group_counter, "\"%s\", \"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA,
+ (const char *name, const char *domain), (ITT_FORMAT name, domain),
+ counter_createA, __itt_group_counter, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_counter, counter_createW,
+ (const wchar_t *name, const wchar_t *domain),
+ (ITT_FORMAT name, domain), counter_createW, __itt_group_counter,
+ "\"%s\", \"%s\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,
+ (const char *name, const char *domain), (ITT_FORMAT name, domain),
+ counter_create, __itt_group_counter, "\"%s\", \"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedA, __itt_group_counter, "\"%s\", \"%s\", %d")
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedW, __itt_group_counter, "\"%s\", \"%s\", %d")
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typed, __itt_group_counter, "\"%s\", \"%s\", %d")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA,
+ (const char *name, const char *domain, __itt_metadata_type type),
+ (ITT_FORMAT name, domain, type), counter_create_typedA,
+ __itt_group_counter, "\"%s\", \"%s\", %d")
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW,
+ (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type),
+ (ITT_FORMAT name, domain, type), counter_create_typedW,
+         __itt_group_counter, "\"%S\", \"%S\", %d")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed,
+ (const char *name, const char *domain, __itt_metadata_type type),
+ (ITT_FORMAT name, domain, type), counter_create_typed,
+ __itt_group_counter, "\"%s\", \"%s\", %d")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-ITT_STUBV(ITTAPI, void, pause, (void), (ITT_NO_PARAMS), pause, __itt_group_control | __itt_group_legacy, "no args")
-ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args")
-
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
-ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name), (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name), (ITT_FORMAT name), thread_set_name, __itt_group_thread, "\"%s\"")
+ITT_STUBV(ITTAPI, void, pause, (void), (ITT_NO_PARAMS), pause,
+ __itt_group_control | __itt_group_legacy, "no args")
+ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume,
+ __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name), (ITT_FORMAT name),
+ thread_set_nameA, __itt_group_thread, "\"%s\"")
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name),
+ (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name), (ITT_FORMAT name),
+ thread_set_name, __itt_group_thread, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore, __itt_group_thread, "no args")
-
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setA, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
-ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setW, __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore,
+ __itt_group_thread, "no args")
+
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen),
+ (ITT_FORMAT name, namelen), thr_name_setA,
+ __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen),
+ (ITT_FORMAT name, namelen), thr_name_setW,
+ __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen),
+ (ITT_FORMAT name, namelen), thr_name_set,
+ __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(LIBITTAPI, void, thr_ignore, (void), (ITT_NO_PARAMS), thr_ignore, __itt_group_thread | __itt_group_legacy, "no args")
+ITT_STUBV(LIBITTAPI, void, thr_ignore, (void), (ITT_NO_PARAMS), thr_ignore,
+ __itt_group_thread | __itt_group_legacy, "no args")
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args")
-
-#else /* __ITT_INTERNAL_INIT */
-
-ITT_STUBV(ITTAPI, void, detach, (void), (ITT_NO_PARAMS), detach, __itt_group_control | __itt_group_legacy, "no args")
-
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
-ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x")
-ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_renameA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
-ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name), (ITT_FORMAT addr, name), sync_renameW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_create, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
-ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_rename, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach,
+ __itt_group_all, "no args")
+
+#else /* __ITT_INTERNAL_INIT */
+
+ITT_STUBV(ITTAPI, void, detach, (void), (ITT_NO_PARAMS), detach,
+ __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA,
+ (void *addr, const char *objtype, const char *objname, int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_createA,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_createW,
+ (void *addr, const wchar_t *objtype, const wchar_t *objname,
+ int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_createW,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x")
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name),
+ (ITT_FORMAT addr, name), sync_renameA,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name),
+ (ITT_FORMAT addr, name), sync_renameW,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,
+ (void *addr, const char *objtype, const char *objname, int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_create,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name),
+ (ITT_FORMAT addr, name), sync_rename,
+ __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr), (ITT_FORMAT addr), sync_destroy, __itt_group_sync | __itt_group_fsync, "%p")
-
-ITT_STUBV(ITTAPI, void, sync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_sync, "%p")
-ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_sync, "%p")
-ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_sync, "%p")
-ITT_STUBV(ITTAPI, void, sync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_sync, "%p")
-
-ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask), (ITT_FORMAT mask), suppress_push, __itt_group_suppress, "%p")
-ITT_STUBV(ITTAPI, void, suppress_pop, (void), (ITT_NO_PARAMS), suppress_pop, __itt_group_suppress, "no args")
-ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_mark_range, __itt_group_suppress, "%d, %p, %p, %d")
-ITT_STUBV(ITTAPI, void, suppress_clear_range,(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_clear_range,__itt_group_suppress, "%d, %p, %p, %d")
-
-ITT_STUBV(ITTAPI, void, fsync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_fsync, "%p")
-ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_fsync, "%p")
-ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_fsync, "%p")
-ITT_STUBV(ITTAPI, void, fsync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_fsync, "%p")
-
-ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name), (ITT_FORMAT site, instance, name), model_site_begin, __itt_group_model, "%p, %p, \"%s\"")
-ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance), (ITT_FORMAT site, instance), model_site_end, __itt_group_model, "%p, %p")
-ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name), (ITT_FORMAT task, instance, name), model_task_begin, __itt_group_model, "%p, %p, \"%s\"")
-ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance), (ITT_FORMAT task, instance), model_task_end, __itt_group_model, "%p, %p")
-ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock), (ITT_FORMAT lock), model_lock_acquire, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock), (ITT_FORMAT lock), model_lock_release, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size), (ITT_FORMAT addr, size), model_record_allocation, __itt_group_model, "%p, %d")
-ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr), (ITT_FORMAT addr), model_record_deallocation, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_induction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_induction_uses, __itt_group_model, "%p, %d")
-ITT_STUBV(ITTAPI, void, model_reduction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_reduction_uses, __itt_group_model, "%p, %d")
-ITT_STUBV(ITTAPI, void, model_observe_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_observe_uses, __itt_group_model, "%p, %d")
-ITT_STUBV(ITTAPI, void, model_clear_uses, (void* addr), (ITT_FORMAT addr), model_clear_uses, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr), (ITT_FORMAT addr),
+ sync_destroy, __itt_group_sync | __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr), (ITT_FORMAT addr),
+ sync_prepare, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr), (ITT_FORMAT addr),
+ sync_cancel, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr), (ITT_FORMAT addr),
+ sync_acquired, __itt_group_sync, "%p")
+ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr), (ITT_FORMAT addr),
+ sync_releasing, __itt_group_sync, "%p")
+
+ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask), (ITT_FORMAT mask),
+ suppress_push, __itt_group_suppress, "%p")
+ITT_STUBV(ITTAPI, void, suppress_pop, (void), (ITT_NO_PARAMS), suppress_pop,
+ __itt_group_suppress, "no args")
+ITT_STUBV(ITTAPI, void, suppress_mark_range,
+ (__itt_suppress_mode_t mode, unsigned int mask, void *address,
+ size_t size),
+ (ITT_FORMAT mode, mask, address, size), suppress_mark_range,
+ __itt_group_suppress, "%d, %p, %p, %d")
+ITT_STUBV(ITTAPI, void, suppress_clear_range,
+ (__itt_suppress_mode_t mode, unsigned int mask, void *address,
+ size_t size),
+ (ITT_FORMAT mode, mask, address, size), suppress_clear_range,
+ __itt_group_suppress, "%d, %p, %p, %d")
+
+ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr), (ITT_FORMAT addr),
+ sync_prepare, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr), (ITT_FORMAT addr),
+ sync_cancel, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr), (ITT_FORMAT addr),
+ sync_acquired, __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr), (ITT_FORMAT addr),
+ sync_releasing, __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, model_site_begin,
+ (__itt_model_site * site, __itt_model_site_instance *instance,
+ const char *name),
+ (ITT_FORMAT site, instance, name), model_site_begin,
+ __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_end,
+ (__itt_model_site * site, __itt_model_site_instance *instance),
+ (ITT_FORMAT site, instance), model_site_end, __itt_group_model,
+ "%p, %p")
+ITT_STUBV(ITTAPI, void, model_task_begin,
+ (__itt_model_task * task, __itt_model_task_instance *instance,
+ const char *name),
+ (ITT_FORMAT task, instance, name), model_task_begin,
+ __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_end,
+ (__itt_model_task * task, __itt_model_task_instance *instance),
+ (ITT_FORMAT task, instance), model_task_end, __itt_group_model,
+ "%p, %p")
+ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock), (ITT_FORMAT lock),
+ model_lock_acquire, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock), (ITT_FORMAT lock),
+ model_lock_release, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), model_record_allocation, __itt_group_model,
+ "%p, %d")
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr),
+ (ITT_FORMAT addr), model_record_deallocation, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), model_induction_uses, __itt_group_model,
+ "%p, %d")
+ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), model_reduction_uses, __itt_group_model,
+ "%p, %d")
+ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), model_observe_uses, __itt_group_model,
+ "%p, %d")
+ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr), (ITT_FORMAT addr),
+ model_clear_uses, __itt_group_model, "%p")
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name), (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%s\"")
-ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name), (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%s\"")
-ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name), (ITT_FORMAT name), model_iteration_taskW, __itt_group_model, "\"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name),
+          (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name),
+          (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name),
+          (ITT_FORMAT name), model_iteration_taskW, __itt_group_model, "\"%S\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name), (ITT_FORMAT name), model_site_beginA, __itt_group_model, "\"%s\"")
-ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model, "\"%s\", %d")
-ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name), (ITT_FORMAT name), model_task_beginA, __itt_group_model, "\"%s\"")
-ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model, "\"%s\", %d")
-ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name), (ITT_FORMAT name), model_iteration_taskA, __itt_group_model, "\"%s\"")
-ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d")
-ITT_STUBV(ITTAPI, void, model_site_end_2, (void), (ITT_NO_PARAMS), model_site_end_2, __itt_group_model, "no args")
-ITT_STUBV(ITTAPI, void, model_task_end_2, (void), (ITT_NO_PARAMS), model_task_end_2, __itt_group_model, "no args")
-ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock), (ITT_FORMAT lock), model_lock_acquire_2, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock), (ITT_FORMAT lock), model_lock_release_2, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t count), (ITT_FORMAT count), model_aggregate_task, __itt_group_model, "%d")
-ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x), (ITT_FORMAT x), model_disable_push, __itt_group_model, "%p")
-ITT_STUBV(ITTAPI, void, model_disable_pop, (void), (ITT_NO_PARAMS), model_disable_pop, __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name),
+ (ITT_FORMAT name), model_site_beginA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len),
+ (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model,
+ "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name),
+ (ITT_FORMAT name), model_task_beginA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len),
+ (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model,
+ "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name),
+ (ITT_FORMAT name), model_iteration_taskA, __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t len),
+ (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model,
+ "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_site_end_2, (void), (ITT_NO_PARAMS),
+ model_site_end_2, __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_task_end_2, (void), (ITT_NO_PARAMS),
+ model_task_end_2, __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock), (ITT_FORMAT lock),
+ model_lock_acquire_2, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock), (ITT_FORMAT lock),
+ model_lock_release_2, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t count),
+ (ITT_FORMAT count), model_aggregate_task, __itt_group_model, "%d")
+ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x),
+ (ITT_FORMAT x), model_disable_push, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_disable_pop, (void), (ITT_NO_PARAMS),
+ model_disable_pop, __itt_group_model, "no args")
#endif /* __ITT_INTERNAL_BODY */
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"")
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap, "\"%s\", \"%s\"")
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_create, __itt_group_heap, "\"%s\", \"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA,
+ (const char *name, const char *domain), (ITT_FORMAT name, domain),
+ heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW,
+ (const wchar_t *name, const wchar_t *domain),
+ (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap,
+ "\"%s\", \"%s\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,
+ (const char *name, const char *domain), (ITT_FORMAT name, domain),
+ heap_function_create, __itt_group_heap, "\"%s\", \"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized), (ITT_FORMAT h, size, initialized), heap_allocate_begin, __itt_group_heap, "%p, %lu, %d")
-ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized), (ITT_FORMAT h, addr, size, initialized), heap_allocate_end, __itt_group_heap, "%p, %p, %lu, %d")
-ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
-ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_end, __itt_group_heap, "%p, %p")
-ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_size, initialized), heap_reallocate_begin, __itt_group_heap, "%p, %p, %lu, %d")
-ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_addr, new_size, initialized), heap_reallocate_end, __itt_group_heap, "%p, %p, %p, %lu, %d")
-ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS), heap_internal_access_begin, __itt_group_heap, "no args")
-ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void), (ITT_NO_PARAMS), heap_internal_access_end, __itt_group_heap, "no args")
-ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void), (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap, "no args")
-ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void), (ITT_NO_PARAMS), heap_record_memory_growth_end, __itt_group_heap, "no args")
-ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask), (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
-ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask), (ITT_FORMAT record_mask), heap_record, __itt_group_heap, "%u")
-
-ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_create, __itt_group_structure, "%p, %lu")
-ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_destroy, __itt_group_structure, "%p, %lu")
-
-ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS), get_timestamp, __itt_group_structure, "no args")
-
-ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), region_begin, __itt_group_structure, "%p, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), region_end, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, heap_allocate_begin,
+ (__itt_heap_function h, size_t size, int initialized),
+ (ITT_FORMAT h, size, initialized), heap_allocate_begin,
+ __itt_group_heap, "%p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_allocate_end,
+ (__itt_heap_function h, void **addr, size_t size, int initialized),
+ (ITT_FORMAT h, addr, size, initialized), heap_allocate_end,
+ __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void *addr),
+ (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void *addr),
+ (ITT_FORMAT h, addr), heap_free_end, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin,
+ (__itt_heap_function h, void *addr, size_t new_size, int initialized),
+ (ITT_FORMAT h, addr, new_size, initialized), heap_reallocate_begin,
+ __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_reallocate_end,
+ (__itt_heap_function h, void *addr, void **new_addr, size_t new_size,
+ int initialized),
+ (ITT_FORMAT h, addr, new_addr, new_size, initialized),
+ heap_reallocate_end, __itt_group_heap, "%p, %p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS),
+ heap_internal_access_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void), (ITT_NO_PARAMS),
+ heap_internal_access_end, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void),
+ (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap,
+ "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void), (ITT_NO_PARAMS),
+ heap_record_memory_growth_end, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask),
+ (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
+ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask),
+ (ITT_FORMAT record_mask), heap_record, __itt_group_heap, "%u")
+
+ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id),
+ (ITT_FORMAT domain, id), id_create, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id),
+ (ITT_FORMAT domain, id), id_destroy, __itt_group_structure, "%p, %lu")
+
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS),
+ get_timestamp, __itt_group_structure, "no args")
+
+ITT_STUBV(ITTAPI, void, region_begin,
+ (const __itt_domain *domain, __itt_id id, __itt_id parent,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, id, parent, name), region_begin,
+ __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id),
+ (ITT_FORMAT domain, id), region_end, __itt_group_structure, "%p, %lu")
#ifndef __ITT_INTERNAL_BODY
-ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_begin_v3, __itt_group_structure, "%p, %p")
-ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_end_v3, __itt_group_structure, "%p, %p")
-ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, frame_begin_v3,
+ (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id),
+ frame_begin_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_end_v3,
+ (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id),
+ frame_end_v3, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_submit_v3,
+ (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin,
+ __itt_timestamp end),
+ (ITT_FORMAT domain, id, begin, end), frame_submit_v3,
+ __itt_group_structure, "%p, %p, %lu, %lu")
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_group, __itt_group_structure, "%p, %lu, %lu, %p")
-
-ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin, __itt_group_structure, "%p, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parent, void* fn), (ITT_FORMAT domain, id, parent, fn), task_begin_fn, __itt_group_structure, "%p, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain), (ITT_FORMAT domain), task_end, __itt_group_structure, "%p")
-
-ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT domain, name), counter_inc_v3, __itt_group_structure, "%p, %p")
-ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_inc_delta_v3, __itt_group_structure, "%p, %p, %lu")
-ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT domain, name), counter_dec_v3, __itt_group_structure, "%p, %p")
-ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_dec_delta_v3, __itt_group_structure, "%p, %p, %lu")
-
-ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure, "%p, %lu, %p, %d")
-
-ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, id, key, type, count, data), metadata_add, __itt_group_structure, "%p, %lu, %p, %d, %lu, %p")
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addA, __itt_group_structure, "%p, %lu, %p, %p, %lu")
-ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addW, __itt_group_structure, "%p, %lu, %p, %p, %lu")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_add, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, task_group,
+ (const __itt_domain *domain, __itt_id id, __itt_id parent,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, id, parent, name), task_group,
+ __itt_group_structure, "%p, %lu, %lu, %p")
+
+ITT_STUBV(ITTAPI, void, task_begin,
+ (const __itt_domain *domain, __itt_id id, __itt_id parent,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, id, parent, name), task_begin,
+ __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn,
+ (const __itt_domain *domain, __itt_id id, __itt_id parent, void *fn),
+ (ITT_FORMAT domain, id, parent, fn), task_begin_fn,
+ __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain),
+ (ITT_FORMAT domain), task_end, __itt_group_structure, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_inc_v3,
+ (const __itt_domain *domain, __itt_string_handle *name),
+ (ITT_FORMAT domain, name), counter_inc_v3, __itt_group_structure,
+ "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3,
+ (const __itt_domain *domain, __itt_string_handle *name,
+ unsigned long long value),
+ (ITT_FORMAT domain, name, value), counter_inc_delta_v3,
+ __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec_v3,
+ (const __itt_domain *domain, __itt_string_handle *name),
+ (ITT_FORMAT domain, name), counter_dec_v3, __itt_group_structure,
+ "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3,
+ (const __itt_domain *domain, __itt_string_handle *name,
+ unsigned long long value),
+ (ITT_FORMAT domain, name, value), counter_dec_delta_v3,
+ __itt_group_structure, "%p, %p, %lu")
+
+ITT_STUBV(ITTAPI, void, marker,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *name,
+ __itt_scope scope),
+ (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure,
+ "%p, %lu, %p, %d")
+
+ITT_STUBV(ITTAPI, void, metadata_add,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ __itt_metadata_type type, size_t count, void *data),
+ (ITT_FORMAT domain, id, key, type, count, data), metadata_add,
+ __itt_group_structure, "%p, %lu, %p, %d, %lu, %p")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const char *data, size_t length),
+ (ITT_FORMAT domain, id, key, data, length), metadata_str_addA,
+ __itt_group_structure, "%p, %lu, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_addW,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const wchar_t *data, size_t length),
+ (ITT_FORMAT domain, id, key, data, length), metadata_str_addW,
+ __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add,
+ (const __itt_domain *domain, __itt_id id, __itt_string_handle *key,
+ const char *data, size_t length),
+ (ITT_FORMAT domain, id, key, data, length), metadata_str_add,
+ __itt_group_structure, "%p, %lu, %p, %p, %lu")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, relation, tail), relation_add_to_current, __itt_group_structure, "%p, %lu, %p")
-ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, head, relation, tail), relation_add, __itt_group_structure, "%p, %p, %lu, %p")
+ITT_STUBV(ITTAPI, void, relation_add_to_current,
+ (const __itt_domain *domain, __itt_relation relation, __itt_id tail),
+ (ITT_FORMAT domain, relation, tail), relation_add_to_current,
+ __itt_group_structure, "%p, %lu, %p")
+ITT_STUBV(ITTAPI, void, relation_add,
+ (const __itt_domain *domain, __itt_id head, __itt_relation relation,
+ __itt_id tail),
+ (ITT_FORMAT domain, head, relation, tail), relation_add,
+ __itt_group_structure, "%p, %p, %lu, %p")
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_createA, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
-ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_create, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen),
+ (ITT_FORMAT name, namelen), event_createA,
+ __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, __itt_event, event_createW,
+ (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen),
+ event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen),
+ (ITT_FORMAT name, namelen), event_create,
+ __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event), (ITT_FORMAT event), event_start, __itt_group_mark | __itt_group_legacy, "%d")
-ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event), (ITT_FORMAT event), event_end, __itt_group_mark | __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event), (ITT_FORMAT event),
+ event_start, __itt_group_mark | __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event), (ITT_FORMAT event),
+ event_end, __itt_group_mark | __itt_group_legacy, "%d")
#endif /* __ITT_INTERNAL_BODY */
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
-ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", \"%S\", %x")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "p, \"%s\", \"%s\", %x")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA,
+ (void *addr, const char *objtype, const char *objname, int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_set_nameW,
+ (void *addr, const wchar_t *objtype, const wchar_t *objname,
+ int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "%p, \"%S\", \"%S\", %x")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,
+ (void *addr, const char *objtype, const char *objname, int attribute),
+ (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "p, \"%s\", \"%s\", %x")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
-ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", %d, \"%S\", %d, %x")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA,
+ (void *p, const char *objtype, int typelen, const char *objname,
+ int namelen, int attribute),
+ (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute),
+ notify_sync_nameA,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "%p, \"%s\", %d, \"%s\", %d, %x")
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW,
+ (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname,
+ int namelen, int attribute),
+ (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute),
+ notify_sync_nameW,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "%p, \"%S\", %d, \"%S\", %d, %x")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,
+ (void *p, const char *objtype, int typelen, const char *objname,
+ int namelen, int attribute),
+ (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute),
+ notify_sync_name,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy,
+ "%p, \"%s\", %d, \"%s\", %d, %x")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *p), (ITT_FORMAT p), notify_sync_prepare, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
-ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *p), (ITT_FORMAT p), notify_sync_cancel, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
-ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *p), (ITT_FORMAT p), notify_sync_acquired, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
-ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *p), (ITT_FORMAT p), notify_sync_releasing, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *p), (ITT_FORMAT p),
+ notify_sync_prepare,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *p), (ITT_FORMAT p),
+ notify_sync_cancel,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *p), (ITT_FORMAT p),
+ notify_sync_acquired,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *p), (ITT_FORMAT p),
+ notify_sync_releasing,
+ __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_read, __itt_group_legacy, "%p, %lu")
-ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_write, __itt_group_legacy, "%p, %lu")
-ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu")
-
-ITT_STUB(LIBITTAPI, __itt_state_t, state_get, (void), (ITT_NO_PARAMS), state_get, __itt_group_legacy, "no args")
-ITT_STUB(LIBITTAPI, __itt_state_t, state_set, (__itt_state_t s), (ITT_FORMAT s), state_set, __itt_group_legacy, "%d")
-ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s), obj_mode_set, __itt_group_legacy, "%d, %d")
-ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s), thr_mode_set, __itt_group_legacy, "%d, %d")
+ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), memory_read, __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), memory_write, __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size),
+ (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu")
+
+ITT_STUB(LIBITTAPI, __itt_state_t, state_get, (void), (ITT_NO_PARAMS),
+ state_get, __itt_group_legacy, "no args")
+ITT_STUB(LIBITTAPI, __itt_state_t, state_set, (__itt_state_t s), (ITT_FORMAT s),
+ state_set, __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set,
+ (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s),
+ obj_mode_set, __itt_group_legacy, "%d, %d")
+ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set,
+ (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s),
+ thr_mode_set, __itt_group_legacy, "%d, %d")
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain), (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"")
-ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%s\"")
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain), (ITT_FORMAT domain), frame_create, __itt_group_frame, "\"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain),
+ (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain),
+         (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%S\"")
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain),
+ (ITT_FORMAT domain), frame_create, __itt_group_frame, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame), (ITT_FORMAT frame), frame_begin, __itt_group_frame, "%p")
-ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame), (ITT_FORMAT frame), frame_end, __itt_group_frame, "%p")
-
-ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id), (ITT_FORMAT id), counter_destroy, __itt_group_counter, "%p")
-ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id), (ITT_FORMAT id), counter_inc, __itt_group_counter, "%p")
-ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_inc_delta, __itt_group_counter, "%p, %lu")
-ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id), (ITT_FORMAT id), counter_dec, __itt_group_counter, "%p")
-ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_dec_delta, __itt_group_counter, "%p, %lu")
-ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr), (ITT_FORMAT id, value_ptr), counter_set_value, __itt_group_counter, "%p, %p")
-ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr), (ITT_FORMAT id, clock_domain, timestamp, value_ptr), counter_set_value_ex, __itt_group_counter, "%p, %p, %llu, %p")
+ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame), (ITT_FORMAT frame),
+ frame_begin, __itt_group_frame, "%p")
+ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame), (ITT_FORMAT frame),
+ frame_end, __itt_group_frame, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id), (ITT_FORMAT id),
+ counter_destroy, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id), (ITT_FORMAT id),
+ counter_inc, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta,
+ (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value),
+ counter_inc_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id), (ITT_FORMAT id),
+ counter_dec, __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta,
+ (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value),
+ counter_dec_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr),
+ (ITT_FORMAT id, value_ptr), counter_set_value, __itt_group_counter,
+ "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_set_value_ex,
+ (__itt_counter id, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, void *value_ptr),
+ (ITT_FORMAT id, clock_domain, timestamp, value_ptr),
+ counter_set_value_ex, __itt_group_counter, "%p, %p, %llu, %p")
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name), (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"")
-ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name), (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name), (ITT_FORMAT name), mark_create, __itt_group_mark, "\"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name),
+ (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name),
+ (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name),
+ (ITT_FORMAT name), mark_create, __itt_group_mark, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* __ITT_INTERNAL_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"")
-ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark, __itt_group_mark, "%d, \"%s\"")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter),
+ (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter),
+ (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter),
+ (ITT_FORMAT mt, parameter), mark, __itt_group_mark, "%d, \"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off, __itt_group_mark, "%d")
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark, "%d, \"%s\"")
-ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark, "%d, \"%S\"")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_global, __itt_group_mark, "%d, \"%S\"")
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off,
+ __itt_group_mark, "%d")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter),
+ (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark,
+ "%d, \"%s\"")
+ITT_STUB(ITTAPI, int, mark_globalW,
+ (__itt_mark_type mt, const wchar_t *parameter),
+ (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark,
+ "%d, \"%S\"")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter),
+ (ITT_FORMAT mt, parameter), mark_global, __itt_group_mark,
+ "%d, \"%S\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_global_off, __itt_group_mark, "%d")
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt), (ITT_FORMAT mt),
+ mark_global_off, __itt_group_mark, "%d")
#ifndef __ITT_INTERNAL_BODY
-ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS), stack_caller_create, __itt_group_stitch, "no args")
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS),
+ stack_caller_create, __itt_group_stitch, "no args")
#endif /* __ITT_INTERNAL_BODY */
-ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id), (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p")
-ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id), (ITT_FORMAT id), stack_callee_enter, __itt_group_stitch, "%p")
-ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id), (ITT_FORMAT id), stack_callee_leave, __itt_group_stitch, "%p")
-
-ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data), (ITT_FORMAT fn, fn_data), clock_domain_create, __itt_group_structure, "%p, %p")
-ITT_STUBV(ITTAPI, void, clock_domain_reset, (void), (ITT_NO_PARAMS), clock_domain_reset, __itt_group_structure, "no args")
-ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex, __itt_group_structure, "%p, %p, %lu, %lu")
-ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex, __itt_group_structure, "%p, %p, %lu, %lu")
-ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn), task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp), (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex, __itt_group_structure, "%p, %p, %lu")
-ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin_overlapped, __itt_group_structure, "%p, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
-ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), task_end_overlapped, __itt_group_structure, "%p, %lu")
-ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu")
-ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope), marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d")
-ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, scope, key, type, count, data), metadata_add_with_scope, __itt_group_structure, "%p, %d, %p, %d, %lu, %p")
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeA, __itt_group_structure, "%p, %d, %p, %p, %lu")
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeW, __itt_group_structure, "%p, %d, %p, %p, %lu")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scope, __itt_group_structure, "%p, %d, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id),
+ (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id), (ITT_FORMAT id),
+ stack_callee_enter, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id), (ITT_FORMAT id),
+ stack_callee_leave, __itt_group_stitch, "%p")
+
+ITT_STUB(ITTAPI, __itt_clock_domain *, clock_domain_create,
+ (__itt_get_clock_info_fn fn, void *fn_data), (ITT_FORMAT fn, fn_data),
+ clock_domain_create, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void), (ITT_NO_PARAMS),
+ clock_domain_reset, __itt_group_structure, "no args")
+ITT_STUBV(ITTAPI, void, id_create_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id),
+ (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex,
+ __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id),
+ (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex,
+ __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name),
+ task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_id parentid,
+ void *fn),
+ (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn),
+ task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp),
+ (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex,
+ __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped,
+ (const __itt_domain *domain, __itt_id id, __itt_id parent,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, id, parent, name), task_begin_overlapped,
+ __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_id parentid,
+ __itt_string_handle *name),
+ (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name),
+ task_begin_overlapped_ex, __itt_group_structure,
+ "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_overlapped,
+ (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id),
+ task_end_overlapped, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id),
+ (ITT_FORMAT domain, clock_domain, timestamp, id),
+ task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, marker_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id id, __itt_string_handle *name,
+ __itt_scope scope),
+ (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope),
+ marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d")
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, __itt_metadata_type type, size_t count,
+ void *data),
+ (ITT_FORMAT domain, scope, key, type, count, data),
+ metadata_add_with_scope, __itt_group_structure,
+ "%p, %d, %p, %d, %lu, %p")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const char *data, size_t length),
+ (ITT_FORMAT domain, scope, key, data, length),
+ metadata_str_add_with_scopeA, __itt_group_structure,
+ "%p, %d, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const wchar_t *data, size_t length),
+ (ITT_FORMAT domain, scope, key, data, length),
+ metadata_str_add_with_scopeW, __itt_group_structure,
+ "%p, %d, %p, %p, %lu")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope,
+ (const __itt_domain *domain, __itt_scope scope,
+ __itt_string_handle *key, const char *data, size_t length),
+ (ITT_FORMAT domain, scope, key, data, length),
+ metadata_str_add_with_scope, __itt_group_structure,
+ "%p, %d, %p, %p, %lu")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, relation, tail), relation_add_to_current_ex, __itt_group_structure, "%p, %p, %lu, %d, %lu")
-ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail), relation_add_ex, __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu")
-ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type), (ITT_FORMAT name, track_group_type), track_group_create, __itt_group_structure, "%p, %d")
-ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type), (ITT_FORMAT track_group, name, track_type), track_create, __itt_group_structure, "%p, %p, %d")
-ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track), (ITT_FORMAT track), set_track, __itt_group_structure, "%p")
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_relation relation,
+ __itt_id tail),
+ (ITT_FORMAT domain, clock_domain, timestamp, relation, tail),
+ relation_add_to_current_ex, __itt_group_structure,
+ "%p, %p, %lu, %d, %lu")
+ITT_STUBV(ITTAPI, void, relation_add_ex,
+ (const __itt_domain *domain, __itt_clock_domain *clock_domain,
+ unsigned long long timestamp, __itt_id head, __itt_relation relation,
+ __itt_id tail),
+ (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail),
+ relation_add_ex, __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu")
+ITT_STUB(ITTAPI, __itt_track_group *, track_group_create,
+ (__itt_string_handle * name, __itt_track_group_type track_group_type),
+ (ITT_FORMAT name, track_group_type), track_group_create,
+ __itt_group_structure, "%p, %d")
+ITT_STUB(ITTAPI, __itt_track *, track_create,
+ (__itt_track_group * track_group, __itt_string_handle *name,
+ __itt_track_type track_type),
+ (ITT_FORMAT track_group, name, track_type), track_create,
+ __itt_group_structure, "%p, %p, %d")
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track * track), (ITT_FORMAT track),
+ set_track, __itt_group_structure, "%p")
#ifndef __ITT_INTERNAL_BODY
-ITT_STUB(ITTAPI, const char*, api_version, (void), (ITT_NO_PARAMS), api_version, __itt_group_all & ~__itt_group_legacy, "no args")
+ITT_STUB(ITTAPI, const char *, api_version, (void), (ITT_NO_PARAMS),
+ api_version, __itt_group_all & ~__itt_group_legacy, "no args")
#endif /* __ITT_INTERNAL_BODY */
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
-ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_save, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA,
+ (void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder),
+ (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder),
+ av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+ITT_STUB(ITTAPI, int, av_saveW,
+ (void *data, int rank, const int *dimensions, int type,
+ const wchar_t *filePath, int columnOrder),
+ (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder),
+ av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,
+ (void *data, int rank, const int *dimensions, int type,
+ const char *filePath, int columnOrder),
+ (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder),
+ av_save, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* __ITT_INTERNAL_BODY */
#ifndef __ITT_INTERNAL_BODY
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_none, "%p, %p, %p")
-ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_none, "%p, %p, %p")
-#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_none, "%p, %p, %p")
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, module_loadA,
+ (void *start_addr, void *end_addr, const char *path),
+ (ITT_FORMAT start_addr, end_addr, path), module_loadA,
+ __itt_group_none, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_loadW,
+ (void *start_addr, void *end_addr, const wchar_t *path),
+ (ITT_FORMAT start_addr, end_addr, path), module_loadW,
+ __itt_group_none, "%p, %p, %p")
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, module_load,
+ (void *start_addr, void *end_addr, const char *path),
+ (ITT_FORMAT start_addr, end_addr, path), module_load,
+ __itt_group_none, "%p, %p, %p")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* __ITT_INTERNAL_BODY */
-
#endif /* __ITT_INTERNAL_INIT */
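
For orientation before the next file: each ITT_STUBV/ITT_STUB line in the tables above declares one lazily bound entry point, a function-pointer slot that stays null until a collector attaches, backed by a no-op fallback. A minimal sketch of that pattern in plain C (all sk_* names are local to this example; __itt_counter is simplified to void*, and the real macros additionally paste in the version suffix and honor the __itt_group_* filter):

/* Sketch: how an ITT_STUBV(ITTAPI, void, counter_inc, ...) entry is
 * typically consumed. The macro declares a pointer type plus a slot the
 * collector fills in at load time; user-facing calls null-check it. */
typedef void (*sk_counter_inc_t)(void *id);
static sk_counter_inc_t sk_counter_inc_ptr = 0; /* null until a tool attaches */

static void sk_counter_inc(void *id) {
  if (sk_counter_inc_ptr) /* cheap no-op when nothing is collecting */
    sk_counter_inc_ptr(id);
}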
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_types.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_types.h
index 88181612b23b..66afd8c7d183 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_types.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_types.h
@@ -7,61 +7,56 @@
//
//===----------------------------------------------------------------------===//
-
#ifndef _ITTNOTIFY_TYPES_H_
#define _ITTNOTIFY_TYPES_H_
-typedef enum ___itt_group_id
-{
- __itt_group_none = 0,
- __itt_group_legacy = 1<<0,
- __itt_group_control = 1<<1,
- __itt_group_thread = 1<<2,
- __itt_group_mark = 1<<3,
- __itt_group_sync = 1<<4,
- __itt_group_fsync = 1<<5,
- __itt_group_jit = 1<<6,
- __itt_group_model = 1<<7,
- __itt_group_splitter_min = 1<<7,
- __itt_group_counter = 1<<8,
- __itt_group_frame = 1<<9,
- __itt_group_stitch = 1<<10,
- __itt_group_heap = 1<<11,
- __itt_group_splitter_max = 1<<12,
- __itt_group_structure = 1<<12,
- __itt_group_suppress = 1<<13,
- __itt_group_arrays = 1<<14,
- __itt_group_all = -1
+typedef enum ___itt_group_id {
+ __itt_group_none = 0,
+ __itt_group_legacy = 1 << 0,
+ __itt_group_control = 1 << 1,
+ __itt_group_thread = 1 << 2,
+ __itt_group_mark = 1 << 3,
+ __itt_group_sync = 1 << 4,
+ __itt_group_fsync = 1 << 5,
+ __itt_group_jit = 1 << 6,
+ __itt_group_model = 1 << 7,
+ __itt_group_splitter_min = 1 << 7,
+ __itt_group_counter = 1 << 8,
+ __itt_group_frame = 1 << 9,
+ __itt_group_stitch = 1 << 10,
+ __itt_group_heap = 1 << 11,
+ __itt_group_splitter_max = 1 << 12,
+ __itt_group_structure = 1 << 12,
+ __itt_group_suppress = 1 << 13,
+ __itt_group_arrays = 1 << 14,
+ __itt_group_all = -1
} __itt_group_id;
#pragma pack(push, 8)
-typedef struct ___itt_group_list
-{
- __itt_group_id id;
- const char* name;
+typedef struct ___itt_group_list {
+ __itt_group_id id;
+ const char *name;
} __itt_group_list;
#pragma pack(pop)
-#define ITT_GROUP_LIST(varname) \
- static __itt_group_list varname[] = { \
- { __itt_group_all, "all" }, \
- { __itt_group_control, "control" }, \
- { __itt_group_thread, "thread" }, \
- { __itt_group_mark, "mark" }, \
- { __itt_group_sync, "sync" }, \
- { __itt_group_fsync, "fsync" }, \
- { __itt_group_jit, "jit" }, \
- { __itt_group_model, "model" }, \
- { __itt_group_counter, "counter" }, \
- { __itt_group_frame, "frame" }, \
- { __itt_group_stitch, "stitch" }, \
- { __itt_group_heap, "heap" }, \
- { __itt_group_structure, "structure" }, \
- { __itt_group_suppress, "suppress" }, \
- { __itt_group_arrays, "arrays" }, \
- { __itt_group_none, NULL } \
- }
+#define ITT_GROUP_LIST(varname) \
+ static __itt_group_list varname[] = {{__itt_group_all, "all"}, \
+ {__itt_group_control, "control"}, \
+ {__itt_group_thread, "thread"}, \
+ {__itt_group_mark, "mark"}, \
+ {__itt_group_sync, "sync"}, \
+ {__itt_group_fsync, "fsync"}, \
+ {__itt_group_jit, "jit"}, \
+ {__itt_group_model, "model"}, \
+ {__itt_group_counter, "counter"}, \
+ {__itt_group_frame, "frame"}, \
+ {__itt_group_stitch, "stitch"}, \
+ {__itt_group_heap, "heap"}, \
+ {__itt_group_structure, "structure"}, \
+ {__itt_group_suppress, "suppress"}, \
+ {__itt_group_arrays, "arrays"}, \
+ {__itt_group_none, NULL}}
#endif /* _ITTNOTIFY_TYPES_H_ */
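
Because every __itt_group_* constant above occupies its own bit, a filter parsed from the ITT_GROUP_LIST names composes with | and membership is a single & test. A self-contained sketch of that check (the enum is abbreviated to two mirrored values for the example):

#include <stdio.h>

typedef enum {
  grp_none = 0,
  grp_mark = 1 << 3,   /* mirrors __itt_group_mark */
  grp_counter = 1 << 8 /* mirrors __itt_group_counter */
} grp_id;

int main(void) {
  int mask = grp_mark | grp_counter; /* e.g. built from "mark,counter" */
  printf("counter on: %d\n", (mask & grp_counter) != 0); /* 1 */
  printf("frame   on: %d\n", (mask & (1 << 9)) != 0);    /* 0 */
  return 0;
}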
diff --git a/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
index a4061e168d1d..384a55881e1f 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
@@ -17,59 +17,59 @@
/** @cond exclude_from_documentation */
#ifndef ITT_OS_WIN
-# define ITT_OS_WIN 1
+#define ITT_OS_WIN 1
#endif /* ITT_OS_WIN */
#ifndef ITT_OS_LINUX
-# define ITT_OS_LINUX 2
+#define ITT_OS_LINUX 2
#endif /* ITT_OS_LINUX */
#ifndef ITT_OS_MAC
-# define ITT_OS_MAC 3
+#define ITT_OS_MAC 3
#endif /* ITT_OS_MAC */
#ifndef ITT_OS_FREEBSD
-# define ITT_OS_FREEBSD 4
+#define ITT_OS_FREEBSD 4
#endif /* ITT_OS_FREEBSD */
#ifndef ITT_OS
-# if defined WIN32 || defined _WIN32
-# define ITT_OS ITT_OS_WIN
-# elif defined( __APPLE__ ) && defined( __MACH__ )
-# define ITT_OS ITT_OS_MAC
-# elif defined( __FreeBSD__ )
-# define ITT_OS ITT_OS_FREEBSD
-# else
-# define ITT_OS ITT_OS_LINUX
-# endif
+#if defined WIN32 || defined _WIN32
+#define ITT_OS ITT_OS_WIN
+#elif defined(__APPLE__) && defined(__MACH__)
+#define ITT_OS ITT_OS_MAC
+#elif defined(__FreeBSD__)
+#define ITT_OS ITT_OS_FREEBSD
+#else
+#define ITT_OS ITT_OS_LINUX
+#endif
#endif /* ITT_OS */
#ifndef ITT_PLATFORM_WIN
-# define ITT_PLATFORM_WIN 1
+#define ITT_PLATFORM_WIN 1
#endif /* ITT_PLATFORM_WIN */
#ifndef ITT_PLATFORM_POSIX
-# define ITT_PLATFORM_POSIX 2
+#define ITT_PLATFORM_POSIX 2
#endif /* ITT_PLATFORM_POSIX */
#ifndef ITT_PLATFORM_MAC
-# define ITT_PLATFORM_MAC 3
+#define ITT_PLATFORM_MAC 3
#endif /* ITT_PLATFORM_MAC */
#ifndef ITT_PLATFORM_FREEBSD
-# define ITT_PLATFORM_FREEBSD 4
+#define ITT_PLATFORM_FREEBSD 4
#endif /* ITT_PLATFORM_FREEBSD */
#ifndef ITT_PLATFORM
-# if ITT_OS==ITT_OS_WIN
-# define ITT_PLATFORM ITT_PLATFORM_WIN
-# elif ITT_OS==ITT_OS_MAC
-# define ITT_PLATFORM ITT_PLATFORM_MAC
-# elif ITT_OS==ITT_OS_FREEBSD
-# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
-# else
-# define ITT_PLATFORM ITT_PLATFORM_POSIX
-# endif
+#if ITT_OS == ITT_OS_WIN
+#define ITT_PLATFORM ITT_PLATFORM_WIN
+#elif ITT_OS == ITT_OS_MAC
+#define ITT_PLATFORM ITT_PLATFORM_MAC
+#elif ITT_OS == ITT_OS_FREEBSD
+#define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#else
+#define ITT_PLATFORM ITT_PLATFORM_POSIX
+#endif
#endif /* ITT_PLATFORM */
#if defined(_UNICODE) && !defined(UNICODE)
@@ -77,9 +77,9 @@
#endif
#include <stddef.h>
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#include <tchar.h>
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include <stdint.h>
#if defined(UNICODE) || defined(_UNICODE)
#include <wchar.h>
@@ -87,51 +87,51 @@
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#ifndef ITTAPI_CDECL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define ITTAPI_CDECL __cdecl
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define ITTAPI_CDECL __attribute__ ((cdecl))
-# else /* _M_IX86 || __i386__ */
-# define ITTAPI_CDECL /* actual only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define ITTAPI_CDECL __cdecl
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define ITTAPI_CDECL __attribute__((cdecl))
+#else /* _M_IX86 || __i386__ */
+#define ITTAPI_CDECL /* meaningful only on x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* ITTAPI_CDECL */
#ifndef STDCALL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define STDCALL __stdcall
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define STDCALL __attribute__ ((stdcall))
-# else /* _M_IX86 || __i386__ */
-# define STDCALL /* supported only on x86 platform */
-# endif /* _M_IX86 || __i386__ */
-# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define STDCALL __stdcall
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if defined _M_IX86 || defined __i386__
+#define STDCALL __attribute__((stdcall))
+#else /* _M_IX86 || __i386__ */
+#define STDCALL /* supported only on x86 platform */
+#endif /* _M_IX86 || __i386__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* STDCALL */
-#define ITTAPI ITTAPI_CDECL
+#define ITTAPI ITTAPI_CDECL
#define LIBITTAPI ITTAPI_CDECL
/* TODO: Temporary for compatibility! */
-#define ITTAPI_CALL ITTAPI_CDECL
+#define ITTAPI_CALL ITTAPI_CDECL
#define LIBITTAPI_CALL ITTAPI_CDECL
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
/* use __forceinline (VC++ specific) */
-#define ITT_INLINE __forceinline
+#define ITT_INLINE __forceinline
#define ITT_INLINE_ATTRIBUTE /* nothing */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/*
* Generally, functions are not inlined unless optimization is specified.
* For functions declared inline, this attribute inlines the function even
* if no optimization level was specified.
*/
#ifdef __STRICT_ANSI__
-#define ITT_INLINE static
+#define ITT_INLINE static
#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
-#else /* __STRICT_ANSI__ */
-#define ITT_INLINE static inline
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
#endif /* __STRICT_ANSI__ */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -139,8 +139,8 @@
/** @cond exclude_from_documentation */
/* Helper macro for joining tokens */
-#define ITT_JOIN_AUX(p,n) p##n
-#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n)
+#define ITT_JOIN_AUX(p, n) p##n
+#define ITT_JOIN(p, n) ITT_JOIN_AUX(p, n)
#ifdef ITT_MAJOR
#undef ITT_MAJOR
@@ -148,43 +148,75 @@
#ifdef ITT_MINOR
#undef ITT_MINOR
#endif
-#define ITT_MAJOR 3
-#define ITT_MINOR 0
+#define ITT_MAJOR 3
+#define ITT_MINOR 0
/* Standard versioning of a token with major and minor version numbers */
-#define ITT_VERSIONIZE(x) \
- ITT_JOIN(x, \
- ITT_JOIN(_, \
- ITT_JOIN(ITT_MAJOR, \
- ITT_JOIN(_, ITT_MINOR))))
+#define ITT_VERSIONIZE(x) \
+ ITT_JOIN(x, ITT_JOIN(_, ITT_JOIN(ITT_MAJOR, ITT_JOIN(_, ITT_MINOR))))
#ifndef INTEL_ITTNOTIFY_PREFIX
-# define INTEL_ITTNOTIFY_PREFIX __itt_
+#define INTEL_ITTNOTIFY_PREFIX __itt_
#endif /* INTEL_ITTNOTIFY_PREFIX */
#ifndef INTEL_ITTNOTIFY_POSTFIX
-# define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#define INTEL_ITTNOTIFY_POSTFIX _ptr_
#endif /* INTEL_ITTNOTIFY_POSTFIX */
-#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
-#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, n)
+#define ITTNOTIFY_NAME(n) \
+ ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n, INTEL_ITTNOTIFY_POSTFIX)))
#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
-#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
-
-#define ITTNOTIFY_VOID_D0(n,d) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_VOID_D1(n,d,x) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_VOID_D2(n,d,x,y) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
-#define ITTNOTIFY_DATA_D0(n,d) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
-#define ITTNOTIFY_DATA_D1(n,d,x) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x)
-#define ITTNOTIFY_DATA_D2(n,d,x,y) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y)
-#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z)
-#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
-#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
-#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n, d) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n, d, x) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x)
+#define ITTNOTIFY_VOID_D2(n, d, x, y) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y)
+#define ITTNOTIFY_VOID_D3(n, d, x, y, z) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z)
+#define ITTNOTIFY_VOID_D4(n, d, x, y, z, a) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a)
+#define ITTNOTIFY_VOID_D5(n, d, x, y, z, a, b) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b)
+#define ITTNOTIFY_VOID_D6(n, d, x, y, z, a, b, c) \
+ (!(d)->flags) ? (void)0 \
+ : (!ITTNOTIFY_NAME(n)) ? (void)0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b, c)
+#define ITTNOTIFY_DATA_D0(n, d) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n, d, x) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x)
+#define ITTNOTIFY_DATA_D2(n, d, x, y) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x, y)
+#define ITTNOTIFY_DATA_D3(n, d, x, y, z) \
+ (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d, x, y, z)
+#define ITTNOTIFY_DATA_D4(n, d, x, y, z, a) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a)
+#define ITTNOTIFY_DATA_D5(n, d, x, y, z, a, b) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b)
+#define ITTNOTIFY_DATA_D6(n, d, x, y, z, a, b, c) \
+ (!(d)->flags) ? 0 \
+ : (!ITTNOTIFY_NAME(n)) ? 0 \
+ : ITTNOTIFY_NAME(n)(d, x, y, z, a, b, c)
#ifdef ITT_STUB
#undef ITT_STUB
@@ -192,9 +224,9 @@
#ifdef ITT_STUBV
#undef ITT_STUBV
#endif
-#define ITT_STUBV(api,type,name,args) \
- typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \
- extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUBV(api, type, name, args) \
+ typedef type(api *ITT_JOIN(ITTNOTIFY_NAME(name), _t)) args; \
+ extern ITT_JOIN(ITTNOTIFY_NAME(name), _t) ITTNOTIFY_NAME(name);
#define ITT_STUB ITT_STUBV
/** @endcond */
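
The joining macros above are easiest to read through one concrete expansion: ITTNOTIFY_NAME(pause) pastes prefix, name, postfix, and version into the single identifier __itt_pause_ptr__3_0, which the ITTNOTIFY_VOID/ITTNOTIFY_DATA dispatchers then null-check before calling. A self-contained sketch of just the versioning step (SK_* names are stand-ins for the ITT_ macros):

#define SK_JOIN_AUX(p, n) p##n
#define SK_JOIN(p, n) SK_JOIN_AUX(p, n) /* expand arguments, then paste */
#define SK_VERSIONIZE(x) SK_JOIN(x, SK_JOIN(_, SK_JOIN(3, SK_JOIN(_, 0))))
/* SK_VERSIONIZE(__itt_pause_ptr_) expands to __itt_pause_ptr__3_0 */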
@@ -211,9 +243,11 @@ extern "C" {
/**
* @defgroup legacy_control Collection Control
* @ingroup legacy
- * General behavior: application continues to run, but no profiling information is being collected
+ * General behavior: application continues to run, but no profiling information
+ * is being collected
*
- * Pausing occurs not only for the current thread but for all process as well as spawned processes
+ * Pausing occurs not only for the current thread but for the whole process as
+ * well as spawned processes
* - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
* - Does not analyze or report errors that involve memory access.
* - Other errors are reported as usual. Pausing data collection in
@@ -240,25 +274,25 @@ void ITTAPI __itt_detach(void);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, pause, (void))
-ITT_STUBV(ITTAPI, void, resume, (void))
-ITT_STUBV(ITTAPI, void, detach, (void))
-#define __itt_pause ITTNOTIFY_VOID(pause)
-#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
-#define __itt_resume ITTNOTIFY_VOID(resume)
+ITT_STUBV(ITTAPI, void, pause, (void))
+ITT_STUBV(ITTAPI, void, resume, (void))
+ITT_STUBV(ITTAPI, void, detach, (void))
+#define __itt_pause ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr ITTNOTIFY_NAME(pause)
+#define __itt_resume ITTNOTIFY_VOID(resume)
#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
-#define __itt_detach ITTNOTIFY_VOID(detach)
+#define __itt_detach ITTNOTIFY_VOID(detach)
#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_pause()
-#define __itt_pause_ptr 0
+#define __itt_pause_ptr 0
#define __itt_resume()
#define __itt_resume_ptr 0
#define __itt_detach()
#define __itt_detach_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_pause_ptr 0
+#else /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr 0
#define __itt_resume_ptr 0
#define __itt_detach_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
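
A usage sketch for the collection-control trio above: bracketing a noisy setup phase so an attached profiler records only the region of interest. All three calls reduce to a null check when no tool is present:

#include "ittnotify.h"

static void run_measured(void) {
  __itt_pause();  /* suppress collection during setup */
  /* ... allocation, cache warm-up ... */
  __itt_resume(); /* profile only the steady state */
  /* ... measured work ... */
  __itt_detach(); /* optional: stop collection for good */
}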
@@ -276,17 +310,18 @@ ITT_STUBV(ITTAPI, void, detach, (void))
/**
* @deprecated Legacy API
* @brief Set name to be associated with thread in analysis GUI.
- * @return __itt_err upon failure (name or namelen being null,name and namelen mismatched)
+ * @return __itt_err upon failure (name or namelen being null, name and
+ * namelen mismatched)
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int LIBITTAPI __itt_thr_name_setA(const char *name, int namelen);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+int LIBITTAPI __itt_thr_name_setA(const char *name, int namelen);
int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_thr_name_set __itt_thr_name_setW
-# define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr
+#define __itt_thr_name_set __itt_thr_name_setW
+#define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr
#else
-# define __itt_thr_name_set __itt_thr_name_setA
-# define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr
+#define __itt_thr_name_set __itt_thr_name_setA
+#define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
int LIBITTAPI __itt_thr_name_set(const char *name, int namelen);
@@ -295,23 +330,23 @@ int LIBITTAPI __itt_thr_name_set(const char *name, int namelen);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen))
ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_thr_name_setA ITTNOTIFY_DATA(thr_name_setA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_thr_name_setA ITTNOTIFY_DATA(thr_name_setA)
#define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA)
-#define __itt_thr_name_setW ITTNOTIFY_DATA(thr_name_setW)
+#define __itt_thr_name_setW ITTNOTIFY_DATA(thr_name_setW)
#define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_thr_name_set ITTNOTIFY_DATA(thr_name_set)
+#define __itt_thr_name_set ITTNOTIFY_DATA(thr_name_set)
#define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_thr_name_setA(name, namelen)
#define __itt_thr_name_setA_ptr 0
#define __itt_thr_name_setW(name, namelen)
@@ -321,8 +356,8 @@ ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen))
#define __itt_thr_name_set_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_thr_name_setA_ptr 0
#define __itt_thr_name_setW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -333,7 +368,8 @@ ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen))
/**
* @deprecated Legacy API
- * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ * @brief Mark current thread as ignored from this point on, for the duration of
+ * its existence.
*/
void LIBITTAPI __itt_thr_ignore(void);
@@ -341,13 +377,13 @@ void LIBITTAPI __itt_thr_ignore(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, thr_ignore, (void))
-#define __itt_thr_ignore ITTNOTIFY_VOID(thr_ignore)
+#define __itt_thr_ignore ITTNOTIFY_VOID(thr_ignore)
#define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_thr_ignore()
#define __itt_thr_ignore_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_thr_ignore_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
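
A short usage sketch for the thread-naming entry point above, on the single-encoding (non-Windows) path; the thread name is illustrative:

#include <string.h>
#include "ittnotify.h"

static void name_current_thread(void) {
  const char *name = "io-worker"; /* illustrative name */
  /* Returns __itt_err if name is null or namelen does not match. */
  (void)__itt_thr_name_set(name, (int)strlen(name));
}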
@@ -370,55 +406,64 @@ ITT_STUBV(LIBITTAPI, void, thr_ignore, (void))
* @hideinitializer
* @brief possible value of attribute argument for sync object type
*/
-#define __itt_attr_mutex 2
+#define __itt_attr_mutex 2
/**
* @deprecated Legacy API
* @brief Assign a name to a sync object using char or Unicode string
- * @param[in] addr - pointer to the sync object. You should use a real pointer to your object
- * to make sure that the values don't clash with other object addresses
- * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
- * be assumed to be of generic "User Synchronization" type
- * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
- * to the object -- you can use the __itt_sync_rename call later to assign
- * the name
- * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
- * exact semantics of how prepare/acquired/releasing calls work.
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_sync_set_nameA(void *addr, const char *objtype, const char *objname, int attribute);
-void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+ * @param[in] addr - pointer to the sync object. You should use a real
+ * pointer to your object to make sure that the values don't clash with other
+ * object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed,
+ * the object will be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name
+ * will be assigned to the object -- you can use the __itt_sync_rename call
+ * later to assign the name
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values
+ * which defines the exact semantics of how prepare/acquired/releasing calls
+ * work.
+ */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_set_nameA(void *addr, const char *objtype,
+ const char *objname, int attribute);
+void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype,
+ const wchar_t *objname, int attribute);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_sync_set_name __itt_sync_set_nameW
-# define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr
+#define __itt_sync_set_name __itt_sync_set_nameW
+#define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr
#else /* UNICODE */
-# define __itt_sync_set_name __itt_sync_set_nameA
-# define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr
+#define __itt_sync_set_name __itt_sync_set_nameA
+#define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute);
+void ITTAPI __itt_sync_set_name(void *addr, const char *objtype,
+ const char *objname, int attribute);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute))
-ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA,
+ (void *addr, const char *objtype, const char *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_set_nameW,
+ (void *addr, const wchar_t *objtype, const wchar_t *objname,
+ int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,
+ (void *addr, const char *objtype, const char *objname, int attribute))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_sync_set_nameA ITTNOTIFY_VOID(sync_set_nameA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA ITTNOTIFY_VOID(sync_set_nameA)
#define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA)
-#define __itt_sync_set_nameW ITTNOTIFY_VOID(sync_set_nameW)
+#define __itt_sync_set_nameW ITTNOTIFY_VOID(sync_set_nameW)
#define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_sync_set_name ITTNOTIFY_VOID(sync_set_name)
+#define __itt_sync_set_name ITTNOTIFY_VOID(sync_set_name)
#define __itt_sync_set_name_ptr ITTNOTIFY_NAME(sync_set_name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_set_nameA(addr, objtype, objname, attribute)
#define __itt_sync_set_nameA_ptr 0
#define __itt_sync_set_nameW(addr, objtype, objname, attribute)
@@ -428,8 +473,8 @@ ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, con
#define __itt_sync_set_name_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_sync_set_nameA_ptr 0
#define __itt_sync_set_nameW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -441,61 +486,80 @@ ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, con
/**
* @deprecated Legacy API
* @brief Assign a name and type to a sync object using char or Unicode string
- * @param[in] addr - pointer to the sync object. You should use a real pointer to your object
- * to make sure that the values don't clash with other object addresses
- * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
- * be assumed to be of generic "User Synchronization" type
- * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
- * to the object -- you can use the __itt_sync_rename call later to assign
- * the name
- * @param[in] typelen, namelen - a length of string for appropriate objtype and objname parameter
- * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
- * exact semantics of how prepare/acquired/releasing calls work.
- * @return __itt_err upon failure (name or namelen being null,name and namelen mismatched)
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
-int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute);
+ * @param[in] addr - pointer to the sync object. You should use a real
+ * pointer to your object to make sure that the values don't clash with other
+ * object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed,
+ * the object will be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name
+ * will be assigned to the object -- you can use the __itt_sync_rename call
+ * later to assign the name
+ * @param[in] typelen, namelen - the lengths of the strings for the
+ * corresponding objtype and objname parameters
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values
+ * which defines the exact semantics of how prepare/acquired/releasing calls
+ * work.
+ * @return __itt_err upon failure (name or namelen being null, name and
+ * namelen mismatched)
+ */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char *objtype,
+ int typelen, const char *objname,
+ int namelen, int attribute);
+int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype,
+ int typelen, const wchar_t *objname,
+ int namelen, int attribute);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_notify_sync_name __itt_notify_sync_nameW
+#define __itt_notify_sync_name __itt_notify_sync_nameW
#else
-# define __itt_notify_sync_name __itt_notify_sync_nameA
+#define __itt_notify_sync_name __itt_notify_sync_nameA
#endif
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
+int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype,
+ int typelen, const char *objname,
+ int namelen, int attribute);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute))
-ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA,
+ (void *addr, const char *objtype, int typelen, const char *objname,
+ int namelen, int attribute))
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW,
+ (void *addr, const wchar_t *objtype, int typelen,
+ const wchar_t *objname, int namelen, int attribute))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,
+ (void *addr, const char *objtype, int typelen, const char *objname,
+ int namelen, int attribute))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_notify_sync_nameA ITTNOTIFY_DATA(notify_sync_nameA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA ITTNOTIFY_DATA(notify_sync_nameA)
#define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA)
-#define __itt_notify_sync_nameW ITTNOTIFY_DATA(notify_sync_nameW)
+#define __itt_notify_sync_nameW ITTNOTIFY_DATA(notify_sync_nameW)
#define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_notify_sync_name ITTNOTIFY_DATA(notify_sync_name)
+#define __itt_notify_sync_name ITTNOTIFY_DATA(notify_sync_name)
#define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute)
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, \
+ attribute)
#define __itt_notify_sync_nameA_ptr 0
-#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, \
+ attribute)
#define __itt_notify_sync_nameW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, \
+ attribute)
#define __itt_notify_sync_name_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_notify_sync_nameA_ptr 0
#define __itt_notify_sync_nameW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
@@ -508,19 +572,19 @@ ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *addr, const char *objtype,
* @deprecated Legacy API
* @brief Enter spin loop on user-defined sync object
*/
-void LIBITTAPI __itt_notify_sync_prepare(void* addr);
+void LIBITTAPI __itt_notify_sync_prepare(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr))
-#define __itt_notify_sync_prepare ITTNOTIFY_VOID(notify_sync_prepare)
+#define __itt_notify_sync_prepare ITTNOTIFY_VOID(notify_sync_prepare)
#define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_notify_sync_prepare(addr)
#define __itt_notify_sync_prepare_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_notify_sync_prepare_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -535,13 +599,13 @@ void LIBITTAPI __itt_notify_sync_cancel(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr))
-#define __itt_notify_sync_cancel ITTNOTIFY_VOID(notify_sync_cancel)
+#define __itt_notify_sync_cancel ITTNOTIFY_VOID(notify_sync_cancel)
#define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_notify_sync_cancel(addr)
#define __itt_notify_sync_cancel_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_notify_sync_cancel_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -556,34 +620,35 @@ void LIBITTAPI __itt_notify_sync_acquired(void *addr);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr))
-#define __itt_notify_sync_acquired ITTNOTIFY_VOID(notify_sync_acquired)
+#define __itt_notify_sync_acquired ITTNOTIFY_VOID(notify_sync_acquired)
#define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_notify_sync_acquired(addr)
#define __itt_notify_sync_acquired_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_notify_sync_acquired_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @deprecated Legacy API
- * @brief Start sync object releasing code. Is called before the lock release call.
+ * @brief Start sync object releasing code. It is called before the lock
+ * release call.
*/
-void LIBITTAPI __itt_notify_sync_releasing(void* addr);
+void LIBITTAPI __itt_notify_sync_releasing(void *addr);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr))
-#define __itt_notify_sync_releasing ITTNOTIFY_VOID(notify_sync_releasing)
+#define __itt_notify_sync_releasing ITTNOTIFY_VOID(notify_sync_releasing)
#define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_notify_sync_releasing(addr)
#define __itt_notify_sync_releasing_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_notify_sync_releasing_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
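
A hedged sketch of how the legacy hooks above (prepare/acquired/releasing) bracket a user-built lock so a tool can attribute contention; my_spin_try is a hypothetical lock primitive, not part of this API:

#include "ittnotify.h"

extern int my_spin_try(void *lock); /* hypothetical non-blocking acquire */

static void my_spin_acquire(void *lock) {
  __itt_notify_sync_prepare(lock); /* entering the spin loop on lock */
  while (!my_spin_try(lock)) {
    /* spin */
  }
  __itt_notify_sync_acquired(lock); /* contention over, lock now held */
}

static void my_spin_release(void *lock) {
  __itt_notify_sync_releasing(lock); /* must precede the actual release */
  /* ... store that releases the lock ... */
}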
@@ -602,65 +667,68 @@ typedef int __itt_event;
/**
* @brief Create an event notification
- * @note name or namelen being null/name and namelen not matching, user event feature not enabled
+ * @note Fails when name or namelen is null, name and namelen do not match,
+ * or the user event feature is not enabled
* @return non-zero event identifier upon success and __itt_err otherwise
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_event_create __itt_event_createW
-# define __itt_event_create_ptr __itt_event_createW_ptr
+#define __itt_event_create __itt_event_createW
+#define __itt_event_create_ptr __itt_event_createW_ptr
#else
-# define __itt_event_create __itt_event_createA
-# define __itt_event_create_ptr __itt_event_createA_ptr
+#define __itt_event_create __itt_event_createA
+#define __itt_event_create_ptr __itt_event_createA_ptr
#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
-ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW,
+ (const wchar_t *name, int namelen))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
-#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create ITTNOTIFY_DATA(event_create)
-#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
+#define __itt_event_create ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_event_createA(name, namelen) (__itt_event)0
#define __itt_event_createA_ptr 0
#define __itt_event_createW(name, namelen) (__itt_event)0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create(name, namelen) (__itt_event)0
-#define __itt_event_create_ptr 0
+#define __itt_event_create(name, namelen) (__itt_event)0
+#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_event_createA_ptr 0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_event_create_ptr 0
+#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/**
* @brief Record an event occurrence.
- * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ * @return __itt_err upon failure (invalid event id/user event feature not
+ * enabled)
*/
int LIBITTAPI __itt_event_start(__itt_event event);
@@ -668,13 +736,13 @@ int LIBITTAPI __itt_event_start(__itt_event event);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
-#define __itt_event_start ITTNOTIFY_DATA(event_start)
+#define __itt_event_start ITTNOTIFY_DATA(event_start)
#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_event_start(event) (int)0
#define __itt_event_start_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_event_start_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -682,7 +750,8 @@ ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
/**
* @brief Record an event end occurrence.
* @note It is optional if events do not have durations.
- * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ * @return __itt_err upon failure (invalid event id/user event feature not
+ * enabled)
*/
int LIBITTAPI __itt_event_end(__itt_event event);
@@ -690,13 +759,13 @@ int LIBITTAPI __itt_event_end(__itt_event event);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
-#define __itt_event_end ITTNOTIFY_DATA(event_end)
+#define __itt_event_end ITTNOTIFY_DATA(event_end)
#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_event_end(event) (int)0
#define __itt_event_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_event_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
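A minimal usage sketch for the user-event API declared above, assuming a narrow-character build (so __itt_event_create resolves to the char * variant) and the normal dynamic-stub macro bodies:

// Hedged sketch of the create/start/end lifecycle documented above.
#include "legacy/ittnotify.h"

void traced_phase() {
  static __itt_event ev = __itt_event_create("phase", 5);
  if (ev) { // non-zero identifier on success, __itt_err otherwise
    __itt_event_start(ev);
    // ... work attributed to the event ...
    __itt_event_end(ev); // optional when events have no duration
  }
}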
@@ -718,13 +787,13 @@ void LIBITTAPI __itt_memory_read(void *addr, size_t size);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size))
-#define __itt_memory_read ITTNOTIFY_VOID(memory_read)
+#define __itt_memory_read ITTNOTIFY_VOID(memory_read)
#define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_memory_read(addr, size)
#define __itt_memory_read_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_memory_read_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -739,13 +808,13 @@ void LIBITTAPI __itt_memory_write(void *addr, size_t size);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size))
-#define __itt_memory_write ITTNOTIFY_VOID(memory_write)
+#define __itt_memory_write ITTNOTIFY_VOID(memory_write)
#define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_memory_write(addr, size)
#define __itt_memory_write_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_memory_write_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -760,13 +829,13 @@ void LIBITTAPI __itt_memory_update(void *address, size_t size);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size))
-#define __itt_memory_update ITTNOTIFY_VOID(memory_update)
+#define __itt_memory_update ITTNOTIFY_VOID(memory_update)
#define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_memory_update(addr, size)
#define __itt_memory_update_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_memory_update_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -782,27 +851,25 @@ typedef int __itt_state_t;
/** @cond exclude_from_documentation */
typedef enum __itt_obj_state {
- __itt_obj_state_err = 0,
- __itt_obj_state_clr = 1,
- __itt_obj_state_set = 2,
- __itt_obj_state_use = 3
+ __itt_obj_state_err = 0,
+ __itt_obj_state_clr = 1,
+ __itt_obj_state_set = 2,
+ __itt_obj_state_use = 3
} __itt_obj_state_t;
typedef enum __itt_thr_state {
- __itt_thr_state_err = 0,
- __itt_thr_state_clr = 1,
- __itt_thr_state_set = 2
+ __itt_thr_state_err = 0,
+ __itt_thr_state_clr = 1,
+ __itt_thr_state_set = 2
} __itt_thr_state_t;
typedef enum __itt_obj_prop {
- __itt_obj_prop_watch = 1,
- __itt_obj_prop_ignore = 2,
- __itt_obj_prop_sharable = 3
+ __itt_obj_prop_watch = 1,
+ __itt_obj_prop_ignore = 2,
+ __itt_obj_prop_sharable = 3
} __itt_obj_prop_t;
-typedef enum __itt_thr_prop {
- __itt_thr_prop_quiet = 1
-} __itt_thr_prop_t;
+typedef enum __itt_thr_prop { __itt_thr_prop_quiet = 1 } __itt_thr_prop_t;
/** @endcond */
/**
@@ -815,13 +882,13 @@ __itt_state_t LIBITTAPI __itt_state_get(void);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, __itt_state_t, state_get, (void))
-#define __itt_state_get ITTNOTIFY_DATA(state_get)
+#define __itt_state_get ITTNOTIFY_DATA(state_get)
#define __itt_state_get_ptr ITTNOTIFY_NAME(state_get)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_state_get(void) (__itt_state_t)0
#define __itt_state_get_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_state_get_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -836,13 +903,13 @@ __itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s);
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s))
-#define __itt_state_set ITTNOTIFY_DATA(state_set)
+#define __itt_state_set ITTNOTIFY_DATA(state_set)
#define __itt_state_set_ptr ITTNOTIFY_NAME(state_set)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_state_set(s) (__itt_state_t)0
#define __itt_state_set_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_state_set_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -851,19 +918,21 @@ ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s))
* @deprecated Legacy API
* @brief managing thread and object modes
*/
-__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s);
+__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p,
+ __itt_thr_state_t s);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s))
-#define __itt_thr_mode_set ITTNOTIFY_DATA(thr_mode_set)
+ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set,
+ (__itt_thr_prop_t p, __itt_thr_state_t s))
+#define __itt_thr_mode_set ITTNOTIFY_DATA(thr_mode_set)
#define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0
#define __itt_thr_mode_set_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_thr_mode_set_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -872,19 +941,21 @@ ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr
* @deprecated Legacy API
* @brief managing thread and object modes
*/
-__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s);
+__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p,
+ __itt_obj_state_t s);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s))
-#define __itt_obj_mode_set ITTNOTIFY_DATA(obj_mode_set)
+ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set,
+ (__itt_obj_prop_t p, __itt_obj_state_t s))
+#define __itt_obj_mode_set ITTNOTIFY_DATA(obj_mode_set)
#define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0
#define __itt_obj_mode_set_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_obj_mode_set_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -904,15 +975,15 @@ typedef struct __itt_frame_t *__itt_frame;
/**
* @brief Create a global frame with given domain
*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_frame ITTAPI __itt_frame_createA(const char *domain);
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+__itt_frame ITTAPI __itt_frame_createA(const char *domain);
__itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain);
#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_frame_create __itt_frame_createW
-# define __itt_frame_create_ptr __itt_frame_createW_ptr
+#define __itt_frame_create __itt_frame_createW
+#define __itt_frame_create_ptr __itt_frame_createW_ptr
#else /* UNICODE */
-# define __itt_frame_create __itt_frame_createA
-# define __itt_frame_create_ptr __itt_frame_createA_ptr
+#define __itt_frame_create __itt_frame_createA
+#define __itt_frame_create_ptr __itt_frame_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_frame ITTAPI __itt_frame_create(const char *domain);
@@ -921,38 +992,38 @@ __itt_frame ITTAPI __itt_frame_create(const char *domain);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain))
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain))
ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain))
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_frame_createA ITTNOTIFY_DATA(frame_createA)
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
+#define __itt_frame_createA ITTNOTIFY_DATA(frame_createA)
#define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA)
-#define __itt_frame_createW ITTNOTIFY_DATA(frame_createW)
+#define __itt_frame_createW ITTNOTIFY_DATA(frame_createW)
#define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_frame_create ITTNOTIFY_DATA(frame_create)
+#define __itt_frame_create ITTNOTIFY_DATA(frame_create)
#define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_frame_createA(domain)
#define __itt_frame_createA_ptr 0
#define __itt_frame_createW(domain)
#define __itt_frame_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_frame_create(domain)
-#define __itt_frame_create_ptr 0
+#define __itt_frame_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#else /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM == ITT_PLATFORM_WIN
#define __itt_frame_createA_ptr 0
#define __itt_frame_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_frame_create_ptr 0
+#define __itt_frame_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
@@ -960,26 +1031,26 @@ ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain))
/** @brief Record a frame begin occurrence. */
void ITTAPI __itt_frame_begin(__itt_frame frame);
/** @brief Record a frame end occurrence. */
-void ITTAPI __itt_frame_end (__itt_frame frame);
+void ITTAPI __itt_frame_end(__itt_frame frame);
/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame))
-ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame))
-#define __itt_frame_begin ITTNOTIFY_VOID(frame_begin)
+ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame))
+#define __itt_frame_begin ITTNOTIFY_VOID(frame_begin)
#define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin)
-#define __itt_frame_end ITTNOTIFY_VOID(frame_end)
-#define __itt_frame_end_ptr ITTNOTIFY_NAME(frame_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_end ITTNOTIFY_VOID(frame_end)
+#define __itt_frame_end_ptr ITTNOTIFY_NAME(frame_end)
+#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_frame_begin(frame)
#define __itt_frame_begin_ptr 0
#define __itt_frame_end(frame)
-#define __itt_frame_end_ptr 0
+#define __itt_frame_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
+#else /* INTEL_NO_MACRO_BODY */
#define __itt_frame_begin_ptr 0
-#define __itt_frame_end_ptr 0
+#define __itt_frame_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} frames group */
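A short usage sketch for the frame group above, again assuming a narrow-character build so __itt_frame_create takes a const char * domain; the domain string is illustrative.

// Hedged sketch: one frame object, begin/end once per rendering pass.
#include "legacy/ittnotify.h"

void render_loop(int passes) {
  __itt_frame f = __itt_frame_create("com.example.renderer"); // hypothetical
  for (int i = 0; i < passes; ++i) {
    __itt_frame_begin(f);
    // ... one frame's worth of work ...
    __itt_frame_end(f);
  }
}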
diff --git a/openmp/runtime/src/tsan_annotations.cpp b/openmp/runtime/src/tsan_annotations.cpp
deleted file mode 100644
index 5be17f8337ce..000000000000
--- a/openmp/runtime/src/tsan_annotations.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * tsan_annotations.cpp -- ThreadSanitizer annotations to support data
- * race detection in OpenMP programs.
- */
-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "tsan_annotations.h"
-
-#include <stdio.h>
-
-typedef unsigned long uptr;
-typedef signed long sptr;
-
-extern "C" __attribute__((weak)) void AnnotateHappensBefore(const char *f,
- int l, uptr addr) {}
-extern "C" __attribute__((weak)) void AnnotateHappensAfter(const char *f, int l,
- uptr addr) {}
-extern "C" __attribute__((weak)) void AnnotateCondVarSignal(const char *f,
- int l, uptr cv) {}
-extern "C" __attribute__((weak)) void AnnotateCondVarSignalAll(const char *f,
- int l, uptr cv) {
-}
-extern "C" __attribute__((weak)) void AnnotateMutexIsNotPHB(const char *f,
- int l, uptr mu) {}
-extern "C" __attribute__((weak)) void AnnotateCondVarWait(const char *f, int l,
- uptr cv, uptr lock) {}
-extern "C" __attribute__((weak)) void AnnotateRWLockCreate(const char *f, int l,
- uptr m) {}
-extern "C" __attribute__((weak)) void
-AnnotateRWLockCreateStatic(const char *f, int l, uptr m) {}
-extern "C" __attribute__((weak)) void AnnotateRWLockDestroy(const char *f,
- int l, uptr m) {}
-extern "C" __attribute__((weak)) void
-AnnotateRWLockAcquired(const char *f, int l, uptr m, uptr is_w) {}
-extern "C" __attribute__((weak)) void
-AnnotateRWLockReleased(const char *f, int l, uptr m, uptr is_w) {}
-extern "C" __attribute__((weak)) void AnnotateTraceMemory(const char *f, int l,
- uptr mem) {}
-extern "C" __attribute__((weak)) void AnnotateFlushState(const char *f, int l) {
-}
-extern "C" __attribute__((weak)) void AnnotateNewMemory(const char *f, int l,
- uptr mem, uptr size) {}
-extern "C" __attribute__((weak)) void AnnotateNoOp(const char *f, int l,
- uptr mem) {}
-extern "C" __attribute__((weak)) void AnnotateFlushExpectedRaces(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void
-AnnotateEnableRaceDetection(const char *f, int l, int enable) {}
-extern "C" __attribute__((weak)) void
-AnnotateMutexIsUsedAsCondVar(const char *f, int l, uptr mu) {}
-extern "C" __attribute__((weak)) void AnnotatePCQGet(const char *f, int l,
- uptr pcq) {}
-extern "C" __attribute__((weak)) void AnnotatePCQPut(const char *f, int l,
- uptr pcq) {}
-extern "C" __attribute__((weak)) void AnnotatePCQDestroy(const char *f, int l,
- uptr pcq) {}
-extern "C" __attribute__((weak)) void AnnotatePCQCreate(const char *f, int l,
- uptr pcq) {}
-extern "C" __attribute__((weak)) void AnnotateExpectRace(const char *f, int l,
- uptr mem, char *desc) {
-}
-extern "C" __attribute__((weak)) void
-AnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr size, char *desc) {
-}
-extern "C" __attribute__((weak)) void AnnotateBenignRace(const char *f, int l,
- uptr mem, char *desc) {
-}
-extern "C" __attribute__((weak)) void AnnotateIgnoreReadsBegin(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void AnnotateIgnoreReadsEnd(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void AnnotateIgnoreWritesBegin(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void AnnotateIgnoreWritesEnd(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void AnnotateIgnoreSyncBegin(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void AnnotateIgnoreSyncEnd(const char *f,
- int l) {}
-extern "C" __attribute__((weak)) void
-AnnotatePublishMemoryRange(const char *f, int l, uptr addr, uptr size) {}
-extern "C" __attribute__((weak)) void
-AnnotateUnpublishMemoryRange(const char *f, int l, uptr addr, uptr size) {}
-extern "C" __attribute__((weak)) void AnnotateThreadName(const char *f, int l,
- char *name) {}
-extern "C" __attribute__((weak)) void
-WTFAnnotateHappensBefore(const char *f, int l, uptr addr) {}
-extern "C" __attribute__((weak)) void
-WTFAnnotateHappensAfter(const char *f, int l, uptr addr) {}
-extern "C" __attribute__((weak)) void
-WTFAnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr sz,
- char *desc) {}
-extern "C" __attribute__((weak)) int RunningOnValgrind() { return 0; }
-extern "C" __attribute__((weak)) double ValgrindSlowdown(void) { return 0; }
-extern "C" __attribute__((weak)) const char __attribute__((weak)) *
- ThreadSanitizerQuery(const char *query) {
- return 0;
-}
-extern "C" __attribute__((weak)) void
-AnnotateMemoryIsInitialized(const char *f, int l, uptr mem, uptr sz) {}
diff --git a/openmp/runtime/src/tsan_annotations.h b/openmp/runtime/src/tsan_annotations.h
deleted file mode 100644
index 2b1debbcad4e..000000000000
--- a/openmp/runtime/src/tsan_annotations.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*! \file */
-/*
- * tsan_annotations.h -- ThreadSanitizer annotations to support data
- * race detection in OpenMP programs.
- */
-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TSAN_ANNOTATIONS_H
-#define TSAN_ANNOTATIONS_H
-
-#include "kmp_config.h"
-
-/* types as used in tsan/rtl/tsan_interface_ann.cc */
-typedef unsigned long uptr;
-typedef signed long sptr;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Declaration of all annotation functions in tsan/rtl/tsan_interface_ann.cc */
-void AnnotateHappensBefore(const char *f, int l, uptr addr);
-void AnnotateHappensAfter(const char *f, int l, uptr addr);
-void AnnotateCondVarSignal(const char *f, int l, uptr cv);
-void AnnotateCondVarSignalAll(const char *f, int l, uptr cv);
-void AnnotateMutexIsNotPHB(const char *f, int l, uptr mu);
-void AnnotateCondVarWait(const char *f, int l, uptr cv, uptr lock);
-void AnnotateRWLockCreate(const char *f, int l, uptr m);
-void AnnotateRWLockCreateStatic(const char *f, int l, uptr m);
-void AnnotateRWLockDestroy(const char *f, int l, uptr m);
-void AnnotateRWLockAcquired(const char *f, int l, uptr m, uptr is_w);
-void AnnotateRWLockReleased(const char *f, int l, uptr m, uptr is_w);
-void AnnotateTraceMemory(const char *f, int l, uptr mem);
-void AnnotateFlushState(const char *f, int l);
-void AnnotateNewMemory(const char *f, int l, uptr mem, uptr size);
-void AnnotateNoOp(const char *f, int l, uptr mem);
-void AnnotateFlushExpectedRaces(const char *f, int l);
-void AnnotateEnableRaceDetection(const char *f, int l, int enable);
-void AnnotateMutexIsUsedAsCondVar(const char *f, int l, uptr mu);
-void AnnotatePCQGet(const char *f, int l, uptr pcq);
-void AnnotatePCQPut(const char *f, int l, uptr pcq);
-void AnnotatePCQDestroy(const char *f, int l, uptr pcq);
-void AnnotatePCQCreate(const char *f, int l, uptr pcq);
-void AnnotateExpectRace(const char *f, int l, uptr mem, char *desc);
-void AnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr size,
- char *desc);
-void AnnotateBenignRace(const char *f, int l, uptr mem, char *desc);
-void AnnotateIgnoreReadsBegin(const char *f, int l);
-void AnnotateIgnoreReadsEnd(const char *f, int l);
-void AnnotateIgnoreWritesBegin(const char *f, int l);
-void AnnotateIgnoreWritesEnd(const char *f, int l);
-void AnnotateIgnoreSyncBegin(const char *f, int l);
-void AnnotateIgnoreSyncEnd(const char *f, int l);
-void AnnotatePublishMemoryRange(const char *f, int l, uptr addr, uptr size);
-void AnnotateUnpublishMemoryRange(const char *f, int l, uptr addr, uptr size);
-void AnnotateThreadName(const char *f, int l, char *name);
-void WTFAnnotateHappensBefore(const char *f, int l, uptr addr);
-void WTFAnnotateHappensAfter(const char *f, int l, uptr addr);
-void WTFAnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr sz,
- char *desc);
-int RunningOnValgrind();
-double ValgrindSlowdown(void);
-const char *ThreadSanitizerQuery(const char *query);
-void AnnotateMemoryIsInitialized(const char *f, int l, uptr mem, uptr sz);
-
-#ifdef __cplusplus
-}
-#endif
-
-#ifdef TSAN_SUPPORT
-#define ANNOTATE_HAPPENS_AFTER(addr) \
- AnnotateHappensAfter(__FILE__, __LINE__, (uptr)addr)
-#define ANNOTATE_HAPPENS_BEFORE(addr) \
- AnnotateHappensBefore(__FILE__, __LINE__, (uptr)addr)
-#define ANNOTATE_IGNORE_WRITES_BEGIN() \
- AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
-#define ANNOTATE_IGNORE_WRITES_END() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
-#define ANNOTATE_RWLOCK_CREATE(lck) \
- AnnotateRWLockCreate(__FILE__, __LINE__, (uptr)lck)
-#define ANNOTATE_RWLOCK_RELEASED(lck) \
- AnnotateRWLockAcquired(__FILE__, __LINE__, (uptr)lck, 1)
-#define ANNOTATE_RWLOCK_ACQUIRED(lck) \
- AnnotateRWLockReleased(__FILE__, __LINE__, (uptr)lck, 1)
-#define ANNOTATE_BARRIER_BEGIN(addr) \
- AnnotateHappensBefore(__FILE__, __LINE__, (uptr)addr)
-#define ANNOTATE_BARRIER_END(addr) \
- AnnotateHappensAfter(__FILE__, __LINE__, (uptr)addr)
-#define ANNOTATE_REDUCE_AFTER(addr) \
- AnnotateHappensAfter(__FILE__, __LINE__, (uptr)addr)
-#define ANNOTATE_REDUCE_BEFORE(addr) \
- AnnotateHappensBefore(__FILE__, __LINE__, (uptr)addr)
-#else
-#define ANNOTATE_HAPPENS_AFTER(addr)
-#define ANNOTATE_HAPPENS_BEFORE(addr)
-#define ANNOTATE_IGNORE_WRITES_BEGIN()
-#define ANNOTATE_IGNORE_WRITES_END()
-#define ANNOTATE_RWLOCK_CREATE(lck)
-#define ANNOTATE_RWLOCK_RELEASED(lck)
-#define ANNOTATE_RWLOCK_ACQUIRED(lck)
-#define ANNOTATE_BARRIER_BEGIN(addr)
-#define ANNOTATE_BARRIER_END(addr)
-#define ANNOTATE_REDUCE_AFTER(addr)
-#define ANNOTATE_REDUCE_BEFORE(addr)
-#endif
-
-#define ANNOTATE_QUEUING
-#define ANNOTATE_TICKET
-#define ANNOTATE_FUTEX
-#define ANNOTATE_TAS
-#define ANNOTATE_DRDPA
-
-#ifdef ANNOTATE_QUEUING
-#define ANNOTATE_QUEUING_CREATE(lck)
-#define ANNOTATE_QUEUING_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
-#define ANNOTATE_QUEUING_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
-#else
-#define ANNOTATE_QUEUING_CREATE(lck)
-#define ANNOTATE_QUEUING_RELEASED(lck)
-#define ANNOTATE_QUEUING_ACQUIRED(lck)
-#endif
-
-#ifdef ANNOTATE_TICKET
-#define ANNOTATE_TICKET_CREATE(lck)
-#define ANNOTATE_TICKET_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
-#define ANNOTATE_TICKET_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
-#else
-#define ANNOTATE_TICKET_CREATE(lck)
-#define ANNOTATE_TICKET_RELEASED(lck)
-#define ANNOTATE_TICKET_ACQUIRED(lck)
-#endif
-
-#ifdef ANNOTATE_FUTEX
-#define ANNOTATE_FUTEX_CREATE(lck)
-#define ANNOTATE_FUTEX_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
-#define ANNOTATE_FUTEX_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
-#else
-#define ANNOTATE_FUTEX_CREATE(lck)
-#define ANNOTATE_FUTEX_RELEASED(lck)
-#define ANNOTATE_FUTEX_ACQUIRED(lck)
-#endif
-
-#ifdef ANNOTATE_TAS
-#define ANNOTATE_TAS_CREATE(lck)
-#define ANNOTATE_TAS_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
-#define ANNOTATE_TAS_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
-#else
-#define ANNOTATE_TAS_CREATE(lck)
-#define ANNOTATE_TAS_RELEASED(lck)
-#define ANNOTATE_TAS_ACQUIRED(lck)
-#endif
-
-#ifdef ANNOTATE_DRDPA
-#define ANNOTATE_DRDPA_CREATE(lck)
-#define ANNOTATE_DRDPA_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
-#define ANNOTATE_DRDPA_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
-#else
-#define ANNOTATE_DRDPA_CREATE(lck)
-#define ANNOTATE_DRDPA_RELEASED(lck)
-#define ANNOTATE_DRDPA_ACQUIRED(lck)
-#endif
-
-#endif
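The deleted files worked by providing weak no-op definitions: when ThreadSanitizer's runtime is present its strong definitions win, otherwise the annotation calls cost nothing. A minimal sketch of that mechanism, with a hypothetical publish() caller:

// Weak no-op fallback: a strong definition from TSan's runtime overrides it.
typedef unsigned long uptr;
extern "C" __attribute__((weak)) void AnnotateHappensBefore(const char *f,
                                                            int l, uptr addr) {}

void publish(int *p) { // hypothetical caller
  *p = 42;
  AnnotateHappensBefore(__FILE__, __LINE__, (uptr)p);
  // the consumer side would pair this with AnnotateHappensAfter on p
}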
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 95feafbec8e5..42ad1d56f9ec 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -25,7 +25,9 @@
#include <alloca.h>
#endif
#include <math.h> // HUGE_VAL.
+#if KMP_OS_LINUX
#include <semaphore.h>
+#endif // KMP_OS_LINUX
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sys/time.h>
@@ -64,8 +66,6 @@
#include <dirent.h>
#include <fcntl.h>
-#include "tsan_annotations.h"
-
struct kmp_sys_timer {
struct timespec start;
};
@@ -122,30 +122,28 @@ void __kmp_affinity_bind_thread(int which) {
* Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set
* __kmp_affin_mask_size to the appropriate value (0 means not capable). */
void __kmp_affinity_determine_capable(const char *env_var) {
-// Check and see if the OS supports thread affinity.
+ // Check and see if the OS supports thread affinity.
#if KMP_OS_LINUX
#define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024)
+#define KMP_CPU_SET_TRY_SIZE CACHE_LINE
#elif KMP_OS_FREEBSD
#define KMP_CPU_SET_SIZE_LIMIT (sizeof(cpuset_t))
#endif
-
#if KMP_OS_LINUX
- // If Linux* OS:
- // If the syscall fails or returns a suggestion for the size,
- // then we don't have to search for an appropriate size.
long gCode;
- long sCode;
unsigned char *buf;
buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT);
- gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_SIZE_LIMIT, buf);
+
+ // If the syscall returns a suggestion for the size,
+ // then we don't have to search for an appropriate size.
+ gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_TRY_SIZE, buf);
KA_TRACE(30, ("__kmp_affinity_determine_capable: "
"initial getaffinity call returned %ld errno = %d\n",
gCode, errno));
- // if ((gCode < 0) && (errno == ENOSYS))
- if (gCode < 0) {
+ if (gCode < 0 && errno != EINVAL) {
// System call not supported
if (__kmp_affinity_verbose ||
(__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) &&
@@ -162,43 +160,14 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_AFFINITY_DISABLE();
KMP_INTERNAL_FREE(buf);
return;
- }
- if (gCode > 0) { // Linux* OS only
+ } else if (gCode > 0) {
// The optimal situation: the OS returns the size of the buffer it expects.
- //
- // A verification of correct behavior is that setaffinity on a NULL
- // buffer with the same size fails with errno set to EFAULT.
- sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL);
- KA_TRACE(30, ("__kmp_affinity_determine_capable: "
- "setaffinity for mask size %ld returned %ld errno = %d\n",
- gCode, sCode, errno));
- if (sCode < 0) {
- if (errno == ENOSYS) {
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings &&
- (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_disabled))) {
- int error = errno;
- kmp_msg_t err_code = KMP_ERR(error);
- __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var),
- err_code, __kmp_msg_null);
- if (__kmp_generate_warnings == kmp_warnings_off) {
- __kmp_str_free(&err_code.str);
- }
- }
- KMP_AFFINITY_DISABLE();
- KMP_INTERNAL_FREE(buf);
- }
- if (errno == EFAULT) {
- KMP_AFFINITY_ENABLE(gCode);
- KA_TRACE(10, ("__kmp_affinity_determine_capable: "
- "affinity supported (mask size %d)\n",
- (int)__kmp_affin_mask_size));
- KMP_INTERNAL_FREE(buf);
- return;
- }
- }
+ KMP_AFFINITY_ENABLE(gCode);
+ KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+ "affinity supported (mask size %d)\n",
+ (int)__kmp_affin_mask_size));
+ KMP_INTERNAL_FREE(buf);
+ return;
}
// Call the getaffinity system call repeatedly with increasing set sizes
@@ -239,49 +208,19 @@ void __kmp_affinity_determine_capable(const char *env_var) {
continue;
}
- sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL);
- KA_TRACE(30, ("__kmp_affinity_determine_capable: "
- "setaffinity for mask size %ld returned %ld errno = %d\n",
- gCode, sCode, errno));
- if (sCode < 0) {
- if (errno == ENOSYS) { // Linux* OS only
- // We shouldn't get here
- KA_TRACE(30, ("__kmp_affinity_determine_capable: "
- "inconsistent OS call behavior: errno == ENOSYS for mask "
- "size %d\n",
- size));
- if (__kmp_affinity_verbose ||
- (__kmp_affinity_warnings &&
- (__kmp_affinity_type != affinity_none) &&
- (__kmp_affinity_type != affinity_default) &&
- (__kmp_affinity_type != affinity_disabled))) {
- int error = errno;
- kmp_msg_t err_code = KMP_ERR(error);
- __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var),
- err_code, __kmp_msg_null);
- if (__kmp_generate_warnings == kmp_warnings_off) {
- __kmp_str_free(&err_code.str);
- }
- }
- KMP_AFFINITY_DISABLE();
- KMP_INTERNAL_FREE(buf);
- return;
- }
- if (errno == EFAULT) {
- KMP_AFFINITY_ENABLE(gCode);
- KA_TRACE(10, ("__kmp_affinity_determine_capable: "
- "affinity supported (mask size %d)\n",
- (int)__kmp_affin_mask_size));
- KMP_INTERNAL_FREE(buf);
- return;
- }
- }
+ KMP_AFFINITY_ENABLE(gCode);
+ KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+ "affinity supported (mask size %d)\n",
+ (int)__kmp_affin_mask_size));
+ KMP_INTERNAL_FREE(buf);
+ return;
}
#elif KMP_OS_FREEBSD
long gCode;
unsigned char *buf;
buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT);
- gCode = pthread_getaffinity_np(pthread_self(), KMP_CPU_SET_SIZE_LIMIT, reinterpret_cast<cpuset_t *>(buf));
+ gCode = pthread_getaffinity_np(pthread_self(), KMP_CPU_SET_SIZE_LIMIT,
+ reinterpret_cast<cpuset_t *>(buf));
KA_TRACE(30, ("__kmp_affinity_determine_capable: "
"initial getaffinity call returned %d errno = %d\n",
gCode, errno));
@@ -289,16 +228,12 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT);
KA_TRACE(10, ("__kmp_affinity_determine_capable: "
"affinity supported (mask size %d)\n",
- (int)__kmp_affin_mask_size));
+ (int)__kmp_affin_mask_size));
KMP_INTERNAL_FREE(buf);
return;
}
#endif
- // save uncaught error code
- // int error = errno;
KMP_INTERNAL_FREE(buf);
- // restore uncaught error code, will be printed at the next KMP_WARNING below
- // errno = error;
// Affinity is not supported
KMP_AFFINITY_DISABLE();
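The rewritten logic trusts sched_getaffinity itself: a cache-line-sized first try (KMP_CPU_SET_TRY_SIZE), a positive return is the mask size the kernel filled in, EINVAL means retry with a larger buffer, and any other error disables affinity. A standalone sketch of that strategy, with illustrative constants:

// Hedged sketch of the probing strategy above (Linux only).
#include <errno.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static long probe_affinity_mask_size(void) {
  const long limit = 1024 * 1024; // mirrors KMP_CPU_SET_SIZE_LIMIT
  for (long size = 64; size <= limit; size *= 2) { // 64 ~ a cache line
    unsigned char *buf = (unsigned char *)malloc(limit);
    long rc = syscall(__NR_sched_getaffinity, 0, size, buf);
    free(buf);
    if (rc > 0)
      return rc; // the kernel reported the mask size it actually filled in
    if (rc < 0 && errno != EINVAL)
      return -1; // syscall unsupported: affinity must be disabled
    // EINVAL: buffer too small for the kernel mask, try a larger size
  }
  return -1;
}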
@@ -474,7 +409,7 @@ void __kmp_terminate_thread(int gtid) {
static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
int stack_data;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD
+ KMP_OS_HURD
pthread_attr_t attr;
int status;
size_t size = 0;
@@ -512,8 +447,8 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE);
return TRUE;
}
-#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
- KMP_OS_HURD */
+#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD \
+ || KMP_OS_HURD */
/* Use incremental refinement starting from initial conservative estimate */
TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
@@ -528,7 +463,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
void *exit_val;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_HURD
void *volatile padding = 0;
#endif
int gtid;
@@ -577,9 +512,10 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD
+ KMP_OS_OPENBSD
if (__kmp_stkoffset > 0 && gtid > 0) {
padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
+ (void)padding;
}
#endif
@@ -830,10 +766,10 @@ void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) {
stack_size += gtid * __kmp_stkoffset * 2;
#if defined(__ANDROID__) && __ANDROID_API__ < 19
- // Round the stack size to a multiple of the page size. Older versions of
- // Android (until KitKat) would fail pthread_attr_setstacksize with EINVAL
- // if the stack size was not a multiple of the page size.
- stack_size = (stack_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+ // Round the stack size to a multiple of the page size. Older versions of
+ // Android (until KitKat) would fail pthread_attr_setstacksize with EINVAL
+ // if the stack size was not a multiple of the page size.
+ stack_size = (stack_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
#endif
KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
@@ -1303,6 +1239,8 @@ static void __kmp_atfork_child(void) {
if (__kmp_nested_proc_bind.bind_types != NULL) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
}
+ __kmp_affinity_masks = NULL;
+ __kmp_affinity_num_masks = 0;
#endif // KMP_AFFINITY_SUPPORTED
#if KMP_USE_MONITOR
@@ -1388,16 +1326,14 @@ void __kmp_suspend_initialize(void) {
}
void __kmp_suspend_initialize_thread(kmp_info_t *th) {
- ANNOTATE_HAPPENS_AFTER(&th->th.th_suspend_init_count);
int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count);
int new_value = __kmp_fork_count + 1;
// Return if already initialized
if (old_value == new_value)
return;
// Wait, then return if being initialized
- if (old_value == -1 ||
- !__kmp_atomic_compare_store(&th->th.th_suspend_init_count, old_value,
- -1)) {
+ if (old_value == -1 || !__kmp_atomic_compare_store(
+ &th->th.th_suspend_init_count, old_value, -1)) {
while (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) != new_value) {
KMP_CPU_PAUSE();
}
@@ -1411,7 +1347,6 @@ void __kmp_suspend_initialize_thread(kmp_info_t *th) {
&__kmp_suspend_mutex_attr);
KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
KMP_ATOMIC_ST_REL(&th->th.th_suspend_init_count, new_value);
- ANNOTATE_HAPPENS_BEFORE(&th->th.th_suspend_init_count);
}
}
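The suspend-init path above is a generation-counted once-init: -1 marks "initialization in progress", and losers of the compare-and-store spin until the winner publishes the new count. The same pattern in portable C++:

// Hedged sketch of the sentinel-based once-init used by
// __kmp_suspend_initialize_thread.
#include <atomic>

static std::atomic<int> init_count{0};

void init_once(int new_value) { // new_value plays the role of fork_count + 1
  int old_value = init_count.load(std::memory_order_relaxed);
  if (old_value == new_value)
    return; // already initialized for this generation
  if (old_value == -1 ||
      !init_count.compare_exchange_strong(old_value, -1)) {
    while (init_count.load(std::memory_order_acquire) != new_value) {
      // another thread won the race; wait for it to finish
    }
    return;
  }
  // ... one-time initialization (mutex/condvar setup in the patch) ...
  init_count.store(new_value, std::memory_order_release);
}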
@@ -1807,7 +1742,7 @@ static int __kmp_get_xproc(void) {
int r = 0;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_HURD
__kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r));
@@ -1870,7 +1805,7 @@ void __kmp_runtime_initialize(void) {
__kmp_xproc = __kmp_get_xproc();
-#if ! KMP_32_BIT_ARCH
+#if !KMP_32_BIT_ARCH
struct rlimit rlim;
// read stack size of calling thread, save it as default for worker threads;
// this should be done before reading environment variables
@@ -1910,10 +1845,14 @@ void __kmp_runtime_initialize(void) {
KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status);
status = pthread_mutex_init(&__kmp_wait_mx.m_mutex, &mutex_attr);
KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+ status = pthread_mutexattr_destroy(&mutex_attr);
+ KMP_CHECK_SYSFAIL("pthread_mutexattr_destroy", status);
status = pthread_condattr_init(&cond_attr);
KMP_CHECK_SYSFAIL("pthread_condattr_init", status);
status = pthread_cond_init(&__kmp_wait_cv.c_cond, &cond_attr);
KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+ status = pthread_condattr_destroy(&cond_attr);
+ KMP_CHECK_SYSFAIL("pthread_condattr_destroy", status);
#if USE_ITT_BUILD
__kmp_itt_initialize();
#endif /* USE_ITT_BUILD */
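The two destroy calls added above follow the usual POSIX discipline: attribute objects are consumed by the *_init call and can be destroyed immediately afterwards, plugging a small leak. For instance:

// Hedged sketch of the init-use-destroy pattern the patch adopts.
#include <pthread.h>

static pthread_mutex_t m;

void make_mutex(void) {
  pthread_mutexattr_t attr;
  pthread_mutexattr_init(&attr);
  pthread_mutex_init(&m, &attr); // attr's settings are copied into m here
  pthread_mutexattr_destroy(&attr); // safe: m no longer references attr
}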
@@ -2015,8 +1954,8 @@ int __kmp_is_address_mapped(void *addr) {
#if KMP_OS_LINUX || KMP_OS_HURD
- /* On GNUish OSes, read the /proc/<pid>/maps pseudo-file to get all the address
- ranges mapped into the address space. */
+ /* On GNUish OSes, read the /proc/<pid>/maps pseudo-file to get all the
+ address ranges mapped into the address space. */
char *name = __kmp_str_format("/proc/%d/maps", getpid());
FILE *file = NULL;
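A hedged standalone sketch of the /proc/<pid>/maps scan this comment describes: each line starts with "start-end perms", and an address counts as mapped when it falls in a readable and writable range.

// Illustrative parser; the runtime's real loop follows this hunk.
#include <stdio.h>

int address_mapped(const void *addr) {
  FILE *f = fopen("/proc/self/maps", "r");
  if (!f)
    return 0;
  unsigned long start, end;
  char perms[8];
  int found = 0;
  while (fscanf(f, "%lx-%lx %7s%*[^\n]", &start, &end, perms) == 3) {
    if ((unsigned long)addr >= start && (unsigned long)addr < end &&
        perms[0] == 'r' && perms[1] == 'w') {
      found = 1;
      break;
    }
  }
  fclose(f);
  return found;
}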
@@ -2057,36 +1996,36 @@ int __kmp_is_address_mapped(void *addr) {
int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_VMMAP, getpid()};
rc = sysctl(mib, 4, NULL, &lstsz, NULL, 0);
if (rc < 0)
- return 0;
+ return 0;
// Convert the reported number of vm entries into the byte size of the
// whole entry map list.
lstsz = lstsz * 4 / 3;
buf = reinterpret_cast<char *>(kmpc_malloc(lstsz));
rc = sysctl(mib, 4, buf, &lstsz, NULL, 0);
if (rc < 0) {
- kmpc_free(buf);
- return 0;
+ kmpc_free(buf);
+ return 0;
}
char *lw = buf;
char *up = buf + lstsz;
while (lw < up) {
- struct kinfo_vmentry *cur = reinterpret_cast<struct kinfo_vmentry *>(lw);
- size_t cursz = cur->kve_structsize;
- if (cursz == 0)
- break;
- void *start = reinterpret_cast<void *>(cur->kve_start);
- void *end = reinterpret_cast<void *>(cur->kve_end);
- // Readable/Writable addresses within current map entry
- if ((addr >= start) && (addr < end)) {
- if ((cur->kve_protection & KVME_PROT_READ) != 0 &&
- (cur->kve_protection & KVME_PROT_WRITE) != 0) {
- found = 1;
- break;
- }
+ struct kinfo_vmentry *cur = reinterpret_cast<struct kinfo_vmentry *>(lw);
+ size_t cursz = cur->kve_structsize;
+ if (cursz == 0)
+ break;
+ void *start = reinterpret_cast<void *>(cur->kve_start);
+ void *end = reinterpret_cast<void *>(cur->kve_end);
+ // Readable/Writable addresses within current map entry
+ if ((addr >= start) && (addr < end)) {
+ if ((cur->kve_protection & KVME_PROT_READ) != 0 &&
+ (cur->kve_protection & KVME_PROT_WRITE) != 0) {
+ found = 1;
+ break;
}
- lw += cursz;
+ }
+ lw += cursz;
}
kmpc_free(buf);
@@ -2103,7 +2042,7 @@ int __kmp_is_address_mapped(void *addr) {
1, // Number of bytes to be read.
(vm_address_t)(&buffer), // Address of buffer to save read bytes in.
&count // Address of var to save number of read bytes in.
- );
+ );
if (rc == 0) {
// Memory successfully read.
found = 1;
@@ -2527,6 +2466,7 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
#endif
+#if KMP_OS_LINUX
// Functions for hidden helper task
namespace {
// Condition variable for initializing hidden helper team
@@ -2687,5 +2627,42 @@ void __kmp_hidden_helper_threads_deinitz_release() {
status = pthread_mutex_unlock(&hidden_helper_threads_deinitz_lock);
KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
}
+#else // KMP_OS_LINUX
+void __kmp_hidden_helper_worker_thread_wait() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_do_initialize_hidden_helper_threads() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_threads_initz_wait() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_initz_release() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_main_thread_wait() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_main_thread_release() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_worker_thread_signal() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_threads_deinitz_wait() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+
+void __kmp_hidden_helper_threads_deinitz_release() {
+ KMP_ASSERT(0 && "Hidden helper task is not supported on this OS");
+}
+#endif // KMP_OS_LINUX
// end of file //
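The non-Linux branch above keeps every hidden-helper entry point defined so callers link unconditionally, and turns unsupported use into a loud assertion instead of silent misbehavior. The shape of the pattern, with a hypothetical symbol name:

// Hedged sketch of the per-OS stub pattern used by the patch.
#include <cassert>

#if defined(__linux__)
void helper_initz_release() { /* real implementation (as in the patch) */ }
#else
void helper_initz_release() {
  assert(0 && "Hidden helper task is not supported on this OS");
}
#endif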
diff --git a/openmp/runtime/src/z_Windows_NT-586_util.cpp b/openmp/runtime/src/z_Windows_NT-586_util.cpp
index b3728a5d975f..991943c1b2b5 100644
--- a/openmp/runtime/src/z_Windows_NT-586_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT-586_util.cpp
@@ -12,7 +12,7 @@
#include "kmp.h"
-#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64)
/* IA-32 provides an "add-exchange" (xadd) instruction but no or/and
   equivalent, so these routines are built on compare_and_store loops */
@@ -22,7 +22,7 @@ kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) {
old_value = TCR_1(*p);
new_value = old_value | d;
- while (!__kmp_compare_and_store8(p, old_value, new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_1(*p);
new_value = old_value | d;
@@ -36,7 +36,7 @@ kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) {
old_value = TCR_1(*p);
new_value = old_value & d;
- while (!__kmp_compare_and_store8(p, old_value, new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_1(*p);
new_value = old_value & d;
@@ -50,8 +50,8 @@ kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) {
old_value = TCR_4(*p);
new_value = old_value | d;
- while (!__kmp_compare_and_store32((volatile kmp_int32 *)p, old_value,
- new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)p, old_value,
+ new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_4(*p);
new_value = old_value | d;
@@ -65,8 +65,8 @@ kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
old_value = TCR_4(*p);
new_value = old_value & d;
- while (!__kmp_compare_and_store32((volatile kmp_int32 *)p, old_value,
- new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)p, old_value,
+ new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_4(*p);
new_value = old_value & d;
@@ -74,6 +74,7 @@ kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
return old_value;
}
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) {
kmp_int64 old_value, new_value;
@@ -101,14 +102,15 @@ kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) {
return old_value;
}
#endif /* KMP_ARCH_X86 */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) {
kmp_uint64 old_value, new_value;
old_value = TCR_8(*p);
new_value = old_value | d;
- while (!__kmp_compare_and_store64((volatile kmp_int64 *)p, old_value,
- new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)p, old_value,
+ new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_8(*p);
new_value = old_value | d;
@@ -122,8 +124,8 @@ kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
old_value = TCR_8(*p);
new_value = old_value & d;
- while (!__kmp_compare_and_store64((volatile kmp_int64 *)p, old_value,
- new_value)) {
+ while (!KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)p, old_value,
+ new_value)) {
KMP_CPU_PAUSE();
old_value = TCR_8(*p);
new_value = old_value & d;
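The KMP_COMPARE_AND_STORE_REL* macros swapped in above implement the classic read-modify-CAS retry loop; in portable C++ the same fetch-or looks like this:

// Hedged sketch of the retry loop behind __kmp_test_then_or32/64.
#include <atomic>

unsigned fetch_or_release(std::atomic<unsigned> *p, unsigned d) {
  unsigned old_value = p->load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads old_value, so the desired
  // value old_value | d is recomputed on every retry.
  while (!p->compare_exchange_weak(old_value, old_value | d,
                                   std::memory_order_release,
                                   std::memory_order_relaxed)) {
  }
  return old_value; // previous value, matching the kmp routines
}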
@@ -132,4 +134,57 @@ kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
return old_value;
}
-#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+#if KMP_ARCH_AARCH64
+int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+ void *p_argv[]
+#if OMPT_SUPPORT
+ ,
+ void **exit_frame_ptr
+#endif
+) {
+#if OMPT_SUPPORT
+ *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+#endif
+
+ switch (argc) {
+ case 0:
+ (*pkfn)(&gtid, &tid);
+ break;
+ case 1:
+ (*pkfn)(&gtid, &tid, p_argv[0]);
+ break;
+ case 2:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+ break;
+ case 3:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+ break;
+ case 4:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+ break;
+ case 5:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+ break;
+ default: {
+ // p_argv[6] and onwards must be passed on the stack since 8 registers are
+ // already used.
+ size_t len = (argc - 6) * sizeof(void *);
+ void *argbuf = alloca(len);
+ memcpy(argbuf, &p_argv[6], len);
+ }
+ [[fallthrough]];
+ case 6:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5]);
+ break;
+ }
+
+#if OMPT_SUPPORT
+ *exit_frame_ptr = 0;
+#endif
+
+ return 1;
+}
+#endif
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 */
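A hedged usage sketch for the AArch64 __kmp_invoke_microtask above: the switch implies a microtask receiving &gtid, &tid and then argc pointer arguments; an OMPT-disabled build is assumed, so there is no exit_frame_ptr parameter. my_microtask and its cast are illustrative.

// Illustrative only; microtask_t is the runtime's variadic entry type.
static void my_microtask(int *gtid, int *tid, void *a, void *b) {
  // ... outlined parallel-region body using a and b ...
}

void invoke_example(void) {
  int x = 1, y = 2;
  void *args[] = {&x, &y};
  __kmp_invoke_microtask((microtask_t)my_microtask, /*gtid=*/0, /*tid=*/0,
                         /*argc=*/2, args);
}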
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
index 2a07c5449c70..320920283c9d 100644
--- a/openmp/runtime/src/z_Windows_NT_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT_util.cpp
@@ -23,6 +23,10 @@
#include <ntsecapi.h> // UNICODE_STRING
#include <ntstatus.h>
+#include <psapi.h>
+#ifdef _MSC_VER
+#pragma comment(lib, "psapi.lib")
+#endif
enum SYSTEM_INFORMATION_CLASS {
SystemProcessInformation = 5
@@ -239,9 +243,10 @@ static void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx,
old_f = flag->unset_sleeping();
KMP_DEBUG_ASSERT(old_f & KMP_BARRIER_SLEEP_STATE);
TCW_PTR(th->th.th_sleep_loc, NULL);
- KF_TRACE(50, ("__kmp_win32_cond_wait: exiting, condition "
- "fulfilled: flag's loc(%p): %u => %u\n",
- flag->get(), old_f, *(flag->get())));
+ KF_TRACE(50,
+ ("__kmp_win32_cond_wait: exiting, condition "
+ "fulfilled: flag's loc(%p): %u => %u\n",
+ flag->get(), (unsigned int)old_f, (unsigned int)flag->load()));
__kmp_win32_mutex_lock(&cv->waiters_count_lock_);
KMP_DEBUG_ASSERT(cv->waiters_count_ > 0);
@@ -356,7 +361,6 @@ void __kmp_unlock_suspend_mx(kmp_info_t *th) {
template <class C>
static inline void __kmp_suspend_template(int th_gtid, C *flag) {
kmp_info_t *th = __kmp_threads[th_gtid];
- int status;
typename C::flag_t old_spin;
KF_TRACE(30, ("__kmp_suspend_template: T#%d enter for flag's loc(%p)\n",
@@ -380,8 +384,8 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) {
}
KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for flag's"
- " loc(%p)==%d\n",
- th_gtid, flag->get(), *(flag->get())));
+ " loc(%p)==%u\n",
+ th_gtid, flag->get(), (unsigned int)flag->load()));
if (flag->done_check_val(old_spin)) {
old_spin = flag->unset_sleeping();
@@ -462,7 +466,6 @@ template void __kmp_suspend_64<true, false>(int, kmp_flag_64<true, false> *);
template <class C>
static inline void __kmp_resume_template(int target_gtid, C *flag) {
kmp_info_t *th = __kmp_threads[target_gtid];
- int status;
#ifdef KMP_DEBUG
int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
@@ -493,7 +496,8 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) {
if (!flag->is_sleeping_val(old_spin)) {
KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
"awake: flag's loc(%p): %u => %u\n",
- gtid, target_gtid, flag->get(), old_spin, *(flag->get())));
+ gtid, target_gtid, flag->get(), (unsigned int)old_spin,
+ (unsigned int)flag->load()));
__kmp_unlock_suspend_mx(th);
return;
}
@@ -591,7 +595,8 @@ void __kmp_affinity_bind_thread(int proc) {
}
void __kmp_affinity_determine_capable(const char *env_var) {
-// All versions of Windows* OS (since Win '95) support SetThreadAffinityMask().
+ // All versions of Windows* OS (since Win '95) support
+ // SetThreadAffinityMask().
#if KMP_GROUP_AFFINITY
KMP_AFFINITY_ENABLE(__kmp_num_proc_groups * sizeof(DWORD_PTR));
@@ -664,6 +669,7 @@ void __kmp_runtime_initialize(void) {
BOOL ret = GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
GET_MODULE_HANDLE_EX_FLAG_PIN,
(LPCTSTR)&__kmp_serial_initialize, &h);
+ (void)ret;
KMP_DEBUG_ASSERT2(h && ret, "OpenMP RTL cannot find itself loaded");
SetErrorMode(err_mode); // Restore error mode
KA_TRACE(10, ("__kmp_runtime_initialize: dynamic library pinned\n"));
@@ -823,6 +829,7 @@ void __kmp_runtime_initialize(void) {
__kmp_xproc = info.dwNumberOfProcessors;
}
#else
+ (void)kernel32;
GetSystemInfo(&info);
__kmp_xproc = info.dwNumberOfProcessors;
#endif /* KMP_GROUP_AFFINITY */
@@ -950,8 +957,7 @@ kmp_uint64 __kmp_now_nsec() {
return 1e9 * __kmp_win32_tick * now.QuadPart;
}
-extern "C"
-void *__stdcall __kmp_launch_worker(void *arg) {
+extern "C" void *__stdcall __kmp_launch_worker(void *arg) {
volatile void *stack_data;
void *exit_val;
void *padding = 0;
@@ -1630,6 +1636,29 @@ finish: // Clean up and exit.
return running_threads;
} //__kmp_get_load_balance()
+// Find symbol from the loaded modules
+void *__kmp_lookup_symbol(const char *name) {
+ HANDLE process = GetCurrentProcess();
+ DWORD needed;
+ HMODULE *modules = nullptr;
+ if (!EnumProcessModules(process, modules, 0, &needed))
+ return nullptr;
+ DWORD num_modules = needed / sizeof(HMODULE);
+ modules = (HMODULE *)malloc(num_modules * sizeof(HMODULE));
+ if (!EnumProcessModules(process, modules, needed, &needed)) {
+ free(modules);
+ return nullptr;
+ }
+ void *proc = nullptr;
+ for (uint32_t i = 0; i < num_modules; i++) {
+ proc = (void *)GetProcAddress(modules[i], name);
+ if (proc)
+ break;
+ }
+ free(modules);
+ return proc;
+}
+
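A hedged usage sketch for the new __kmp_lookup_symbol: scan the process's loaded modules for an exported entry point and call through it if found. The symbol name is illustrative.

// Illustrative caller; GetProcAddress results round-trip through void *.
typedef int (*tool_init_fn)(void);

void try_tool_init(void) {
  void *sym = __kmp_lookup_symbol("tool_init"); // hypothetical export
  if (sym) {
    tool_init_fn fn = (tool_init_fn)sym;
    fn();
  }
}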
// Functions for hidden helper task
void __kmp_hidden_helper_worker_thread_wait() {
KMP_ASSERT(0 && "Hidden helper task is not supported on Windows");