Diffstat (limited to 'openmp/runtime/src/kmp_affinity.h')
-rw-r--r--  openmp/runtime/src/kmp_affinity.h  369
1 file changed, 271 insertions(+), 98 deletions(-)
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 013080bbc9ea..8e72922d2c6e 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -337,8 +337,8 @@ class KMPNativeAffinity : public KMPAffinity {
long retval =
syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
- int r =
- pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
+ int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
+ reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
#endif
if (retval >= 0) {
@@ -357,8 +357,8 @@ class KMPNativeAffinity : public KMPAffinity {
long retval =
syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
- int r =
- pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
+ int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
+ reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
#endif
if (retval >= 0) {
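For reference, the FreeBSD branches above wrap pthread_getaffinity_np()/pthread_setaffinity_np() so they match the Linux syscall's return convention. A minimal standalone sketch of the same call pattern, assuming a FreeBSD host (pthread_np.h and cpuset_t are FreeBSD-specific; the success check mirrors the (r == 0 ? 0 : -1) convention in the hunk):

#include <pthread.h>
#include <pthread_np.h> // pthread_getaffinity_np() on FreeBSD
#include <sys/param.h>
#include <sys/cpuset.h> // cpuset_t
#include <cstdio>

int main() {
  cpuset_t mask;
  // Query the calling thread's affinity mask, as the runtime does above.
  int r = pthread_getaffinity_np(pthread_self(), sizeof(cpuset_t), &mask);
  int retval = (r == 0 ? 0 : -1);
  if (retval >= 0)
    std::printf("affinity query succeeded\n");
  return retval;
}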
@@ -598,91 +598,274 @@ class KMPNativeAffinity : public KMPAffinity {
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
-class Address {
+class kmp_hw_thread_t {
public:
- static const unsigned maxDepth = 32;
- unsigned labels[maxDepth];
- unsigned childNums[maxDepth];
- unsigned depth;
- unsigned leader;
- Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
- Address &operator=(const Address &b) {
- depth = b.depth;
- for (unsigned i = 0; i < depth; i++) {
- labels[i] = b.labels[i];
- childNums[i] = b.childNums[i];
- }
- leader = FALSE;
- return *this;
- }
- bool operator==(const Address &b) const {
- if (depth != b.depth)
- return false;
- for (unsigned i = 0; i < depth; i++)
- if (labels[i] != b.labels[i])
- return false;
- return true;
- }
- bool isClose(const Address &b, int level) const {
- if (depth != b.depth)
- return false;
- if ((unsigned)level >= depth)
- return true;
- for (unsigned i = 0; i < (depth - level); i++)
- if (labels[i] != b.labels[i])
- return false;
- return true;
- }
- bool operator!=(const Address &b) const { return !operator==(b); }
- void print() const {
- unsigned i;
- printf("Depth: %u --- ", depth);
- for (i = 0; i < depth; i++) {
- printf("%u ", labels[i]);
- }
+ static const int UNKNOWN_ID = -1;
+ static int compare_ids(const void *a, const void *b);
+ static int compare_compact(const void *a, const void *b);
+ int ids[KMP_HW_LAST];
+ int sub_ids[KMP_HW_LAST];
+ bool leader;
+ int os_id;
+ void print() const;
+ void clear() {
+ for (int i = 0; i < (int)KMP_HW_LAST; ++i)
+ ids[i] = UNKNOWN_ID;
+ leader = false;
}
};
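The new kmp_hw_thread_t replaces Address: instead of a variable-depth labels[] array it keeps one id per known hardware level, with UNKNOWN_ID (-1) marking levels the platform could not report. A minimal standalone sketch of that record; the three-level enum is a simplified stand-in for the real KMP_HW_* types:

#include <cstdio>

enum hw_level { LVL_SOCKET, LVL_CORE, LVL_THREAD, LVL_LAST };

struct hw_thread_t {
  static const int UNKNOWN_ID = -1;
  int ids[LVL_LAST]; // one topology id per level, outermost first
  int os_id;         // OS processor number
  void clear() {
    for (int i = 0; i < (int)LVL_LAST; ++i)
      ids[i] = UNKNOWN_ID;
  }
};

int main() {
  hw_thread_t t;
  t.clear();
  t.os_id = 5;
  t.ids[LVL_SOCKET] = 0; // package 0
  t.ids[LVL_CORE] = 2;   // core 2 within that package
  t.ids[LVL_THREAD] = 1; // second hardware thread on that core
  std::printf("os_id %d -> %d/%d/%d\n", t.os_id, t.ids[LVL_SOCKET],
              t.ids[LVL_CORE], t.ids[LVL_THREAD]);
  return 0;
}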
-class AddrUnsPair {
+class kmp_topology_t {
+
+ struct flags_t {
+ int uniform : 1;
+ int reserved : 31;
+ };
+
+ int depth;
+
+ // The following arrays are all 'depth' long
+
+ // Ordered array of the types in the topology
+ kmp_hw_t *types;
+
+ // Keep quick topology ratios; for non-uniform topologies,
+ // this ratio holds the max number of itemAs per itemB,
+ // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
+ int *ratio;
+
+ // Storage containing the absolute number of each topology layer
+ int *count;
+
+ // The hardware threads array
+ // hw_threads is num_hw_threads long
+ // Each hw_thread's ids and sub_ids are depth deep
+ int num_hw_threads;
+ kmp_hw_thread_t *hw_threads;
+
+ // Equivalence hash where the key is the hardware topology item
+ // and the value is the equivalent hardware topology type in the
+ // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
+ // known equivalence for the topology type
+ kmp_hw_t equivalent[KMP_HW_LAST];
+
+ // Flags describing the topology
+ flags_t flags;
+
+ // Count each item and get the number of x's per y
+ // e.g., get the number of cores and the number of threads per core
+ // for each (x, y) in (KMP_HW_* , KMP_HW_*)
+ void _gather_enumeration_information();
+
+ // Remove layers that don't add information to the topology.
+ // This is done by having the layer take on the id = UNKNOWN_ID (-1)
+ void _remove_radix1_layers();
+
+ // Find out if the topology is uniform
+ void _discover_uniformity();
+
+ // Set all the sub_ids for each hardware thread
+ void _set_sub_ids();
+
+ // Set global affinity variables describing the number of threads per
+ // core, the number of packages, the number of cores per package, and
+ // the number of cores.
+ void _set_globals();
+
+ // Set the last level cache equivalent type
+ void _set_last_level_cache();
+
public:
- Address first;
- unsigned second;
- AddrUnsPair(Address _first, unsigned _second)
- : first(_first), second(_second) {}
- AddrUnsPair &operator=(const AddrUnsPair &b) {
- first = b.first;
- second = b.second;
- return *this;
- }
- void print() const {
- printf("first = ");
- first.print();
- printf(" --- second = %u", second);
- }
- bool operator==(const AddrUnsPair &b) const {
- if (first != b.first)
- return false;
- if (second != b.second)
- return false;
- return true;
- }
- bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
-};
+ // Force use of allocate()/deallocate()
+ kmp_topology_t() = delete;
+ kmp_topology_t(const kmp_topology_t &t) = delete;
+ kmp_topology_t(kmp_topology_t &&t) = delete;
+ kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
+ kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
+
+ static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
+ static void deallocate(kmp_topology_t *);
+
+ // Functions used in create_map() routines
+ kmp_hw_thread_t &at(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
+ return hw_threads[index];
+ }
+ const kmp_hw_thread_t &at(int index) const {
+ KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
+ return hw_threads[index];
+ }
+ int get_num_hw_threads() const { return num_hw_threads; }
+ void sort_ids() {
+ qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
+ kmp_hw_thread_t::compare_ids);
+ }
+ // Check if the hardware ids are unique; return true if they are,
+ // false otherwise
+ bool check_ids() const;
-static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
- const Address *aa = &(((const AddrUnsPair *)a)->first);
- const Address *bb = &(((const AddrUnsPair *)b)->first);
- unsigned depth = aa->depth;
- unsigned i;
- KMP_DEBUG_ASSERT(depth == bb->depth);
- for (i = 0; i < depth; i++) {
- if (aa->labels[i] < bb->labels[i])
+ // Function to call after the create_map() routine
+ void canonicalize();
+ void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
+
+ // Functions used after canonicalize() called
+ bool filter_hw_subset();
+ bool is_close(int hwt1, int hwt2, int level) const;
+ bool is_uniform() const { return flags.uniform; }
+ // Return the equivalent type for a given topology type;
+ // returns KMP_HW_UNKNOWN when the type has no equivalent in the topology
+ kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
+ // Set type1 = type2
+ void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
+ kmp_hw_t real_type2 = equivalent[type2];
+ if (real_type2 == KMP_HW_UNKNOWN)
+ real_type2 = type2;
+ equivalent[type1] = real_type2;
+ // This loop is required since any of the types may have been set to
+ // be equivalent to type1. They all must be checked and reset to type2.
+ KMP_FOREACH_HW_TYPE(type) {
+ if (equivalent[type] == type1) {
+ equivalent[type] = real_type2;
+ }
+ }
+ }
+ // Calculate the number of items at level1 per item at level2
+ // (e.g., the number of threads per core)
+ int calculate_ratio(int level1, int level2) const {
+ KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
+ KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
+ int r = 1;
+ for (int level = level1; level > level2; --level)
+ r *= ratio[level];
+ return r;
+ }
+ int get_ratio(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return ratio[level];
+ }
+ int get_depth() const { return depth; }
+ kmp_hw_t get_type(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return types[level];
+ }
+ int get_level(kmp_hw_t type) const {
+ KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
+ int eq_type = equivalent[type];
+ if (eq_type == KMP_HW_UNKNOWN)
return -1;
- if (aa->labels[i] > bb->labels[i])
- return 1;
+ for (int i = 0; i < depth; ++i)
+ if (types[i] == eq_type)
+ return i;
+ return -1;
+ }
+ int get_count(int level) const {
+ KMP_DEBUG_ASSERT(level >= 0 && level < depth);
+ return count[level];
+ }
+#if KMP_AFFINITY_SUPPORTED
+ void sort_compact() {
+ qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
+ kmp_hw_thread_t::compare_compact);
+ }
+#endif
+ void print(const char *env_var = "KMP_AFFINITY") const;
+ void dump() const;
+};
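The ratio[] array drives calculate_ratio(): each entry holds how many items of one level fit in the enclosing level, so the ratio between two arbitrary levels is a product. A standalone sketch using the 4-packages / 6-cores-per-package / 2-threads-per-core example from the comments (the arrays here are hypothetical literals, not the real class):

#include <cstdio>

// Number of level1-items per level2-item, as in kmp_topology_t::calculate_ratio().
static int calculate_ratio(const int *ratio, int level1, int level2) {
  int r = 1;
  for (int level = level1; level > level2; --level)
    r *= ratio[level];
  return r;
}

int main() {
  // depth-3 topology: [package, core, thread]
  const int ratio[] = {4, 6, 2}; // 4 pkgs, 6 cores/pkg, 2 threads/core
  std::printf("threads per core:    %d\n", calculate_ratio(ratio, 2, 1)); // 2
  std::printf("threads per package: %d\n", calculate_ratio(ratio, 2, 0)); // 12
  std::printf("cores per package:   %d\n", calculate_ratio(ratio, 1, 0)); // 6
  return 0;
}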
+
+class kmp_hw_subset_t {
+public:
+ struct item_t {
+ int num;
+ kmp_hw_t type;
+ int offset;
+ };
+
+private:
+ int depth;
+ int capacity;
+ item_t *items;
+ kmp_uint64 set;
+ bool absolute;
+ // The set must be able to hold up to KMP_HW_LAST layers
+ KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
+
+public:
+ // Force use of allocate()/deallocate()
+ kmp_hw_subset_t() = delete;
+ kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
+ kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
+ kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
+ kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
+
+ static kmp_hw_subset_t *allocate() {
+ int initial_capacity = 5;
+ kmp_hw_subset_t *retval =
+ (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
+ retval->depth = 0;
+ retval->capacity = initial_capacity;
+ retval->set = 0ull;
+ retval->absolute = false;
+ retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
+ return retval;
+ }
+ static void deallocate(kmp_hw_subset_t *subset) {
+ __kmp_free(subset->items);
+ __kmp_free(subset);
+ }
+ void set_absolute() { absolute = true; }
+ bool is_absolute() const { return absolute; }
+ void push_back(int num, kmp_hw_t type, int offset) {
+ if (depth == capacity - 1) {
+ capacity *= 2;
+ item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
+ for (int i = 0; i < depth; ++i)
+ new_items[i] = items[i];
+ __kmp_free(items);
+ items = new_items;
+ }
+ items[depth].num = num;
+ items[depth].type = type;
+ items[depth].offset = offset;
+ depth++;
+ set |= (1ull << type);
+ }
+ int get_depth() const { return depth; }
+ const item_t &at(int index) const {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ return items[index];
+ }
+ item_t &at(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ return items[index];
}
- return 0;
-}
+ void remove(int index) {
+ KMP_DEBUG_ASSERT(index >= 0 && index < depth);
+ set &= ~(1ull << items[index].type);
+ for (int j = index + 1; j < depth; ++j) {
+ items[j - 1] = items[j];
+ }
+ depth--;
+ }
+ bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
+ void dump() const {
+ printf("**********************\n");
+ printf("*** kmp_hw_subset: ***\n");
+ printf("* depth: %d\n", depth);
+ printf("* items:\n");
+ for (int i = 0; i < depth; ++i) {
+ printf("num: %d, type: %s, offset: %d\n", items[i].num,
+ __kmp_hw_get_keyword(items[i].type), items[i].offset);
+ }
+ printf("* set: 0x%llx\n", set);
+ printf("* absolute: %d\n", absolute);
+ printf("**********************\n");
+ }
+};
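kmp_hw_subset_t tracks which layer types have been named by setting one bit per type in a 64-bit mask, so specified() is a single AND. A minimal sketch of that bookkeeping; the enum is a simplified stand-in for the real KMP_HW_* types:

#include <cstdint>
#include <cstdio>

enum hw_type { HW_SOCKET, HW_CORE, HW_THREAD, HW_LAST };

int main() {
  std::uint64_t set = 0ull;
  static_assert(sizeof(set) * 8 >= HW_LAST, "mask must cover every layer");
  set |= (1ull << HW_CORE);    // as in push_back(num, HW_CORE, offset)
  set |= (1ull << HW_THREAD);  // as in push_back(num, HW_THREAD, offset)
  set &= ~(1ull << HW_THREAD); // as in remove(): clear the layer's bit
  std::printf("core specified:   %d\n", (int)((set & (1ull << HW_CORE)) > 0));   // 1
  std::printf("thread specified: %d\n", (int)((set & (1ull << HW_THREAD)) > 0)); // 0
  return 0;
}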
+
+extern kmp_topology_t *__kmp_topology;
+extern kmp_hw_subset_t *__kmp_hw_subset;
/* A structure for holding machine-specific hierarchy info to be computed once
at init. This structure represents a mapping of threads to the actual machine
@@ -721,18 +904,10 @@ public:
kmp_uint32 *numPerLevel;
kmp_uint32 *skipPerLevel;
- void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
- int hier_depth = adr2os[0].first.depth;
- int level = 0;
- for (int i = hier_depth - 1; i >= 0; --i) {
- int max = -1;
- for (int j = 0; j < num_addrs; ++j) {
- int next = adr2os[j].first.childNums[i];
- if (next > max)
- max = next;
- }
- numPerLevel[level] = max + 1;
- ++level;
+ void deriveLevels() {
+ int hier_depth = __kmp_topology->get_depth();
+ for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
+ numPerLevel[level] = __kmp_topology->get_ratio(i);
}
}
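The rewritten deriveLevels() simply reverses the topology's ratio[] array: the topology stores the outermost level first, while the hierarchy's numPerLevel[] wants the innermost level first. A standalone sketch with plain arrays standing in for __kmp_topology and the hierarchy object:

#include <cstdio>

int main() {
  const int ratio[] = {4, 6, 2}; // topology order: [packages, cores/pkg, threads/core]
  const int hier_depth = 3;
  int numPerLevel[3];
  // Walk the topology from innermost to outermost, as deriveLevels() does.
  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level)
    numPerLevel[level] = ratio[i];
  for (int level = 0; level < hier_depth; ++level)
    std::printf("numPerLevel[%d] = %d\n", level, numPerLevel[level]);
  // Prints 2, 6, 4: threads/core first, packages last.
  return 0;
}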
@@ -747,7 +922,7 @@ public:
}
}
- void init(AddrUnsPair *adr2os, int num_addrs) {
+ void init(int num_addrs) {
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
&uninitialized, not_initialized, initializing);
if (bool_result == 0) { // Wait for initialization
@@ -774,10 +949,8 @@ public:
}
// Sort table by physical ID
- if (adr2os) {
- qsort(adr2os, num_addrs, sizeof(*adr2os),
- __kmp_affinity_cmp_Address_labels);
- deriveLevels(adr2os, num_addrs);
+ if (__kmp_topology && __kmp_topology->get_depth() > 0) {
+ deriveLevels();
} else {
numPerLevel[0] = maxLeaves;
numPerLevel[1] = num_addrs / maxLeaves;