diff options
Diffstat (limited to 'openmp/runtime/src/kmp_affinity.h')
-rw-r--r-- | openmp/runtime/src/kmp_affinity.h | 369 |
1 files changed, 271 insertions, 98 deletions
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index 013080bbc9ea..8e72922d2c6e 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -337,8 +337,8 @@ class KMPNativeAffinity : public KMPAffinity { long retval = syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int r = - pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, + reinterpret_cast<cpuset_t *>(mask)); int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { @@ -357,8 +357,8 @@ class KMPNativeAffinity : public KMPAffinity { long retval = syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int r = - pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, + reinterpret_cast<cpuset_t *>(mask)); int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { @@ -598,91 +598,274 @@ class KMPNativeAffinity : public KMPAffinity { #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ -class Address { +class kmp_hw_thread_t { public: - static const unsigned maxDepth = 32; - unsigned labels[maxDepth]; - unsigned childNums[maxDepth]; - unsigned depth; - unsigned leader; - Address(unsigned _depth) : depth(_depth), leader(FALSE) {} - Address &operator=(const Address &b) { - depth = b.depth; - for (unsigned i = 0; i < depth; i++) { - labels[i] = b.labels[i]; - childNums[i] = b.childNums[i]; - } - leader = FALSE; - return *this; - } - bool operator==(const Address &b) const { - if (depth != b.depth) - return false; - for (unsigned i = 0; i < depth; i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool isClose(const Address &b, int level) const { - if (depth != b.depth) - return false; - if ((unsigned)level >= depth) - return true; - for (unsigned i = 0; i < (depth - level); i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool operator!=(const Address &b) const { return !operator==(b); } - void print() const { - unsigned i; - printf("Depth: %u --- ", depth); - for (i = 0; i < depth; i++) { - printf("%u ", labels[i]); - } + static const int UNKNOWN_ID = -1; + static int compare_ids(const void *a, const void *b); + static int compare_compact(const void *a, const void *b); + int ids[KMP_HW_LAST]; + int sub_ids[KMP_HW_LAST]; + bool leader; + int os_id; + void print() const; + void clear() { + for (int i = 0; i < (int)KMP_HW_LAST; ++i) + ids[i] = UNKNOWN_ID; + leader = false; } }; -class AddrUnsPair { +class kmp_topology_t { + + struct flags_t { + int uniform : 1; + int reserved : 31; + }; + + int depth; + + // The following arrays are all 'depth' long + + // Orderd array of the types in the topology + kmp_hw_t *types; + + // Keep quick topology ratios, for non-uniform topologies, + // this ratio holds the max number of itemAs per itemB + // e.g., [ 4 packages | 6 cores / package | 2 threads / core ] + int *ratio; + + // Storage containing the absolute number of each topology layer + int *count; + + // The hardware threads array + // hw_threads is num_hw_threads long + // Each hw_thread's ids and sub_ids are depth deep + int num_hw_threads; + kmp_hw_thread_t *hw_threads; + + // Equivalence hash where the key is the hardware topology item + // and the value is the equivalent hardware topology type in the + // types[] array, if the value is KMP_HW_UNKNOWN, then there is no + // known equivalence for the topology type + kmp_hw_t equivalent[KMP_HW_LAST]; + + // Flags describing the topology + flags_t flags; + + // Count each item & get the num x's per y + // e.g., get the number of cores and the number of threads per core + // for each (x, y) in (KMP_HW_* , KMP_HW_*) + void _gather_enumeration_information(); + + // Remove layers that don't add information to the topology. + // This is done by having the layer take on the id = UNKNOWN_ID (-1) + void _remove_radix1_layers(); + + // Find out if the topology is uniform + void _discover_uniformity(); + + // Set all the sub_ids for each hardware thread + void _set_sub_ids(); + + // Set global affinity variables describing the number of threads per + // core, the number of packages, the number of cores per package, and + // the number of cores. + void _set_globals(); + + // Set the last level cache equivalent type + void _set_last_level_cache(); + public: - Address first; - unsigned second; - AddrUnsPair(Address _first, unsigned _second) - : first(_first), second(_second) {} - AddrUnsPair &operator=(const AddrUnsPair &b) { - first = b.first; - second = b.second; - return *this; - } - void print() const { - printf("first = "); - first.print(); - printf(" --- second = %u", second); - } - bool operator==(const AddrUnsPair &b) const { - if (first != b.first) - return false; - if (second != b.second) - return false; - return true; - } - bool operator!=(const AddrUnsPair &b) const { return !operator==(b); } -}; + // Force use of allocate()/deallocate() + kmp_topology_t() = delete; + kmp_topology_t(const kmp_topology_t &t) = delete; + kmp_topology_t(kmp_topology_t &&t) = delete; + kmp_topology_t &operator=(const kmp_topology_t &t) = delete; + kmp_topology_t &operator=(kmp_topology_t &&t) = delete; + + static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types); + static void deallocate(kmp_topology_t *); + + // Functions used in create_map() routines + kmp_hw_thread_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + const kmp_hw_thread_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + int get_num_hw_threads() const { return num_hw_threads; } + void sort_ids() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_ids); + } + // Check if the hardware ids are unique, if they are + // return true, otherwise return false + bool check_ids() const; -static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) { - const Address *aa = &(((const AddrUnsPair *)a)->first); - const Address *bb = &(((const AddrUnsPair *)b)->first); - unsigned depth = aa->depth; - unsigned i; - KMP_DEBUG_ASSERT(depth == bb->depth); - for (i = 0; i < depth; i++) { - if (aa->labels[i] < bb->labels[i]) + // Function to call after the create_map() routine + void canonicalize(); + void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores); + + // Functions used after canonicalize() called + bool filter_hw_subset(); + bool is_close(int hwt1, int hwt2, int level) const; + bool is_uniform() const { return flags.uniform; } + // Tell whether a type is a valid type in the topology + // returns KMP_HW_UNKNOWN when there is no equivalent type + kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; } + // Set type1 = type2 + void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2); + kmp_hw_t real_type2 = equivalent[type2]; + if (real_type2 == KMP_HW_UNKNOWN) + real_type2 = type2; + equivalent[type1] = real_type2; + // This loop is required since any of the types may have been set to + // be equivalent to type1. They all must be checked and reset to type2. + KMP_FOREACH_HW_TYPE(type) { + if (equivalent[type] == type1) { + equivalent[type] = real_type2; + } + } + } + // Calculate number of types corresponding to level1 + // per types corresponding to level2 (e.g., number of threads per core) + int calculate_ratio(int level1, int level2) const { + KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth); + KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth); + int r = 1; + for (int level = level1; level > level2; --level) + r *= ratio[level]; + return r; + } + int get_ratio(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return ratio[level]; + } + int get_depth() const { return depth; }; + kmp_hw_t get_type(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return types[level]; + } + int get_level(kmp_hw_t type) const { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type); + int eq_type = equivalent[type]; + if (eq_type == KMP_HW_UNKNOWN) return -1; - if (aa->labels[i] > bb->labels[i]) - return 1; + for (int i = 0; i < depth; ++i) + if (types[i] == eq_type) + return i; + return -1; + } + int get_count(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return count[level]; + } +#if KMP_AFFINITY_SUPPORTED + void sort_compact() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_compact); + } +#endif + void print(const char *env_var = "KMP_AFFINITY") const; + void dump() const; +}; + +class kmp_hw_subset_t { +public: + struct item_t { + int num; + kmp_hw_t type; + int offset; + }; + +private: + int depth; + int capacity; + item_t *items; + kmp_uint64 set; + bool absolute; + // The set must be able to handle up to KMP_HW_LAST number of layers + KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + +public: + // Force use of allocate()/deallocate() + kmp_hw_subset_t() = delete; + kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; + kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; + + static kmp_hw_subset_t *allocate() { + int initial_capacity = 5; + kmp_hw_subset_t *retval = + (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); + retval->depth = 0; + retval->capacity = initial_capacity; + retval->set = 0ull; + retval->absolute = false; + retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); + return retval; + } + static void deallocate(kmp_hw_subset_t *subset) { + __kmp_free(subset->items); + __kmp_free(subset); + } + void set_absolute() { absolute = true; } + bool is_absolute() const { return absolute; } + void push_back(int num, kmp_hw_t type, int offset) { + if (depth == capacity - 1) { + capacity *= 2; + item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); + for (int i = 0; i < depth; ++i) + new_items[i] = items[i]; + __kmp_free(items); + items = new_items; + } + items[depth].num = num; + items[depth].type = type; + items[depth].offset = offset; + depth++; + set |= (1ull << type); + } + int get_depth() const { return depth; } + const item_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; + } + item_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; } - return 0; -} + void remove(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + set &= ~(1ull << items[index].type); + for (int j = index + 1; j < depth; ++j) { + items[j - 1] = items[j]; + } + depth--; + } + bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } + void dump() const { + printf("**********************\n"); + printf("*** kmp_hw_subset: ***\n"); + printf("* depth: %d\n", depth); + printf("* items:\n"); + for (int i = 0; i < depth; ++i) { + printf("num: %d, type: %s, offset: %d\n", items[i].num, + __kmp_hw_get_keyword(items[i].type), items[i].offset); + } + printf("* set: 0x%llx\n", set); + printf("* absolute: %d\n", absolute); + printf("**********************\n"); + } +}; + +extern kmp_topology_t *__kmp_topology; +extern kmp_hw_subset_t *__kmp_hw_subset; /* A structure for holding machine-specific hierarchy info to be computed once at init. This structure represents a mapping of threads to the actual machine @@ -721,18 +904,10 @@ public: kmp_uint32 *numPerLevel; kmp_uint32 *skipPerLevel; - void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { - int hier_depth = adr2os[0].first.depth; - int level = 0; - for (int i = hier_depth - 1; i >= 0; --i) { - int max = -1; - for (int j = 0; j < num_addrs; ++j) { - int next = adr2os[j].first.childNums[i]; - if (next > max) - max = next; - } - numPerLevel[level] = max + 1; - ++level; + void deriveLevels() { + int hier_depth = __kmp_topology->get_depth(); + for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) { + numPerLevel[level] = __kmp_topology->get_ratio(i); } } @@ -747,7 +922,7 @@ public: } } - void init(AddrUnsPair *adr2os, int num_addrs) { + void init(int num_addrs) { kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( &uninitialized, not_initialized, initializing); if (bool_result == 0) { // Wait for initialization @@ -774,10 +949,8 @@ public: } // Sort table by physical ID - if (adr2os) { - qsort(adr2os, num_addrs, sizeof(*adr2os), - __kmp_affinity_cmp_Address_labels); - deriveLevels(adr2os, num_addrs); + if (__kmp_topology && __kmp_topology->get_depth() > 0) { + deriveLevels(); } else { numPerLevel[0] = maxLeaves; numPerLevel[1] = num_addrs / maxLeaves; |