Diffstat (limited to 'runtime/src/kmp_dispatch.cpp')
 runtime/src/kmp_dispatch.cpp | 152
 1 file changed, 88 insertions(+), 64 deletions(-)
diff --git a/runtime/src/kmp_dispatch.cpp b/runtime/src/kmp_dispatch.cpp
index 1090e9de1d0e..161a2c696357 100644
--- a/runtime/src/kmp_dispatch.cpp
+++ b/runtime/src/kmp_dispatch.cpp
@@ -4,10 +4,9 @@
 
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -69,6 +68,20 @@ void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
   }
 }
 
+// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
+static inline int __kmp_get_monotonicity(enum sched_type schedule,
+                                         bool use_hier = false) {
+  // Pick up the nonmonotonic/monotonic bits from the scheduling type
+  int monotonicity;
+  // default to monotonic
+  monotonicity = SCHEDULE_MONOTONIC;
+  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
+    monotonicity = SCHEDULE_NONMONOTONIC;
+  else if (SCHEDULE_HAS_MONOTONIC(schedule))
+    monotonicity = SCHEDULE_MONOTONIC;
+  return monotonicity;
+}
+
 // Initialize a dispatch_private_info_template<T> buffer for a particular
 // type of schedule,chunk. The loop description is found in lb (lower bound),
 // ub (upper bound), and st (stride). nproc is the number of threads relevant
@@ -96,6 +109,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
   T tc;
   kmp_info_t *th;
   kmp_team_t *team;
+  int monotonicity;
+  bool use_hier;
 
 #ifdef KMP_DEBUG
   typedef typename traits_t<T>::signed_t ST;
@@ -118,21 +133,21 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
   active = !team->t.t_serialized;
 
 #if USE_ITT_BUILD
-  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
-                                    __kmp_forkjoin_frames_mode == 3 &&
-                                    KMP_MASTER_GTID(gtid) &&
-#if OMP_40_ENABLED
-                                    th->th.th_teams_microtask == NULL &&
+  int itt_need_metadata_reporting =
+      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
+      team->t.t_active_level == 1;
 #endif
-                                    team->t.t_active_level == 1;
-#endif
-#if (KMP_STATIC_STEAL_ENABLED)
-  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
-    // AC: we now have only one implementation of stealing, so use it
-    schedule = kmp_sch_static_steal;
-  else
+
+#if KMP_USE_HIER_SCHED
+  use_hier = pr->flags.use_hier;
+#else
+  use_hier = false;
 #endif
-  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
+  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
 
   /* Pick up the nomerge/ordered bits from the scheduling type */
   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
@@ -150,6 +165,10 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
   } else {
     pr->flags.ordered = FALSE;
   }
+  // Ordered overrides nonmonotonic
+  if (pr->flags.ordered) {
+    monotonicity = SCHEDULE_MONOTONIC;
+  }
 
   if (schedule == kmp_sch_static) {
     schedule = __kmp_static;
@@ -158,6 +177,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
     // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
     // not specified)
     schedule = team->t.t_sched.r_sched_type;
+    monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
     // Detail the schedule if needed (global controls are differentiated
     // appropriately)
    if (schedule == kmp_sch_guided_chunked) {
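The `__kmp_get_monotonicity` helper introduced above is a pure bit test against the schedule's modifier flags, after which `SCHEDULE_WITHOUT_MODIFIERS` strips those flags away. A minimal standalone sketch of that pattern, using hypothetical bit positions and enum values rather than the runtime's real `kmp_sch_*` encoding:

```cpp
#include <cstdio>

// Hypothetical stand-ins; the real kmp_sch_* values and modifier bits differ.
enum : unsigned {
  sch_dynamic_chunked = 35,
  sch_mod_monotonic = 1u << 29,
  sch_mod_nonmonotonic = 1u << 30,
  SCHEDULE_MONOTONIC = 0,
  SCHEDULE_NONMONOTONIC = 1,
};

static inline int get_monotonicity(unsigned schedule) {
  // Default to monotonic; an explicit modifier bit overrides it.
  int monotonicity = SCHEDULE_MONOTONIC;
  if (schedule & sch_mod_nonmonotonic)
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (schedule & sch_mod_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}

int main() {
  unsigned s = sch_dynamic_chunked | sch_mod_nonmonotonic;
  std::printf("monotonicity: %d\n", get_monotonicity(s)); // prints 1
  // Analogous to SCHEDULE_WITHOUT_MODIFIERS: drop the modifier bits.
  unsigned bare = s & ~(sch_mod_monotonic | sch_mod_nonmonotonic);
  std::printf("bare schedule: %u\n", bare); // prints 35
  return 0;
}
```

An explicit modifier always wins; absent either bit the helper falls back to monotonic, which is also why the diff can later force `SCHEDULE_MONOTONIC` unconditionally for ordered loops.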
@@ -208,16 +229,23 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
       }
 #endif
     }
-
+#if KMP_STATIC_STEAL_ENABLED
+    // map nonmonotonic:dynamic to static steal
+    if (schedule == kmp_sch_dynamic_chunked) {
+      if (monotonicity == SCHEDULE_NONMONOTONIC)
+        schedule = kmp_sch_static_steal;
+    }
+#endif
     /* guided analytical not safe for too many threads */
     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
       schedule = kmp_sch_guided_iterative_chunked;
       KMP_WARNING(DispatchManyThreads);
     }
-#if OMP_45_ENABLED
     if (schedule == kmp_sch_runtime_simd) {
       // compiler provides simd_width in the chunk parameter
       schedule = team->t.t_sched.r_sched_type;
+      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
       // Detail the schedule if needed (global controls are differentiated
       // appropriately)
       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
@@ -237,15 +265,15 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
       {
         char *buff;
         // create format specifiers before the debug output
-        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
-                                " chunk:%%%s\n",
-                                traits_t<ST>::spec);
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
+            " chunk:%%%s\n",
+            traits_t<ST>::spec);
         KD_TRACE(10, (buff, gtid, schedule, chunk));
         __kmp_str_free(&buff);
       }
 #endif
     }
-#endif // OMP_45_ENABLED
     pr->u.p.parm1 = chunk;
   }
   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
@@ -284,6 +312,12 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
     }
   }
 
+#if KMP_STATS_ENABLED
+  if (KMP_MASTER_GTID(gtid)) {
+    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
+  }
+#endif
+
   pr->u.p.lb = lb;
   pr->u.p.ub = ub;
   pr->u.p.st = st;
@@ -326,7 +360,10 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
       pr->u.p.parm2 = lb;
-      // pr->pfields.parm3 = 0; // it's not used in static_steal
+      // parm3 is the number of times to attempt stealing which is
+      // proportional to the number of chunks per thread up until
+      // the maximum value of nproc.
+      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
       pr->u.p.st = st;
       if (traits_t<T>::type_size > 4) {
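The `parm3` initialization above bounds the later steal search: instead of always making `nproc` attempts to find a victim, a thread now gives up after `min(small_chunk + extras, nproc)` failed attempts, i.e. a limit proportional to the chunks available per thread. A self-contained sketch of the arithmetic with illustrative values (the names mirror the diff, but this is not the runtime code):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative values; in the real code tc and nproc come from the
  // loop descriptor and the team size.
  unsigned tc = 10;    // loop trip count
  unsigned nproc = 8;  // threads in the team
  unsigned small_chunk = tc / nproc; // base iterations per thread (1)
  unsigned extras = tc % nproc;      // first 'extras' threads get one more (2)
  // New steal-attempt limit: proportional to chunks per thread, capped
  // at nproc. For tiny trip counts this drops well below nproc (3 here),
  // so threads stop hunting for work that cannot exist.
  unsigned while_limit = std::min(small_chunk + extras, nproc);
  std::printf("small_chunk=%u extras=%u while_limit=%u\n", small_chunk,
              extras, while_limit);
  return 0;
}
```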
@@ -349,6 +386,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
       /* too few iterations: fall-through to kmp_sch_static_balanced */
     } // if
     /* FALL-THROUGH to static balanced */
+    KMP_FALLTHROUGH();
   } // case
 #endif
   case kmp_sch_static_balanced: {
@@ -418,7 +456,6 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
     }
     break;
   } // case
-#if OMP_45_ENABLED
   case kmp_sch_static_balanced_chunked: {
     // similar to balanced, but chunk adjusted to multiple of simd width
     T nth = nproc;
@@ -433,7 +470,6 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
     break;
   } // case
   case kmp_sch_guided_simd:
-#endif // OMP_45_ENABLED
   case kmp_sch_guided_iterative_chunked: {
     KD_TRACE(
         100,
@@ -740,6 +776,8 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
   if (!TCR_4(__kmp_init_parallel))
     __kmp_parallel_initialize();
 
+  __kmp_resume_if_soft_paused();
+
 #if INCLUDE_SSC_MARKS
   SSC_MARK_DISPATCH_INIT();
 #endif
@@ -804,13 +842,10 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
 #if USE_ITT_BUILD
   kmp_uint64 cur_chunk = chunk;
-  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
-                                    __kmp_forkjoin_frames_mode == 3 &&
-                                    KMP_MASTER_GTID(gtid) &&
-#if OMP_40_ENABLED
-                                    th->th.th_teams_microtask == NULL &&
-#endif
-                                    team->t.t_active_level == 1;
+  int itt_need_metadata_reporting =
+      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
+      team->t.t_active_level == 1;
 #endif
   if (!active) {
     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
@@ -854,9 +889,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                    "sh->buffer_index:%d\n",
                    gtid, my_buffer_index, sh->buffer_index));
-    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
-                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
-    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
+    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
+                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
+    // Note: KMP_WAIT() cannot be used there: buffer index and
     // my_buffer_index are *always* 32-bit integers.
     KMP_MB(); /* is this necessary? */
     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
@@ -886,9 +921,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
       break;
     case kmp_sch_guided_iterative_chunked:
     case kmp_sch_guided_analytical_chunked:
-#if OMP_45_ENABLED
     case kmp_sch_guided_simd:
-#endif
       schedtype = 2;
       break;
     default:
@@ -1000,8 +1033,8 @@ static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
     }
 #endif
 
-    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
     KMP_MB(); /* is this necessary? */
 #ifdef KMP_DEBUG
     {
@@ -1069,8 +1102,8 @@ static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
     }
 #endif
 
-    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
-                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
 
     KMP_MB(); /* is this necessary? */
 
     KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
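`__kmp_wait<T>` (renamed here from `__kmp_wait_yield<T>`) is a predicate-driven spin wait: it re-reads a shared word until a caller-supplied comparison such as `__kmp_eq` or `__kmp_ge` holds. A rough standalone analogue, assuming `std::atomic` and `std::this_thread::yield` as stand-ins for the runtime's volatile loads and yield macros:

```cpp
#include <atomic>
#include <cstdio>
#include <thread>

// Rough analogue of __kmp_wait_4: spin until pred(*spinner, checker)
// holds, then return the last observed value.
static unsigned wait_4(const std::atomic<unsigned> &spinner, unsigned checker,
                       bool (*pred)(unsigned, unsigned)) {
  unsigned r = spinner.load(std::memory_order_acquire);
  while (!pred(r, checker)) {
    std::this_thread::yield(); // give the writer a chance to run
    r = spinner.load(std::memory_order_acquire);
  }
  return r;
}

int main() {
  std::atomic<unsigned> buffer_index{0};
  std::thread producer([&buffer_index] { buffer_index.store(3); });
  // Analogous to waiting on sh->buffer_index with an __kmp_ge-style predicate.
  unsigned seen =
      wait_4(buffer_index, 3, [](unsigned v, unsigned c) { return v >= c; });
  std::printf("observed %u\n", seen);
  producer.join();
  return 0;
}
```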
@@ -1174,7 +1207,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
       }
       if (!status) { // try to steal
         kmp_info_t **other_threads = team->t.t_threads;
-        int while_limit = nproc; // nproc attempts to find a victim
+        int while_limit = pr->u.p.parm3;
         int while_index = 0;
         // TODO: algorithm of searching for a victim
         // should be cleaned up and measured
@@ -1272,7 +1305,7 @@ int __kmp_dispatch_next_algorithm(int gtid,
 
       if (!status) {
         kmp_info_t **other_threads = team->t.t_threads;
-        int while_limit = nproc; // nproc attempts to find a victim
+        int while_limit = pr->u.p.parm3;
         int while_index = 0;
         // TODO: algorithm of searching for a victim
@@ -1547,7 +1580,6 @@ int __kmp_dispatch_next_algorithm(int gtid,
   } // case
   break;
 
-#if OMP_45_ENABLED
   case kmp_sch_guided_simd: {
     // same as iterative but curr-chunk adjusted to be multiple of given
     // chunk
@@ -1620,7 +1652,6 @@ int __kmp_dispatch_next_algorithm(int gtid,
     } // if
   } // case
   break;
-#endif // OMP_45_ENABLED
 
   case kmp_sch_guided_analytical_chunked: {
     T chunkspec = pr->u.p.parm1;
@@ -2156,10 +2187,8 @@ static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
   }
   th = __kmp_threads[gtid];
   team = th->th.th_team;
-#if OMP_40_ENABLED
   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
   nteams = th->th.th_teams_size.nteams;
-#endif
   team_id = team->t.t_master_tid;
   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
@@ -2485,10 +2514,10 @@ kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
 }
 
 kmp_uint32
-__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
-                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
-                   void *obj // Higher-level synchronization object, or NULL.
-                   ) {
+__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
+             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+             void *obj // Higher-level synchronization object, or NULL.
+             ) {
   // note: we may not belong to a team at this point
   volatile kmp_uint32 *spin = spinner;
   kmp_uint32 check = checker;
@@ -2505,20 +2534,16 @@ __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
        split. It causes problems with infinite recursion because of exit lock */
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
-
-    /* if we have waited a bit, or are oversubscribed, yield */
-    /* pause is in the following code */
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
 }
 
-void __kmp_wait_yield_4_ptr(
-    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
-    void *obj // Higher-level synchronization object, or NULL.
-    ) {
+void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+                      kmp_uint32 (*pred)(void *, kmp_uint32),
+                      void *obj // Higher-level synchronization object, or NULL.
+                      ) {
   // note: we may not belong to a team at this point
   void *spin = spinner;
   kmp_uint32 check = checker;
@@ -2530,10 +2555,9 @@ void __kmp_wait_yield_4_ptr(
   // main wait spin loop
   while (!f(spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
-    /* if we have waited a bit, or are oversubscribed, yield */
+    /* if we have waited a bit, or are noversubscribed, yield */
     /* pause is in the following code */
-    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
-    KMP_YIELD_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
 }
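`KMP_YIELD_OVERSUB_ELSE_SPIN` folds the old two-step `KMP_YIELD` + `KMP_YIELD_SPIN` pair into a single decision: yield immediately when more threads are runnable than processors, otherwise spin and yield only occasionally. A hypothetical sketch of that policy (the threshold and counter handling are illustrative, not the runtime's actual macro):

```cpp
#include <thread>

// Hypothetical sketch of the oversubscribed-else-spin yield policy.
struct YieldState {
  int spins_left = 4096; // illustrative countdown reload value
};

inline void yield_oversub_else_spin(YieldState &st, int nthreads,
                                    int avail_proc) {
  if (nthreads > avail_proc) {
    // Oversubscribed: always give up the core so a runnable peer can
    // make progress.
    std::this_thread::yield();
  } else if (--st.spins_left <= 0) {
    // Not oversubscribed: keep spinning, yielding only after the
    // countdown expires, then reload it.
    std::this_thread::yield();
    st.spins_left = 4096;
  }
}

int main() {
  YieldState st;
  // As it would be used inside a wait loop, e.g. while (!done) { ... }:
  yield_oversub_else_spin(st, /*nthreads=*/16, /*avail_proc=*/8);
  return 0;
}
```

Collapsing the pair into one macro keeps the oversubscription test and the spin backoff in a single place, which is what lets the two wait loops above shrink to a single line each.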