aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorOlivier Houchard <cognet@FreeBSD.org>2021-10-29 17:01:15 +0000
committerOlivier Houchard <cognet@FreeBSD.org>2021-10-29 17:01:15 +0000
commitce929fe84f9c453263af379f3b255ff8eca01d48 (patch)
treee67b03511733872a7fea5ab703fee973ee18a6d4 /include
parent48289654a0ac91e3c29f25461373b3f1b7c82095 (diff)
downloadsrc-ce929fe84f9c453263af379f3b255ff8eca01d48.tar.gz
src-ce929fe84f9c453263af379f3b255ff8eca01d48.zip
Import CK as of commit 2265c7846f4ce667f5216456afe2779b23c3e5f7.vendor/ck/2021029vendor/ck
Diffstat (limited to 'include')
-rw-r--r--include/ck_backoff.h2
-rw-r--r--include/ck_cc.h2
-rw-r--r--include/ck_ec.h945
-rw-r--r--include/ck_fifo.h2
-rw-r--r--include/ck_hs.h8
-rw-r--r--include/ck_pr.h15
-rw-r--r--include/ck_queue.h20
-rw-r--r--include/ck_ring.h672
-rw-r--r--include/gcc/aarch64/ck_pr.h12
-rw-r--r--include/gcc/aarch64/ck_pr_llsc.h106
-rw-r--r--include/gcc/aarch64/ck_pr_lse.h37
-rw-r--r--include/gcc/ck_cc.h9
-rw-r--r--include/gcc/x86/ck_pr.h109
-rw-r--r--include/gcc/x86_64/ck_pr.h113
-rw-r--r--include/spinlock/fas.h9
15 files changed, 1704 insertions, 357 deletions
diff --git a/include/ck_backoff.h b/include/ck_backoff.h
index 82a4f2152e3c..a1f7616a55db 100644
--- a/include/ck_backoff.h
+++ b/include/ck_backoff.h
@@ -50,7 +50,7 @@ ck_backoff_eb(unsigned int *c)
for (i = 0; i < ceiling; i++)
ck_pr_barrier();
- *c = ceiling <<= ceiling < CK_BACKOFF_CEILING;
+ *c = ceiling << (ceiling < CK_BACKOFF_CEILING);
return;
}
diff --git a/include/ck_cc.h b/include/ck_cc.h
index 9a152a3cddab..1b4ff4635fa6 100644
--- a/include/ck_cc.h
+++ b/include/ck_cc.h
@@ -50,6 +50,7 @@
* Container function.
* This relies on (compiler) implementation-defined behavior.
*/
+#ifndef CK_CC_CONTAINER
#define CK_CC_CONTAINER(F, T, M, N) \
CK_CC_INLINE static T * \
N(F *p) \
@@ -57,6 +58,7 @@
F *n = p; \
return (T *)(void *)(((char *)n) - ((size_t)&((T *)0)->M)); \
}
+#endif
#define CK_CC_PAD(x) union { char pad[x]; }
diff --git a/include/ck_ec.h b/include/ck_ec.h
new file mode 100644
index 000000000000..cd2a36813a79
--- /dev/null
+++ b/include/ck_ec.h
@@ -0,0 +1,945 @@
+/*
+ * Copyright 2018 Paul Khuong, Google LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Overview
+ * ========
+ *
+ * ck_ec implements 32- and 64- bit event counts. Event counts let us
+ * easily integrate OS-level blocking (e.g., futexes) in lock-free
+ * protocols. Waiters block conditionally, if the event count's value
+ * is still equal to some old value.
+ *
+ * Event counts come in four variants: 32 and 64 bit (with one bit
+ * stolen for internal signaling, so 31 and 63 bit counters), and
+ * single or multiple producers (wakers). Waiters are always multiple
+ * consumers. The 32 bit variants are smaller, and more efficient,
+ * especially in single producer mode. The 64 bit variants are larger,
+ * but practically invulnerable to ABA.
+ *
+ * The 32 bit variant is always available. The 64 bit variant is only
+ * available if CK supports 64-bit atomic operations. Currently,
+ * specialization for single producer is only implemented for x86 and
+ * x86-64, on compilers that support GCC extended inline assembly;
+ * other platforms fall back to the multiple producer code path.
+ *
+ * A typical usage pattern is:
+ *
+ * 1. On the producer side:
+ *
+ * - Make changes to some shared data structure, without involving
+ * the event count at all.
+ * - After each change, call ck_ec_inc on the event count. The call
+ * acts as a write-write barrier, and wakes up any consumer blocked
+ * on the event count (waiting for new changes).
+ *
+ * 2. On the consumer side:
+ *
+ * - Snapshot ck_ec_value of the event count. The call acts as a
+ * read barrier.
+ * - Read and process the shared data structure.
+ * - Wait for new changes by calling ck_ec_wait with the snapshot value.
+ *
+ * Some data structures may opt for tighter integration with their
+ * event count. For example, an SPMC ring buffer or disruptor might
+ * use the event count's value as the write pointer. If the buffer is
+ * regularly full, it might also make sense to store the read pointer
+ * in an MP event count.
+ *
+ * This event count implementation supports tighter integration in two
+ * ways.
+ *
+ * Producers may opt to increment by an arbitrary value (less than
+ * INT32_MAX / INT64_MAX), in order to encode, e.g., byte
+ * offsets. Larger increment values make wraparound more likely, so
+ * the increments should still be relatively small.
+ *
+ * Consumers may pass a predicate to ck_ec_wait_pred. This predicate
+ * can make `ck_ec_wait_pred` return early, before the event count's
+ * value changes, and can override the deadline passed to futex_wait.
+ * This lets a consumer block on one event count, while optimistically
+ * looking at other waking conditions.
+ *
+ * API Reference
+ * =============
+ *
+ * When compiled as C11 or later, this header defines type-generic
+ * macros for ck_ec32 and ck_ec64; the reference describes this
+ * type-generic API.
+ *
+ * ck_ec needs additional OS primitives to determine the current time,
+ * to wait on an address, and to wake all threads waiting on a given
+ * address. These are defined with fields in a struct ck_ec_ops. Each
+ * ck_ec_ops may additionally define the number of spin loop
+ * iterations in the slow path, as well as the initial wait time in
+ * the internal exponential backoff, the exponential scale factor, and
+ * the right shift count (< 32).
+ *
+ * The ops, in addition to the single/multiple producer flag, are
+ * encapsulated in a struct ck_ec_mode, passed to most ck_ec
+ * operations.
+ *
+ * ec is a struct ck_ec32 *, or a struct ck_ec64 *.
+ *
+ * value is an uint32_t for ck_ec32, and an uint64_t for ck_ec64. It
+ * never exceeds INT32_MAX and INT64_MAX respectively.
+ *
+ * mode is a struct ck_ec_mode *.
+ *
+ * deadline is either NULL, or a `const struct timespec *` that will
+ * be treated as an absolute deadline.
+ *
+ * `void ck_ec_init(ec, value)`: initializes the event count to value.
+ *
+ * `value ck_ec_value(ec)`: returns the current value of the event
+ * counter. This read acts as a read (acquire) barrier.
+ *
+ * `bool ck_ec_has_waiters(ec)`: returns whether some thread has
+ * marked the event count as requiring an OS wakeup.
+ *
+ * `void ck_ec_inc(ec, mode)`: increments the value of the event
+ * counter by one. This write acts as a write barrier. Wakes up
+ * any waiting thread.
+ *
+ * `value ck_ec_add(ec, mode, value)`: increments the event counter by
+ * `value`, and returns the event counter's previous value. This
+ * write acts as a write barrier. Wakes up any waiting thread.
+ *
+ * `int ck_ec_deadline(struct timespec *new_deadline,
+ * mode,
+ * const struct timespec *timeout)`:
+ * computes a deadline `timeout` away from the current time. If
+ * timeout is NULL, computes a deadline in the infinite future. The
+ * resulting deadline is written to `new_deadline`. Returns 0 on
+ * success, and -1 if ops->gettime failed (without touching errno).
+ *
+ * `int ck_ec_wait(ec, mode, value, deadline)`: waits until the event
+ * counter's value differs from `value`, or, if `deadline` is
+ * provided and non-NULL, until the current time is after that
+ * deadline. Use a deadline with tv_sec = 0 for a non-blocking
+ * execution. Returns 0 if the event counter has changed, and -1 on
+ * timeout. This function acts as a read (acquire) barrier.
+ *
+ * `int ck_ec_wait_pred(ec, mode, value, pred, data, deadline)`: waits
+ * until the event counter's value differs from `value`, or until
+ * `pred` returns non-zero, or, if `deadline` is provided and
+ * non-NULL, until the current time is after that deadline. Use a
+ * deadline with tv_sec = 0 for a non-blocking execution. Returns 0 if
+ * the event counter has changed, `pred`'s return value if non-zero,
+ * and -1 on timeout. This function acts as a read (acquire) barrier.
+ *
+ * `pred` is always called as `pred(state, iteration_deadline)`, where
+ * `state` is a `const struct ck_ec_wait_state *` (its `data` field is
+ * the predicate's opaque pointer, its `now` field the current time),
+ * and `iteration_deadline` is a timespec of the deadline for this
+ * exponential backoff iteration. If `pred` returns a non-zero value,
+ * it is immediately returned to the waiter. Otherwise, `pred` is free
+ * to modify `iteration_deadline` (moving it later is a bad idea).
+ *
+ * Implementation notes
+ * ====================
+ *
+ * The multiple producer implementation is a regular locked event
+ * count, with a single flag bit to denote the need to wake up waiting
+ * threads.
+ *
+ * The single producer specialization is heavily tied to
+ * [x86-TSO](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf), and
+ * to non-atomic read-modify-write instructions (e.g., `inc mem`);
+ * these non-atomic RMW let us write to the same memory locations with
+ * atomic and non-atomic instructions, without suffering from process
+ * scheduling stalls.
+ *
+ * The reason we can mix atomic and non-atomic writes to the `counter`
+ * word is that every non-atomic write obviates the need for the
+ * atomically flipped flag bit: we only use non-atomic writes to
+ * update the event count, and the atomic flag only informs the
+ * producer that we would like a futex_wake, because of the update.
+ * We only require the non-atomic RMW counter update to prevent
+ * preemption from introducing arbitrarily long worst case delays.
+ *
+ * Correctness does not rely on the usual ordering argument: in the
+ * absence of fences, there is no strict ordering between atomic and
+ * non-atomic writes. The key is instead x86-TSO's guarantee that a
+ * read is satisfied from the most recent buffered write in the local
+ * store queue if there is one, or from memory if there is no write to
+ * that address in the store queue.
+ *
+ * x86-TSO's constraint on reads suffices to guarantee that the
+ * producer will never forget about a counter update. If the last
+ * update is still queued, the new update will be based on the queued
+ * value. Otherwise, the new update will be based on the value in
+ * memory, which may or may not have had its flag flipped. In either
+ * case, the value of the counter (modulo flag) is correct.
+ *
+ * When the producer forwards the counter's value from its store
+ * queue, the new update might not preserve a flag flip. Any waiter
+ * thus has to check from time to time to determine if it wasn't
+ * woken up because the flag bit was silently cleared.
+ *
+ * In reality, the store queue in x86-TSO stands for in-flight
+ * instructions in the chip's out-of-order backend. In the vast
+ * majority of cases, instructions will only remain in flight for a
+ * few hundred or a few thousand cycles. That's why ck_ec_wait spins on
+ * the `counter` word for ~100 iterations after flipping its flag bit:
+ * if the counter hasn't changed after that many iterations, it is
+ * very likely that the producer's next counter update will observe
+ * the flag flip.
+ *
+ * That's still not a hard guarantee of correctness. Conservatively,
+ * we can expect that no instruction will remain in flight for more
+ * than 1 second... if only because some interrupt will have forced
+ * the chip to store its architectural state in memory, at which point
+ * an instruction is either fully retired or rolled back. Interrupts,
+ * particularly the pre-emption timer, are why single-producer updates
+ * must happen in a single non-atomic read-modify-write instruction.
+ * Having a single instruction as the critical section means we only
+ * have to consider the worst-case execution time for that
+ * instruction. That's easier than doing the same for a pair of
+ * instructions, which an unlucky pre-emption could delay for
+ * arbitrarily long.
+ *
+ * Thus, after a short spin loop, ck_ec_wait enters an exponential
+ * backoff loop, where each "sleep" is instead a futex_wait. The
+ * backoff is only necessary to handle rare cases where the flag flip
+ * was overwritten after the spin loop. Eventually, more than one
+ * second will have elapsed since the flag flip, and the sleep timeout
+ * becomes infinite: since the flag bit has been set for much longer
+ * than the time for which an instruction may remain in flight, the
+ * flag will definitely be observed at the next counter update.
+ *
+ * The 64 bit ck_ec_wait pulls another trick: futexes only handle 32
+ * bit ints, so we must treat the 64 bit counter's low 32 bits as an
+ * int in futex_wait. That's a bit dodgy, but fine in practice, given
+ * that the OS's futex code will always read whatever value is
+ * currently in memory: even if the producer thread were to wait on
+ * its own event count, the syscall and ring transition would empty
+ * the store queue (the out-of-order execution backend).
+ *
+ * Finally, what happens when the producer is migrated to another core
+ * or otherwise pre-empted? Migration must already incur a barrier, so
+ * that thread always sees its own writes, so that's safe. As for
+ * pre-emption, that requires storing the architectural state, which
+ * means every instruction must either be executed fully or not at
+ * all when pre-emption happens.
+ */
+
+#ifndef CK_EC_H
+#define CK_EC_H
+#include <ck_cc.h>
+#include <ck_pr.h>
+#include <ck_stdbool.h>
+#include <ck_stdint.h>
+#include <ck_stddef.h>
+#include <sys/time.h>
+
+/*
+ * If we have ck_pr_faa_64 (and, presumably, ck_pr_load_64), we
+ * support 63 bit counters.
+ */
+#ifdef CK_F_PR_FAA_64
+#define CK_F_EC64
+#endif /* CK_F_PR_FAA_64 */
+
+/*
+ * GCC inline assembly lets us exploit non-atomic read-modify-write
+ * instructions on x86/x86_64 for a fast single-producer mode.
+ *
+ * If CK_F_EC_SP is not defined, CK_EC always uses the slower
+ * multiple producer code.
+ */
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define CK_F_EC_SP
+#endif /* GNUC && (__i386__ || __x86_64__) */
+
+struct ck_ec_ops;
+
+struct ck_ec_wait_state { /* Handed to wait32/wait64 and to wait predicates. */
+ struct timespec start; /* Time when we entered ck_ec_wait. */
+ struct timespec now; /* Current time. */
+ const struct ck_ec_ops *ops; /* Ops table (gettime, wait and wake callbacks). */
+ void *data; /* Opaque pointer for the predicate's internal state. */
+
+};
+
+/*
+ * struct ck_ec_ops defines system-specific functions to get the
+ * current time, atomically wait on an address if it still has some
+ * expected value, and to wake all threads waiting on an address.
+ *
+ * Each platform is expected to have few (one) opaque pointers to a
+ * const ops struct, and to reuse them for all ck_ec_mode structs.
+ */
+struct ck_ec_ops {
+ /* Populates out with the current time. Returns non-zero on failure. */
+ int (*gettime)(const struct ck_ec_ops *, struct timespec *out);
+
+ /*
+ * Waits on address if its value is still `expected`. If
+ * deadline is non-NULL, stops waiting once that deadline is
+ * reached. May return early for any reason.
+ */
+ void (*wait32)(const struct ck_ec_wait_state *, const uint32_t *,
+ uint32_t expected, const struct timespec *deadline);
+
+ /*
+ * Same as wait32, but for a 64 bit counter. Only used if
+ * CK_F_EC64 is defined.
+ *
+ * If the underlying blocking primitive only supports 32 bit
+ * control words, it should be safe to block on the least
+ * significant half of the 64 bit word at address.
+ */
+ void (*wait64)(const struct ck_ec_wait_state *, const uint64_t *,
+ uint64_t expected, const struct timespec *deadline);
+
+ /* Wakes up all threads waiting on address. */
+ void (*wake32)(const struct ck_ec_ops *, const uint32_t *address);
+
+ /*
+ * Same as wake32, but for a 64 bit counter. Only used if
+ * CK_F_EC64 is defined.
+ *
+ * When wait64 truncates the control word at address to only
+ * consider its least significant half, wake64 should perform
+ * any necessary fixup (e.g., on big endian platforms).
+ */
+ void (*wake64)(const struct ck_ec_ops *, const uint64_t *address);
+
+ /*
+ * Number of iterations for the initial busy wait. 0 defaults
+ * to 100 (not ABI stable).
+ */
+ uint32_t busy_loop_iter;
+
+ /*
+ * Delay in nanoseconds for the first iteration of the
+ * exponential backoff. 0 defaults to 2 ms (not ABI stable).
+ */
+ uint32_t initial_wait_ns;
+
+ /*
+ * Scale factor for the exponential backoff. 0 defaults to 8x
+ * (not ABI stable).
+ */
+ uint32_t wait_scale_factor;
+
+ /*
+ * Right shift count (< 32) for the exponential backoff. The
+ * update after each iteration is
+ * wait_ns = (wait_ns * wait_scale_factor) >> wait_shift_count,
+ * until one second has elapsed. After that, the deadline goes
+ * to infinity.
+ */
+ uint32_t wait_shift_count;
+};
+
+/*
+ * ck_ec_mode wraps the ops table, and informs the fast path whether
+ * it should attempt to specialize for single producer mode.
+ *
+ * mode structs are expected to be exposed by value, e.g.,
+ *
+ * extern const struct ck_ec_ops system_ec_ops;
+ *
+ * static const struct ck_ec_mode ec_sp = {
+ * .ops = &system_ec_ops,
+ * .single_producer = true
+ * };
+ *
+ * static const struct ck_ec_mode ec_mp = {
+ * .ops = &system_ec_ops,
+ * .single_producer = false
+ * };
+ *
+ * ck_ec_mode structs are only passed to inline functions defined in
+ * this header, and never escape to their slow paths, so they should
+ * not result in any object file size increase.
+ */
+struct ck_ec_mode {
+ const struct ck_ec_ops *ops; /* System-specific time/wait/wake operations. */
+ /*
+ * If single_producer is true, the event count has a unique
+ * incrementer. The implementation will specialize ck_ec_inc
+ * and ck_ec_add if possible (i.e., if CK_F_EC_SP is defined).
+ */
+ bool single_producer;
+};
+
+struct ck_ec32 {
+ /* Flag is the "sign" bit (bit 31), value in bits 0:30. */
+ uint32_t counter;
+};
+
+typedef struct ck_ec32 ck_ec32_t;
+
+#ifdef CK_F_EC64
+struct ck_ec64 {
+ /*
+ * Flag is the bottom bit (bit 0), value in bits 1:63. Eventcount
+ * only works on x86-64 (i.e., little endian), so the futex int
+ * lies in the first 4 (bottom) bytes.
+ */
+ uint64_t counter;
+};
+
+typedef struct ck_ec64 ck_ec64_t;
+#endif /* CK_F_EC64 */
+
+#define CK_EC_INITIALIZER { .counter = 0 }
+
+/*
+ * Initializes the event count to `value`. The value must not
+ * exceed INT32_MAX.
+ */
+static void ck_ec32_init(struct ck_ec32 *ec, uint32_t value);
+
+#ifndef CK_F_EC64
+#define ck_ec_init ck_ec32_init
+#else
+/*
+ * Initializes the event count to `value`. The value must not
+ * exceed INT64_MAX.
+ */
+static void ck_ec64_init(struct ck_ec64 *ec, uint64_t value);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_init(EC, VALUE) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_init, \
+ struct ck_ec64 : ck_ec64_init)((EC), (VALUE)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Returns the counter value in the event count. The value is at most
+ * INT32_MAX.
+ */
+static uint32_t ck_ec32_value(const struct ck_ec32* ec);
+
+#ifndef CK_F_EC64
+#define ck_ec_value ck_ec32_value
+#else
+/*
+ * Returns the counter value in the event count. The value is at most
+ * INT64_MAX.
+ */
+static uint64_t ck_ec64_value(const struct ck_ec64* ec);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_value(EC) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_value, \
+ struct ck_ec64 : ck_ec64_value)((EC)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Returns whether there may be slow pathed waiters that need an
+ * explicit OS wakeup for this event count.
+ */
+static bool ck_ec32_has_waiters(const struct ck_ec32 *ec);
+
+#ifndef CK_F_EC64
+#define ck_ec_has_waiters ck_ec32_has_waiters
+#else
+static bool ck_ec64_has_waiters(const struct ck_ec64 *ec);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_has_waiters(EC) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_has_waiters, \
+ struct ck_ec64 : ck_ec64_has_waiters)((EC)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Increments the counter value in the event count by one, and wakes
+ * up any waiter.
+ */
+static void ck_ec32_inc(struct ck_ec32 *ec, const struct ck_ec_mode *mode);
+
+#ifndef CK_F_EC64
+#define ck_ec_inc ck_ec32_inc
+#else
+static void ck_ec64_inc(struct ck_ec64 *ec, const struct ck_ec_mode *mode);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_inc(EC, MODE) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_inc, \
+ struct ck_ec64 : ck_ec64_inc)((EC), (MODE)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Increments the counter value in the event count by delta, wakes
+ * up any waiter, and returns the previous counter value.
+ */
+static uint32_t ck_ec32_add(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t delta);
+
+#ifndef CK_F_EC64
+#define ck_ec_add ck_ec32_add
+#else
+static uint64_t ck_ec64_add(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t delta);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_add(EC, MODE, DELTA) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_add, \
+ struct ck_ec64 : ck_ec64_add)((EC), (MODE), (DELTA)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Populates `new_deadline` with a deadline `timeout` in the future.
+ * Returns 0 on success, and -1 if mode->ops->gettime failed, in which
+ * case errno is left as is.
+ */
+static int ck_ec_deadline(struct timespec *new_deadline,
+ const struct ck_ec_mode *mode,
+ const struct timespec *timeout);
+
+/*
+ * Waits until the counter value in the event count differs from
+ * old_value, or, if deadline is non-NULL, until CLOCK_MONOTONIC is
+ * past the deadline.
+ *
+ * Returns 0 on success, and -1 on timeout.
+ */
+static int ck_ec32_wait(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t old_value,
+ const struct timespec *deadline);
+
+#ifndef CK_F_EC64
+#define ck_ec_wait ck_ec32_wait
+#else
+static int ck_ec64_wait(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t old_value,
+ const struct timespec *deadline);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_wait(EC, MODE, OLD_VALUE, DEADLINE) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_wait, \
+ struct ck_ec64 : ck_ec64_wait)((EC), (MODE), \
+ (OLD_VALUE), (DEADLINE)))
+
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Waits until the counter value in the event count differs from
+ * old_value, pred returns non-zero, or, if deadline is non-NULL,
+ * until CLOCK_MONOTONIC is past the deadline.
+ *
+ * Returns 0 on success, -1 on timeout, and the return value of pred
+ * if it returns non-zero.
+ *
+ * A NULL pred represents a function that always returns 0.
+ */
+static int ck_ec32_wait_pred(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline);
+
+#ifndef CK_F_EC64
+#define ck_ec_wait_pred ck_ec32_wait_pred
+#else
+static int ck_ec64_wait_pred(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline);
+
+#if __STDC_VERSION__ >= 201112L
+#define ck_ec_wait_pred(EC, MODE, OLD_VALUE, PRED, DATA, DEADLINE) \
+ (_Generic(*(EC), \
+ struct ck_ec32 : ck_ec32_wait_pred, \
+ struct ck_ec64 : ck_ec64_wait_pred) \
+ ((EC), (MODE), (OLD_VALUE), (PRED), (DATA), (DEADLINE)))
+#endif /* __STDC_VERSION__ */
+#endif /* CK_F_EC64 */
+
+/*
+ * Inline implementation details. 32 bit first, then 64 bit
+ * conditionally.
+ */
+CK_CC_FORCE_INLINE void ck_ec32_init(struct ck_ec32 *ec, uint32_t value)
+{
+ ec->counter = value & ~(1UL << 31); /* Plain store; the flag bit (bit 31) starts cleared. */
+ return;
+}
+
+CK_CC_FORCE_INLINE uint32_t ck_ec32_value(const struct ck_ec32 *ec)
+{
+ uint32_t ret = ck_pr_load_32(&ec->counter) & ~(1UL << 31); /* Mask off the flag bit. */
+
+ ck_pr_fence_acquire(); /* Issued after the load: the documented read (acquire) barrier. */
+ return ret;
+}
+
+CK_CC_FORCE_INLINE bool ck_ec32_has_waiters(const struct ck_ec32 *ec)
+{
+ return ck_pr_load_32(&ec->counter) & (1UL << 31); /* True iff the flag bit (bit 31) is set. */
+}
+
+/* Slow path for ck_ec{32,64}_{inc,add} */
+void ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops);
+
+CK_CC_FORCE_INLINE void ck_ec32_inc(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode)
+{
+#if !defined(CK_F_EC_SP)
+ /* Nothing to specialize if we don't have EC_SP. */
+ ck_ec32_add(ec, mode, 1);
+ return;
+#else
+ char flagged; /* Set by the asm below: non-zero iff the "le" condition held after incl. */
+
+#if __GNUC__ >= 6
+ /*
+ * We don't want to wake if the sign bit is 0. We do want to
+ * wake if the sign bit just flipped from 1 to 0. We don't
+ * care what happens when our increment caused the sign bit to
+ * flip from 0 to 1 (that's once per 2^31 increments).
+ *
+ * This leaves us with four cases:
+ *
+ * old sign bit | new sign bit | SF | OF | ZF
+ * -------------------------------------------
+ * 0 | 0 | 0 | 0 | ?
+ * 0 | 1 | 1 | 1 | ?
+ * 1 | 1 | 1 | 0 | ?
+ * 1 | 0 | 0 | 0 | 1
+ *
+ * In the first case, we don't want to hit ck_ec32_wake. In
+ * the last two cases, we do want to call ck_ec32_wake. In the
+ * second case, we don't care; INC reports signed overflow
+ * there (OF = 1), so "le" is false and we skip ck_ec32_wake.
+ *
+ * The "le" condition checks if SF != OF, or ZF == 1, which
+ * meets our requirements.
+ */
+#define CK_EC32_INC_ASM(PREFIX) \
+ __asm__ volatile(PREFIX " incl %0" \
+ : "+m"(ec->counter), "=@ccle"(flagged) \
+ :: "cc", "memory")
+#else
+#define CK_EC32_INC_ASM(PREFIX) \
+ __asm__ volatile(PREFIX " incl %0; setle %1" \
+ : "+m"(ec->counter), "=r"(flagged) \
+ :: "cc", "memory")
+#endif /* __GNUC__ */
+
+ if (mode->single_producer == true) {
+ ck_pr_fence_store();
+ CK_EC32_INC_ASM(""); /* No prefix: plain, non-atomic incl. */
+ } else {
+ ck_pr_fence_store_atomic();
+ CK_EC32_INC_ASM("lock"); /* lock-prefixed incl for multiple producers. */
+ }
+#undef CK_EC32_INC_ASM
+
+ if (CK_CC_UNLIKELY(flagged)) {
+ ck_ec32_wake(ec, mode->ops); /* Out-of-line slow path. */
+ }
+
+ return;
+#endif /* CK_F_EC_SP */
+}
+
+CK_CC_FORCE_INLINE uint32_t ck_ec32_add_epilogue(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t old)
+{
+ const uint32_t flag_mask = 1U << 31;
+ uint32_t ret;
+
+ ret = old & ~flag_mask; /* Pre-add counter value, flag bit stripped. */
+ /* These two only differ if the flag bit was set in `old`. */
+ if (CK_CC_UNLIKELY(old != ret)) {
+ ck_ec32_wake(ec, mode->ops);
+ }
+
+ return ret;
+}
+
+static CK_CC_INLINE uint32_t ck_ec32_add_mp(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t delta)
+{
+ uint32_t old;
+
+ ck_pr_fence_store_atomic();
+ old = ck_pr_faa_32(&ec->counter, delta); /* Fetch-and-add; `old` is the raw word, flag bit included. */
+ return ck_ec32_add_epilogue(ec, mode, old);
+}
+
+#ifdef CK_F_EC_SP
+static CK_CC_INLINE uint32_t ck_ec32_add_sp(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t delta)
+{
+ uint32_t old;
+
+ /*
+ * Correctness of this racy write depends on actually
+ * having an update to write. Exit here if the update
+ * is a no-op.
+ */
+ if (CK_CC_UNLIKELY(delta == 0)) {
+ return ck_ec32_value(ec);
+ }
+
+ ck_pr_fence_store();
+ old = delta; /* xadd reads the addend from this register and writes the previous counter word back into it. */
+ __asm__ volatile("xaddl %1, %0"
+ : "+m"(ec->counter), "+r"(old)
+ :: "cc", "memory"); /* No lock prefix: non-atomic read-modify-write. */
+ return ck_ec32_add_epilogue(ec, mode, old);
+}
+#endif /* CK_F_EC_SP */
+
+CK_CC_FORCE_INLINE uint32_t ck_ec32_add(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t delta)
+{
+#ifdef CK_F_EC_SP
+ if (mode->single_producer == true) {
+ return ck_ec32_add_sp(ec, mode, delta);
+ }
+#endif
+
+ return ck_ec32_add_mp(ec, mode, delta); /* Also the only path when CK_F_EC_SP is undefined. */
+}
+
+int ck_ec_deadline_impl(struct timespec *new_deadline,
+ const struct ck_ec_ops *ops,
+ const struct timespec *timeout);
+
+CK_CC_FORCE_INLINE int ck_ec_deadline(struct timespec *new_deadline,
+ const struct ck_ec_mode *mode,
+ const struct timespec *timeout)
+{
+ return ck_ec_deadline_impl(new_deadline, mode->ops, timeout); /* Only the ops table is forwarded out of line. */
+}
+
+
+int ck_ec32_wait_slow(struct ck_ec32 *ec,
+ const struct ck_ec_ops *ops,
+ uint32_t old_value,
+ const struct timespec *deadline);
+
+CK_CC_FORCE_INLINE int ck_ec32_wait(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t old_value,
+ const struct timespec *deadline)
+{
+ if (ck_ec32_value(ec) != old_value) { /* Fast path: the counter already differs. */
+ return 0;
+ }
+
+ return ck_ec32_wait_slow(ec, mode->ops, old_value, deadline); /* Out-of-line slow path. */
+}
+
+int ck_ec32_wait_pred_slow(struct ck_ec32 *ec,
+ const struct ck_ec_ops *ops,
+ uint32_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *state,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline);
+
+CK_CC_FORCE_INLINE int
+ck_ec32_wait_pred(struct ck_ec32 *ec,
+ const struct ck_ec_mode *mode,
+ uint32_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *state,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline)
+{
+ if (ck_ec32_value(ec) != old_value) { /* Fast path: pred is not consulted here. */
+ return 0;
+ }
+
+ return ck_ec32_wait_pred_slow(ec, mode->ops, old_value,
+ pred, data, deadline); /* Out-of-line slow path receives pred and data. */
+}
+
+#ifdef CK_F_EC64
+CK_CC_FORCE_INLINE void ck_ec64_init(struct ck_ec64 *ec, uint64_t value)
+{
+ ec->counter = value << 1; /* Plain store; value goes in bits 1:63, flag bit (bit 0) starts cleared. */
+ return;
+}
+
+CK_CC_FORCE_INLINE uint64_t ck_ec64_value(const struct ck_ec64 *ec)
+{
+ uint64_t ret = ck_pr_load_64(&ec->counter) >> 1; /* Shift out the flag bit. */
+
+ ck_pr_fence_acquire(); /* Issued after the load: the documented read (acquire) barrier. */
+ return ret;
+}
+
+CK_CC_FORCE_INLINE bool ck_ec64_has_waiters(const struct ck_ec64 *ec)
+{
+ return ck_pr_load_64(&ec->counter) & 1; /* True iff the flag bit (bit 0) is set. */
+}
+
+void ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops);
+
+CK_CC_FORCE_INLINE void ck_ec64_inc(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode)
+{
+ /* We always xadd, so there's no inc-specific fast path: defer to ck_ec64_add. */
+ (void)ck_ec64_add(ec, mode, 1);
+ return;
+}
+
+CK_CC_FORCE_INLINE uint64_t ck_ec_add64_epilogue(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t old)
+{
+ uint64_t ret = old >> 1; /* Pre-add counter value, flag bit shifted out. */
+
+ if (CK_CC_UNLIKELY(old & 1)) { /* Flag bit was set in the pre-add word. */
+ ck_ec64_wake(ec, mode->ops);
+ }
+
+ return ret;
+}
+
+static CK_CC_INLINE uint64_t ck_ec64_add_mp(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t delta)
+{
+ uint64_t inc = 2 * delta; /* The low bit is the flag bit, so the value is scaled by two. */
+
+ ck_pr_fence_store_atomic();
+ return ck_ec_add64_epilogue(ec, mode, ck_pr_faa_64(&ec->counter, inc)); /* Epilogue gets the raw pre-add word. */
+}
+
+#ifdef CK_F_EC_SP
+/* Single-producer specialisation. */
+static CK_CC_INLINE uint64_t ck_ec64_add_sp(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t delta)
+{
+ uint64_t old;
+
+ /*
+ * Correctness of this racy write depends on actually
+ * having an update to write. Exit here if the update
+ * is a no-op.
+ */
+ if (CK_CC_UNLIKELY(delta == 0)) {
+ return ck_ec64_value(ec);
+ }
+
+ ck_pr_fence_store();
+ old = 2 * delta; /* The low bit is the flag bit; xadd then overwrites `old` with the previous counter word. */
+ __asm__ volatile("xaddq %1, %0"
+ : "+m"(ec->counter), "+r"(old)
+ :: "cc", "memory"); /* No lock prefix: non-atomic read-modify-write. */
+ return ck_ec_add64_epilogue(ec, mode, old);
+}
+#endif /* CK_F_EC_SP */
+
+/*
+ * Dispatch on mode->single_producer in this FORCE_INLINE function:
+ * the end result is always small, but not all compilers have enough
+ * foresight to inline and get the reduction.
+ */
+CK_CC_FORCE_INLINE uint64_t ck_ec64_add(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t delta)
+{
+#ifdef CK_F_EC_SP
+ if (mode->single_producer == true) {
+ return ck_ec64_add_sp(ec, mode, delta);
+ }
+#endif
+
+ return ck_ec64_add_mp(ec, mode, delta); /* Also the only path when CK_F_EC_SP is undefined. */
+}
+
+int ck_ec64_wait_slow(struct ck_ec64 *ec,
+ const struct ck_ec_ops *ops,
+ uint64_t old_value,
+ const struct timespec *deadline);
+
+CK_CC_FORCE_INLINE int ck_ec64_wait(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t old_value,
+ const struct timespec *deadline)
+{
+ if (ck_ec64_value(ec) != old_value) { /* Fast path: the counter already differs. */
+ return 0;
+ }
+
+ return ck_ec64_wait_slow(ec, mode->ops, old_value, deadline); /* Out-of-line slow path. */
+}
+
+int ck_ec64_wait_pred_slow(struct ck_ec64 *ec,
+ const struct ck_ec_ops *ops,
+ uint64_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *state,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline);
+
+
+CK_CC_FORCE_INLINE int
+ck_ec64_wait_pred(struct ck_ec64 *ec,
+ const struct ck_ec_mode *mode,
+ uint64_t old_value,
+ int (*pred)(const struct ck_ec_wait_state *state,
+ struct timespec *deadline),
+ void *data,
+ const struct timespec *deadline)
+{
+ if (ck_ec64_value(ec) != old_value) { /* Fast path: pred is not consulted here. */
+ return 0;
+ }
+
+ return ck_ec64_wait_pred_slow(ec, mode->ops, old_value,
+ pred, data, deadline); /* Out-of-line slow path receives pred and data. */
+}
+#endif /* CK_F_EC64 */
+#endif /* !CK_EC_H */
diff --git a/include/ck_fifo.h b/include/ck_fifo.h
index 6d500708c445..c9a6f3d9a87d 100644
--- a/include/ck_fifo.h
+++ b/include/ck_fifo.h
@@ -115,7 +115,7 @@ CK_CC_INLINE static void
ck_fifo_spsc_deinit(struct ck_fifo_spsc *fifo, struct ck_fifo_spsc_entry **garbage)
{
- *garbage = fifo->head;
+ *garbage = fifo->garbage;
fifo->head = fifo->tail = NULL;
return;
}
diff --git a/include/ck_hs.h b/include/ck_hs.h
index 3c12b6e602a7..cd3e5dac87aa 100644
--- a/include/ck_hs.h
+++ b/include/ck_hs.h
@@ -109,6 +109,14 @@ typedef struct ck_hs_iterator ck_hs_iterator_t;
/* Convenience wrapper to table hash function. */
#define CK_HS_HASH(T, F, K) F((K), (T)->seed)
+/*
+ * Computes the hash of key k for the specified hash set by calling the
+ * set's own hash function (hs->hf) with the set's seed (hs->seed).
+ */
+static inline unsigned long
+ck_hs_hash(const struct ck_hs *hs, const void *k)
+{
+
+ return hs->hf(k, hs->seed);
+}
+
typedef void *ck_hs_apply_fn_t(void *, void *);
bool ck_hs_apply(ck_hs_t *, unsigned long, const void *, ck_hs_apply_fn_t *, void *);
void ck_hs_iterator_init(ck_hs_iterator_t *);
diff --git a/include/ck_pr.h b/include/ck_pr.h
index 2de6e13ec3c9..8ebf855692dd 100644
--- a/include/ck_pr.h
+++ b/include/ck_pr.h
@@ -34,7 +34,20 @@
#include <ck_stdint.h>
#include <ck_stdbool.h>
-#ifndef CK_USE_CC_BUILTINS
+/*
+ * Default to using builtins for clang analyzer, coverity, and sparse:
+ * inline assembly is often too opaque for useful analysis. Override
+ * the defaults by defining CK_USE_CC_BUILTINS=0 or 1.
+ */
+#if !defined(CK_USE_CC_BUILTINS)
+#if defined(__clang_analyzer__) || defined(__COVERITY__) || defined(__CHECKER__)
+#define CK_USE_CC_BUILTINS 1
+#else
+#define CK_USE_CC_BUILTINS 0
+#endif
+#endif
+
+#if !CK_USE_CC_BUILTINS
#if defined(__x86_64__)
#include "gcc/x86_64/ck_pr.h"
#elif defined(__x86__)
diff --git a/include/ck_queue.h b/include/ck_queue.h
index 3f503aa6c3e5..fd38d8a583fa 100644
--- a/include/ck_queue.h
+++ b/include/ck_queue.h
@@ -53,7 +53,7 @@
* SUCH DAMAGE.
*
* @(#)queue.h 8.5 (Berkeley) 8/20/94
- * $FreeBSD$
+ * $FreeBSD: release/9.0.0/sys/sys/queue.h 221843 2011-05-13 15:49:23Z mdf $
*/
#ifndef CK_QUEUE_H
@@ -150,17 +150,17 @@ struct { \
#define CK_SLIST_FOREACH(var, head, field) \
for ((var) = CK_SLIST_FIRST((head)); \
- (var) && (ck_pr_fence_load(), 1); \
+ (var); \
(var) = CK_SLIST_NEXT((var), field))
-#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \
- for ((var) = CK_SLIST_FIRST(head); \
- (var) && (ck_pr_fence_load(), (tvar) = CK_SLIST_NEXT(var, field), 1);\
+#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = CK_SLIST_FIRST(head); \
+ (var) && ((tvar) = CK_SLIST_NEXT(var, field), 1); \
(var) = (tvar))
#define CK_SLIST_FOREACH_PREVPTR(var, varp, head, field) \
for ((varp) = &(head)->cslh_first; \
- ((var) = ck_pr_load_ptr(varp)) != NULL && (ck_pr_fence_load(), 1); \
+ ((var) = ck_pr_load_ptr(varp)) != NULL; \
(varp) = &(var)->field.csle_next)
#define CK_SLIST_INIT(head) do { \
@@ -259,12 +259,12 @@ struct { \
#define CK_STAILQ_FOREACH(var, head, field) \
for((var) = CK_STAILQ_FIRST((head)); \
- (var) && (ck_pr_fence_load(), 1); \
+ (var); \
(var) = CK_STAILQ_NEXT((var), field))
#define CK_STAILQ_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_STAILQ_FIRST((head)); \
- (var) && (ck_pr_fence_load(), (tvar) = \
+ (var) && ((tvar) = \
CK_STAILQ_NEXT((var), field), 1); \
(var) = (tvar))
@@ -371,12 +371,12 @@ struct { \
#define CK_LIST_FOREACH(var, head, field) \
for ((var) = CK_LIST_FIRST((head)); \
- (var) && (ck_pr_fence_load(), 1); \
+ (var); \
(var) = CK_LIST_NEXT((var), field))
#define CK_LIST_FOREACH_SAFE(var, head, field, tvar) \
for ((var) = CK_LIST_FIRST((head)); \
- (var) && (ck_pr_fence_load(), (tvar) = CK_LIST_NEXT((var), field), 1);\
+ (var) && ((tvar) = CK_LIST_NEXT((var), field), 1); \
(var) = (tvar))
#define CK_LIST_INIT(head) do { \
diff --git a/include/ck_ring.h b/include/ck_ring.h
index e5f0712ef7cf..9f6754e0cd24 100644
--- a/include/ck_ring.h
+++ b/include/ck_ring.h
@@ -66,9 +66,56 @@ ck_ring_size(const struct ck_ring *ring)
CK_CC_INLINE static unsigned int
ck_ring_capacity(const struct ck_ring *ring)
{
+
return ring->size;
}
+/*
+ * This function is only safe to call when there are no concurrent operations
+ * on the ring. This is primarily meant for persistent ck_ring use-cases.
+ *
+ * If p_tail differs from p_head, p_tail is reset to p_head. The function
+ * returns true if any mutations were performed on the ring and false if the
+ * two counters already agreed.
+ */
+CK_CC_INLINE static bool
+ck_ring_repair(struct ck_ring *ring)
+{
+ bool r = false;
+
+ if (ring->p_tail != ring->p_head) {
+ ring->p_tail = ring->p_head;
+ r = true;
+ }
+
+ return r;
+}
+
+/*
+ * This can be called when no concurrent updates are occurring on the ring
+ * structure to check for consistency. This is primarily meant to be used for
+ * persistent storage of the ring. If this function returns false, the ring
+ * is in an inconsistent state.
+ *
+ * NOTE(review): c_head and p_head are compared as plain unsigned values, so
+ * a producer counter that has wrapped around would be reported as invalid;
+ * confirm that this is acceptable for the intended use-cases.
+ */
+CK_CC_INLINE static bool
+ck_ring_valid(const struct ck_ring *ring)
+{
+ unsigned int size = ring->size;
+ unsigned int c_head = ring->c_head;
+ unsigned int p_head = ring->p_head;
+
+ /* The ring size must be a power of 2. */
+ if (size & (size - 1))
+ return false;
+
+ /* The consumer counter must never be ahead of the producer counter. */
+ if (c_head > p_head)
+ return false;
+
+ /* The producer may be at most size - 1 slots ahead of the consumer. */
+ if (p_head - c_head >= size)
+ return false;
+
+ return true;
+}
+
CK_CC_INLINE static void
ck_ring_init(struct ck_ring *ring, unsigned int size)
{
@@ -84,6 +131,45 @@ ck_ring_init(struct ck_ring *ring, unsigned int size)
/*
 * The _ck_ring_* namespace is internal only and must not be used externally.
*/
+
+/*
+ * This function will return a region of memory to write for the next value
+ * for a single producer.
+ *
+ * buffer is the backing storage of the ring and ts is the size in bytes of
+ * one slot. If size is not NULL, it receives (p_tail - c_head) & mask as
+ * observed by this call, whether or not the reservation succeeds.
+ *
+ * Returns NULL when advancing the producer by one slot would land on the
+ * consumer's slot (the ring is full); otherwise returns the address of the
+ * slot at index (p_tail & mask). The producer counter itself is not
+ * modified here.
+ */
+CK_CC_FORCE_INLINE static void *
+_ck_ring_enqueue_reserve_sp(struct ck_ring *ring,
+ void *CK_CC_RESTRICT buffer,
+ unsigned int ts,
+ unsigned int *size)
+{
+ const unsigned int mask = ring->mask;
+ unsigned int consumer, producer, delta;
+
+ consumer = ck_pr_load_uint(&ring->c_head);
+ producer = ring->p_tail;
+ delta = producer + 1;
+ if (size != NULL)
+ *size = (producer - consumer) & mask;
+
+ if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask)))
+ return NULL;
+
+ return (char *)buffer + ts * (producer & mask);
+}
+
+/*
+ * This is to be called to commit and make visible a region previously
+ * reserved with _ck_ring_enqueue_reserve_sp. A store fence is issued first,
+ * then p_tail is advanced by one.
+ */
+CK_CC_FORCE_INLINE static void
+_ck_ring_enqueue_commit_sp(struct ck_ring *ring)
+{
+
+ ck_pr_fence_store();
+ ck_pr_store_uint(&ring->p_tail, ring->p_tail + 1);
+ return;
+}
+
CK_CC_FORCE_INLINE static bool
_ck_ring_enqueue_sp(struct ck_ring *ring,
void *CK_CC_RESTRICT buffer,
@@ -163,6 +249,65 @@ _ck_ring_dequeue_sc(struct ck_ring *ring,
return true;
}
+/*
+ * Reserves the next slot of the ring on behalf of one of several concurrent
+ * producers.
+ *
+ * buffer is the backing storage of the ring and ts is the size in bytes of
+ * one slot. The loop retries until either the compare-and-swap that
+ * advances p_head by one succeeds, or the ring is observed to be full
+ * ((producer - consumer) >= mask) while p_head has not moved between two
+ * consecutive loads.
+ *
+ * On success, *ticket receives the producer value that must later be handed
+ * to _ck_ring_enqueue_commit_mp, and the address of the reserved slot is
+ * returned. On failure NULL is returned and *ticket is left untouched.
+ * In both cases, if size is not NULL, it receives
+ * (producer - consumer) & mask as observed by this call.
+ */
+CK_CC_FORCE_INLINE static void *
+_ck_ring_enqueue_reserve_mp(struct ck_ring *ring,
+ void *buffer,
+ unsigned int ts,
+ unsigned int *ticket,
+ unsigned int *size)
+{
+ const unsigned int mask = ring->mask;
+ unsigned int producer, consumer, delta;
+
+ producer = ck_pr_load_uint(&ring->p_head);
+
+ for (;;) {
+ ck_pr_fence_load();
+ consumer = ck_pr_load_uint(&ring->c_head);
+
+ delta = producer + 1;
+
+ if (CK_CC_LIKELY((producer - consumer) < mask)) {
+ if (ck_pr_cas_uint_value(&ring->p_head,
+ producer, delta, &producer) == true) {
+ break;
+ }
+ } else {
+ unsigned int new_producer;
+
+ ck_pr_fence_load();
+ new_producer = ck_pr_load_uint(&ring->p_head);
+
+ if (producer == new_producer) {
+ if (size != NULL)
+ *size = (producer - consumer) & mask;
+
+ /* This function returns a pointer: report failure as NULL. */
+ return NULL;
+ }
+
+ producer = new_producer;
+ }
+ }
+
+ *ticket = producer;
+ if (size != NULL)
+ *size = (producer - consumer) & mask;
+
+ return (char *)buffer + ts * (producer & mask);
+}
+
+/*
+ * Commits a slot for one of several producers. producer is the ticket for
+ * that slot. The caller spins (ck_pr_stall) until p_tail equals the ticket,
+ * then issues a store fence and advances p_tail to producer + 1.
+ */
+CK_CC_FORCE_INLINE static void
+_ck_ring_enqueue_commit_mp(struct ck_ring *ring, unsigned int producer)
+{
+
+ while (ck_pr_load_uint(&ring->p_tail) != producer)
+ ck_pr_stall();
+
+ ck_pr_fence_store();
+ ck_pr_store_uint(&ring->p_tail, producer + 1);
+ return;
+}
+
CK_CC_FORCE_INLINE static bool
_ck_ring_enqueue_mp(struct ck_ring *ring,
void *buffer,
@@ -354,6 +499,33 @@ ck_ring_enqueue_spsc(struct ck_ring *ring,
&entry, sizeof(entry), NULL);
}
+/*
+ * Forwards to _ck_ring_enqueue_reserve_sp with a slot size of
+ * sizeof(void *) and passes size through; returns that function's result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_spsc_size(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *size)
+{
+
+ return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *),
+ size);
+}
+
+/*
+ * Forwards to _ck_ring_enqueue_reserve_sp with a slot size of
+ * sizeof(void *) and no size output (NULL); returns that function's result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_spsc(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer)
+{
+
+ return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *),
+ NULL);
+}
+
+/*
+ * Commits a reserved slot by forwarding to _ck_ring_enqueue_commit_sp.
+ */
+CK_CC_INLINE static void
+ck_ring_enqueue_commit_spsc(struct ck_ring *ring)
+{
+
+ _ck_ring_enqueue_commit_sp(ring);
+ return;
+}
+
CK_CC_INLINE static bool
ck_ring_dequeue_spsc(struct ck_ring *ring,
const struct ck_ring_buffer *buffer,
@@ -375,8 +547,7 @@ ck_ring_enqueue_mpmc(struct ck_ring *ring,
const void *entry)
{
- return _ck_ring_enqueue_mp(ring, buffer, &entry,
- sizeof(entry), NULL);
+ return _ck_ring_enqueue_mp(ring, buffer, &entry, sizeof(entry), NULL);
}
CK_CC_INLINE static bool
@@ -386,8 +557,37 @@ ck_ring_enqueue_mpmc_size(struct ck_ring *ring,
unsigned int *size)
{
- return _ck_ring_enqueue_mp_size(ring, buffer, &entry,
- sizeof(entry), size);
+ return _ck_ring_enqueue_mp_size(ring, buffer, &entry, sizeof(entry),
+ size);
+}
+
+/*
+ * Forwards to _ck_ring_enqueue_reserve_mp with a slot size of
+ * sizeof(void *), the caller's ticket pointer and no size output (NULL);
+ * returns that function's result. The ticket is the value to pass to
+ * ck_ring_enqueue_commit_mpmc.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_mpmc(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *ticket)
+{
+
+ return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
+ ticket, NULL);
+}
+
+/*
+ * Forwards to _ck_ring_enqueue_reserve_mp with a slot size of
+ * sizeof(void *), passing ticket and size through; returns that function's
+ * result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_mpmc_size(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *ticket,
+ unsigned int *size)
+{
+
+ return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
+ ticket, size);
+}
+
+/*
+ * Commits the slot identified by ticket by forwarding to
+ * _ck_ring_enqueue_commit_mp.
+ */
+CK_CC_INLINE static void
+ck_ring_enqueue_commit_mpmc(struct ck_ring *ring, unsigned int ticket)
+{
+
+ _ck_ring_enqueue_commit_mp(ring, ticket);
+ return;
 }
CK_CC_INLINE static bool
@@ -415,6 +615,31 @@ ck_ring_dequeue_mpmc(struct ck_ring *ring,
* ring buffer containing pointers. Correctness is provided for any number of
* consumers with up to one concurrent producer.
*/
+/*
+ * Forwards to _ck_ring_enqueue_reserve_sp with a slot size of
+ * sizeof(void *) and passes size through; returns that function's result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_spmc_size(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *size)
+{
+
+ return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), size);
+}
+
+/*
+ * Forwards to _ck_ring_enqueue_reserve_sp with a slot size of
+ * sizeof(void *) and no size output (NULL); returns that function's result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_spmc(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer)
+{
+
+ return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), NULL);
+}
+
+/*
+ * Commits a reserved slot by forwarding to _ck_ring_enqueue_commit_sp.
+ */
+CK_CC_INLINE static void
+ck_ring_enqueue_commit_spmc(struct ck_ring *ring)
+{
+
+ _ck_ring_enqueue_commit_sp(ring);
+ return;
+}
+
CK_CC_INLINE static bool
ck_ring_enqueue_spmc_size(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
@@ -459,6 +684,35 @@ ck_ring_dequeue_spmc(struct ck_ring *ring,
* ring buffer containing pointers. Correctness is provided for any number of
 * producers with up to one concurrent consumer.
*/
+/*
+ * Forwards to _ck_ring_enqueue_reserve_mp with a slot size of
+ * sizeof(void *), the caller's ticket pointer and no size output (NULL);
+ * returns that function's result. The ticket is the value to pass to
+ * ck_ring_enqueue_commit_mpsc.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_mpsc(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *ticket)
+{
+
+ return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
+ ticket, NULL);
+}
+
+/*
+ * Forwards to _ck_ring_enqueue_reserve_mp with a slot size of
+ * sizeof(void *), passing ticket and size through; returns that function's
+ * result.
+ */
+CK_CC_INLINE static void *
+ck_ring_enqueue_reserve_mpsc_size(struct ck_ring *ring,
+ struct ck_ring_buffer *buffer,
+ unsigned int *ticket,
+ unsigned int *size)
+{
+
+ return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *),
+ ticket, size);
+}
+
+/*
+ * Commits the slot identified by ticket by forwarding to
+ * _ck_ring_enqueue_commit_mp.
+ */
+CK_CC_INLINE static void
+ck_ring_enqueue_commit_mpsc(struct ck_ring *ring, unsigned int ticket)
+{
+
+ _ck_ring_enqueue_commit_mp(ring, ticket);
+ return;
+}
+
CK_CC_INLINE static bool
ck_ring_enqueue_mpsc(struct ck_ring *ring,
struct ck_ring_buffer *buffer,
@@ -494,194 +748,290 @@ ck_ring_dequeue_mpsc(struct ck_ring *ring,
* CK_RING_PROTOTYPE is used to define a type-safe interface for inlining
 * values of a particular type in the ring buffer.
*/
-#define CK_RING_PROTOTYPE(name, type) \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c, \
- unsigned int *d) \
-{ \
- \
- return _ck_ring_enqueue_sp_size(a, b, c, \
- sizeof(struct type), d); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_spsc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_enqueue_sp(a, b, c, \
- sizeof(struct type), NULL); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_dequeue_spsc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_dequeue_sc(a, b, c, \
- sizeof(struct type)); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c, \
- unsigned int *d) \
-{ \
- \
- return _ck_ring_enqueue_sp_size(a, b, c, \
- sizeof(struct type), d); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_spmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_enqueue_sp(a, b, c, \
- sizeof(struct type), NULL); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_trydequeue_mc(a, \
- b, c, sizeof(struct type)); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_dequeue_spmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_dequeue_mc(a, b, c, \
- sizeof(struct type)); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_enqueue_mp(a, b, c, \
- sizeof(struct type), NULL); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c, \
- unsigned int *d) \
-{ \
- \
- return _ck_ring_enqueue_mp_size(a, b, c, \
- sizeof(struct type), d); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_dequeue_sc(a, b, c, \
- sizeof(struct type)); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c, \
- unsigned int *d) \
-{ \
- \
- return _ck_ring_enqueue_mp_size(a, b, c, \
- sizeof(struct type), d); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_enqueue_mp(a, b, c, \
- sizeof(struct type), NULL); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_trydequeue_mc(a, \
- b, c, sizeof(struct type)); \
-} \
- \
-CK_CC_INLINE static bool \
-ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \
- struct type *b, \
- struct type *c) \
-{ \
- \
- return _ck_ring_dequeue_mc(a, b, c, \
- sizeof(struct type)); \
+#define CK_RING_PROTOTYPE(name, type) \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_spsc_##name(struct ck_ring *a, \
+ struct type *b) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_sp(a, b, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_spsc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_sp(a, b, \
+ sizeof(struct type), c); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_sp_size(a, b, c, \
+ sizeof(struct type), d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_spsc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_enqueue_sp(a, b, c, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_dequeue_spsc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_dequeue_sc(a, b, c, \
+ sizeof(struct type)); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_spmc_##name(struct ck_ring *a, \
+ struct type *b) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_sp(a, b, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_spmc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_sp(a, b, \
+ sizeof(struct type), c); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_sp_size(a, b, c, \
+ sizeof(struct type), d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_spmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_enqueue_sp(a, b, c, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_trydequeue_mc(a, \
+ b, c, sizeof(struct type)); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_dequeue_spmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_dequeue_mc(a, b, c, \
+ sizeof(struct type)); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_mpsc_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_mp(a, b, \
+ sizeof(struct type), c, NULL); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_mpsc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_mp(a, b, \
+ sizeof(struct type), c, d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_enqueue_mp(a, b, c, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_mp_size(a, b, c, \
+ sizeof(struct type), d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_dequeue_sc(a, b, c, \
+ sizeof(struct type)); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_mpmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_mp(a, b, \
+ sizeof(struct type), c, NULL); \
+} \
+ \
+CK_CC_INLINE static struct type * \
+ck_ring_enqueue_reserve_mpmc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ unsigned int *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_reserve_mp(a, b, \
+ sizeof(struct type), c, d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c, \
+ unsigned int *d) \
+{ \
+ \
+ return _ck_ring_enqueue_mp_size(a, b, c, \
+ sizeof(struct type), d); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_enqueue_mp(a, b, c, \
+ sizeof(struct type), NULL); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_trydequeue_mc(a, \
+ b, c, sizeof(struct type)); \
+} \
+ \
+CK_CC_INLINE static bool \
+ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \
+ struct type *b, \
+ struct type *c) \
+{ \
+ \
+ return _ck_ring_dequeue_mc(a, b, c, \
+ sizeof(struct type)); \
}
/*
* A single producer with one concurrent consumer.
*/
-#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \
+#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \
ck_ring_enqueue_spsc_##name(a, b, c)
-#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \
+#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_spsc_size_##name(a, b, c, d)
-#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \
+/*
+ * The generated single-producer reserve functions take (ring, buffer) and,
+ * for the _size variant, (ring, buffer, size). The macro argument lists
+ * must match, otherwise every expansion fails to compile.
+ */
+#define CK_RING_ENQUEUE_RESERVE_SPSC(name, a, b) \
+ ck_ring_enqueue_reserve_spsc_##name(a, b)
+#define CK_RING_ENQUEUE_RESERVE_SPSC_SIZE(name, a, b, c) \
+ ck_ring_enqueue_reserve_spsc_size_##name(a, b, c)
+#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \
ck_ring_dequeue_spsc_##name(a, b, c)
/*
* A single producer with any number of concurrent consumers.
*/
-#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \
+#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \
ck_ring_enqueue_spmc_##name(a, b, c)
-#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \
+#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_spmc_size_##name(a, b, c, d)
-#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \
+/*
+ * The generated single-producer reserve functions take (ring, buffer) and,
+ * for the _size variant, (ring, buffer, size). The macro argument lists
+ * must match, otherwise every expansion fails to compile.
+ */
+#define CK_RING_ENQUEUE_RESERVE_SPMC(name, a, b) \
+ ck_ring_enqueue_reserve_spmc_##name(a, b)
+#define CK_RING_ENQUEUE_RESERVE_SPMC_SIZE(name, a, b, c) \
+ ck_ring_enqueue_reserve_spmc_size_##name(a, b, c)
+#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \
ck_ring_trydequeue_spmc_##name(a, b, c)
-#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \
+#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \
ck_ring_dequeue_spmc_##name(a, b, c)
/*
* Any number of concurrent producers with up to one
* concurrent consumer.
*/
-#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \
+#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \
ck_ring_enqueue_mpsc_##name(a, b, c)
-#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \
+#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_mpsc_size_##name(a, b, c, d)
-#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \
+#define CK_RING_ENQUEUE_RESERVE_MPSC(name, a, b, c) \
+ ck_ring_enqueue_reserve_mpsc_##name(a, b, c)
+#define CK_RING_ENQUEUE_RESERVE_MPSC_SIZE(name, a, b, c, d) \
+ ck_ring_enqueue_reserve_mpsc_size_##name(a, b, c, d)
+#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \
ck_ring_dequeue_mpsc_##name(a, b, c)
/*
* Any number of concurrent producers and consumers.
*/
-#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \
+#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \
ck_ring_enqueue_mpmc_##name(a, b, c)
-#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \
+#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \
ck_ring_enqueue_mpmc_size_##name(a, b, c, d)
-#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \
+#define CK_RING_ENQUEUE_RESERVE_MPMC(name, a, b, c) \
+ ck_ring_enqueue_reserve_mpmc_##name(a, b, c)
+#define CK_RING_ENQUEUE_RESERVE_MPMC_SIZE(name, a, b, c, d) \
+ ck_ring_enqueue_reserve_mpmc_size_##name(a, b, c, d)
+#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \
ck_ring_trydequeue_mpmc_##name(a, b, c)
-#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \
+#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \
ck_ring_dequeue_mpmc_##name(a, b, c)
#endif /* CK_RING_H */
diff --git a/include/gcc/aarch64/ck_pr.h b/include/gcc/aarch64/ck_pr.h
index e739c4d5b18e..0a473072fffd 100644
--- a/include/gcc/aarch64/ck_pr.h
+++ b/include/gcc/aarch64/ck_pr.h
@@ -92,7 +92,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY)
ck_pr_md_load_##S(const M *target) \
{ \
long r = 0; \
- __asm__ __volatile__(I " %w0, [%1];" \
+ __asm__ __volatile__(I " %w0, [%1]\n" \
: "=r" (r) \
: "r" (target) \
: "memory"); \
@@ -103,7 +103,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY)
ck_pr_md_load_##S(const M *target) \
{ \
long r = 0; \
- __asm__ __volatile__(I " %0, [%1];" \
+ __asm__ __volatile__(I " %0, [%1]\n" \
: "=r" (r) \
: "r" (target) \
: "memory"); \
@@ -195,10 +195,10 @@ CK_PR_STORE_S_64(double, double, "str")
T previous = 0; \
T tmp = 0; \
__asm__ __volatile__("1:" \
- "ldxr" W " %" R "0, [%2];" \
- "neg %" R "0, %" R "0;" \
- "stxr" W " %w1, %" R "0, [%2];" \
- "cbnz %w1, 1b;" \
+ "ldxr" W " %" R "0, [%2]\n"\
+ "neg %" R "0, %" R "0\n" \
+ "stxr" W " %w1, %" R "0, [%2]\n" \
+ "cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target) \
diff --git a/include/gcc/aarch64/ck_pr_llsc.h b/include/gcc/aarch64/ck_pr_llsc.h
index aa4e3090fa3a..6500d9661c08 100644
--- a/include/gcc/aarch64/ck_pr_llsc.h
+++ b/include/gcc/aarch64/ck_pr_llsc.h
@@ -38,17 +38,17 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u
uint64_t tmp1, tmp2;
__asm__ __volatile__("1:"
- "ldxp %0, %1, [%4];"
- "mov %2, %0;"
- "mov %3, %1;"
- "eor %0, %0, %5;"
- "eor %1, %1, %6;"
- "orr %1, %0, %1;"
- "mov %w0, #0;"
- "cbnz %1, 2f;"
- "stxp %w0, %7, %8, [%4];"
- "cbnz %w0, 1b;"
- "mov %w0, #1;"
+ "ldxp %0, %1, [%4]\n"
+ "mov %2, %0\n"
+ "mov %3, %1\n"
+ "eor %0, %0, %5\n"
+ "eor %1, %1, %6\n"
+ "orr %1, %0, %1\n"
+ "mov %w0, #0\n"
+ "cbnz %1, 2f\n"
+ "stxp %w0, %7, %8, [%4]\n"
+ "cbnz %w0, 1b\n"
+ "mov %w0, #1\n"
"2:"
: "=&r" (tmp1), "=&r" (tmp2), "=&r" (value[0]), "=&r" (value[1])
: "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1])
@@ -72,15 +72,15 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2])
uint64_t tmp1, tmp2;
__asm__ __volatile__("1:"
- "ldxp %0, %1, [%2];"
- "eor %0, %0, %3;"
- "eor %1, %1, %4;"
- "orr %1, %0, %1;"
- "mov %w0, #0;"
- "cbnz %1, 2f;"
- "stxp %w0, %5, %6, [%2];"
- "cbnz %w0, 1b;"
- "mov %w0, #1;"
+ "ldxp %0, %1, [%2]\n"
+ "eor %0, %0, %3\n"
+ "eor %1, %1, %4\n"
+ "orr %1, %0, %1\n"
+ "mov %w0, #0\n"
+ "cbnz %1, 2f\n"
+ "stxp %w0, %5, %6, [%2]\n"
+ "cbnz %w0, 1b\n"
+ "mov %w0, #1\n"
"2:"
: "=&r" (tmp1), "=&r" (tmp2)
: "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1])
@@ -103,12 +103,12 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
T previous; \
T tmp; \
- __asm__ __volatile__("1:" \
- "ldxr" W " %" R "0, [%2];" \
- "cmp %" R "0, %" R "4;" \
- "b.ne 2f;" \
- "stxr" W " %w1, %" R "3, [%2];" \
- "cbnz %w1, 1b;" \
+ __asm__ __volatile__("1:\n" \
+ "ldxr" W " %" R "0, [%2]\n" \
+ "cmp %" R "0, %" R "4\n" \
+ "b.ne 2f\n" \
+ "stxr" W " %w1, %" R "3, [%2]\n" \
+ "cbnz %w1, 1b\n" \
"2:" \
: "=&r" (previous), \
"=&r" (tmp) \
@@ -126,11 +126,11 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
T tmp; \
__asm__ __volatile__( \
"1:" \
- "ldxr" W " %" R "0, [%2];" \
- "cmp %" R "0, %" R "4;" \
- "b.ne 2f;" \
- "stxr" W " %w1, %" R "3, [%2];" \
- "cbnz %w1, 1b;" \
+ "ldxr" W " %" R "0, [%2]\n" \
+ "cmp %" R "0, %" R "4\n" \
+ "b.ne 2f\n" \
+ "stxr" W " %w1, %" R "3, [%2]\n" \
+ "cbnz %w1, 1b\n" \
"2:" \
: "=&r" (previous), \
"=&r" (tmp) \
@@ -167,9 +167,9 @@ CK_PR_CAS_S(char, char, "b", "w")
T previous; \
T tmp; \
__asm__ __volatile__("1:" \
- "ldxr" W " %" R "0, [%2];" \
- "stxr" W " %w1, %" R "3, [%2];"\
- "cbnz %w1, 1b;" \
+ "ldxr" W " %" R "0, [%2]\n"\
+ "stxr" W " %w1, %" R "3, [%2]\n"\
+ "cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target), \
@@ -198,10 +198,10 @@ CK_PR_FAS(char, char, char, "b", "w")
T previous = 0; \
T tmp = 0; \
__asm__ __volatile__("1:" \
- "ldxr" W " %" R "0, [%2];" \
- I ";" \
- "stxr" W " %w1, %" R "0, [%2];" \
- "cbnz %w1, 1b;" \
+ "ldxr" W " %" R "0, [%2]\n"\
+ I "\n" \
+ "stxr" W " %w1, %" R "0, [%2]\n" \
+ "cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target) \
@@ -239,10 +239,10 @@ CK_PR_UNARY_S(char, char, "b")
T previous; \
T tmp; \
__asm__ __volatile__("1:" \
- "ldxr" W " %" R "0, [%2];"\
- I " %" R "0, %" R "0, %" R "3;" \
- "stxr" W " %w1, %" R "0, [%2];" \
- "cbnz %w1, 1b;" \
+ "ldxr" W " %" R "0, [%2]\n"\
+ I " %" R "0, %" R "0, %" R "3\n" \
+ "stxr" W " %w1, %" R "0, [%2]\n" \
+ "cbnz %w1, 1b\n" \
: "=&r" (previous), \
"=&r" (tmp) \
: "r" (target), \
@@ -286,10 +286,10 @@ ck_pr_faa_ptr(void *target, uintptr_t delta)
uintptr_t previous, r, tmp;
__asm__ __volatile__("1:"
- "ldxr %0, [%3];"
- "add %1, %4, %0;"
- "stxr %w2, %1, [%3];"
- "cbnz %w2, 1b;"
+ "ldxr %0, [%3]\n"
+ "add %1, %4, %0\n"
+ "stxr %w2, %1, [%3]\n"
+ "cbnz %w2, 1b\n"
: "=&r" (previous),
"=&r" (r),
"=&r" (tmp)
@@ -306,9 +306,9 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
uint64_t previous, r, tmp;
__asm__ __volatile__("1:"
- "ldxr %0, [%3];"
- "add %1, %4, %0;"
- "stxr %w2, %1, [%3];"
+ "ldxr %0, [%3]\n"
+ "add %1, %4, %0\n"
+ "stxr %w2, %1, [%3]\n"
"cbnz %w2, 1b;"
: "=&r" (previous),
"=&r" (r),
@@ -326,10 +326,10 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
{ \
T previous, r, tmp; \
__asm__ __volatile__("1:" \
- "ldxr" W " %w0, [%3];" \
- "add %w1, %w4, %w0;" \
- "stxr" W " %w2, %w1, [%3];" \
- "cbnz %w2, 1b;" \
+ "ldxr" W " %w0, [%3]\n" \
+ "add %w1, %w4, %w0\n" \
+ "stxr" W " %w2, %w1, [%3]\n" \
+ "cbnz %w2, 1b\n" \
: "=&r" (previous), \
"=&r" (r), \
"=&r" (tmp) \
diff --git a/include/gcc/aarch64/ck_pr_lse.h b/include/gcc/aarch64/ck_pr_lse.h
index e2c9554c8b4a..e450e72d60ec 100644
--- a/include/gcc/aarch64/ck_pr_lse.h
+++ b/include/gcc/aarch64/ck_pr_lse.h
@@ -29,6 +29,7 @@
#ifndef CK_PR_AARCH64_LSE_H
#define CK_PR_AARCH64_LSE_H
+#error bite
#ifndef CK_PR_H
#error Do not include this file directly, use ck_pr.h
#endif
@@ -43,10 +44,10 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u
register uint64_t x2 __asm__ ("x2") = set[0];
register uint64_t x3 __asm__ ("x3") = set[1];
- __asm__ __volatile__("casp %0, %1, %4, %5, [%6];"
- "eor %2, %0, %7;"
- "eor %3, %1, %8;"
- "orr %2, %2, %3;"
+ __asm__ __volatile__("casp %0, %1, %4, %5, [%6]\n"
+ "eor %2, %0, %7\n"
+ "eor %3, %1, %8\n"
+ "orr %2, %2, %3\n"
: "+&r" (x0), "+&r" (x1), "=&r" (tmp1), "=&r" (tmp2)
: "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1])
: "memory");
@@ -74,10 +75,10 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2])
register uint64_t x2 __asm__ ("x2") = set[0];
register uint64_t x3 __asm__ ("x3") = set[1];
- __asm__ __volatile__("casp %0, %1, %2, %3, [%4];"
- "eor %0, %0, %5;"
- "eor %1, %1, %6;"
- "orr %0, %0, %1;"
+ __asm__ __volatile__("casp %0, %1, %2, %3, [%4]\n"
+ "eor %0, %0, %5\n"
+ "eor %1, %1, %6\n"
+ "orr %0, %0, %1\n"
: "+&r" (x0), "+&r" (x1)
: "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1])
: "memory");
@@ -99,7 +100,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
*(T *)value = compare; \
__asm__ __volatile__( \
- "cas" W " %" R "0, %" R "2, [%1];" \
+ "cas" W " %" R "0, %" R "2, [%1]\n"\
: "+&r" (*(T *)value) \
: "r" (target), \
"r" (set) \
@@ -111,7 +112,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set)
{ \
T previous = compare; \
__asm__ __volatile__( \
- "cas" W " %" R "0, %" R "2, [%1];" \
+ "cas" W " %" R "0, %" R "2, [%1]\n"\
: "+&r" (previous) \
: "r" (target), \
"r" (set) \
@@ -144,7 +145,7 @@ CK_PR_CAS_S(char, char, "b", "w")
{ \
T previous; \
__asm__ __volatile__( \
- "swp" W " %" R "2, %" R "0, [%1];" \
+ "swp" W " %" R "2, %" R "0, [%1]\n"\
: "=&r" (previous) \
: "r" (target), \
"r" (v) \
@@ -169,8 +170,8 @@ CK_PR_FAS(char, char, char, "b", "w")
CK_CC_INLINE static void \
ck_pr_##O##_##N(M *target) \
{ \
- __asm__ __volatile__(I ";" \
- "st" S W " " R "0, [%0];" \
+ __asm__ __volatile__(I "\n" \
+ "st" S W " " R "0, [%0]\n" \
: \
: "r" (target) \
: "x0", "memory"); \
@@ -204,8 +205,8 @@ CK_PR_UNARY_S(char, char, "b")
CK_CC_INLINE static void \
ck_pr_##O##_##N(M *target, T delta) \
{ \
- __asm__ __volatile__(I ";" \
- "st" S W " %" R "0, [%1];" \
+ __asm__ __volatile__(I "\n" \
+ "st" S W " %" R "0, [%1]\n"\
: "+&r" (delta) \
: "r" (target) \
: "memory"); \
@@ -247,7 +248,7 @@ ck_pr_faa_ptr(void *target, uintptr_t delta)
uintptr_t previous;
__asm__ __volatile__(
- "ldadd %2, %0, [%1];"
+ "ldadd %2, %0, [%1]\n"
: "=r" (previous)
: "r" (target),
"r" (delta)
@@ -262,7 +263,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
uint64_t previous;
__asm__ __volatile__(
- "ldadd %2, %0, [%1];"
+ "ldadd %2, %0, [%1]\n"
: "=r" (previous)
: "r" (target),
"r" (delta)
@@ -277,7 +278,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta)
{ \
T previous; \
__asm__ __volatile__( \
- "ldadd" W " %w2, %w0, [%1];" \
+ "ldadd" W " %w2, %w0, [%1]\n" \
: "=r" (previous) \
: "r" (target), \
"r" (delta) \
diff --git a/include/gcc/ck_cc.h b/include/gcc/ck_cc.h
index 6ebc59cb5921..0a6d17b93569 100644
--- a/include/gcc/ck_cc.h
+++ b/include/gcc/ck_cc.h
@@ -39,6 +39,15 @@
#define CK_CC_UNUSED __attribute__((unused))
#define CK_CC_USED __attribute__((used))
#define CK_CC_IMM "i"
+
+#define CK_CC_CONTAINER(F, T, M, N) \
+ CK_CC_INLINE static T * \
+ N(F *p) \
+ { \
+ \
+ return (T *)(void *)((char *)p - __builtin_offsetof(T, M)); \
+ }
+
#if defined(__x86_64__) || defined(__x86__)
#define CK_CC_IMM_U32 "Z"
#define CK_CC_IMM_S32 "e"
diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h
index e678e830e0b4..12291c830dfd 100644
--- a/include/gcc/x86/ck_pr.h
+++ b/include/gcc/x86/ck_pr.h
@@ -120,7 +120,7 @@ CK_PR_FENCE(unlock, CK_MD_X86_MFENCE)
return v; \
}
-CK_PR_FAS(ptr, void, void *, char, "xchgl")
+CK_PR_FAS(ptr, void, void *, uint32_t, "xchgl")
#define CK_PR_FAS_S(S, T, I) CK_PR_FAS(S, T, T, T, I)
@@ -146,7 +146,7 @@ CK_PR_FAS_S(8, uint8_t, "xchgb")
return (r); \
}
-CK_PR_LOAD(ptr, void, void *, char, "movl")
+CK_PR_LOAD(ptr, void, void *, uint32_t, "movl")
#define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I)
@@ -171,7 +171,7 @@ CK_PR_LOAD_S(8, uint8_t, "movb")
return; \
}
-CK_PR_STORE(ptr, void, const void *, char, "movl")
+CK_PR_STORE(ptr, void, const void *, uint32_t, "movl")
#define CK_PR_STORE_S(S, T, I) CK_PR_STORE(S, T, T, T, I)
@@ -200,7 +200,7 @@ CK_PR_STORE_S(8, uint8_t, "movb")
return (d); \
}
-CK_PR_FAA(ptr, void, uintptr_t, char, "xaddl")
+CK_PR_FAA(ptr, void, uintptr_t, uint32_t, "xaddl")
#define CK_PR_FAA_S(S, T, I) CK_PR_FAA(S, T, T, T, I)
@@ -239,7 +239,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb")
bool ret; \
__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \
: "+m" (*(C *)target), \
- "=rm" (ret) \
+ "=qm" (ret) \
: \
: "memory", "cc"); \
return ret; \
@@ -248,7 +248,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb")
#define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I)
#define CK_PR_GENERATE(K) \
- CK_PR_UNARY(K, ptr, void, char, #K "l") \
+ CK_PR_UNARY(K, ptr, void, uint32_t, #K "l") \
CK_PR_UNARY_S(K, char, char, #K "b") \
CK_PR_UNARY_S(K, int, int, #K "l") \
CK_PR_UNARY_S(K, uint, unsigned int, #K "l") \
@@ -288,7 +288,7 @@ CK_PR_GENERATE(not)
#define CK_PR_BINARY_S(K, S, T, I) CK_PR_BINARY(K, S, T, T, T, I)
#define CK_PR_GENERATE(K) \
- CK_PR_BINARY(K, ptr, void, uintptr_t, char, #K "l") \
+ CK_PR_BINARY(K, ptr, void, uintptr_t, uint32_t, #K "l") \
CK_PR_BINARY_S(K, char, char, #K "b") \
CK_PR_BINARY_S(K, int, int, #K "l") \
CK_PR_BINARY_S(K, uint, unsigned int, #K "l") \
@@ -307,8 +307,38 @@ CK_PR_GENERATE(xor)
#undef CK_PR_BINARY
/*
- * Atomic compare and swap.
+ * Atomic compare and swap, with a variant that sets *v to the old value of target.
*/
+#ifdef __GCC_ASM_FLAG_OUTPUTS__
+#define CK_PR_CAS(S, M, T, C, I) \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S(M *target, T compare, T set) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \
+ : "+m" (*(C *)target), \
+ "=@ccz" (z), \
+ /* EAX is clobbered by cmpxchg. */ \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ return z; \
+ } \
+ \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
+ : "+m" (*(C *)target), \
+ "=@ccz" (z), \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ *(T *)v = compare; \
+ return z; \
+ }
+#else
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
@@ -321,9 +351,25 @@ CK_PR_GENERATE(xor)
"a" (compare) \
: "memory", "cc"); \
return z; \
+ } \
+ \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
+ "setz %1;" \
+ : "+m" (*(C *)target), \
+ "=q" (z), \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ *(T *)v = compare; \
+ return z; \
}
+#endif
-CK_PR_CAS(ptr, void, void *, char, "cmpxchgl")
+CK_PR_CAS(ptr, void, void *, uint32_t, "cmpxchgl")
#define CK_PR_CAS_S(S, T, I) CK_PR_CAS(S, T, T, T, I)
@@ -338,41 +384,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb")
#undef CK_PR_CAS
/*
- * Compare and swap, set *v to old value of target.
- */
-#define CK_PR_CAS_O(S, M, T, C, I, R) \
- CK_CC_INLINE static bool \
- ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
- { \
- bool z; \
- __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \
- "mov %% " R ", %2;" \
- "setz %1;" \
- : "+m" (*(C *)target), \
- "=a" (z), \
- "=m" (*(C *)v) \
- : "q" (set), \
- "a" (compare) \
- : "memory", "cc"); \
- return (bool)z; \
- }
-
-CK_PR_CAS_O(ptr, void, void *, char, "l", "eax")
-
-#define CK_PR_CAS_O_S(S, T, I, R) \
- CK_PR_CAS_O(S, T, T, T, I, R)
-
-CK_PR_CAS_O_S(char, char, "b", "al")
-CK_PR_CAS_O_S(int, int, "l", "eax")
-CK_PR_CAS_O_S(uint, unsigned int, "l", "eax")
-CK_PR_CAS_O_S(32, uint32_t, "l", "eax")
-CK_PR_CAS_O_S(16, uint16_t, "w", "ax")
-CK_PR_CAS_O_S(8, uint8_t, "b", "al")
-
-#undef CK_PR_CAS_O_S
-#undef CK_PR_CAS_O
-
-/*
* Atomic bit test operations.
*/
#define CK_PR_BT(K, S, T, P, C, I) \
@@ -390,11 +401,11 @@ CK_PR_CAS_O_S(8, uint8_t, "b", "al")
#define CK_PR_BT_S(K, S, T, I) CK_PR_BT(K, S, T, T, T, I)
-#define CK_PR_GENERATE(K) \
- CK_PR_BT(K, ptr, void, uint32_t, char, #K "l %2, %0") \
- CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \
- CK_PR_BT_S(K, int, int, #K "l %2, %0") \
- CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \
+#define CK_PR_GENERATE(K) \
+ CK_PR_BT(K, ptr, void, uint32_t, uint32_t, #K "l %2, %0") \
+ CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \
+ CK_PR_BT_S(K, int, int, #K "l %2, %0") \
+ CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \
CK_PR_BT_S(K, 16, uint16_t, #K "w %w2, %0")
CK_PR_GENERATE(btc)
diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h
index fb2804e8d8e5..37678b12b44a 100644
--- a/include/gcc/x86_64/ck_pr.h
+++ b/include/gcc/x86_64/ck_pr.h
@@ -149,7 +149,7 @@ ck_pr_rfo(const void *m)
return v; \
}
-CK_PR_FAS(ptr, void, void *, char, "xchgq")
+CK_PR_FAS(ptr, void, void *, uint64_t, "xchgq")
#define CK_PR_FAS_S(S, T, I) CK_PR_FAS(S, T, T, T, I)
@@ -182,7 +182,7 @@ CK_PR_FAS_S(8, uint8_t, "xchgb")
return (r); \
}
-CK_PR_LOAD(ptr, void, void *, char, "movq")
+CK_PR_LOAD(ptr, void, void *, uint64_t, "movq")
#define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I)
@@ -264,7 +264,7 @@ CK_PR_LOAD_2(8, 16, uint8_t)
return; \
}
-CK_PR_STORE_IMM(ptr, void, const void *, char, "movq", CK_CC_IMM_U32)
+CK_PR_STORE_IMM(ptr, void, const void *, uint64_t, "movq", CK_CC_IMM_U32)
#ifndef CK_PR_DISABLE_DOUBLE
CK_PR_STORE(double, double, double, double, "movq")
#endif
@@ -298,7 +298,7 @@ CK_PR_STORE_S(8, uint8_t, "movb", CK_CC_IMM_U32)
return (d); \
}
-CK_PR_FAA(ptr, void, uintptr_t, char, "xaddq")
+CK_PR_FAA(ptr, void, uintptr_t, uint64_t, "xaddq")
#define CK_PR_FAA_S(S, T, I) CK_PR_FAA(S, T, T, T, I)
@@ -347,7 +347,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb")
#define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I)
#define CK_PR_GENERATE(K) \
- CK_PR_UNARY(K, ptr, void, char, #K "q") \
+ CK_PR_UNARY(K, ptr, void, uint64_t, #K "q") \
CK_PR_UNARY_S(K, char, char, #K "b") \
CK_PR_UNARY_S(K, int, int, #K "l") \
CK_PR_UNARY_S(K, uint, unsigned int, #K "l") \
@@ -388,7 +388,7 @@ CK_PR_GENERATE(not)
#define CK_PR_BINARY_S(K, S, T, I, O) CK_PR_BINARY(K, S, T, T, T, I, O)
#define CK_PR_GENERATE(K) \
- CK_PR_BINARY(K, ptr, void, uintptr_t, char, #K "q", CK_CC_IMM_U32) \
+ CK_PR_BINARY(K, ptr, void, uintptr_t, uint64_t, #K "q", CK_CC_IMM_U32) \
CK_PR_BINARY_S(K, char, char, #K "b", CK_CC_IMM_S32) \
CK_PR_BINARY_S(K, int, int, #K "l", CK_CC_IMM_S32) \
CK_PR_BINARY_S(K, uint, unsigned int, #K "l", CK_CC_IMM_U32) \
@@ -408,8 +408,38 @@ CK_PR_GENERATE(xor)
#undef CK_PR_BINARY
/*
- * Atomic compare and swap.
+ * Atomic compare and swap, with a variant that sets *v to the old value of target.
*/
+#ifdef __GCC_ASM_FLAG_OUTPUTS__
+#define CK_PR_CAS(S, M, T, C, I) \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S(M *target, T compare, T set) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \
+ : "+m" (*(C *)target), \
+ "=@ccz" (z), \
+ /* RAX is clobbered by cmpxchg. */ \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ return z; \
+ } \
+ \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
+ : "+m" (*(C *)target), \
+ "=@ccz" (z), \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ *(T *)v = compare; \
+ return z; \
+ }
+#else
#define CK_PR_CAS(S, M, T, C, I) \
CK_CC_INLINE static bool \
ck_pr_cas_##S(M *target, T compare, T set) \
@@ -422,9 +452,25 @@ CK_PR_GENERATE(xor)
"a" (compare) \
: "memory", "cc"); \
return z; \
+ } \
+ \
+ CK_CC_INLINE static bool \
+ ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
+ { \
+ bool z; \
+ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \
+ "setz %1;" \
+ : "+m" (*(C *)target), \
+ "=q" (z), \
+ "+a" (compare) \
+ : "q" (set) \
+ : "memory", "cc"); \
+ *(T *)v = compare; \
+ return z; \
}
+#endif
-CK_PR_CAS(ptr, void, void *, char, "cmpxchgq")
+CK_PR_CAS(ptr, void, void *, uint64_t, "cmpxchgq")
#define CK_PR_CAS_S(S, T, I) CK_PR_CAS(S, T, T, T, I)
@@ -443,45 +489,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb")
#undef CK_PR_CAS
/*
- * Compare and swap, set *v to old value of target.
- */
-#define CK_PR_CAS_O(S, M, T, C, I, R) \
- CK_CC_INLINE static bool \
- ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \
- { \
- bool z; \
- __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \
- "mov %% " R ", %2;" \
- "setz %1;" \
- : "+m" (*(C *)target), \
- "=a" (z), \
- "=m" (*(C *)v) \
- : "q" (set), \
- "a" (compare) \
- : "memory", "cc"); \
- return z; \
- }
-
-CK_PR_CAS_O(ptr, void, void *, char, "q", "rax")
-
-#define CK_PR_CAS_O_S(S, T, I, R) \
- CK_PR_CAS_O(S, T, T, T, I, R)
-
-CK_PR_CAS_O_S(char, char, "b", "al")
-CK_PR_CAS_O_S(int, int, "l", "eax")
-CK_PR_CAS_O_S(uint, unsigned int, "l", "eax")
-#ifndef CK_PR_DISABLE_DOUBLE
-CK_PR_CAS_O_S(double, double, "q", "rax")
-#endif
-CK_PR_CAS_O_S(64, uint64_t, "q", "rax")
-CK_PR_CAS_O_S(32, uint32_t, "l", "eax")
-CK_PR_CAS_O_S(16, uint16_t, "w", "ax")
-CK_PR_CAS_O_S(8, uint8_t, "b", "al")
-
-#undef CK_PR_CAS_O_S
-#undef CK_PR_CAS_O
-
-/*
* Contrary to C-interface, alignment requirements are that of uint64_t[2].
*/
CK_CC_INLINE static bool
@@ -587,12 +594,12 @@ CK_PR_CAS_V(8, 16, uint8_t)
#define CK_PR_BT_S(K, S, T, I) CK_PR_BT(K, S, T, T, T, I)
-#define CK_PR_GENERATE(K) \
- CK_PR_BT(K, ptr, void, uint64_t, char, #K "q %2, %0") \
- CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \
- CK_PR_BT_S(K, int, int, #K "l %2, %0") \
- CK_PR_BT_S(K, 64, uint64_t, #K "q %2, %0") \
- CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \
+#define CK_PR_GENERATE(K) \
+ CK_PR_BT(K, ptr, void, uint64_t, uint64_t, #K "q %2, %0") \
+ CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \
+ CK_PR_BT_S(K, int, int, #K "l %2, %0") \
+ CK_PR_BT_S(K, 64, uint64_t, #K "q %2, %0") \
+ CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \
CK_PR_BT_S(K, 16, uint16_t, #K "w %w2, %0")
CK_PR_GENERATE(btc)
diff --git a/include/spinlock/fas.h b/include/spinlock/fas.h
index 4e6c1230eaf1..bfe91fed2f9f 100644
--- a/include/spinlock/fas.h
+++ b/include/spinlock/fas.h
@@ -77,10 +77,11 @@ CK_CC_INLINE static void
ck_spinlock_fas_lock(struct ck_spinlock_fas *lock)
{
- while (ck_pr_fas_uint(&lock->value, true) == true) {
- while (ck_pr_load_uint(&lock->value) == true)
- ck_pr_stall();
- }
+ while (CK_CC_UNLIKELY(ck_pr_fas_uint(&lock->value, true) == true)) {
+ do {
+ ck_pr_stall();
+ } while (ck_pr_load_uint(&lock->value) == true);
+ }
ck_pr_fence_lock();
return;