// UnrealEngine/Engine/Source/ThirdParty/AtomicQueue/AtomicQueue.h
#pragma once
// UE4-adapted version of https://github.com/max0x7ba/atomic_queue
// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full license in file LICENSE.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <type_traits>
#include <utility>
#include <atomic>
#if PLATFORM_USE_SSE2_FOR_THREAD_YIELD
#include <emmintrin.h>
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace atomic_queue {
using std::uint32_t;
using std::uint64_t;
using std::uint8_t;
// De-prioritize the calling hardware thread while busy-waiting (x86 PAUSE / ARM YIELD).
static inline void spin_loop_pause() noexcept {
#if PLATFORM_USE_SSE2_FOR_THREAD_YIELD
    _mm_pause();
#elif PLATFORM_CPU_ARM_FAMILY
# if _MSC_VER
    __yield();
# else
    __asm__ __volatile__("yield");
# endif
#else
#error Implement this for your platform/architecture
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace details {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<size_t elements_per_cache_line> struct GetCacheLineIndexBits { static int constexpr value = 0; };
template<> struct GetCacheLineIndexBits<64> { static int constexpr value = 6; };
template<> struct GetCacheLineIndexBits<32> { static int constexpr value = 5; };
template<> struct GetCacheLineIndexBits<16> { static int constexpr value = 4; };
template<> struct GetCacheLineIndexBits< 8> { static int constexpr value = 3; };
template<> struct GetCacheLineIndexBits< 4> { static int constexpr value = 2; };
template<> struct GetCacheLineIndexBits< 2> { static int constexpr value = 1; };
template<bool minimize_contention, unsigned array_size, size_t elements_per_cache_line>
struct GetIndexShuffleBits {
    static int constexpr bits = GetCacheLineIndexBits<elements_per_cache_line>::value;
    static unsigned constexpr min_size = 1u << (bits * 2);
    static int constexpr value = array_size < min_size ? 0 : bits;
};
template<unsigned array_size, size_t elements_per_cache_line>
struct GetIndexShuffleBits<false, array_size, elements_per_cache_line> {
    static int constexpr value = 0;
};
// Multiple writers/readers contend on the same cache line when storing/loading elements at
// subsequent indexes, aka false sharing. For power of 2 ring buffer size it is possible to re-map
// the index in such a way that each subsequent element resides on another cache line, which
// minimizes contention. This is done by swapping the lowest order N bits (which are the index of
// the element within the cache line) with the next N bits (which are the index of the cache line)
// of the element index.
template<int BITS>
constexpr unsigned remap_index(unsigned index) noexcept {
    constexpr unsigned MASK = (1u << BITS) - 1;
    unsigned mix = (index ^ (index >> BITS)) & MASK;
    return index ^ mix ^ (mix << BITS);
}
template<>
constexpr unsigned remap_index<0>(unsigned index) noexcept {
    return index;
}
template<int BITS, class T>
constexpr T& map(T* elements, unsigned index) noexcept {
    index = remap_index<BITS>(index);
    return elements[index];
}
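// Worked example: with BITS == 2 (4 elements per cache line) the two low index bits
// swap with the next two, so consecutive indexes land on different cache lines:
//   remap_index<2>(0) == 0, remap_index<2>(1) == 4, remap_index<2>(2) == 8,
//   remap_index<2>(3) == 12, remap_index<2>(4) == 1, remap_index<2>(5) == 5.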
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
constexpr uint32_t round_up_to_power_of_2(uint32_t a) noexcept {
    --a;
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    ++a;
    return a;
}
constexpr uint64_t round_up_to_power_of_2(uint64_t a) noexcept {
    --a;
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    a |= a >> 32;
    ++a;
    return a;
}
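// Worked example: round_up_to_power_of_2(5): 5 - 1 == 0b100; OR-ing in the shifted
// copies smears the top set bit rightwards, giving 0b111; incrementing yields
// 0b1000 == 8. Exact powers of 2 map to themselves (8 -> 8); note that 0 wraps to 0.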
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace details
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Derived>
class AtomicQueueCommon {
protected:
    // Put these on different cache lines to avoid false sharing between readers and writers.
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned> head_ = {};
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned> tail_ = {};
    // The special member functions are not thread-safe.
    AtomicQueueCommon() noexcept = default;
    AtomicQueueCommon(AtomicQueueCommon const& b) noexcept
        : head_(b.head_.load(std::memory_order_relaxed))
        , tail_(b.tail_.load(std::memory_order_relaxed)) {}
    AtomicQueueCommon& operator=(AtomicQueueCommon const& b) noexcept {
        head_.store(b.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        tail_.store(b.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }
    void swap(AtomicQueueCommon& b) noexcept {
        unsigned h = head_.load(std::memory_order_relaxed);
        unsigned t = tail_.load(std::memory_order_relaxed);
        head_.store(b.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        tail_.store(b.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        b.head_.store(h, std::memory_order_relaxed);
        b.tail_.store(t, std::memory_order_relaxed);
    }
    template<class T, T NIL>
    static T do_pop_atomic(std::atomic<T>& q_element) noexcept {
        if (Derived::spsc_) {
            for (;;) {
                T element = q_element.load(std::memory_order_relaxed);
                if (LIKELY(element != NIL)) {
                    q_element.store(NIL, std::memory_order_release);
                    return element;
                }
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            }
        }
        else {
            for (;;) {
                T element = q_element.exchange(NIL, std::memory_order_release); // (2) The store to wait for.
                if (LIKELY(element != NIL))
                    return element;
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && q_element.load(std::memory_order_relaxed) == NIL);
            }
        }
    }
    template<class T, T NIL>
    static void do_push_atomic(T element, std::atomic<T>& q_element) noexcept {
        assert(element != NIL);
        if (Derived::spsc_) {
            while (UNLIKELY(q_element.load(std::memory_order_relaxed) != NIL))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            q_element.store(element, std::memory_order_release);
        }
        else {
            for (T expected = NIL; UNLIKELY(!q_element.compare_exchange_strong(expected, element, std::memory_order_release, std::memory_order_relaxed)); expected = NIL) {
                do
                    spin_loop_pause(); // (1) Wait for store (2) to complete.
                while (Derived::maximize_throughput_ && q_element.load(std::memory_order_relaxed) != NIL);
            }
        }
    }
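    // Slot protocol for the NIL-based queues: a slot holds NIL while empty.
    // do_push_atomic installs an element with a CAS from NIL; do_pop_atomic removes it
    // with an exchange back to NIL. Because the free-running head/tail counters can hand
    // a slot to the next writer before the previous reader has drained it (and vice
    // versa), each side must be prepared to spin at (1) until the matching store (2) lands.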
    enum State : unsigned char { EMPTY, STORING, STORED, LOADING };
    template<class T>
    static T do_pop_any(std::atomic<unsigned char>& state, T& q_element) noexcept {
        if (Derived::spsc_) {
            while (UNLIKELY(state.load(std::memory_order_acquire) != STORED))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            T element{ std::move(q_element) };
            state.store(EMPTY, std::memory_order_release);
            return element;
        }
        else {
            for (;;) {
                unsigned char expected = STORED;
                // Acquire on success to synchronize with the writer's release store of STORED,
                // so that reading q_element below does not race with the write to it.
                if (LIKELY(state.compare_exchange_strong(expected, LOADING, std::memory_order_acquire, std::memory_order_relaxed))) {
                    T element{ std::move(q_element) };
                    state.store(EMPTY, std::memory_order_release);
                    return element;
                }
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && state.load(std::memory_order_relaxed) != STORED);
            }
        }
    }
    template<class U, class T>
    static void do_push_any(U&& element, std::atomic<unsigned char>& state, T& q_element) noexcept {
        if (Derived::spsc_) {
            while (UNLIKELY(state.load(std::memory_order_acquire) != EMPTY))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            q_element = std::forward<U>(element);
            state.store(STORED, std::memory_order_release);
        }
        else {
            for (;;) {
                unsigned char expected = EMPTY;
                // Acquire on success to synchronize with the reader's release store of EMPTY,
                // so that writing q_element below does not race with the read from it.
                if (LIKELY(state.compare_exchange_strong(expected, STORING, std::memory_order_acquire, std::memory_order_relaxed))) {
                    q_element = std::forward<U>(element);
                    state.store(STORED, std::memory_order_release);
                    return;
                }
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && state.load(std::memory_order_relaxed) != EMPTY);
            }
        }
    }
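    // Slot protocol for the state-based queues: each slot cycles
    // EMPTY -> STORING -> STORED -> LOADING -> EMPTY. The transient STORING/LOADING
    // states grant the winner exclusive access to the non-atomic element while it is
    // copied in or moved out; the release stores of STORED/EMPTY publish the result.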
public:
    template<class T>
    bool try_push(T&& element) noexcept {
        auto head = head_.load(std::memory_order_relaxed);
        if (Derived::spsc_) {
            if (static_cast<int>(head - tail_.load(std::memory_order_relaxed)) >= static_cast<int>(static_cast<Derived&>(*this).size_))
                return false;
            head_.store(head + 1, std::memory_order_relaxed);
        }
        else {
            do {
                if (static_cast<int>(head - tail_.load(std::memory_order_relaxed)) >= static_cast<int>(static_cast<Derived&>(*this).size_))
                    return false;
            } while (UNLIKELY(!head_.compare_exchange_strong(head, head + 1, std::memory_order_acquire, std::memory_order_relaxed))); // This loop is not FIFO.
        }
        static_cast<Derived&>(*this).do_push(std::forward<T>(element), head);
        return true;
    }
    template<class T>
    bool try_pop(T& element) noexcept {
        auto tail = tail_.load(std::memory_order_relaxed);
        if (Derived::spsc_) {
            if (static_cast<int>(head_.load(std::memory_order_relaxed) - tail) <= 0)
                return false;
            tail_.store(tail + 1, std::memory_order_relaxed);
        }
        else {
            do {
                if (static_cast<int>(head_.load(std::memory_order_relaxed) - tail) <= 0)
                    return false;
            } while (UNLIKELY(!tail_.compare_exchange_strong(tail, tail + 1, std::memory_order_acquire, std::memory_order_relaxed))); // This loop is not FIFO.
        }
        element = static_cast<Derived&>(*this).do_pop(tail);
        return true;
    }
    template<class T>
    void push(T&& element) noexcept {
        unsigned head;
        if (Derived::spsc_) {
            head = head_.load(std::memory_order_relaxed);
            head_.store(head + 1, std::memory_order_relaxed);
        }
        else {
            constexpr auto memory_order = Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_acquire;
            head = head_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019.
        }
        static_cast<Derived&>(*this).do_push(std::forward<T>(element), head);
    }
    auto pop() noexcept {
        unsigned tail;
        if (Derived::spsc_) {
            tail = tail_.load(std::memory_order_relaxed);
            tail_.store(tail + 1, std::memory_order_relaxed);
        }
        else {
            constexpr auto memory_order = Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_acquire;
            tail = tail_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019.
        }
        return static_cast<Derived&>(*this).do_pop(tail);
    }
    // head_ and tail_ are free-running unsigned counters that wrap around; the signed
    // difference remains correct across the wrap, e.g. head_ == 2, tail_ == 0xFFFFFFFF
    // gives static_cast<int>(head_ - tail_) == 3 elements in flight.
    bool was_empty() const noexcept {
        return static_cast<int>(head_.load(std::memory_order_relaxed) - tail_.load(std::memory_order_relaxed)) <= 0;
    }
    bool was_full() const noexcept {
        return static_cast<int>(head_.load(std::memory_order_relaxed) - tail_.load(std::memory_order_relaxed)) >= static_cast<int>(static_cast<Derived const&>(*this).size_);
    }
    unsigned capacity() const noexcept {
        return static_cast<Derived const&>(*this).size_;
    }
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T, unsigned SIZE, T NIL = T{}, bool MINIMIZE_CONTENTION = true, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueue : public AtomicQueueCommon<AtomicQueue<T, SIZE, NIL, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>> {
    using Base = AtomicQueueCommon<AtomicQueue<T, SIZE, NIL, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    friend Base;
    static constexpr unsigned size_ = MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE;
    static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits<MINIMIZE_CONTENTION, size_, PLATFORM_CACHE_LINE_SIZE / sizeof(std::atomic<T>)>::value;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<T> elements_[size_] = {}; // Empty elements are NIL.
    T do_pop(unsigned tail) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, tail % size_);
        return Base::template do_pop_atomic<T, NIL>(q_element);
    }
    void do_push(T element, unsigned head) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, head % size_);
        Base::template do_push_atomic<T, NIL>(element, q_element);
    }
public:
    using value_type = T;
    AtomicQueue() noexcept {
        assert(std::atomic<T>{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueue2 is for non-atomic ones.
        if (T{} != NIL)
            for (auto& element : elements_)
                element.store(NIL, std::memory_order_relaxed);
    }
    AtomicQueue(AtomicQueue const&) = delete;
    AtomicQueue& operator=(AtomicQueue const&) = delete;
};
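// Usage sketch (illustrative; `producer_value` is a placeholder): a bounded MPMC queue
// of ints. NIL defaults to T{} == 0, so 0 itself must never be pushed.
//
//   atomic_queue::AtomicQueue<int, 1024> q;
//   if (!q.try_push(producer_value)) { /* queue was full */ }
//   int v;
//   if (q.try_pop(v)) { /* got v */ }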
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T, unsigned SIZE, bool MINIMIZE_CONTENTION = true, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueue2 : public AtomicQueueCommon<AtomicQueue2<T, SIZE, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>> {
    using Base = AtomicQueueCommon<AtomicQueue2<T, SIZE, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    using State = typename Base::State;
    friend Base;
    static constexpr unsigned size_ = MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE;
    static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits<MINIMIZE_CONTENTION, size_, PLATFORM_CACHE_LINE_SIZE / sizeof(State)>::value;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned char> states_[size_] = {};
    alignas(PLATFORM_CACHE_LINE_SIZE) T elements_[size_] = {};
    T do_pop(unsigned tail) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(tail % size_);
        return Base::template do_pop_any(states_[index], elements_[index]);
    }
    template<class U>
    void do_push(U&& element, unsigned head) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(head % size_);
        Base::template do_push_any(std::forward<U>(element), states_[index], elements_[index]);
    }
public:
    using value_type = T;
    AtomicQueue2() noexcept = default;
    AtomicQueue2(AtomicQueue2 const&) = delete;
    AtomicQueue2& operator=(AtomicQueue2 const&) = delete;
};
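// Usage sketch (illustrative): unlike AtomicQueue, AtomicQueue2 tracks slot occupancy
// in a separate state byte, so it needs no NIL sentinel and works for non-atomic
// element types; any value, including T{}, may be queued.
//
//   atomic_queue::AtomicQueue2<std::pair<int, int>, 256> q;
//   q.push({1, 2});   // spins while the queue is full
//   auto p = q.pop(); // spins while the queue is empty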
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T, class A = std::allocator<T>, T NIL = T{}, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueueB : public AtomicQueueCommon<AtomicQueueB<T, A, NIL, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>,
                     private std::allocator_traits<A>::template rebind_alloc<std::atomic<T>> {
    using Base = AtomicQueueCommon<AtomicQueueB<T, A, NIL, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    friend Base;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;
    using AllocatorElements = typename std::allocator_traits<A>::template rebind_alloc<std::atomic<T>>;
    static constexpr auto ELEMENTS_PER_CACHE_LINE = PLATFORM_CACHE_LINE_SIZE / sizeof(std::atomic<T>);
    static_assert(ELEMENTS_PER_CACHE_LINE, "Unexpected ELEMENTS_PER_CACHE_LINE.");
    static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits<ELEMENTS_PER_CACHE_LINE>::value;
    static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS.");
    // AtomicQueueCommon members are stored into by readers and writers.
    // Allocate these immutable members on another cache line which never gets invalidated by stores.
    alignas(PLATFORM_CACHE_LINE_SIZE) unsigned size_;
    std::atomic<T>* elements_;
    T do_pop(unsigned tail) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, tail & (size_ - 1));
        return Base::template do_pop_atomic<T, NIL>(q_element);
    }
    void do_push(T element, unsigned head) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, head & (size_ - 1));
        Base::template do_push_atomic<T, NIL>(element, q_element);
    }
public:
    using value_type = T;
    // The special member functions are not thread-safe.
    AtomicQueueB(unsigned size)
        : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2)))
        , elements_(AllocatorElements::allocate(size_)) {
        assert(std::atomic<T>{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueueB2 is for non-atomic ones.
        for (auto p = elements_, q = elements_ + size_; p < q; ++p)
            p->store(NIL, std::memory_order_relaxed);
    }
    AtomicQueueB(AtomicQueueB&& b) noexcept
        : Base(static_cast<Base&&>(b))
        , AllocatorElements(static_cast<AllocatorElements&&>(b)) // TODO: This must be noexcept, static_assert that.
        , size_(b.size_)
        , elements_(b.elements_) {
        b.size_ = 0;
        b.elements_ = nullptr;
    }
    AtomicQueueB& operator=(AtomicQueueB&& b) noexcept {
        b.swap(*this);
        return *this;
    }
    ~AtomicQueueB() noexcept {
        if (elements_)
            AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that.
    }
    void swap(AtomicQueueB& b) noexcept {
        using std::swap;
        this->Base::swap(b);
        swap(static_cast<AllocatorElements&>(*this), static_cast<AllocatorElements&>(b));
        swap(size_, b.size_);
        swap(elements_, b.elements_);
    }
    friend void swap(AtomicQueueB& a, AtomicQueueB& b) noexcept {
        a.swap(b);
    }
};
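// Usage sketch (illustrative; `capacity` and `some_ptr` are placeholders): AtomicQueueB
// sizes its ring buffer at run time; the requested capacity is rounded up to a power
// of 2 no smaller than the square of the number of elements per cache line, so the
// index shuffle has room to work. With T = void*, NIL defaults to nullptr, so only
// non-null pointers may be pushed.
//
//   atomic_queue::AtomicQueueB<void*> q(capacity);
//   q.push(some_ptr);
//   void* p = q.pop();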
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T, class A = std::allocator<T>, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueueB2 : public AtomicQueueCommon<AtomicQueueB2<T, A, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>,
                      private A,
                      private std::allocator_traits<A>::template rebind_alloc<std::atomic<unsigned char>> {
    using Base = AtomicQueueCommon<AtomicQueueB2<T, A, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    using State = typename Base::State;
    friend Base;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;
    using AllocatorElements = A;
    // Rebind to std::atomic<unsigned char> to match the type of states_ exactly.
    using AllocatorStates = typename std::allocator_traits<A>::template rebind_alloc<std::atomic<unsigned char>>;
    // AtomicQueueCommon members are stored into by readers and writers.
    // Allocate these immutable members on another cache line which never gets invalidated by stores.
    alignas(PLATFORM_CACHE_LINE_SIZE) unsigned size_;
    std::atomic<unsigned char>* states_;
    T* elements_;
    static constexpr auto STATES_PER_CACHE_LINE = PLATFORM_CACHE_LINE_SIZE / sizeof(State);
    static_assert(STATES_PER_CACHE_LINE, "Unexpected STATES_PER_CACHE_LINE.");
    static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits<STATES_PER_CACHE_LINE>::value;
    static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS.");
    T do_pop(unsigned tail) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(tail & (size_ - 1));
        return Base::template do_pop_any(states_[index], elements_[index]);
    }
    template<class U>
    void do_push(U&& element, unsigned head) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(head & (size_ - 1));
        Base::template do_push_any(std::forward<U>(element), states_[index], elements_[index]);
    }
public:
    using value_type = T;
    // The special member functions are not thread-safe.
    AtomicQueueB2(unsigned size)
        : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2)))
        , states_(AllocatorStates::allocate(size_))
        , elements_(AllocatorElements::allocate(size_)) {
        for (auto p = states_, q = states_ + size_; p < q; ++p)
            p->store(Base::EMPTY, std::memory_order_relaxed);
        AllocatorElements& ae = *this;
        for (auto p = elements_, q = elements_ + size_; p < q; ++p)
            std::allocator_traits<AllocatorElements>::construct(ae, p);
    }
    AtomicQueueB2(AtomicQueueB2&& b) noexcept
        : Base(static_cast<Base&&>(b))
        , AllocatorElements(static_cast<AllocatorElements&&>(b)) // TODO: This must be noexcept, static_assert that.
        , AllocatorStates(static_cast<AllocatorStates&&>(b)) // TODO: This must be noexcept, static_assert that.
        , size_(b.size_)
        , states_(b.states_)
        , elements_(b.elements_) {
        b.size_ = 0;
        b.states_ = nullptr;
        b.elements_ = nullptr;
    }
    AtomicQueueB2& operator=(AtomicQueueB2&& b) noexcept {
        b.swap(*this);
        return *this;
    }
    ~AtomicQueueB2() noexcept {
        if (elements_) {
            AllocatorElements& ae = *this;
            for (auto p = elements_, q = elements_ + size_; p < q; ++p)
                std::allocator_traits<AllocatorElements>::destroy(ae, p);
            AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that.
            AllocatorStates::deallocate(states_, size_); // TODO: This must be noexcept, static_assert that.
        }
    }
    void swap(AtomicQueueB2& b) noexcept {
        using std::swap;
        this->Base::swap(b);
        swap(static_cast<AllocatorElements&>(*this), static_cast<AllocatorElements&>(b));
        swap(static_cast<AllocatorStates&>(*this), static_cast<AllocatorStates&>(b));
        swap(size_, b.size_);
        swap(states_, b.states_);
        swap(elements_, b.elements_);
    }
    friend void swap(AtomicQueueB2& a, AtomicQueueB2& b) noexcept {
        a.swap(b);
    }
};
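// Usage sketch (illustrative; assumes <string> is included): AtomicQueueB2 is the
// run-time-sized counterpart of AtomicQueue2, allocating the state and element arrays
// with the (rebound) allocator; no NIL sentinel is needed.
//
//   atomic_queue::AtomicQueueB2<std::string> q(1024);
//   q.push(std::string("hello"));
//   std::string s = q.pop();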
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Queue>
struct RetryDecorator : Queue {
    using T = typename Queue::value_type;
    using Queue::Queue;
    void push(T element) noexcept {
        while (!this->try_push(element))
            spin_loop_pause();
    }
    T pop() noexcept {
        T element;
        while (!this->try_pop(element))
            spin_loop_pause();
        return element;
    }
};
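// Usage sketch (illustrative): RetryDecorator wraps any of the queues above, turning
// the non-blocking try_push/try_pop into blocking push/pop that spin until they
// succeed. For this instantiation NIL defaults to 0u, so 0 must not be pushed.
//
//   atomic_queue::RetryDecorator<atomic_queue::AtomicQueue<unsigned, 64>> q;
//   q.push(1u);           // spins while the queue is full
//   unsigned v = q.pop(); // spins while the queue is empty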
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace atomic_queue