#pragma once

// UE4-adapted version of https://github.com/max0x7ba/atomic_queue
// Copyright (c) 2019 Maxim Egorushkin. MIT License. See the full licence in file LICENSE.

#include "CoreTypes.h" // PLATFORM_CACHE_LINE_SIZE, PLATFORM_CPU_ARM_FAMILY, LIKELY()/UNLIKELY()

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>

#if PLATFORM_USE_SSE2_FOR_THREAD_YIELD
#include <emmintrin.h> // _mm_pause
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace atomic_queue {

using std::uint32_t;
using std::uint64_t;
using std::uint8_t;

static inline void spin_loop_pause() noexcept {
    // TODO(andriy): x86/x64 only
#if PLATFORM_USE_SSE2_FOR_THREAD_YIELD
    _mm_pause();
#elif PLATFORM_CPU_ARM_FAMILY
#   if _MSC_VER
    __yield();
#   else
    __asm__ __volatile__("yield");
#   endif
#else
#error Implement this for your platform/architecture
#endif
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace details {

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<size_t elements_per_cache_line> struct GetCacheLineIndexBits { static int constexpr value = 0; };
template<> struct GetCacheLineIndexBits<64> { static int constexpr value = 6; };
template<> struct GetCacheLineIndexBits<32> { static int constexpr value = 5; };
template<> struct GetCacheLineIndexBits<16> { static int constexpr value = 4; };
template<> struct GetCacheLineIndexBits< 8> { static int constexpr value = 3; };
template<> struct GetCacheLineIndexBits< 4> { static int constexpr value = 2; };
template<> struct GetCacheLineIndexBits< 2> { static int constexpr value = 1; };

template<bool minimize_contention, unsigned array_size, size_t elements_per_cache_line>
struct GetIndexShuffleBits {
    static int constexpr bits = GetCacheLineIndexBits<elements_per_cache_line>::value;
    static unsigned constexpr min_size = 1u << (bits * 2);
    static int constexpr value = array_size < min_size ? 0 : bits;
};

template<unsigned array_size, size_t elements_per_cache_line>
struct GetIndexShuffleBits<false, array_size, elements_per_cache_line> {
    static int constexpr value = 0;
};

// Multiple writers/readers contend on the same cache line when storing/loading elements at
// subsequent indexes, aka false sharing. For power of 2 ring buffer size it is possible to re-map
// the index in such a way that each subsequent element resides on another cache line, which
// minimizes contention. This is done by swapping the lowest order N bits (which are the index of
// the element within the cache line) with the next N bits (which are the index of the cache line)
// of the element index.
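//
// For example, with BITS = 3 (a 64-byte cache line holding eight 8-byte elements) the low 3 bits
// (element-within-line) and the next 3 bits (cache-line index) are swapped, so consecutive logical
// indices land on different cache lines:
//   remap_index<3>(0) == 0,  remap_index<3>(1) == 8,  remap_index<3>(2) == 16, ...
//   remap_index<3>(8) == 1,  remap_index<3>(9) == 9,  remap_index<3>(10) == 17.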
template<int BITS>
constexpr unsigned remap_index(unsigned index) noexcept {
    constexpr unsigned MASK = (1u << BITS) - 1;
    unsigned mix = (index ^ (index >> BITS)) & MASK;
    return index ^ mix ^ (mix << BITS);
}

template<>
constexpr unsigned remap_index<0>(unsigned index) noexcept {
    return index;
}

template<int BITS, class T>
constexpr T& map(T* elements, unsigned index) noexcept {
    index = remap_index<BITS>(index);
    return elements[index];
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

constexpr uint32_t round_up_to_power_of_2(uint32_t a) noexcept {
    --a;
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    ++a;
    return a;
}

constexpr uint64_t round_up_to_power_of_2(uint64_t a) noexcept {
    --a;
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    a |= a >> 32;
    ++a;
    return a;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace details

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class Derived>
class AtomicQueueCommon {
protected:
    // Put these on different cache lines to avoid false sharing between readers and writers.
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned> head_ = {};
    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned> tail_ = {};

    // The special member functions are not thread-safe.

    AtomicQueueCommon() noexcept = default;

    AtomicQueueCommon(AtomicQueueCommon const& b) noexcept
        : head_(b.head_.load(std::memory_order_relaxed))
        , tail_(b.tail_.load(std::memory_order_relaxed)) {}

    AtomicQueueCommon& operator=(AtomicQueueCommon const& b) noexcept {
        head_.store(b.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        tail_.store(b.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }

    void swap(AtomicQueueCommon& b) noexcept {
        unsigned h = head_.load(std::memory_order_relaxed);
        unsigned t = tail_.load(std::memory_order_relaxed);
        head_.store(b.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        tail_.store(b.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
        b.head_.store(h, std::memory_order_relaxed);
        b.tail_.store(t, std::memory_order_relaxed);
    }

    template<class T, T NIL>
    static T do_pop_atomic(std::atomic<T>& q_element) noexcept {
        if (Derived::spsc_) {
            for (;;) {
                T element = q_element.load(std::memory_order_relaxed);
                if (LIKELY(element != NIL)) {
                    q_element.store(NIL, std::memory_order_release);
                    return element;
                }
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            }
        }
        else {
            for (;;) {
                T element = q_element.exchange(NIL, std::memory_order_release); // (2) The store to wait for.
                if (LIKELY(element != NIL))
                    return element;
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && q_element.load(std::memory_order_relaxed) == NIL);
            }
        }
    }

    template<class T, T NIL>
    static void do_push_atomic(T element, std::atomic<T>& q_element) noexcept {
        assert(element != NIL);
        if (Derived::spsc_) {
            while (UNLIKELY(q_element.load(std::memory_order_relaxed) != NIL))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            q_element.store(element, std::memory_order_release);
        }
        else {
            for (T expected = NIL;
                 UNLIKELY(!q_element.compare_exchange_strong(expected, element, std::memory_order_release, std::memory_order_relaxed));
                 expected = NIL) {
                do
                    spin_loop_pause(); // (1) Wait for store (2) to complete.
                while (Derived::maximize_throughput_ && q_element.load(std::memory_order_relaxed) != NIL);
            }
        }
    }

    enum State : unsigned char { EMPTY, STORING, STORED, LOADING };

    template<class T>
    static T do_pop_any(std::atomic<unsigned char>& state, T& q_element) noexcept {
        if (Derived::spsc_) {
            while (UNLIKELY(state.load(std::memory_order_acquire) != STORED))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            T element{ std::move(q_element) };
            state.store(EMPTY, std::memory_order_release);
            return element;
        }
        else {
            for (;;) {
                unsigned char expected = STORED;
                if (LIKELY(state.compare_exchange_strong(expected, LOADING, std::memory_order_relaxed, std::memory_order_relaxed))) {
                    T element{ std::move(q_element) };
                    state.store(EMPTY, std::memory_order_release);
                    return element;
                }
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && state.load(std::memory_order_relaxed) != STORED);
            }
        }
    }

    template<class U, class T>
    static void do_push_any(U&& element, std::atomic<unsigned char>& state, T& q_element) noexcept {
        if (Derived::spsc_) {
            while (UNLIKELY(state.load(std::memory_order_acquire) != EMPTY))
                if (Derived::maximize_throughput_)
                    spin_loop_pause();
            q_element = std::forward<U>(element);
            state.store(STORED, std::memory_order_release);
        }
        else {
            for (;;) {
                unsigned char expected = EMPTY;
                if (LIKELY(state.compare_exchange_strong(expected, STORING, std::memory_order_relaxed, std::memory_order_relaxed))) {
                    q_element = std::forward<U>(element);
                    state.store(STORED, std::memory_order_release);
                    return;
                }
                // Do speculative loads while busy-waiting to avoid broadcasting RFO messages.
                do
                    spin_loop_pause();
                while (Derived::maximize_throughput_ && state.load(std::memory_order_relaxed) != EMPTY);
            }
        }
    }

public:
    template<class T>
    bool try_push(T&& element) noexcept {
        auto head = head_.load(std::memory_order_relaxed);
        if (Derived::spsc_) {
            if (static_cast<int>(head - tail_.load(std::memory_order_relaxed)) >= static_cast<int>(static_cast<Derived&>(*this).size_))
                return false;
            head_.store(head + 1, std::memory_order_relaxed);
        }
        else {
            do {
                if (static_cast<int>(head - tail_.load(std::memory_order_relaxed)) >= static_cast<int>(static_cast<Derived&>(*this).size_))
                    return false;
            } while (UNLIKELY(!head_.compare_exchange_strong(head, head + 1, std::memory_order_acquire, std::memory_order_relaxed))); // This loop is not FIFO.
        }
        static_cast<Derived&>(*this).do_push(std::forward<T>(element), head);
        return true;
    }

    template<class T>
    bool try_pop(T& element) noexcept {
        auto tail = tail_.load(std::memory_order_relaxed);
        if (Derived::spsc_) {
            if (static_cast<int>(head_.load(std::memory_order_relaxed) - tail) <= 0)
                return false;
            tail_.store(tail + 1, std::memory_order_relaxed);
        }
        else {
            do {
                if (static_cast<int>(head_.load(std::memory_order_relaxed) - tail) <= 0)
                    return false;
            } while (UNLIKELY(!tail_.compare_exchange_strong(tail, tail + 1, std::memory_order_acquire, std::memory_order_relaxed))); // This loop is not FIFO.
        }
        element = static_cast<Derived&>(*this).do_pop(tail);
        return true;
    }

    template<class T>
    void push(T&& element) noexcept {
        unsigned head;
        if (Derived::spsc_) {
            head = head_.load(std::memory_order_relaxed);
            head_.store(head + 1, std::memory_order_relaxed);
        }
        else {
            constexpr auto memory_order = Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_acquire;
            head = head_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019.
        }
        static_cast<Derived&>(*this).do_push(std::forward<T>(element), head);
    }

    auto pop() noexcept {
        unsigned tail;
        if (Derived::spsc_) {
            tail = tail_.load(std::memory_order_relaxed);
            tail_.store(tail + 1, std::memory_order_relaxed);
        }
        else {
            constexpr auto memory_order = Derived::total_order_ ? std::memory_order_seq_cst : std::memory_order_acquire;
            tail = tail_.fetch_add(1, memory_order); // FIFO and total order on Intel regardless, as of 2019.
        }
        return static_cast<Derived&>(*this).do_pop(tail);
    }

    bool was_empty() const noexcept {
        return static_cast<int>(head_.load(std::memory_order_relaxed) - tail_.load(std::memory_order_relaxed)) <= 0;
    }

    bool was_full() const noexcept {
        return static_cast<int>(head_.load(std::memory_order_relaxed) - tail_.load(std::memory_order_relaxed)) >=
               static_cast<int>(static_cast<Derived const&>(*this).size_);
    }

    unsigned capacity() const noexcept { return static_cast<Derived const&>(*this).size_; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class T, unsigned SIZE, T NIL = T{}, bool MINIMIZE_CONTENTION = true, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueue : public AtomicQueueCommon<AtomicQueue<T, SIZE, NIL, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>> {
    using Base = AtomicQueueCommon<AtomicQueue<T, SIZE, NIL, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    friend Base;

    static constexpr unsigned size_ = MINIMIZE_CONTENTION ? details::round_up_to_power_of_2(SIZE) : SIZE;
    static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits<MINIMIZE_CONTENTION, size_, PLATFORM_CACHE_LINE_SIZE / sizeof(std::atomic<T>)>::value;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;

    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<T> elements_[size_] = {}; // Empty elements are NIL.

    T do_pop(unsigned tail) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, tail % size_);
        return Base::template do_pop_atomic<T, NIL>(q_element);
    }

    void do_push(T element, unsigned head) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, head % size_);
        Base::template do_push_atomic<T, NIL>(element, q_element);
    }

public:
    using value_type = T;

    AtomicQueue() noexcept {
        assert(std::atomic<T>{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueue2 is for non-atomic ones.
        if (T{} != NIL)
            for (auto& element : elements_)
                element.store(NIL, std::memory_order_relaxed);
    }

    AtomicQueue(AtomicQueue const&) = delete;
    AtomicQueue& operator=(AtomicQueue const&) = delete;
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class T, unsigned SIZE, bool MINIMIZE_CONTENTION = true, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueue2 : public AtomicQueueCommon<AtomicQueue2<T, SIZE, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>> {
    using Base = AtomicQueueCommon<AtomicQueue2<T, SIZE, MINIMIZE_CONTENTION, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    using State = typename Base::State;
    friend Base;

    static constexpr unsigned size_ = MINIMIZE_CONTENTION ?
        details::round_up_to_power_of_2(SIZE) : SIZE;
    static constexpr int SHUFFLE_BITS = details::GetIndexShuffleBits<MINIMIZE_CONTENTION, size_, PLATFORM_CACHE_LINE_SIZE / sizeof(State)>::value;
    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;

    alignas(PLATFORM_CACHE_LINE_SIZE) std::atomic<unsigned char> states_[size_] = {};
    alignas(PLATFORM_CACHE_LINE_SIZE) T elements_[size_] = {};

    T do_pop(unsigned tail) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(tail % size_);
        return Base::template do_pop_any(states_[index], elements_[index]);
    }

    template<class U>
    void do_push(U&& element, unsigned head) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(head % size_);
        Base::template do_push_any(std::forward<U>(element), states_[index], elements_[index]);
    }

public:
    using value_type = T;

    AtomicQueue2() noexcept = default;

    AtomicQueue2(AtomicQueue2 const&) = delete;
    AtomicQueue2& operator=(AtomicQueue2 const&) = delete;
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class T, class A = std::allocator<T>, T NIL = T{}, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueueB : public AtomicQueueCommon<AtomicQueueB<T, A, NIL, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>,
                     private std::allocator_traits<A>::template rebind_alloc<std::atomic<T>> {
    using Base = AtomicQueueCommon<AtomicQueueB<T, A, NIL, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    friend Base;

    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;

    using AllocatorElements = typename std::allocator_traits<A>::template rebind_alloc<std::atomic<T>>;

    static constexpr auto ELEMENTS_PER_CACHE_LINE = PLATFORM_CACHE_LINE_SIZE / sizeof(std::atomic<T>);
    static_assert(ELEMENTS_PER_CACHE_LINE, "Unexpected ELEMENTS_PER_CACHE_LINE.");

    static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits<ELEMENTS_PER_CACHE_LINE>::value;
    static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS.");

    // AtomicQueueCommon members are stored into by readers and writers.
    // Allocate these immutable members on another cache line which never gets invalidated by stores.
    alignas(PLATFORM_CACHE_LINE_SIZE) unsigned size_;
    std::atomic<T>* elements_;

    T do_pop(unsigned tail) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, tail & (size_ - 1));
        return Base::template do_pop_atomic<T, NIL>(q_element);
    }

    void do_push(T element, unsigned head) noexcept {
        std::atomic<T>& q_element = details::map<SHUFFLE_BITS>(elements_, head & (size_ - 1));
        Base::template do_push_atomic<T, NIL>(element, q_element);
    }

public:
    using value_type = T;

    // The special member functions are not thread-safe.

    AtomicQueueB(unsigned size)
        : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2)))
        , elements_(AllocatorElements::allocate(size_)) {
        assert(std::atomic<T>{NIL}.is_lock_free()); // This queue is for atomic elements only. AtomicQueueB2 is for non-atomic ones.
        for (auto p = elements_, q = elements_ + size_; p < q; ++p)
            p->store(NIL, std::memory_order_relaxed);
    }

    AtomicQueueB(AtomicQueueB&& b) noexcept
        : Base(static_cast<Base&&>(b))
        , AllocatorElements(static_cast<AllocatorElements&&>(b)) // TODO: This must be noexcept, static_assert that.
        , size_(b.size_)
        , elements_(b.elements_) {
        b.size_ = 0;
        b.elements_ = 0;
    }

    AtomicQueueB& operator=(AtomicQueueB&& b) noexcept {
        b.swap(*this);
        return *this;
    }

    ~AtomicQueueB() noexcept {
        if (elements_)
            AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that.
    }

    void swap(AtomicQueueB& b) noexcept {
        using std::swap;
        this->Base::swap(b);
        swap(static_cast<AllocatorElements&>(*this), static_cast<AllocatorElements&>(b));
        swap(size_, b.size_);
        swap(elements_, b.elements_);
    }

    friend void swap(AtomicQueueB& a, AtomicQueueB& b) { a.swap(b); }
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class T, class A = std::allocator<T>, bool MAXIMIZE_THROUGHPUT = true, bool TOTAL_ORDER = false, bool SPSC = false>
class AtomicQueueB2 : public AtomicQueueCommon<AtomicQueueB2<T, A, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>,
                      private A,
                      private std::allocator_traits<A>::template rebind_alloc<std::atomic<unsigned char>> {
    using Base = AtomicQueueCommon<AtomicQueueB2<T, A, MAXIMIZE_THROUGHPUT, TOTAL_ORDER, SPSC>>;
    using State = typename Base::State;
    friend Base;

    static constexpr bool total_order_ = TOTAL_ORDER;
    static constexpr bool spsc_ = SPSC;
    static constexpr bool maximize_throughput_ = MAXIMIZE_THROUGHPUT;

    using AllocatorElements = A;
    using AllocatorStates = typename std::allocator_traits<A>::template rebind_alloc<std::atomic<unsigned char>>;

    // AtomicQueueCommon members are stored into by readers and writers.
    // Allocate these immutable members on another cache line which never gets invalidated by stores.
    alignas(PLATFORM_CACHE_LINE_SIZE) unsigned size_;
    std::atomic<unsigned char>* states_;
    T* elements_;

    static constexpr auto STATES_PER_CACHE_LINE = PLATFORM_CACHE_LINE_SIZE / sizeof(State);
    static_assert(STATES_PER_CACHE_LINE, "Unexpected STATES_PER_CACHE_LINE.");

    static constexpr auto SHUFFLE_BITS = details::GetCacheLineIndexBits<STATES_PER_CACHE_LINE>::value;
    static_assert(SHUFFLE_BITS, "Unexpected SHUFFLE_BITS.");

    T do_pop(unsigned tail) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(tail & (size_ - 1));
        return Base::template do_pop_any(states_[index], elements_[index]);
    }

    template<class U>
    void do_push(U&& element, unsigned head) noexcept {
        unsigned index = details::remap_index<SHUFFLE_BITS>(head & (size_ - 1));
        Base::template do_push_any(std::forward<U>(element), states_[index], elements_[index]);
    }

public:
    using value_type = T;

    // The special member functions are not thread-safe.

    AtomicQueueB2(unsigned size)
        : size_(std::max(details::round_up_to_power_of_2(size), 1u << (SHUFFLE_BITS * 2)))
        , states_(AllocatorStates::allocate(size_))
        , elements_(AllocatorElements::allocate(size_)) {
        for (auto p = states_, q = states_ + size_; p < q; ++p)
            p->store(Base::EMPTY, std::memory_order_relaxed);

        AllocatorElements& ae = *this;
        for (auto p = elements_, q = elements_ + size_; p < q; ++p)
            std::allocator_traits<AllocatorElements>::construct(ae, p);
    }

    AtomicQueueB2(AtomicQueueB2&& b) noexcept
        : Base(static_cast<Base&&>(b))
        , AllocatorElements(static_cast<AllocatorElements&&>(b)) // TODO: This must be noexcept, static_assert that.
        , AllocatorStates(static_cast<AllocatorStates&&>(b)) // TODO: This must be noexcept, static_assert that.
        , size_(b.size_)
        , states_(b.states_)
        , elements_(b.elements_) {
        b.size_ = 0;
        b.states_ = 0;
        b.elements_ = 0;
    }

    AtomicQueueB2& operator=(AtomicQueueB2&& b) noexcept {
        b.swap(*this);
        return *this;
    }

    ~AtomicQueueB2() noexcept {
        if (elements_) {
            AllocatorElements& ae = *this;
            for (auto p = elements_, q = elements_ + size_; p < q; ++p)
                std::allocator_traits<AllocatorElements>::destroy(ae, p);
            AllocatorElements::deallocate(elements_, size_); // TODO: This must be noexcept, static_assert that.
            AllocatorStates::deallocate(states_, size_); // TODO: This must be noexcept, static_assert that.
        }
    }

    void swap(AtomicQueueB2& b) noexcept {
        using std::swap;
        this->Base::swap(b);
        swap(static_cast<AllocatorElements&>(*this), static_cast<AllocatorElements&>(b));
        swap(static_cast<AllocatorStates&>(*this), static_cast<AllocatorStates&>(b));
        swap(size_, b.size_);
        swap(states_, b.states_);
        swap(elements_, b.elements_);
    }

    friend void swap(AtomicQueueB2& a, AtomicQueueB2& b) noexcept { a.swap(b); }
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<class Queue>
struct RetryDecorator : Queue {
    using T = typename Queue::value_type;

    using Queue::Queue;

    void push(T element) noexcept {
        while (!this->try_push(element))
            spin_loop_pause();
    }

    T pop() noexcept {
        T element;
        while (!this->try_pop(element))
            spin_loop_pause();
        return element;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace atomic_queue
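
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Usage sketch (illustrative only; the variable names below are caller-side examples, not part of
// this header). AtomicQueue stores raw atomic elements and reserves one value (NIL, T{} by default)
// as the empty-slot marker, so that value must never be pushed; AtomicQueue2 has no such restriction
// and works with ordinary default-constructible, movable element types. push()/pop() busy-wait
// instead of failing; try_push()/try_pop() return false when the queue is full/empty.
//
//   atomic_queue::AtomicQueue<uint32_t, 1024> Queue;  // fixed capacity 1024, NIL == 0
//   bool bPushed = Queue.try_push(42u);               // false when the queue is full
//   uint32_t Value = 0;
//   bool bPopped = Queue.try_pop(Value);              // false when the queue is empty
//
//   atomic_queue::AtomicQueue2<FString, 1024> StringQueue; // for non-atomic element types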