A fast lock-free queue for C++


http://moodycamel.com/blog/2013/a-fast-lock-free-queue-for-c++#benchmarks


// ©2013 Cameron Desrochers.
// Distributed under the simplified BSD license (see the license file that
// should have come with this header).

#pragma once

// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.

#include <cassert>

// Platform detection
#if defined(__INTEL_COMPILER)
#define AE_ICC
#elif defined(_MSC_VER)
#define AE_VCPP
#elif defined(__GNUC__)
#define AE_GCC
#endif

#if defined(_M_IA64) || defined(__ia64__)
#define AE_ARCH_IA64
#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
#define AE_ARCH_X64
#elif defined(_M_IX86) || defined(__i386__)
#define AE_ARCH_X86
#elif defined(_M_PPC) || defined(__powerpc__)
#define AE_ARCH_PPC
#else
#define AE_ARCH_UNKNOWN
#endif

// AE_FORCEINLINE
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_FORCEINLINE __forceinline
#elif defined(AE_GCC)
//#define AE_FORCEINLINE __attribute__((always_inline))
#define AE_FORCEINLINE inline
#else
#define AE_FORCEINLINE inline
#endif

// AE_ALIGN
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_ALIGN(x) __declspec(align(x))
#elif defined(AE_GCC)
#define AE_ALIGN(x) __attribute__((aligned(x)))
#else
// Assume GCC compliant syntax...
#define AE_ALIGN(x) __attribute__((aligned(x)))
#endif

// Portable atomic fences implemented below:
namespace moodycamel {

enum memory_order {
    memory_order_relaxed,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst,

    // memory_order_sync: Forces a full sync:
    // #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
    memory_order_sync = memory_order_seq_cst
};

}    // end namespace moodycamel

#if defined(AE_VCPP) || defined(AE_ICC)
// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
#include <intrin.h>

#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
#define AeFullSync _mm_mfence
#define AeLiteSync _mm_mfence
#elif defined(AE_ARCH_IA64)
#define AeFullSync __mf
#define AeLiteSync __mf
#elif defined(AE_ARCH_PPC)
#include <ppcintrinsics.h>
#define AeFullSync __sync
#define AeLiteSync __lwsync
#endif

namespace moodycamel {

AE_FORCEINLINE void compiler_fence(memory_order order)
{
    switch (order) {
        case memory_order_relaxed: break;
        case memory_order_acquire: _ReadBarrier(); break;
        case memory_order_release: _WriteBarrier(); break;
        case memory_order_acq_rel: _ReadWriteBarrier(); break;
        case memory_order_seq_cst: _ReadWriteBarrier(); break;
        default: assert(false);
    }
}

// x86/x64 have a strong memory model -- all loads and stores have
// acquire and release semantics automatically (so only need compiler
// barriers for those).
#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
AE_FORCEINLINE void fence(memory_order order)
{
    switch (order) {
        case memory_order_relaxed: break;
        case memory_order_acquire: _ReadBarrier(); break;
        case memory_order_release: _WriteBarrier(); break;
        case memory_order_acq_rel: _ReadWriteBarrier(); break;
        case memory_order_seq_cst:
            _ReadWriteBarrier();
            AeFullSync();
            _ReadWriteBarrier();
            break;
        default: assert(false);
    }
}
#else
AE_FORCEINLINE void fence(memory_order order)
{
    // Non-specialized arch, use heavier memory barriers everywhere just in case :-(
    switch (order) {
        case memory_order_relaxed:
            break;
        case memory_order_acquire:
            _ReadBarrier();
            AeLiteSync();
            _ReadBarrier();
            break;
        case memory_order_release:
            _WriteBarrier();
            AeLiteSync();
            _WriteBarrier();
            break;
        case memory_order_acq_rel:
            _ReadWriteBarrier();
            AeLiteSync();
            _ReadWriteBarrier();
            break;
        case memory_order_seq_cst:
            _ReadWriteBarrier();
            AeFullSync();
            _ReadWriteBarrier();
            break;
        default: assert(false);
    }
}
#endif

}    // end namespace moodycamel

#else
// Use standard library of atomics
#include <atomic>

namespace moodycamel {

AE_FORCEINLINE void compiler_fence(memory_order order)
{
    switch (order) {
        case memory_order_relaxed: break;
        case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
        case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
        case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
        case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
        default: assert(false);
    }
}

AE_FORCEINLINE void fence(memory_order order)
{
    switch (order) {
        case memory_order_relaxed: break;
        case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
        case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
        case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
        case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
        default: assert(false);
    }
}

}    // end namespace moodycamel

#endif

#if !defined(AE_VCPP) || _MSC_VER >= 1700
#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#endif

#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#include <atomic>
#endif
#include <utility>

// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
namespace moodycamel {

template<typename T>
class weak_atomic
{
public:
    weak_atomic() { }
    template<typename U> weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
    weak_atomic(weak_atomic const& other) : value(other.value) {  }
    weak_atomic(weak_atomic&& other) : value(std::move(other.value)) {  }

    AE_FORCEINLINE operator T() const { return load(); }

#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
    template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) { value = std::forward<U>(x); return *this; }
    AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) { value = other.value; return *this; }

    AE_FORCEINLINE T load() const { return value; }
#else
    template<typename U>
    AE_FORCEINLINE weak_atomic const& operator=(U&& x)
    {
        value.store(std::forward<U>(x), std::memory_order_relaxed);
        return *this;
    }

    AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
    {
        value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }

    AE_FORCEINLINE T load() const { return value.load(std::memory_order_relaxed); }
#endif

private:
#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
    // No std::atomic support, but still need to circumvent compiler optimizations.
    // `volatile` will make memory access slow, but is guaranteed to be reliable.
    volatile T value;
#else
    std::atomic<T> value;
#endif
};

}    // end namespace moodycamel
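
The listing above is the atomicops.h header that the queue (next listing) includes. To make the intent of its primitives concrete, here is a small, hypothetical sketch (not part of the library) of the publish/consume pattern the queue itself builds on: plain data is written first, a release fence orders that write before the flag update, and the reader pairs a load of the flag with an acquire fence before touching the data. The payload/ready names are illustrative only.

#include "atomicops.h"

static int payload;                               // plain data written by the producer thread
static moodycamel::weak_atomic<int> ready(0);     // atomic load/store, but no implied ordering by itself

void producer()
{
    payload = 42;                                          // 1. write the data
    moodycamel::fence(moodycamel::memory_order_release);   // 2. order the data write before the flag write
    ready = 1;                                             // 3. publish the flag
}

bool consumer(int& out)
{
    if (ready.load() == 0)
        return false;                                      // nothing published yet
    moodycamel::fence(moodycamel::memory_order_acquire);   // pairs with the release fence above
    out = payload;                                         // safe: the data write happened-before this read
    return true;
}

This is exactly the role the fence(memory_order_release)/fence(memory_order_acquire) pairs play around the front/tail index updates in the queue below; weak_atomic only guarantees that the flag itself is read and written atomically.
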

// ©2013 Cameron Desrochers.
// Distributed under the simplified BSD license (see the license file that
// should have come with this header).

#pragma once

#include "atomicops.h"
#include <type_traits>
#include <utility>
#include <cassert>
#include <stdexcept>
#include <cstdint>
#include <cstdlib>    // For malloc/free & size_t

// A lock-free queue for a single-consumer, single-producer architecture.
// The queue is also wait-free in the common path (except if more memory
// needs to be allocated, in which case malloc is called).
// Allocates memory sparingly (O(lg(n)) times, amortized), and only once if
// the original maximum size estimate is never exceeded.
// Tested on x86/x64 processors, but semantics should be correct for all
// architectures (given the right implementations in atomicops.h), provided
// that aligned integer and pointer accesses are naturally atomic.
// Note that there should only be one consumer thread and one producer thread;
// switching roles of the threads, or using multiple consecutive threads for
// one role, is not safe unless properly synchronized.
// Using the queue exclusively from one thread is fine, though a bit silly.

#define CACHE_LINE_SIZE 64

namespace moodycamel {

template<typename T>
class ReaderWriterQueue
{
    // Design: Based on a queue-of-queues. The low-level queues are just
    // circular buffers with front and tail indices indicating where the
    // next element to dequeue is and where the next element can be enqueued,
    // respectively. Each low-level queue is called a "block". Each block
    // wastes exactly one element's worth of space to keep the design simple
    // (if front == tail then the queue is empty, and can't be full).
    // The high-level queue is a circular linked list of blocks; again there
    // is a front and tail, but this time they are pointers to the blocks.
    // The front block is where the next element to be dequeued is, provided
    // the block is not empty. The back block is where elements are to be
    // enqueued, provided the block is not full.
    // The producer thread owns all the tail indices/pointers. The consumer
    // thread owns all the front indices/pointers. Both threads read each
    // other's variables, but only the owning thread updates them. E.g. After
    // the consumer reads the producer's tail, the tail may change before the
    // consumer is done dequeuing an object, but the consumer knows the tail
    // will never go backwards, only forwards.
    // If there is no room to enqueue an object, an additional block (of
    // greater size than the last block) is added. Blocks are never removed.

public:
    // Constructs a queue that can hold maxSize elements without further
    // allocations. Allocates maxSize + 1, rounded up to the nearest power
    // of 2, elements.
    explicit ReaderWriterQueue(size_t maxSize = 15)
        : largestBlockSize(ceilToPow2(maxSize + 1))    // We need a spare slot to fit maxSize elements in the block
#ifndef NDEBUG
        , enqueuing(false), dequeuing(false)
#endif
    {
        assert(maxSize > 0);

        auto firstBlock = new Block(largestBlockSize);
        firstBlock->next = firstBlock;

        frontBlock = firstBlock;
        tailBlock = firstBlock;

        // Make sure the reader/writer threads will have the initialized memory setup above:
        fence(memory_order_sync);
    }

    // Note: The queue should not be accessed concurrently while it's
    // being deleted. It's up to the user to synchronize this.
    ~ReaderWriterQueue()
    {
        // Make sure we get the latest version of all variables from other CPUs:
        fence(memory_order_sync);

        // Destroy any remaining objects in queue and free memory
        Block* tailBlock_ = tailBlock;
        Block* block = frontBlock;
        do {
            Block* nextBlock = block->next;
            size_t blockFront = block->front;
            size_t blockTail = block->tail;

            for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask()) {
                auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
                element->~T();
            }

            delete block;
            block = nextBlock;
        } while (block != tailBlock_);
    }

    // Enqueues a copy of element if there is room in the queue.
    // Returns true if the element was enqueued, false otherwise.
    // Does not allocate memory.
    AE_FORCEINLINE bool try_enqueue(T const& element)
    {
        return inner_enqueue<CannotAlloc>(element);
    }

    // Enqueues a moved copy of element if there is room in the queue.
    // Returns true if the element was enqueued, false otherwise.
    // Does not allocate memory.
    AE_FORCEINLINE bool try_enqueue(T&& element)
    {
        return inner_enqueue<CannotAlloc>(element);
    }

    // Enqueues a copy of element on the queue.
    // Allocates an additional block of memory if needed.
    AE_FORCEINLINE void enqueue(T const& element)
    {
        inner_enqueue<CanAlloc>(element);
    }

    // Enqueues a moved copy of element on the queue.
    // Allocates an additional block of memory if needed.
    AE_FORCEINLINE void enqueue(T&& element)
    {
        inner_enqueue<CanAlloc>(element);
    }

    // Attempts to dequeue an element; if the queue is empty,
    // returns false instead. If the queue has at least one element,
    // moves front to result using operator=, then returns true.
    bool try_dequeue(T& result)
    {
#ifndef NDEBUG
        ReentrantGuard guard(this->dequeuing);
#endif

        // High-level pseudocode:
        // Remember where the tail block is
        // If the front block has an element in it, dequeue it
        // Else
        //     If front block was the tail block when we entered the function, return false
        //     Else advance to next block and dequeue the item there

        // Note that we have to use the value of the tail block from before we check if the front
        // block is full or not, in case the front block is empty and then, before we check if the
        // tail block is at the front block or not, the producer fills up the front block *and
        // moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
        // reproducible in practice.
        Block* tailBlockAtStart = tailBlock;
        fence(memory_order_acquire);

        Block* frontBlock_ = frontBlock.load();
        size_t blockTail = frontBlock_->tail.load();
        size_t blockFront = frontBlock_->front.load();
        fence(memory_order_acquire);

        if (blockFront != blockTail) {
            // Front block not empty, dequeue from here
            auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
            result = std::move(*element);
            element->~T();

            blockFront = (blockFront + 1) & frontBlock_->sizeMask();

            fence(memory_order_release);
            frontBlock_->front = blockFront;
        }
        else if (frontBlock_ != tailBlockAtStart) {
            // Front block is empty but there's another block ahead, advance to it
            Block* nextBlock = frontBlock_->next;
            // Don't need an acquire fence here since next can only ever be set on the tailBlock,
            // and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
            // ensures next is up-to-date on this CPU in case we recently were at tailBlock.

            size_t nextBlockFront = nextBlock->front.load();
            size_t nextBlockTail = nextBlock->tail;
            fence(memory_order_acquire);

            // Since the tailBlock is only ever advanced after being written to,
            // we know there's for sure an element to dequeue on it
            assert(nextBlockFront != nextBlockTail);

            // We're done with this block, let the producer use it if it needs
            fence(memory_order_release);    // Expose possibly pending changes to frontBlock->front from last dequeue
            frontBlock = frontBlock_ = nextBlock;

            compiler_fence(memory_order_release);    // Not strictly needed

            auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
            result = std::move(*element);
            element->~T();

            nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask();

            fence(memory_order_release);
            frontBlock_->front = nextBlockFront;
        }
        else {
            // No elements in current block and no other block to advance to
            return false;
        }

        return true;
    }

private:
    enum AllocationMode { CanAlloc, CannotAlloc };

    template<AllocationMode canAlloc, typename U>
    bool inner_enqueue(U&& element)
    {
#ifndef NDEBUG
        ReentrantGuard guard(this->enqueuing);
#endif

        // High-level pseudocode (assuming we're allowed to alloc a new block):
        // If room in tail block, add to tail
        // Else check next block
        //     If next block is not the head block, enqueue on next block
        //     Else create a new block and enqueue there
        //     Advance tail to the block we just enqueued to

        Block* tailBlock_ = tailBlock.load();
        size_t blockFront = tailBlock_->front.load();
        size_t blockTail = tailBlock_->tail.load();
        fence(memory_order_acquire);

        size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask();
        if (nextBlockTail != blockFront) {
            // This block has room for at least one more element
            char* location = tailBlock_->data + blockTail * sizeof(T);
            new (location) T(std::forward<U>(element));

            fence(memory_order_release);
            tailBlock_->tail = nextBlockTail;
        }
        else if (tailBlock_->next.load() != frontBlock) {
            // Note that the reason we can't advance to the frontBlock and start adding new entries there
            // is because if we did, then dequeue would stay in that block, eventually reading the new values,
            // instead of advancing to the next full block (whose values were enqueued first and so should be
            // consumed first).

            fence(memory_order_acquire);    // Ensure we get latest writes if we got the latest frontBlock

            // tailBlock is full, but there's a free block ahead, use it
            Block* tailBlockNext = tailBlock_->next.load();
            size_t nextBlockFront = tailBlockNext->front.load();
            size_t nextBlockTail = tailBlockNext->tail.load();
            fence(memory_order_acquire);

            // This block must be empty since it's not the head block and we
            // go through the blocks in a circle
            assert(nextBlockFront == nextBlockTail);

            char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
            new (location) T(std::forward<U>(element));

            tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask();

            fence(memory_order_release);
            tailBlock = tailBlockNext;
        }
        else if (canAlloc == CanAlloc) {
            // tailBlock is full and there's no free block ahead; create a new block
            largestBlockSize *= 2;
            Block* newBlock = new Block(largestBlockSize);

            new (newBlock->data) T(std::forward<U>(element));

            assert(newBlock->front == 0);
            newBlock->tail = 1;

            newBlock->next = tailBlock_->next.load();
            tailBlock_->next = newBlock;

            // Might be possible for the dequeue thread to see the new tailBlock->next
            // *without* seeing the new tailBlock value, but this is OK since it can't
            // advance to the next block until tailBlock is set anyway (because the only
            // case where it could try to read the next is if it's already at the tailBlock,
            // and it won't advance past tailBlock in any circumstance).

            fence(memory_order_release);
            tailBlock = newBlock;
        }
        else if (canAlloc == CannotAlloc) {
            // Would have had to allocate a new block to enqueue, but not allowed
            return false;
        }
        else {
            assert(false && "Should be unreachable code");
            return false;
        }

        return true;
    }

    // Disable copying
    ReaderWriterQueue(ReaderWriterQueue const&) {  }

    // Disable assignment
    ReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }

    AE_FORCEINLINE static size_t ceilToPow2(size_t x)
    {
        // From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        if (sizeof(size_t) > 4U) {
            x |= x >> 32;
        }
        ++x;
        return x;
    }

private:
#ifndef NDEBUG
    struct ReentrantGuard
    {
        ReentrantGuard(bool& inSection)
            : inSection(inSection)
        {
            assert(!inSection);
            if (inSection) {
                throw std::runtime_error("ReaderWriterQueue does not support enqueuing or dequeuing elements from other elements' ctors and dtors");
            }

            inSection = true;
        }

        ~ReentrantGuard() { inSection = false; }

    private:
        bool& inSection;
    };
#endif

    struct Block
    {
        // Avoid false-sharing by putting highly contended variables on their own cache lines
        AE_ALIGN(CACHE_LINE_SIZE)
        weak_atomic<size_t> front;    // (Atomic) Elements are read from here

        AE_ALIGN(CACHE_LINE_SIZE)
        weak_atomic<size_t> tail;     // (Atomic) Elements are enqueued here

        AE_ALIGN(CACHE_LINE_SIZE)     // next isn't very contended, but we don't want it on the same cache line as tail (which is)
        weak_atomic<Block*> next;     // (Atomic)

        char* data;                   // Contents (on heap) are aligned to T's alignment

        const size_t size;

        AE_FORCEINLINE size_t sizeMask() const { return size - 1; }

        // size must be a power of two (and greater than 0)
        Block(size_t const& size)
            : front(0), tail(0), next(nullptr), size(size)
        {
            // Allocate enough memory for an array of Ts, aligned
            size_t alignment = std::alignment_of<T>::value;
            data = rawData = static_cast<char*>(std::malloc(sizeof(T) * size + alignment - 1));
            assert(rawData);
            auto alignmentOffset = (uintptr_t)rawData % alignment;
            if (alignmentOffset != 0) {
                data += alignment - alignmentOffset;
            }
        }

        ~Block()
        {
            std::free(rawData);
        }

    private:
        char* rawData;
    };

private:
    AE_ALIGN(CACHE_LINE_SIZE)
    weak_atomic<Block*> frontBlock;    // (Atomic) Elements are dequeued from this block

    AE_ALIGN(CACHE_LINE_SIZE)
    weak_atomic<Block*> tailBlock;     // (Atomic) Elements are enqueued to this block

    AE_ALIGN(CACHE_LINE_SIZE)          // Ensure tailBlock gets its own cache line
    size_t largestBlockSize;

#ifndef NDEBUG
    bool enqueuing;
    bool dequeuing;
#endif
};

}    // end namespace moodycamel
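
For reference, a minimal usage sketch, assuming the two listings above are saved as atomicops.h and readerwriterqueue.h: exactly one dedicated producer thread calls enqueue and exactly one dedicated consumer thread calls try_dequeue, which is the only threading arrangement the queue supports. The item count and capacity are arbitrary.

#include "readerwriterqueue.h"
#include <thread>
#include <cstdio>

int main()
{
    // Room for roughly 100 elements before enqueue has to allocate another block
    moodycamel::ReaderWriterQueue<int> q(100);

    std::thread producer([&] {
        for (int i = 0; i < 1000; ++i) {
            q.enqueue(i);    // allocates a larger block if the current one is full
        }
    });

    std::thread consumer([&] {
        int item;
        int received = 0;
        while (received < 1000) {
            if (q.try_dequeue(item)) {    // returns false when the queue is momentarily empty
                ++received;
            }
        }
        std::printf("received %d items\n", received);
    });

    producer.join();
    consumer.join();
    return 0;
}

If further allocation must never happen after construction, try_enqueue can be used instead of enqueue; it returns false rather than allocating a new block.
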