* chash.c
* concurrent hash tables
*
- * The goal of this module is to implement a hash table that can be
- * searched without any locking at all and updated with minimal locking.
- * While a fully lock-free (or, better still, wait-free) hash table seems
- * very desirable, currently known techniques require memory management
- * techniques that are either very complex or difficult to implement in
- * the context of a fixed-size shared memory segment.
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic.
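+ * For example, under this scheme the entity at arena offset 5 is
+ * referenced by the 32-bit value 10 (the offset shifted left by one
+ * bit), and setting the low-order bit yields 11, the "marked" form of
+ * the same reference.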
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
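+ * For example, a caller might define its entities along these lines
+ * (purely illustrative; this module only ever sees element_size and
+ * key_size from the table's descriptor, never the type itself):
+ *
+ *     typedef struct
+ *     {
+ *         uint32      key;        the leading key_size bytes
+ *         uint32      value;      non-key data, copied out on lookup
+ *     } MyEntry;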
+ *
+ * As usual, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
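+ * For example, to insert a new item with hash value h and key k, we
+ * scan the chain past every node whose hash value is less than h, and
+ * past every node with hash value equal to h whose key is less than k
+ * according to memcmp; the new item is linked in at that point (and if
+ * a node with an equal key is found instead, the item already exists).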
+ *
+ * For good concurrency, it seems essential to avoid locking buckets
+ * while they are being scanned. Taking even a shared LWLock or similar
+ * still means acquiring and releasing a spinlock, which is both
+ * inefficient in terms of raw cycles and a potential contention point.
+ * Thus, we decree that readers must be able to scan bucket chains without
+ * executing any atomic operations either before, during, or after the
+ * scan. Writers necessarily require some locking; for now, each bucket
+ * has a separate spinlock which must be taken to modify that bucket chain,
+ * but not when reading it. In the future, we might further adapt this
+ * code to instead use compare-and-swap where available.
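+ *
+ * In outline, a writer currently brackets its changes to bucket b with
+ *
+ *     SpinLockAcquire(&table->bucket[b].mutex);
+ *     ... unlink, insert, or relink nodes ...
+ *     SpinLockRelease(&table->bucket[b].mutex);
+ *
+ * while a reader simply follows the chain starting at
+ * table->bucket[b].head (with b chosen via hash & bucket_mask) and
+ * takes no lock at all.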
+ *
+ * Even after an entity has been deleted from a bucket chain, it is still
+ * possible that some other backend holds a pointer to it from a bucket
+ * chain traversal which began before the deletion was carried out.
+ * Thus, we cannot recycle the block of memory used by an entity for a
+ * new and unrelated entity until we can guarantee that no private
+ * references to it remain. Instead, we add the entity to one of several
+ * "garbage lists" of items removed from bucket chains that are not yet
+ * known to be recyclable. Periodically, we move items from garbage lists
+ * to free lists from which they can be reallocated. This is accomplished
+ * by having each backend that wishes to scan a bucket store the hash
+ * table id and bucket identifier in a per-backend slot in shared memory
+ * before it begins the scan and clear that value only after the scan is
+ * complete, so that another backend can wait (by spinning) for all
+ * backends currently scanning a given bucket to finish doing so. To make
+ * sure we don't need to
+ * garbage-collect too often, we allocate a slightly larger arena than
+ * the caller's stated maximum size.
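+ *
+ * In outline, the protocol looks roughly like this (the per-backend slot
+ * is shown abstractly here; its actual layout is defined elsewhere):
+ *
+ *     scanning backend:    publish (hash table id, bucket number)
+ *                          walk the bucket chain
+ *                          clear the published value
+ *
+ *     garbage collector:   detach the items on a garbage list
+ *                          for each backend, spin until it is no longer
+ *                              scanning a bucket covered by that list
+ *                          move the detached items to a free list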
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
#include "postgres.h"
+#include "miscadmin.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/chash.h"
+#include "utils/memutils.h"
/*
- * The memory needed to store the entries in a hash table is preallocated in
- * a single chunk called the arena. We refer to entries using a CHashPtr
- * rather than an ordinary pointer. One bit of each CHashPtr is reserved for
- * use as a "mark" bit, which is used to implement concurrent deletion.
- * The remaining bits form an offset into the arena. By storing offset
- * rather than pointers, we can reduce the memory footprint of the hash table
- * considerably, at the cost of limiting the maximum number of elements in a
- * single concurrent hash table to 2^31. That limitation appears acceptable
- * for now, and we can always switch to pointers or 64-bit integers here in
- * the future, if needed.
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
*/
typedef uint32 CHashPtr;
#define InvalidCHashPtr ((uint32) -1)
#define CHashPtrMark(x) ((x) | 1)
#define CHashPtrUnmark(x) ((x) & ~1)
#define MakeCHashPtr(x) ((x) << 1)
-
-static uint32 CHashMaxCapacity = CHashPtrGetOffset(InvalidCHashPtr);
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
/*
- * Each hash bucket is implemented as a pointer to the first item in the
- * bucket, or InvalidCHashPtr if the bucket is empty. Each item contains a
- * pointer to the next item in the bucket, or InvalidCHashPtr if there are no
- * more items.
- *
- * Each bucket also has a spinlock which is used to serialize modifications
- * to the bucket, but need not be taken when searching it.
+ * CHashBucket represents a single hash bucket, garbage list, or free list.
*/
typedef struct
{
- CHashPtr head; /* arena offset of first element in bucket */
- slock_t mutex; /* mutual exclusion for modifications */
+ CHashPtr head; /* arena offset of bucket head */
+ slock_t mutex; /* mutual exclusion for changes */
} CHashBucket;
/*
- * Each free list is implemented as a pointer to the first item on the
- * free list, or InvalidCHashPtr if the free list is empty. Each free list
- * is protected by a spinlock.
- */
-typedef struct
-{
- CHashPtr head; /* arena offset of first element in bucket */
- slock_t mutex; /* mutual exclusion for modifications */
-} CHashFreeList;
-
-/*
- * Each item stored in the hash table is represented by a CHashNode, which
+ * Each object stored in the hash table is represented by a CHashNode, which
* stores a pointer to the next item in the same bucket, and the exact hash
* value of the current item. Each CHashNode is followed by space for the
* item itself.
*/
typedef struct
{
- CHashPtr next; /* arena offset of next element in bucket */
- uint32 hash_value; /* hash(key) */
+ CHashPtr next; /* arena offset of next element */
+ uint32 hash_value; /* hash(key) */
} CHashNode;
-#define CHashNodeGetItem(x) ((void *) (((char *) x) + sizeof(CHashNode)))
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) ((void *) (((char *) x) + SizeOfCHashNode))
/*
 * CHashTableData stores all the information that we need in order to access
 * a hash table.  None of this information changes after the initial setup
 * of the hash table.
*/
-typedef struct
+typedef struct CHashTableData
{
CHashDescriptor desc; /* descriptor for this hash table */
uint32 nbuckets; /* # of buckets; must be a power of two */
uint32 bucket_mask; /* # of buckets, minus one */
- uint32 nfreelists; /* # of freelists, also a power of two */
- void *arena; /* arena */
+ uint32 garbage_shift; /* log2(nbuckets/ngarbage) */
+ uint32 ngarbage; /* # of garbage lists, a power of two */
+ uint32 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
CHashBucket *bucket; /* array of size nbuckets */
- CHashFreeList *freelist; /* array of size nfreelists */
+ CHashBucket *garbage; /* array of size ngarbage */
+ CHashBucket *freelist; /* array of size nfreelists */
+ void *arena; /* arena */
} CHashTableData;
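+
+/*
+ * Purely as an illustration of the layout implied by the fields above
+ * (the rest of this file may use its own accessor for this), the node
+ * for an unmarked arena offset can be found with simple address
+ * arithmetic:
+ *
+ *     CHashNode  *node = (CHashNode *)
+ *         ((char *) table->arena + (Size) table->arena_stride * offset);
+ *     void       *item = CHashNodeGetItem(node);
+ */
+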
/*
- * Compute the number of buckets and the number of freelists for a hash table
- * with a given capacity.
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
*/
-static void
-CHashSizingParameters(uint32 capacity, uint32 *nbuckets, uint32 *nfreelists)
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
{
- uint32 bucket_shift;
- uint32 freelist_shift;
+ CHashTable table;
+ uint32 bucket_shift;
- if (capacity < 1 || capacity > CHashMaxCapacity)
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
/*
	 * The number of buckets must be a power of two.  To avoid (as much as
	 * possible) the need to traverse long bucket chains, we aim for a load
	 * factor <= 1.0, so this is a pretty simple calculation: we just find the
* smallest power of two greater than or equal to the target capacity.
*/
- bucket_shift = fls(capacity) - 1;
- *nbuckets = 1 << bucket_shift;
+ bucket_shift = fls(desc->capacity) - 1;
+ table->nbuckets = 1 << bucket_shift;
+ table->bucket_mask = table->nbuckets - 1;
/*
- * The number of freelists must also be a power of two, and must be no
- * larger than the number of buckets.
+ * It's not exactly clear how to determine the optimal number of garbage
+ * lists. If there are too few, then garbage collection will have to wait
+ * behind concurrent scans excessively frequently. But if there are too
+ * many, then garbage collection won't recover very many items.
*/
- freelist_shift = bucket_shift / 2;
- *nfreelists = 1 << freelist_shift;
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->ngarbage = table->nbuckets >> table->garbage_shift;
+
+ /*
+ * The number of freelists must be large enough to avoid contention;
+ * having extras is fairly harmless. But there seems to be no point in
+ * having more free lists than garbage lists; if the garbage lists aren't
+ * causing contention, an equal number of free lists shouldn't either.
+ */
+ table->nfreelists = Min(table->ngarbage, 16);
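+
+	/*
+	 * For example, a table with 1024 buckets gets 1024 >> 6 = 16 garbage
+	 * lists and, per the rule above, 16 free lists as well.
+	 */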
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this where relevant.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
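+
+	/*
+	 * For example, a stated capacity of 1000 produces an arena of 1125
+	 * elements, while a capacity of 64 produces 79.
+	 */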
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ return table;
}
+/*
+ * Estimate shared memory requirements.
+ */
Size
-CHashEstimateSize(CHashDescriptor *desc)
+CHashEstimateSize(CHashTable table)
{
- uint32 nbuckets,
- nfreelists;
- Size size;
+ Size size;
+ Size total_buckets;
- CHashSizingParameters(desc->capacity, &nbuckets, &nfreelists);
+ total_buckets = add_size(table->nbuckets, table->ngarbage);
+ total_buckets = add_size(total_buckets, table->nfreelists);
size = MAXALIGN(sizeof(CHashTableData));
- size = add_size(size, mul_size(MAXALIGN(sizeof(CHashBucket)), nbuckets));
+ size = add_size(size, mul_size(sizeof(CHashBucket), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
return size;
}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ void *shmem;
+
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashBucket *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ table->garbage = &table->bucket[table->nbuckets];
+ table->freelist = &table->garbage[table->ngarbage];
+
+ /* Arena follows the various lists. */
+ table->arena = (void *) (&table->freelist[table->nfreelists]);
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
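+
+/*
+ * Illustrative usage only (not part of this module; the "My*" names and
+ * descriptor values below are hypothetical, and the exact descriptor
+ * fields are declared in utils/chash.h): a caller bootstraps the table
+ * while computing its shared-memory size request, and then creates or
+ * attaches to it when shared memory is initialized.
+ *
+ *     static CHashTable MyTable;
+ *     static CHashDescriptor MyDesc;
+ *
+ *     Size
+ *     MyTableShmemSize(void)
+ *     {
+ *         MyDesc.shmem_name = "my table";
+ *         MyDesc.capacity = 1024;
+ *         MyDesc.element_size = sizeof(MyEntry);
+ *         MyDesc.key_size = sizeof(uint32);
+ *         MyTable = CHashBootstrap(&MyDesc);
+ *         return CHashEstimateSize(MyTable);
+ *     }
+ *
+ *     void
+ *     MyTableShmemInit(void)
+ *     {
+ *         MyTable = CHashInitialize(MyTable, &MyDesc);
+ *     }
+ */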