* SLAB allocator definitions.
*
* SLAB is a MemoryContext implementation designed for cases where large
- * numbers of equally-sized objects are allocated (and freed).
+ * numbers of equally-sized objects can be allocated and freed efficiently
+ * with minimal memory wastage and fragmentation.
*
*
* Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group
* NOTE:
* The constant allocation size allows significant simplification and various
* optimizations over more general purpose allocators. The blocks are carved
- * into chunks of exactly the right size (plus alignment), not wasting any
- * memory.
+ * into chunks of exactly the right size, wasting only the space required to
+ * MAXALIGN the allocated chunks.
*
- * The information about free chunks is maintained both at the block level and
- * global (context) level. This is possible as the chunk size (and thus also
- * the number of chunks per block) is fixed.
+ * Slab can also help reduce memory fragmentation in cases where longer-lived
+ * chunks remain stored on blocks while most of the other chunks have already
+ * been pfree'd. We give priority to putting new allocations into the
+ * "fullest" block. This help avoid having too many sparsely used blocks
+ * around and allows blocks to more easily become completely unused which
+ * allows them to be eventually free'd.
*
- * On each block, free chunks are tracked in a simple linked list. Contents
- * of free chunks is replaced with an index of the next free chunk, forming
- * a very simple linked list. Each block also contains a counter of free
- * chunks. Combined with the local block-level freelist, it makes it trivial
- * to eventually free the whole block.
+ * We identify the "fullest" block to put new allocations on by using a block
+ * from the lowest populated element of the context's "blocklist" array.
+ * This is an array of dlists containing blocks which we partition by the
+ * number of free chunks each block has. Blocks with fewer free chunks are
+ * stored in a lower-indexed dlist array slot. Full blocks go on the 0th
+ * element of the blocklist array. So that we don't need too many
+ * elements in the array, each dlist in the array is responsible for a range
+ * of free chunks. When a chunk is palloc'd or pfree'd we may need to move
+ * the block onto another dlist if the number of free chunks crosses the
+ * range boundary that the current list is responsible for. Having just a
+ * few blocklist elements reduces the number of times we must move the block
+ * onto another dlist element.
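+ *
+ * As an illustrative example (editorial, with the numbers derived from the
+ * code below): with SLAB_BLOCKLIST_COUNT = 3 and 50 chunks per block,
+ * blocklist[0] holds full blocks, blocklist[1] holds blocks with 1-32 free
+ * chunks and blocklist[2] holds blocks with 33-50 free chunks.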
*
- * At the context level, we use 'freelist' to track blocks ordered by number
- * of free chunks, starting with blocks having a single allocated chunk, and
- * with completely full blocks on the tail.
+ * We keep track of free chunks within each block by using a block-level free
+ * list. We consult this list when we allocate a new chunk in the block.
+ * The free list is a linked list, the head of which is pointed to with
+ * SlabBlock's freehead field. Each subsequent list item is stored in the
+ * free chunk's memory. We ensure chunks are large enough to store this
+ * address.
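+ *
+ * For illustration (editorial): a block's free list might look like
+ * freehead -> chunk 5 -> chunk 2 -> NULL, where each "next" pointer is
+ * stored in the user-data area of the preceding free chunk.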
*
- * This also allows various optimizations - for example when searching for
- * free chunk, the allocator reuses space from the fullest blocks first, in
- * the hope that some of the less full blocks will get completely empty (and
- * returned back to the OS).
- *
- * For each block, we maintain pointer to the first free chunk - this is quite
- * cheap and allows us to skip all the preceding used chunks, eliminating
- * a significant number of lookups in many common usage patterns. In the worst
- * case this performs as if the pointer was not maintained.
- *
- * We cache the freelist index for the blocks with the fewest free chunks
- * (minFreeChunks), so that we don't have to search the freelist on every
- * SlabAlloc() call, which is quite expensive.
+ * When we allocate a new block, technically all chunks are free. However,
+ * to avoid having to write out the entire block to set up the linked list
+ * of free chunks, we instead store a pointer to
+ * the next "unused" chunk on the block and keep track of how many of these
+ * unused chunks there are. When a new block is malloc'd, all chunks are
+ * unused. The unused pointer starts with the first chunk on the block and
+ * as chunks are allocated, the unused pointer is incremented. As chunks are
+ * pfree'd, the unused pointer never goes backwards. The unused pointer can
+ * be thought of as a high watermark for the maximum number of chunks in the
+ * block which have been in use concurrently. When a chunk is pfree'd the
+ * chunk is put onto the head of the free list and the unused pointer is not
+ * changed. We only consume more unused chunks if we run out of free chunks
+ * on the free list. This method effectively gives priority to using
+ * previously used chunks over previously unused chunks, which should perform
+ * better due to CPU caching effects.
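+ *
+ * A short worked example (editorial): on a block with 4 chunks, initially
+ * nfree = 4, nunused = 4, freehead = NULL and unused points at chunk 0.
+ * Three pallocs consume chunks 0-2 via the unused pointer (nunused = 1).
+ * pfree'ing chunk 1 pushes it onto freehead (nfree becomes 2) and leaves
+ * unused untouched; the next palloc then reuses chunk 1 from the free list
+ * instead of consuming chunk 3.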
*
*-------------------------------------------------------------------------
*/
#define Slab_BLOCKHDRSZ MAXALIGN(sizeof(SlabBlock))
+#ifdef MEMORY_CONTEXT_CHECKING
+/*
+ * Size of the memory required to store the SlabContext.
+ * MEMORY_CONTEXT_CHECKING builds need some extra memory for the isChunkFree
+ * array.
+ */
+#define Slab_CONTEXT_HDRSZ(chunksPerBlock) \
+ (sizeof(SlabContext) + ((chunksPerBlock) * sizeof(bool)))
+#else
+#define Slab_CONTEXT_HDRSZ(chunksPerBlock) sizeof(SlabContext)
+#endif
+
+/*
+ * The number of partitions to divide the blocklist into based on the number
+ * of free chunks. There must be at least 2.
+ */
+#define SLAB_BLOCKLIST_COUNT 3
+
+/* The maximum number of completely empty blocks to keep around for reuse. */
+#define SLAB_MAXIMUM_EMPTY_BLOCKS 10
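+
+/*
+ * Typical usage (an editorial sketch; MyFixedSizeStruct is a placeholder
+ * type, everything else is the regular memory context API):
+ *
+ *	cxt = SlabContextCreate(CurrentMemoryContext, "my slab",
+ *							SLAB_DEFAULT_BLOCK_SIZE,
+ *							sizeof(MyFixedSizeStruct));
+ *	item = MemoryContextAlloc(cxt, sizeof(MyFixedSizeStruct));
+ *	...
+ *	pfree(item);
+ */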
+
/*
* SlabContext is a specialized implementation of MemoryContext.
*/
{
MemoryContextData header; /* Standard memory-context fields */
/* Allocation parameters for this context: */
- Size chunkSize; /* chunk size */
- Size fullChunkSize; /* chunk size including header and alignment */
- Size blockSize; /* block size */
- Size headerSize; /* allocated size of context header */
- int chunksPerBlock; /* number of chunks per block */
- int minFreeChunks; /* min number of free chunks in any block */
- int nblocks; /* number of blocks allocated */
+ Size chunkSize; /* the requested (non-aligned) chunk size */
+ Size fullChunkSize; /* chunk size with chunk header and alignment */
+ Size blockSize; /* the size to make each block of chunks */
+ int32 chunksPerBlock; /* number of chunks that fit in 1 block */
+ int32 curBlocklistIndex; /* index into the blocklist[] element
+ * containing the fullest blocks */
#ifdef MEMORY_CONTEXT_CHECKING
- bool *freechunks; /* bitmap of free chunks in a block */
+ bool *isChunkFree; /* array to mark free chunks in a block during
+ * SlabCheck */
#endif
- /* blocks with free space, grouped by number of free chunks: */
- dlist_head freelist[FLEXIBLE_ARRAY_MEMBER];
+
+ int32 blocklist_shift; /* number of bits to shift the nfree count
+ * by to get the index into blocklist[] */
+ dclist_head emptyblocks; /* empty blocks to use up first instead of
+ * mallocing new blocks */
+
+ /*
+ * Blocks with free space, grouped by the number of free chunks they
+ * contain. Completely full blocks are stored in the 0th element.
+ * Completely empty blocks are stored in emptyblocks or free'd if we have
+ * enough empty blocks already.
+ */
+ dlist_head blocklist[SLAB_BLOCKLIST_COUNT];
} SlabContext;
/*
* SlabBlock
- * Structure of a single block in SLAB allocator.
+ * Structure of a single slab block.
*
- * node: doubly-linked list of blocks in global freelist
- * nfree: number of free chunks in this block
- * firstFreeChunk: index of the first free chunk
+ * slab: pointer back to the owning MemoryContext
+ * nfree: number of chunks on the block which are unallocated
+ * nunused: number of chunks on the block which are unallocated and not on
+ * the block's freelist.
+ * freehead: linked-list header storing a pointer to the first free chunk on
+ * the block. Subsequent pointers are stored in the chunk's memory. NULL
+ * indicates the end of the list.
+ * unused: pointer to the next chunk which has yet to be used.
+ * node: doubly-linked list node for the context's blocklist
*/
typedef struct SlabBlock
{
- dlist_node node; /* doubly-linked list */
- int nfree; /* number of free chunks */
- int firstFreeChunk; /* index of the first free chunk in the block */
SlabContext *slab; /* owning context */
+ int32 nfree; /* number of chunks on the freelist + unused chunks */
+ int32 nunused; /* number of unused chunks */
+ MemoryChunk *freehead; /* pointer to the first free chunk */
+ MemoryChunk *unused; /* pointer to the next unused chunk */
+ dlist_node node; /* doubly-linked list for blocklist[] */
} SlabBlock;
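+
+/*
+ * Illustrative layout of a single slab block (editorial sketch):
+ *
+ *	+-----------+---------+-----+----------------------------+
+ *	| SlabBlock | chunk 0 | ... | chunk (chunksPerBlock - 1) |
+ *	+-----------+---------+-----+----------------------------+
+ *
+ * Each chunk occupies fullChunkSize bytes: a MemoryChunk header followed by
+ * the MAXALIGN'd user space. SlabBlockGetChunk() below does the arithmetic
+ * to locate the nth chunk.
+ */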
#define Slab_CHUNKHDRSZ sizeof(MemoryChunk)
-#define SlabPointerGetChunk(ptr) \
- ((MemoryChunk *)(((char *)(ptr)) - sizeof(MemoryChunk)))
#define SlabChunkGetPointer(chk) \
- ((void *)(((char *)(chk)) + sizeof(MemoryChunk)))
-#define SlabBlockGetChunk(slab, block, idx) \
+ ((void *) (((char *) (chk)) + sizeof(MemoryChunk)))
+
+/*
+ * SlabBlockGetChunk
+ * Obtain a pointer to the nth (0-based) chunk in the block
+ */
+#define SlabBlockGetChunk(slab, block, n) \
((MemoryChunk *) ((char *) (block) + Slab_BLOCKHDRSZ \
- + (idx * slab->fullChunkSize)))
-#define SlabBlockStart(block) \
- ((char *) block + Slab_BLOCKHDRSZ)
+ + ((n) * (slab)->fullChunkSize)))
+
+#if defined(MEMORY_CONTEXT_CHECKING) || defined(USE_ASSERT_CHECKING)
+
+/*
+ * SlabChunkIndex
+ * Get the 0-based index of how many chunks into the block the given
+ * chunk is.
+ */
#define SlabChunkIndex(slab, block, chunk) \
- (((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)
+ (((char *) (chunk) - (char *) SlabBlockGetChunk(slab, block, 0)) / \
+ (slab)->fullChunkSize)
+
+/*
+ * SlabChunkMod
+ * A MemoryChunk should always be at an address which is a multiple of
+ * fullChunkSize starting from the 0th chunk position. This will return
+ * non-zero if it's not.
+ */
+#define SlabChunkMod(slab, block, chunk) \
+ (((char *) (chunk) - (char *) SlabBlockGetChunk(slab, block, 0)) % \
+ (slab)->fullChunkSize)
+
+#endif
/*
* SlabIsValid
- * True iff set is valid slab allocation set.
+ * True iff set is a valid slab allocation set.
*/
-#define SlabIsValid(set) \
- (PointerIsValid(set) && IsA(set, SlabContext))
+#define SlabIsValid(set) (PointerIsValid(set) && IsA(set, SlabContext))
/*
* SlabBlockIsValid
- * True iff block is valid block of slab allocation set.
+ * True iff block is a valid block of slab allocation set.
*/
#define SlabBlockIsValid(block) \
(PointerIsValid(block) && SlabIsValid((block)->slab))
+/*
+ * SlabBlocklistIndex
+ * Determine the blocklist index that a block should be in for the given
+ * number of free chunks.
+ */
+static inline int32
+SlabBlocklistIndex(SlabContext *slab, int nfree)
+{
+ int32 index;
+ int32 blocklist_shift = slab->blocklist_shift;
+
+ Assert(nfree >= 0 && nfree <= slab->chunksPerBlock);
+
+ /*
+ * Determine the blocklist index based on the number of free chunks. We
+ * must ensure that 0 free chunks is dedicated to index 0. Everything
+ * else must be >= 1 and < SLAB_BLOCKLIST_COUNT.
+ *
+ * To make this as efficient as possible, we exploit some two's complement
+ * arithmetic where we reverse the sign before bit shifting. This results
+ * in an nfree of 0 using index 0 and anything non-zero staying non-zero.
+ * This is exploiting 0 and -0 being the same in two's complement. When
+ * we're done, we just need to flip the sign back over again for a
+ * positive index.
+ */
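+
+	/*
+	 * Worked example (editorial): with chunksPerBlock = 50 we get
+	 * blocklist_shift = 5, so the expression below computes
+	 * ceil(nfree / 32): nfree = 0 -> 0, nfree 1-32 -> 1, nfree 33-50 -> 2.
+	 */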
+ index = -((-nfree) >> blocklist_shift);
+
+ if (nfree == 0)
+ Assert(index == 0);
+ else
+ Assert(index >= 1 && index < SLAB_BLOCKLIST_COUNT);
+
+ return index;
+}
+
+/*
+ * SlabFindNextBlockListIndex
+ * Search the blocklist for blocks which have free chunks and return the
+ * index of the first blocklist found to contain at least 1 block with
+ * free chunks. If no such block can be found, return 0.
+ *
+ * Note: We give priority to fuller blocks so that these are filled before
+ * emptier blocks. This is done to increase the chances that mostly-empty
+ * blocks will eventually become completely empty so they can be free'd.
+ */
+static int32
+SlabFindNextBlockListIndex(SlabContext *slab)
+{
+ /* start at 1 as blocklist[0] is for full blocks. */
+ for (int i = 1; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ /* return the first found non-empty index */
+ if (!dlist_is_empty(&slab->blocklist[i]))
+ return i;
+ }
+
+ /* no blocks with free space */
+ return 0;
+}
+
+/*
+ * SlabGetNextFreeChunk
+ * Return the next free chunk in block and update the block to account
+ * for the returned chunk now being used.
+ */
+static inline MemoryChunk *
+SlabGetNextFreeChunk(SlabContext *slab, SlabBlock *block)
+{
+ MemoryChunk *chunk;
+
+ Assert(block->nfree > 0);
+
+ if (block->freehead != NULL)
+ {
+ chunk = block->freehead;
+
+ /*
+ * Pop the chunk from the linked list of free chunks. The pointer to
+ * the next free chunk is stored in the chunk itself.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(MemoryChunk *));
+ block->freehead = *(MemoryChunk **) SlabChunkGetPointer(chunk);
+
+ /* check nothing stomped on the free chunk's memory */
+ Assert(block->freehead == NULL ||
+ (block->freehead >= SlabBlockGetChunk(slab, block, 0) &&
+ block->freehead <= SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1) &&
+ SlabChunkMod(slab, block, block->freehead) == 0));
+ }
+ else
+ {
+ Assert(block->nunused > 0);
+
+ chunk = block->unused;
+ block->unused = (MemoryChunk *) (((char *) block->unused) + slab->fullChunkSize);
+ block->nunused--;
+ }
+
+ block->nfree--;
+
+ return chunk;
+}
/*
* SlabContextCreate
{
int chunksPerBlock;
Size fullChunkSize;
- Size freelistSize;
- Size headerSize;
SlabContext *slab;
int i;
"sizeof(MemoryChunk) is not maxaligned");
Assert(MAXALIGN(chunkSize) <= MEMORYCHUNK_MAX_VALUE);
- /* Make sure the linked list node fits inside a freed chunk */
- if (chunkSize < sizeof(int))
- chunkSize = sizeof(int);
+ /*
+ * Ensure there's enough space to store the pointer to the next free chunk
+ * in the memory of the (otherwise) unused allocation.
+ */
+ if (chunkSize < sizeof(MemoryChunk *))
+ chunkSize = sizeof(MemoryChunk *);
- /* chunk, including SLAB header (both addresses nicely aligned) */
+ /* length of the maxaligned chunk including the chunk header */
#ifdef MEMORY_CONTEXT_CHECKING
/* ensure there's always space for the sentinel byte */
fullChunkSize = Slab_CHUNKHDRSZ + MAXALIGN(chunkSize + 1);
fullChunkSize = Slab_CHUNKHDRSZ + MAXALIGN(chunkSize);
#endif
- /* Make sure the block can store at least one chunk. */
- if (blockSize < fullChunkSize + Slab_BLOCKHDRSZ)
- elog(ERROR, "block size %zu for slab is too small for %zu chunks",
- blockSize, chunkSize);
-
- /* Compute maximum number of chunks per block */
+ /* compute the number of chunks that will fit on each block */
chunksPerBlock = (blockSize - Slab_BLOCKHDRSZ) / fullChunkSize;
- /* The freelist starts with 0, ends with chunksPerBlock. */
- freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1);
-
- /*
- * Allocate the context header. Unlike aset.c, we never try to combine
- * this with the first regular block; not worth the extra complication.
- */
+ /* Make sure the block can store at least one chunk. */
+ if (chunksPerBlock == 0)
+ elog(ERROR, "block size %zu for slab is too small for %zu-byte chunks",
+ blockSize, chunkSize);
- /* Size of the memory context header */
- headerSize = offsetof(SlabContext, freelist) + freelistSize;
-#ifdef MEMORY_CONTEXT_CHECKING
-
- /*
- * With memory checking, we need to allocate extra space for the bitmap of
- * free chunks. The bitmap is an array of bools, so we don't need to worry
- * about alignment.
- */
- headerSize += chunksPerBlock * sizeof(bool);
-#endif
- slab = (SlabContext *) malloc(headerSize);
+ slab = (SlabContext *) malloc(Slab_CONTEXT_HDRSZ(chunksPerBlock));
if (slab == NULL)
{
MemoryContextStats(TopMemoryContext);
slab->chunkSize = chunkSize;
slab->fullChunkSize = fullChunkSize;
slab->blockSize = blockSize;
- slab->headerSize = headerSize;
slab->chunksPerBlock = chunksPerBlock;
- slab->minFreeChunks = 0;
- slab->nblocks = 0;
+ slab->curBlocklistIndex = 0;
- /* initialize the freelist slots */
- for (i = 0; i < (slab->chunksPerBlock + 1); i++)
- dlist_init(&slab->freelist[i]);
+ /*
+ * Compute a shift that guarantees that shifting chunksPerBlock right by it
+ * gives a value < SLAB_BLOCKLIST_COUNT - 1. The reason that we subtract 1 from
+ * SLAB_BLOCKLIST_COUNT in this calculation is that we reserve the 0th
+ * blocklist element for blocks which have no free chunks.
+ *
+ * We calculate the number of bits to shift by rather than a divisor to
+ * divide by as performing division each time we need to find the
+ * blocklist index would be much slower.
+ */
+ slab->blocklist_shift = 0;
+ while ((slab->chunksPerBlock >> slab->blocklist_shift) >= (SLAB_BLOCKLIST_COUNT - 1))
+ slab->blocklist_shift++;
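+
+	/*
+	 * For example (editorial note): with chunksPerBlock = 50 the loop stops
+	 * at blocklist_shift = 5, the smallest shift where 50 >> shift drops
+	 * below SLAB_BLOCKLIST_COUNT - 1, bucketing non-full blocks in ranges
+	 * of 32 free chunks.
+	 */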
+
+ /* initialize the list to store empty blocks to be reused */
+ dclist_init(&slab->emptyblocks);
+
+ /* initialize each blocklist slot */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ dlist_init(&slab->blocklist[i]);
#ifdef MEMORY_CONTEXT_CHECKING
- /* set the freechunks pointer right after the freelists array */
- slab->freechunks
- = (bool *) slab + offsetof(SlabContext, freelist) + freelistSize;
+ /* set the isChunkFree pointer right after the end of the context */
+ slab->isChunkFree = (bool *) ((char *) slab + sizeof(SlabContext));
#endif
/* Finally, do the type-independent part of context creation */
SlabReset(MemoryContext context)
{
SlabContext *slab = (SlabContext *) context;
+ dlist_mutable_iter miter;
int i;
Assert(SlabIsValid(slab));
SlabCheck(context);
#endif
- /* walk over freelists and free the blocks */
- for (i = 0; i <= slab->chunksPerBlock; i++)
+ /* release any retained empty blocks */
+ dclist_foreach_modify(miter, &slab->emptyblocks)
{
- dlist_mutable_iter miter;
+ SlabBlock *block = dlist_container(SlabBlock, node, miter.cur);
+
+ dclist_delete_from(&slab->emptyblocks, miter.cur);
- dlist_foreach_modify(miter, &slab->freelist[i])
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ context->mem_allocated -= slab->blockSize;
+ }
+
+ /* walk over blocklist and free the blocks */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
+ {
+ dlist_foreach_modify(miter, &slab->blocklist[i])
{
SlabBlock *block = dlist_container(SlabBlock, node, miter.cur);
wipe_mem(block, slab->blockSize);
#endif
free(block);
- slab->nblocks--;
context->mem_allocated -= slab->blockSize;
}
}
- slab->minFreeChunks = 0;
+ slab->curBlocklistIndex = 0;
- Assert(slab->nblocks == 0);
Assert(context->mem_allocated == 0);
}
/*
* SlabAlloc
- * Returns pointer to allocated memory of given size or NULL if
+ * Returns a pointer to allocated memory of given size or NULL if
* request could not be completed; memory is added to the slab.
*/
void *
SlabContext *slab = (SlabContext *) context;
SlabBlock *block;
MemoryChunk *chunk;
- int idx;
Assert(SlabIsValid(slab));
- Assert((slab->minFreeChunks >= 0) &&
- (slab->minFreeChunks < slab->chunksPerBlock));
+ /* sanity check that this is pointing to a valid blocklist */
+ Assert(slab->curBlocklistIndex >= 0);
+ Assert(slab->curBlocklistIndex <= SlabBlocklistIndex(slab, slab->chunksPerBlock));
/* make sure we only allow correct request size */
- if (size != slab->chunkSize)
+ if (unlikely(size != slab->chunkSize))
elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
size, slab->chunkSize);
/*
- * If there are no free chunks in any existing block, create a new block
- * and put it to the last freelist bucket.
- *
- * slab->minFreeChunks == 0 means there are no blocks with free chunks,
- * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+ * Handle the case when there are no partially filled blocks available.
+ * SlabFree() will have updated the curBlocklistIndex, setting it to zero
+ * to indicate that it has freed the final block. Also later in
+ * SlabAlloc() we will set the curBlocklistIndex to zero if we end up
+ * filling the final block.
*/
- if (slab->minFreeChunks == 0)
+ if (unlikely(slab->curBlocklistIndex == 0))
{
- block = (SlabBlock *) malloc(slab->blockSize);
+ dlist_head *blocklist;
+ int blocklist_idx;
+
+ /* to save allocating a new one, first check the empty blocks list */
+ if (dclist_count(&slab->emptyblocks) > 0)
+ {
+ dlist_node *node = dclist_pop_head_node(&slab->emptyblocks);
- if (block == NULL)
- return NULL;
+ block = dlist_container(SlabBlock, node, node);
- block->nfree = slab->chunksPerBlock;
- block->firstFreeChunk = 0;
- block->slab = slab;
+ /*
+ * SlabFree() should have left this block in a valid state with
+ * all chunks free. Ensure that's the case.
+ */
+ Assert(block->nfree == slab->chunksPerBlock);
- /*
- * Put all the chunks on a freelist. Walk the chunks and point each
- * one to the next one.
- */
- for (idx = 0; idx < slab->chunksPerBlock; idx++)
- {
- chunk = SlabBlockGetChunk(slab, block, idx);
- *(int32 *) MemoryChunkGetPointer(chunk) = (idx + 1);
+ /* fetch the next chunk from this block */
+ chunk = SlabGetNextFreeChunk(slab, block);
}
+ else
+ {
+ block = (SlabBlock *) malloc(slab->blockSize);
- /*
- * And add it to the last freelist with all chunks empty.
- *
- * We know there are no blocks in the freelist, otherwise we wouldn't
- * need a new block.
- */
- Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock]));
+ if (unlikely(block == NULL))
+ return NULL;
- dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node);
+ block->slab = slab;
+ context->mem_allocated += slab->blockSize;
- slab->minFreeChunks = slab->chunksPerBlock;
- slab->nblocks += 1;
- context->mem_allocated += slab->blockSize;
- }
+ /* use the first chunk in the new block */
+ chunk = SlabBlockGetChunk(slab, block, 0);
- /* grab the block from the freelist (even the new block is there) */
- block = dlist_head_element(SlabBlock, node,
- &slab->freelist[slab->minFreeChunks]);
+ block->nfree = slab->chunksPerBlock - 1;
+ block->unused = SlabBlockGetChunk(slab, block, 1);
+ block->freehead = NULL;
+ block->nunused = slab->chunksPerBlock - 1;
+ }
- /* make sure we actually got a valid block, with matching nfree */
- Assert(block != NULL);
- Assert(slab->minFreeChunks == block->nfree);
- Assert(block->nfree > 0);
+ /* find the blocklist element for storing blocks with 1 used chunk */
+ blocklist_idx = SlabBlocklistIndex(slab, block->nfree);
+ blocklist = &slab->blocklist[blocklist_idx];
- /* we know index of the first free chunk in the block */
- idx = block->firstFreeChunk;
+ /* this had better be empty; we just added a block on the assumption it was */
+ Assert(dlist_is_empty(blocklist));
- /* make sure the chunk index is valid, and that it's marked as empty */
- Assert((idx >= 0) && (idx < slab->chunksPerBlock));
+ dlist_push_head(blocklist, &block->node);
- /* compute the chunk location block start (after the block header) */
- chunk = SlabBlockGetChunk(slab, block, idx);
+ slab->curBlocklistIndex = blocklist_idx;
+ }
+ else
+ {
+ dlist_head *blocklist = &slab->blocklist[slab->curBlocklistIndex];
+ int new_blocklist_idx;
- /*
- * Update the block nfree count, and also the minFreeChunks as we've
- * decreased nfree for a block with the minimum number of free chunks
- * (because that's how we chose the block).
- */
- block->nfree--;
- slab->minFreeChunks = block->nfree;
+ Assert(!dlist_is_empty(blocklist));
- /*
- * Remove the chunk from the freelist head. The index of the next free
- * chunk is stored in the chunk itself.
- */
- VALGRIND_MAKE_MEM_DEFINED(MemoryChunkGetPointer(chunk), sizeof(int32));
- block->firstFreeChunk = *(int32 *) MemoryChunkGetPointer(chunk);
+ /* grab the block from the blocklist */
+ block = dlist_head_element(SlabBlock, node, blocklist);
- Assert(block->firstFreeChunk >= 0);
- Assert(block->firstFreeChunk <= slab->chunksPerBlock);
+ /* make sure we actually got a valid block, with matching nfree */
+ Assert(block != NULL);
+ Assert(slab->curBlocklistIndex == SlabBlocklistIndex(slab, block->nfree));
+ Assert(block->nfree > 0);
- Assert((block->nfree != 0 &&
- block->firstFreeChunk < slab->chunksPerBlock) ||
- (block->nfree == 0 &&
- block->firstFreeChunk == slab->chunksPerBlock));
+ /* fetch the next chunk from this block */
+ chunk = SlabGetNextFreeChunk(slab, block);
- /* move the whole block to the right place in the freelist */
- dlist_delete(&block->node);
- dlist_push_head(&slab->freelist[block->nfree], &block->node);
+ /* get the new blocklist index based on the new free chunk count */
+ new_blocklist_idx = SlabBlocklistIndex(slab, block->nfree);
- /*
- * And finally update minFreeChunks, i.e. the index to the block with the
- * lowest number of free chunks. We only need to do that when the block
- * got full (otherwise we know the current block is the right one). We'll
- * simply walk the freelist until we find a non-empty entry.
- */
- if (slab->minFreeChunks == 0)
- {
- for (idx = 1; idx <= slab->chunksPerBlock; idx++)
+ /*
+ * Handle the case where the blocklist index changes. This also deals
+ * with blocks becoming full as only full blocks go at index 0.
+ */
+ if (unlikely(slab->curBlocklistIndex != new_blocklist_idx))
{
- if (dlist_is_empty(&slab->freelist[idx]))
- continue;
+ dlist_delete_from(blocklist, &block->node);
+ dlist_push_head(&slab->blocklist[new_blocklist_idx], &block->node);
- /* found a non-empty freelist */
- slab->minFreeChunks = idx;
- break;
+ if (dlist_is_empty(blocklist))
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
}
}
- if (slab->minFreeChunks == slab->chunksPerBlock)
- slab->minFreeChunks = 0;
+ /*
+ * Check that the chunk pointer is actually somewhere on the block and is
+ * aligned as expected.
+ */
+ Assert(chunk >= SlabBlockGetChunk(slab, block, 0));
+ Assert(chunk <= SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1));
+ Assert(SlabChunkMod(slab, block, chunk) == 0);
/* Prepare to initialize the chunk header. */
VALGRIND_MAKE_MEM_UNDEFINED(chunk, Slab_CHUNKHDRSZ);
randomize_mem((char *) MemoryChunkGetPointer(chunk), size);
#endif
- Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
-
return MemoryChunkGetPointer(chunk);
}
MemoryChunk *chunk = PointerGetMemoryChunk(pointer);
SlabBlock *block = MemoryChunkGetBlock(chunk);
SlabContext *slab;
- int idx;
+ int curBlocklistIdx;
+ int newBlocklistIdx;
/*
* For speed reasons we just Assert that the referenced block is good.
slab->header.name, chunk);
#endif
- /* compute index of the chunk with respect to block start */
- idx = SlabChunkIndex(slab, block, chunk);
+ /* push this chunk onto the head of the block's free list */
+ *(MemoryChunk **) pointer = block->freehead;
+ block->freehead = chunk;
- /* add chunk to freelist, and update block nfree count */
- *(int32 *) pointer = block->firstFreeChunk;
- block->firstFreeChunk = idx;
block->nfree++;
Assert(block->nfree > 0);
Assert(block->nfree <= slab->chunksPerBlock);
#ifdef CLOBBER_FREED_MEMORY
- /* XXX don't wipe the int32 index, used for block-level freelist */
- wipe_mem((char *) pointer + sizeof(int32),
- slab->chunkSize - sizeof(int32));
+ /* don't wipe the free list MemoryChunk pointer stored in the chunk */
+ wipe_mem((char *) pointer + sizeof(MemoryChunk *),
+ slab->chunkSize - sizeof(MemoryChunk *));
#endif
- /* remove the block from a freelist */
- dlist_delete(&block->node);
+ curBlocklistIdx = SlabBlocklistIndex(slab, block->nfree - 1);
+ newBlocklistIdx = SlabBlocklistIndex(slab, block->nfree);
/*
- * See if we need to update the minFreeChunks field for the slab - we only
- * need to do that if there the block had that number of free chunks
- * before we freed one. In that case, we check if there still are blocks
- * in the original freelist and we either keep the current value (if there
- * still are blocks) or increment it by one (the new block is still the
- * one with minimum free chunks).
- *
- * The one exception is when the block will get completely free - in that
- * case we will free it, se we can't use it for minFreeChunks. It however
- * means there are no more blocks with free chunks.
+ * Check if the block needs to be moved to another element on the
+ * blocklist based on it now having 1 more free chunk.
*/
- if (slab->minFreeChunks == (block->nfree - 1))
+ if (unlikely(curBlocklistIdx != newBlocklistIdx))
{
- /* Have we removed the last chunk from the freelist? */
- if (dlist_is_empty(&slab->freelist[slab->minFreeChunks]))
+ /* do the move */
+ dlist_delete_from(&slab->blocklist[curBlocklistIdx], &block->node);
+ dlist_push_head(&slab->blocklist[newBlocklistIdx], &block->node);
+
+ /*
+		 * It's possible that there are now no blocks in the blocklist at the
+ * curBlocklistIndex position. When this happens we must find the
+ * next blocklist index which contains blocks. We can be certain
+ * we'll find a block as at least one must exist for the chunk we're
+ * currently freeing.
+ */
+ if (slab->curBlocklistIndex == curBlocklistIdx &&
+ dlist_is_empty(&slab->blocklist[curBlocklistIdx]))
{
- /* but if we made the block entirely free, we'll free it */
- if (block->nfree == slab->chunksPerBlock)
- slab->minFreeChunks = 0;
- else
- slab->minFreeChunks++;
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
+ Assert(slab->curBlocklistIndex > 0);
}
}
- /* If the block is now completely empty, free it. */
- if (block->nfree == slab->chunksPerBlock)
+ /* Handle when a block becomes completely empty */
+ if (unlikely(block->nfree == slab->chunksPerBlock))
{
- free(block);
- slab->nblocks--;
- slab->header.mem_allocated -= slab->blockSize;
- }
- else
- dlist_push_head(&slab->freelist[block->nfree], &block->node);
+ /* remove the block */
+ dlist_delete_from(&slab->blocklist[newBlocklistIdx], &block->node);
+
+ /*
+ * To avoid thrashing malloc/free, we keep a list of empty blocks that
+		 * we can reuse instead of having to malloc a new one.
+ */
+ if (dclist_count(&slab->emptyblocks) < SLAB_MAXIMUM_EMPTY_BLOCKS)
+ dclist_push_head(&slab->emptyblocks, &block->node);
+ else
+ {
+ /*
+ * When we have enough empty blocks stored already, we actually
+ * free the block.
+ */
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ slab->header.mem_allocated -= slab->blockSize;
+ }
- Assert(slab->nblocks >= 0);
- Assert(slab->nblocks * slab->blockSize == slab->header.mem_allocated);
+ /*
+ * Check if we need to reset the blocklist index. This is required
+	 * when the blocklist this block was on has become completely empty.
+ */
+ if (slab->curBlocklistIndex == newBlocklistIdx &&
+ dlist_is_empty(&slab->blocklist[newBlocklistIdx]))
+ slab->curBlocklistIndex = SlabFindNextBlockListIndex(slab);
+ }
}
/*
/*
* SlabIsEmpty
- * Is an Slab empty of any allocated space?
+ * Is the slab empty of any allocated space?
*/
bool
SlabIsEmpty(MemoryContext context)
{
- SlabContext *slab = (SlabContext *) context;
-
- Assert(SlabIsValid(slab));
+ Assert(SlabIsValid((SlabContext *) context));
- return (slab->nblocks == 0);
+ return (context->mem_allocated == 0);
}
/*
Assert(SlabIsValid(slab));
/* Include context header in totalspace */
- totalspace = slab->headerSize;
+ totalspace = Slab_CONTEXT_HDRSZ(slab->chunksPerBlock);
- for (i = 0; i <= slab->chunksPerBlock; i++)
+ /* Add the space consumed by blocks in the emptyblocks list */
+ totalspace += dclist_count(&slab->emptyblocks) * slab->blockSize;
+
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
{
dlist_iter iter;
- dlist_foreach(iter, &slab->freelist[i])
+ dlist_foreach(iter, &slab->blocklist[i])
{
SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
{
char stats_string[200];
+ /* XXX should we include free chunks on empty blocks? */
snprintf(stats_string, sizeof(stats_string),
- "%zu total in %zu blocks; %zu free (%zu chunks); %zu used",
- totalspace, nblocks, freespace, freechunks,
- totalspace - freespace);
+ "%zu total in %zu blocks; %u empty blocks; %zu free (%zu chunks); %zu used",
+ totalspace, nblocks, dclist_count(&slab->emptyblocks),
+ freespace, freechunks, totalspace - freespace);
printfunc(context, passthru, stats_string, print_to_stderr);
}
/*
* SlabCheck
- * Walk through chunks and check consistency of memory.
+ * Walk through all blocks looking for inconsistencies.
*
* NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
* find yourself in an infinite loop when trouble occurs, because this
{
SlabContext *slab = (SlabContext *) context;
int i;
+ int nblocks = 0;
const char *name = slab->header.name;
+ dlist_iter iter;
Assert(SlabIsValid(slab));
Assert(slab->chunksPerBlock > 0);
- /* walk all the freelists */
- for (i = 0; i <= slab->chunksPerBlock; i++)
+ /*
+ * Have a look at the empty blocks. These should have all their chunks
+ * marked as free. Ensure that's the case.
+ */
+ dclist_foreach(iter, &slab->emptyblocks)
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+
+ if (block->nfree != slab->chunksPerBlock)
+ elog(WARNING, "problem in slab %s: empty block %p should have %d free chunks but has %d chunks free",
+ name, block, slab->chunksPerBlock, block->nfree);
+ }
+
+ /* walk the non-empty block lists */
+ for (i = 0; i < SLAB_BLOCKLIST_COUNT; i++)
{
int j,
nfree;
- dlist_iter iter;
- /* walk all blocks on this freelist */
- dlist_foreach(iter, &slab->freelist[i])
+ /* walk all blocks on this blocklist */
+ dlist_foreach(iter, &slab->blocklist[i])
{
- int idx;
SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+ MemoryChunk *cur_chunk;
/*
* Make sure the number of free chunks (in the block header)
- * matches position in the freelist.
+ * matches the position in the blocklist.
*/
- if (block->nfree != i)
- elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
- name, block->nfree, block, i);
+ if (SlabBlocklistIndex(slab, block->nfree) != i)
+ elog(WARNING, "problem in slab %s: block %p is on blocklist %d but should be on blocklist %d",
+ name, block, i, SlabBlocklistIndex(slab, block->nfree));
+
+ /* make sure the block is not empty */
+ if (block->nfree >= slab->chunksPerBlock)
+ elog(WARNING, "problem in slab %s: empty block %p incorrectly stored on blocklist element %d",
+ name, block, i);
/* make sure the slab pointer correctly points to this context */
if (block->slab != slab)
elog(WARNING, "problem in slab %s: bogus slab link in block %p",
name, block);
- /* reset the bitmap of free chunks for this block */
- memset(slab->freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
- idx = block->firstFreeChunk;
+ /* reset the array of free chunks for this block */
+ memset(slab->isChunkFree, 0, (slab->chunksPerBlock * sizeof(bool)));
+ nfree = 0;
+
+ /* walk through the block's free list chunks */
+ cur_chunk = block->freehead;
+ while (cur_chunk != NULL)
+ {
+ int chunkidx = SlabChunkIndex(slab, block, cur_chunk);
+
+ /*
+ * Ensure the free list link points to something on the block
+ * at an address aligned according to the full chunk size.
+ */
+ if (cur_chunk < SlabBlockGetChunk(slab, block, 0) ||
+ cur_chunk > SlabBlockGetChunk(slab, block, slab->chunksPerBlock - 1) ||
+ SlabChunkMod(slab, block, cur_chunk) != 0)
+ elog(WARNING, "problem in slab %s: bogus free list link %p in block %p",
+ name, cur_chunk, block);
+
+			/* count the chunk and mark it free in the free chunk array */
+ nfree++;
+ slab->isChunkFree[chunkidx] = true;
+
+ /* read pointer of the next free chunk */
+ VALGRIND_MAKE_MEM_DEFINED(MemoryChunkGetPointer(cur_chunk), sizeof(MemoryChunk *));
+ cur_chunk = *(MemoryChunk **) SlabChunkGetPointer(cur_chunk);
+ }
+
+ /* check that the unused pointer matches what nunused claims */
+ if (SlabBlockGetChunk(slab, block, slab->chunksPerBlock - block->nunused) !=
+ block->unused)
+ elog(WARNING, "problem in slab %s: mismatch detected between nunused chunks and unused pointer in block %p",
+ name, block);
/*
- * Now walk through the chunks, count the free ones and also
- * perform some additional checks for the used ones. As the chunk
- * freelist is stored within the chunks themselves, we have to
- * walk through the chunks and construct our own bitmap.
+			 * Count the remaining free chunks that have yet to make it onto
+ * the block's free list.
*/
-
- nfree = 0;
- while (idx < slab->chunksPerBlock)
+ cur_chunk = block->unused;
+ for (j = 0; j < block->nunused; j++)
{
- MemoryChunk *chunk;
+ int chunkidx = SlabChunkIndex(slab, block, cur_chunk);
+
- /* count the chunk as free, add it to the bitmap */
+			/* count the chunk as free and mark it as such in the array */
nfree++;
- slab->freechunks[idx] = true;
+ if (chunkidx < slab->chunksPerBlock)
+ slab->isChunkFree[chunkidx] = true;
- /* read index of the next free chunk */
- chunk = SlabBlockGetChunk(slab, block, idx);
- VALGRIND_MAKE_MEM_DEFINED(MemoryChunkGetPointer(chunk), sizeof(int32));
- idx = *(int32 *) MemoryChunkGetPointer(chunk);
+ /* move forward 1 chunk */
+ cur_chunk = (MemoryChunk *) (((char *) cur_chunk) + slab->fullChunkSize);
}
for (j = 0; j < slab->chunksPerBlock; j++)
{
- /* non-zero bit in the bitmap means chunk the chunk is used */
- if (!slab->freechunks[j])
+ if (!slab->isChunkFree[j])
{
MemoryChunk *chunk = SlabBlockGetChunk(slab, block, j);
SlabBlock *chunkblock = (SlabBlock *) MemoryChunkGetBlock(chunk);
* in the block header).
*/
if (nfree != block->nfree)
- elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
- name, block->nfree, block, nfree);
+			elog(WARNING, "problem in slab %s: nfree in block %p is %d but %d chunks were found as free",
+ name, block, block->nfree, nfree);
+
+ nblocks++;
}
}
- Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
+ /* the stored empty blocks are tracked in mem_allocated too */
+ nblocks += dclist_count(&slab->emptyblocks);
+
+ Assert(nblocks * slab->blockSize == context->mem_allocated);
}
#endif /* MEMORY_CONTEXT_CHECKING */