Start of work on GC.
authorRobert Haas <[email protected]>
Sat, 14 Jul 2012 05:44:22 +0000 (01:44 -0400)
committerRobert Haas <[email protected]>
Tue, 24 Jul 2012 16:31:53 +0000 (12:31 -0400)
src/backend/utils/hash/chash.c

index 50fdafca430af2b8052dab12d244ad00757e7ddf..0bcd576cd5512d65157af67ef23076c1ef543c1a 100644 (file)
@@ -142,6 +142,8 @@ typedef struct CHashTableData
        uint32                  bucket_mask;    /* # of buckets, minus one */
        uint32                  garbage_shift;  /* log2(nbuckets/ngarbage) */
        uint32                  ngarbage;               /* # of garbage lists, a power of two */
+       uint32                  gc_next;                /* next garbage list to reclaim */
+       int                             gc_pid;                 /* PID that set gc_next */
        uint32                  nfreelists;             /* # of freelists */
        uint32                  arena_limit;    /* # of arena elements */
        uint32                  arena_stride;   /* bytes allocated per arena element */
@@ -155,6 +157,25 @@ typedef struct CHashTableData
        (AssertMacro((offset) < (table)->arena_limit), \
         (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
 
+/*
+ * We need a memory barrier to make sure that our bucket advertisement is
+ * committed to memory before we begin scanning the bucket, and another one
+ * to make sure the scan is done before we cease advertising it.
+ */
+#define CHashTableSuppressGC(table, bno) \
+       do { \
+               Assert(MyProc->chash_bucket == 0); \
+               MyProc->chash_bucket = \
+                       ((uint64) table->desc.id)<<32 | (bucket>>table->garbage_shift); \
+               pg_memory_barrier(); \
+       } while (0)
+#define CHashTableUnsuppressGC() \
+       do { \
+               Assert(MyProc->chash_bucket != 0); \
+               pg_memory_barrier(); \
+               MyProc->chash_bucket = 0; \
+       } while (0)
+
 /*
  * First stage of CHashTable initialization.  We fill in all the constants
  * here, but not the pointers.
@@ -195,6 +216,8 @@ CHashBootstrap(CHashDescriptor *desc)
         */
        table->garbage_shift = Min(bucket_shift, 6);
        table->ngarbage = table->nbuckets >> table->garbage_shift;
+       table->gc_next = 0;
+       table->gc_pid = 0;
 
        /*
         * The number of freelists must be large enough to avoid contention;
@@ -341,13 +364,8 @@ CHashSearch(CHashTable table, void *entry)
        CHashNode  *n;
        bool    found = false;
 
-       /*
-        * Suppress garbage collection for target bucket.  We need a memory
-        * barrier to make sure that our bucket advertisement is committed to
-        * memory before we begin scanning the bucket.
-        */
-       MyProc->chash_bucket = ((uint64) table->desc.id) << 32 | bucket;
-       pg_memory_barrier();
+       /* Suppress garbage collection for target bucket. */
+       CHashTableSuppressGC(table, bucket);
 
        /* Scan bucket. */
        c = table->bucket[bucket].head;
@@ -367,7 +385,7 @@ CHashSearch(CHashTable table, void *entry)
                pg_read_barrier_depends();
 
                /* Compare current node by hashcode, then by memcmp. */
-               n = CHashTableGetNode(table, c);
+               n = CHashTableGetNode(table, CHashPtrGetOffset(c));
                if (n->hashcode == hashcode)
                        cmp = memcmp(CHashNodeGetItem(n), entry, table->desc.key_size);
                else if (n->hashcode > hashcode)
@@ -406,15 +424,129 @@ CHashSearch(CHashTable table, void *entry)
                }
        }
 
-       /*
-        * Once we're done scanning the bucket, we can permit garbage collection
-        * of that bucket to resume.  The memory barrier ensures that all the reads
-        * that are part of the bucket scan finish before we allow garbage
-        * collection.
-        */
-       pg_memory_barrier();
-       MyProc->chash_bucket = 0;
+       /* Done scanning bucket. */
+       CHashTableUnsuppressGC();
 
        /* Return result to caller. */
        return found;
 }
+
+/*
+ * Insert into a concurrent hash table.  entry should be the complete entry
+ * to be inserted.  If no duplicate entry is found in the table, this function
+ * will insert the entry and return true.  Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could have undesirable
+ * latency characteristics, and might possibly render free-lists thoroughly
+ * worthless from the point of view of contention avoidance.  Instead, we
+ * check free lists and garbage lists in alternation.  If we find a non-empty
+ * free list, we allocate from it; if we find a non-empty garbage list, we
+ * garbage collect it and put the contents on our free list.
+ *
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin among all the garbage lists.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+       uint32          f_home = ((uint32) MyBackendId) % table->nfreelists;
+       uint32          f_current = f_home;
+       CHashPtr        new;
+       CHashPtr        garbage;
+       volatile CHashTable vtable = table;
+
+       /* If this process hasn't initialized gc_next yet, do that now. */
+       if (table->gc_pid != MyProcPid)
+       {
+               table->gc_pid = MyProcPid;
+               table->gc_next = ((uint32) MyProcPid) % table->ngarbage;
+       }
+
+       /* Loop until we allocate a buffer. */
+       for (;;)
+       {
+               /*
+                * Check one freelist for an available arena slot.  To minimize
+                * spinlock traffic, we do an unlocked test first.  We must recheck
+                * after acquiring the spinlock.
+                */
+               if (vtable->freelist[f_current].head != InvalidCHashPtr)
+               {
+                       SpinLockAcquire(&vtable->freelist[f_current].mutex);
+                       new = vtable->freelist[f_current].head;
+                       if (new != InvalidCHashPtr)
+                       {
+                               CHashNode          *n = CHashTableGetNode(table, new);
+                               vtable->freelist[f_current].head = n->next;
+                               SpinLockRelease(&vtable->freelist[f_current].mutex);
+                               return new;
+                       }
+                       SpinLockRelease(&vtable->freelist[f_current].mutex);
+               }
+
+               /*
+                * Check the next garbage list for recyclable buffers.  If we
+                * find any, try to garbage collect them.
+                */
+               table->gc_next = (table->gc_next + 1) % table->ngarbage;
+               if (vtable->garbage[table->gc_next].head != InvalidCHashPtr)
+               {
+                       volatile CHashBucket *b = &vtable->freelist[f_current];
+                       uint32          i;
+                       uint64          chash_bucket;
+
+                       /* Pop garbage off list. */
+                       SpinLockAcquire(&b->mutex);
+                       garbage = b->head;
+                       b->head = InvalidCHashPtr;
+                       SpinLockRelease(&b->mutex);
+
+                       /* Anything to recycle? */
+                       if (garbage != InvalidCHashPtr)
+                       {
+                               /*
+                                * Be certain that the writes associated with popping the
+                                * garbage list are complete before we start checking whether
+                                * the garbage is recycleable.
+                                */
+                               pg_memory_barrier();
+
+                               /*
+                                * Spin until garbage is recyclable.  We could have a "soft"
+                                * version of this that merely requeues the garbage if it's not
+                                * immediately recycleable, but it's not clear that we need
+                                * such a thing.  On the flip side we might want to eventually
+                                * enter a longer sleep here, or PANIC, but it's not clear
+                                * exactly how to calibrate that, either.
+                                */
+                               chash_bucket = ((uint64) table->desc.id)<<32 | table->gc_next;
+                       for (i = 0; i < ProcGlobal->allProcCount; i++)
+                               {
+                                       PGPROC     *proc = &ProcGlobal->allProcs[i];
+
+                                       while (proc->chash_bucket == chash_bucket)
+                                               ;
+                               }
+
+                               /*
+                                * Be certain that all prior reads are done before starting
+                                * the next batch of writes.
+                                */
+                               pg_memory_barrier();
+
+                               /* XXX.  Recycle garbage here! */
+                       }
+               }
+       }
+}