Fix ABA problem with inserts.
authorRobert Haas <[email protected]>
Wed, 1 Aug 2012 16:51:09 +0000 (16:51 +0000)
committerRobert Haas <[email protected]>
Wed, 1 Aug 2012 16:51:09 +0000 (16:51 +0000)
src/backend/utils/hash/chash.c

index 83e514689ed2865eef02cccfdfdc3c74c0f063df..dba906586e276eb1318ada5af2a571abe62e82b3 100644 (file)
@@ -186,7 +186,6 @@ typedef struct
 /* Function prototypes. */
 static CHashPtr CHashAllocate(CHashTable table);
 static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
-static void CHashImmediateFree(CHashTable table, CHashPtr c);
 static void CHashBucketScan(CHashTable table,
                                CHashPtr *start,
                                uint32 hashcode,
@@ -426,7 +425,6 @@ CHashInsert(CHashTable table, void *entry)
        memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
 
        /* Prevent garbage collection for this bucket. */
-       Assert(MyProc->hazard[0] == NULL);
        MyProc->hazard[0] = b;
        pg_memory_barrier();
 
@@ -473,9 +471,22 @@ retry:
        pg_memory_barrier();
        MyProc->hazard[0] = NULL;
 
-       /* If the insert failed, free the entry we allocated. */
+       /*
+        * If the insert failed, add the entry we found to the appropriate
+        * garbage list.  We can't simply put this back on the freelist,
+        * because that leads to an ABA problem: some other backend could
+        * read the head of the freelist, go into the tank, and then use
+        * CAS to pop an element.  If at that point, it finds the same
+        * element at the head of the freelist but with a different successor,
+        * we're hosed.  To prevent that, we ensure that elements are added
+        * to a given freelist only after verifying that any allocations in
+        * progress at the time we popped the freelist has completed.  This
+        * guarantees that any allocation still in progress at the time this
+        * element makes it back to the freelist is trying to allocate some
+        * other node.
+        */
        if (scan.found)
-               CHashImmediateFree(table, new);
+               CHashAddToGarbage(table, bucket, new);
 
        /* The insert succeeded if and only if no duplicate was found. */
        return !scan.found;
@@ -845,8 +856,19 @@ CHashAllocate(CHashTable table)
        {
                CHashPtr  *b;
 
-               /* Try to pop a buffer from a freelist using compare-and-swap. */
+               /*
+                * Try to pop a buffer from a freelist using compare-and-swap.
+                *
+                * Note that this is only safe if it's impossible for the next pointer
+                * of the first element on the list to change between the time when
+                * we read it and the time we use CAS to pop it off the list.  This
+                * means that we can't allow any element that is currently on this
+                * freelist to be allocated, marked as garbage, garbage collected,
+                * and returned back to this freelist before we finish the CAS.
+                */
                b = &table->freelist[f_current];
+               MyProc->hazard[0] = b;
+               pg_memory_barrier();
                new = *b;
                if (!CHashPtrIsInvalid(new))
                {
@@ -876,12 +898,20 @@ CHashAllocate(CHashTable table)
                        uint32          i;
                        CHashPtr        fhead;
                        CHashNode *n;
+                       CHashPtr   *fh = &table->freelist[f_home];
 
                        /*
-                        * Spin until garbage is recyclable.  We can't begin this operation
-                        * until the clearing of the garbage list has been committed to
-                        * memory, but since that was done using an atomic operation no
-                        * explicit barrier is needed here.
+                        * Before removing elements removed from the garbage list to the
+                        * freelist, we must wait until (1) all bucket scans that might
+                        * still see elements on the freelist as part of the bucket chain
+                        * have completed and (2) all allocations that might see an old
+                        * version of the freelist containing one of the elements to be
+                        * garbage collected have completed.
+                        *
+                        * Note: We can't begin this operation until the clearing of the
+                        * garbage list has been committed to memory, but since that was
+                        * done using an atomic operation no explicit barrier is needed
+                        * here.
                         *
                         * Note: We could have a "soft" version of this that merely
                         * requeues the garbage if it's not immediately recycleable, but
@@ -889,12 +919,16 @@ CHashAllocate(CHashTable table)
                         * might want to eventually enter a longer sleep here, or PANIC,
                         * but it's not clear exactly how to calibrate that.
                         */
+                       MyProc->hazard[0] = NULL;
                        for (i = 0; i < ProcGlobal->allProcCount; i++)
                        {
-                               PGPROC     *proc = &ProcGlobal->allProcs[i];
+                               volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+                               void       *hazard;
 
-                               while (proc->hazard[0] == b)
-                                       ;
+                               do
+                               {
+                                       hazard = proc->hazard[0];
+                               } while (hazard == b || hazard == fh);
                        }
 
                        /* Remove one item from list to satisfy current allocation. */
@@ -922,12 +956,11 @@ CHashAllocate(CHashTable table)
                                }
 
                                /* Push reclaimed elements onto home free list. */
-                               b = &table->freelist[f_home];
                                for (;;)
                                {
-                                       oldhead = *b;
+                                       oldhead = *fh;
                                        n->un.gcnext = oldhead;
-                                       if (__sync_bool_compare_and_swap(b, oldhead, fhead))
+                                       if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
                                                break;
                                        table->stats.s_gc_reclaim_retry++;
                                }
@@ -973,33 +1006,3 @@ CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
                table->stats.s_garbage_retry++;
        }
 }
-
-/*
- * Free an arena slot immediately.
- *
- * When a slot that's actually in use is freed, we can't use this routine,
- * because a concurrent bucket scan might have a private pointer to the
- * object.  However, if an insert fails due to a duplicate key, then we need
- * to put it back on the free list immediately.
- */
-static void
-CHashImmediateFree(CHashTable table, CHashPtr c)
-{
-       CHashNode  *n;
-       CHashPtr   *free;
-       uint32          f_home;
-       CHashPtr        f;
-
-       f_home = ((uint32) MyBackendId) % table->nfreelists;
-       n = CHashTableGetNode(table, c);
-       free = &table->freelist[f_home];
-
-       for (;;)
-       {
-               f = *free;
-               n->un.gcnext = f;
-               if (__sync_bool_compare_and_swap(free, f, c))
-                       break;
-               table->stats.s_free_retry++;
-       }
-}