Don't rename recovery.conf out of the way until the first checkpoint, like
authorHeikki Linnakangas <[email protected]>
Wed, 4 Feb 2009 18:15:44 +0000 (20:15 +0200)
committerHeikki Linnakangas <[email protected]>
Wed, 4 Feb 2009 18:15:44 +0000 (20:15 +0200)
it was done in Simon's original patch. And fix some other comments pointed out
by Simon.

src/backend/access/transam/xlog.c
src/include/access/xlog.h

index 79505a01452e58368f33d69f179fdf936544831a..961bcf3c439f1a547a4ec4436aeac2ff5db352da 100644 (file)
@@ -1765,6 +1765,13 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
        SpinLockRelease(&xlogctl->info_lck);
 }
 
+/*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before
+ * the database is consistent. If minRecoveryPoint is already greater than
+ * or equal to 'lsn', it is not updated.
+ */
 static void
 UpdateMinRecoveryPoint(XLogRecPtr lsn)
 {
@@ -1772,14 +1779,6 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn)
        if (!updateMinRecoveryPoint || XLByteLE(lsn, minRecoveryPoint))
                return;
 
-       /* XXX
-        * Calculate and write out a new safeStartPoint. This defines
-        * the latest LSN that might appear on-disk while we apply
-        * the WAL records in this file. If we crash during recovery
-        * we must reach this point again before we can prove
-        * database consistency. Not a restartpoint! Restart points
-        * define where we should start recovery from, if we crash.
-        */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
        /* update local copy */
@@ -1797,21 +1796,22 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn)
                volatile XLogCtlData *xlogctl = XLogCtl;
 
                /*
-                * We need to update the control file. To avoid having to update it
-                * too often, we update it all the way to EndRecPtr, even though 'lsn'
+                * To avoid having to update the control file too often, we update
+                * it all the way to the last record being replayed, even though 'lsn'
                 * would suffice for correctness.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                minRecoveryPoint = xlogctl->replayEndRecPtr;
                SpinLockRelease(&xlogctl->info_lck);
 
+               /* update control file */
                ControlFile->minRecoveryPoint = minRecoveryPoint;
                UpdateControlFile();
+
+               elog(DEBUG2, "updated min recovery point to %X/%X",
+                        minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff);
        }
        LWLockRelease(ControlFileLock);
-       
-       elog(LOG, "updated min recovery point to %X/%X",
-                minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff);
 }
 
 /*
@@ -4835,18 +4835,13 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
        unlink(recoveryPath);           /* ignore any error */
 
        /*
-        * Rename the config file out of the way, so that we don't accidentally
-        * re-enter archive recovery mode in a subsequent crash. We have already
-        * restored all the WAL segments we need from the archive, and we trust
-        * that they are not going to go away even if we crash. (XXX: should
-        * we fsync() them all to ensure that?)
+        * As of 8.4 we no longer rename the recovery.conf file out of the
+        * way until after we have performed a full checkpoint. This ensures
+        * that any crash between now and the end of the checkpoint does not
+        * attempt to restart from a WAL file that is no longer available to us.
+        * As soon as we remove recovery.conf we lose our restore_command and
+        * cannot reaccess WAL files from the archive.
         */
-       unlink(RECOVERY_COMMAND_DONE);
-       if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
-               ereport(FATAL,
-                               (errcode_for_file_access(),
-                                errmsg("could not rename file \"%s\" to \"%s\": %m",
-                                               RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
 
        ereport(LOG,
                        (errmsg("archive recovery complete")));
@@ -5279,16 +5274,23 @@ StartupXLOG(void)
                        /* use volatile pointer to prevent code rearrangement */
                        volatile XLogCtlData *xlogctl = XLogCtl;
 
-                       InRedo = true;
-                       ereport(LOG,
-                                       (errmsg("redo starts at %X/%X",
-                                                       ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
-
                        /* Update shared copy of replayEndRecPtr */
                        SpinLockAcquire(&xlogctl->info_lck);
                        xlogctl->replayEndRecPtr = ReadRecPtr;
                        SpinLockRelease(&xlogctl->info_lck);
 
+                       InRedo = true;
+
+                       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X",
+                                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+                       else
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
+                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+
                        /*
                         * Let postmaster know we've started redo now.
                         *
@@ -5355,8 +5357,7 @@ StartupXLOG(void)
                                        if (InArchiveRecovery)
                                        {
                                                ereport(LOG,
-                                                       (errmsg("consistent recovery state reached at %X/%X",
-                                                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+                                                               (errmsg("consistent recovery state reached")));
                                                if (IsUnderPostmaster)
                                                        SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
                                        }
@@ -6029,6 +6030,7 @@ CreateCheckPoint(int flags)
        uint32          _logSeg;
        TransactionId *inCommitXids;
        int                     nInCommit;
+       bool            leavingArchiveRecovery;
 
        /* shouldn't happen */
        if (IsRecoveryProcessingMode())
@@ -6042,6 +6044,13 @@ CreateCheckPoint(int flags)
         */
        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
+       /*
+        * Find out if this is the first checkpoint after archive recovery.
+        */
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+       leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
+       LWLockRelease(ControlFileLock);
+
        /*
         * Prepare to accumulate statistics.
         *
@@ -6286,6 +6295,21 @@ CreateCheckPoint(int flags)
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
+       if (leavingArchiveRecovery)
+       {
+               /*
+                * Rename the config file out of the way, so that we don't accidentally
+                * re-enter archive recovery mode in a subsequent crash. Prior to
+                * 8.4 this step was performed at end of exitArchiveRecovery().
+                */
+               unlink(RECOVERY_COMMAND_DONE);
+               if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not rename file \"%s\" to \"%s\": %m",
+                                                       RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+       }
+
        /* Update shared-memory copy of checkpoint XID/epoch */
        {
                /* use volatile pointer to prevent code rearrangement */
@@ -6329,8 +6353,7 @@ CreateCheckPoint(int flags)
         * Truncate pg_subtrans if possible.  We can throw away all data before
         * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).      During recovery, though, we mustn't do this because
-        * StartupSUBTRANS hasn't been called yet.
+        * in subtrans.c).
         */
        TruncateSUBTRANS(GetOldestXmin(true, false));
 
index c3b3ec7ee1e95207288011c7386da0ab1221e98f..b97a6afbf0bb9a5804f4100eaee387272f8e78f8 100644 (file)
@@ -168,9 +168,9 @@ extern bool XLOG_DEBUG;
 
 /* These directly affect the behavior of CreateCheckPoint and subsidiaries */
 #define CHECKPOINT_IS_SHUTDOWN 0x0001  /* Checkpoint is for shutdown */
-#define CHECKPOINT_IMMEDIATE   0x0002  /* Do it without delays */
-#define CHECKPOINT_FORCE               0x0004  /* Force even if no activity */
-#define CHECKPOINT_STARTUP             0x0008  /* Startup checkpoint */
+#define CHECKPOINT_IS_STARTUP  0x0002  /* Startup checkpoint */
+#define CHECKPOINT_IMMEDIATE   0x0004  /* Do it without delays */
+#define CHECKPOINT_FORCE               0x0008  /* Force even if no activity */
 /* These are important to RequestCheckpoint */
 #define CHECKPOINT_WAIT                        0x0010  /* Wait for completion */
 /* These indicate the cause of a checkpoint request */