Start background writer during archive recovery. Background writer now performs

author Heikki Linnakangas <[email protected]>

Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)

committer Heikki Linnakangas <[email protected]>

Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)
author Heikki Linnakangas <[email protected]>
Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)
committer Heikki Linnakangas <[email protected]>
Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index bb70fb422c719becc80c1311d14d0c4c9c9e3cca..97fb14868a44f9c0960860506f4fb38f62195596 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -36,6 +36,7 @@
  #include "catalog/pg_control.h"
  #include "catalog/pg_type.h"
  #include "funcapi.h"
+#include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
@@ -47,6 +48,7 @@
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
+#include "utils/flatfiles.h"
  #include "utils/guc.h"
  #include "utils/ps_status.h"
  #include "pg_trace.h"
@@ -119,12 +121,27 @@ CheckpointStatsData CheckpointStats;
   */
  TimeLineID     ThisTimeLineID = 0;
  
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG? 
+ *
+ * This is only ever true in the startup process, even if the system is still
+ * in recovery. Prior to 8.4, all activity during recovery was carried out
+ * by the startup process. This local variable continues to be used in functions
+ * that need to act differently when called from a redo function (e.g. skip
+ * WAL logging). To check whether the system is in recovery regardless of what
+ * process you're running in, use RecoveryInProgress().
+ */
  bool           InRecovery = false;
  
  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;
  
+/*
+ * Local copy of SharedRecoveryInProgress variable. True actually means "not
+ * known, need to check the shared state"
+ */
+static bool LocalRecoveryInProgress = true;
+
  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;
  
@@ -133,7 +150,6 @@ static char *recoveryRestoreCommand = NULL;
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
-static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
@@ -242,9 +258,8 @@ static XLogRecPtr RedoRecPtr;
   * ControlFileLock: must be held to read/update control file or create
   * new log file.
   *
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
+ * only one checkpointer at a time)
   *
   *----------
   */
@@ -313,6 +328,25 @@ typedef struct XLogCtlData
         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
         TimeLineID      ThisTimeLineID;
  
+       /*
+        * SharedRecoveryInProgress indicates if we're still in crash or archive
+        * recovery.  It's checked by RecoveryInProgress().
+        */
+       bool            SharedRecoveryInProgress;
+
+       /*
+        * During recovery, we keep a copy of the latest checkpoint record
+        * here.  Used by the background writer when it wants to create
+        * a restartpoint.
+        *
+        * Protected by info_lck.
+        */
+       XLogRecPtr      lastCheckPointRecPtr;
+       CheckPoint      lastCheckPoint;
+
+       /* end+1 of the last record replayed (or being replayed) */
+       XLogRecPtr      replayEndRecPtr;
+
         slock_t         info_lck;               /* locks shared variables shown above */
  } XLogCtlData;
  
@@ -387,9 +421,21 @@ static XLogRecPtr ReadRecPtr;      /* start of last record read */
  static XLogRecPtr EndRecPtr;   /* end+1 of last record read */
  static XLogRecord *nextRecord = NULL;
  static TimeLineID lastPageTLI = 0;
+static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+static bool    updateMinRecoveryPoint = true;
  
  static bool InRedo = false;
  
+/*
+ * Flag set by interrupt handlers for later service in the redo loop.
+ */
+static volatile sig_atomic_t shutdown_requested = false;
+/*
+ * Flag set when executing a restore command, to tell SIGTERM signal handler
+ * that it's safe to just proc_exit(0).
+ */
+static volatile sig_atomic_t in_restore_command = false;
+
  
  static void XLogArchiveNotify(const char *xlog);
  static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
@@ -420,6 +466,7 @@ static void PreallocXlogFiles(XLogRecPtr endptr);
  static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
  static void ValidateXLOGDirectoryStructure(void);
  static void CleanupBackupHistory(void);
+static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
  static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
  static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
  static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
@@ -484,6 +531,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
         bool            doPageWrites;
         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
  
+       /* cross-check on whether we should be here or not */
+       if (RecoveryInProgress())
+               elog(FATAL, "cannot make new WAL entries during recovery");
+
         /* info's high bits are reserved for use by me */
         if (info & XLR_INFO_MASK)
                 elog(PANIC, "invalid xlog info mask %02X", info);
@@ -1717,6 +1768,63 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
         SpinLockRelease(&xlogctl->info_lck);
  }
  
+/*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent. 
+ * 
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is only updated if it's not already greater than or equal to 'lsn'.
+ */
+static void
+UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+{
+       /* Quick check using our local copy of the variable */
+       if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+               return;
+
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+       /* update local copy */
+       minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+       /*
+        * An invalid minRecoveryPoint means that we need to recover all the WAL,
+        * ie. crash recovery. Don't update the control file in that case.
+        */
+       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+               updateMinRecoveryPoint = false;
+       else if (force || XLByteLT(minRecoveryPoint, lsn))
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+               XLogRecPtr newMinRecoveryPoint;
+
+               /*
+                * To avoid having to update the control file too often, we update it
+                * all the way to the last record being replayed, even though 'lsn'
+                * would suffice for correctness.
+                */
+               SpinLockAcquire(&xlogctl->info_lck);
+               newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+               SpinLockRelease(&xlogctl->info_lck);
+
+               /* update control file */
+               if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+               {
+                       ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+                       UpdateControlFile();
+                       minRecoveryPoint = newMinRecoveryPoint;
+
+                       ereport(DEBUG2,
+                                       (errmsg("updated min recovery point to %X/%X",
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+               }
+       }
+       LWLockRelease(ControlFileLock);
+}
+
  /*
   * Ensure that all XLOG data through the given position is flushed to disk.
   *
@@ -1729,9 +1837,15 @@ XLogFlush(XLogRecPtr record)
         XLogRecPtr      WriteRqstPtr;
         XLogwrtRqst WriteRqst;
  
-       /* Disabled during REDO */
-       if (InRedo)
+       /*
+        * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
+        * instead.
+        */
+       if (RecoveryInProgress())
+       {
+               UpdateMinRecoveryPoint(record, false);
                 return;
+       }
  
         /* Quick exit if already known flushed */
         if (XLByteLE(record, LogwrtResult.Flush))
@@ -1818,9 +1932,9 @@ XLogFlush(XLogRecPtr record)
          * the bad page is encountered again during recovery then we would be
          * unable to restart the database at all!  (This scenario has actually
          * happened in the field several times with 7.1 releases. Note that we
-        * cannot get here while InRedo is true, but if the bad page is brought in
-        * and marked dirty during recovery then CreateCheckPoint will try to
-        * flush it at the end of recovery.)
+        * cannot get here while RecoveryInProgress(), but if the bad page is
+        * brought in and marked dirty during recovery, then a checkpoint
+        * performed at the end of recovery will try to flush it.)
          *
          * The current approach is to ERROR under normal conditions, but only
          * WARNING during recovery, so that the system can be brought up even if
@@ -1857,6 +1971,10 @@ XLogBackgroundFlush(void)
         XLogRecPtr      WriteRqstPtr;
         bool            flexible = true;
  
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return;
+
         /* read LogwrtResult and update local state */
         {
                 /* use volatile pointer to prevent code rearrangement */
@@ -1928,6 +2046,10 @@ XLogAsyncCommitFlush(void)
         /* use volatile pointer to prevent code rearrangement */
         volatile XLogCtlData *xlogctl = XLogCtl;
  
+       /* There are no asynchronously committed transactions during recovery */
+       if (RecoveryInProgress())
+               return;
+
         SpinLockAcquire(&xlogctl->info_lck);
         WriteRqstPtr = xlogctl->asyncCommitLSN;
         SpinLockRelease(&xlogctl->info_lck);
@@ -1944,6 +2066,10 @@ XLogAsyncCommitFlush(void)
  bool
  XLogNeedsFlush(XLogRecPtr record)
  {
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return false;
+
         /* Quick exit if already known flushed */
         if (XLByteLE(record, LogwrtResult.Flush))
                 return false;
@@ -2618,10 +2744,23 @@ RestoreArchivedFile(char *path, const char *xlogfname,
                         (errmsg_internal("executing restore command \"%s\"",
                                                          xlogRestoreCmd)));
  
+       /*
+        * Set in_restore_command to tell the signal handler that we should exit
+        * right away on SIGTERM. We know that we're in a safe point to do that.
+        * Check if we had already received the signal, so that we don't miss a
+        * shutdown request received just before this.
+        */
+       in_restore_command = true;
+       if (shutdown_requested)
+               proc_exit(0);
+
         /*
          * Copy xlog from archival storage to XLOGDIR
          */
         rc = system(xlogRestoreCmd);
+
+       in_restore_command = false;
+
         if (rc == 0)
         {
                 /*
@@ -2674,14 +2813,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
          * assume that recovery is complete and start up the database!) It's
          * essential to abort on child SIGINT and SIGQUIT, because per spec
          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
-        * those it's a good bet we should have gotten it too.  Aborting on other
-        * signals such as SIGTERM seems a good idea as well.
+        * those it's a good bet we should have gotten it too.
+        *
+        * On SIGTERM, assume we have received a fast shutdown request, and exit
+        * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+        * child process. If we receive it first, the signal handler will call
+        * proc_exit(0), otherwise we do it here. If we or the child process
+        * received SIGTERM for any other reason than a fast shutdown request,
+        * postmaster will perform an immediate shutdown when it sees us exiting
+        * unexpectedly.
          *
          * Per the Single Unix Spec, shells report exit status > 128 when a called
          * command died on a signal.  Also, 126 and 127 are used to report
          * problems such as an unfindable command; treat those as fatal errors
          * too.
          */
+       if (WTERMSIG(rc) == SIGTERM)
+               proc_exit(0);
+
         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
  
         ereport(signaled ? FATAL : DEBUG2,
@@ -4584,18 +4733,6 @@ readRecoveryCommandFile(void)
                         ereport(LOG,
                                         (errmsg("recovery_target_inclusive = %s", tok2)));
                 }
-               else if (strcmp(tok1, "log_restartpoints") == 0)
-               {
-                       /*
-                        * does nothing if a recovery_target is not also set
-                        */
-                       if (!parse_bool(tok2, &recoveryLogRestartpoints))
-                                 ereport(ERROR,
-                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                         errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
-                       ereport(LOG,
-                                       (errmsg("log_restartpoints = %s", tok2)));
-               }
                 else
                         ereport(FATAL,
                                         (errmsg("unrecognized recovery parameter \"%s\"",
@@ -4877,7 +5014,7 @@ StartupXLOG(void)
         XLogRecPtr      RecPtr,
                                 LastRec,
                                 checkPointLoc,
-                               minRecoveryLoc,
+                               backupStopLoc,
                                 EndOfLog;
         uint32          endLogId;
         uint32          endLogSeg;
@@ -4885,6 +5022,8 @@ StartupXLOG(void)
         uint32          freespace;
         TransactionId oldestActiveXID;
  
+       XLogCtl->SharedRecoveryInProgress = true;
+
         /*
          * Read control file and check XLOG status looks valid.
          *
@@ -4964,7 +5103,7 @@ StartupXLOG(void)
                                                 recoveryTargetTLI,
                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
  
-       if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
+       if (read_backup_label(&checkPointLoc, &backupStopLoc))
         {
                 /*
                  * When a backup_label file is present, we want to roll forward from
@@ -5102,11 +5241,23 @@ StartupXLOG(void)
                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
                 ControlFile->checkPoint = checkPointLoc;
                 ControlFile->checkPointCopy = checkPoint;
-               if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
-                       ControlFile->minRecoveryPoint = minRecoveryLoc;
+               if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
+               {
+                       if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
+                               ControlFile->minRecoveryPoint = backupStopLoc;
+               }
                 ControlFile->time = (pg_time_t) time(NULL);
+               /* No need to hold ControlFileLock yet, we aren't up far enough */
                 UpdateControlFile();
  
+               /* update our local copy of minRecoveryPoint */
+               minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+               /*
+                * Reset pgstat data, because it may be invalid after recovery.
+                */
+               pgstat_reset_all();
+
                 /*
                  * If there was a backup label file, it's done its job and the info
                  * has now been propagated into pg_control.  We must get rid of the
@@ -5151,12 +5302,41 @@ StartupXLOG(void)
                 {
                         bool            recoveryContinue = true;
                         bool            recoveryApply = true;
+                       bool            reachedMinRecoveryPoint = false;
                         ErrorContextCallback errcontext;
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile XLogCtlData *xlogctl = XLogCtl;
+
+                       /* Update shared replayEndRecPtr */
+                       SpinLockAcquire(&xlogctl->info_lck);
+                       xlogctl->replayEndRecPtr = ReadRecPtr;
+                       SpinLockRelease(&xlogctl->info_lck);
  
                         InRedo = true;
-                       ereport(LOG,
-                                       (errmsg("redo starts at %X/%X",
-                                                       ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+
+                       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X",
+                                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+                       else
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
+                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+
+                       /*
+                        * Let postmaster know we've started redo now, so that it can
+                        * launch bgwriter to perform restartpoints.  We don't bother
+                        * during crash recovery as restartpoints can only be performed
+                        * during archive recovery.  And we'd like to keep crash recovery
+                        * simple, to avoid introducing bugs that could prevent you
+                        * from recovering after a crash.
+                        *
+                        * After this point, we can no longer assume that we're the only
+                        * process in addition to postmaster!
+                        */
+                       if (InArchiveRecovery && IsUnderPostmaster)
+                               SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
  
                         /*
                          * main redo apply loop
@@ -5182,6 +5362,30 @@ StartupXLOG(void)
                                 }
  #endif
  
+                               /*
+                                * Check if we were requested to exit without finishing
+                                * recovery.
+                                */
+                               if (shutdown_requested)
+                                       proc_exit(0);
+
+                               /*
+                                * Have we reached our safe starting point? If so, we can
+                                * tell postmaster that the database is consistent now.
+                                */
+                               if (!reachedMinRecoveryPoint && 
+                                        XLByteLE(minRecoveryPoint, EndRecPtr))
+                               {
+                                       reachedMinRecoveryPoint = true;
+                                       if (InArchiveRecovery)
+                                       {
+                                               ereport(LOG,
+                                                               (errmsg("consistent recovery state reached")));
+                                               if (IsUnderPostmaster)
+                                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+                                       }
+                               }
+
                                 /*
                                  * Have we reached our recovery target?
                                  */
@@ -5207,6 +5411,15 @@ StartupXLOG(void)
                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
                                 }
  
+                               /*
+                                * Update shared replayEndRecPtr before replaying this
+                                * record, so that XLogFlush will update minRecoveryPoint
+                                * correctly.
+                                */
+                               SpinLockAcquire(&xlogctl->info_lck);
+                               xlogctl->replayEndRecPtr = EndRecPtr;
+                               SpinLockRelease(&xlogctl->info_lck);
+
                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
  
                                 /* Pop the error context stack */
@@ -5250,14 +5463,14 @@ StartupXLOG(void)
          * Complain if we did not roll forward far enough to render the backup
          * dump consistent.
          */
-       if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+       if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
         {
                 if (reachedStopPoint)   /* stopped because of stop request */
                         ereport(FATAL,
-                                       (errmsg("requested recovery stop point is before end time of backup dump")));
+                                       (errmsg("requested recovery stop point is before consistent recovery point")));
                 else    /* ran off end of WAL */
                         ereport(FATAL,
-                                       (errmsg("WAL ends before end time of backup dump")));
+                                       (errmsg("WAL ends before consistent recovery point")));
         }
  
         /*
@@ -5352,6 +5565,12 @@ StartupXLOG(void)
         /* Pre-scan prepared transactions to find out the range of XIDs present */
         oldestActiveXID = PrescanPreparedTransactions();
  
+       /*
+        * Allow writing WAL for us, so that we can create a checkpoint record.
+        * But not yet for other backends!
+        */
+       LocalRecoveryInProgress = false;
+
         if (InRecovery)
         {
                 int                     rmid;
@@ -5371,11 +5590,6 @@ StartupXLOG(void)
                  */
                 XLogCheckInvalidPages();
  
-               /*
-                * Reset pgstat data, because it may be invalid after recovery.
-                */
-               pgstat_reset_all();
-
                 /*
                  * Perform a checkpoint to update all our recovery activity to disk.
                  *
@@ -5398,12 +5612,14 @@ StartupXLOG(void)
          */
         InRecovery = false;
  
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
         ControlFile->state = DB_IN_PRODUCTION;
         ControlFile->time = (pg_time_t) time(NULL);
         UpdateControlFile();
+       LWLockRelease(ControlFileLock);
  
         /* start the archive_timeout timer running */
-       XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
+       XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
  
         /* initialize shared-memory copy of latest checkpoint XID/epoch */
         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
@@ -5438,6 +5654,45 @@ StartupXLOG(void)
                 readRecordBuf = NULL;
                 readRecordBufSize = 0;
         }
+
+       /*
+        * All done. Allow others to write WAL.
+        */
+       XLogCtl->SharedRecoveryInProgress = false;
+}
+
+/*
+ * Is the system still in recovery?
+ *
+ * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+ * variables the first time we see that recovery is finished.
+ */
+bool
+RecoveryInProgress(void)
+{
+       /*
+        * We check shared state each time only until we leave recovery mode.
+        * We can't re-enter recovery, so we rely on the local state variable
+        * after that.
+        */
+       if (!LocalRecoveryInProgress)
+               return false;
+       else
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+
+               /*
+                * Initialize TimeLineID and RedoRecPtr the first time we see that
+                * recovery is finished.
+                */
+               if (!LocalRecoveryInProgress)
+                       InitXLOGAccess();
+
+               return LocalRecoveryInProgress;
+       }
  }
  
  /*
@@ -5569,6 +5824,8 @@ InitXLOGAccess(void)
  {
         /* ThisTimeLineID doesn't change so we need no lock to copy it */
         ThisTimeLineID = XLogCtl->ThisTimeLineID;
+       Assert(ThisTimeLineID != 0);
+
         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
         (void) GetRedoRecPtr();
  }
@@ -5680,7 +5937,10 @@ ShutdownXLOG(int code, Datum arg)
         ereport(LOG,
                         (errmsg("shutting down")));
  
-       CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       if (RecoveryInProgress())
+               CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       else
+               CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
         ShutdownCLOG();
         ShutdownSUBTRANS();
         ShutdownMultiXact();
@@ -5693,9 +5953,20 @@ ShutdownXLOG(int code, Datum arg)
   * Log start of a checkpoint.
   */
  static void
-LogCheckpointStart(int flags)
+LogCheckpointStart(int flags, bool restartpoint)
  {
-       elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+       char *msg;
+
+       /*
+        * XXX: This is hopelessly untranslatable. We could call gettext_noop
+        * for the main message, but what about all the flags?
+        */
+       if (restartpoint)
+               msg = "restartpoint starting:%s%s%s%s%s%s";
+       else
+               msg = "checkpoint starting:%s%s%s%s%s%s";
+
+       elog(LOG, msg,
                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
                  (flags & CHECKPOINT_FORCE) ? " force" : "",
@@ -5708,7 +5979,7 @@ LogCheckpointStart(int flags)
   * Log end of a checkpoint.
   */
  static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(bool restartpoint)
  {
         long            write_secs,
                                 sync_secs,
@@ -5731,17 +6002,26 @@ LogCheckpointEnd(void)
                                                 CheckpointStats.ckpt_sync_end_t,
                                                 &sync_secs, &sync_usecs);
  
-       elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-                "%d transaction log file(s) added, %d removed, %d recycled; "
-                "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-                CheckpointStats.ckpt_bufs_written,
-                (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-                CheckpointStats.ckpt_segs_added,
-                CheckpointStats.ckpt_segs_removed,
-                CheckpointStats.ckpt_segs_recycled,
-                write_secs, write_usecs / 1000,
-                sync_secs, sync_usecs / 1000,
-                total_secs, total_usecs / 1000);
+       if (restartpoint)
+               elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
+       else
+               elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+                        "%d transaction log file(s) added, %d removed, %d recycled; "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        CheckpointStats.ckpt_segs_added,
+                        CheckpointStats.ckpt_segs_removed,
+                        CheckpointStats.ckpt_segs_recycled,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
  }
  
  /*
@@ -5772,13 +6052,33 @@ CreateCheckPoint(int flags)
         TransactionId *inCommitXids;
         int                     nInCommit;
  
+       /* shouldn't happen */
+       if (RecoveryInProgress())
+               elog(ERROR, "can't create a checkpoint during recovery");
+
         /*
          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-        * (This is just pro forma, since in the present system structure there is
-        * only one process that is allowed to issue checkpoints at any given
-        * time.)
+        * During normal operation, bgwriter is the only process that creates
+        * checkpoints, but at the end of archive recovery, the bgwriter can be
+        * busy creating a restartpoint while the startup process tries to perform
+        * the startup checkpoint.
          */
-       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+       {
+               Assert(InRecovery);
+
+               /*
+                * A restartpoint is in progress. Wait until it finishes. This can
+                * cause an extra restartpoint to be performed, but that's OK because
+                * we're just about to perform a checkpoint anyway. Flushing the
+                * buffers in this restartpoint can take some time, but that time is
+                * saved from the upcoming checkpoint so the net effect is zero.
+                */
+               ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
+               RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+
+               LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       }
  
         /*
          * Prepare to accumulate statistics.
@@ -5797,9 +6097,11 @@ CreateCheckPoint(int flags)
  
         if (shutdown)
         {
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                 ControlFile->state = DB_SHUTDOWNING;
                 ControlFile->time = (pg_time_t) time(NULL);
                 UpdateControlFile();
+               LWLockRelease(ControlFileLock);
         }
  
         /*
@@ -5903,7 +6205,7 @@ CreateCheckPoint(int flags)
          * to log anything if we decided to skip the checkpoint.
          */
         if (log_checkpoints)
-               LogCheckpointStart(flags);
+               LogCheckpointStart(flags, false);
  
         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
  
@@ -6070,7 +6372,7 @@ CreateCheckPoint(int flags)
  
         /* All real work is done, but log before releasing lock. */
         if (log_checkpoints)
-               LogCheckpointEnd();
+               LogCheckpointEnd(false);
  
          TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                  NBuffers, CheckpointStats.ckpt_segs_added,
@@ -6098,32 +6400,17 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
  }
  
  /*
- * Set a recovery restart point if appropriate
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.  This function is called each time
- * a checkpoint record is read from XLOG; it must determine whether a
- * restartpoint is needed or not.
+ * This is used during WAL recovery to establish a point from which recovery
+ * can roll forward without replaying the entire recovery log.  This function
+ * is called each time a checkpoint record is read from XLOG. The record is
+ * stored in shared memory, so that it can be used as a restartpoint later on.
   */
  static void
  RecoveryRestartPoint(const CheckPoint *checkPoint)
  {
-       int                     elapsed_secs;
         int                     rmid;
-
-       /*
-        * Do nothing if the elapsed time since the last restartpoint is less than
-        * half of checkpoint_timeout.  (We use a value less than
-        * checkpoint_timeout so that variations in the timing of checkpoints on
-        * the master, or speed of transmission of WAL segments to a slave, won't
-        * make the slave skip a restartpoint once it's synced with the master.)
-        * Checking true elapsed time keeps us from doing restartpoints too often
-        * while rapidly scanning large amounts of WAL.
-        */
-       elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
-       if (elapsed_secs < CheckPointTimeout / 2)
-               return;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
  
         /*
          * Is it safe to checkpoint?  We must ask each of the resource managers
@@ -6145,28 +6432,128 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
         }
  
         /*
-        * OK, force data out to disk
+        * Copy the checkpoint record to shared memory, so that bgwriter can
+        * use it the next time it wants to perform a restartpoint.
+        */
+       SpinLockAcquire(&xlogctl->info_lck);
+       XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
+       memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
+ *
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a checkpoint record since the last
+ * restartpoint.
+ */
+bool
+CreateRestartPoint(int flags)
+{
+       XLogRecPtr lastCheckPointRecPtr;
+       CheckPoint lastCheckPoint;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       /*
+        * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
+        * happens at a time.
+        */
+       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+       /* Get a local copy of the last checkpoint record. */
+       SpinLockAcquire(&xlogctl->info_lck);
+       lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
+       memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+
+       /* 
+        * Check that we're still in recovery mode. It's ok if we exit recovery
+        * mode after this check, the restart point is valid anyway.
+        */
+       if (!RecoveryInProgress())
+       {
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, recovery has already ended")));
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       /*
+        * If the last checkpoint record we've replayed is already our last
+        * restartpoint, we can't perform a new restart point. We still update
+        * minRecoveryPoint in that case, so that if this is a shutdown restart
+        * point, we won't start up earlier than before. That's not strictly
+        * necessary, but when we get hot standby capability, it would be rather
+        * weird if the database opened up for read-only connections at a
+        * point-in-time before the last shutdown. Such time travel is still
+        * possible in case of immediate shutdown, though.
+        *
+        * We don't explicitly advance minRecoveryPoint when we do create a
+        * restartpoint. It's assumed that flushing the buffers will do that
+        * as a side-effect.
          */
-       CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+       if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+               XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
+       {
+               XLogRecPtr InvalidXLogRecPtr = {0, 0};
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, already performed at %X/%X",
+                                               lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
+               UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       if (log_checkpoints)
+       {
+               /*
+                * Prepare to accumulate statistics.
+                */
+               MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+               CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+               LogCheckpointStart(flags, true);
+       }
+
+       CheckPointGuts(lastCheckPoint.redo, flags);
  
         /*
-        * Update pg_control so that any subsequent crash will restart from this
-        * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
-        * record itself.
+        * Update pg_control, using current time
          */
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
         ControlFile->prevCheckPoint = ControlFile->checkPoint;
-       ControlFile->checkPoint = ReadRecPtr;
-       ControlFile->checkPointCopy = *checkPoint;
+       ControlFile->checkPoint = lastCheckPointRecPtr;
+       ControlFile->checkPointCopy = lastCheckPoint;
         ControlFile->time = (pg_time_t) time(NULL);
         UpdateControlFile();
+       LWLockRelease(ControlFileLock);
  
-       ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+       /*
+        * Currently, there is no need to truncate pg_subtrans during recovery.
+        * If we did do that, we would need to have called StartupSUBTRANS()
+        * already and then TruncateSUBTRANS() would go here.
+        */
+
+       /* All real work is done, but log before releasing lock. */
+       if (log_checkpoints)
+               LogCheckpointEnd(true);
+
+       ereport((log_checkpoints ? LOG : DEBUG2),
                         (errmsg("recovery restart point at %X/%X",
-                                       checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+                                       lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
         if (recoveryLastXTime)
-               ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-                               (errmsg("last completed transaction was at log time %s",
-                                               timestamptz_to_str(recoveryLastXTime))));
+               ereport((log_checkpoints ? LOG : DEBUG2),
+                       (errmsg("last completed transaction was at log time %s",
+                                       timestamptz_to_str(recoveryLastXTime))));
+
+       LWLockRelease(CheckpointLock);
+       return true;
  }
  
  /*
@@ -6232,6 +6619,9 @@ RequestXLogSwitch(void)
  
  /*
   * XLOG resource manager's routines
+ *
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all record types are related to control file processing.
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
@@ -6278,9 +6668,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                                  (int) checkPoint.ThisTimeLineID))
                                 ereport(PANIC,
                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-                                                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
-                       /* Following WAL records should be run with new TLI */
-                       ThisTimeLineID = checkPoint.ThisTimeLineID;
+                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
+           /* Following WAL records should be run with new TLI */
+           ThisTimeLineID = checkPoint.ThisTimeLineID;
                 }
  
                 RecoveryRestartPoint(&checkPoint);
@@ -7221,3 +7611,92 @@ CancelBackup(void)
         }
  }
  
+/* ------------------------------------------------------
+ *  Startup Process main entry point and signal handlers
+ * ------------------------------------------------------
+ */
+
+/*
+ * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+startupproc_quickdie(SIGNAL_ARGS)
+{
+       PG_SETMASK(&BlockSig);
+
+       /*
+        * DO NOT proc_exit() -- we're here because shared memory may be
+        * corrupted, so we don't want to try to clean up our transaction. Just
+        * nail the windows shut and get out of town.
+        *
+        * Note we do exit(2) not exit(0).      This is to force the postmaster into a
+        * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+        * backend.  This is necessary precisely because we don't clean up our
+        * shared memory state.
+        */
+       exit(2);
+}
+
+
+/* SIGTERM: set flag to abort redo and exit */
+static void
+StartupProcShutdownHandler(SIGNAL_ARGS)
+{
+       if (in_restore_command)
+               proc_exit(0);
+       else
+               shutdown_requested = true;
+}
+
+/* Main entry point for startup process */
+void
+StartupProcessMain(void)
+{
+       /*
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.
+        */
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
+
+       /*
+        * Properly accept or ignore signals the postmaster might send us
+        */
+       pqsignal(SIGHUP, SIG_IGN);      /* ignore config file updates */
+       pqsignal(SIGINT, SIG_IGN);              /* ignore query cancel */
+       pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+       pqsignal(SIGQUIT, startupproc_quickdie);                /* hard crash time */
+       pqsignal(SIGALRM, SIG_IGN);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN);
+       pqsignal(SIGUSR2, SIG_IGN);
+
+       /*
+        * Reset some signals that are accepted by postmaster but not here
+        */
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
+
+       /*
+        * Unblock signals (they were blocked when the postmaster forked us)
+        */
+       PG_SETMASK(&UnBlockSig);
+
+       StartupXLOG();  
+
+       BuildFlatFiles(false);
+
+       /* Let postmaster know that startup is finished */
+       SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
+
+       /* exit normally */
+       proc_exit(0);
+}
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c

index 431a95fdf2e04a1f0e2b34f94663acaf37f49d66..13d5bcb43615d5f04551e2c77bb2d26f6c56718a 100644 (file)
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -37,7 +37,6 @@
  #include "storage/proc.h"
  #include "tcop/tcopprot.h"
  #include "utils/builtins.h"
-#include "utils/flatfiles.h"
  #include "utils/fmgroids.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
@@ -416,14 +415,12 @@ AuxiliaryProcessMain(int argc, char *argv[])
                         proc_exit(1);           /* should never return */
  
                 case StartupProcess:
-                       bootstrap_signals();
-                       StartupXLOG();
-                       BuildFlatFiles(false);
-                       proc_exit(0);           /* startup done */
+                       /* don't set signals, startup process has its own agenda */
+                       StartupProcessMain();
+                       proc_exit(1);           /* should never return */
  
                 case BgWriterProcess:
                         /* don't set signals, bgwriter has its own agenda */
-                       InitXLOGAccess();
                         BackgroundWriterMain();
                         proc_exit(1);           /* should never return */
  
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c

index 6a0cd4eebf3fcfd5d4fc89d761d0b12ef615c4ba..d916f3242f1851f8ecb161ba28b0206a6365fe92 100644 (file)
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -49,6 +49,7 @@
  #include <unistd.h>
  
  #include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
@@ -423,9 +424,19 @@ BackgroundWriterMain(void)
                  */
                 if (do_checkpoint)
                 {
+                       bool    ckpt_performed = false;
+                       bool    do_restartpoint;
+
                         /* use volatile pointer to prevent code rearrangement */
                         volatile BgWriterShmemStruct *bgs = BgWriterShmem;
  
+                       /*
+                        * Check if we should perform a checkpoint or a restartpoint.
+                        * As a side-effect, RecoveryInProgress() initializes
+                        * TimeLineID if it's not set yet.
+                        */
+                       do_restartpoint = RecoveryInProgress();
+
                         /*
                          * Atomically fetch the request flags to figure out what kind of a
                          * checkpoint we should perform, and increase the started-counter
@@ -444,7 +455,8 @@ BackgroundWriterMain(void)
                          * implementation will not generate warnings caused by
                          * CheckPointTimeout < CheckPointWarning.
                          */
-                       if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+                       if (!do_restartpoint &&
+                               (flags & CHECKPOINT_CAUSE_XLOG) &&
                                 elapsed_secs < CheckPointWarning)
                                 ereport(LOG,
                                                 (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
@@ -455,14 +467,21 @@ BackgroundWriterMain(void)
                          * Initialize bgwriter-private variables used during checkpoint.
                          */
                         ckpt_active = true;
-                       ckpt_start_recptr = GetInsertRecPtr();
+                       if (!do_restartpoint)
+                               ckpt_start_recptr = GetInsertRecPtr();
                         ckpt_start_time = now;
                         ckpt_cached_elapsed = 0;
  
                         /*
                          * Do the checkpoint.
                          */
-                       CreateCheckPoint(flags);
+                       if (!do_restartpoint)
+                       {
+                               CreateCheckPoint(flags);
+                               ckpt_performed = true;
+                       }
+                       else
+                               ckpt_performed = CreateRestartPoint(flags);
  
                         /*
                          * After any checkpoint, close all smgr files.  This is so we
@@ -477,14 +496,27 @@ BackgroundWriterMain(void)
                         bgs->ckpt_done = bgs->ckpt_started;
                         SpinLockRelease(&bgs->ckpt_lck);
  
-                       ckpt_active = false;
+                       if (ckpt_performed)
+                       {
+                               /*
+                                * Note we record the checkpoint start time not end time as
+                                * last_checkpoint_time.  This is so that time-driven
+                                * checkpoints happen at a predictable spacing.
+                                */
+                               last_checkpoint_time = now;
+                       }
+                       else
+                       {
+                               /*
+                                * We were not able to perform the restartpoint (checkpoints
+                                * throw an ERROR in case of error).  Most likely because we
+                                * have not received any new checkpoint WAL records since the
+                                * last restartpoint. Try again in 15 s.
+                                */
+                               last_checkpoint_time = now - CheckPointTimeout + 15;
+                       }
  
-                       /*
-                        * Note we record the checkpoint start time not end time as
-                        * last_checkpoint_time.  This is so that time-driven checkpoints
-                        * happen at a predictable spacing.
-                        */
-                       last_checkpoint_time = now;
+                       ckpt_active = false;
                 }
                 else
                         BgBufferSync();
@@ -507,7 +539,7 @@ CheckArchiveTimeout(void)
         pg_time_t       now;
         pg_time_t       last_time;
  
-       if (XLogArchiveTimeout <= 0)
+       if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
                 return;
  
         now = (pg_time_t) time(NULL);
@@ -714,16 +746,19 @@ IsCheckpointOnSchedule(double progress)
          * However, it's good enough for our purposes, we're only calculating an
          * estimate anyway.
          */
-       recptr = GetInsertRecPtr();
-       elapsed_xlogs =
-               (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
-                ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
-               CheckPointSegments;
-
-       if (progress < elapsed_xlogs)
+       if (!RecoveryInProgress())
         {
-               ckpt_cached_elapsed = elapsed_xlogs;
-               return false;
+               recptr = GetInsertRecPtr();
+               elapsed_xlogs =
+                       (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+                        ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+                       CheckPointSegments;
+
+               if (progress < elapsed_xlogs)
+               {
+                       ckpt_cached_elapsed = elapsed_xlogs;
+                       return false;
+               }
         }
  
         /*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c

index 3380b806f654e6bbe21755af9290685cae758e7d..70d9ca246c4fee02b577782ffa57bf56ec232720 100644 (file)
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -225,11 +225,38 @@ static pid_t StartupPID = 0,
  static int     Shutdown = NoShutdown;
  
  static bool FatalError = false; /* T if recovering from backend crash */
+static bool RecoveryError = false; /* T if recovery failed */
+
+/* State of WAL redo */
+#define                        NoRecovery                      0
+#define                        RecoveryStarted         1
+#define                        RecoveryConsistent      2
+#define                        RecoveryCompleted       3
+
+static int     RecoveryStatus = NoRecovery;
  
  /*
   * We use a simple state machine to control startup, shutdown, and
   * crash recovery (which is rather like shutdown followed by startup).
   *
+ * After doing all the postmaster initialization work, we enter PM_STARTUP
+ * state and the startup process is launched. The startup process begins by
+ * reading the control file and other preliminary initialization steps. When
+ * it's ready to start WAL redo, it signals postmaster, and we switch to
+ * PM_RECOVERY phase. The background writer is launched, while the startup
+ * process continues applying WAL. 
+ * 
+ * After reaching a consistent point in WAL redo, startup process signals
+ * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+ * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
+ * could start accepting connections to perform read-only queries at this
+ * point, if we had the infrastructure to do that.
+ *
+ * When the WAL redo is finished, the startup process signals us a third
+ * time, and we switch to PM_RUN state. The startup process can also skip the
+ * recovery and consistent recovery phases altogether, as it will during
+ * normal startup when there's no recovery to be done, for example.
+ *
   * Normal child backends can only be launched when we are in PM_RUN state.
   * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
   * In other states we handle connection requests by launching "dead_end"
@@ -245,15 +272,19 @@ static bool FatalError = false; /* T if recovering from backend crash */
   *
   * Notice that this state variable does not distinguish *why* we entered
   * states later than PM_RUN --- Shutdown and FatalError must be consulted
- * to find that out.  FatalError is never true in PM_RUN state, nor in
- * PM_SHUTDOWN states (because we don't enter those states when trying to
- * recover from a crash).  It can be true in PM_STARTUP state, because we
- * don't clear it until we've successfully recovered.
+ * to find that out.  FatalError is never true in PM_RECOVERY_* or PM_RUN
+ * states, nor in PM_SHUTDOWN states (because we don't enter those states
+ * when trying to recover from a crash).  It can be true in PM_STARTUP state,
+ * because we don't clear it until we've successfully started WAL redo.
+ * Similarly, RecoveryError means that we have crashed during recovery, and
+ * should not try to restart.
   */
  typedef enum
  {
         PM_INIT,                                        /* postmaster starting */
         PM_STARTUP,                                     /* waiting for startup subprocess */
+       PM_RECOVERY,                            /* in recovery mode */
+       PM_RECOVERY_CONSISTENT,         /* consistent recovery mode */
         PM_RUN,                                         /* normal "database is alive" state */
         PM_WAIT_BACKUP,                         /* waiting for online backup mode to end */
         PM_WAIT_BACKENDS,                       /* waiting for live backends to exit */
@@ -307,6 +338,7 @@ static void pmdie(SIGNAL_ARGS);
  static void reaper(SIGNAL_ARGS);
  static void sigusr1_handler(SIGNAL_ARGS);
  static void dummy_handler(SIGNAL_ARGS);
+static void CheckRecoverySignals(void);
  static void CleanupBackend(int pid, int exitstatus);
  static void HandleChildCrash(int pid, int exitstatus, const char *procname);
  static void LogChildExit(int lev, const char *procname,
@@ -1302,7 +1334,9 @@ ServerLoop(void)
                  * state that prevents it, start one.  It doesn't matter if this
                  * fails, we'll just try again later.
                  */
-               if (BgWriterPID == 0 && pmState == PM_RUN)
+               if (BgWriterPID == 0 &&
+                       (pmState == PM_RUN || pmState == PM_RECOVERY || 
+                        pmState == PM_RECOVERY_CONSISTENT))
                         BgWriterPID = StartBackgroundWriter();
  
                 /*
@@ -1752,7 +1786,10 @@ canAcceptConnections(void)
                         return CAC_WAITBACKUP;  /* allow superusers only */
                 if (Shutdown > NoShutdown)
                         return CAC_SHUTDOWN;    /* shutdown is pending */
-               if (pmState == PM_STARTUP && !FatalError)
+               if (!FatalError &&
+                       (pmState == PM_STARTUP ||
+                        pmState == PM_RECOVERY ||
+                        pmState == PM_RECOVERY_CONSISTENT))
                         return CAC_STARTUP; /* normal startup */
                 return CAC_RECOVERY;    /* else must be crash recovery */
         }
@@ -1982,7 +2019,7 @@ pmdie(SIGNAL_ARGS)
                         ereport(LOG,
                                         (errmsg("received smart shutdown request")));
  
-                       if (pmState == PM_RUN)
+                       if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
                         {
                                 /* autovacuum workers are told to shut down immediately */
                                 SignalAutovacWorkers(SIGTERM);
@@ -2019,7 +2056,14 @@ pmdie(SIGNAL_ARGS)
  
                         if (StartupPID != 0)
                                 signal_child(StartupPID, SIGTERM);
-                       if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
+                       if (pmState == PM_RECOVERY)
+                       {
+                               /* only bgwriter is active in this state */
+                               pmState = PM_WAIT_BACKENDS;
+                       }
+                       if (pmState == PM_RUN ||
+                               pmState == PM_WAIT_BACKUP ||
+                               pmState == PM_RECOVERY_CONSISTENT)
                         {
                                 ereport(LOG,
                                                 (errmsg("aborting any active transactions")));
@@ -2116,10 +2160,22 @@ reaper(SIGNAL_ARGS)
                 if (pid == StartupPID)
                 {
                         StartupPID = 0;
-                       Assert(pmState == PM_STARTUP);
  
-                       /* FATAL exit of startup is treated as catastrophic */
-                       if (!EXIT_STATUS_0(exitstatus))
+                       /*
+                        * Check if we've received a signal from the startup process
+                        * first. This can change pmState. If the startup process sends
+                        * a signal and exits immediately after that, we might not have
+                        * processed the signal yet. We need to know if it completed
+                        * recovery before it exited.
+                        */
+                       CheckRecoverySignals();
+
+                       /*
+                        * Unexpected exit of startup process (including FATAL exit)
+                        * during PM_STARTUP is treated as catastrophic. There are no
+                        * other processes running yet.
+                        */
+                       if (pmState == PM_STARTUP)
                         {
                                 LogChildExit(LOG, _("startup process"),
                                                          pid, exitstatus);
@@ -2127,60 +2183,30 @@ reaper(SIGNAL_ARGS)
                                 (errmsg("aborting startup due to startup process failure")));
                                 ExitPostmaster(1);
                         }
-
                         /*
-                        * Startup succeeded - we are done with system startup or
-                        * recovery.
+                        * Any unexpected exit (including FATAL exit) of the startup
+                        * process is treated as a crash, except that we don't want
+                        * to reinitialize.
                          */
-                       FatalError = false;
-
-                       /*
-                        * Go to shutdown mode if a shutdown request was pending.
-                        */
-                       if (Shutdown > NoShutdown)
+                       if (!EXIT_STATUS_0(exitstatus))
                         {
-                               pmState = PM_WAIT_BACKENDS;
-                               /* PostmasterStateMachine logic does the rest */
+                               RecoveryError = true;
+                               HandleChildCrash(pid, exitstatus,
+                                                                _("startup process"));
                                 continue;
                         }
-
                         /*
-                        * Otherwise, commence normal operations.
-                        */
-                       pmState = PM_RUN;
-
-                       /*
-                        * Load the flat authorization file into postmaster's cache. The
-                        * startup process has recomputed this from the database contents,
-                        * so we wait till it finishes before loading it.
-                        */
-                       load_role();
-
-                       /*
-                        * Crank up the background writer.      It doesn't matter if this
-                        * fails, we'll just try again later.
+                        * Startup process exited normally, but didn't finish recovery.
+                        * This can happen if someone other than postmaster kills the
+                        * startup process with SIGTERM. Treat it like a crash.
                          */
-                       Assert(BgWriterPID == 0);
-                       BgWriterPID = StartBackgroundWriter();
-
-                       /*
-                        * Likewise, start other special children as needed.  In a restart
-                        * situation, some of them may be alive already.
-                        */
-                       if (WalWriterPID == 0)
-                               WalWriterPID = StartWalWriter();
-                       if (AutoVacuumingActive() && AutoVacPID == 0)
-                               AutoVacPID = StartAutoVacLauncher();
-                       if (XLogArchivingActive() && PgArchPID == 0)
-                               PgArchPID = pgarch_start();
-                       if (PgStatPID == 0)
-                               PgStatPID = pgstat_start();
-
-                       /* at this point we are really open for business */
-                       ereport(LOG,
-                                (errmsg("database system is ready to accept connections")));
-
-                       continue;
+                       if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
+                       {
+                               RecoveryError = true;
+                               HandleChildCrash(pid, exitstatus,
+                                                                _("startup process"));
+                               continue;
+                       }
                 }
  
                 /*
@@ -2443,6 +2469,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
                 }
         }
  
+       /*
+        * Take care of the startup process too.  If it is the child that just
+        * died, simply forget its PID.  Otherwise, unless we are already in
+        * FatalError processing, tell it to stop (SIGSTOP when SendStop is set,
+        * else SIGQUIT).
+        */
+       if (pid == StartupPID)
+               StartupPID = 0;
+       else if (StartupPID != 0 && !FatalError)
+       {
+               ereport(DEBUG2,
+                               (errmsg_internal("sending %s to process %d",
+                                                                (SendStop ? "SIGSTOP" : "SIGQUIT"),
+                                                                (int) StartupPID)));
+               /* Signal the startup process itself, i.e. the PID logged above */
+               signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+       }
+
         /* Take care of the bgwriter too */
         if (pid == BgWriterPID)
                 BgWriterPID = 0;
@@ -2514,7 +2552,9 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
  
         FatalError = true;
         /* We now transit into a state of waiting for children to die */
-       if (pmState == PM_RUN ||
+       if (pmState == PM_RECOVERY ||
+               pmState == PM_RECOVERY_CONSISTENT ||
+               pmState == PM_RUN ||
                 pmState == PM_WAIT_BACKUP ||
                 pmState == PM_SHUTDOWN)
                 pmState = PM_WAIT_BACKENDS;
@@ -2582,6 +2622,127 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
  static void
  PostmasterStateMachine(void)
  {
+       /* Startup states */
+
+       if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery)
+       {
+               /* WAL redo has started. We're out of reinitialization. */
+               FatalError = false;
+
+               /*
+                * Go to shutdown mode if a shutdown request was pending.
+                */
+               if (Shutdown > NoShutdown)
+               {
+                       pmState = PM_WAIT_BACKENDS;
+                       /* PostmasterStateMachine logic does the rest */
+               }
+               else
+               {
+                       /*
+                        * Crank up the background writer.      It doesn't matter if this
+                        * fails, we'll just try again later.
+                        */
+                       Assert(BgWriterPID == 0);
+                       BgWriterPID = StartBackgroundWriter();
+
+                       pmState = PM_RECOVERY;
+               }
+       }
+       if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent)
+       {
+               /*
+                * Go to shutdown mode if a shutdown request was pending.
+                */
+               if (Shutdown > NoShutdown)
+               {
+                       pmState = PM_WAIT_BACKENDS;
+                       /* PostmasterStateMachine logic does the rest */
+               }
+               else
+               {
+                       /*
+                        * Recovery has reached a consistent state. (FatalError was
+                        * already reset when WAL redo started.)
+                        */
+                       pmState = PM_RECOVERY_CONSISTENT;
+
+                       /*
+                        * Load the flat authorization file into postmaster's cache. The
+                        * startup process won't have recomputed this from the database yet,
+                        * so it may change following recovery.
+                        */
+                       load_role();
+
+                       /*
+                        * Likewise, start other special children as needed.
+                        */
+                       Assert(PgStatPID == 0);
+                       PgStatPID = pgstat_start();
+
+                       /* XXX at this point we could accept read-only connections */
+                       ereport(DEBUG1,
+                                (errmsg("database system is in consistent recovery mode")));
+               }
+       }
+       if ((pmState == PM_RECOVERY || 
+                pmState == PM_RECOVERY_CONSISTENT ||
+                pmState == PM_STARTUP) &&
+               RecoveryStatus == RecoveryCompleted)
+       {
+               /*
+                * Startup succeeded.
+                *
+                * Go to shutdown mode if a shutdown request was pending.
+                */
+               if (Shutdown > NoShutdown)
+               {
+                       pmState = PM_WAIT_BACKENDS;
+                       /* PostmasterStateMachine logic does the rest */
+               }
+               else
+               {
+                       /*
+                        * Otherwise, commence normal operations.
+                        */
+                       pmState = PM_RUN;
+
+                       /*
+                        * Load the flat authorization file into postmaster's cache. The
+                        * startup process has recomputed this from the database contents,
+                        * so we wait till it finishes before loading it.
+                        */
+                       load_role();
+
+                       /*
+                        * Crank up the background writer, if we didn't do that already
+                        * when WAL redo started.  It doesn't matter if this fails,
+                        * we'll just try again later.
+                        */
+                       if (BgWriterPID == 0)
+                               BgWriterPID = StartBackgroundWriter();
+
+                       /*
+                        * Likewise, start other special children as needed.  In a restart
+                        * situation, some of them may be alive already.
+                        */
+                       if (WalWriterPID == 0)
+                               WalWriterPID = StartWalWriter();
+                       if (AutoVacuumingActive() && AutoVacPID == 0)
+                               AutoVacPID = StartAutoVacLauncher();
+                       if (XLogArchivingActive() && PgArchPID == 0)
+                               PgArchPID = pgarch_start();
+                       if (PgStatPID == 0)
+                               PgStatPID = pgstat_start();
+
+                       /* at this point we are really open for business */
+                       ereport(LOG,
+                               (errmsg("database system is ready to accept connections")));
+               }
+       }
+
+       /* Shutdown states */
+
         if (pmState == PM_WAIT_BACKUP)
         {
                 /*
@@ -2722,6 +2883,15 @@ PostmasterStateMachine(void)
                 }
         }
  
+       /*
+        * If recovery failed, wait for all non-syslogger children to exit,
+        * and then exit postmaster. We don't try to reinitialize when recovery
+        * fails, because more than likely it will just fail again and we will
+        * keep trying forever.
+        */
+       if (RecoveryError && pmState == PM_NO_CHILDREN)
+               ExitPostmaster(1);              
+
         /*
          * If we need to recover from a crash, wait for all non-syslogger
          * children to exit, then reset shmem and StartupDataBase.
@@ -2734,6 +2904,8 @@ PostmasterStateMachine(void)
                 shmem_exit(1);
                 reset_shared(PostPortNumber);
  
+               RecoveryStatus = NoRecovery;
+
                 StartupPID = StartupDataBase();
                 Assert(StartupPID != 0);
                 pmState = PM_STARTUP;
@@ -3837,6 +4009,37 @@ ExitPostmaster(int status)
         proc_exit(status);
  }
  
+/*
+ * CheckRecoverySignals
+ *
+ * Shared by sigusr1_handler() and reaper(): poll the recovery-related
+ * postmaster signals sent by the startup process and record the newest
+ * state in RecoveryStatus.  The checks run in order STARTED, CONSISTENT,
+ * COMPLETED, so when several are pending the last one tested wins.  If
+ * any of them was pending, run the postmaster state machine.
+ */
+static void
+CheckRecoverySignals(void)
+{
+       bool statusUpdated = false;
+
+       /* Startup process reports that recovery has started */
+       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED))
+       {
+               Assert(pmState == PM_STARTUP);
+
+               statusUpdated = true;
+               RecoveryStatus = RecoveryStarted;
+       }
+
+       /* Startup process reports that recovery reached a consistent state */
+       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT))
+       {
+               statusUpdated = true;
+               RecoveryStatus = RecoveryConsistent;
+       }
+
+       /* Startup process reports that recovery has completed */
+       if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+       {
+               statusUpdated = true;
+               RecoveryStatus = RecoveryCompleted;
+       }
+
+       /* Nothing pending: leave the state machine alone */
+       if (!statusUpdated)
+               return;
+
+       PostmasterStateMachine();
+}
+
+
  /*
   * sigusr1_handler - handle signal conditions from child processes
   */
@@ -3847,6 +4050,8 @@ sigusr1_handler(SIGNAL_ARGS)
  
         PG_SETMASK(&BlockSig);
  
+       CheckRecoverySignals();
+
         if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
         {
                 /*
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README

index 62b22bd1db8c04338bc8873418af7fc6a7605334..bfacfea0f1d2d997a4c1032ed2e1f12028173e07 100644 (file)
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -268,3 +268,8 @@ out (and anyone else who flushes buffer contents to disk must do so too).
  This ensures that the page image transferred to disk is reasonably consistent.
  We might miss a hint-bit update or two but that isn't a problem, for the same
  reasons mentioned under buffer access rules.
+
+As of 8.4, the background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs the
+same service as in normal processing, except that the checkpoints it
+writes are technically restartpoints.
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c

index cf98323d2a25732ceca3e187b7e1b24211d054dc..b35939556f17f32966f13628d636ee632ac10a6b 100644 (file)
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -324,7 +324,7 @@ InitCommunication(void)
   * If you're wondering why this is separate from InitPostgres at all:
   * the critical distinction is that this stuff has to happen before we can
   * run XLOG-related initialization, which is done before InitPostgres --- in
- * fact, for cases such as checkpoint creation processes, InitPostgres may
+ * fact, for cases such as the background writer process, InitPostgres may
   * never be done at all.
   */
  void
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h

index 6913f7c800267f2caeb5438c083042176ac8b81b..f8720bbc2cdedfc090c13ed1644d85acdbed01cc 100644 (file)
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -199,6 +199,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
  
+extern bool RecoveryInProgress(void);
+
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
@@ -207,9 +209,12 @@ extern void StartupXLOG(void);
  extern void ShutdownXLOG(int code, Datum arg);
  extern void InitXLOGAccess(void);
  extern void CreateCheckPoint(int flags);
+extern bool CreateRestartPoint(int flags);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
  extern XLogRecPtr GetInsertRecPtr(void);
  extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
  
+extern void StartupProcessMain(void);
+
  #endif   /* XLOG_H */
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h

index 3101092cbd4d3c511708b96c228d0654d39c0a17..21b1e90f5952a7fc359ce1dc3aa570bcc7c94430 100644 (file)
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -22,6 +22,9 @@
   */
  typedef enum
  {
+       PMSIGNAL_RECOVERY_STARTED,      /* recovery has started */
+       PMSIGNAL_RECOVERY_CONSISTENT, /* recovery has reached consistent state */
+       PMSIGNAL_RECOVERY_COMPLETED, /* recovery has completed */
         PMSIGNAL_PASSWORD_CHANGE,       /* pg_auth file has changed */
         PMSIGNAL_WAKEN_ARCHIVER,        /* send a NOTIFY signal to xlog archiver */
         PMSIGNAL_ROTATE_LOGFILE,        /* send SIGUSR1 to syslogger to rotate logfile */
author	Heikki Linnakangas <[email protected]>
	Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)
committer	Heikki Linnakangas <[email protected]>
	Wed, 18 Feb 2009 15:58:41 +0000 (15:58 +0000)
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/bootstrap/bootstrap.c		patch \| blob \| blame \| history
src/backend/postmaster/bgwriter.c		patch \| blob \| blame \| history
src/backend/postmaster/postmaster.c		patch \| blob \| blame \| history
src/backend/storage/buffer/README		patch \| blob \| blame \| history
src/backend/utils/init/postinit.c		patch \| blob \| blame \| history
src/include/access/xlog.h		patch \| blob \| blame \| history
src/include/storage/pmsignal.h		patch \| blob \| blame \| history