Drop slot's LWLock before returning from SaveSlotToPath()
author Peter Eisentraut <[email protected]>
Thu, 26 Mar 2020 10:51:39 +0000 (11:51 +0100)
committer Peter Eisentraut <[email protected]>
Thu, 26 Mar 2020 12:29:20 +0000 (13:29 +0100)
When SaveSlotToPath() is called with elevel=LOG, the early exits didn't
release the slot's io_in_progress_lock.

This could result in a walsender being stuck on the lock forever.  A
possible way to get into this situation is if the offending code paths
are triggered in a low disk space situation.

Author: Pavan Deolasee <[email protected]>
Reported-by: Craig Ringer <[email protected]>
Discussion: https://www.postgresql.org/message-id/flat/56a138c5-de61-f553-7e8f-6789296de785%402ndquadrant.com

src/backend/replication/slot.c

index 1cec53d748426ac00a3a184f53a8623c2cae7e8d..d90c7235e986f52341744b04202cb55a7e4c1f82 100644 (file)
@@ -1256,6 +1256,12 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
        fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
        if (fd < 0)
        {
+               /*
+                * If not an ERROR, then release the lock before returning.  In case
+                * of an ERROR, the error recovery path automatically releases the
+                * lock, but no harm in explicitly releasing even in that case.
+                */
+               LWLockRelease(&slot->io_in_progress_lock);
                ereport(elevel,
                                (errcode_for_file_access(),
                                 errmsg("could not create file \"%s\": %m",
@@ -1287,6 +1293,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 
                pgstat_report_wait_end();
                CloseTransientFile(fd);
+               LWLockRelease(&slot->io_in_progress_lock);
 
                /* if write didn't set errno, assume problem is no disk space */
                errno = save_errno ? save_errno : ENOSPC;
@@ -1306,6 +1313,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 
                pgstat_report_wait_end();
                CloseTransientFile(fd);
+               LWLockRelease(&slot->io_in_progress_lock);
                errno = save_errno;
                ereport(elevel,
                                (errcode_for_file_access(),
@@ -1317,6 +1325,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 
        if (CloseTransientFile(fd) != 0)
        {
+               LWLockRelease(&slot->io_in_progress_lock);
                ereport(elevel,
                                (errcode_for_file_access(),
                                 errmsg("could not close file \"%s\": %m",
@@ -1327,6 +1336,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
        /* rename to permanent file, fsync file and directory */
        if (rename(tmppath, path) != 0)
        {
+               LWLockRelease(&slot->io_in_progress_lock);
                ereport(elevel,
                                (errcode_for_file_access(),
                                 errmsg("could not rename file \"%s\" to \"%s\": %m",