Detect POLLHUP/POLLRDHUP while running queries.
author: Thomas Munro <[email protected]>
Fri, 2 Apr 2021 19:52:30 +0000 (08:52 +1300)
committer: Thomas Munro <[email protected]>
Fri, 2 Apr 2021 20:02:41 +0000 (09:02 +1300)
Provide a new GUC client_connection_check_interval that can be used to
check whether the client connection has gone away, while running very
long queries.  It is disabled by default.

For now this uses a non-standard Linux extension (also adopted by at
least one other OS).  POLLRDHUP is not defined by POSIX, and other OSes
don't have a reliable way to know if a connection was closed without
actually trying to read or write.

In future we might consider trying to send a no-op/heartbeat message
instead, but that could require protocol changes.

Author: Sergey Cherkashin <[email protected]>
Author: Thomas Munro <[email protected]>
Reviewed-by: Thomas Munro <[email protected]>
Reviewed-by: Tatsuo Ishii <[email protected]>
Reviewed-by: Konstantin Knizhnik <[email protected]>
Reviewed-by: Zhihong Yu <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Reviewed-by: Maksim Milyutin <[email protected]>
Reviewed-by: Tsunakawa, Takayuki/綱川 貴之 <[email protected]>
Reviewed-by: Tom Lane <[email protected]> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru

doc/src/sgml/config.sgml
src/backend/libpq/pqcomm.c
src/backend/tcop/postgres.c
src/backend/utils/init/globals.c
src/backend/utils/init/postinit.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/libpq/libpq.h
src/include/miscadmin.h
src/include/tcop/tcopprot.h
src/include/utils/timeout.h

index 9d87b5097afd5334d281e6140546762d24c24f1e..0c9128a55d0de284e7170db648bba022b157a825 100644 (file)
@@ -998,6 +998,43 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+      <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Sets the time interval between optional checks that the client is still
+        connected, while running queries.  The check is performed by polling
+        the socket, and allows long running queries to be aborted sooner if
+        the kernel reports that the connection is closed.
+       </para>
+       <para>
+        This option is currently available only on systems that support the
+        non-standard <symbol>POLLRDHUP</symbol> extension to the
+        <symbol>poll</symbol> system call, including Linux.
+       </para>
+       <para>
+        If the value is specified without units, it is taken as milliseconds.
+        The default value is <literal>0</literal>, which disables connection
+        checks.  Without connection checks, the server will detect the loss of
+        the connection only at the next interaction with the socket, when it
+        waits for, receives or sends data.
+       </para>
+       <para>
+        For the kernel itself to detect lost TCP connections reliably and within
+        a known timeframe in all scenarios including network failure, it may
+        also be necessary to adjust the TCP keepalive settings of the operating
+        system, or the <xref linkend="guc-tcp-keepalives-idle"/>,
+        <xref linkend="guc-tcp-keepalives-interval"/> and
+        <xref linkend="guc-tcp-keepalives-count"/> settings of
+        <productname>PostgreSQL</productname>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
      </sect2>
 
index 4c7b1e7bfdf893e6fadafabab68f09aa78e4969e..4cd6d6dfbb98530442e31bd790b34a9372db84b9 100644 (file)
@@ -54,6 +54,9 @@
  */
 #include "postgres.h"
 
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
 #include <signal.h>
 #include <fcntl.h>
 #include <grp.h>
@@ -1921,3 +1924,40 @@ pq_settcpusertimeout(int timeout, Port *port)
 
    return STATUS_OK;
 }
+
+/*
+ * Check if the client is still connected (always true without POLLRDHUP).
+ */
+bool
+pq_check_connection(void)
+{
+#if defined(POLLRDHUP)
+   /*
+    * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
+    * the other end.  We don't have a portable way to do that without
+    * actually trying to read or write data on other systems.  We don't want
+    * to read because that would be confused by pipelined queries and COPY
+    * data. Perhaps in future we'll try to write a heartbeat message instead.
+    */
+   struct pollfd pollfd;
+   int         rc;
+
+   pollfd.fd = MyProcPort->sock;
+   pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
+   pollfd.revents = 0;
+
+   while ((rc = poll(&pollfd, 1, 0)) < 0 && errno == EINTR)
+       ;                       /* a signal interrupted poll(): retry */
+   if (rc < 0)
+   {
+       ereport(COMMERROR,
+               (errcode_for_socket_access(),
+                errmsg("could not poll socket: %m")));
+       return false;           /* a real poll failure counts as lost */
+   }
+   else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
+       return false;           /* kernel says the other end closed */
+#endif
+
+   return true;
+}
index 2b1b68109fd67840c49107d69eee37e324074e72..ad351e2fd1e613095fc8441c39232463dff40eee 100644 (file)
@@ -102,6 +102,9 @@ int         max_stack_depth = 100;
 /* wait N seconds to allow attach from a debugger */
 int            PostAuthDelay = 0;
 
+/* Time between checks that the client is still connected. */
+int            client_connection_check_interval = 0;
+
 /* ----------------
  *     private typedefs etc
  * ----------------
@@ -2671,6 +2674,14 @@ start_xact_command(void)
     * not desired, the timeout has to be disabled explicitly.
     */
    enable_statement_timeout();
+
+   /* Start timeout for checking if the client has gone away if necessary. */
+   if (client_connection_check_interval > 0 &&
+       IsUnderPostmaster &&
+       MyProcPort &&
+       !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+       enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+                            client_connection_check_interval);
 }
 
 static void
@@ -3149,6 +3160,27 @@ ProcessInterrupts(void)
                    (errcode(ERRCODE_ADMIN_SHUTDOWN),
                     errmsg("terminating connection due to administrator command")));
    }
+
+   if (CheckClientConnectionPending)
+   {
+       CheckClientConnectionPending = false;
+
+       /*
+        * Check for lost connection and re-arm, if still configured, but not
+        * if we've arrived back at DoingCommandRead state.  We don't want to
+        * wake up idle sessions, and they already know how to detect lost
+        * connections.
+        */
+       if (!DoingCommandRead && client_connection_check_interval > 0)
+       {
+           if (!pq_check_connection())
+               ClientConnectionLost = true;
+           else
+               enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+                                    client_connection_check_interval);
+       }
+   }
+
    if (ClientConnectionLost)
    {
        QueryCancelPending = false; /* lost connection trumps QueryCancel */
index 73e0a672ae33d53e70cae0460ae4c936717c7fc5..a9f0fc3017c9433a23778140b203d052230f4a02 100644 (file)
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
 volatile sig_atomic_t InterruptPending = false;
 volatile sig_atomic_t QueryCancelPending = false;
 volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
 volatile sig_atomic_t ClientConnectionLost = false;
 volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
 volatile sig_atomic_t IdleSessionTimeoutPending = false;
index 7abeccb536285ed3ce64605e3c308342d9a0bcbc..a3ec358538a6542251b6359e0a7426611c84eece 100644 (file)
@@ -73,6 +73,7 @@ static void StatementTimeoutHandler(void);
 static void LockTimeoutHandler(void);
 static void IdleInTransactionSessionTimeoutHandler(void);
 static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
 static bool ThereIsAtLeastOneRole(void);
 static void process_startup_options(Port *port, bool am_superuser);
 static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +621,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
        RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
                        IdleInTransactionSessionTimeoutHandler);
        RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+       RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
    }
 
    /*
@@ -1242,6 +1244,14 @@ IdleSessionTimeoutHandler(void)
    SetLatch(MyLatch);
 }
 
+static void
+ClientCheckTimeoutHandler(void)    /* registered for CLIENT_CONNECTION_CHECK_TIMEOUT */
+{
+   CheckClientConnectionPending = true;    /* tested and cleared in ProcessInterrupts() */
+   InterruptPending = true;
+   SetLatch(MyLatch);
+}
+
 /*
  * Returns true if at least one role is defined in this database cluster.
  */
index 584daffc8a96fba4b0c7579685623eae20f8599b..60a9c7a2a0b5187c24b9731db462e6cd05ef3e7d 100644 (file)
@@ -20,6 +20,9 @@
 #include <float.h>
 #include <math.h>
 #include <limits.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -204,6 +207,7 @@ static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource sourc
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source);
 static bool check_huge_page_size(int *newval, void **extra, GucSource source);
+static bool check_client_connection_check_interval(int *newval, void **extra, GucSource source);
 static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
@@ -3501,6 +3505,17 @@ static struct config_int ConfigureNamesInt[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+           gettext_noop("Sets the time interval between checks for disconnection while running queries."),
+           NULL,
+           GUC_UNIT_MS
+       },
+       &client_connection_check_interval,
+       0, 0, INT_MAX,
+       check_client_connection_check_interval, NULL, NULL
+   },
+
    /* End-of-list marker */
    {
        {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
@@ -11980,6 +11995,20 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
    return true;
 }
 
+/* GUC check hook: only 0 (disabled) is valid where POLLRDHUP is missing. */
+static bool
+check_client_connection_check_interval(int *newval, void **extra, GucSource source)
+{
+#if defined(POLLRDHUP)
+   return true;                /* pq_check_connection() can poll here */
+#else
+   if (*newval == 0)
+       return true;
+   GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+   return false;
+#endif
+}
+
 static void
 assign_pgstat_temp_directory(const char *newval, void *extra)
 {
index 30cfddac1f70d1534ab20ecd24163dd69499ce53..39da7cc9427f26656701f166eead36a148272f9a 100644 (file)
 
 #dynamic_library_path = '$libdir'
 
+#client_connection_check_interval = 0  # time between checks for client
+                   # disconnection while running queries;
+                   # 0 for never
 
 #------------------------------------------------------------------------------
 # LOCK MANAGEMENT
index b20deeb5550d1a7d2021de2ee5a3c67f9a39c581..3ebbc8d6656bd46e896e8a65f7a612f8b1882526 100644 (file)
@@ -71,6 +71,7 @@ extern int    pq_getbyte(void);
 extern int pq_peekbyte(void);
 extern int pq_getbyte_if_available(unsigned char *c);
 extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_connection(void);
 
 /*
  * prototypes for functions in be-secure.c
index 013850ac288fbee5b78d26cbcfbda402154e7a74..6f8251e0b07d0ed2f39f237465a9ae201fb1d687 100644 (file)
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
 extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
 extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
 
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
 extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
 
 /* these are marked volatile because they are examined by signal handlers: */
index e5472100a436dd1edcaac328611d55e442ec5019..241e7c99614a1b968cbda0d46a2bf05c9b9bd894 100644 (file)
@@ -29,6 +29,7 @@ extern CommandDest whereToSendOutput;
 extern PGDLLIMPORT const char *debug_query_string;
 extern int max_stack_depth;
 extern int PostAuthDelay;
+extern int client_connection_check_interval;
 
 /* GUC-configurable parameters */
 
index ecb2a366a5f44783293eab2d79601a3f836c272f..93e6a691b3f36c30e54326527c25651922567861 100644 (file)
@@ -32,6 +32,7 @@ typedef enum TimeoutId
    STANDBY_LOCK_TIMEOUT,
    IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
    IDLE_SESSION_TIMEOUT,
+   CLIENT_CONNECTION_CHECK_TIMEOUT,
    /* First user-definable timeout reason */
    USER_TIMEOUT,
    /* Maximum number of timeout reasons */