Track shared buffer hits in pg_stat_io
Author:    Andres Freund <[email protected]>
           Fri, 31 Mar 2023 02:22:40 +0000 (19:22 -0700)
Committer: Andres Freund <[email protected]>
           Fri, 31 Mar 2023 02:24:21 +0000 (19:24 -0700)
Among other things, this should make it easier to calculate a useful cache hit
ratio by excluding buffer reads via buffer access strategies. As buffer access
strategies reuse buffers (and thus evict the prior buffer contents), it is
normal to see reads on repeated scans of the same data.

Author: Melanie Plageman <[email protected]>
Reviewed-by: Bertrand Drouvot <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/CAAKRu_beMa9Hzih40%3DXPYqhDVz6tsgUGTrhZXRo%3Dunp%2Bszb%3DUA%40mail.gmail.com

12 files changed:
doc/src/sgml/monitoring.sgml
src/backend/catalog/system_views.sql
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/utils/activity/pgstat_io.c
src/backend/utils/adt/pgstatfuncs.c
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/include/storage/buf_internals.h
src/test/regress/expected/rules.out
src/test/regress/expected/stats.out
src/test/regress/sql/stats.sql

index c809ff1ba4a56476c01fbb41239d81c31846f4a5..d5a45f996d0dc3944f971b855409ebb01711b2a1 100644 (file)
@@ -3855,6 +3855,17 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>hits</structfield> <type>bigint</type>
+       </para>
+       <para>
+        The number of times a desired block was found in a shared buffer.
+       </para>
+      </entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
index 9508d8ba552cb1a81254d7bc4a86d8dcb871e77f..574cbc2e4482dbc01b9ab8cdf698676b5ee26989 100644 (file)
@@ -1128,6 +1128,7 @@ SELECT
        b.writes,
        b.extends,
        b.op_bytes,
+       b.hits,
        b.evictions,
        b.reuses,
        b.fsyncs,
index fe029d2ea60024a05b9ee9ca4d0ec774387a3208..b3adbbe7d2323bc270660f198d060191cd74afcf 100644 (file)
@@ -472,7 +472,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
                               ForkNumber forkNum,
                               BlockNumber blockNum,
                               BufferAccessStrategy strategy,
-                              bool *foundPtr, IOContext *io_context);
+                              bool *foundPtr, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
                        IOObject io_object, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
@@ -850,13 +850,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    if (isLocalBuf)
    {
        /*
-        * LocalBufferAlloc() will set the io_context to IOCONTEXT_NORMAL. We
-        * do not use a BufferAccessStrategy for I/O of temporary tables.
+        * We do not use a BufferAccessStrategy for I/O of temporary tables.
         * However, in some cases, the "strategy" may not be NULL, so we can't
         * rely on IOContextForStrategy() to set the right IOContext for us.
         * This may happen in cases like CREATE TEMPORARY TABLE AS...
         */
-       bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, &io_context);
+       io_context = IOCONTEXT_NORMAL;
+       io_object = IOOBJECT_TEMP_RELATION;
+       bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
        if (found)
            pgBufferUsage.local_blks_hit++;
        else if (isExtend)
@@ -871,8 +872,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
         * not currently in memory.
         */
+       io_context = IOContextForStrategy(strategy);
+       io_object = IOOBJECT_RELATION;
        bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
-                            strategy, &found, &io_context);
+                            strategy, &found, io_context);
        if (found)
            pgBufferUsage.shared_blks_hit++;
        else if (isExtend)
@@ -892,6 +895,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
            /* Just need to update stats before we exit */
            *hit = true;
            VacuumPageHit++;
+           pgstat_count_io_op(io_object, io_context, IOOP_HIT);
 
            if (VacuumCostActive)
                VacuumCostBalance += VacuumCostPageHit;
@@ -987,16 +991,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     */
    Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));   /* spinlock not needed */
 
-   if (isLocalBuf)
-   {
-       bufBlock = LocalBufHdrGetBlock(bufHdr);
-       io_object = IOOBJECT_TEMP_RELATION;
-   }
-   else
-   {
-       bufBlock = BufHdrGetBlock(bufHdr);
-       io_object = IOOBJECT_RELATION;
-   }
+   bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
    if (isExtend)
    {
@@ -1139,7 +1134,7 @@ static BufferDesc *
 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
            BlockNumber blockNum,
            BufferAccessStrategy strategy,
-           bool *foundPtr, IOContext *io_context)
+           bool *foundPtr, IOContext io_context)
 {
    bool        from_ring;
    BufferTag   newTag;         /* identity of requested block */
@@ -1193,11 +1188,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
            {
                /*
                 * If we get here, previous attempts to read the buffer must
-                * have failed ... but we shall bravely try again. Set
-                * io_context since we will in fact need to count an IO
-                * Operation.
+                * have failed ... but we shall bravely try again.
                 */
-               *io_context = IOContextForStrategy(strategy);
                *foundPtr = false;
            }
        }
@@ -1211,8 +1203,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     */
    LWLockRelease(newPartitionLock);
 
-   *io_context = IOContextForStrategy(strategy);
-
    /* Loop here in case we have to try another victim buffer */
    for (;;)
    {
@@ -1295,7 +1285,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                                          smgr->smgr_rlocator.locator.dbOid,
                                                          smgr->smgr_rlocator.locator.relNumber);
 
-               FlushBuffer(buf, NULL, IOOBJECT_RELATION, *io_context);
+               FlushBuffer(buf, NULL, IOOBJECT_RELATION, io_context);
                LWLockRelease(BufferDescriptorGetContentLock(buf));
 
                ScheduleBufferTagForWriteback(&BackendWritebackContext,
@@ -1494,7 +1484,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         * we may have been forced to release the buffer due to concurrent
         * pinners or erroring out.
         */
-       pgstat_count_io_op(IOOBJECT_RELATION, *io_context,
+       pgstat_count_io_op(IOOBJECT_RELATION, io_context,
                           from_ring ? IOOP_REUSE : IOOP_EVICT);
    }
 
index 68b4817c67bcb44b298c174215e05ad4bf1a8569..6f9e7eda57cfab54cb44d232ccf7305f79d16fc9 100644 (file)
@@ -108,7 +108,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
  */
 BufferDesc *
 LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
-                bool *foundPtr, IOContext *io_context)
+                bool *foundPtr)
 {
    BufferTag   newTag;         /* identity of requested block */
    LocalBufferLookupEnt *hresult;
@@ -128,14 +128,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
    hresult = (LocalBufferLookupEnt *)
        hash_search(LocalBufHash, &newTag, HASH_FIND, NULL);
 
-   /*
-    * IO Operations on local buffers are only done in IOCONTEXT_NORMAL. Set
-    * io_context here (instead of after a buffer hit would have returned) for
-    * convenience since we don't have to worry about the overhead of calling
-    * IOContextForStrategy().
-    */
-   *io_context = IOCONTEXT_NORMAL;
-
    if (hresult)
    {
        b = hresult->id;
@@ -239,6 +231,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
        buf_state &= ~BM_DIRTY;
        pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
+       /* Temporary table I/O does not use Buffer Access Strategies */
        pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
        pgBufferUsage.local_blks_written++;
    }
index af5d554610195fb38eb9f0f096ea45c1f1fec111..ae8bb34f78b1677809e871144277584ba9688e4f 100644 (file)
@@ -344,7 +344,7 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
     * Some BackendTypes will not do certain IOOps.
     */
    if ((bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) &&
-       (io_op == IOOP_READ || io_op == IOOP_EVICT))
+       (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
        return false;
 
    if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
index 0ab31ec6e3b38173bd752a1a3c8fef048aa355c7..eec9f3cf9b1681a266a6ede4a97a9f4bda3911b2 100644 (file)
@@ -1259,6 +1259,7 @@ typedef enum io_stat_col
    IO_COL_WRITES,
    IO_COL_EXTENDS,
    IO_COL_CONVERSION,
+   IO_COL_HITS,
    IO_COL_EVICTIONS,
    IO_COL_REUSES,
    IO_COL_FSYNCS,
@@ -1277,16 +1278,18 @@ pgstat_get_io_op_index(IOOp io_op)
    {
        case IOOP_EVICT:
            return IO_COL_EVICTIONS;
+       case IOOP_EXTEND:
+           return IO_COL_EXTENDS;
+       case IOOP_FSYNC:
+           return IO_COL_FSYNCS;
+       case IOOP_HIT:
+           return IO_COL_HITS;
        case IOOP_READ:
            return IO_COL_READS;
        case IOOP_REUSE:
            return IO_COL_REUSES;
        case IOOP_WRITE:
            return IO_COL_WRITES;
-       case IOOP_EXTEND:
-           return IO_COL_EXTENDS;
-       case IOOP_FSYNC:
-           return IO_COL_FSYNCS;
    }
 
    elog(ERROR, "unrecognized IOOp value: %d", io_op);
index 5736c1082cfe246d5fe7478877a1eed5240dbd31..f9f264220153aaa78e68fb81599871f898da2191 100644 (file)
   proname => 'pg_stat_get_io', provolatile => 'v',
   prorows => '30', proretset => 't',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}',
+  proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,int8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,hits,evictions,reuses,fsyncs,stats_reset}',
   prosrc => 'pg_stat_get_io' },
 
 { oid => '1136', descr => 'statistics: information about WAL activity',
index 17ee94d8b66b7a06e7a9056b8ae5ee83364a1f9d..a56182af9fa17c10de19198fe88d469d591d9b43 100644 (file)
@@ -296,6 +296,7 @@ typedef enum IOOp
    IOOP_EVICT,
    IOOP_EXTEND,
    IOOP_FSYNC,
+   IOOP_HIT,
    IOOP_READ,
    IOOP_REUSE,
    IOOP_WRITE,
index 0b448147407012a7cf990253f877b9f4ca63ff72..2afb9bb3099b8929831f26003af5b3e512107599 100644 (file)
@@ -419,7 +419,7 @@ extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
                                                ForkNumber forkNum,
                                                BlockNumber blockNum);
 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
-                                   BlockNumber blockNum, bool *foundPtr, IOContext *io_context);
+                                   BlockNumber blockNum, bool *foundPtr);
 extern void MarkLocalBufferDirty(Buffer buffer);
 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
                                     ForkNumber forkNum,
index c71758db46e926b65013b3e02fbf5b27fc0d0b98..ab1aebfde420b06be22662e65802a925e574312f 100644 (file)
@@ -1884,11 +1884,12 @@ pg_stat_io| SELECT backend_type,
     writes,
     extends,
     op_bytes,
+    hits,
     evictions,
     reuses,
     fsyncs,
     stats_reset
-   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset);
+   FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, hits, evictions, reuses, fsyncs, stats_reset);
 pg_stat_progress_analyze| SELECT s.pid,
     s.datid,
     d.datname,
index 55b4c6df014346d00667f36395fd4c0c62ae10ea..5f1821938d57c75744ab26eebc7802cbee4dc0e6 100644 (file)
@@ -1131,6 +1131,7 @@ SELECT pg_stat_get_subscription_stats(NULL);
 -- - writes of shared buffers to permanent storage
 -- - extends of relations using shared buffers
 -- - fsyncs done to ensure the durability of data dirtying shared buffers
+-- - shared buffer hits
 -- There is no test for blocks evicted from shared buffers, because we cannot
 -- be sure of the state of shared buffers at the point the test is run.
 -- Create a regular table and insert some data to generate IOCONTEXT_NORMAL
@@ -1208,6 +1209,47 @@ SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
  t
 (1 row)
 
+SELECT sum(hits) AS io_sum_shared_before_hits
+  FROM pg_stat_io WHERE io_context = 'normal' AND io_object = 'relation' \gset
+-- Select from the table again to count hits.
+-- Ensure we generate hits by forcing a nested loop self-join with no
+-- materialize node. The outer side's buffer will stay pinned, preventing its
+-- eviction, while we loop through the inner side and generate hits.
+BEGIN;
+SET LOCAL enable_nestloop TO on; SET LOCAL enable_mergejoin TO off;
+SET LOCAL enable_hashjoin TO off; SET LOCAL enable_material TO off;
+-- ensure plan stays as we expect it to
+EXPLAIN (COSTS OFF) SELECT COUNT(*) FROM test_io_shared t1 INNER JOIN test_io_shared t2 USING (a);
+                QUERY PLAN                 
+-------------------------------------------
+ Aggregate
+   ->  Nested Loop
+         Join Filter: (t1.a = t2.a)
+         ->  Seq Scan on test_io_shared t1
+         ->  Seq Scan on test_io_shared t2
+(5 rows)
+
+SELECT COUNT(*) FROM test_io_shared t1 INNER JOIN test_io_shared t2 USING (a);
+ count 
+-------
+   100
+(1 row)
+
+COMMIT;
+SELECT pg_stat_force_next_flush();
+ pg_stat_force_next_flush 
+--------------------------
+(1 row)
+
+SELECT sum(hits) AS io_sum_shared_after_hits
+  FROM pg_stat_io WHERE io_context = 'normal' AND io_object = 'relation' \gset
+SELECT :io_sum_shared_after_hits > :io_sum_shared_before_hits;
+ ?column? 
+----------
+ t
+(1 row)
+
 DROP TABLE test_io_shared;
-- Test that the following IOCONTEXT_LOCAL IOOps are tracked in pg_stat_io:
 -- - eviction of local buffers in order to reuse them
@@ -1342,7 +1384,7 @@ SELECT pg_stat_have_stats('io', 0, 0);
  t
 (1 row)
 
-SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) AS io_stats_pre_reset
+SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) + sum(hits) AS io_stats_pre_reset
   FROM pg_stat_io \gset
 SELECT pg_stat_reset_shared('io');
  pg_stat_reset_shared 
@@ -1350,7 +1392,7 @@ SELECT pg_stat_reset_shared('io');
  
 (1 row)
 
-SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) AS io_stats_post_reset
+SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) + sum(hits) AS io_stats_post_reset
   FROM pg_stat_io \gset
 SELECT :io_stats_post_reset < :io_stats_pre_reset;
  ?column? 
index d958e70a866cbd2b19388f763a69550cdb559fc6..58db803ed69b274017cc45700e73b35033b818a5 100644 (file)
@@ -541,6 +541,7 @@ SELECT pg_stat_get_subscription_stats(NULL);
 -- - writes of shared buffers to permanent storage
 -- - extends of relations using shared buffers
 -- - fsyncs done to ensure the durability of data dirtying shared buffers
+-- - shared buffer hits
 
 -- There is no test for blocks evicted from shared buffers, because we cannot
 -- be sure of the state of shared buffers at the point the test is run.
@@ -587,6 +588,25 @@ SELECT pg_stat_force_next_flush();
 SELECT sum(reads) AS io_sum_shared_after_reads
   FROM pg_stat_io WHERE io_context = 'normal' AND io_object = 'relation'  \gset
 SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+
+SELECT sum(hits) AS io_sum_shared_before_hits
+  FROM pg_stat_io WHERE io_context = 'normal' AND io_object = 'relation' \gset
+-- Select from the table again to count hits.
+-- Ensure we generate hits by forcing a nested loop self-join with no
+-- materialize node. The outer side's buffer will stay pinned, preventing its
+-- eviction, while we loop through the inner side and generate hits.
+BEGIN;
+SET LOCAL enable_nestloop TO on; SET LOCAL enable_mergejoin TO off;
+SET LOCAL enable_hashjoin TO off; SET LOCAL enable_material TO off;
+-- ensure plan stays as we expect it to
+EXPLAIN (COSTS OFF) SELECT COUNT(*) FROM test_io_shared t1 INNER JOIN test_io_shared t2 USING (a);
+SELECT COUNT(*) FROM test_io_shared t1 INNER JOIN test_io_shared t2 USING (a);
+COMMIT;
+SELECT pg_stat_force_next_flush();
+SELECT sum(hits) AS io_sum_shared_after_hits
+  FROM pg_stat_io WHERE io_context = 'normal' AND io_object = 'relation' \gset
+SELECT :io_sum_shared_after_hits > :io_sum_shared_before_hits;
+
 DROP TABLE test_io_shared;
 
-- Test that the following IOCONTEXT_LOCAL IOOps are tracked in pg_stat_io:
@@ -674,10 +694,10 @@ SELECT :io_sum_bulkwrite_strategy_extends_after > :io_sum_bulkwrite_strategy_ext
 
 -- Test IO stats reset
 SELECT pg_stat_have_stats('io', 0, 0);
-SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) AS io_stats_pre_reset
+SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) + sum(hits) AS io_stats_pre_reset
   FROM pg_stat_io \gset
 SELECT pg_stat_reset_shared('io');
-SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) AS io_stats_post_reset
+SELECT sum(evictions) + sum(reuses) + sum(extends) + sum(fsyncs) + sum(reads) + sum(writes) + sum(hits) AS io_stats_post_reset
   FROM pg_stat_io \gset
 SELECT :io_stats_post_reset < :io_stats_pre_reset;