pg_test_timing utility, to measure clock monotonicity and timing cost.
authorRobert Haas <[email protected]>
Tue, 27 Mar 2012 20:14:00 +0000 (16:14 -0400)
committerRobert Haas <[email protected]>
Tue, 27 Mar 2012 20:14:00 +0000 (16:14 -0400)
Ants Aasma, Greg Smith

contrib/Makefile
contrib/pg_test_timing/.gitignore [new file with mode: 0644]
contrib/pg_test_timing/Makefile [new file with mode: 0644]
contrib/pg_test_timing/pg_test_timing.c [new file with mode: 0644]
doc/src/sgml/config.sgml
doc/src/sgml/contrib.sgml
doc/src/sgml/filelist.sgml
doc/src/sgml/perform.sgml
doc/src/sgml/pgtesttiming.sgml [new file with mode: 0644]

index ac0a80a014059288acd39bb3175338cd5ef439d8..d230451a1297e793d39f542b4ee4a3454644ff24 100644 (file)
@@ -35,6 +35,7 @@ SUBDIRS = \
        pg_standby  \
        pg_stat_statements \
        pg_test_fsync   \
+       pg_test_timing  \
        pg_trgm     \
        pg_upgrade  \
        pg_upgrade_support \
diff --git a/contrib/pg_test_timing/.gitignore b/contrib/pg_test_timing/.gitignore
new file mode 100644 (file)
index 0000000..f6c664c
--- /dev/null
@@ -0,0 +1 @@
+/pg_test_timing
diff --git a/contrib/pg_test_timing/Makefile b/contrib/pg_test_timing/Makefile
new file mode 100644 (file)
index 0000000..b8b266a
--- /dev/null
@@ -0,0 +1,18 @@
+# contrib/pg_test_timing/Makefile
+
+PGFILEDESC = "pg_test_timing - test timing overhead"
+PGAPPICON = win32
+
+PROGRAM  = pg_test_timing
+OBJS = pg_test_timing.o
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_test_timing
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_test_timing/pg_test_timing.c b/contrib/pg_test_timing/pg_test_timing.c
new file mode 100644 (file)
index 0000000..cdfa07f
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * pg_test_timing.c
+ *     tests overhead of timing calls and their monotonicity:  that
+ *         they always move forward
+ */
+
+#include "postgres_fe.h"
+
+#include "getopt_long.h"
+#include "portability/instr_time.h"
+
+static const char *progname;
+
+static int32 test_duration = 3;
+
+static void handle_args(int argc, char *argv[]);
+static void test_timing(int32);
+
+int
+main(int argc, char *argv[])
+{
+   progname = get_progname(argv[0]);
+
+   handle_args(argc, argv);
+
+   test_timing(test_duration);
+
+   return 0;
+}
+
+static void
+handle_args(int argc, char *argv[])
+{
+   static struct option long_options[] = {
+       {"duration", required_argument, NULL, 'd'},
+       {NULL, 0, NULL, 0}
+   };
+   int option;         /* Command line option */
+   int optindex = 0;   /* used by getopt_long */
+
+   if (argc > 1)
+   {
+       if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0 ||
+           strcmp(argv[1], "-?") == 0)
+       {
+           printf("Usage: %s [-d DURATION]\n", progname);
+           exit(0);
+       }
+       if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+       {
+           puts("pg_test_timing (PostgreSQL) " PG_VERSION);
+           exit(0);
+       }
+   }
+
+   while ((option = getopt_long(argc, argv, "d:",
+                                long_options, &optindex)) != -1)
+   {
+       switch (option)
+       {
+           case 'd':
+               test_duration = atoi(optarg);
+               break;
+
+           default:
+               fprintf(stderr, "Try \"%s --help\" for more information.\n",
+                       progname);
+               exit(1);
+               break;
+       }
+   }
+
+   if (argc > optind)
+   {
+       fprintf(stderr,
+               "%s: too many command-line arguments (first is \"%s\")\n",
+               progname, argv[optind]);
+       fprintf(stderr, "Try \"%s --help\" for more information.\n",
+               progname);
+       exit(1);
+   }
+
+   if (test_duration > 0)
+   {
+       printf("Testing timing overhead for %d seconds.\n", test_duration);
+   }
+   else
+   {
+       fprintf(stderr,
+               "%s: duration must be a positive integer (duration is \"%d\")\n",
+               progname, test_duration);
+       fprintf(stderr, "Try \"%s --help\" for more information.\n",
+               progname);
+       exit(1);
+   }
+}
+
+static void
+test_timing(int32 duration)
+{
+   uint64 total_time;
+   int64 time_elapsed = 0;
+   uint64 loop_count = 0;
+   uint64 prev, cur;
+   int32 diff, i, bits, found;
+
+   instr_time start_time, end_time, temp;
+
+   static int64 histogram[32];
+
+   total_time = duration > 0 ? duration * 1000000 : 0;
+
+   INSTR_TIME_SET_CURRENT(start_time);
+   cur = INSTR_TIME_GET_MICROSEC(start_time);
+
+   while (time_elapsed < total_time)
+   {
+       prev = cur;
+       INSTR_TIME_SET_CURRENT(temp);
+       cur = INSTR_TIME_GET_MICROSEC(temp);
+       diff = cur - prev;
+
+       if (diff < 0)
+       {
+           printf("Detected clock going backwards in time.\n");
+           printf("Time warp: %d microseconds\n", diff);
+           exit(1);
+       }
+
+       bits = 0;
+       while (diff)
+       {
+           diff >>= 1;
+           bits++;
+       }
+       histogram[bits]++;
+
+       loop_count++;
+       INSTR_TIME_SUBTRACT(temp, start_time);
+       time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
+   }
+
+   INSTR_TIME_SET_CURRENT(end_time);
+
+   INSTR_TIME_SUBTRACT(end_time, start_time);
+
+   printf("Per loop time including overhead: %0.2f nsec\n",
+           INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count);
+   printf("Histogram of timing durations:\n");
+   printf("%9s: %10s %9s\n", "< usec", "count", "percent");
+
+   found = 0;
+    for (i = 31; i >= 0; i--)
+    {
+        if (found || histogram[i])
+        {
+            found = 1;
+            printf("%9ld: %10ld %8.5f%%\n", 1l << i, histogram[i],
+                (double) histogram[i] * 100 / loop_count);
+        }
+    }
+}
index 9baaa3fb36acd9c6a4ff50a7074a5b7491034dca..0a5d519214da042a6ad53a2e92e77b42221e9513 100644 (file)
@@ -4294,7 +4294,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
         Enables timing of database I/O calls.  This parameter is off by
         default, because it will repeatedly query the operating system for
         the current time, which may cause significant overhead on some
-        platforms.  Only superusers can change this setting.
+        platforms.  You can use the <xref linkend="pgtesttiming"> tool to
+        measure the overhead of timing on your system.  Only superusers can
+        change this setting.
        </para>
       </listitem>
      </varlistentry>
index d4da4eec87d9c8e5d7ec550e70ecd3473d5adc59..97031dddaad92d1a8ced6d43a537b1a978873ba1 100644 (file)
@@ -121,6 +121,7 @@ CREATE EXTENSION <replaceable>module_name</> FROM unpackaged;
  &pgstatstatements;
  &pgstattuple;
  &pgtestfsync;
+ &pgtesttiming;
  &pgtrgm;
  &pgupgrade;
  &seg;
index b5d3c6d4fce27ffcc56162cc283f4e032312ca51..428a1672783c6c496e7a9dd1564d2c6af7ff533d 100644 (file)
 <!ENTITY pgstatstatements SYSTEM "pgstatstatements.sgml">
 <!ENTITY pgstattuple     SYSTEM "pgstattuple.sgml">
 <!ENTITY pgtestfsync     SYSTEM "pgtestfsync.sgml">
+<!ENTITY pgtesttiming    SYSTEM "pgtesttiming.sgml">
 <!ENTITY pgtrgm          SYSTEM "pgtrgm.sgml">
 <!ENTITY pgupgrade       SYSTEM "pgupgrade.sgml">
 <!ENTITY seg             SYSTEM "seg.sgml">
index 8e695fd54019547c5bddf28c2c4c3d1b20e1c731..5a8c6fc7dc8633e59552a3c61e5d604f6934941b 100644 (file)
@@ -770,7 +770,9 @@ ROLLBACK;
     network transmission costs and I/O conversion costs are not included.
     Second, the measurement overhead added by <command>EXPLAIN
     ANALYZE</command> can be significant, especially on machines with slow
-    <function>gettimeofday()</> operating-system calls.
+    <function>gettimeofday()</> operating-system calls. You can use the
+    <xref linkend="pgtesttiming"> tool to measure the overhead of timing
+    on your system.
    </para>
 
    <para>
diff --git a/doc/src/sgml/pgtesttiming.sgml b/doc/src/sgml/pgtesttiming.sgml
new file mode 100644 (file)
index 0000000..9bdf3e3
--- /dev/null
@@ -0,0 +1,261 @@
+<!-- doc/src/sgml/pgtesttiming.sgml -->
+
+<sect1 id="pgtesttiming" xreflabel="pg_test_timing">
+ <title>pg_test_timing</title>
+
+ <indexterm zone="pgtesttiming">
+  <primary>pg_test_timing</primary>
+ </indexterm>
+
+ <para>
+  <application>pg_test_timing</> is a tool to measure the timing overhead
+  on your system and confirm that the system time never moves backwards.
+  Systems that are slow to collect timing data can give less accurate 
+  <command>EXPLAIN ANALYZE</command> results.
+ </para>
+
+ <sect2>
+  <title>Usage</title>
+
+<synopsis>
+pg_test_timing [options]
+</synopsis>
+
+   <para>
+    <application>pg_test_timing</application> accepts the following
+    command-line options:
+
+    <variablelist>
+
+     <varlistentry>
+      <term><option>-d</option></term>
+      <term><option>--duration</option></term>
+      <listitem>
+       <para>
+        Specifies the test duration, in seconds. Longer durations
+        give slightly better accuracy, and are more likely to discover
+        problems with the system clock moving backwards. The default
+        test duration is 3 seconds.
+       </para>
+      </listitem>
+     </varlistentry>
+
+    </variablelist>
+   </para>
+
+ </sect2>
+
+ <sect2>
+  <title>Interpreting results</title>
+
+  <para>
+        Good results will show most (>90%) individual timing calls take less
+        than one microsecond. Average per loop overhead will be even lower,
+        below 100 nanoseconds. This example from an Intel i7-860 system using
+        a TSC clock source shows excellent performance:
+  </para>
+
+<screen>
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 35.96 nsec
+Histogram of timing durations:
+   < usec:      count   percent
+       16:          2  0.00000%
+        8:         13  0.00002%
+        4:        126  0.00015%
+        2:    2999652  3.59518%
+        1:   80435604 96.40465%
+</screen>
+
+  <para>
+        Note that different units are used for the per loop time than the
+        histogram. The loop can have resolution within a few nanoseconds
+        (nsec), while the individual timing calls can only resolve down to
+        one microsecond (usec).
+  </para>
+
+ </sect2>
+ <sect2>
+  <title>Measuring executor timing overhead</title>
+
+  <para>
+        When the query executor is running a statement using
+        <command>EXPLAIN ANALYZE</command>, individual operations are
+        timed as well as showing a summary.  The overhead of your system
+        can be checked by counting rows with the psql program:
+  </para>
+
+<screen>
+CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
+\timing
+SELECT COUNT(*) FROM t;
+EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
+</screen>
+
+  <para>
+        The i7-860 system measured runs the count query in 9.8 ms while
+        the <command>EXPLAIN ANALYZE</command> version takes 16.6 ms,
+        each processing just over 100,000 rows.  That 6.8 ms difference
+        means the timing overhead per row is 68 ns, about twice what
+        pg_test_timing estimated it would be.  Even that relatively
+        small amount of overhead is making the fully timed count statement
+        take almost 70% longer.  On more substantial queries, the
+        timing overhead would be less problematic.
+  </para>
+
+ </sect2>
+ <sect2>
+  <title>Changing time sources</title>
+  <para>
+        On some newer Linux systems, it's possible to change the clock
+        source used to collect timing data at any time.  A second example
+        shows the slowdown possible from switching to the slower acpi_pm
+        time source, on the same system used for the fast results above:
+  </para>
+
+<screen>
+# cat /sys/devices/system/clocksource/clocksource0/available_clocksource 
+tsc hpet acpi_pm
+# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource
+# pg_test_timing
+Per loop time including overhead: 722.92 nsec
+Histogram of timing durations:
+   < usec:      count   percent
+       16:          3  0.00007%
+        8:        563  0.01357%
+        4:       3241  0.07810%
+        2:    2990371 72.05956%
+        1:    1155682 27.84870%
+</screen>
+
+  <para>
+       In this configuration, the sample <command>EXPLAIN ANALYZE</command>
+       above takes 115.9 ms.  That's 1061 nsec of timing overhead, again
+       a small multiple of what's measured directly by this utility.
+       That much timing overhead means the actual query itself is only
+       taking a tiny fraction of the accounted for time, most of it
+       is being consumed in overhead instead.  In this configuration,
+       any <command>EXPLAIN ANALYZE</command> totals involving many
+       timed operations would be inflated significantly by timing overhead.
+  </para>
+
+  <para>
+       FreeBSD also allows changing the time source on the fly, and
+       it logs information about the timer selected during boot:
+  </para>
+
+<screen>
+dmesg | grep "Timecounter"       
+sysctl kern.timecounter.hardware=TSC
+</screen>
+
+  <para>
+       Other systems may only allow setting the time source on boot.
+       On older Linux systems the "clock" kernel setting is the only way
+       to make this sort of change.  And even on some more recent ones,
+       the only option you'll see for a clock source is "jiffies".  Jiffies
+       are the older Linux software clock implementation, which can have
+       good resolution when it's backed by fast enough timing hardware,
+       as in this example:
+  </para>       
+
+<screen>
+$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource 
+jiffies 
+$ dmesg | grep time.c
+time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer.
+time.c: Detected 2400.153 MHz processor.
+$ ./pg_test_timing 
+Testing timing overhead for 3 seconds.
+Per timing duration including loop overhead: 97.75 ns
+Histogram of timing durations:
+   < usec:      count   percent
+       32:          1  0.00000%
+       16:          1  0.00000%
+        8:         22  0.00007%
+        4:       3010  0.00981%
+        2:    2993204  9.75277%
+        1:   27694571 90.23734%
+</screen>
+
+ </sect2>
+ <sect2>
+  <title>Clock hardware and timing accuracy</title>
+
+  <para>
+       Collecting accurate timing information is normally done on computers
+       using hardware clocks with various levels of accuracy.  With some
+       hardware the operating systems can pass the system clock time almost
+       directly to programs.  A system clock can also be derived from a chip
+       that simply provides timing interrupts, periodic ticks at some known
+       time interval.  In either case, operating system kernels provide
+       a clock source that hides these details.  But the accuracy of that
+       clock source and how quickly it can return results varies based
+       on the underlying hardware.
+  </para>
+
+  <para>
+        Inaccurate time keeping can result in system instability.  Test
+        any change to the clock source very carefully.  Operating system
+        defaults are sometimes made to favor reliability over best
+        accuracy. And if you are using a virtual machine, look into the
+        recommended time sources compatible with it.  Virtual hardware
+        faces additional difficulties when emulating timers, and there
+        are often per operating system settings suggested by vendors.
+  </para>
+
+  <para>
+        The Time Stamp Counter (TSC) clock source is the most accurate one
+        available on current generation CPUs. It's the preferred way to track
+        the system time when it's supported by the operating system and the
+        TSC clock is reliable. There are several ways that TSC can fail
+        to provide an accurate timing source, making it unreliable. Older
+        systems can have a TSC clock that varies based on the CPU
+        temperature, making it unusable for timing. Trying to use TSC on some
+        older multi-core CPUs can give a reported time that's inconsistent
+        among multiple cores. This can result in the time going backwards, a
+        problem this program checks for.  And even the newest systems can
+        fail to provide accurate TSC timing with very aggressive power saving
+        configurations.
+  </para>
+
+  <para>
+        Newer operating systems may check for the known TSC problems and
+        switch to a slower, more stable clock source when they are seen. 
+        If your system supports TSC time but doesn't default to that, it
+        may be disabled for a good reason.  And some operating systems may
+        not detect all the possible problems correctly, or will allow using
+        TSC even in situations where it's known to be inaccurate.
+  </para>
+
+  <para>
+        The High Precision Event Timer (HPET) is the preferred timer on
+        systems where it's available and TSC is not accurate.  The timer
+        chip itself is programmable to allow up to 100 nanosecond resolution,
+        but you may not see that much accuracy in your system clock.
+  </para>
+
+  <para>
+        Advanced Configuration and Power Interface (ACPI) provides a
+        Power Management (PM) Timer, which Linux refers to as the acpi_pm.
+        The clock derived from acpi_pm will at best provide 300 nanosecond
+        resolution.
+  </para>
+
+  <para>
+        Timers used on older PC hardware including the 8254 Programmable
+        Interval Timer (PIT), the real-time clock (RTC), the Advanced
+        Programmable Interrupt Controller (APIC) timer, and the Cyclone
+        timer.  These timers aim for millisecond resolution.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Author</title>
+
+  <para>
+   Ants Aasma <email>[email protected]</email>
+  </para>
+ </sect2>
+
+</sect1>