In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned. The exact requirements vary by OS and file system (typically
sectors and/or memory pages). The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sector sizes and the common memory page size. There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
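To make the constraint concrete, here is a minimal standalone sketch of an
O_DIRECT write that satisfies the assumed requirement: buffer address, file
offset and transfer length are all multiples of 4096. It is not part of this
patch; the file name is arbitrary and the Linux spelling of O_DIRECT is used
(macOS instead uses fcntl(F_NOCACHE), as noted further down).

    #define _GNU_SOURCE             /* for O_DIRECT on Linux */
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    #define IO_ALIGN 4096

    int
    main(void)
    {
        void   *buf;
        int     fd;

        /* posix_memalign() guarantees a 4096-aligned buffer address */
        if (posix_memalign(&buf, IO_ALIGN, IO_ALIGN) != 0)
            return 1;
        memset(buf, 0, IO_ALIGN);

        fd = open("scratch.dat", O_WRONLY | O_CREAT | O_DIRECT, 0600);
        if (fd < 0)
            return 1;

        /* aligned address, aligned offset, aligned length */
        (void) pwrite(fd, buf, IO_ALIGN, 0);

        close(fd);
        free(buf);
        return 0;
    }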
Three classes of I/O buffers for regular data pages are adjusted (see
the sketch after this list):
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler.
(3) The buffer pool is also aligned in shared memory.
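As a rough illustration of (1) and (2), a hypothetical backend helper (not
part of this patch) could obtain a suitably aligned block buffer either way;
both pointers then satisfy the PG_IO_ALIGN_SIZE expectations of the smgr
layer:

    #include "postgres.h"

    #include "storage/bufpage.h"
    #include "storage/smgr.h"
    #include "utils/memutils.h"

    /* Hypothetical helper, for illustration only. */
    static void
    write_zero_block(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
    {
        PGIOAlignedBlock stackbuf;  /* (2) stack buffer, aligned by the compiler */
        Page        heapbuf;

        /* (1) heap buffer, aligned by the allocator from commit 439f6175 */
        heapbuf = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);

        memset(stackbuf.data, 0, sizeof(stackbuf.data));

        /* either buffer passes the new alignment assertions in md.c */
        smgrextend(reln, forknum, blkno, stackbuf.data, false);
        smgrwrite(reln, forknum, blkno, heapbuf, false);

        pfree(heapbuf);
    }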
WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0. This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead. That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <[email protected]>
Author: Andres Freund <[email protected]>
Reviewed-by: Justin Pryzby <[email protected]>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
Page metapage;
/* Construct metapage. */
- metapage = (Page) palloc(BLCKSZ);
+ metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
BloomFillMetapage(index, metapage);
/*
PREWARM_BUFFER
} PrewarmType;
-static PGAlignedBlock blockbuffer;
+static PGIOAlignedBlock blockbuffer;
/*
* pg_prewarm(regclass, mode text, fork text,
* Write an empty page as a placeholder for the root page. It will be
* replaced with the real root page at the end.
*/
- page = palloc0(BLCKSZ);
+ page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
page, true);
state->pages_allocated++;
levelstate->current_page++;
if (levelstate->pages[levelstate->current_page] == NULL)
- levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+ levelstate->pages[levelstate->current_page] =
+ palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
newPage = levelstate->pages[levelstate->current_page];
gistinitpage(newPage, old_page_flags);
/* Create page and copy data */
data = (char *) (dist->list);
- target = palloc0(BLCKSZ);
+ target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
gistinitpage(target, isleaf ? F_LEAF : 0);
for (int i = 0; i < dist->block.num; i++)
{
if (parent == NULL)
{
parent = palloc0(sizeof(GistSortedBuildLevelState));
- parent->pages[0] = (Page) palloc(BLCKSZ);
+ parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
parent->parent = NULL;
gistinitpage(parent->pages[0], 0);
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
BlockNumber lastblock;
- PGAlignedBlock zerobuf;
+ PGIOAlignedBlock zerobuf;
Page page;
HashPageOpaque ovflopaque;
state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
- state->rs_buffer = (Page) palloc(BLCKSZ);
+ state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
state->rs_buffer_valid = false;
Page metapage;
/* Construct metapage. */
- metapage = (Page) palloc(BLCKSZ);
+ metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
/*
Page page;
BTPageOpaque opaque;
- page = (Page) palloc(BLCKSZ);
+ page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
while (blkno > wstate->btws_pages_written)
{
if (!wstate->btws_zeropage)
- wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
+ wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
+ PG_IO_ALIGN_SIZE,
+ MCXT_ALLOC_ZERO);
/* don't set checksum for all-zero page */
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
wstate->btws_pages_written++,
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
- metapage = (Page) palloc(BLCKSZ);
+ metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
_bt_initmetapage(metapage, rootblkno, rootlevel,
wstate->inskey->allequalimage);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
Page page;
/* Construct metapage. */
- page = (Page) palloc(BLCKSZ);
+ page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
SpGistInitMetapage(page);
/*
char delta[MAX_DELTA_SIZE]; /* delta between page images */
} PageData;
-/* State of generic xlog record construction */
+/*
+ * State of generic xlog record construction. Must be allocated at an I/O
+ * aligned address.
+ */
struct GenericXLogState
{
+ /* Page images (properly aligned, must be first) */
+ PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
/* Info about each page, see above */
PageData pages[MAX_GENERIC_XLOG_PAGES];
bool isLogged;
- /* Page images (properly aligned) */
- PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
};
static void writeFragment(PageData *pageData, OffsetNumber offset,
GenericXLogState *state;
int i;
- state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
+ state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
+ PG_IO_ALIGN_SIZE,
+ 0);
state->isLogged = RelationNeedsWAL(relation);
for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
/* xlblocks array */
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
/* extra alignment padding for XLOG I/O buffers */
- size = add_size(size, XLOG_BLCKSZ);
+ size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
/* and the buffers themselves */
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
ForkNumber forkNum, char relpersistence)
{
- PGAlignedBlock buf;
+ PGIOAlignedBlock buf;
Page page;
bool use_wal;
bool copying_initfork;
NBuffers * sizeof(BufferDescPadded),
&foundDescs);
+ /* Align buffer pool on IO page size boundary. */
BufferBlocks = (char *)
- ShmemInitStruct("Buffer Blocks",
- NBuffers * (Size) BLCKSZ, &foundBufs);
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ ShmemInitStruct("Buffer Blocks",
+ NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ &foundBufs));
/* Align condition variables to cacheline boundary. */
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
/* to allow aligning buffer descriptors */
size = add_size(size, PG_CACHE_LINE_SIZE);
- /* size of data pages */
+ /* size of data pages, plus alignment padding */
+ size = add_size(size, PG_IO_ALIGN_SIZE);
size = add_size(size, mul_size(NBuffers, BLCKSZ));
/* size of stuff controlled by freelist.c */
bool use_wal;
BlockNumber nblocks;
BlockNumber blkno;
- PGAlignedBlock buf;
+ PGIOAlignedBlock buf;
BufferAccessStrategy bstrategy_src;
BufferAccessStrategy bstrategy_dst;
/* And don't overflow MaxAllocSize, either */
num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
- cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
- num_bufs * BLCKSZ);
+ /* Buffers should be I/O aligned. */
+ cur_block = (char *)
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ MemoryContextAlloc(LocalBufferContext,
+ num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE));
next_buf_in_block = 0;
num_bufs_in_block = num_bufs;
}
off_t curOffset; /* offset part of current pos */
int pos; /* next read/write position in buffer */
int nbytes; /* total # of valid bytes in buffer */
+
+ /*
+ * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid
+ * wasting per-file alignment padding when some users create many
+ * files.
+ */
PGAlignedBlock buffer;
};
* and second to avoid wasting space in processes that never call this.
*/
if (pageCopy == NULL)
- pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+ pageCopy = MemoryContextAllocAligned(TopMemoryContext,
+ BLCKSZ,
+ PG_IO_ALIGN_SIZE,
+ 0);
memcpy(pageCopy, (char *) page, BLCKSZ);
((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
int nbytes;
MdfdVec *v;
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum >= mdnblocks(reln, forknum));
int nbytes;
MdfdVec *v;
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
int nbytes;
MdfdVec *v;
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum < mdnblocks(reln, forknum));
*/
if (nblocks < ((BlockNumber) RELSEG_SIZE))
{
- char *zerobuf = palloc0(BLCKSZ);
+ char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
+ MCXT_ALLOC_ZERO);
mdextend(reln, forknum,
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
*/
while (blocknum > lts->nBlocksWritten)
{
- PGAlignedBlock zerobuf;
+ PGIOAlignedBlock zerobuf;
MemSet(zerobuf.data, 0, sizeof(zerobuf));
static void
scan_file(const char *fn, int segmentno)
{
- PGAlignedBlock buf;
+ PGIOAlignedBlock buf;
PageHeader header = (PageHeader) buf.data;
int f;
BlockNumber blockno;
local_queue_fetch_file(rewind_source *source, const char *path, size_t len)
{
const char *datadir = ((local_source *) source)->datadir;
- PGAlignedBlock buf;
+ PGIOAlignedBlock buf;
char srcpath[MAXPGPATH];
int srcfd;
size_t written_len;
size_t len)
{
const char *datadir = ((local_source *) source)->datadir;
- PGAlignedBlock buf;
+ PGIOAlignedBlock buf;
char srcpath[MAXPGPATH];
int srcfd;
off_t begin = off;
{
int src_fd;
int dst_fd;
- PGAlignedBlock buffer;
- PGAlignedBlock new_vmbuf;
+ PGIOAlignedBlock buffer;
+ PGIOAlignedBlock new_vmbuf;
ssize_t totalBytesRead = 0;
ssize_t src_filesize;
int rewriteVmBytesPerPage;
ssize_t
pg_pwrite_zeros(int fd, size_t size, off_t offset)
{
- static const PGAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */
- void *zerobuf_addr = unconstify(PGAlignedBlock *, &zbuffer)->data;
+ static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */
+ void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
struct iovec iov[PG_IOV_MAX];
size_t remaining_size = size;
ssize_t total_written = 0;
/*
* Use this, not "char buf[BLCKSZ]", to declare a field or local variable
- * holding a page buffer, if that page might be accessed as a page and not
- * just a string of bytes. Otherwise the variable might be under-aligned,
- * causing problems on alignment-picky hardware. (In some places, we use
- * this to declare buffers even though we only pass them to read() and
- * write(), because copying to/from aligned buffers is usually faster than
- * using unaligned buffers.) We include both "double" and "int64" in the
- * union to ensure that the compiler knows the value must be MAXALIGN'ed
- * (cf. configure's computation of MAXIMUM_ALIGNOF).
+ * holding a page buffer, if that page might be accessed as a page. Otherwise
+ * the variable might be under-aligned, causing problems on alignment-picky
+ * hardware. We include both "double" and "int64" in the union to ensure that
+ * the compiler knows the value must be MAXALIGN'ed (cf. configure's
+ * computation of MAXIMUM_ALIGNOF).
*/
typedef union PGAlignedBlock
{
int64 force_align_i64;
} PGAlignedBlock;
+/*
+ * Use this to declare a field or local variable holding a page buffer, if that
+ * page might be accessed as a page or passed to an SMgr I/O function. If
+ * allocating using the MemoryContext API, the aligned allocation functions
+ * should be used with PG_IO_ALIGN_SIZE. This alignment may be more efficient
+ * for I/O in general, but may be strictly required on some platforms when
+ * using direct I/O.
+ */
+typedef union PGIOAlignedBlock
+{
+#ifdef pg_attribute_aligned
+ pg_attribute_aligned(PG_IO_ALIGN_SIZE)
+#endif
+ char data[BLCKSZ];
+ double force_align_d;
+ int64 force_align_i64;
+} PGIOAlignedBlock;
+
/* Same, but for an XLOG_BLCKSZ-sized buffer */
typedef union PGAlignedXLogBlock
{
+#ifdef pg_attribute_aligned
+ pg_attribute_aligned(PG_IO_ALIGN_SIZE)
+#endif
char data[XLOG_BLCKSZ];
double force_align_d;
int64 force_align_i64;
*/
#define PG_CACHE_LINE_SIZE 128
+/*
+ * Assumed alignment requirement for direct I/O. 4K corresponds to common
+ * sector and memory page size.
+ */
+#define PG_IO_ALIGN_SIZE 4096
+
/*
*------------------------------------------------------------------------
* The following symbols are for enabling debugging code, not for
* to the appropriate Windows flag in src/port/open.c. We simulate it with
* fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name
* PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good
- * idea on a Unix).
+ * idea on a Unix). We can only use it if the compiler will correctly align
+ * PGIOAlignedBlock for us, though.
*/
-#if defined(O_DIRECT)
+#if defined(O_DIRECT) && defined(pg_attribute_aligned)
#define PG_O_DIRECT O_DIRECT
#elif defined(F_NOCACHE)
#define PG_O_DIRECT 0x80000000
PGFInfoFunction
PGFileType
PGFunction
+PGIOAlignedBlock
PGLZ_HistEntry
PGLZ_Strategy
PGLoadBalanceType