return returnCode;
}
+/*
+ * Zero a region of the file.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+ int returnCode;
+ ssize_t written;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
+ pgstat_report_wait_end();
+
+ if (written < 0)
+ return -1;
+ else if (written != amount)
+ {
+ /* if errno is unset, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc() implements posix_fallocate() in userspace if not
+ * implemented by the filesystem. That's not the case for all environments
+ * though.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#ifdef HAVE_POSIX_FALLOCATE
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return -1;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
+ pgstat_report_wait_end();
+
+ if (returnCode == 0)
+ return 0;
+
+ /* for compatibility with %m printing etc */
+ errno = returnCode;
+
+ /*
+ * Return in cases of a "real" failure, if fallocate is not supported,
+ * fall through to the FileZero() backed implementation.
+ */
+ if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+ return -1;
+#endif
+
+ return FileZero(file, offset, amount, wait_event_info);
+}
+
off_t
FileSize(File file)
{
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
+/*
+ * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
+ *
+ * Similar to mdextend(), except the relation can be extended by multiple
+ * blocks at once and the added blocks will be filled with zeroes.
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+ MdfdVec *v;
+ BlockNumber curblocknum = blocknum;
+ int remblocks = nblocks;
+
+ Assert(nblocks > 0);
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber or larger.
+ */
+ if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u blocks",
+ relpath(reln->smgr_rlocator, forknum),
+ InvalidBlockNumber)));
+
+ while (remblocks > 0)
+ {
+ BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+ off_t seekpos = (off_t) BLCKSZ * segstartblock;
+ int numblocks;
+
+ if (segstartblock + remblocks > RELSEG_SIZE)
+ numblocks = RELSEG_SIZE - segstartblock;
+ else
+ numblocks = remblocks;
+
+ v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+ Assert(segstartblock < RELSEG_SIZE);
+ Assert(segstartblock + numblocks <= RELSEG_SIZE);
+
+ /*
+ * If available and useful, use posix_fallocate() (via FileAllocate())
+ * to extend the relation. That's often more efficient than using
+ * write(), as it commonly won't cause the kernel to allocate page
+ * cache space for the extended pages.
+ *
+ * However, we don't use FileAllocate() for small extensions, as it
+ * defeats delayed allocation on some filesystems. Not clear where
+ * that decision should be made though? For now just use a cutoff of
+ * 8, anything between 4 and 8 worked OK in some local testing.
+ */
+ if (numblocks > 8)
+ {
+ int ret;
+
+ ret = FileFallocate(v->mdfd_vfd,
+ seekpos, (off_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret != 0)
+ {
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file \"%s\" with FileFallocate(): %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space."));
+ }
+ }
+ else
+ {
+ int ret;
+
+ /*
+ * Even if we don't want to use fallocate, we can still extend a
+ * bit more efficiently than writing each 8kB block individually.
+ * pg_pwrite_zeroes() (via FileZero()) uses
+ * pg_pwritev_with_retry() to avoid multiple writes or needing a
+ * zeroed buffer for the whole length of the extension.
+ */
+ ret = FileZero(v->mdfd_vfd,
+ seekpos, (off_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret < 0)
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space."));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+ remblocks -= numblocks;
+ curblocknum += numblocks;
+ }
+}
+
/*
* mdopenfork() -- Open one fork of the specified relation.
*
bool isRedo);
void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
.smgr_exists = mdexists,
.smgr_unlink = mdunlink,
.smgr_extend = mdextend,
+ .smgr_zeroextend = mdzeroextend,
.smgr_prefetch = mdprefetch,
.smgr_read = mdread,
.smgr_write = mdwrite,
reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
}
+/*
+ * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ *
+ * Similar to smgrextend(), except the relation can be extended by
+ * multiple blocks at once and the added blocks will be filled with
+ * zeroes.
+ */
+void
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+ nblocks, skipFsync);
+
+ /*
+ * Normally we expect this to increase the fork size by nblocks, but if
+ * the cached value isn't as expected, just invalidate it so the next call
+ * asks the kernel.
+ */
+ if (reln->smgr_cached_nblocks[forknum] == blocknum)
+ reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+ else
+ reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
+
/*
* smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
*
extern int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
extern int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
extern int FileSync(File file, uint32 wait_event_info);
+extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info);
+extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info);
+
extern off_t FileSize(File file);
extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void smgrread(SMgrRelation reln, ForkNumber forknum,