[prev in list] [next in list] [prev in thread] [next in thread] 

List:       postgresql-hackers
Subject:    Re: Cache relation sizes?
From:       Thomas Munro <thomas.munro () gmail ! com>
Date:       2020-07-31 21:56:14
Message-ID: CA+hUKGKEW7-9pq+s2_4Q-Fcpr9cc7_0b3pkno5qzPKC3y2nOPA () mail ! gmail ! com
[Download RAW message or body]

On Fri, Jul 31, 2020 at 2:36 PM Thomas Munro <thomas.munro@gmail.com> wrote:
> There's still the matter of crazy numbers of lseeks in regular
> backends; looking at all processes while running the above test, I get
> 1,469,060 (9.18 per pgbench transaction) without -M prepared, and
> 193,722 with -M prepared (1.21 per pgbench transaction).  Fixing that
> with this approach will require bullet-proof shared invalidation, but
> I think it's doable, in later work.

I couldn't help hacking on this a bit.  Perhaps instead of
bullet-proof general shared invalidation, we should have a way for
localised bits of code to state that they are ok with a "relaxed"
value.  Then they should explain the theory for why that is safe in
each case based on arguments about memory barrier pairs, but leave all
other callers making the system call so that we don't torpedo the
whole project by making it too hard.  For now, the main cases I'm
interested in are the ones that show up all the time as the dominant
system call in various workloads:

(1) Sequential scan relation-size probe.  This should be OK with a
relaxed value.  You can't miss the invalidation for a truncation,
because the read barrier in your lock acquisition on the relation
pairs with the write barrier in the exclusive lock release of any
session that truncated, and you can't miss any relation
extension that your snapshot can see, because the release of the
extension lock pairs with the lock involved in snapshot acquisition.

(2) Planner relation-size probe, which should be OK with a relaxed
value.  Similar arguments give you a fresh enough view, I think.

Or maybe there is a theory along these lines that already covers every
use of smgrnblocks(), so a separate mode isn't required... I don't
know!

The attached sketch-quality patch shows some of the required elements
working, though I ran out of steam trying to figure out how to thread
this thing through the right API layers so for now it always asks for
a relaxed value in table_block_relation_size().

["0001-WIP-Cache-smgrnblocks-in-more-cases.patch" (text/x-patch)]

From 0392adb59be052bb5d04a4b2e0d151cefdd810e4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 1 Aug 2020 09:30:41 +1200
Subject: [PATCH] WIP: Cache smgrnblocks() in more cases.

This is just early sketch code and may be completely wrong...

Discussion: https://postgr.es/m/CAEepm%3D3SSw-Ty1DFcK%3D1rU-K6GSzYzfdD4d%2BZwapdN7dTa6%3DnQ%40mail.gmail.com
                
---
 src/backend/access/gist/gistbuild.c       |  2 +-
 src/backend/access/heap/visibilitymap.c   |  6 +-
 src/backend/access/table/tableam.c        |  4 +-
 src/backend/access/transam/xlogutils.c    |  2 +-
 src/backend/catalog/storage.c             |  4 +-
 src/backend/storage/buffer/bufmgr.c       |  4 +-
 src/backend/storage/freespace/freespace.c |  6 +-
 src/backend/storage/ipc/ipci.c            |  2 +
 src/backend/storage/smgr/smgr.c           | 85 +++++++++++++++++++----
 src/include/storage/smgr.h                | 11 ++-
 10 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/src/backend/access/gist/gistbuild.c \
b/src/backend/access/gist/gistbuild.c index 671b5e9186..d5baa8d21a 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -495,7 +495,7 @@ gistBuildCallback(Relation index,
 	 */
 	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
 		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
-		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM, 0)) ||
 		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
 		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
 	{
diff --git a/src/backend/access/heap/visibilitymap.c \
b/src/backend/access/heap/visibilitymap.c index b1072183bc..242b4a183f 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -528,7 +528,7 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber \
nheapblocks)  else
 		newnblocks = truncBlock;
 
-	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
+	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0) <= newnblocks)
 	{
 		/* nothing to do, the file was already smaller than requested size */
 		return InvalidBlockNumber;
@@ -564,7 +564,7 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
 	if (rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber)
 	{
 		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
-			smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
+			smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0);
 		else
 			rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
 	}
@@ -647,7 +647,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 
 	/* Invalidate cache so that smgrnblocks() asks the kernel. */
 	rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
-	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
+	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM, 0);
 
 	/* Now extend the file */
 	while (vm_nblocks_now < vm_nblocks)
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 3afb63b1fe..e8ead8f019 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -628,10 +628,10 @@ table_block_relation_size(Relation rel, ForkNumber forkNumber)
 	if (forkNumber == InvalidForkNumber)
 	{
 		for (int i = 0; i < MAX_FORKNUM; i++)
-			nblocks += smgrnblocks(rel->rd_smgr, i);
+			nblocks += smgrnblocks(rel->rd_smgr, i, 0);
 	}
 	else
-		nblocks = smgrnblocks(rel->rd_smgr, forkNumber);
+		nblocks = smgrnblocks(rel->rd_smgr, forkNumber, SMGRNBLOCKS_RELAXED);
 
 	return nblocks * BLCKSZ;
 }
diff --git a/src/backend/access/transam/xlogutils.c \
b/src/backend/access/transam/xlogutils.c index b2ca0cd4cf..c7768a7595 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -461,7 +461,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	 */
 	smgrcreate(smgr, forknum, true);
 
-	lastblock = smgrnblocks(smgr, forknum);
+	lastblock = smgrnblocks(smgr, forknum, 0);
 
 	if (blkno < lastblock)
 	{
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 9e6e6c42d3..5e359ff5f2 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -434,7 +434,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	nblocks = smgrnblocks(src, forkNum);
+	nblocks = smgrnblocks(src, forkNum, 0);
 
 	for (blkno = 0; blkno < nblocks; blkno++)
 	{
@@ -721,7 +721,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
 			{
 				if (smgrexists(srel, fork))
 				{
-					BlockNumber n = smgrnblocks(srel, fork);
+					BlockNumber n = smgrnblocks(srel, fork, 0);
 
 					/* we shouldn't come here for unlogged relations */
 					Assert(fork != INIT_FORKNUM);
diff --git a/src/backend/storage/buffer/bufmgr.c \
b/src/backend/storage/buffer/bufmgr.c index f1ae6f9f84..35806e60ea 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -739,7 +739,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, \
ForkNumber forkNum,  
 	/* Substitute proper block number if caller asked for P_NEW */
 	if (isExtend)
-		blockNum = smgrnblocks(smgr, forkNum);
+		blockNum = smgrnblocks(smgr, forkNum, 0);
 
 	if (isLocalBuf)
 	{
@@ -2862,7 +2862,7 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber \
forkNum)  /* Open it at the smgr level if not already done */
 			RelationOpenSmgr(relation);
 
-			return smgrnblocks(relation->rd_smgr, forkNum);
+			return smgrnblocks(relation->rd_smgr, forkNum, 0);
 
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
diff --git a/src/backend/storage/freespace/freespace.c \
b/src/backend/storage/freespace/freespace.c index 6a96126b0c..e4d93b4794 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -317,7 +317,7 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
 	else
 	{
 		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
-		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
+		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0) <= new_nfsmblocks)
 			return InvalidBlockNumber;	/* nothing to do; the FSM was already
 										 * smaller */
 	}
@@ -547,7 +547,7 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
 		/* Invalidate the cache so smgrnblocks asks the kernel. */
 		rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
 		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
-			smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+			smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0);
 		else
 			rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0;
 	}
@@ -633,7 +633,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 
 	/* Invalidate cache so that smgrnblocks() asks the kernel. */
 	rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
-	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM, 0);
 
 	while (fsm_nblocks_now < fsm_nblocks)
 	{
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 96c2aaabbd..1182d3f0df 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, SpinlockSemaSize());
 		size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
 												 sizeof(ShmemIndexEnt)));
+		size = add_size(size, smgr_shmem_estimate_size());
 		size = add_size(size, dsm_estimate_size());
 		size = add_size(size, BufferShmemSize());
 		size = add_size(size, LockShmemSize());
@@ -220,6 +221,7 @@ CreateSharedMemoryAndSemaphores(void)
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
+	smgr_shmem_init();
 	InitBufferPool();
 
 	/*
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df0c7..218826e2b1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -22,10 +22,17 @@
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
+#include "storage/shmem.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
+/* Control object in shared memory. */
+typedef struct SMgrShared
+{
+	pg_atomic_uint64 nblocks_inval[1024];
+} SMgrShared;
+
 
 /*
  * This struct of function pointers defines the API between smgr.c and
@@ -98,6 +105,50 @@ static dlist_head unowned_relns;
 /* local function prototypes */
 static void smgrshutdown(int code, Datum arg);
 
+static SMgrShared *smgr_shared;
+
+size_t
+smgr_shmem_estimate_size(void)
+{
+	return sizeof(*smgr_shared);
+}
+
+void
+smgr_shmem_init(void)
+{
+	bool		found;
+
+	smgr_shared = ShmemInitStruct("SMgr", sizeof(*smgr_shared), &found);
+	if (!found)
+	{
+		for (size_t i = 0; i < lengthof(smgr_shared->nblocks_inval); ++i)
+			pg_atomic_init_u64(&smgr_shared->nblocks_inval[i], 0);
+	}
+}
+
+/*
+ * Read the invalidation counter that a given relation maps to.
+ */
+static uint64
+smgrnblocks_get_inval(SMgrRelation reln)
+{
+	size_t		slot;
+
+	slot = reln->smgr_rnode.node.relNode % lengthof(smgr_shared->nblocks_inval);
+	return pg_atomic_read_u64(&smgr_shared->nblocks_inval[slot]);
+}
+
+/*
+ * Increment the invalidation counter that a given relation maps to.
+ */
+static uint64
+smgrnblocks_inc_inval(SMgrRelation reln)
+{
+	size_t		slot;
+
+	slot = reln->smgr_rnode.node.relNode % lengthof(smgr_shared->nblocks_inval);
+	return pg_atomic_fetch_add_u64(&smgr_shared->nblocks_inval[slot], 1) + 1;
+}
 
 /*
  *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -466,15 +517,8 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber \
blocknum,  smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
 										 buffer, skipFsync);
 
-	/*
-	 * Normally we expect this to increase nblocks by one, but if the cached
-	 * value isn't as expected, just invalidate it so the next call asks the
-	 * kernel.
-	 */
-	if (reln->smgr_cached_nblocks[forknum] == blocknum)
-		reln->smgr_cached_nblocks[forknum] = blocknum + 1;
-	else
-		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+	reln->smgr_cached_nblocks_inval[forknum] = smgrnblocks_inc_inval(reln);
+	reln->smgr_cached_nblocks[forknum] = blocknum + 1;
 }
 
 /*
@@ -543,23 +587,35 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, \
BlockNumber blocknum,  
 /*
  *	smgrnblocks() -- Calculate the number of blocks in the
- *					 supplied relation.
+ *					 supplied relation, optionally returning a relaxed value
+ *					 that is already out of date, but not older than the most
+ *					 recent read barrier that pairs with the most recent
+ * 					 write barrier in any process that earlier changed the
+ *					 true value.
  */
 BlockNumber
-smgrnblocks(SMgrRelation reln, ForkNumber forknum)
+smgrnblocks(SMgrRelation reln, ForkNumber forknum, int flags)
 {
 	BlockNumber result;
+	uint64		inval;
 
 	/*
-	 * For now, we only use cached values in recovery due to lack of a shared
-	 * invalidation mechanism for changes in file size.
+	 * We can use cached values in recovery since no other process can change
+	 * the size of a relation.  Otherwise, we only use a cached value if the
+	 * caller said that a relaxed value is OK, and the shared invalidation
+	 * counter hasn't moved.
 	 */
-	if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+	inval = smgrnblocks_get_inval(reln);
+	if ((InRecovery ||
+		 ((flags & SMGRNBLOCKS_RELAXED) &&
+		  inval == reln->smgr_cached_nblocks_inval[forknum])) &&
+		reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
 		return reln->smgr_cached_nblocks[forknum];
 
 	result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
 
 	reln->smgr_cached_nblocks[forknum] = result;
+	reln->smgr_cached_nblocks_inval[forknum] = inval;
 
 	return result;
 }
@@ -614,6 +670,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, \
                BlockNumber *nb
 		 */
 		reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
 	}
+	smgrnblocks_inc_inval(reln);
 }
 
 /*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f28a842401..af8a5cb874 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -14,10 +14,14 @@
 #ifndef SMGR_H
 #define SMGR_H
 
+#include "port/atomics.h"
 #include "lib/ilist.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
+/* Flags for smgrnblocks(). */
+#define SMGRNBLOCKS_RELAXED 0x01
+
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
  * cached file handles.  An SMgrRelation is created (if not already present)
@@ -52,6 +56,7 @@ typedef struct SMgrRelationData
 	 */
 	BlockNumber smgr_targblock; /* current insertion target block */
 	BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1];	/* last known size */
+	uint64		smgr_cached_nblocks_inval[MAX_FORKNUM + 1];
 
 	/* additional public fields may someday exist here */
 
@@ -77,6 +82,9 @@ typedef SMgrRelationData *SMgrRelation;
 #define SmgrIsTemp(smgr) \
 	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
 
+extern size_t smgr_shmem_estimate_size(void);
+extern void smgr_shmem_init(void);
+
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
@@ -98,7 +106,8 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 					  BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum,
+							   int flags);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 						 int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-- 
2.20.1



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic