[prev in list] [next in list] [prev in thread] [next in thread] 

List:       postgresql-hackers
Subject:    [PATCH] lazy relations delete
From:       Maxim Orlov <m.orlov () postgrespro ! ru>
Date:       2019-12-31 10:16:49
Message-ID: 8023beea-ffdc-10dd-f390-6f27a21ae055 () postgrespro ! ru
[Download RAW message or body]

Hi!

Here is the case.

Assume we have a master to slave replication with shared_buffers set up 
to 2 GB at the master and 4 GB at the slave. All of the data is written 
to the master, while reading occurs from slave.

Now we decided to drop many tables, let's say 1000 or 10000 not in a 
single transaction, but each table in a separate one. So, due to "plain" 
shared_buffers memory we have to do for loop for every relation which 
leads to lag between master and slave.

In real case scenario such issue lead to not a minutes lag, but hours 
lag. At the same time PostgreSQL have a great routine to delete many 
relations in a single transaction.

So, to get rid of this kind of issue here came up an idea: what if not 
to delete everyone of relations right away and just store them in an 
array, prevent shared buffers (correspond to a deleted relations) from 
been flushed. And then array reaches it max size we need to walk all 
buffers only once to "free" shared buffers correspond to a deleted 
relations.

Here some values from the test which I am made.

Without patch:

1.

(master 2 GB) - drop 1000 tables took 6 sec

(slave 4 GB) - drop 1000 tables took 8 sec

2.

(master 4 GB) - drop 1000 tables took 10 sec

(slave 8 GB) - drop 1000 tables took 16 sec

3.

(master 10 GB) - drop 1000 tables took 22 sec

(slave 20 GB) - drop 1000 tables took 38 sec


With patch:

1.

(master 2 GB) - drop 1000 tables took 2 sec

(slave 4 GB) - drop 1000 tables took 2 sec

2.

(master 4 GB) - drop 1000 tables took 3 sec

(slave 8 GB) - drop 1000 tables took 3 sec

3.

(master 10 GB) - drop 1000 tables took 4 sec

(slave 20 GB) - drop 1000 tables took 4 sec

-- 
Max Orlov
E-mail: m.orlov@postgrespro.ru


["init.sh" (application/x-shellscript)]

#!/bin/sh

ROOT=`pwd`

# Install some usable contribs
CONTRIBDIR="$(dirname "$ROOT")"/build/contrib
cd $CONTRIBDIR/pg_prewarm/
make -s
make -s install
cd $ROOT

cd $CONTRIBDIR/pg_buffercache/
make -s
make -s install
cd $ROOT

# Set up env
BINDIR=$ROOT/bin
DATADIR="$ROOT/data"
DATADIR_PRIMARY=$DATADIR/primary
DATADIR_SECONDARY=$DATADIR/secondary
LOGSDIR="$DATADIR/logs"

mkdir -p $DATADIR
mkdir -p $LOGSDIR
mkdir -p $DATADIR_PRIMARY
mkdir -p $DATADIR_SECONDARY

chmod 700 -R $DATADIR_SECONDARY

LOGFILE=$LOGSDIR/init-db.log

PORT=55432
SLOT="slot01"

# build primary (master)
$BINDIR/pg_ctl -D$DATADIR_PRIMARY init
$BINDIR/pg_ctl -D$DATADIR_PRIMARY -l$LOGFILE -o"--port=$PORT" start
$BINDIR/psql -p$PORT -dpostgres <<EOF
\t on
select pg_create_physical_replication_slot('$SLOT');
create extension pg_prewarm;
create extension pg_buffercache;
EOF

# build secondary (replica)
$BINDIR/pg_basebackup -p$PORT -D$DATADIR_SECONDARY -S$SLOT -Xstream -vP

cat <<-CONF >> $DATADIR_SECONDARY/postgresql.conf
primary_conninfo = 'port=$PORT'
recovery_target_timeline = 'latest'
primary_slot_name = '$SLOT'
checkpoint_timeout = 5min
max_wal_size = 1GB
min_wal_size = 100MB
hot_standby_feedback = on
CONF

touch $DATADIR_SECONDARY/standby.signal

# all done, stop primary
$BINDIR/pg_ctl -D$DATADIR_PRIMARY stop

["run-001.sh" (application/x-shellscript)]

#!/bin/sh

ROOT=`pwd`
BINDIR=$ROOT/bin

DATADIR="$ROOT/data"
DATA_PRIMARY=$DATADIR/primary
DATA_SECONDARY=$DATADIR/secondary

LOGSDIR="$DATADIR/logs"
LOGS_PREFIX=`date +"%F_%T"`
LOG_PRIMARY=$LOGSDIR/$LOGS_PREFIX.primary.log
LOG_SECONDARY=$LOGSDIR/$LOGS_PREFIX.secondary.log

PORT_PRIMARY=55432
PORT_SECONDARY=55433
SHBUF_PRIMARY_SIZE="10 GB"
SHBUF_SECONDARY_SIZE="20 GB"

# Run the servers
$BINDIR/pg_ctl -D$DATA_PRIMARY   -l"$LOG_PRIMARY" \
	-o"--port=$PORT_PRIMARY   --shared_buffers='$SHBUF_PRIMARY_SIZE'" start
$BINDIR/pg_ctl -D$DATA_SECONDARY -l"$LOG_SECONDARY" \
	-o"--port=$PORT_SECONDARY --shared_buffers='$SHBUF_SECONDARY_SIZE'" start

# Const
PG_CODE='$$'

# Create procedures
$BINDIR/psql -dpostgres -p$PORT_PRIMARY <<EOF
---
--- Procedure f_watch: watch for moments of deleting/adding any tables in
--- public scheme, except for the the t_filler.
---
--- This procedure watch for the two moments:
--- 1. When the number of tables has changed for the first time (since run).
--- 2. When the number of tables becomes equal to zero.
---
--- The output is provided by "rise info".
---
create or replace procedure f_watch(node text, notice_delay int = 5)
language plpgsql
as
$PG_CODE
declare
  n bigint;
  i bigint;
  start timestamptz;
  finish timestamptz;
  s text;
begin
  select count(*) into n
  from pg_tables 
  where schemaname = 'public' and tablename <> 't_filler';

  loop
    select count(*) into i
    from pg_tables
    where schemaname = 'public' and tablename <> 't_filler';

    if i <> n then
      start := clock_timestamp();
      exit;
    end if;
  end loop;

  loop
    select count(*) into i
    from pg_tables
    where schemaname = 'public' and tablename <> 't_filler';

    if i = 0 then
      finish := clock_timestamp();
      exit;
    end if;

    perform pg_sleep(1);
  end loop;

  perform pg_sleep(notice_delay);

  select replace(replace(format('%-48s', node||'*'), ' ', '-'), '*', ' ') 
  into s;

  raise info ' % ', s;
  raise info '  Start delete at : %', start;
  raise info ' Finish delete at : %', finish;
  raise info '          It took : %', (finish - start);

  select repeat('-', 48)
  into s;
  raise info ' % ', s;
end $PG_CODE ;
EOF

echo " *** STEP 1 *** "
./run-001-core.sh
echo " *** STEP 2 *** "
./run-001-core.sh
echo " *** STEP 3 *** "
./run-001-core.sh
echo " *** STEP 4 *** "
./run-001-core.sh
echo " *** STEP 5 *** "
./run-001-core.sh

# Stop the servers
$BINDIR/pg_ctl -D$DATA_PRIMARY stop
$BINDIR/pg_ctl -D$DATA_SECONDARY stop

["run-001-core.sh" (application/x-shellscript)]

#!/bin/sh

ROOT=`pwd`
BINDIR=$ROOT/bin

PORT_PRIMARY=55432
PORT_SECONDARY=55433

# Const
N_TABLES=1000
N_TABLE_ROWS=100

# Create tables
CREATE_SQL=""

for i in `seq 1 $N_TABLES`
do
  CREATE_SQL="${CREATE_SQL} 
  create table if not exists t_$i as
    select g, md5(g::text)
    from generate_series(1, $N_TABLE_ROWS) g;\n"
done

echo $CREATE_SQL > create.sql.tmp
echo -n "Creating tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -fcreate.sql.tmp
rm -f create.sql.tmp
echo "DONE"

# Warm up the servers
WARMUP_SQL=""

for i in `seq 1 $N_TABLES`
do
  WARMUP_SQL="${WARMUP_SQL} 
  select pg_prewarm('t_$i');"
done

echo $WARMUP_SQL > warmup.sql.tmp
echo -n "Warm up tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY   -q -fwarmup.sql.tmp > /dev/null
$BINDIR/psql -dpostgres -p$PORT_SECONDARY -q -fwarmup.sql.tmp > /dev/null
rm -f warmup.sql.tmp
echo "DONE"

# Watch for tables
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -c"
call f_watch('PRIMARY', 3);
" &

$BINDIR/psql -dpostgres -p$PORT_SECONDARY --quiet --tuples-only -c"
call f_watch('SECONDARY', 6);
" &

# Delete tables
DELETE_SQL=""

for i in `seq 1 $N_TABLES`
do
  DELETE_SQL="${DELETE_SQL}
  drop table t_"$i";"
done

echo $DELETE_SQL > drop.sql.tmp
echo -n "Begin dropping tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -fdrop.sql.tmp
echo "DONE"
rm -f drop.sql.tmp

# Wait for jobs to be done
wait $(jobs -p)

["lazy-relations-delete.patch" (text/x-patch)]

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1f10a97dc78..e0d1fb6a824 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -66,8 +66,6 @@
 #define BUF_WRITTEN				0x01
 #define BUF_REUSABLE			0x02
 
-#define DROP_RELS_BSEARCH_THRESHOLD		20
-
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
@@ -410,6 +408,72 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
 	}
 }
 
+/*
+ * Lazy relations delete mechanism.
+ *
+ * On relation drop no need to walk shared buffers every time, just put a deleted
+ * RelFileNode into an array. Thus we store RelFileNodes in RelNodeDeleted
+ * struct and delete them later when number of deleted relations will
+ * exceed LAZY_DELETE_ARRAY_SIZE.
+ */
+
+#define	LAZY_DELETE_ARRAY_SIZE 				128
+
+/* Wrapper for array of lazy deleted RelFileNodes. */
+typedef struct RelNodeDeletedBuffer
+{
+	RelFileNode 	rnodes[LAZY_DELETE_ARRAY_SIZE];	/* Array of deleted */
+	int 			idx;							/* Current position */
+} RelNodeDeletedBuffer;
+
+static RelNodeDeletedBuffer *RelNodeDeleted = NULL;
+
+/*
+ * Initialize shared array of lazy deleted relations.
+ *
+ * This is called once during shared-memory initialization (either in the
+ * postmaster, or in a standalone backend).
+ */
+void InitRelNodeDeletedBuffer(void)
+{
+	bool 	found;
+
+	RelNodeDeleted = ShmemInitStruct("Lazy Deleted Relations",
+								sizeof(RelNodeDeletedBuffer),
+								&found);
+
+	if (!found)
+		MemSet(RelNodeDeleted, 0, sizeof(RelNodeDeletedBuffer));
+}
+
+/*
+ * Compute the size of shared memory for the buffer of lazy deleted relations.
+ */
+Size
+RelNodeDeletedBufferSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, sizeof(RelNodeDeletedBuffer));
+
+	return size;
+}
+
+/*
+ * Check for relation is in lazy deleted buffer (up to n-th position).
+ */
+static inline bool
+IsRelFileNodeDeleted(RelFileNode rnode, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		if (RelFileNodeEquals(rnode, RelNodeDeleted->rnodes[i]))
+			return true;
+
+	return false;
+}
+
 /*
  * BufferIsPinned
  *		True iff the buffer is pinned (also checks for valid buffer number).
@@ -452,6 +516,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
 							   BlockNumber blockNum,
 							   BufferAccessStrategy strategy,
 							   bool *foundPtr);
+static void InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
@@ -2690,6 +2755,22 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	char	   *bufToWrite;
 	uint32		buf_state;
 
+	LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+	/*
+	 * If rnode is in lazy deleted buffer clear dirty and checkpoint flags.
+	 */
+	if (IsRelFileNodeDeleted(buf->tag.rnode, RelNodeDeleted->idx))
+	{
+		buf_state = LockBufHdr(buf);
+		buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+		UnlockBufHdr(buf, buf_state);
+		LWLockRelease(LazyRelDeleteLock);
+		return;
+	}
+
+	LWLockRelease(LazyRelDeleteLock);
+
 	/*
 	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
 	 * false, then someone else flushed the buffer before we could, so we need
@@ -2993,6 +3074,43 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 	}
 }
 
+/*
+ * Invalidate all of the nodes buffers.
+ */
+void
+InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes)
+{
+	int 		i;
+
+	pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		RelFileNode *rnode = NULL;
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		/*
+		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+		 * and saves some cycles.
+		 */
+
+		rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+						nodes, nnodes, sizeof(RelFileNode),
+						rnode_comparator);
+
+		/* buffer doesn't belong to any of the given relfilenodes; skip it */
+		if (rnode == NULL)
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
+			InvalidateBuffer(bufHdr);	/* releases spinlock */
+		else
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelFileNodesAllBuffers
  *
@@ -3006,25 +3124,27 @@ void
 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 {
 	int			i,
+				j,
 				n = 0;
 	RelFileNode *nodes;
-	bool		use_bsearch;
 
 	if (nnodes == 0)
 		return;
 
 	nodes = palloc(sizeof(RelFileNode) * nnodes);	/* non-local relations */
 
-	/* If it's a local relation, it's localbuf.c's problem. */
 	for (i = 0; i < nnodes; i++)
 	{
+		/* If it's a local relation, it's localbuf.c's problem. */
 		if (RelFileNodeBackendIsTemp(rnodes[i]))
 		{
 			if (rnodes[i].backend == MyBackendId)
 				DropRelFileNodeAllLocalBuffers(rnodes[i].node);
 		}
 		else
+		{
 			nodes[n++] = rnodes[i].node;
+		}
 	}
 
 	/*
@@ -3037,58 +3157,41 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 		return;
 	}
 
-	/*
-	 * For low number of relations to drop just use a simple walk through, to
-	 * save the bsearch overhead. The threshold to use is rather a guess than
-	 * an exactly determined value, as it depends on many factors (CPU and RAM
-	 * speeds, amount of shared buffers etc.).
-	 */
-	use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+	if (n > LAZY_DELETE_ARRAY_SIZE)
+	{
+		InvalidateRelFileNodesBuffers(nodes, n);
+	}
+	else
+	{
+		int		idx;
 
-	/* sort the list of rnodes if necessary */
-	if (use_bsearch)
-		pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+		LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+		idx = RelNodeDeleted->idx;
+		Assert(idx < LAZY_DELETE_ARRAY_SIZE);
 
-	for (i = 0; i < NBuffers; i++)
-	{
-		RelFileNode *rnode = NULL;
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		/* Copy deleted relations into lazy deleted array, but watch the size ... */
+		for (i = idx, j = 0; i < LAZY_DELETE_ARRAY_SIZE && j < n; i++, j++)
+			RelNodeDeleted->rnodes[i] = rnodes[j].node;
 
-		/*
-		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
-		 * and saves some cycles.
-		 */
+		idx += j;
 
-		if (!use_bsearch)
+		if (idx == LAZY_DELETE_ARRAY_SIZE)
 		{
-			int			j;
+			InvalidateRelFileNodesBuffers(RelNodeDeleted->rnodes, idx);
 
-			for (j = 0; j < n; j++)
-			{
-				if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
-				{
-					rnode = &nodes[j];
-					break;
-				}
-			}
+			/* ... and сopy rest of deleted relations. */
+			for (i = 0; j < n; i++, j++)
+				RelNodeDeleted->rnodes[i] = rnodes[j].node;
+
+			RelNodeDeleted->idx = i;
 		}
 		else
 		{
-			rnode = bsearch((const void *) &(bufHdr->tag.rnode),
-							nodes, n, sizeof(RelFileNode),
-							rnode_comparator);
+			RelNodeDeleted->idx = idx;
 		}
 
-		/* buffer doesn't belong to any of the given relfilenodes; skip it */
-		if (rnode == NULL)
-			continue;
-
-		buf_state = LockBufHdr(bufHdr);
-		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
-			InvalidateBuffer(bufHdr);	/* releases spinlock */
-		else
-			UnlockBufHdr(bufHdr, buf_state);
+		Assert(RelNodeDeleted->idx < LAZY_DELETE_ARRAY_SIZE);
+		LWLockRelease(LazyRelDeleteLock);
 	}
 
 	pfree(nodes);
@@ -3133,6 +3236,22 @@ DropDatabaseBuffers(Oid dbid)
 		else
 			UnlockBufHdr(bufHdr, buf_state);
 	}
+
+	/*
+	 * Remove all relations of dbid from lazy deleted array.
+	 */
+	LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+	{
+		int		idx = RelNodeDeleted->idx;
+
+		for (i = 0; i < idx; i++)
+		{
+			if (RelNodeDeleted->rnodes[i].dbNode == dbid)
+				RelNodeDeleted->rnodes[i] = RelNodeDeleted->rnodes[--idx];
+		}
+		RelNodeDeleted->idx = idx;
+	}
+	LWLockRelease(LazyRelDeleteLock);
 }
 
 /* -----------------------------------------------------------------
@@ -3330,6 +3449,15 @@ FlushDatabaseBuffers(Oid dbid)
 		if (bufHdr->tag.rnode.dbNode != dbid)
 			continue;
 
+		LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+		if (IsRelFileNodeDeleted(bufHdr->tag.rnode, RelNodeDeleted->idx))
+		{
+			LWLockRelease(LazyRelDeleteLock);
+			continue;
+		}
+
+		LWLockRelease(LazyRelDeleteLock);
 		ReservePrivateRefCountEntry();
 
 		buf_state = LockBufHdr(bufHdr);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 4829953ee64..fdfdb78dff8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
 												 sizeof(ShmemIndexEnt)));
 		size = add_size(size, BufferShmemSize());
+		size = add_size(size, RelNodeDeletedBufferSize());
 		size = add_size(size, LockShmemSize());
 		size = add_size(size, PredicateLockShmemSize());
 		size = add_size(size, ProcGlobalShmemSize());
@@ -217,6 +218,7 @@ CreateSharedMemoryAndSemaphores(void)
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
 	InitBufferPool();
+	InitRelNodeDeletedBuffer();
 
 	/*
 	 * Set up lock manager
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index db478432291..a35e99190ff 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -49,3 +49,4 @@ MultiXactTruncationLock				41
 OldSnapshotTimeMapLock				42
 LogicalRepWorkerLock				43
 CLogTruncationLock					44
+LazyRelDeleteLock					45
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 17b97f7e38f..9cb51bc5f0d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -179,6 +179,7 @@ extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
 
 extern void InitBufferPool(void);
+extern void InitRelNodeDeletedBuffer(void);
 extern void InitBufferPoolAccess(void);
 extern void InitBufferPoolBackend(void);
 extern void AtEOXact_Buffers(bool isCommit);
@@ -205,6 +206,7 @@ extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
 extern void PrintPinnedBufs(void);
 #endif
 extern Size BufferShmemSize(void);
+extern Size RelNodeDeletedBufferSize(void);
 extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
 						 ForkNumber *forknum, BlockNumber *blknum);
 


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic