[prev in list] [next in list] [prev in thread] [next in thread]
List: postgresql-hackers
Subject: [PATCH] lazy relations delete
From: Maxim Orlov <m.orlov () postgrespro ! ru>
Date: 2019-12-31 10:16:49
Message-ID: 8023beea-ffdc-10dd-f390-6f27a21ae055 () postgrespro ! ru
[Download RAW message or body]
Hi!
Here is the case.
Assume we have a master to slave replication with shared_buffers set up
to 2 GB at the master and 4 GB at the slave. All of the data is written
to the master, while reading occurs from slave.
Now we decided to drop many tables, let's say 1000 or 10000 not in a
single transaction, but each table in a separate one. So, due to "plain"
shared_buffers memory we have to do for loop for every relation which
leads to lag between master and slave.
In real case scenario such issue lead to not a minutes lag, but hours
lag. At the same time PostgreSQL have a great routine to delete many
relations in a single transaction.
So, to get rid of this kind of issue here came up an idea: what if not
to delete everyone of relations right away and just store them in an
array, prevent shared buffers (correspond to a deleted relations) from
been flushed. And then array reaches it max size we need to walk all
buffers only once to "free" shared buffers correspond to a deleted
relations.
Here some values from the test which I am made.
Without patch:
1.
(master 2 GB) - drop 1000 tables took 6 sec
(slave 4 GB) - drop 1000 tables took 8 sec
2.
(master 4 GB) - drop 1000 tables took 10 sec
(slave 8 GB) - drop 1000 tables took 16 sec
3.
(master 10 GB) - drop 1000 tables took 22 sec
(slave 20 GB) - drop 1000 tables took 38 sec
With patch:
1.
(master 2 GB) - drop 1000 tables took 2 sec
(slave 4 GB) - drop 1000 tables took 2 sec
2.
(master 4 GB) - drop 1000 tables took 3 sec
(slave 8 GB) - drop 1000 tables took 3 sec
3.
(master 10 GB) - drop 1000 tables took 4 sec
(slave 20 GB) - drop 1000 tables took 4 sec
--
Max Orlov
E-mail: m.orlov@postgrespro.ru
["init.sh" (application/x-shellscript)]
#!/bin/sh
ROOT=`pwd`
# Install some usable contribs
CONTRIBDIR="$(dirname "$ROOT")"/build/contrib
cd $CONTRIBDIR/pg_prewarm/
make -s
make -s install
cd $ROOT
cd $CONTRIBDIR/pg_buffercache/
make -s
make -s install
cd $ROOT
# Set up env
BINDIR=$ROOT/bin
DATADIR="$ROOT/data"
DATADIR_PRIMARY=$DATADIR/primary
DATADIR_SECONDARY=$DATADIR/secondary
LOGSDIR="$DATADIR/logs"
mkdir -p $DATADIR
mkdir -p $LOGSDIR
mkdir -p $DATADIR_PRIMARY
mkdir -p $DATADIR_SECONDARY
chmod 700 -R $DATADIR_SECONDARY
LOGFILE=$LOGSDIR/init-db.log
PORT=55432
SLOT="slot01"
# build primary (master)
$BINDIR/pg_ctl -D$DATADIR_PRIMARY init
$BINDIR/pg_ctl -D$DATADIR_PRIMARY -l$LOGFILE -o"--port=$PORT" start
$BINDIR/psql -p$PORT -dpostgres <<EOF
\t on
select pg_create_physical_replication_slot('$SLOT');
create extension pg_prewarm;
create extension pg_buffercache;
EOF
# build secondary (replica)
$BINDIR/pg_basebackup -p$PORT -D$DATADIR_SECONDARY -S$SLOT -Xstream -vP
cat <<-CONF >> $DATADIR_SECONDARY/postgresql.conf
primary_conninfo = 'port=$PORT'
recovery_target_timeline = 'latest'
primary_slot_name = '$SLOT'
checkpoint_timeout = 5min
max_wal_size = 1GB
min_wal_size = 100MB
hot_standby_feedback = on
CONF
touch $DATADIR_SECONDARY/standby.signal
# all done, stop primary
$BINDIR/pg_ctl -D$DATADIR_PRIMARY stop
["run-001.sh" (application/x-shellscript)]
#!/bin/sh
ROOT=`pwd`
BINDIR=$ROOT/bin
DATADIR="$ROOT/data"
DATA_PRIMARY=$DATADIR/primary
DATA_SECONDARY=$DATADIR/secondary
LOGSDIR="$DATADIR/logs"
LOGS_PREFIX=`date +"%F_%T"`
LOG_PRIMARY=$LOGSDIR/$LOGS_PREFIX.primary.log
LOG_SECONDARY=$LOGSDIR/$LOGS_PREFIX.secondary.log
PORT_PRIMARY=55432
PORT_SECONDARY=55433
SHBUF_PRIMARY_SIZE="10 GB"
SHBUF_SECONDARY_SIZE="20 GB"
# Run the servers
$BINDIR/pg_ctl -D$DATA_PRIMARY -l"$LOG_PRIMARY" \
-o"--port=$PORT_PRIMARY --shared_buffers='$SHBUF_PRIMARY_SIZE'" start
$BINDIR/pg_ctl -D$DATA_SECONDARY -l"$LOG_SECONDARY" \
-o"--port=$PORT_SECONDARY --shared_buffers='$SHBUF_SECONDARY_SIZE'" start
# Const
PG_CODE='$$'
# Create procedures
$BINDIR/psql -dpostgres -p$PORT_PRIMARY <<EOF
---
--- Procedure f_watch: watch for moments of deleting/adding any tables in
--- public scheme, except for the the t_filler.
---
--- This procedure watch for the two moments:
--- 1. When the number of tables has changed for the first time (since run).
--- 2. When the number of tables becomes equal to zero.
---
--- The output is provided by "rise info".
---
create or replace procedure f_watch(node text, notice_delay int = 5)
language plpgsql
as
$PG_CODE
declare
n bigint;
i bigint;
start timestamptz;
finish timestamptz;
s text;
begin
select count(*) into n
from pg_tables
where schemaname = 'public' and tablename <> 't_filler';
loop
select count(*) into i
from pg_tables
where schemaname = 'public' and tablename <> 't_filler';
if i <> n then
start := clock_timestamp();
exit;
end if;
end loop;
loop
select count(*) into i
from pg_tables
where schemaname = 'public' and tablename <> 't_filler';
if i = 0 then
finish := clock_timestamp();
exit;
end if;
perform pg_sleep(1);
end loop;
perform pg_sleep(notice_delay);
select replace(replace(format('%-48s', node||'*'), ' ', '-'), '*', ' ')
into s;
raise info ' % ', s;
raise info ' Start delete at : %', start;
raise info ' Finish delete at : %', finish;
raise info ' It took : %', (finish - start);
select repeat('-', 48)
into s;
raise info ' % ', s;
end $PG_CODE ;
EOF
echo " *** STEP 1 *** "
./run-001-core.sh
echo " *** STEP 2 *** "
./run-001-core.sh
echo " *** STEP 3 *** "
./run-001-core.sh
echo " *** STEP 4 *** "
./run-001-core.sh
echo " *** STEP 5 *** "
./run-001-core.sh
# Stop the servers
$BINDIR/pg_ctl -D$DATA_PRIMARY stop
$BINDIR/pg_ctl -D$DATA_SECONDARY stop
["run-001-core.sh" (application/x-shellscript)]
#!/bin/sh
ROOT=`pwd`
BINDIR=$ROOT/bin
PORT_PRIMARY=55432
PORT_SECONDARY=55433
# Const
N_TABLES=1000
N_TABLE_ROWS=100
# Create tables
CREATE_SQL=""
for i in `seq 1 $N_TABLES`
do
CREATE_SQL="${CREATE_SQL}
create table if not exists t_$i as
select g, md5(g::text)
from generate_series(1, $N_TABLE_ROWS) g;\n"
done
echo $CREATE_SQL > create.sql.tmp
echo -n "Creating tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -fcreate.sql.tmp
rm -f create.sql.tmp
echo "DONE"
# Warm up the servers
WARMUP_SQL=""
for i in `seq 1 $N_TABLES`
do
WARMUP_SQL="${WARMUP_SQL}
select pg_prewarm('t_$i');"
done
echo $WARMUP_SQL > warmup.sql.tmp
echo -n "Warm up tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY -q -fwarmup.sql.tmp > /dev/null
$BINDIR/psql -dpostgres -p$PORT_SECONDARY -q -fwarmup.sql.tmp > /dev/null
rm -f warmup.sql.tmp
echo "DONE"
# Watch for tables
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -c"
call f_watch('PRIMARY', 3);
" &
$BINDIR/psql -dpostgres -p$PORT_SECONDARY --quiet --tuples-only -c"
call f_watch('SECONDARY', 6);
" &
# Delete tables
DELETE_SQL=""
for i in `seq 1 $N_TABLES`
do
DELETE_SQL="${DELETE_SQL}
drop table t_"$i";"
done
echo $DELETE_SQL > drop.sql.tmp
echo -n "Begin dropping tables ... "
$BINDIR/psql -dpostgres -p$PORT_PRIMARY --quiet --tuples-only -fdrop.sql.tmp
echo "DONE"
rm -f drop.sql.tmp
# Wait for jobs to be done
wait $(jobs -p)
["lazy-relations-delete.patch" (text/x-patch)]
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1f10a97dc78..e0d1fb6a824 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -66,8 +66,6 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
-#define DROP_RELS_BSEARCH_THRESHOLD 20
-
typedef struct PrivateRefCountEntry
{
Buffer buffer;
@@ -410,6 +408,72 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
}
}
+/*
+ * Lazy relations delete mechanism.
+ *
+ * On relation drop no need to walk shared buffers every time, just put a deleted
+ * RelFileNode into an array. Thus we store RelFileNodes in RelNodeDeleted
+ * struct and delete them later when number of deleted relations will
+ * exceed LAZY_DELETE_ARRAY_SIZE.
+ */
+
+#define LAZY_DELETE_ARRAY_SIZE 128
+
+/* Wrapper for array of lazy deleted RelFileNodes. */
+typedef struct RelNodeDeletedBuffer
+{
+ RelFileNode rnodes[LAZY_DELETE_ARRAY_SIZE]; /* Array of deleted */
+ int idx; /* Current position */
+} RelNodeDeletedBuffer;
+
+static RelNodeDeletedBuffer *RelNodeDeleted = NULL;
+
+/*
+ * Initialize shared array of lazy deleted relations.
+ *
+ * This is called once during shared-memory initialization (either in the
+ * postmaster, or in a standalone backend).
+ */
+void InitRelNodeDeletedBuffer(void)
+{
+ bool found;
+
+ RelNodeDeleted = ShmemInitStruct("Lazy Deleted Relations",
+ sizeof(RelNodeDeletedBuffer),
+ &found);
+
+ if (!found)
+ MemSet(RelNodeDeleted, 0, sizeof(RelNodeDeletedBuffer));
+}
+
+/*
+ * Compute the size of shared memory for the buffer of lazy deleted relations.
+ */
+Size
+RelNodeDeletedBufferSize(void)
+{
+ Size size = 0;
+
+ size = add_size(size, sizeof(RelNodeDeletedBuffer));
+
+ return size;
+}
+
+/*
+ * Check for relation is in lazy deleted buffer (up to n-th position).
+ */
+static inline bool
+IsRelFileNodeDeleted(RelFileNode rnode, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (RelFileNodeEquals(rnode, RelNodeDeleted->rnodes[i]))
+ return true;
+
+ return false;
+}
+
/*
* BufferIsPinned
* True iff the buffer is pinned (also checks for valid buffer number).
@@ -452,6 +516,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr);
+static void InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
@@ -2690,6 +2755,22 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
char *bufToWrite;
uint32 buf_state;
+ LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+ /*
+ * If rnode is in lazy deleted buffer clear dirty and checkpoint flags.
+ */
+ if (IsRelFileNodeDeleted(buf->tag.rnode, RelNodeDeleted->idx))
+ {
+ buf_state = LockBufHdr(buf);
+ buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+ UnlockBufHdr(buf, buf_state);
+ LWLockRelease(LazyRelDeleteLock);
+ return;
+ }
+
+ LWLockRelease(LazyRelDeleteLock);
+
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so we need
@@ -2993,6 +3074,43 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
}
}
+/*
+ * Invalidate all of the nodes buffers.
+ */
+void
+InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes)
+{
+ int i;
+
+ pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ RelFileNode *rnode = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, nnodes, sizeof(RelFileNode),
+ rnode_comparator);
+
+ /* buffer doesn't belong to any of the given relfilenodes; skip it */
+ if (rnode == NULL)
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
/* ---------------------------------------------------------------------
* DropRelFileNodesAllBuffers
*
@@ -3006,25 +3124,27 @@ void
DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
int i,
+ j,
n = 0;
RelFileNode *nodes;
- bool use_bsearch;
if (nnodes == 0)
return;
nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
- /* If it's a local relation, it's localbuf.c's problem. */
for (i = 0; i < nnodes; i++)
{
+ /* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnodes[i]))
{
if (rnodes[i].backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnodes[i].node);
}
else
+ {
nodes[n++] = rnodes[i].node;
+ }
}
/*
@@ -3037,58 +3157,41 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
return;
}
- /*
- * For low number of relations to drop just use a simple walk through, to
- * save the bsearch overhead. The threshold to use is rather a guess than
- * an exactly determined value, as it depends on many factors (CPU and RAM
- * speeds, amount of shared buffers etc.).
- */
- use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+ if (n > LAZY_DELETE_ARRAY_SIZE)
+ {
+ InvalidateRelFileNodesBuffers(nodes, n);
+ }
+ else
+ {
+ int idx;
- /* sort the list of rnodes if necessary */
- if (use_bsearch)
- pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+ LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+ idx = RelNodeDeleted->idx;
+ Assert(idx < LAZY_DELETE_ARRAY_SIZE);
- for (i = 0; i < NBuffers; i++)
- {
- RelFileNode *rnode = NULL;
- BufferDesc *bufHdr = GetBufferDescriptor(i);
- uint32 buf_state;
+ /* Copy deleted relations into lazy deleted array, but watch the size ... */
+ for (i = idx, j = 0; i < LAZY_DELETE_ARRAY_SIZE && j < n; i++, j++)
+ RelNodeDeleted->rnodes[i] = rnodes[j].node;
- /*
- * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
- * and saves some cycles.
- */
+ idx += j;
- if (!use_bsearch)
+ if (idx == LAZY_DELETE_ARRAY_SIZE)
{
- int j;
+ InvalidateRelFileNodesBuffers(RelNodeDeleted->rnodes, idx);
- for (j = 0; j < n; j++)
- {
- if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
- {
- rnode = &nodes[j];
- break;
- }
- }
+ /* ... and сopy rest of deleted relations. */
+ for (i = 0; j < n; i++, j++)
+ RelNodeDeleted->rnodes[i] = rnodes[j].node;
+
+ RelNodeDeleted->idx = i;
}
else
{
- rnode = bsearch((const void *) &(bufHdr->tag.rnode),
- nodes, n, sizeof(RelFileNode),
- rnode_comparator);
+ RelNodeDeleted->idx = idx;
}
- /* buffer doesn't belong to any of the given relfilenodes; skip it */
- if (rnode == NULL)
- continue;
-
- buf_state = LockBufHdr(bufHdr);
- if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
- InvalidateBuffer(bufHdr); /* releases spinlock */
- else
- UnlockBufHdr(bufHdr, buf_state);
+ Assert(RelNodeDeleted->idx < LAZY_DELETE_ARRAY_SIZE);
+ LWLockRelease(LazyRelDeleteLock);
}
pfree(nodes);
@@ -3133,6 +3236,22 @@ DropDatabaseBuffers(Oid dbid)
else
UnlockBufHdr(bufHdr, buf_state);
}
+
+ /*
+ * Remove all relations of dbid from lazy deleted array.
+ */
+ LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+ {
+ int idx = RelNodeDeleted->idx;
+
+ for (i = 0; i < idx; i++)
+ {
+ if (RelNodeDeleted->rnodes[i].dbNode == dbid)
+ RelNodeDeleted->rnodes[i] = RelNodeDeleted->rnodes[--idx];
+ }
+ RelNodeDeleted->idx = idx;
+ }
+ LWLockRelease(LazyRelDeleteLock);
}
/* -----------------------------------------------------------------
@@ -3330,6 +3449,15 @@ FlushDatabaseBuffers(Oid dbid)
if (bufHdr->tag.rnode.dbNode != dbid)
continue;
+ LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+ if (IsRelFileNodeDeleted(bufHdr->tag.rnode, RelNodeDeleted->idx))
+ {
+ LWLockRelease(LazyRelDeleteLock);
+ continue;
+ }
+
+ LWLockRelease(LazyRelDeleteLock);
ReservePrivateRefCountEntry();
buf_state = LockBufHdr(bufHdr);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 4829953ee64..fdfdb78dff8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
sizeof(ShmemIndexEnt)));
size = add_size(size, BufferShmemSize());
+ size = add_size(size, RelNodeDeletedBufferSize());
size = add_size(size, LockShmemSize());
size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
@@ -217,6 +218,7 @@ CreateSharedMemoryAndSemaphores(void)
SUBTRANSShmemInit();
MultiXactShmemInit();
InitBufferPool();
+ InitRelNodeDeletedBuffer();
/*
* Set up lock manager
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index db478432291..a35e99190ff 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -49,3 +49,4 @@ MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
LogicalRepWorkerLock 43
CLogTruncationLock 44
+LazyRelDeleteLock 45
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 17b97f7e38f..9cb51bc5f0d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -179,6 +179,7 @@ extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
BlockNumber blockNum);
extern void InitBufferPool(void);
+extern void InitRelNodeDeletedBuffer(void);
extern void InitBufferPoolAccess(void);
extern void InitBufferPoolBackend(void);
extern void AtEOXact_Buffers(bool isCommit);
@@ -205,6 +206,7 @@ extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
extern void PrintPinnedBufs(void);
#endif
extern Size BufferShmemSize(void);
+extern Size RelNodeDeletedBufferSize(void);
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic