[prev in list] [next in list] [prev in thread] [next in thread] 

List:       sanlock-devel
Subject:    [sanlock] 04/04: sanlock: fix release interference with paxos
From:       git () pagure ! io (git repository hosting)
Date:       2017-11-22 22:20:27
Message-ID: 20171122222024.39FE3246102A () pagure01 ! fedoraproject ! org
[Download RAW message or body]

This is an automated email from the git hooks/post-receive script.

teigland pushed a commit to branch testing
in repository sanlock.

commit 76332a0c5364578ea01adc801c743984aaf62e9f
Author: David Teigland <teigland@redhat.com>
Date:   Wed Nov 22 16:04:36 2017 -0600

    sanlock: fix release interference with paxos
    
    The process of releasing a lease on disk involved
    first zeroing our own dblock values prior to zeroing
    the leader record.  Clearing dblock values can
    interfere with an in-progress paxos ballot, and can
    result in a second host owning the same lease version
    that we released.
    
    The incorrect zeroing of dblocks was added in
    commit d6bef45b9716c581d99466280a52a01c9ebe3bf7
    
    The reason for zeroing was to fix an issue with shared
    leases here: e40e1f6e22f9b10f08d53fc7da94f5158d9e4ae8
    That issue is now resolved by setting a new flag in the
    dblock to indicate we are releasing the lease.  The new
    flag is set, along with the unchanged dblock values,
    before clearing the leader block.
---
 src/leader.h           |  2 +-
 src/ondisk.c           |  2 ++
 src/paxos_dblock.h     |  8 +++++
 src/paxos_lease.c      | 86 ++++++++++++++++++++++++++++++++++++++++++++------
 src/resource.c         | 64 +++++++++++++++++++++++++++++++------
 src/sanlock_internal.h |  1 +
 6 files changed, 143 insertions(+), 20 deletions(-)

diff --git a/src/leader.h b/src/leader.h
index b95866a..d67c243 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -18,7 +18,7 @@
 #define PAXOS_DISK_MAGIC 0x06152010
 #define PAXOS_DISK_CLEAR 0x11282016
 #define PAXOS_DISK_VERSION_MAJOR 0x00060000
-#define PAXOS_DISK_VERSION_MINOR 0x00000002
+#define PAXOS_DISK_VERSION_MINOR 0x00000003
 
 #define DELTA_DISK_MAGIC 0x12212010
 #define DELTA_DISK_VERSION_MAJOR 0x00030000
diff --git a/src/ondisk.c b/src/ondisk.c
index 83b6832..21f9b64 100644
--- a/src/ondisk.c
+++ b/src/ondisk.c
@@ -92,6 +92,7 @@ void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock \
*pd)  pd->inp3     = le64_to_cpu(end->inp3);
 	pd->lver     = le64_to_cpu(end->lver);
 	pd->checksum = le32_to_cpu(end->checksum);
+	pd->flags    = le32_to_cpu(end->flags);
 }
 
 void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end)
@@ -104,6 +105,7 @@ void paxos_dblock_out(struct paxos_dblock *pd, struct \
paxos_dblock *end)  end->lver     = cpu_to_le64(pd->lver);
 	/* N.B. the checksum must be computed after the byte swapping */
 	/* paxos_dblock_out(pd, end); checksum = compute(end), end->checksum = \
cpu_to_le32(checksum); */ +	end->flags    = cpu_to_le32(pd->flags);
 }
 
 void mode_block_in(struct mode_block *end, struct mode_block *mb)
diff --git a/src/paxos_dblock.h b/src/paxos_dblock.h
index a07abc4..7d08c95 100644
--- a/src/paxos_dblock.h
+++ b/src/paxos_dblock.h
@@ -12,6 +12,8 @@
 
 #define DBLOCK_CHECKSUM_LEN      48  /* ends before checksum field */
 
+#define DBLOCK_FL_RELEASED	0x00000001
+
 struct paxos_dblock {
 	uint64_t mbal;
 	uint64_t bal;
@@ -20,6 +22,12 @@ struct paxos_dblock {
 	uint64_t inp3;  /* host_id's timestamp */
 	uint64_t lver;
 	uint32_t checksum;
+	uint32_t flags; /* DBLOCK_FL_ */
 };
 
+/*
+ * This struct cannot grow any larger than MBLOCK_OFFSET (128)
+ * because the mode_block starts at that offset in the same sector.
+ */
+
 #endif
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index 777757b..59a854b 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -1696,9 +1696,9 @@ int paxos_lease_acquire(struct task *task,
 			if (cur_leader.write_id != cur_leader.owner_id) {
 				rv = read_dblock(task, token, &token->disks[0],
 						 cur_leader.owner_id, &owner_dblock);
-				if (!rv && !owner_dblock.lver) {
+				if (!rv && (owner_dblock.flags & DBLOCK_FL_RELEASED)) {
 					/* not an error, but interesting to see */
-					log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock \
free", +					log_errot(token, "paxos_acquire owner %llu %llu %llu writer %llu owner \
dblock released",  (unsigned long long)cur_leader.owner_id,
 						  (unsigned long long)cur_leader.owner_generation,
 						  (unsigned long long)cur_leader.timestamp,
@@ -2026,28 +2026,68 @@ int paxos_lease_release(struct task *task,
 		last = leader_last;
 
 	if (leader.lver != last->lver) {
-		log_errot(token, "paxos_release %llu other lver %llu",
+		log_errot(token, "paxos_release other lver "
+			  "last lver %llu owner %llu %llu %llu writer %llu %llu %llu "
+			  "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu",
 			  (unsigned long long)last->lver,
-			  (unsigned long long)leader.lver);
+			  (unsigned long long)last->owner_id,
+			  (unsigned long long)last->owner_generation,
+			  (unsigned long long)last->timestamp,
+			  (unsigned long long)last->write_id,
+			  (unsigned long long)last->write_generation,
+			  (unsigned long long)last->write_timestamp,
+			  (unsigned long long)leader.lver,
+			  (unsigned long long)leader.owner_id,
+			  (unsigned long long)leader.owner_generation,
+			  (unsigned long long)leader.timestamp,
+			  (unsigned long long)leader.write_id,
+			  (unsigned long long)leader.write_generation,
+			  (unsigned long long)leader.write_timestamp);
 		return SANLK_RELEASE_LVER;
 	}
 
 	if (leader.timestamp == LEASE_FREE) {
-		log_errot(token, "paxos_release %llu already free %llu %llu %llu",
+		log_errot(token, "paxos_release already free "
+			  "last lver %llu owner %llu %llu %llu writer %llu %llu %llu "
+			  "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu",
 			  (unsigned long long)last->lver,
+			  (unsigned long long)last->owner_id,
+			  (unsigned long long)last->owner_generation,
+			  (unsigned long long)last->timestamp,
+			  (unsigned long long)last->write_id,
+			  (unsigned long long)last->write_generation,
+			  (unsigned long long)last->write_timestamp,
+			  (unsigned long long)leader.lver,
 			  (unsigned long long)leader.owner_id,
 			  (unsigned long long)leader.owner_generation,
-			  (unsigned long long)leader.timestamp);
+			  (unsigned long long)leader.timestamp,
+			  (unsigned long long)leader.write_id,
+			  (unsigned long long)leader.write_generation,
+			  (unsigned long long)leader.write_timestamp);
+
 		return SANLK_RELEASE_OWNER;
 	}
 
 	if (leader.owner_id != token->host_id ||
 	    leader.owner_generation != token->host_generation) {
-		log_errot(token, "paxos_release %llu other owner %llu %llu %llu",
+		log_errot(token, "paxos_release other owner "
+			  "last lver %llu owner %llu %llu %llu writer %llu %llu %llu "
+			  "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu",
 			  (unsigned long long)last->lver,
+			  (unsigned long long)last->owner_id,
+			  (unsigned long long)last->owner_generation,
+			  (unsigned long long)last->timestamp,
+			  (unsigned long long)last->write_id,
+			  (unsigned long long)last->write_generation,
+			  (unsigned long long)last->write_timestamp,
+			  (unsigned long long)leader.lver,
 			  (unsigned long long)leader.owner_id,
 			  (unsigned long long)leader.owner_generation,
-			  (unsigned long long)leader.timestamp);
+			  (unsigned long long)leader.timestamp,
+			  (unsigned long long)leader.write_id,
+			  (unsigned long long)leader.write_generation,
+			  (unsigned long long)leader.write_timestamp);
+
 		return SANLK_RELEASE_OWNER;
 	}
 
@@ -2063,19 +2103,45 @@ int paxos_lease_release(struct task *task,
 		 * perfectly fine (use log_error since it's interesting
 		 * to see when this happens.)
 		 */
-		log_errot(token, "paxos_release %llu leader different "
-			  "write %llu %llu %llu vs %llu %llu %llu",
+		log_errot(token, "paxos_release different vals "
+			  "last lver %llu owner %llu %llu %llu writer %llu %llu %llu "
+			  "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu",
 			  (unsigned long long)last->lver,
+			  (unsigned long long)last->owner_id,
+			  (unsigned long long)last->owner_generation,
+			  (unsigned long long)last->timestamp,
 			  (unsigned long long)last->write_id,
 			  (unsigned long long)last->write_generation,
 			  (unsigned long long)last->write_timestamp,
+			  (unsigned long long)leader.lver,
+			  (unsigned long long)leader.owner_id,
+			  (unsigned long long)leader.owner_generation,
+			  (unsigned long long)leader.timestamp,
 			  (unsigned long long)leader.write_id,
 			  (unsigned long long)leader.write_generation,
 			  (unsigned long long)leader.write_timestamp);
+
 		/*
 		log_leader_error(0, token, &token->disks[0], last, "paxos_release");
 		log_leader_error(0, token, &token->disks[0], &leader, "paxos_release");
 		*/
+
+		/*
+		 * If another host was the writer and committed us as the
+		 * owner, then we don't zero the leader record when we release,
+		 * we just release our dblock (by setting the release flag,
+		 * already done prior to calling paxos_lease_release).  This is
+		 * because other hosts will ignore our leader record if we were
+		 * not the writer once we release our dblock.  Those other
+		 * hosts will then run a ballot and commit/write a new leader.
+		 * If we are also zeroing the leader, that can race with
+		 * another host writing a new leader, and we could clobber the
+		 * new leader.
+		 */
+		if (leader.write_id != token->host_id) {
+			log_token(token, "paxos_release %llu skip leader write", (unsigned long \
long)last->lver); +			return 0;
+		}
 	}
 
 	if (resrename)
diff --git a/src/resource.c b/src/resource.c
index 6309b9c..3339422 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -361,8 +361,21 @@ static int write_host_block(struct task *task, struct token \
*token,  log_errot(token, "write_host_block host_id %llu flags %x gen %llu rv %d",
 			  (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, rv);
 	} else {
-		log_token(token, "write_host_block host_id %llu flags %x gen %llu",
-			  (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen);
+		if (pd)
+			log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock \
%llu:%llu:%llu:%llu:%llu:%llu%s", +				  (unsigned long long)host_id,
+				  mb_flags,
+				  (unsigned long long)mb_gen,
+				  (unsigned long long)pd->mbal,
+				  (unsigned long long)pd->bal,
+				  (unsigned long long)pd->inp,
+				  (unsigned long long)pd->inp2,
+				  (unsigned long long)pd->inp3,
+				  (unsigned long long)pd->lver,
+				  (pd->flags & DBLOCK_FL_RELEASED) ? ":RELEASED." : ".");
+		else
+			log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock 0",
+				  (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen);
 	}
 
 	if (rv != SANLK_AIO_TIMEOUT)
@@ -370,6 +383,29 @@ static int write_host_block(struct task *task, struct token \
*token,  return rv;
 }
 
+static int write_host_block_zero_dblock_release(struct task *task, struct token \
*token) +{
+	struct paxos_dblock dblock;
+
+	memcpy(&dblock, &token->resource->acquire_dblock, sizeof(dblock));
+
+	dblock.flags = DBLOCK_FL_RELEASED;
+
+	return write_host_block(task, token, token->host_id, 0, 0, &dblock);
+}
+
+static int write_host_block_shared_dblock_release(struct task *task, struct token \
*token) +{
+	struct paxos_dblock dblock;
+
+	memcpy(&dblock, &token->resource->acquire_dblock, sizeof(dblock));
+
+	dblock.flags = DBLOCK_FL_RELEASED;
+
+	return write_host_block(task, token, token->host_id, token->host_generation,
+				MBLOCK_SHARED, &dblock);
+}
+
 static int read_mode_block(struct task *task, struct token *token,
 			   uint64_t host_id, struct mode_block *mb_out)
 {
@@ -838,6 +874,8 @@ static int _release_token(struct task *task, struct token *token,
 	 */
 
 	if (r_flags & R_ERASE_ALL) {
+		/* FIXME: figure out what to clear to avoid disrupting ongoing paxos */
+
 		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
 		if (rv < 0) {
 			log_errot(token, "release_token erase all write_host_block %d", rv);
@@ -868,6 +906,8 @@ static int _release_token(struct task *task, struct token *token,
 			  (unsigned long long)lver, rv);
 
 	} else if (r_flags & R_UNDO_SHARED) {
+		/* FIXME: figure out what to clear to avoid disrupting ongoing paxos */
+
 		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
 		if (rv < 0) {
 			log_errot(token, "release_token undo shared write_host_block %d", rv);
@@ -889,7 +929,7 @@ static int _release_token(struct task *task, struct token *token,
 	} else if (r_flags & R_SHARED) {
 		/* normal release of sh lease */
 
-		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
+		rv = write_host_block_zero_dblock_release(task, token);
 		if (rv < 0) {
 			log_errot(token, "release_token shared write_host_block %d", rv);
 			ret = rv;
@@ -919,7 +959,7 @@ static int _release_token(struct task *task, struct token *token,
 		}
 
 		/* Failure here is not a big deal and can be ignored. */
-		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
+		rv = write_host_block_zero_dblock_release(task, token);
 		if (rv < 0)
 			log_errot(token, "release_token write_host_block %d", rv);
 
@@ -1300,8 +1340,7 @@ static int convert_ex2sh_token(struct task *task, struct \
resource *r, struct tok  if (r->flags & R_LVB_WRITE_RELEASE)
 		write_lvb_block(task, r, token);
 
-	/* FIXME: preserve dblock */
-	rv = write_host_block(task, token, token->host_id, token->host_generation, \
MBLOCK_SHARED, NULL); +	rv = write_host_block_shared_dblock_release(task, token);
 	if (rv < 0) {
 		log_errot(token, "convert_ex2sh write_host_block error %d", rv);
 		return rv;
@@ -1668,13 +1707,15 @@ int acquire_token(struct task *task, struct token *token, \
uint32_t cmd_flags,  if (!(token->acquire_flags & SANLK_RES_SHARED))
 		token->r.lver = leader.lver;
 
+	memcpy(&token->resource->acquire_dblock, &dblock, sizeof(dblock));
+
 	/*
 	 * acquiring shared lease, so we set SHARED in our mode_block
 	 * and release the leader owner.
 	 */
 
 	if (token->acquire_flags & SANLK_RES_SHARED) {
-		rv = write_host_block(task, token, token->host_id, token->host_generation, \
MBLOCK_SHARED, &dblock); +		rv = write_host_block_shared_dblock_release(task, token);
 		if (rv < 0) {
 			log_errot(token, "acquire_token sh write_host_block error %d", rv);
 			r->flags &= ~R_SHARED;
@@ -2033,6 +2074,8 @@ static void resource_thread_release(struct task *task, struct \
resource *r, struc  log_token(token, "release async r_flags %x", r_flags);
 
 	if (r_flags & R_ERASE_ALL) {
+		/* FIXME: figure out what to clear to avoid disrupting ongoing paxos */
+
 		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
 		if (rv < 0)
 			log_errot(token, "release async erase all write_host_block %d", rv);
@@ -2058,6 +2101,8 @@ static void resource_thread_release(struct task *task, struct \
resource *r, struc  (unsigned long long)r->leader.lver, rv);
 
 	} else if (r_flags & R_UNDO_SHARED) {
+		/* FIXME: figure out what to clear to avoid disrupting ongoing paxos */
+
 		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
 		if (rv < 0)
 			log_errot(token, "release async undo shared write_host_block %d", rv);
@@ -2075,7 +2120,7 @@ static void resource_thread_release(struct task *task, struct \
resource *r, struc  } else if (r_flags & R_SHARED) {
 		/* normal release of sh lease */
 
-		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
+		rv = write_host_block_zero_dblock_release(task, token);
 		if (rv < 0)
 			log_errot(token, "release async shared write_host_block %d", rv);
 
@@ -2094,7 +2139,7 @@ static void resource_thread_release(struct task *task, struct \
resource *r, struc  }
 
 		/* Failure here is not a big deal and can be ignored. */
-		rv = write_host_block(task, token, token->host_id, 0, 0, NULL);
+		rv = write_host_block_zero_dblock_release(task, token);
 		if (rv < 0)
 			log_errot(token, "release async write_host_block %d", rv);
 
@@ -2256,6 +2301,7 @@ static void *resource_thread(void *arg GNUC_UNUSED)
 			tt->host_generation = r->host_generation;
 			tt->token_id = r->release_token_id;
 			tt->io_timeout = r->io_timeout;
+			tt->resource = r;
 
 			/*
 			 * Set the time after which we should try to release this
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 472380f..279b56d 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -130,6 +130,7 @@ struct resource {
 	char killpath[SANLK_HELPER_PATH_LEN]; /* copied from client */
 	char killargs[SANLK_HELPER_ARGS_LEN]; /* copied from client */
 	struct leader_record leader; /* copy of last leader_record we wrote */
+	struct paxos_dblock acquire_dblock; /* dblock we wrote in acquire */
 	struct sanlk_resource r;
 };
 

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.
_______________________________________________
sanlock-devel mailing list -- sanlock-devel@lists.fedorahosted.org
To unsubscribe send an email to sanlock-devel-leave@lists.fedorahosted.org


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic