[prev in list] [next in list] [prev in thread] [next in thread]
List: lxc-devel
Subject: [lxc-devel] [lxd/master] Make database queries timeout after 10s if cluster db is unavail
From: freeekanayaka on Github <lxc-bot () linuxcontainers ! org>
Date: 2018-08-29 11:28:59
Message-ID: 20180829112859.D30FA5A518 () mailman01 ! srv ! dcmtl ! stgraber ! net
[Download RAW message or body]
[Attachment #2 (text/x-mailbox)]
The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/4983
This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.
=== Description (from pull-request) ===
This makes database queries timeout after 10 seconds if quorum is lost **after** the \
node had initially started fine (e.g. cluster was up but a majority of db nodes were \
shutdown), which is probably the most common case we want to address.
What it does **not** do is trying to make the node minimally operational (e.g. server \
responding to "lxc info" without hanging forever) in the case where the daemon is \
still going through its startup sequence and it's stuck waiting for other nodes to be \
upgraded, since it found that its own version is more recent. That information is of \
course present in the logs of the daemon tho, so it should be pretty obvious to \
figure what's wrong with the stuck client.
Given that we're going to trigger snap refreshes explicitly when we detect that one \
node has upgraded, probably it's not worth too much polishing the UX for that window \
of time. It desired, that can be done in a separate branch.
(partially) Fixes #4479.
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
[Attachment #3 (text/plain)]
From 066ea3acbe4da5a7e1a292df203f933fd7a56fdd Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka@canonical.com>
Date: Wed, 29 Aug 2018 12:57:54 +0200
Subject: [PATCH] Make database queries timeout after 10s if cluster db is
unavail
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
---
lxd/api.go | 4 +++-
lxd/api_cluster.go | 1 +
lxd/cluster/heartbeat_test.go | 2 +-
lxd/cluster/membership_test.go | 6 ++++--
lxd/daemon.go | 3 ++-
lxd/db/db.go | 13 ++++++-------
lxd/db/query/retry.go | 2 +-
lxd/db/testing.go | 3 ++-
lxd/response.go | 6 +++++-
9 files changed, 25 insertions(+), 15 deletions(-)
diff --git a/lxd/api.go b/lxd/api.go
index f6509de1c3..2ab2736697 100644
--- a/lxd/api.go
+++ b/lxd/api.go
@@ -63,7 +63,9 @@ func (s *lxdHttpServer) ServeHTTP(rw http.ResponseWriter, req \
*http.Request) { return nil
})
if err != nil {
- http.Error(rw, err.Error(), http.StatusInternalServerError)
+ response := SmartError(err)
+ response.Render(rw)
+ return
}
}
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 785100ea4a..c585bca840 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -477,6 +477,7 @@ func clusterPutDisable(d *Daemon) Response {
store := d.gateway.ServerStore()
d.cluster, err = db.OpenCluster(
"db.bin", store, address, "/unused/db/dir",
+ d.config.DqliteSetupTimeout,
dqlite.WithDialFunc(d.gateway.DialFunc()),
dqlite.WithContext(d.gateway.Context()),
)
diff --git a/lxd/cluster/heartbeat_test.go b/lxd/cluster/heartbeat_test.go
index b6d698ad59..68c9883518 100644
--- a/lxd/cluster/heartbeat_test.go
+++ b/lxd/cluster/heartbeat_test.go
@@ -253,7 +253,7 @@ func (f *heartbeatFixture) node() (*state.State, \
*cluster.Gateway, string) { store := gateway.ServerStore()
dial := gateway.DialFunc()
state.Cluster, err = db.OpenCluster(
- "db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dial))
+ "db.bin", store, address, "/unused/db/dir", 5*time.Second, \
dqlite.WithDialFunc(dial)) require.NoError(f.t, err)
f.gateways[len(f.gateways)] = gateway
diff --git a/lxd/cluster/membership_test.go b/lxd/cluster/membership_test.go
index 85707961b6..a954381e2d 100644
--- a/lxd/cluster/membership_test.go
+++ b/lxd/cluster/membership_test.go
@@ -6,6 +6,7 @@ import (
"net/http"
"path/filepath"
"testing"
+ "time"
"github.com/CanonicalLtd/go-dqlite"
"github.com/lxc/lxd/lxd/cluster"
@@ -258,6 +259,7 @@ func TestJoin(t *testing.T) {
var err error
targetState.Cluster, err = db.OpenCluster(
"db.bin", targetStore, targetAddress, "/unused/db/dir",
+ 10*time.Second,
dqlite.WithDialFunc(targetDialFunc))
require.NoError(t, err)
@@ -294,7 +296,7 @@ func TestJoin(t *testing.T) {
dialFunc := gateway.DialFunc()
state.Cluster, err = db.OpenCluster(
- "db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+ "db.bin", store, address, "/unused/db/dir", 5*time.Second, \
dqlite.WithDialFunc(dialFunc)) require.NoError(t, err)
f := &membershipFixtures{t: t, state: state}
@@ -382,7 +384,7 @@ func FLAKY_TestPromote(t *testing.T) {
store := targetGateway.ServerStore()
dialFunc := targetGateway.DialFunc()
targetState.Cluster, err = db.OpenCluster(
- "db.bin", store, targetAddress, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+ "db.bin", store, targetAddress, "/unused/db/dir", 5*time.Second, \
dqlite.WithDialFunc(dialFunc)) require.NoError(t, err)
targetF := &membershipFixtures{t: t, state: targetState}
targetF.NetworkAddress(targetAddress)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 4be1835898..55b7841033 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -475,9 +475,10 @@ func (d *Daemon) init() error {
store := d.gateway.ServerStore()
d.cluster, err = db.OpenCluster(
"db.bin", store, address, dir,
+ d.config.DqliteSetupTimeout,
dqlite.WithDialFunc(d.gateway.DialFunc()),
dqlite.WithContext(d.gateway.Context()),
- dqlite.WithConnectionTimeout(d.config.DqliteSetupTimeout),
+ dqlite.WithConnectionTimeout(10*time.Second),
dqlite.WithLogFunc(cluster.DqliteLog),
)
if err == nil {
diff --git a/lxd/db/db.go b/lxd/db/db.go
index 6c6954191b..40e4036d65 100644
--- a/lxd/db/db.go
+++ b/lxd/db/db.go
@@ -8,7 +8,6 @@ import (
"github.com/CanonicalLtd/go-dqlite"
"github.com/pkg/errors"
- "golang.org/x/net/context"
"github.com/lxc/lxd/lxd/db/cluster"
"github.com/lxc/lxd/lxd/db/node"
@@ -158,15 +157,15 @@ type Cluster struct {
// database matches our version, and possibly trigger a schema update. If the
// schema update can't be performed right now, because some nodes are still
// behind, an Upgrading error is returned.
-func OpenCluster(name string, store dqlite.ServerStore, address, dir string, options \
...dqlite.DriverOption) (*Cluster, error) { +func OpenCluster(name string, store \
dqlite.ServerStore, address, dir string, timeout time.Duration, options \
...dqlite.DriverOption) (*Cluster, error) { db, err := cluster.Open(name, store, \
options...) if err != nil {
return nil, errors.Wrap(err, "failed to open database")
}
- // Test that the cluster database is operational. We wait up to 10
- // minutes, in case there's no quorum of nodes online yet.
- timeout := time.After(10 * time.Minute)
+ // Test that the cluster database is operational. We wait up to the
+ // given timeout , in case there's no quorum of nodes online yet.
+ timer := time.After(timeout)
for i := 0; ; i++ {
// Log initial attempts at debug level, but use warn
// level after the 5'th attempt (about 10 seconds).
@@ -186,7 +185,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, \
dir string, opt }
cause := errors.Cause(err)
- if cause != context.DeadlineExceeded {
+ if cause != dqlite.ErrNoAvailableLeader {
return nil, err
}
@@ -199,7 +198,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, \
dir string, opt
time.Sleep(2 * time.Second)
select {
- case <-timeout:
+ case <-timer:
return nil, fmt.Errorf("failed to connect to cluster database")
default:
}
diff --git a/lxd/db/query/retry.go b/lxd/db/query/retry.go
index 0c78a8c7f7..1b288377a4 100644
--- a/lxd/db/query/retry.go
+++ b/lxd/db/query/retry.go
@@ -16,7 +16,7 @@ import (
func Retry(f func() error) error {
// TODO: the retry loop should be configurable.
var err error
- for i := 0; i < 20; i++ {
+ for i := 0; i < 5; i++ {
err = f()
if err != nil {
logger.Debugf("Database error: %#v", err)
diff --git a/lxd/db/testing.go b/lxd/db/testing.go
index 18a59e3031..850b474a0f 100644
--- a/lxd/db/testing.go
+++ b/lxd/db/testing.go
@@ -7,6 +7,7 @@ import (
"net"
"os"
"testing"
+ "time"
"github.com/CanonicalLtd/go-dqlite"
"github.com/CanonicalLtd/raft-test"
@@ -63,7 +64,7 @@ func NewTestCluster(t *testing.T) (*Cluster, func()) {
}
cluster, err := OpenCluster(
- "test.db", store, "1", "/unused/db/dir",
+ "test.db", store, "1", "/unused/db/dir", 5*time.Second,
dqlite.WithLogFunc(log), dqlite.WithDialFunc(dial))
require.NoError(t, err)
diff --git a/lxd/response.go b/lxd/response.go
index f4feb09769..dd56c0255b 100644
--- a/lxd/response.go
+++ b/lxd/response.go
@@ -11,7 +11,9 @@ import (
"os"
"time"
+ dqlite "github.com/CanonicalLtd/go-dqlite"
"github.com/mattn/go-sqlite3"
+ "github.com/pkg/errors"
lxd "github.com/lxc/lxd/client"
"github.com/lxc/lxd/lxd/cluster"
@@ -509,7 +511,7 @@ func PreconditionFailed(err error) Response {
* SmartError returns the right error message based on err.
*/
func SmartError(err error) Response {
- switch err {
+ switch errors.Cause(err) {
case nil:
return EmptySyncResponse
case os.ErrNotExist:
@@ -524,6 +526,8 @@ func SmartError(err error) Response {
return Conflict(nil)
case sqlite3.ErrConstraintUnique:
return Conflict(nil)
+ case dqlite.ErrNoAvailableLeader:
+ return Unavailable(err)
default:
return InternalError(err)
}
[Attachment #4 (text/plain)]
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic