[prev in list] [next in list] [prev in thread] [next in thread] 

List:       mesos-commits
Subject:    [mesos] 03/06: Added logging of tasks and operations during agent drain initiation.
From:       bmahler () apache ! org
Date:       2020-02-26 18:03:29
Message-ID: 20200226180326.E3FAD8DACE () gitbox ! apache ! org
[Download RAW message or body]

This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 73efa9f7bfad542fc153c25bd54e6c3e223c8382
Author: Benjamin Mahler <bmahler@apache.org>
AuthorDate: Wed Feb 12 20:40:33 2020 -0500

    Added logging of tasks and operations during agent drain initiation.
    
    When draining an agent, it's hard to tell from the logs which tasks
    failed to terminate. The master prints a count of the remaining
    tasks (though only at VLOG(1)), but not their IDs.
    
    This patch adds logging at drain initiation, on both the master and
    the agent, showing which tasks and operations are present. This
    makes it possible to see afterwards which ones did not transition
    to a terminal state (with a bit of log analysis effort).
    
    Review: https://reviews.apache.org/r/72124
---
 src/master/http.cpp | 24 ++++++++++++++++++++++++
 src/slave/slave.cpp | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/src/master/http.cpp b/src/master/http.cpp
index 0987d93..a93718a 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -3940,7 +3940,31 @@ Future<Response> Master::Http::_drainAgent(
           master->slaves.deactivated.insert(slaveId);
 
           Slave* slave = master->slaves.registered.get(slaveId);
+
+          // It's possible for the slave to be removed in the interim
+          // if it is marked unreachable.
           if (slave != nullptr) {
+            hashmap<FrameworkID, hashset<TaskID>> pendingTaskIds;
+            foreachpair (const FrameworkID& frameworkId,
+                         const auto& tasks,
+                         slave->pendingTasks) {
+              pendingTaskIds[frameworkId] = tasks.keys();
+            }
+
+            hashmap<FrameworkID, hashset<TaskID>> taskIds;
+            foreachpair (const FrameworkID& frameworkId,
+                         const auto& tasks,
+                         slave->tasks) {
+              taskIds[frameworkId] = tasks.keys();
+            }
+
+            LOG(INFO)
+              << "Transitioning agent " << slaveId << " to the DRAINING state"
+              << "; agent has (pending tasks, tasks, operations) == ("
+              << stringify(pendingTaskIds) << ", "
+              << stringify(taskIds) << ", "
+              << stringify(slave->operations.keys()) << ")";
+
             master->deactivate(slave);
 
             // Tell the agent to start draining.
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 2c2b5a9..23d2ddd 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -998,10 +998,42 @@ void Slave::drain(
     const UPID& from,
     DrainSlaveMessage&& drainSlaveMessage)
 {
+  hashmap<FrameworkID, hashset<TaskID>> pendingTaskIds;
+  foreachvalue (Framework* framework, frameworks) {
+    foreachvalue (const auto& taskMap, framework->pendingTasks) {
+      pendingTaskIds[framework->id()] = taskMap.keys();
+    }
+  }
+
+  hashmap<FrameworkID, hashset<TaskID>> queuedTaskIds;
+  foreachvalue (Framework* framework, frameworks) {
+    foreachvalue (Executor* executor, framework->executors) {
+      foreachkey (const TaskID& taskId, executor->queuedTasks) {
+        queuedTaskIds[framework->id()].insert(taskId);
+      }
+    }
+  }
+
+  hashmap<FrameworkID, hashset<TaskID>> launchedTaskIds;
+  foreachvalue (Framework* framework, frameworks) {
+    foreachvalue (Executor* executor, framework->executors) {
+      foreachkey (const TaskID& taskId, executor->launchedTasks) {
+        launchedTaskIds[framework->id()].insert(taskId);
+      }
+    }
+  }
+
   LOG(INFO)
-    << "Checkpointing DrainConfig. Previous drain config was "
-    << (drainConfig.isSome() ? stringify(drainConfig.get()) : "NONE")
-    << ", new drain config is " << drainSlaveMessage.config();
+    << "Initiating drain with DrainConfig " << drainSlaveMessage.config()
+    << (drainConfig.isSome()
+        ? "; overwriting previous DrainConfig " + stringify(*drainConfig)
+        : "")
+    << "; agent has (pending tasks, queued tasks, launched tasks, operations)"
+    << " == ("
+    << stringify(pendingTaskIds) << ", "
+    << stringify(queuedTaskIds) << ", "
+    << stringify(launchedTaskIds) << ", "
+    << stringify(operations.keys()) << ")";
 
   CHECK_SOME(state::checkpoint(
       paths::getDrainConfigPath(metaDir, info.id()),

[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic