[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: kdenonbeta/icecream/icecream/services
From: Stephan Kulow <coolo () kde ! org>
Date: 2005-12-09 16:42:01
Message-ID: 1134146521.088076.6954.nullmailer () svn ! kde ! org
[Download RAW message or body]
SVN commit 487155 by coolo:
hopefully catch the last bothering issue: no more pending DONE
jobs (that you only see if you have really lots of compile jobs)
CCMAIL: matz@suse.de
M +50 -27 scheduler.cpp
--- trunk/kdenonbeta/icecream/icecream/services/scheduler.cpp #487154:487155
@@ -167,6 +167,14 @@
Environments environments;
time_t starttime; // _local_ to the compiler server
time_t start_on_scheduler; // starttime local to scheduler
+ /**
+ * the end signal from client and daemon is a bit of a race and
+ * in 99.9% of all cases it's catched correctly. But for the remaining
+ * 0.1% we need a solution too - otherwise these jobs are eating up slots.
+ * So the solution is to track done jobs (client exited, daemon didn't signal)
+ * and after 10s no signal, kill the daemon (and let it rehup) **/
+ time_t done_time;
+
string target_platform;
string filename;
list<Job*> master_job_for;
@@ -175,19 +183,22 @@
Job (MsgChannel *c, unsigned int _id, CS *subm)
: id(_id), state(PENDING), server(0),
submitter(subm),
- client_channel(c), starttime(0), start_on_scheduler(0), arg_flags( 0 ) {}
+ client_channel(c), starttime(0), start_on_scheduler(0), done_time( 0 ), arg_flags( 0 ) {}
~Job()
{
// XXX is this really deleted on all other paths?
/* fd2chan.erase (channel->fd);
delete channel;*/
}
+
};
// A subset of connected_hosts representing the compiler servers
static list<CS*> css;
static unsigned int new_job_id;
static map<unsigned int, Job*> jobs;
+static map<unsigned int, Job*> done_jobs;
+
/* XXX Uah. Don't use a queue for the job requests. It's a hell
to delete anything out of them (for clean up). */
struct UnansweredList {
@@ -764,10 +775,10 @@
return bestui;
}
-/* Prunes the list of connected clients by those which haven't
+/* Prunes the list of connected servers by those which haven't
answered for a long time. */
static void
-prune_clients ()
+prune_servers ()
{
list<CS*>::iterator it;
@@ -800,6 +811,29 @@
++it;
}
+
+
+ /**
+ * check the jobs that were not cared about even though they are done
+ * (one in a million ;( */
+ for (map<unsigned int, Job*>::const_iterator it = done_jobs.begin();
+ it != done_jobs.end(); ++it)
+ {
+ Job *j = it->second;
+ if (j->done_time - now > 30 )
+ {
+ trace() << "undone " << dump_job( j ) << endl;
+ trace() << "FORCED removing " << j->server->nodename << endl;
+ handle_end( j->server, 0 );
+ /* the above will kill all jobs associated with this server, so
+ we better get out of this, as done_jobs is changed too and
+ we'll come back (</schwarzeneggeraccent>)
+ */
+ break;
+ }
+
+ }
+
}
static Job*
@@ -817,8 +851,6 @@
static bool
empty_queue()
{
- prune_clients ();
-
Job *job = get_job_request ();
if (!job)
return false;
@@ -855,7 +887,7 @@
&& can_install (cs, job).size()))
{
trace() << " and failed ";
-
+
#if DEBUG_SCHEDULER > 1
list<UnansweredList*>::iterator it;
for (it = toanswer.begin(); it != toanswer.end(); ++it)
@@ -1077,26 +1109,9 @@
notify_monitors (MonJobDoneMsg (*m));
j->server->busy_installing = 0;
jobs.erase (m->job_id);
+ done_jobs.erase (m->job_id);
delete j;
-
-#if DEBUG_SCHEDULER > 0
- if (new_job_id % 1000) // don't polute the log file in checking this for every job
- return true;
-
- bool first = true;
-
- for (map<unsigned int, Job*>::const_iterator it = jobs.begin();
- it != jobs.end(); ++it)
- {
- int id = it->first;
- Job *j = it->second;
- if (new_job_id - id > 2000)
- trace() << " undone: " << dump_job( j ) << endl;
- first = false;
- }
-#endif
-
return true;
}
@@ -1357,6 +1372,7 @@
if ((*jit)->server)
(*jit)->server->busy_installing = 0;
jobs.erase( (*jit)->id );
+ done_jobs.erase( (*jit)->id );
delete (*jit);
}
delete l;
@@ -1381,6 +1397,7 @@
job->server->joblist.remove (job);
if (job->server)
job->server->busy_installing = 0;
+ done_jobs.erase( job->id );
jobs.erase( mit++ );
delete job;
}
@@ -1442,6 +1459,7 @@
job->server->joblist.remove (job);
job->server->busy_installing = 0;
}
+ done_jobs.erase( job->id );
jobs.erase (it++);
delete job;
}
@@ -1458,8 +1476,11 @@
for (it = jobs.begin(); it != jobs.end(); ++it)
if (it->second->client_channel == c)
{
- it->second->client_channel = 0;
- it->second->state = Job::WAITINGFORDONE;
+ Job *done = it->second;
+ done->client_channel = 0;
+ done->state = Job::WAITINGFORDONE;
+ done_jobs[done->id] = done;
+ done->done_time = time( 0 );
}
}
}
@@ -1704,9 +1725,11 @@
while (1)
{
+ prune_servers ();
+
while (empty_queue())
continue;
-
+
fd_set read_set;
int max_fd = 0;
FD_ZERO (&read_set);
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic