/usr/lib/oar/sarko is in oar-server 2.5.6-2ubuntu1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl
# $Id$
#Almighty module : check walltimes and jobs to frag
use strict;
use DBI();
use Data::Dumper;
use OAR::IO;
use OAR::Modules::Judas qw(oar_debug oar_warn oar_error set_current_log_category);
use OAR::Conf qw(init_conf dump_conf get_conf is_conf);
use OAR::Tools;
# Log category
set_current_log_category('main');
# Job deletion (Leon) timers: start from the defaults provided by OAR::Tools,
# then let the configuration file override each of them.
my $Leon_soft_walltime = OAR::Tools::get_default_leon_soft_walltime();
my $Leon_walltime = OAR::Tools::get_default_leon_walltime();
init_conf($ENV{OARCONFFILE});
$Leon_soft_walltime = get_conf("JOBDEL_SOFTWALLTIME") if (is_conf("JOBDEL_SOFTWALLTIME"));
$Leon_walltime = get_conf("JOBDEL_WALLTIME") if (is_conf("JOBDEL_WALLTIME"));

# The hard limit has to be strictly greater than the soft one; otherwise it
# is forced to soft + 1 and a warning is logged.
if ($Leon_soft_walltime >= $Leon_walltime){
    $Leon_walltime = $Leon_soft_walltime + 1;
    oar_warn("[sarko] (JOBDEL_WALLTIME <= JOBDEL_SOFTWALLTIME) so I must set JOBDEL_WALLTIME to $Leon_walltime\n");
}

# Hosts used to reach deploy and cosystem jobs; both fall back on the server
# hostname when they are not configured.
my $Server_hostname = get_conf("SERVER_HOSTNAME");
my $Deploy_hostname = get_conf("DEPLOY_HOSTNAME");
$Deploy_hostname = $Server_hostname unless (defined($Deploy_hostname));
my $Cosystem_hostname = get_conf("COSYSTEM_HOSTNAME");
$Cosystem_hostname = $Server_hostname unless (defined($Cosystem_hostname));

# SSH command, with the OAR::Tools default when OPENSSH_CMD is not set
my $Openssh_cmd = get_conf("OPENSSH_CMD");
unless (defined($Openssh_cmd)){
    $Openssh_cmd = OAR::Tools::get_default_openssh_cmd();
}

# Optional settings forwarded to OAR::Tools only when they are configured
OAR::Tools::set_ssh_timeout(get_conf("OAR_SSH_CONNECTION_TIMEOUT")) if (is_conf("OAR_SSH_CONNECTION_TIMEOUT"));
OAR::Tools::set_default_oarexec_directory(get_conf("OAR_RUNTIME_DIRECTORY")) if (is_conf("OAR_RUNTIME_DIRECTORY"));

oar_debug("[sarko] JOBDEL_SOFTWALLTIME = $Leon_soft_walltime; JOBDEL_WALLTIME = $Leon_walltime\n");
# Open the database connection; the script exits with status 1 without it
my $base = OAR::IO::connect();
unless (defined($base)){
    oar_error("[sarko] Can not connect to the database\n");
    exit(1);
}
oar_debug("[sarko] Hello, identity control !!!\n");

# Exit status of the script: switched to 1 by the checks below as soon as a
# job has to be fragged, re-fragged or exterminated
my $guilty_found = 0;

# Date returned by OAR::IO::get_date, used as "now" in every comparison below
my $current = OAR::IO::get_date($base);
# Look at leon timers
# Decide if OAR must retry to delete the job or just change values in the database.
# For every job returned by OAR::IO::get_timered_job:
#   - job already Terminated/Error/Finishing  -> only mark it as fragged;
#   - hard limit ($Leon_walltime) exceeded     -> exterminate;
#   - soft limit ($Leon_soft_walltime) exceeded -> re-frag;
#   - otherwise                                 -> nothing to do yet.
foreach my $j (OAR::IO::get_timered_job($base)){
    my $job_id = $j->{job_id};
    my $job_ref = OAR::IO::get_job($base,$job_id);

    if (($job_ref->{state} eq "Terminated") || ($job_ref->{state} eq "Error") || ($job_ref->{state} eq "Finishing")){
        OAR::IO::job_fragged($base,$job_id);
        oar_debug("[sarko] I set to FRAGGED the job $job_id\n");
        next;
    }

    my $frag_date = OAR::IO::get_frag_date($base,$job_id);
    # Log both dates that are compared below (the original message printed
    # the frag date twice and never the current date)
    oar_debug("[sarko] frag date : $frag_date , current date : $current\n");

    if ($current > $frag_date+$Leon_walltime){
        # Hard limit exceeded
        oar_debug("[sarko] Leon will EXTERMINATE bipbip of job $job_id\n");
        OAR::IO::job_leon_exterminate($base,$job_id);
        $guilty_found=1;
    }elsif ($current > $frag_date+$Leon_soft_walltime){
        # Between the soft and the hard limit
        oar_debug("[sarko] Leon will RE-FRAG bipbip of job $job_id\n");
        OAR::IO::job_refrag($base,$job_id);
        $guilty_found=1;
    }else{
        oar_debug("[sarko] The leon timer is not yet expired for the job $job_id; I do nothing\n");
    }
}
# Look at job walltimes
# For every Running job:
#   - if start + walltime is in the past, the job is fragged and a WALLTIME
#     event is recorded;
#   - else, if the job has a checkpoint delay and we are within that delay of
#     the end of the walltime, SIGUSR2 is sent to it through
#     OAR::Tools::signal_oarexec and the outcome is recorded as an event.
foreach my $job (OAR::IO::get_jobs_in_state($base, "Running")){
    # Get starting time
    my $start = $job->{start_time};
    # Get maxtime
    my $mold_job = OAR::IO::get_current_moldable_job($base,$job->{assigned_moldable_job});
    my $max = $mold_job->{moldable_walltime};
    if ($job->{suspended} eq "YES"){
        # This job was suspended so we must recalculate the walltime
        $max += OAR::IO::get_job_suspended_sum_duration($base,$job->{job_id},$current);
    }
    oar_debug("[sarko] Job [$job->{job_id}] from $start with $max; current time=$current\n");

    if ($current > $start+$max){
        oar_debug("--> (Elapsed)\n");
        $guilty_found=1;
        OAR::IO::lock_table($base,["frag_jobs","event_logs","jobs"]);
        OAR::IO::frag_job($base, $job->{job_id});
        OAR::IO::unlock_table($base);
        OAR::IO::add_new_event($base,"WALLTIME",$job->{job_id},"[sarko] Job [$job->{job_id}] from $start with $max; current time=$current (Elapsed)");
    }elsif (($job->{checkpoint} > 0) && ($current >= ($start+$max-$job->{checkpoint}))){
        # OAR must notify the job to checkpoint itself
        oar_debug("[sarko] Send checkpoint signal to the job $job->{job_id}\n");

        # Retrieve node names used by the job and pick the host to contact:
        # cosystem jobs (and jobs without any host) go through the cosystem
        # host, deploy jobs through the deploy host, others through their
        # first node.
        my @hosts = OAR::IO::get_job_current_hostnames($base,$job->{job_id});
        my $types = OAR::IO::get_job_types_hash($base,$job->{job_id});
        my $host_to_connect = $hosts[0];
        if ((defined($types->{cosystem})) or ($#hosts < 0)){
            $host_to_connect = $Cosystem_hostname;
        }elsif (defined($types->{deploy})){
            $host_to_connect = $Deploy_hostname;
        }
        OAR::IO::add_new_event($base,"CHECKPOINT",$job->{job_id},"User oar (sarko) requested a checkpoint on the job $job->{job_id} on $host_to_connect");

        my $str_comment;
        my @exit_codes;
        my $ssh_timeout = OAR::Tools::get_ssh_timeout();
        # Timeout the ssh command
        eval {
            # local: the handler must not outlive this eval
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm($ssh_timeout);
            @exit_codes = OAR::Tools::signal_oarexec($host_to_connect,$job->{job_id},"SIGUSR2",1,$base, $Openssh_cmd, '');
            alarm(0);
        };
        # Save $@ immediately, then make sure no alarm stays pending when the
        # eval was left by an error other than the timeout itself
        my $eval_error = $@;
        alarm(0);

        if ($eval_error){
            if ($eval_error eq "alarm\n"){
                # Report the host that was really contacted ($hosts[0] may be
                # undefined or differ for cosystem/deploy jobs)
                $str_comment = "[sarko] Cannot contact $host_to_connect, operation timouted ($ssh_timeout s). So I cannot send checkpoint signal to the job $job->{job_id} on $host_to_connect";
            }else{
                $str_comment = "[sarko] An unknown error occured during the sending of the checkpoint signal to the job $job->{job_id} on the host $host_to_connect";
            }
            oar_warn("$str_comment\n");
            OAR::IO::add_new_event($base,"CHECKPOINT_ERROR",$job->{job_id},$str_comment);
        }elsif ($exit_codes[0] == 0){
            # NOTE(review): an empty/undefined exit code list is also treated
            # as success here, as in the original code -- confirm against
            # what OAR::Tools::signal_oarexec returns
            $str_comment = "[sarko] The job $job->{job_id} was notified to checkpoint itself on the node $host_to_connect";
            oar_debug("$str_comment\n");
            OAR::IO::add_new_event($base,"CHECKPOINT_SUCCESSFULL",$job->{job_id},$str_comment);
        }else{
            $str_comment = "[sarko] The kill command return a bad exit code (@exit_codes) for the job $job->{job_id} on the node $host_to_connect";
            oar_warn("$str_comment\n");
            OAR::IO::add_new_event($base,"CHECKPOINT_ERROR",$job->{job_id},$str_comment);
        }
    }
}
# Desktop computing: resources returned by OAR::IO::get_expired_resources
# get "Suspected" as next state and a LOG_SUSPECTED event is recorded on
# their node.
my @resources = OAR::IO::get_expired_resources($base);
foreach my $expired (@resources){
    OAR::IO::set_resource_nextState($base, $expired, 'Suspected');
    my $info = OAR::IO::get_resource_info($base, $expired);
    OAR::IO::add_new_event_with_host($base, "LOG_SUSPECTED", 0, "The DESKTOP COMPUTING resource $expired has expired on node $info->{network_address}", [$info->{network_address}]);
}
# Notify Almighty once, and only when at least one resource was updated
if (@resources){
    my $almighty_host = get_conf("SERVER_HOSTNAME");
    my $almighty_port = get_conf("SERVER_PORT");
    OAR::Tools::notify_tcp_socket($almighty_host,$almighty_port,"ChState");
}
# Delay after which Absent/Suspected resources are switched to Dead:
# OAR::Tools default, overridden by DEAD_SWITCH_TIME when configured.
# A value <= 0 skips this check entirely.
my $dead_switch_time = OAR::Tools::get_default_dead_switch_time();
$dead_switch_time = get_conf("DEAD_SWITCH_TIME") if (is_conf("DEAD_SWITCH_TIME"));

if ($dead_switch_time > 0){
    my $nb_switched = 0;
    foreach my $res (OAR::IO::get_absent_suspected_resources_for_a_timeout($base,$dead_switch_time)){
        OAR::IO::set_resource_nextState($base,$res,"Dead");
        OAR::IO::update_resource_nextFinaudDecision($base,$res,"YES");
        oar_debug("[Sarko] Set the next state of $res to Dead\n");
        $nb_switched++;
    }
    # Notify Almighty only when at least one resource changed
    if ($nb_switched > 0){
        my $almighty_host = get_conf("SERVER_HOSTNAME");
        my $almighty_port = get_conf("SERVER_PORT");
        OAR::Tools::notify_tcp_socket($almighty_host,$almighty_port,"ChState");
    }
}

# Close the database connection; the exit status is 1 when at least one job
# had to be fragged, re-fragged or exterminated during this run, 0 otherwise
OAR::IO::disconnect($base);
exit($guilty_found);
|