This file is indexed.

/usr/share/arc/scan-pbs-job is in nordugrid-arc-arex 1.1.1-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
#!/bin/bash
#
#   Periodically read the PBS server log files and create mark files
# for jobs that have finished.
#   If the log files are not available, scan PBS for finished (absent) jobs
# and create mark files for those jobs instead.
#
# usage: scan_pbs_job [--config arc_config] control_dir ...

# ARC1 passes the config file first: an optional leading
# "--config <file>" pair is consumed here and the file name is kept in
# ARC_CONFIG (presumably for the helper scripts sourced below - confirm).
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi

# At least one control directory must remain on the command line.
if [ -z "$1" ] ; then exit 1 ; fi

# Absolute path of the directory holding this script. Everything is quoted
# so that an installation path containing whitespace still resolves.
basedir=$(dirname -- "$0")
basedir=$(cd -- "$basedir" > /dev/null && pwd) || exit $?

libexecdir="${ARC_LOCATION:-/usr}/lib/arc/"
pkgdatadir="$basedir"

. "${pkgdatadir}/configure-pbs-env.sh" || exit $?

# Assume that gm-kick and scan_common is installed in the same directory
GMKICK=${libexecdir}/gm-kick

. "${pkgdatadir}/scan_common.sh" || exit $?

# Temporary files go to $TMPDIR, falling back to /tmp when it is unset/empty.
: "${TMPDIR:=/tmp}"

# Directory holding the PBS server logs (overridable via CONFIG_pbs_log_path).
pbs_log_dir="${CONFIG_pbs_log_path:-/var/spool/pbs/server_logs}"

# RUNTIME_NODE_SEES_FRONTEND mirrors CONFIG_shared_filesystem with two
# special cases: an unset/empty setting defaults to "yes" (NFS), and an
# explicit "no" is stored locally as the empty string.
case "$CONFIG_shared_filesystem" in
  '') RUNTIME_NODE_SEES_FRONTEND=yes ;;
  no) RUNTIME_NODE_SEES_FRONTEND= ;;
  *)  RUNTIME_NODE_SEES_FRONTEND=$CONFIG_shared_filesystem ;;
esac

# The first control directory hosts this scanner's own state file.
# Every control directory (the first one included) is appended to
# control_dirs wrapped in double quotes, so that a later
# 'eval "set -- $control_dirs"' restores them as separate words.
# All positional parameters are consumed in the process.
control_dir=$1
control_dirs=
until [ $# -eq 0 ] ; do
  control_dirs="${control_dirs} \"$1\""
  shift
done

my_id=$(id -u)

# One state file per user name: "<log file name> <matching lines handled>".
state_file="${control_dir}/pbs_log_scan.$(id -un)"

# A missing or unreadable state file yields empty fields, i.e. 0 and 0.
lines=$(cat "$state_file" 2>/dev/null)
ldt=$(echo $lines | awk '{ print $1 }')
lines=$(echo $lines | awk '{ print $2 }')
lines_skip=$(( ${lines} + 0 ))
ldate=$(( ${ldt} + 0 ))
: "${lines_skip:=0}"
: "${ldate:=0}"

# find_by_local: print the path of the first job.<ID>.local file, across all
# control directories, that contains the fixed string "localid=$job_id".
# Globals:  control_dirs (read) - eval-ready, double-quoted list of dirs
#           job_id       (read) - PBS job id to look for
# Outputs:  at most one path on stdout; nothing when no file matches
find_by_local() {
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # Only jobs that have a status file under processing/ are considered.
    # Each NUL-terminated status path is rewritten to the matching
    # job.<ID>.local path in the control directory before grep sees it.
    # The directory is quoted so control dirs with whitespace are searched.
    find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
    | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
    | xargs -0 grep -F -l "localid=$job_id" 2>/dev/null
  done \
  | head -n 1
}

# find_by_grami: like find_by_local, but matches on the job.<ID>.grami file.
# Looks for the fixed string "joboption_jobid=$job_id" in the .grami file of
# every job that has a status file under processing/, and prints the path of
# the corresponding job.<ID>.local file for the first match.
# Globals:  control_dirs (read) - eval-ready, double-quoted list of dirs
#           job_id       (read) - PBS job id to look for
# Outputs:  at most one ".local" path on stdout; nothing when no file matches
find_by_grami() {
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # The directory is quoted so control dirs with whitespace are searched.
    find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
    | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.grami/g' \
    | xargs -0 grep -F -l "joboption_jobid=$job_id" 2>/dev/null
  done \
  | sed 's/\.grami$/.local/' \
  | head -n 1
}

# set_job_vars stores the already-split fields of one PBS log line, given as
# positional arguments, in the globals pbs_date, pbs_code, pbs_server,
# pbs_job, job_id, job_message and rest_line (arguments 1 to 7, in that
# order). Missing arguments leave the matching variable empty; arguments
# beyond the seventh are ignored.

set_job_vars() {
  pbs_date=$1 pbs_code=$2 pbs_server=$3 pbs_job=$4
  job_id=$5 job_message=$6 rest_line=$7
}

#
#  Main function for processing one PBS log.
#  Extracts log lines with code 0010 (job exited) and 0008 (job killed)
#
#  Globals read: control_dirs, lname (path of the log file), cname (its
#  name, recorded in the state file), lines_skip (number of matching lines
#  already handled), state_file, my_id, RUNTIME_NODE_SEES_FRONTEND, GMKICK.
#  For every new record whose job can be mapped to a job.<ID>.local file,
#  the job's diag values are updated, job.<ID>.lrms_done is written and the
#  grid manager is kicked.
#  Returns 0 right away, without touching the state file, when the log
#  contains no new matching lines.
#
# TODO this should be split into smaller functions
process_log_file () {
  eval "set -- $control_dirs"

  #we grep for finished jobs, then use sed to remove already processed lines
  #OBS: deleted jobs have a 0008 message with not much info in it. A 0010
  # message may follow (or not) with full usage stats. By this time the
  # job has already been processed, so this info is ignored!
  #TODO: make log scanning more intelligent.
  exited_killed_jobs=$(grep -E '^[^;]*;0010;[^;]*;Job;|^[^;]*;0008;[^;]*;Job;[^;]*;Exit_status=|^[^;]*;0008;[^;]*;Job;[^;]*;Job deleted' "${lname}" | tail -n+$(( $lines_skip + 1 )))

  # Nothing new to do. This must be 'return', not 'continue': there is no
  # enclosing loop inside this function, so a 'continue' here is rejected by
  # the shell and execution would fall through, recording one phantom
  # processed line (wc -l reports 1 for an empty string) in the state file.
  [ "x$exited_killed_jobs" = x ] && return 0

  #TODO should we add processed lines before jobs have actually been processed? What if the last job only has half a record?
  new_lines=$(echo "$exited_killed_jobs" | wc -l)

  lines_processed=$(( $lines_skip + $new_lines ))

  if [ "$lines_processed" -lt '0' ] ; then
    lines_processed=0;
  fi

  echo "$cname $lines_processed"> "$state_file"

  exited_killed_jobs=$(echo "$exited_killed_jobs" | sort -u)

  # force word splitting to happen only on newlines
  old_IFS=$IFS; IFS='
'
  for job in $exited_killed_jobs; do

    # Split line into fields by forcing word splitting to happen on ";" 
    IFS=";"
    set_job_vars $job
    IFS=$old_IFS

    # Try to extract exit code of PBS (note: if executable fails it's code goes to PBS)
    exit_code=$(echo "$job_message" | sed -n 's/^.*Exit_status=\([-0-9]*\).*/\1/p')
    # Keep only the first two dot-separated components of the PBS job id
    job_id=$(echo "$job_id" | awk '{split($0,field,".");print field[1]"."field[2]}')

    # look for this id in job.ID.local, then in job.ID.grami
    name=$(find_by_local)
    if [ -z "$name" ]; then
      name=$(find_by_grami)
      if [ -z "$name" ]; then continue; fi
    fi
    # Unless running as root, only handle job files owned by this user
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$name" ] ; then continue ; fi
    fi
    uid=$(get_owner_uid "$name")
    [ -z "$uid" ] && { log "Failed to stat $name"; continue; }

    base_name=$(echo "$name" 2>/dev/null | sed -n 's/\.local$//p')
    if [ -z "${base_name}" ] ; then continue ; fi

    # check if job already reported
    if [ -f "${base_name}.lrms_done" ] ; then continue ; fi

    statusfile=$(echo "$name" 2>/dev/null | sed -n 's/job\.\([^\.]*\)\.local$/processing\/job.\1.status/p')

    # more protection - check if grid-manager thinks job is still running
    grep -E 'INLRMS|SUBMIT|CANCELING' "$statusfile" >/dev/null 2>&1
    if [ ! $? = '0' ] ; then continue ; fi

    # So far only PBS exit code is available
    # It would be nice to have exit code of main executable
    exitcode=''

    # get session directory of this job
    sessiondir=$(grep -h '^sessiondir=' "${base_name}.local" | sed 's/^sessiondir=\(.*\)/\1/')
    diagfile="${sessiondir}.diag"
    commentfile="${sessiondir}.comment"

    if [ -z "$sessiondir" ] ; then
      log "Failed to determine the path of the job's session directory"
    else
      # have chance to obtain exit code
      if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
        # In case of non-NFS setup it may take some time till
        # diagnostics file is delivered. Wait for it max 2 minutes.
        # OBS: exitcode may never appear in the .diag file if the job was
        # killed. There will be a 2 minute delay for every such job!
        diag_tries=0
        while [ "$diag_tries" -lt 20 ] ; do
          job_read_diag # uses $sessiondir, $uid
          if [ ! -z "$exitcode" ] ; then break ; fi
          sleep 10
          diag_tries=$(( $diag_tries + 1 ))
          log "no exitcode in diag file $diagfile (try $diag_tries of 20)"
        done
      else
        job_read_diag # uses $sessiondir, $uid
      fi
    fi
    # Try to obtain message from PBS if any
    pbs_comment=$(do_as_uid "$uid" "tail -n 1 '$commentfile'")
    save_commentfile "$uid" "$commentfile" "${base_name}.errors"

    # Extract values from PBS
    walltime=$(echo "$job_message" | sed -n 's/^.*resources_used.walltime=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/p')
    cputime=$(echo "$job_message" | sed -n 's/^.*resources_used.cput=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/p')
    mem=$(echo "$job_message" | sed -n 's/^.*resources_used.mem=\([0-9]*\)kb.*/\1/p')
    vmem=$(echo "$job_message" | sed -n 's/^.*resources_used.vmem=\([0-9]*\)kb.*/\1/p')

    # Convert to utc and store as seconds
    date_to_utc_seconds "$pbs_date"
    if [ ! -z "$return_date_seconds" ]; then
      # Convert from seconds to YYYYMMDDHHMMSSZ
      seconds_to_mds_date "$return_date_seconds"
      endtime=$return_mds_date
      # Find out how many seconds the job executed
      interval_to_seconds "$walltime"
      if [ ! -z "$return_interval_seconds" ]; then
        # Convert from seconds to YYYYMMDDHHMMSSZ
        seconds_to_mds_date $(( $return_date_seconds - $return_interval_seconds ))
        starttime=$return_mds_date
      fi
    fi

    # Values to write to diag. These will override values already written.
    # NOTE(review): these variables (and starttime/endtime above) are only
    # assigned when a value was found and are never cleared in this loop, so
    # a record without usage data may inherit values left over from the
    # previous job - confirm against job_read_diag/job_write_diag.
    interval_to_seconds "$walltime"
    [ -n "$return_interval_seconds" ] && WallTime=$return_interval_seconds
    interval_to_seconds "$cputime"
    [ -n "$return_interval_seconds" ] && UserTime=$return_interval_seconds
    [ -n "$return_interval_seconds" ] && KernelTime=0
    [ -n "$mem" ]                     && UsedMemory=$mem
    [ -n "$vmem" ]                    && TotalMemory=$vmem
    [ -n "$starttime" ]               && LRMSStartTime=$starttime
    [ -n "$endtime" ]                 && LRMSEndTime=$endtime
    [ -n "$pbs_comment" ]             && LRMSMessage=$pbs_comment
    [ -n "$exit_code" ]               && LRMSExitcode=$exit_code
    job_write_diag

    # exitcode  = exit code of the job's executable (from the diag file)
    # exit_code = exit status reported by PBS in the log record
    if [ -z "$exitcode" ] ; then
      # No exit code of job means job was most probably killed
      if [ -z "$exit_code" ] ; then exit_code='-1'; fi
      if [ "$exit_code" = '0' ] ; then
        echo "Job $job_id failed but PBS have not noticed that" 1>&2
        echo "-1 Job failed but PBS reported 0 exit code." > "${base_name}.lrms_done"
      elif [ -z "$pbs_comment" ] ; then
        echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
        echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
      else
        echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
        echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
      fi
    else
      if [ -z "$exit_code" ] ; then exit_code='-1'; fi
      if [ ! "$exitcode" = 0 ] ; then
        if [ "$exit_code" = '0' ] ; then exit_code='-1'; fi
        echo "Job $job_id failed with exit code $exitcode, PBS reported $exit_code." 1>&2
        echo "$exit_code Job failed with exit code $exitcode." > "${base_name}.lrms_done"
      else
        if [ ! "$exit_code" = '0' ] ; then
          echo "Job finished properly but PBS reported $exit_code." 1>&2
          if [ -z "$pbs_comment" ] ; then
            echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
          else
            echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
          fi
        else
          # echo "Job finished without errors." 1>&2
          echo "0" > "${base_name}.lrms_done"
        fi
      fi
    fi
    # wake up GM
    ${GMKICK} "${base_name}.local"
  done
  IFS=$old_IFS
}

readable_logs=no
# Check $pbs_log_dir for readable files
# if any are found, process them and update relevant information.
# Only entries whose names consist solely of digits are considered; their
# names are compared numerically with ldate, the log name stored in the
# state file. Paths are quoted so a log directory containing whitespace
# still works.
if [ ! -z "${pbs_log_dir}" ] ; then
for cname in $(ls -1 "${pbs_log_dir}/" 2>/dev/null | grep '^[0-9]*$') ; do
  lname="${pbs_log_dir}/$cname"
  if [ ! -r "$lname" ] ; then continue ; fi
  readable_logs=yes
  if [ "$cname" -lt "$ldate" ] ; then
    # older than the log recorded in the state file: already handled
    continue
  elif [ "$cname" -gt "$ldate" ] ; then
    # newer log: nothing in it has been handled yet
    lines_skip=0
  fi
  echo "Date: " $cname
  # remembered so the follow-up loop can detect later modifications
  last_modified=$(stat "$lname" | grep Modify)
  process_log_file
done
fi

# main loop: keep watching the last log file examined above and re-process
# it whenever its modification time changes while we are running.
# NOTE: each pass sleeps 10 seconds and the loop ends once time_count
# exceeds 60, i.e. after 61 passes (roughly 10 minutes) - not the 60 seconds
# an earlier version of this comment claimed.
if [ "$readable_logs" = 'yes' ] ; then
  time_count=0
  while true ; do
    new_modified=$(stat "$lname" | grep Modify)
    if [ "$new_modified" != "$last_modified" ] ; then
      last_modified="$new_modified"
      # reload "<log name> <lines handled>" written by process_log_file
      lines=$(cat "$state_file" 2>/dev/null)
      ldt=$(echo $lines | awk '{split($0,field," ");print field[1]}' )
      lines=$(echo $lines | awk '{split($0,field," ");print field[2]}')
      lines_skip=$(( $lines + 0 ))
      ldate=$(( $ldt + 0 ))
      process_log_file
    fi
    sleep 10
    time_count=$(( $time_count + 1 ))
    if [ "$time_count" -gt 60 ] ; then break ; fi
  done
  exit 0
fi

# Fallback when no readable PBS logs were found: use plain 'qstat'.
# The control directories become the positional parameters again.
eval "set -- $control_dirs"

# Save the 'qstat -a' listing of all jobs in a temporary file.
if ! pidslist=$(mktemp "$TMPDIR/qstat.XXXXXX") ; then
  # FS problems ?
  # TODO debug output here
  sleep 60
  exit 1
fi
if ! ${PBS_BIN_PATH}/qstat -a 2>/dev/null 1>"$pidslist" ; then
  # PBS server down ?
  rm -f "$pidslist"
  sleep 60
  exit 1
fi

# Filter stdin to stdout, dropping every line whose 10th
# whitespace-separated field is exactly "C".
exclude_completed () {
  awk '{ if ($10 != "C") print }'
}

# Numeric ids of the jobs PBS still lists: the leading number of every
# "<number>.<rest>" row of the saved qstat output that exclude_completed
# lets through.
pids=$(cat "$pidslist" | grep '^[0-9][0-9]*\.' | exclude_completed | sed 's/^\([0-9][0-9]*\).*/\1/')
rm -f "$pidslist"
# Go through directories
# (control directory paths are quoted in every find call so that
# directories containing whitespace are handled)
for ctr_dir in "$@" ; do
  # Obtain ids stored in job.*.local
  ids=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
        | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
        | xargs -0 grep -h "^localid=" 2>/dev/null | sed 's/^localid=\([0-9]*\).*/\1/')
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    found=$(echo "$pids" | grep "^$id$")
    if [ -z "$found" ] ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to curent local id
    jobfile=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
              | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
              | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    # Unless running as root, only handle job files owned by this user
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }
    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/processing/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    status=$(cat "$statusfile")
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ]; then continue ; fi

    # get session directory of this job
    session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
    if [ ! -z "$session" ] ; then
      # have chance to obtain exit code
      diagfile="${session}.diag"
      exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
      if [ ! -z "$exitcode" ] ; then
        # job finished and exit code is known
        save_commentfile "$uid" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        ${GMKICK} "$jobfile"
        echo "Job $gridid finished with exit code $exitcode"
        continue
      fi
    fi
    # job has probaly finished and exit code is not known
    exitcode='-1'
    # job.<ID>.lrms_job counts consecutive scans in which the job was
    # missing from qstat; the job is declared lost once the count exceeds 5
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ] ; then
      counter=$(cat "$countfile")
      counter=$(( $counter + 1 ))
    fi
    if [ "$counter" -gt 5 ] ; then
      rm -f "$countfile"
      save_commentfile "$uid" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      ${GMKICK} "$jobfile"
      echo "Job $gridid finished with unknown exit code"
    else
      echo "$counter" > "$countfile"
    fi
  done
  # go through existing ids
  for id in $pids ; do
    # find grid job corresponding to curent local id
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    # reset failure counter
    rm -f "$countfile"
  done
done
sleep 60
exit 0