#!/bin/sh
#
#  Periodically monitor for jobs which have finished or failed but have not
#  reported an exit code
#
# usage: parse_fork_log control_dir ...

id=$(id -u)

# Swap the two assignments below to append trace messages to a per-uid log
# file; with the default ':' every $debug call is a no-op.
#debug='eval echo >> /tmp/parse-fork-log.$id'
debug=:

$debug "run at $(date)"
$debug "options = $@"

# A non-empty first argument (a control directory) is required
[ -n "$1" ] || exit 1

# Where to store temporary files on gatekeeper
: "${TMP_DIR:=/tmp}"

# Numeric uid this script runs as; computed once instead of once per job.
fork_run_uid=$(id -u)

# Read diag text on stdin and print the numeric exit code from its
# "exitcode=N" line.  Prints nothing when no such line carries digits.
# If several lines match, only the last one is printed so that callers always
# get a single integer.
# NOTE(review): assumes the most recently appended record is the valid one.
fork_diag_exitcode() {
    sed -n 's/^exitcode=\([0-9][0-9]*\).*/\1/p' | tail -n 1
}

# Print the contents of file $1.  When running as root the file is read as
# user $2 through su; otherwise it is read directly.  Errors are silenced, so
# a missing or unreadable file simply produces no output.
fork_cat_as_user() {
    if [ "$fork_run_uid" -eq 0 ]; then
        # Single-quote the path for the shell started by su, so that spaces or
        # shell metacharacters in the path cannot alter the command.
        fork_quoted=$(printf '%s\n' "$1" | sed "s/'/'\\\\''/g")
        su "$2" -c "cat '$fork_quoted'" </dev/null
    else
        cat "$1"
    fi 2>/dev/null
}

# Succeed when process $1 exists and is owned by user $2.
# Ownership is compared by numeric uid, which avoids prefix matches between
# similar user names and user names truncated in ps output.  When $2 is empty
# only the existence of the process is checked.  Nothing is written to stdout.
fork_job_alive() {
    fork_owner_uid=$(ps -o uid= -p "$1" 2>/dev/null | tr -d ' ')
    [ -n "$fork_owner_uid" ] || return 1
    [ -n "$2" ] || return 0
    fork_user_uid=$(id -u "$2" 2>/dev/null) || return 1
    [ "$fork_owner_uid" = "$fork_user_uid" ]
}

# Examine job $2 in control directory $1.  If the job has no lrms_done file
# yet, has a grami file defining joboption_jobid, and its process is no longer
# running, write "<exitcode> <comment>" to job.$2.lrms_done.  The exit code is
# taken from the session directory diag file, then from the control directory
# diag file; if neither provides one the job is reported as died with code 1.
fork_check_job() {
    fork_cdir=$1
    job=$2
    $debug "scanning job = $job"
    unset joboption_jobid
    unset joboption_user
    unset joboption_directory

    [ -f "${fork_cdir}/job.${job}.lrms_done" ] && return 0

    [ -f "${fork_cdir}/job.${job}.grami" ] || return 0
    . "${fork_cdir}/job.${job}.grami"

    [ -n "$joboption_jobid" ] || return 0

    $debug "local jobid = $joboption_jobid"
    $debug "local user  = $joboption_user"

    if fork_job_alive "$joboption_jobid" "$joboption_user"; then
        $debug "process $joboption_jobid is still running"
        return 0
    fi
    $debug "process $joboption_jobid is gone"

    $debug "checking ${joboption_directory}.diag"
    exitcode=$(fork_cat_as_user "${joboption_directory}.diag" "$joboption_user" | fork_diag_exitcode)
    $debug "exitcode = [$exitcode] extracted from ${joboption_directory}.diag"
    if [ -z "$exitcode" ] && [ -r "${fork_cdir}/job.${job}.diag" ]; then
        $debug "checking ${fork_cdir}/job.${job}.diag"
        exitcode=$(fork_diag_exitcode < "${fork_cdir}/job.${job}.diag")
        $debug "exitcode = [$exitcode] extracted from ${fork_cdir}/job.${job}.diag"
    fi

    fork_comment=""
    if [ -z "$exitcode" ]; then
        echo "Job $job with PID $joboption_jobid died unexpectedly" >&2
        fork_comment="Job died unexpectedly"
        exitcode=1
    elif [ "$exitcode" -ne 0 ]; then
        fork_comment="Job finished with non-zero exit code"
    fi
    $debug "got exitcode=$exitcode"
    echo "$exitcode $fork_comment" > "${fork_cdir}/job.${job}.lrms_done"
}

# Check every job in control directory $1 whose status file mentions INLRMS.
# A missing directory is reported on stderr and skipped.
fork_scan_control_dir() {
    if [ ! -d "$1" ]; then
        echo "No control dir $1" >&2
        return 0
    fi

    # grep -l lists one matching status file per line; the job id is cut out
    # of the file name with parameter expansion so that ids containing "job"
    # and directories containing spaces are handled correctly.
    grep -l INLRMS "$1"/job.*.status 2>/dev/null |
    while IFS= read -r fork_status_file; do
        fork_job=${fork_status_file##*/}
        fork_job=${fork_job#job.}
        fork_job=${fork_job%.status}
        # stdin is detached so nothing in the check can consume the file list
        fork_check_job "$1" "$fork_job" </dev/null
    done
}

for control_dir in "$@" ; do
    fork_scan_control_dir "$control_dir"
done

# Trace the end of this pass (no-op unless the debug hook is enabled).
$debug "done, going to sleep"

# Pause for two minutes, then exit successfully.
# NOTE(review): presumably the caller re-runs this script in a loop and the
# sleep sets the polling interval — confirm against the invoking service.
sleep 120
exit 0
