

#!/bin/bash
#
#
#   Adapted to LSF
#   Use only 2nd method: scan for finished jobs
#
#   Sergio Maffioletti <sergio.maffioletti@cscs.ch>
#
# usage: scan_lsf_job control_dir ...


# Set variables:
#   LSF_BIN_PATH

# "Checking NORDUGRID... "
# NORDUGRID_LOCATION is the prefix under which libexec/configure-lsf-env.sh
# and libexec/gm-kick are looked up; nothing below works without it.
# The expansion is quoted so that a value containing whitespace is tested
# (and later sourced) as a single path.
if [ -z "${NORDUGRID_LOCATION}" ] ; then
    echo "NORDUGRID_LOCATION not set." 1>&2
    exit 1
fi

# "Sourcing ${NORDUGRID_LOCATION} config... " 1>&2
if [ ! -f "${NORDUGRID_LOCATION}/libexec/configure-lsf-env.sh" ] ; then
    echo "${NORDUGRID_LOCATION}/libexec/configure-lsf-env.sh not found." 1>&2
    exit 1
fi

# Pull in the LSF settings used further down (LSF_BIN_PATH, and presumably
# LSF_QUEUE_NAME / TMP_DIR -- confirm against configure-lsf-env.sh).
source "${NORDUGRID_LOCATION}/libexec/configure-lsf-env.sh"

# New files: owner read/write, group and others read-only.
umask 022

# Pick a stat(1) invocation that prints the owner of a file.  Each candidate
# is tried against this script itself ($0) with all output discarded; the
# first one that exits successfully is kept in STAT_USERNAME for later use
# via eval.  If neither works the script waits a minute and gives up.
STAT_USERNAME='stat -c %U'
if ! eval $STAT_USERNAME $0 >/dev/null 2>&1 ; then
  STAT_USERNAME='stat -c "uid (%U)"'
  if ! eval $STAT_USERNAME $0 >/dev/null 2>&1 ; then
    echo "Can't find useable stat utility" 1>&2
    sleep 60
    exit 1
  fi
fi

# At least one control directory must be given (usage: scan_lsf_job
# control_dir ...).  The first one is also logged and remembered below.
if [ -z "$1" ] ; then
    echo "Missing control directory as arg1" 1>&2
    exit 1
fi

# first control_dir is used for storing own files

echo $(date)" : control_dir=$1" >&2 #FIXME

# Remember the first argument on its own and all arguments as one
# space-separated string (each entry is preceded by a single blank).
# The positional parameters are consumed in the process.
control_dir=$1
control_dirs=
for dir_arg in "$@" ; do
  control_dirs="${control_dirs} ${dir_arg}"
done
shift $#

# Numeric uid and login name of the account running the scan.
my_id=$(id -u)

my_name=$(id -un)


# SM: no attempt to look for LSF Manager logfiles, restrict to job logs.


# Get all running jobs
#
# first running jobs, grep for MASTER to avoid slave procs
# pids=`${SGE_BIN_PATH}/qstat -s r 2>/dev/null | grep MASTER | grep '^ [0-9]* ' | sed 's/^ \([^ ]*\).*/\1/'`

# echo -n "Checking ${LSF_BIN_PATH}... " 1>&2

# LSF_BIN_PATH is the directory from which bjobs is run further down.
# The message names the variable literally: expanding it here would print
# an empty string, since this branch is only reached when it is empty.
if [ -z "${LSF_BIN_PATH}" ]; then
    echo "LSF_BIN_PATH not set" 1>&2
    exit 1
fi

# Ask LSF for the jobs of all users, in any state, in the configured queue
# (falling back to the queue named "normal" when LSF_QUEUE_NAME is empty or
# unset).  bjobs diagnostics are discarded on purpose: an unreachable LSF
# simply shows up as an empty result, handled right below.
#
# The listing is kept in a shell variable only; no temporary file is
# needed to filter it.
lsf_stat=$("${LSF_BIN_PATH}/bjobs" -a -u all -q "${LSF_QUEUE_NAME:-normal}" 2>/dev/null)

if [ -z "${lsf_stat}" ] ; then
    echo "bjobs returned empty result" 1>&2
    sleep 60
    exit 0
fi

# Ids of jobs LSF still considers unfinished, one per line.
# Only the STAT column (third field of the default bjobs layout:
# JOBID USER STAT QUEUE ...) is examined, so a job name or user name that
# merely contains "RUN" or "PEND" cannot make a finished job look alive.
pids=$(printf '%s\n' "${lsf_stat}" \
    | awk '$3 ~ /^(PSUSP|USUSP|SSUSP|RUN|PEND)$/ { print $1 }')

# find_jobfile DIR LOCALID
#
# Print the path of the job.*.local file below DIR whose "localid=" line
# carries exactly the numeric id LOCALID (optionally followed by a
# non-digit suffix such as ".host").  Only the first match is printed;
# nothing is printed when LOCALID is not purely numeric or no file matches.
# The match is anchored on both sides so that id 123 does not also select
# a job whose localid is 1234.
find_jobfile() {
  local dir=$1 lid=$2
  case "$lid" in
    '' | *[!0-9]*) return 0 ;;
  esac
  find "$dir" -name 'job.*.local' -print0 \
    | xargs -0 grep -l -E -- "^localid=${lid}([^0-9]|\$)" 2>/dev/null \
    | head -n 1
}

# scan_control_dir DIR
#
# Reconcile the jobs recorded in DIR with the list of active LSF job ids.
# Reads globals: pids (active LSF ids, one per line), my_id (numeric uid of
# the caller), STAT_USERNAME (stat command line), NORDUGRID_LOCATION.
#
# For every recorded job that is in state INLRMS, has no .lrms_done file
# and is no longer listed by LSF:
#   * if its session diag file yields an exit code, write
#     "<code> Executable finished with exit code <code>" to
#     job.<gridid>.lrms_done and run gm-kick on the status file;
#   * otherwise bump a per-job counter in job.<gridid>.lrms_job and, once it
#     exceeds 5, write "-1 Job was lost with unknown exit code" to the
#     .lrms_done file, remove the counter and run gm-kick.
# For every recorded job that LSF still lists, the counter file is removed.
scan_control_dir() {
  local ctr_dir=$1
  local ids id missing_ids live_ids
  local jobfile gridid donefile statusfile status
  local session diagfile username exitcode countfile counter

  # Obtain ids stored in job.*.local (leading digits of each localid= line)
  ids=$(find "$ctr_dir" -name 'job.*.local' -print0 \
    | xargs -0 grep -h '^localid=' 2>/dev/null \
    | sed 's/^localid=\([0-9]*\).*/\1/')
  [ -n "$ids" ] || return 0

  # Split the recorded ids into those LSF still lists and those it does not
  missing_ids=
  live_ids=
  for id in $ids ; do
    if printf '%s\n' "$pids" | grep -q -x -F -- "$id" ; then
      live_ids="$live_ids $id"
    else
      missing_ids="$missing_ids $id"
    fi
  done

  # go through missing ids
  for id in $missing_ids ; do

    # find grid job corresponding to current local id
    jobfile=$(find_jobfile "$ctr_dir" "$id")
    [ -n "$jobfile" ] || continue

    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    if [ -f "$donefile" ] ; then continue ; fi

    statusfile="${ctr_dir}/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi

    status=$(cat "$statusfile")
    if [ ! "$status" = "INLRMS" ] ; then continue ; fi

    # Start every job with a clean slate so a value left over from the
    # previous iteration can never be mistaken for this job's exit code.
    exitcode=

    # get session directory of this job
    session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
    if [ -n "$session" ] ; then
      # have chance to obtain exit code
      diagfile="${session}.diag"

      username=
      if [ "$my_id" = '0' ] ; then
        # NOTE(review): when STAT_USERNAME is 'stat -c %U' its output has no
        # "uid (...)" wrapper, so the grep below filters everything out and
        # the diag file ends up being read directly as root -- confirm
        # whether that is intended before changing it.
        username=$(eval "$STAT_USERNAME \"\$jobfile\"" | grep 'uid' | sed 's/[^(]*(\([^(]*\))/\1/;t leave;s/.*//;:leave')
      fi

      if [ -z "$username" ] ; then
        exitcode=$(grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//')
      else
        # The path is embedded in a command string for su; single-quote it
        # so blanks or shell metacharacters are not interpreted.  A path
        # that itself contains a single quote cannot be quoted this way and
        # is treated as "exit code unknown".
        case "$diagfile" in
          *\'*) exitcode= ;;
          *) exitcode=$(su "${username}" -c "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//') ;;
        esac
      fi

      if [ -n "$exitcode" ] ; then
        # job finished and exit code is known
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        "${NORDUGRID_LOCATION}/libexec/gm-kick" "$statusfile"
        continue
      fi
    fi

    # job has probably finished and exit code is not known
    exitcode='-1'
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ] ; then
      counter=$(cat "$countfile")
      # A damaged counter file must not abort the scan: restart from 0.
      case "$counter" in
        '' | *[!0-9]*) counter=0 ;;
      esac
      # 10# forces base 10 so a value such as "08" is not read as octal.
      counter=$(( 10#$counter + 1 ))
    fi

    if [ "$counter" -gt 5 ] ; then
      rm -f "$countfile"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      "${NORDUGRID_LOCATION}/libexec/gm-kick" "$statusfile"
    else
      echo "$counter" > "$countfile"
    fi

  done # for id in $missing_ids

  # go through existing ids: the job is visible again, reset failure counter
  for id in $live_ids ; do
    jobfile=$(find_jobfile "$ctr_dir" "$id")
    [ -n "$jobfile" ] || continue
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    rm -f "${ctr_dir}/job.${gridid}.lrms_job"
  done
}

# Restore the control directories collected earlier as positional
# parameters.  NOTE: eval re-splits the string, so directory names
# containing blanks or shell metacharacters are not supported here.
eval "set -- $control_dirs"

# Go through directories -- every control directory given on the command
# line, not just the first one.
for ctr_dir in "$@" ; do
  scan_control_dir "$ctr_dir"
done

# Scan complete: wait a minute before reporting success.  Presumably this
# throttles how often the caller re-runs the scan -- confirm against the
# component that launches this script.
sleep 60
exit 0
