/usr/lib/mpich/sbin/tstmachines

#! /bin/sh
#
# This script tests each of the machines in the machines list (by default
# $datadir/machines.<architecture>, or the file given with -machinefile=)
# to make sure that it is accessible and can run programs.  This is only
# a partial test.
#
# In order for this to run in the background, we're careful to use -n 
# in the remote shell commands
#
# Defaults.  verbose is switched on by -v.
verbose=0
# Remote shell command used to reach every machine in the list.
rsh="/usr/bin/rsh"
prefix=/usr/lib/mpich
bindir=/usr/lib/mpich/bin
# Use the same definition as in mpirun.in
datadir=/usr/lib/mpich/share
# Directory searched for the default machines.<architecture> file.
machineDir=$datadir
# Compiler and linker used to build the sample user program.
CC="cc"
CLINKER="cc"

# tst_parse_args ARG...
# Interpret the command line.
#   -echo                  turn on shell tracing (set -x)
#   -v                     set verbose=1
#   -machinefile=FILE      set machineFile to FILE
#   -help                  print the usage text and exit with status 1
#   anything else          taken as the architecture name (the last one wins)
# Globals written: verbose, machineFile, arch (and the loop variable arg).
tst_parse_args() {
    for arg in "$@" ; do
        case $arg in
            -echo) set -x ;;
            -v) verbose=1 ;;
            -machinefile=*)
                # Strip the option prefix with a parameter expansion rather
                # than an unquoted echo | sed pipeline, so that blanks and
                # glob characters in the file name are kept exactly as given.
                machineFile=${arg#-machinefile=}
                ;;
            -help) cat <<.
Usage: 
     tstmachines [ -echo ] [ -v ] [ -machinefile=filename ] architecture
Tests that you can run remote shells with $rsh on the systems in 
your machines file and that executables are all cross-mounted.  To run this
test, you must be able to create files in the current directory.

If there are problems with some of the machines, this test may take several
minutes per problem machine.   If there are no problems, there will be no
output.  The option -v may be used to see what tests are being run.
.
                exit 1
                ;;
            *) arch=$arg ;;
        esac
    done
}
tst_parse_args "$@"
#
# Work out which architecture is being tested.  When none was named on the
# command line, ask the tarch helper in $bindir; without it we cannot go on.
if [ -z "$arch" ] ; then
    if [ -x "$bindir/tarch" ] ; then
        arch=$("$bindir/tarch")
    else
        echo "Could not determine architecture - provide as argument"
        exit 1
    fi
fi
#
# Without -machinefile=, fall back to machines.<arch> in $machineDir.
if [ -z "$machineFile" ] ; then
    machineFile="${machineDir}/machines.${arch}"
fi
# The path is quoted so that a name containing blanks is tested as one file;
# unquoted, the test itself fails with a syntax error and the missing file
# goes unnoticed.
if [ ! -f "$machineFile" ] ; then
    echo "Cannot read list of nodes $machineFile"
    exit 1
fi
# Build the blank-separated host list: drop every line that contains a '#'
# and fold the remaining newlines into single blanks.  The file name is
# quoted so a path containing blanks is read as one file.
list=$(sed -e '/\#/d' "$machineFile" | tr -s '\012' ' ')
# Create a small non-empty probe file in the current directory; the later
# tests look for it from the remote machines.
rm -f mpichfoo
echo "A test" > mpichfoo
if [ ! -s mpichfoo ] ; then
    echo "Could not create test file in this directory. "
    echo "cd to a directory in which you have write privs and rerun this test"
    exit 1
fi
# Use same mechanism as in mpirun to get the value of pwd...
# The condition below tests a non-empty literal, so the first branch is the
# one that runs (NOTE(review): the literal looks like a value filled in when
# this script was generated - confirm against the template).
# All path expansions are quoted so that a working directory whose name
# contains blanks is handled as a single path.
if [ -n "sed -e s@/tmp_mnt/@/@g" ] ; then
    # Candidate path with the automounter prefix removed.
    PWDtest=$(pwd | sed -e s@/tmp_mnt/@/@g)
    if [ ! -d "$PWDtest" ] ; then
        PWDtest=$(pwd)
    fi
    if [ -n "$PWD" ] ; then
        # Check that $PWD and $PWDtest name the same directory: write a file
        # through one name and look for it through the other.
        rm -f "$PWDtest/.mpirtmp" "$PWD/.mpirtmp"
        echo "test" > "$PWD/.mpirtmp"
        if [ ! -s "$PWDtest/.mpirtmp" ] ; then
            rm -f "$PWD/.mpirtmp"
            PWD=$PWDtest
        fi
        rm -f "$PWDtest/.mpirtmp" "$PWD/.mpirtmp"
    else
        PWD=$PWDtest
    fi
else
    PWD=$(pwd)
fi
#
# PWD_TRIAL is the directory name used to build the paths handed to the
# remote machines.
if [ -n "$PWD" ] ; then
    PWD_TRIAL=$PWD
else
    PWD_TRIAL=$PWDtest
fi
if [ ! -d "$PWD_TRIAL" ] ; then
    echo "Warning: your default path uses the automounter; this may"
    echo "cause some problems if you use other NFS-connected systems."
    PWD_TRIAL=$(pwd)
fi
#
# Phase 1: run "true" on every machine through the remote shell.
# Any output at all counts as a failure (access problems, or startup files
# that print something, e.g. stty / who am i).
myprog=$PWD_TRIAL/mpichfoo
errcnt=0
errsimple=0
livelist=""
printedheader=""
for machine in $list ; do
    # A machines-file entry may carry a cluster size as "name:N"; keep only
    # the name.  expr prints either nothing or 0 when there is no match, so
    # both are treated as "no size present".
    ntest=$(expr $machine : '.*:\([0-9]*\)')
    if [ -n "$ntest" ] && [ "$ntest" != "0" ] ; then
        machine=$(expr $machine : '\(.*\):.*')
    fi
    if [ $verbose = 1 ] ; then
        echo "Trying true on $machine ..."
    fi
    output=$($rsh $machine -n true 2>&1)
    # Silence means success: remember the machine and move on.
    if [ -z "$output" ] ; then
        livelist="$livelist $machine"
        continue
    fi
    # The header is printed once, for the first machine that misbehaves.
    if [ -z "$printedheader" ] ; then
        echo "Errors while trying to run $rsh $machine -n true"
        printedheader=1
    fi
    echo "Unexpected response from $machine:"
    echo "--> $output"
    # Check for stty or who am i problems
    if echo $output | grep -i 'am i' >/dev/null ; then
        cat <<'EOF'
You may have a command like
    who am i
in your .login or .cshrc file.  This command can only be
used when a process is attached to a terminal.
See the Users Manual for ways to fix this.
EOF
    fi
    if echo $output | grep -i stty >/dev/null ; then
        cat <<'EOF'
You may have a command like
    stty ....
in your .login or .cshrc file.  This command can only be
used when a process is attached to a terminal.
See the Users Manual for ways to fix this.
EOF
    fi
    # General advice about startup files that produce output
    cat <<'EOF'
If your .cshrc, login, .bashrc, or other startup file
contains a command that generates any output when logging in,
such as fortune or hostname or even echo, you should modify
that startup file to only print such a message when the
process is attached to a terminal.  Examples of how to do
this are in the Users Manual.  If you do not do this, MPICH
will still work, but this script and the test programs will
report problems because they compare expected output from
what the programs produce.
EOF
    errcnt=$((errcnt + 1))
    errsimple=$((errsimple + 1))
done
# Explain the failures once, after all machines have been tried.
if [ $errsimple -gt 0 ] ; then
cat <<EOF
    The test of $rsh <machine> true  failed on some machines.
    This may be due to problems in your .login or .cshrc files; 
    some common problems are described when detected.  Look at the 
    output above to see what the problem is.

    If the problem is something like 'permission denied', then the 
    remote shell command $rsh does not allow you to run programs.
    See the documentation about remote shell and rhosts.

EOF
fi
# Next try running ls
# (Test for consistent filesystem)
#
# tst_ls_phase
# Lists the probe file $PWD_TRIAL/mpichfoo locally and then on every machine
# in $list through $rsh; a machine passes when its output equals the local
# output.
# Globals read:    PWD_TRIAL, list, rsh, verbose
# Globals written: myprog, tstout, errls, livelist, printedheader, machine,
#                  ntest, output; errcnt is incremented, not restarted.
tst_ls_phase() {
    myprog=$PWD_TRIAL/mpichfoo
    # Keep the failures already counted by the preceding test: errcnt feeds
    # the final error report and the exit status, so resetting it here would
    # hide machines that failed earlier.
    errcnt=${errcnt:-0}
    errls=0
    livelist=""
    printedheader=""
    #
    # Get the output form to expect from ls.
    # Use /bin/ls to avoid any alias problems
    tstout=$(/bin/ls "$myprog")
    for machine in $list ; do
        # Strip cluster size ("name:N") from machine name
        ntest=$(expr "$machine" : '.*:\([0-9]*\)')
        if [ -n "$ntest" ] && [ "$ntest" != "0" ] ; then
            machine=$(expr "$machine" : '\(.*\):.*')
        fi
        if [ "$verbose" = 1 ] ; then
            echo "Trying ls on $machine ..."
        fi
        output=$($rsh "$machine" -n /bin/ls "$myprog" 2>&1)
        if [ "$output" != "$tstout" ] ; then
            # The header is printed once, for the first failing machine.
            if [ -z "$printedheader" ] ; then
                echo "Errors while trying to run $rsh $machine -n /bin/ls $myprog"
                printedheader=1
            fi
            echo "Unexpected response from $machine:"
            echo "--> $output"
            errcnt=$((errcnt + 1))
            errls=$((errls + 1))
        else
            livelist="$livelist $machine"
        fi
    done
}
tst_ls_phase
# The probe file is not needed after this test.
rm -f mpichfoo
if [ "$errls" -gt 0 ] ; then
cat <<EOF
    The ls test failed on some machines.
    This usually means that you do not have a common filesystem on 
    all of the machines in your machines list; MPICH requires this
    for mpirun (it is possible to handle this in a procgroup file; see
    the documentation for more details).

    Other possible problems include:
        The remote shell command $rsh does not allow you to run ls.
           See the documentation about remote shell and rhosts.
        You have a common file system, but with inconsistent names.
           See the documentation on the automounter fix.

EOF
fi
#
# Now, try running a simple USER program
# Build a trivial C program with $CC / $CLINKER and start it on every machine
# that passed the previous test; any output counts as a failure.
#
# Start from a clean slate so that an executable or object file left behind
# by an earlier run cannot make a failed build look successful.
rm -f tstfoo tstfoo.c tstfoo.o
# The return type is spelled out: compilers that follow C99 or later may
# reject a definition that relies on implicit int.
cat >tstfoo.c <<.
int main(void){return 0;}
.
# Link only when the compile step succeeded.
if $CC -c tstfoo.c ; then
    $CLINKER -o tstfoo tstfoo.o
fi
if [ ! -x tstfoo ] ; then
    echo "Could not build a sample program using $CC and $CLINKER!"
    rm -f tstfoo tstfoo.c tstfoo.o
    exit 1
fi
myprog=$PWD_TRIAL/tstfoo
# Only the machines that passed the previous test are tried.
list="$livelist"
livelist=""
printedheader=""
erruser=0
for machine in $list ; do
    if [ "$verbose" = 1 ] ; then
        echo "Trying user program on $machine ..."
    fi
    output=$($rsh "$machine" -n "$myprog" 2>&1)
    if [ "$output" != "" ] ; then
        # The header is printed once, for the first failing machine.
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run a simple C program with $rsh $machine -n"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        errcnt=$((errcnt + 1))
        erruser=$((erruser + 1))
    else
        livelist="$livelist $machine"
    fi
done
rm -f tstfoo tstfoo.c tstfoo.o
if [ "$erruser" -gt 0 ] ; then
    cat <<EOF
    The simple program test failed.

    This test tries to run a simple program on the machines in your machines
    list with the command
        $rsh machinename -n program 

    This can fail if you do not have a common filesystem (this should have
    been detected above) or if the remote shell command $rsh does not allow 
    you to run the program on the indicated remote machines.  
        See the documentation about remote shell and rhosts for possible 
    fixes.
    
EOF
fi
#
#
# Final verdict: leave with status 0 unless errors were counted; otherwise
# report the error count, list the machines that passed the last test (if
# any) and leave with status 1.
[ $errcnt -gt 0 ] || exit 0
echo " "
echo "$errcnt errors were encountered while testing the machines list for $arch"
if [ -z "$livelist" ] ; then
    echo "No machines seem to be available!"
else
    echo "Only these machines seem to be available"
    for host in $livelist ; do
        echo "    $host"
    done
fi
exit 1
mpich-bin 1.2.7-10ubuntu1 / usr / lib / mpich / sbin / tstmachines