Our SAS Grid nodes automatically close and stop accepting new jobs when server memory reaches the threshold. During this time the Object Spawners hang, because they continuously try to submit jobs to the Grid nodes and fail. I use the script below to restart the Object Spawners whenever they hang. I thought sharing it might be helpful to other administrators.
#!/bin/bash
# Configuration
# Description
# This ObjectSpawn_restart.sh script is used to restart the Object Spawners across multiple Grid nodes sequentially without any outage.
#######################################
# Pick the Grid node list and the Object Spawner directory for the
# environment that a host belongs to.
# Globals:   servers   (written)           - array of nodes to restart
#            ScriptDir (written, exported) - Object Spawner directory
# Arguments: $1 - hostname to classify
# Outputs:   a warning on stderr when the hostname matches no environment
# Returns:   0 when an environment matched; 1 otherwise, with servers
#            left empty so that nothing gets restarted
#######################################
select_grid_environment() {
  local host=$1

  # NOTE(review): the three patterns below are identical, so only the first
  # arm can ever match. They look like placeholders; replace each with the
  # real hostname pattern of its environment. The "....." entry in the DEV
  # list is likewise a placeholder for the remaining node names.
  case "${host}" in
    Gridnodes*) # SAS Grid TEST Environment
      servers=(Gridnode1 Gridnode2) # Gridnodes where Object Spawners run
      export ScriptDir="/../../../config/Lev1/ObjectSpawnerGrid" # ObjectSpawner Directory
      ;;
    Gridnodes*) # SAS Grid DEV Environment
      servers=(Gridnode1 Gridnode2 Gridnode3 ..... Gridnoden)
      export ScriptDir="/../../..//config/Lev1/ObjectSpawnerGrid"
      ;;
    Gridnodes*) # SAS GRID PROD Environment
      servers=(Gridnode1 Gridnode2 Gridnode3)
      export ScriptDir="/../../../config/Lev1/ObjectSpawnerGrid"
      ;;
    *)
      # Previously an unknown host fell through silently with nothing set.
      # Make that visible, and make the "no nodes" outcome explicit.
      servers=()
      echo "No SAS Grid environment is configured for host '${host}'; nothing to restart." >&2
      return 1
      ;;
  esac
}

# Classify the machine this script is running on.
Machine=$(/bin/hostname)
select_grid_environment "${Machine}"
# Configuration.
# ssh client used to reach the Grid nodes.
SSH="/usr/share/centrifydc/bin/ssh"

#######################################
# Kill the Object Spawner processes on one node, start a fresh spawner and
# report its PID.
# Globals:   SSH (read)       - ssh client to run remote commands with
#            ScriptDir (read) - Object Spawner directory on the node
# Arguments: $1 - node to restart the Object Spawner on
# Outputs:   "New PID: <pid>" on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if ScriptDir is empty or the PID file
#            could not be read
#######################################
restart_object_spawner() {
  local node=$1
  local remote_dir pids new_pid

  # An empty pattern would match every line of the ps listing and hand every
  # process on the node to kill -9, so refuse to continue without a directory.
  if [[ -z "${ScriptDir:-}" ]]; then
    echo "ScriptDir is not set; refusing to restart Object Spawner on ${node}." >&2
    return 1
  fi

  # ssh joins its command arguments with spaces and passes the result to the
  # remote shell, so the directory is shell-escaped before being embedded.
  printf -v remote_dir '%q' "${ScriptDir}"

  # ps runs on the node; the filters run locally. -F matches the path
  # literally rather than as a regex, and "grep -v grep" drops the local grep
  # itself when the node is the machine this script runs on.
  pids=$("${SSH}" "${node}" ps -ef \
    | grep -F -- "${ScriptDir}" \
    | grep -v grep \
    | awk '$2 ~ /^[0-9]+$/ {print $2}')

  if [[ -n "${pids}" ]]; then
    # SIGKILL is kept from the original script, which targets hung spawners.
    # Word splitting is intentional here: one argument per PID.
    # shellcheck disable=SC2086
    "${SSH}" "${node}" kill -9 ${pids}
  fi
  sleep 3

  # Detach and silence the spawner on the remote side. The original placed
  # ">/dev/null 2>&1" after "&", where it is a separate empty command and
  # redirects nothing. With all three streams redirected remotely, ssh
  # returns immediately and no local background ssh process is left behind.
  "${SSH}" "${node}" "nohup ${remote_dir}/ObjectSpawner.sh start </dev/null >/dev/null 2>&1 &"
  sleep 3

  # The escaped \$( ... ) is evaluated on the node. The original backticks ran
  # cat locally, reading this machine's PID file for every node.
  # NOTE(review): assumes the PID file is named after the node's own
  # /bin/hostname output, as the original assumed locally - confirm.
  if new_pid=$("${SSH}" "${node}" "cat ${remote_dir}/server.\$(/bin/hostname).pid"); then
    echo "New PID: ${new_pid}"
  else
    echo "Could not read the Object Spawner PID file on ${node}." >&2
    return 1
  fi
}

# Restart the nodes one at a time so the remaining spawners keep serving.
for num in "${servers[@]}"; do
  restart_object_spawner "${num}"
done