#!/usr/bin/ksh
##set -x
##-----------------------------------------------------------------------
##Filename:   gg_proc_lag_check.sh
##Purpose:    Check GGS process status and lag, and notify at a defined interval
#------------------------------------------------------------------------  

## Load the login environment.
. ~/.bash_profile

## Positional parameters:
##   $1  ORACLE_SID - instance name; must equal the first field of an
##                    /etc/oratab entry.
##   $2  REPEAT     - compared against the *_count files further down to
##                    decide when an alert is sent again.
ORACLE_SID=$1
REPEAT=$2

if [ -z "$ORACLE_SID" ]; then
  echo "Usage: $0 ORACLE_SID REPEAT" >&2
  exit 1
fi

## Look up ORACLE_HOME (second colon-separated field) for this SID.
## The SID is compared as an exact string (not as a regex) and only the
## first matching entry is used, so duplicate entries cannot produce a
## multi-line value.  Blanks inside the field are stripped.
ORACLE_HOME=$(awk -F: -v sid="$ORACLE_SID" '
  ($1 "") == (sid "") { gsub(/[ \t]/, "", $2); print $2; exit }
' /etc/oratab)

if [ -z "$ORACLE_HOME" ]; then
  echo "Unable to determine ORACLE_HOME for $ORACLE_SID in /etc/oratab. exit." >&2
  exit 1
fi

## GoldenGate install directory; ggsci is started from here later on.
export GGS_HOME=/u03/golden/bin
PATH=${PATH}:${ORACLE_HOME}/bin:${GGS_HOME}
export ORACLE_SID
export ORACLE_HOME
export LD_LIBRARY_PATH=${ORACLE_HOME}/lib:${GGS_HOME}:/usr/dt/lib:/usr/ucblib
export SHLIB_PATH=${ORACLE_HOME}/lib:${GGS_HOME}
export PATH
## Address passed to "mailx -r" on every alert sent by this script.
export DL_LIST='onstardbteam@hp.com'

## Build FAILURE_CONTACT, the recipient list handed to mailx.
## dba_pagers.txt holds comma-separated addresses; commas are turned into
## blanks so the value word-splits into one mailx argument per address.
## tmp_pagers.txt is still (re)written with the converted list, as before.
gg_pager_src="$HOME/scripts/dba_pagers.txt"
gg_pager_tmp="$HOME/scripts/tmp_pagers.txt"

## -f: a missing temp file (e.g. on the very first run) is not an error.
rm -f "$gg_pager_tmp"

FAILURE_CONTACT=
if [ -r "$gg_pager_src" ]; then
  tr ',' ' ' < "$gg_pager_src" > "$gg_pager_tmp"
  FAILURE_CONTACT=$(cat "$gg_pager_tmp")
else
  echo "WARNING: cannot read $gg_pager_src" >&2
fi
export FAILURE_CONTACT

## Without recipients every later mailx call has nobody to send to; make
## that visible instead of failing silently.
if [ -z "$FAILURE_CONTACT" ]; then
  echo "WARNING: FAILURE_CONTACT is empty, alerts cannot be delivered" >&2
fi


## Directory holding the ggsci report and the *_count state files, and the
## directory holding ggs_not_mon_proc.txt.
LOGDIR=/u03/golden/scripts/logs
SCRIPTDIR=/u03/golden/scripts

## Report file names.  LOGFILE receives the ggsci "info all" output;
## LOGFILE_LAG is only defined here and not referenced in the visible script.
LOGFILE=ggs_check.log
LOGFILE_LAG=ggs_lag.log

export LOGDIR SCRIPTDIR LOGFILE LOGFILE_LAG

### LAG_CHK_MIN is the minimum replication/checkpoint lag threshold in minutes.
### Can be modified based on business needs.
LAG_CHK_MIN=60

##if [ ! -d ${LOGDIR} ]; then
##   mkdir -p $LOGDIR
##fi

##ALL Process CheckUp
## Capture the output of the ggsci "info all" command in ${LOGDIR}/${LOGFILE};
## every later check parses that file.
## ggsci is started as ./ggsci, so a failed cd would run nothing and leave
## an empty report that looks like "no problems"; stop instead.
cd "$GGS_HOME" || {
  echo "Cannot cd to GGS_HOME ($GGS_HOME). exit." >&2
  exit 1
}

./ggsci > "${LOGDIR}/${LOGFILE}" << EOF
info all
exit
EOF

## An empty or missing report means ggsci did not run; analysing it would
## silently report every process as healthy.
if [ ! -s "${LOGDIR}/${LOGFILE}" ]; then
  echo "ggsci produced no output in ${LOGDIR}/${LOGFILE}. exit." >&2
  exit 1
fi

##-----------------------------------------------------------------------
## Process status, manager status and lag checks.
##
## Parses the report in ${LOGDIR}/${LOGFILE}.  As in the rest of this
## script, whitespace-separated field 3 is taken as the group name, field 4
## as the lag and field 5 as the checkpoint age (HH:MM:SS); field 1/2 are
## assumed to be program type and status (ggsci "info all" layout - confirm
## against your GoldenGate version).
##
## Alerts are mailed to FAILURE_CONTACT for:
##   * groups whose status contains STOP  -> "<group> Process DOWN"
##   * groups whose status contains ABEND -> "<group> Error"
##   * a MANAGER line containing STOP     -> "Manager DOWN"
##   * lag / checkpoint age greater than LAG_CHK_MIN minutes
##
## Each condition keeps a counter file in LOGDIR.  An alert is sent on first
## detection and then once every REPEAT runs while the condition lasts; the
## counter is removed when the condition clears.
##
## there may be times some process stopped/abend, put those process name into
## the ggs_not_mon_proc.txt file and we will not get warning on them!
##
## Helpers use gg_-prefixed global variables because ksh has no "local".
##-----------------------------------------------------------------------

gg_host=$(hostname)

## gg_is_ignored GROUP
## Succeeds when GROUP appears as a whole word in ggs_not_mon_proc.txt.
## Fixed-string, whole-word matching keeps EXT1 from matching EXT10.
gg_is_ignored() {
  [ -n "$1" ] || return 1
  [ -f "${SCRIPTDIR}/ggs_not_mon_proc.txt" ] || return 1
  grep -qwF -- "$1" "${SCRIPTDIR}/ggs_not_mon_proc.txt"
}

## gg_alert_due COUNTER_FILE
## Updates COUNTER_FILE and returns 0 when an alert must be sent on this
## run, 1 when it must be suppressed.
##   - no counter (or an empty/corrupt one): write 1, alert
##   - counter >= REPEAT: restart the counter at 1, alert
##   - otherwise: increment, stay silent
## Restarting at 1 (instead of deleting the file) avoids a second alert on
## the very next run.  A missing or non-numeric REPEAT never re-alerts.
gg_alert_due() {
  gg_seen=
  if [ -f "$1" ]; then
    gg_seen=$(cat "$1" 2>/dev/null)
  fi
  case "$gg_seen" in
    '' | *[!0-9]*)
      echo "1" > "$1"
      return 0
      ;;
  esac
  case "$REPEAT" in
    '' | *[!0-9]*) ;;
    *)
      if [ "$gg_seen" -ge "$REPEAT" ]; then
        echo "1" > "$1"
        return 0
      fi
      ;;
  esac
  expr "$gg_seen" + 1 > "$1"
  return 1
}

## gg_send_alert SUBJECT
## Mails standard input to FAILURE_CONTACT with the given subject.
## Callers must always supply stdin (pipe or redirect) so that mailx never
## consumes the input of an enclosing "while read" loop.
gg_send_alert() {
  # FAILURE_CONTACT is deliberately unquoted: it is a blank-separated list.
  mailx -s "$1" -r "$DL_LIST" $FAILURE_CONTACT
}

## gg_group_lines GROUP
## Prints the report lines whose group field is exactly GROUP.
gg_group_lines() {
  awk -v grp="$1" '($3 "") == (grp "")' "${LOGDIR}/${LOGFILE}"
}

## gg_lag_exceeded HH:MM:SS
## Succeeds when the value is strictly greater than LAG_CHK_MIN minutes.
## The whole duration is converted to seconds, so hours are honoured for
## any threshold.  Malformed values (e.g. non-numeric) count as "not exceeded".
## awk does the arithmetic so fields like 08/09 are always decimal.
gg_lag_exceeded() {
  printf '%s\n' "$1" | awk -F: -v limit_min="$LAG_CHK_MIN" '
    NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ && $3 ~ /^[0-9]+$/ {
      if ($1 * 3600 + $2 * 60 + $3 > limit_min * 60) over = 1
    }
    END { exit (over ? 0 : 1) }'
}

## gg_check_threshold GROUP HH:MM:SS KIND LABEL
## KIND is the counter suffix ("lag" or "chk"); LABEL is the subject tail.
## Over threshold: alert (throttled) unless GROUP is on the ignore list.
## Under threshold: remove the counter.
gg_check_threshold() {
  gg_counter="${LOGDIR}/${1}_${3}_count"
  if gg_lag_exceeded "$2"; then
    if ! gg_is_ignored "$1"; then
      if gg_alert_due "$gg_counter"; then
        gg_send_alert "GGS ${gg_host} : $1 $4" < "${LOGDIR}/${LOGFILE}"
      fi
    fi
  else
    rm -f "$gg_counter"
  fi
}

##if stopped, send info, if abend, send error message in email
gg_check_processes() {
  awk 'NF >= 3 { st = toupper($2); if (st ~ /STOP|ABEND/) print st, $3 }' \
    "${LOGDIR}/${LOGFILE}" |
  while read -r gg_status gg_group; do
    if gg_is_ignored "$gg_group"; then
      continue
    fi
    case "$gg_status" in
      *STOP*)
        if gg_alert_due "${LOGDIR}/${gg_group}_down_count"; then
          gg_group_lines "$gg_group" |
            gg_send_alert "GGS ${gg_host} : ${gg_group} Process DOWN"
        fi
        ;;
      *ABEND*)
        if gg_alert_due "${LOGDIR}/${gg_group}_abend_count"; then
          gg_rpt="${GGS_HOME}/dirrpt/${gg_group}.rpt"
          {
            # Status line first, so the mail body is never empty.
            gg_group_lines "$gg_group"
            if [ -r "$gg_rpt" ]; then
              grep -i "ORA-" "$gg_rpt" | grep -v "last error"
            else
              echo "(report file $gg_rpt not readable)"
            fi
          } | gg_send_alert "GGS ${gg_host} : ${gg_group} Error"
        fi
        ;;
    esac
  done
}

## Remove the down/abend counters of every group reported as RUNNING.
gg_clear_recovered() {
  awk 'NF >= 3 && toupper($2) ~ /RUNNING/ { print $3 }' "${LOGDIR}/${LOGFILE}" |
  while read -r gg_group; do
    rm -f "${LOGDIR}/${gg_group}_down_count" "${LOGDIR}/${gg_group}_abend_count"
  done
}

##If manager process down, send email!
gg_check_manager() {
  if grep -i 'STOP' "${LOGDIR}/${LOGFILE}" | grep -qi 'MANAGER'; then
    if gg_alert_due "${LOGDIR}/mgr_down_count"; then
      grep -i 'MANAGER' "${LOGDIR}/${LOGFILE}" |
        gg_send_alert "GGS ${gg_host} : Manager DOWN"
    fi
  else
    rm -f "${LOGDIR}/mgr_down_count"
  fi
}

##Check Process Lag!
gg_check_lag() {
  awk 'NF >= 5 && toupper($1) ~ /^(EXTRACT|REPLICAT)$/ { print $3, $4, $5 }' \
    "${LOGDIR}/${LOGFILE}" |
  while read -r gg_group gg_lag gg_since; do
    ## Replication lagging check
    gg_check_threshold "$gg_group" "$gg_lag" lag \
      "LAG Alert (> ${LAG_CHK_MIN} Minutes)"
    ## Checkpoint lagging check
    gg_check_threshold "$gg_group" "$gg_since" chk \
      "CHECKPOINT Alert (> ${LAG_CHK_MIN} minutes)"
  done
}

if [ -f "${LOGDIR}/${LOGFILE}" ]; then
  gg_check_processes
  gg_clear_recovered
  gg_check_manager
  gg_check_lag
else
  echo "Report ${LOGDIR}/${LOGFILE} not found; no checks performed." >&2
fi
## Reaching this point means all checks ran.  A bare "exit" would return the
## status of the last command in the lag loop (e.g. an rm of a counter file
## that does not exist), reporting failure on a perfectly healthy run.
exit 0