#!/bin/ksh
#
# $Id: pnd_arc_extract.sh,v 1.1 2023/03/27 16:07:05 root Exp $
#
# The following code is Confidential and is covered by the installation license
# (c) Copyright Fortra, LLC. and its group of companies.
#
#&& Performance Navigator Data archive extractor (pnd_arc_extract.sh) 
#& Performs extract of portion of mpgdata2.hostname_archive.gz 
#& File name of extract is mpgdata2.hostname_archive_YYYYMM (single month)
#& or mpgdata2.hostname_archive_YYYYMM_to_YYYYMM (range of months)
#
#&@ pndcutils.sh is also required
#
#&% Run as needed with options to extract data from archive
#&% with dates between YYYYMM and YYYYMM
#
# Initial Created 10/21/11
#

# Make sure the scratch directory exists.
mkdir -p /tmp/helpsystems_tmp

# Directory the script was started from; the helper library and the
# configuration file are looked up next to the script.
invocdir=$(dirname "$0")

# pndcutils.sh is required: source it, or stop right here when it is
# missing or empty (the message always announced an exit, so do exit).
if [ -s "$invocdir/pndcutils.sh" ] ; then
   . "$invocdir/pndcutils.sh"
else
   echo "pndcutils.sh file does not exist at $invocdir, (Required) exiting " >&2
   exit 1
fi

# pn.config is optional: source it when present, otherwise only report
# that it was not found next to the script.
if [ -s "$invocdir/pn.config" ] ; then
   . "$invocdir/pn.config"
else
   echo "pn.config file does not exist at $invocdir"
fi

# Three environment variables describe the directory layout:
#   installdir
#   mpgdatadir
#   nmondatadir
# Each one should already be defined at this point; any that is unset or
# empty falls back to the invocation directory.
installdir=${installdir:-$invocdir}
mpgdatadir=${mpgdatadir:-$invocdir}
nmondatadir=${nmondatadir:-$invocdir}

# Work inside the data directory when it exists.  Quoting keeps a path with
# blanks intact and stops an empty value from turning the test into "[ -d ]".
if [ -d "$mpgdatadir" ] ; then
   cd "$mpgdatadir" || echo "Warning: cannot change to $mpgdatadir, staying in $(pwd)" >&2
fi

# Below used for testing only
# set > /tmp/helpsystems_tmp/pnd_arc_extract_set.hsllc

###############################
###############################

# usage - write the command line help to standard output.
# Takes no arguments.  It only prints; the caller decides whether to exit.
function usage {
cat <<'EOF'
USAGE:
   pnd_arc_extract.sh [options]
   pnd_arc_extract.sh [-f filename_archive.gz] -s startdate -e enddate
In order to pull January of 2009 an example syntax is:
   pnd_arc_extract.sh -s 200901 -e 200901
In order to pull the first 6 months of 2009 an example syntax is:
   pnd_arc_extract.sh -s 200901 -e 200906
Defaults to: 'mpgdata2.hostname_archive.gz' for input filename
and outputs to either
   mpgdata2.hostname_archive_YYYYMM or
   mpgdata2.hostname_archive_YYYYMM_to_YYYYMM
Options are any of:
   -D              - debug
   -s startdate    - Starting month to pull from archive in form YYYYMM
   -e enddate      - Ending month to pull from archive in form YYYYMM
   -f file2process - Gzipped archive. If not specified,
                     default is mpgdata2.hostname_archive.gz
   -t test         - Test only, does not actually create a file pulled from
                     the archive
   -h help         - This help

EOF
}
   
# Parse the command line.
#   debug          "T" when -D was given
#   opt_startdate  value of -s
#   opt_enddate    value of -e
#   opt_datafile   value of -f
#   jtest          "T" when -t was given
#   passopts       the -s/-e/-f options that were supplied, in the order given
passopts=""
while getopts "Ds:e:f:th" opt ; do
   case $opt in
      D)      debug="T" ;;
      s)      opt_startdate=$OPTARG ; passopts="$passopts -s $OPTARG" ;;
      e)      opt_enddate=$OPTARG ; passopts="$passopts -e $OPTARG" ;;
      f)      opt_datafile=$OPTARG ; passopts="$passopts -f $OPTARG" ;;
      t)      jtest="T" ;;
      h)      usage ; exit 0 ;;
      # Unknown option or missing argument: show the help on stderr and
      # report failure instead of exiting with status 0.
      *)      usage >&2 ; exit 1 ;;
   esac
done

# With -D, echo what was parsed.
# NOTE(review): jhelp is not assigned anywhere in this script; the line is
# kept so the debug output stays unchanged.
if [ "$debug" = "T" ]; then
   echo "debug=($debug)"
   echo "opt_startdate=($opt_startdate)"
   echo "opt_enddate=($opt_enddate)"
   echo "opt_datafile=($opt_datafile)"
   echo "jtest=($jtest)"
   echo "jhelp=($jhelp)"
fi

###############################
###############################

# Choose the archive to read: the -f argument wins, otherwise the default
# "<def_datafile>_archive.gz" is used.
# NOTE(review): def_datafile is not set in this file; presumably it comes
# from pndcutils.sh or pn.config - confirm.
if [ -n "$opt_datafile" ]; then
   jdatafile=$opt_datafile
else
   jdatafile=${def_datafile}_archive.gz
fi

# jarcfile  - the gzipped archive exactly as named
# jdatafile - the same name without a trailing ".gz"; used as the prefix of
#             every report and work file written below.
# The suffix is removed with a literal match; the former unquoted
# "sed s,\.gz$,," lost its backslash to the shell and stripped any
# character followed by "gz".
jarcfile=$jdatafile
jdatafile=${jdatafile%.gz}

# Requested month range (YYYYMM) as given with -s / -e
jsfirst=$opt_startdate
jslast=$opt_enddate

#  The next two variables can be set manually to force a date range
#  jsfirst="201007"
#  jslast="201007"

echo "Processing archive file (${jarcfile}) and validating data."

# The archive has to exist and be non-empty.  The operand is quoted so a
# name containing blanks (or an empty name) cannot change what the test does.
if [ ! -s "${jarcfile}" ] ; then
   echo "### Error ###  Cannot find file specified ($jarcfile)"
   echo ""
   exit 1
fi

# Drop the error summary left behind by an earlier run; it is only
# appended to further down.
rm -f "${jdatafile}.data_errors"

echo "Review details in: ${jdatafile}.dv_rpt"
# echo $invocdir
# wc -l $jdatafile

# ----------------------------------------------------------------------
# Validate the archive and write the record map ${jdatafile}.dv_rpt.
#
# Both gunzip passes read ${jarcfile}, the file whose existence was checked
# and which the later extraction steps read as well.  (The earlier form
# rebuilt the name as "${jdatafile}.gz", which is a different file whenever
# the -f argument does not end in ".gz".)
#
# Pass 1 counts the lines of the uncompressed archive (jreccnt).
# Pass 2 keeps only the marker lines (ZZZZ timestamp lines, selected AAA
# header lines, DISKBSIZE and TOP lines), prefixed by egrep -n with their
# line number; sed turns the first ":" into "," so that awk, splitting on
# ",", sees
#     $1 = line number in the archive     $2 = record tag (AAA, ZZZZ, ...)
#
# awk writes space separated lines to the report:
#     HDR_REC first last hdr_datetime status
#     SUB_REC first last sub_datetime status count
#     DBF_REC first last hdr_datetime status count
# where first/last are archive line numbers, the datetimes are
# YYYYMMDDHHMMSS strings, and status is "Ok" or an Error_* keyword.
# ----------------------------------------------------------------------
jreccnt=$(gunzip -c "${jarcfile}" | wc -l | awk '{print $1}')

gunzip -c "${jarcfile}" \
   | egrep -n "^ZZZZ,T[0-9][0-9][0-9][0-9],[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9]-[JFMASOND][AEPUCO][NBRYLGPTVC]-[0-9][0-9][0-9][0-9]|^AAA\,(progname|date|time|note|command)|^DISKBSIZE|^TOP" \
   | sed 's/:/,/' \
   | awk -v "jrecs=$jreccnt" '

# dtconv: convert DD-MON-YYYY (12-AUG-2009) to YYYYMMDD (20090812).
# The month lookup table mdigit[] is built once in BEGIN.
function dtconv (date_in) {
   m = toupper(substr(date_in,4,3))
   out_date = substr(date_in,8,4) sprintf("%02d",mdigit[m]) substr(date_in,1,2)
   return out_date
}

# close_dbf: finish the data file span that is currently open.
# Prints the pending SUB_REC (if any sub records were counted), derives the
# DBF status from the sub record count, prints the DBF_REC line and resets
# the counters and status flags for the next span.
# 288 is the expected maximum number of sub records per span (12*24, as
# stated in the data_errors text written later in this script).
function close_dbf () {
   if (cntsubrec != 0)
      print "SUB_REC", bsubrec, esubrec, subrecboth, errsubrec, cntsubrec
   if (cntsubrec == 0)
      errdbfrec = "Error_NoRecords"
   if (cntsubrec > 288)
      errdbfrec = "Error_RecCntHi"
   print "DBF_REC", bdbfrec, edbfrec, newhdrboth, errdbfrec, cntsubrec
   cntsubrec = 0
   errsubrec = "Ok"
   errdbfrec = "Ok"
}

BEGIN {
   FS = ","
   split("JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC", month, " ")
   for (i = 1; i <= 12; i++) mdigit[month[i]] = i
   errhdrrec = "Ok"
   errsubrec = "Ok"
   errdbfrec = "Ok"
   bdbfrec = 1          # first line of the current DBF span
   bhdrrec = 1          # first line of the current header
   findhdrrec = "F"     # "T" while inside a header
   findsubrec = "F"     # "T" once the start of a sub record was noted
   cntsubrec = 0        # sub records counted in the current span
}

# A header (AAA line that does not contain "nmon.") begins while the open
# span still starts at line 1.  Unless this is the very first filtered
# line, the previous span is closed first.  The sequence check here
# compares dates only: the last sub record has to carry the header date or
# the header date plus one.
$2 == "AAA" && findhdrrec != "T" && bdbfrec == 1 && index($0,"nmon.") == 0 {
   edbfrec = $1 - 1
   esubrec = edbfrec
   findhdrrec = "T"
   findsubrec = "F"
   if (NR != 1) {
      esubrec = edbfrec
      if (subrecdate != newhdrdate && subrecdate != newhdrdate + 1) {
         errsubrec = "Error_subrec_sequence"
         errdbfrec = "Error"
      }
      close_dbf()
   }
   bdbfrec = $1
   bhdrrec = $1
}

# Same event when the open span does not start at line 1.  The sequence
# check compares the full datetime strings: a sub record stamped earlier
# than its header is flagged.
$2 == "AAA" && findhdrrec != "T" && bdbfrec != 1 && index($0,"nmon.") == 0 {
   edbfrec = $1 - 1
   esubrec = edbfrec
   findhdrrec = "T"
   findsubrec = "F"
   if (subrecboth < newhdrboth) {
      errsubrec = "Error_subrec_sequence"
      errdbfrec = "Error"
   }
   close_dbf()
   bdbfrec = $1
   bhdrrec = $1
}

# (A rule keyed on AAA,progname used to sit here; it was disabled.)

# AAA,date: header date.  newhdrboth joins it with the header time seen so
# far; the sub record counter restarts.
$2 == "AAA" && $3 == "date" {
   hdrdate = $5
   newhdrdate = dtconv($4)
   newhdrboth = newhdrdate newhdrtime
   findhdrrec = "T"
   findsubrec = "F"
   cntsubrec = 0
}

# AAA,time: header time, stored as HHMMSS.
$2 == "AAA" && $3 == "time" {
   hdrtime = $4
   newhdrtime = substr($4,1,2) substr($4,4,2) substr($4,7,2)
}

# ZZZZ,T0001: first sub record of a span.  The header ends on the line
# before it, so the HDR_REC line is printed now.
$2 == "ZZZZ" && $3 == "T0001" {
   subrecdate = dtconv($5)
   subrectime = substr($4,1,2) substr($4,4,2) substr($4,7,2)
   subrecboth = subrecdate subrectime
   ehdrrec = $1 - 1
   findhdrrec = "F"
   bsubrec = $1
   print "HDR_REC", bhdrrec, ehdrrec, newhdrboth, errhdrrec
   errhdrrec = "Ok"
   cntsubrec = cntsubrec + 1
}

# Any other ZZZZ line: the previous sub record ends on the line before it.
# That record is checked and printed with its own timestamp, then the
# timestamp of the new sub record is taken over.
$2 == "ZZZZ" && $3 != "T0001" {
   findhdrrec = "F"
   esubrec = $1 - 1
   if (subrecboth < newhdrboth) {
      errsubrec = "Error_subrec_sequence"
      errdbfrec = "Error"
   }
   if (cntsubrec != 0)
      print "SUB_REC", bsubrec, esubrec, subrecboth, errsubrec, cntsubrec
   subrecdate = dtconv($5)
   subrectime = substr($4,1,2) substr($4,4,2) substr($4,7,2)
   subrecboth = subrecdate subrectime
   errsubrec = "Ok"
   cntsubrec = cntsubrec + 1
   bsubrec = $1
}

# (A rule for DISKBSIZE / TOP lines used to sit here; it was disabled.
#  Those lines still pass the egrep filter and count towards NR, but no
#  active rule acts on them.)

# First ZZZZ line after a header: note where the sub records begin.
$2 == "ZZZZ" && findsubrec != "T" {
   bsubrec = $1
   findhdrrec = "F"
   findsubrec = "T"
}

# AAA line containing "nmon." while no sub record start has been noted.
$2 == "AAA" && findsubrec != "T" && index($0,"nmon.") != 0 {
   bsubrec = $1
   findhdrrec = "F"
   findsubrec = "T"
   cntsubrec = 0
}

END {
   # jrecs is the line count of the uncompressed archive (passed in with
   # -v); the last span runs to the end of the file.
   edbfrec = jrecs
   esubrec = edbfrec
   if (subrecboth < newhdrboth) {
      errsubrec = "Error_subrec_sequence"
      errdbfrec = "Error"
   }
   close_dbf()
}
### Output to datafilename+.dv_rpt ###
' > "${jdatafile}.dv_rpt"

# Look for timestamps that occur an unexpected number of times in the
# report (field 4 of each line):
#   - on SUB lines every timestamp is expected once;
#   - on HDR/DBF lines every timestamp is expected twice.
# Anything else is written to ${jdatafile}.dup_recs as "count timestamp".
egrep "SUB" "${jdatafile}.dv_rpt" | awk '{print $4}' | sort -n | uniq -c | grep -v " 1 " | awk '{print $1,$2}' > "${jdatafile}.dup_recs"
egrep "HDR|DBF" "${jdatafile}.dv_rpt" | awk '{print $4}' | sort -n | uniq -c | grep -v " 2 " | awk '{print $1,$2}' >> "${jdatafile}.dup_recs"

if [ -s "${jdatafile}.dup_recs" ]; then
   # For every suspicious timestamp, collect the report lines mentioning it.
   rm -f "${jdatafile}.dup_recs_details"
   while read -r tmpcnt dttime
   do
      {
         echo "(${dttime})"
         # The pattern is quoted: an empty value must not make grep take
         # the report name as the pattern and read the loop input instead.
         grep "${dttime}" "${jdatafile}.dv_rpt"
         echo ""
      } >> "${jdatafile}.dup_recs_details"
   done < "${jdatafile}.dup_recs"
   echo "There may be duplicate, or problem records."
   echo "Please check ${jdatafile}.dup_recs  and  ${jdatafile}.dup_recs_details"
else
   rm -f "${jdatafile}.dup_recs" "${jdatafile}.dup_recs_details"
fi


# List DBF records in the order they appear in the archive
grep DBF "${jdatafile}.dv_rpt" > "${jdatafile}.dv_rpt_DBF"


# List DBF records sorted numerically by the 4th field (the header
# datetime).  The list written above is sorted directly instead of running
# the same grep a second time.  The sort key syntax depends on the platform.
uname_s=$(uname -s)
if [ "${uname_s}" = "Linux" ]
then
   # Sort command for Linux to sort by 4th field
   sort -n -k +4 "${jdatafile}.dv_rpt_DBF" > "${jdatafile}.dv_rpt_DBF_sort"
else
   # Sort command for SunOS or AIX to sort by 4th field
   sort -n +3 "${jdatafile}.dv_rpt_DBF" > "${jdatafile}.dv_rpt_DBF_sort"
fi

# Identify errors that are in the record report: every report line that
# does not contain "Ok" is copied to a work file.
grep -v Ok "${jdatafile}.dv_rpt" > "${jdatafile}.data_errors.tmp2"

# When such lines exist, append an explanation followed by the lines
# themselves to ${jdatafile}.data_errors (one grouped redirection).
if [ -s "${jdatafile}.data_errors.tmp2" ]; then
   {
      echo  '---------------------------'
      echo  "Errors were found with the data in the report file ${jdatafile}.dv_rpt"
      echo  "Most likley cause is nmon runnning multiple times simultaneously."
      echo  "Normal counts are 288(12*24). During daylight savings in early "
      echo  "November they could be 300, for one day when time rolls back"
      echo  "Or this could be from causes unknown. "
      echo  '---------------------------'
      echo  'Details:'
      echo  '---------------------------'
      cat "${jdatafile}.data_errors.tmp2"
      echo  ''
      echo  ''
   } >> "${jdatafile}.data_errors"
fi

# Remove the sorted file left over from previous runs of this script.
rm -f "${jdatafile}.sorted"

# If both DBF lists are the same, the archive is sequenced correctly.
# Otherwise, create a new gzipped file with the DBF spans in date order.
diff "${jdatafile}.dv_rpt_DBF" "${jdatafile}.dv_rpt_DBF_sort" > /dev/null
jdiffchk=$?
# Uncommenting the line below will force a resort failure
# echo "Remove Later"; jdiffchk=1
if [[ ${jdiffchk} -ne 0 ]] ; then
   echo "${jdatafile} is out of date sequence."
   jsequence="OutOfSequence"
   echo "Creating a correctly indexed file ${jdatafile}.sorted "
   rm -f "${jdatafile}.sorted.gz"
   touch "${jdatafile}.sorted.gz"

   # Reduction: spans that directly follow each other in the archive
   # (next first line = previous last line + 1) are merged into a single
   # "first last" pair, so fewer passes over the archive are needed.
   awk 'NR == 1 { jbegrec = $2; jendrec = $3; next }
        $2 - jendrec == 1 { jendrec = $3; next }
        { print jbegrec, jendrec; jbegrec = $2; jendrec = $3 }
        END { print jbegrec, jendrec }' "${jdatafile}.dv_rpt_DBF_sort" > "${jdatafile}.dv_rpt_DBF_sort_reduced"

   # Copy each line range, in sorted order, to the end of the new file.
   # Every range is compressed on its own and appended.  awk is used for
   # the range cut (sed and perl variants were tried earlier and noted as
   # slower).
   while read -r jRecBegin jRecEnd
   do
      gunzip -c "${jarcfile}" \
         | awk -v jbegin="${jRecBegin}" -v jend="${jRecEnd}" 'NR > jend { exit } NR >= jbegin { print $0 }' \
         | gzip -c >> "${jdatafile}.sorted.gz"
   done < "${jdatafile}.dv_rpt_DBF_sort_reduced"

   # Ensure the date is the same for the original and corrected file
   touch -r "${jarcfile}" "${jdatafile}.sorted.gz"
   echo "If the size is slightly smaller, there may have been invalid data."
   echo "If the size is slightly larger, this could be caused by gzip in auto."
   echo "please check the file ${jdatafile}.dv_rpt for lines without OK"
   # The sed script is quoted so its "*" can never be taken as a file glob.
   ls -al "${jdatafile}.sorted.gz" "${jarcfile}" | sed 's,  *, ,g'
   echo "Please check the Word counts below."
   echo "Word counts $(gunzip -c "${jarcfile}" | wc) for ${jarcfile}"
   echo "Word counts $(gunzip -c "${jdatafile}.sorted.gz" | wc) for ${jdatafile}.sorted.gz "
   echo "The sorted data should be copied over the actual data once confirmed."
   echo "Command required:"
   echo "   mv ${jdatafile}.sorted.gz ${jarcfile}"
   # We do not want to fix automatically, because of possible problems.
   # How do we want to handle totally invalid data that may be in the datafile?
else
   echo "${jdatafile} is sequenced correctly."
   jsequence="Ok"
   # The rebuilt file is written as .sorted.gz, so a stale copy from an
   # earlier run has to be removed under that name as well.
   rm -f "${jdatafile}.sorted" "${jdatafile}.sorted.gz"
fi

# Point the user at the error summary when one was written.
if [ -s "${jdatafile}.data_errors" ] ; then
   echo "***   There are errors in ${jdatafile}"
   echo "***   Please review ${jdatafile}.data_errors for details"
fi

# One line per non-empty work file.
# NOTE(review): only .data_errors.tmp2 is written by this script; .tmp and
# .tmp1 are presumably leftovers of earlier checks or come from elsewhere.
if [ -s "${jdatafile}.data_errors.tmp" ]; then
   echo  "Times were decreasing in the data."
fi

if [ -s "${jdatafile}.data_errors.tmp1" ]; then
   echo  "Header info placed mid line."
fi

if [ -s "${jdatafile}.data_errors.tmp2" ]; then
   echo  "Assorted errors in the data."
fi

# Build the location list: "first_line last_line YYYYMM" for every DBF
# span, in archive order (YYYYMM is the first six characters of field 4).
awk '{print $2,$3,substr($4,1,6)}' "${jdatafile}.dv_rpt_DBF" > "${jdatafile}_YYYYMM_locs"
jfirstdate=$(head -1 "${jdatafile}_YYYYMM_locs" | awk '{print $3}')
jlastdate=$(tail -1 "${jdatafile}_YYYYMM_locs" | awk '{print $3}')
echo "Dates available are ${jfirstdate} to ${jlastdate}"

echo "Selected from ${jsfirst} to ${jslast}"

# Set to 1 when the requested range had to be clipped to the archive.
jchanged=0


if [ -z "${jsfirst}" ] || [ -z "${jslast}" ] ; then
   echo "Either startdate (${jsfirst}) or lastdate (${jslast}) is empty, exiting."
   exit 1
fi

# The comparisons below are numeric; reject anything that is not all
# digits instead of letting "[" print an error and carrying on.
for jchkdate in "${jsfirst}" "${jslast}" ; do
   case ${jchkdate} in
      *[!0-9]*)
         echo "Date (${jchkdate}) is not numeric, expected form YYYYMM, exiting."
         exit 1 ;;
   esac
done

# The same holds for the dates read from the report: without usable DBF
# records there is nothing to compare against.
for jchkdate in "${jfirstdate}" "${jlastdate}" ; do
   case ${jchkdate} in
      ''|*[!0-9]*)
         echo "Cannot determine the dates available, check ${jdatafile}.dv_rpt, exiting."
         exit 1 ;;
   esac
done

if [ "${jsfirst}" -gt "${jslast}" ] ; then
   echo "Selected startdate (${jsfirst}) is greater than (${jslast}), exiting."
   exit 1
fi

if [ "${jsfirst}" -gt "${jlastdate}" ] ; then
   echo "The startdate (${jsfirst}) is greater than the last date (${jlastdate}), exiting."
   exit 1
fi

if [ "${jslast}" -lt "${jfirstdate}" ] ; then
   echo "The enddate (${jslast}) is less than the first date (${jfirstdate}), exiting."
   exit 1
fi

# Clip a range that sticks out on either side of the available data.
if [ "${jsfirst}" -lt "${jfirstdate}" ] ; then
   echo "Selected startdate is too early, adjusting to ${jfirstdate}."
   jsfirst=$jfirstdate
   jchanged=1
fi

if [ "${jslast}" -gt "${jlastdate}" ] ; then
   echo "Selected lastdate is too late, adjusting to ${jlastdate}."
   jslast=$jlastdate
   jchanged=1
fi

if [ "${jchanged}" -eq 1 ] ; then
   echo "Using from ${jsfirst} to ${jslast}"
fi

# -t: everything up to here was validation only; stop before writing.
# (Exit status 1 is kept as it was.)
if [ "${jtest}" = "T" ] ; then
   echo "Test only was selected.  No data will be extracted."
   exit 1
fi

# Output name: <prefix>_YYYYMM for one month, <prefix>_YYYYMM_to_YYYYMM
# for a range.
if [ "${jsfirst}" -eq "${jslast}" ] ; then
   jarcoutfile=${jdatafile}_${jsfirst}
else
   jarcoutfile=${jdatafile}_${jsfirst}_to_${jslast}
fi
echo "Creating file ${jarcoutfile} "

# Keep the location entries whose month lies inside the requested range.
awk -v jbegin="${jsfirst}" -v jend="${jslast}" '{if( $3 >= jbegin && $3 <= jend )print $0}' "${jdatafile}_YYYYMM_locs" > "${jarcoutfile}.locs"

# The extract runs from the first line of the first selected entry to the
# last line of the last selected entry.
jarcfirst=$(head -1 "${jarcoutfile}.locs" | awk '{print $1}')
jarclast=$(tail -1 "${jarcoutfile}.locs" | awk '{print $2}')

# Nothing selected (for instance the range falls into a gap of the
# archive): say so instead of silently writing an empty extract.
if [ -z "${jarcfirst}" ] || [ -z "${jarclast}" ] ; then
   echo "No records found between ${jsfirst} and ${jslast}, exiting."
   exit 1
fi

## echo "${jarcfirst} to ${jarclast}"

# Copy that line range out of the archive; awk stops reading once it is
# past the last wanted line.
gunzip -c "${jarcfile}" | awk -v jbegin="${jarcfirst}" -v jend="${jarclast}" '{if(NR >= jbegin && NR <= jend )print $0}{if (NR > jend) exit}' > "${jarcoutfile}"



# Work files are removed unless -D (debug) was given.
if [ "${debug}" != "T" ]; then
   rm -f "${jdatafile}_YYYYMM_locs"
   rm -f "${jarcoutfile}.locs"
   rm -f "${jdatafile}.data_errors.tmp"
   rm -f "${jdatafile}.data_errors.tmp1"
   rm -f "${jdatafile}.data_errors.tmp2"
else
   echo "debug=($debug).  Temorary files were not removed. "
fi

#  ${jdatafile}.dv_rpt_DBF

# The sorted DBF list is only kept when the archive was out of sequence.
if [ "${jsequence}" = "Ok" ] ; then
   rm -f "${jdatafile}.dv_rpt_DBF_sort"
fi

# echo "Remove later!!!!!!"; exit
# Show the extract with runs of blanks squeezed to one.  The sed script is
# quoted so its "*" can never be expanded as a file glob.
ls -al "${jarcoutfile}" | sed 's,  *, ,g'
echo "Done!"
# echo ""
