#!/usr/bin/env python
#
# @package      hubzero-submit-monitors
# @file         BatchMonitors/monitorCondorG.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2004-2015 HUBzero Foundation, LLC.
# @license      http://opensource.org/licenses/MIT MIT
#
# Copyright (c) 2004-2015 HUBzero Foundation, LLC.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# ----------------------------------------------------------------------
#  monitorCondorG.py
#
#  script which monitors the Condor G queue and reports changes in job status
#
import sys
import os
import select
import subprocess
import re
import signal

from LogMessage import openLog, log

# Optional Condor installation settings. The script entry point only acts on
# them when they are non-empty.
CONDOR_ROOT = ""
CONDOR_CONFIG = ""

# Label prefixed to every history record and update message.
SITEDESIGNATOR = "condorG"
# Command used to list the queue, one ClassAd per job.
QSTATCOMMAND = "condor_q -long"

# Locations of the persistent job history and of the monitor log.
MONITORROOT = os.path.join(os.sep, 'opt', 'submit')
HISTORYFILENAME = "monitorCondorG.history"
HISTORYFILEPATH = os.path.join(MONITORROOT, HISTORYFILENAME)
MONITORLOGLOCATION = os.path.join(os.sep, 'var', 'log', 'submit', 'monitors')
MONITORLOGFILENAME = "monitorCondorG.log"
LOGPATH = os.path.join(MONITORLOGLOCATION, MONITORLOGFILENAME)

# Timing, in seconds: interval between queue polls, length of one wait for
# input on stdin, and how long the queue may stay empty before the monitor exits.
SLEEPTIME = 60
PAUSETIME = 5.0
MAXIMUMIDLETIME = 30 * 60


class QueueMonitor:
   """
   Poll the Condor-G queue and report changes in job status.

   Job identifiers announced on stdin and jobs found by qstatCommand are
   tracked in activeJobs as jobId -> (status,stage,runJobMatch).  Every
   change is appended to the history file and the changes collected during
   one polling cycle are written to stdout as a single line:

      <count> <siteDesignator>:<jobId> <status> <stage>;<runJobMatch>[:...]

   Status 'N' marks a job announced on stdin ('n' while it is still within
   its first polling cycle) and 'D' marks a job no longer in the queue.
   """

   # "<clusterId>.<procId> " at the start of a record built by filterCondorQ
   JOBRECORDPATTERN = re.compile(r"^ *[0-9]+\.[0-9]+ ")

   def __init__(self,
                siteDesignator,
                qstatCommand,
                historyFilePath,
                sleepTime,
                pauseTime,
                maximumIdleTime):
      """
      siteDesignator  - label prefixed to history records and update messages
      qstatCommand    - shell command that lists the queue
      historyFilePath - file in which job states are persisted
      sleepTime       - seconds between queue polls, also the time the
                        queue command may stay silent before it is signalled
      pauseTime       - seconds of one wait for input on stdin
      maximumIdleTime - seconds; divided by sleepTime to give the number of
                        consecutive empty polls after which the monitor exits

      Installs handlers for SIGINT, SIGHUP, SIGQUIT, SIGABRT and SIGTERM.
      """
      self.siteDesignator                = siteDesignator
      self.qstatCommand                  = qstatCommand
      self.historyFilePath               = historyFilePath
      self.sleepTime                     = sleepTime
      self.pauseTime                     = pauseTime
      # floor division keeps this a whole number of polls for any inputs
      self.maximumConsecutiveEmptyQueues = maximumIdleTime//sleepTime

      self.historyFile     = None
      self.updates         = []
      self.activeJobs      = {}
      self.bufferSize      = 4096

      signal.signal(signal.SIGINT,self.sigINT_handler)
      signal.signal(signal.SIGHUP,self.sigHUP_handler)
      signal.signal(signal.SIGQUIT,self.sigQUIT_handler)
      signal.signal(signal.SIGABRT,self.sigABRT_handler)
      signal.signal(signal.SIGTERM,self.sigTERM_handler)


   def cleanup(self):
      """Close the history file if it is open."""
      if self.historyFile:
         self.historyFile.close()
         self.historyFile = None


   def sigGEN_handler(self,
                      signalNumber,
                      frame):
      """Common signal action: release resources, log and exit with status 1."""
      self.cleanup()
      log("%s monitor stopped" % (self.siteDesignator))
      sys.exit(1)


   def sigINT_handler(self,
                      signalNumber,
                      frame):
      """Log receipt of SIGINT and shut down."""
      log("Received SIGINT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigHUP_handler(self,
                      signalNumber,
                      frame):
      """Log receipt of SIGHUP and shut down."""
      log("Received SIGHUP!")
      self.sigGEN_handler(signalNumber,frame)


   def sigQUIT_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGQUIT and shut down."""
      log("Received SIGQUIT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigABRT_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGABRT and shut down."""
      log("Received SIGABRT!")
      self.sigGEN_handler(signalNumber,frame)


   def sigTERM_handler(self,
                       signalNumber,
                       frame):
      """Log receipt of SIGTERM and shut down."""
      log("Received SIGTERM!")
      self.sigGEN_handler(signalNumber,frame)


   def openHistory(self,
                   accessMode):
      """
      Open the history file in accessMode and keep it in self.historyFile.

      In read mode a missing file is not an error, self.historyFile is
      simply left as None.  Any other mode opens (and creates) the file.
      """
      if accessMode == 'r':
         if os.path.isfile(self.historyFilePath):
            self.historyFile = open(self.historyFilePath,accessMode)
         else:
            self.historyFile = None
      else:
         self.historyFile = open(self.historyFilePath,accessMode)


   def recordHistory(self,
                     jobId):
      """
      Append the current state of jobId to the history file and queue the
      matching entry for the next update message.
      """
      status,stage,runJobMatch = self.activeJobs[jobId]
      # the update is still queued when no history file is open
      if self.historyFile:
         self.historyFile.write("%s:%s %s %s %s\n" % (self.siteDesignator,str(jobId),status,stage,runJobMatch))
         self.historyFile.flush()
      self.updates.append(str(jobId) + ' ' + status + ' ' + stage + ';' + runJobMatch)


   def loadHistory(self):
      """
      Rebuild activeJobs from the history file, if there is one.

      Records have the form
         <siteDesignator>:<jobId> <status> [<stage> [<runJobMatch>]]
      and are replayed in order, missing trailing fields default to '?'.
      A record with status 'D' removes the job.  Records without a colon
      or with fewer than two fields are ignored.
      """
      self.openHistory('r')
      if self.historyFile:
         try:
            for record in self.historyFile:
               colon = record.find(':')
               if colon <= 0:
                  continue
               jobState = record[colon+1:].split()
               if len(jobState) < 2:
                  # truncated or damaged record
                  continue
               jobId  = jobState[0]
               status = jobState[1]
               if len(jobState) > 2:
                  stage = jobState[2]
               else:
                  stage = '?'
               if len(jobState) > 3:
                  runJobMatch = jobState[3]
               else:
                  runJobMatch = '?'
               if status == 'D':
                  self.activeJobs.pop(jobId,None)
               else:
                  self.activeJobs[jobId] = (status,stage,runJobMatch)
         finally:
            self.historyFile.close()
            self.historyFile = None


   def saveHistory(self):
      """Rewrite the history file so it holds one record per active job."""
      self.openHistory('w')
      if self.historyFile:
         try:
            for activeJob in self.activeJobs:
               status,stage,runJobMatch = self.activeJobs[activeJob]
               self.historyFile.write("%s:%s %s %s %s\n" % (self.siteDesignator,str(activeJob),status,stage,runJobMatch))
         finally:
            self.historyFile.close()
            self.historyFile = None


   def _joinChunks(self,
                   chunks):
      """
      Concatenate chunks returned by os.read into one text string.

      os.read returns str or bytes depending on the interpreter, bytes are
      decoded so that callers always receive text.
      """
      text = b"".join(chunks)
      if not isinstance(text,str):
         text = text.decode('utf-8','replace')
      return(text)


   def executeQstatCommand(self,
                           command):
      """
      Run command through the shell and collect its output.

      Returns (status,stdout text,stderr text).  status is 0 on success,
      the exit code when the command exited with a failure and the raw
      wait status when it was terminated by a signal.  If the command
      produces no output for sleepTime seconds it is sent SIGTERM.
      """
      child = subprocess.Popen(command,shell=True,bufsize=self.bufferSize,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
      childPid   = child.pid
      childoutFd = child.stdout.fileno()
      childerrFd = child.stderr.fileno()

      outEOF = False
      errEOF = False

      outData = []
      errData = []

      while not (outEOF and errEOF):
         toCheck = []
         if not outEOF:
            toCheck.append(childoutFd)
         if not errEOF:
            toCheck.append(childerrFd)
         ready = select.select(toCheck,[],[],self.sleepTime) # wait for input
         if childoutFd in ready[0]:
            outChunk = os.read(childoutFd,self.bufferSize)
            # an empty read means EOF, whether it is '' or b''
            if outChunk:
               outData.append(outChunk)
            else:
               outEOF = True

         if childerrFd in ready[0]:
            errChunk = os.read(childerrFd,self.bufferSize)
            if errChunk:
               errData.append(errChunk)
            else:
               errEOF = True

         if len(ready[0]) == 0:
            # timed out waiting for output
            # NOTE(review): with shell=True this signals the shell, the
            # command itself may keep running - confirm this is sufficient
            try:
               os.kill(childPid,signal.SIGTERM)
            except OSError:
               pass

      child.stdout.close()
      child.stderr.close()

      outText = self._joinChunks(outData)
      errText = self._joinChunks(errData)

      pid,err = os.waitpid(childPid,0)
      if err != 0:
         if os.WIFSIGNALED(err):
            log("%s failed w/ exit code %d signal %d" % (command,os.WEXITSTATUS(err),os.WTERMSIG(err)))
         else:
            if os.WIFEXITED(err):
               err = os.WEXITSTATUS(err)
            log("%s failed w/ exit code %d" % (command,err))
         log("%s" % (errText))

      return(err,outText,errText)


   def filterCondorQ(self,
                     condorqRecords):
      """
      Reduce "condor_q -long" output to one line per top level job.

      condorqRecords is a sequence of lines holding "Attribute = value"
      ClassAds separated by blank lines.  One entry is kept per ClusterId,
      a later ad for the same cluster replaces an earlier one.  A job that
      names a DAGManJobId present in the listing hands its node name,
      status and process id to that job, and only jobs without a
      DAGManJobId are reported.

      Returns a string of "<clusterId>.<procId> <status> <nodeName>
      <runJobMatch>" lines, each newline terminated.  Missing values are
      reported as '?' and status codes without a label as the number.
      """
      jobStatusLabels = {1:'I', 2:'R', 3:'X', 4:'C', 5:'H', 6:'SE', 7:'7', 8:'8', 9:'9'}

      dagNodeNames  = {}
      dagmanJobIds  = {}
      jobStatuses   = {}
      processIds    = {}
      runJobMatches = {}

      clusterId   = -1
      dagNodeName = '?'
      dagManJobId = -1
      jobStatus   = 9
      processId   = 0
      runJobMatch = '?'

      # the extra blank record stores a final ad that has no blank line after it
      for record in list(condorqRecords) + [""]:
         record = record.strip()
         if record == "":
            if clusterId > 0:
               # empty values become '?' so every output line has four fields
               dagNodeNames[clusterId]  = dagNodeName.strip('"') or '?'
               dagmanJobIds[clusterId]  = int(dagManJobId)
               jobStatuses[clusterId]   = int(jobStatus)
               processIds[clusterId]    = int(processId)
               runJobMatches[clusterId] = runJobMatch.strip('"') or '?'
            clusterId   = -1
            dagNodeName = '?'
            dagManJobId = -1
            jobStatus   = 9
            processId   = 0
            runJobMatch = '?'
            continue

         # split on the first '=' only, the value may itself contain '='
         attribute,separator,value = record.partition('=')
         if not separator:
            continue
         attribute = attribute.strip()
         value     = value.strip()

         if   attribute == 'ClusterId':
            clusterId   = int(value)
         elif attribute == 'DAGNodeName':
            dagNodeName = value
         elif attribute == 'DAGManJobId':
            dagManJobId = value
         elif attribute == 'JobStatus':
            jobStatus   = value
         elif attribute == 'ProcId':
            processId   = value
         elif attribute == 'RunJobMatches':
            # the last entry of the comma separated list is reported
            runJobMatch = value.split(',')[-1].strip()
            if runJobMatch == '""':
               runJobMatch = '?'

      # let each listed DAGMan job report the state of its node job
      for childClusterId,parentClusterId in list(dagmanJobIds.items()):
         if parentClusterId > 0 and parentClusterId in dagNodeNames:
            dagNodeNames[parentClusterId] = dagNodeNames[childClusterId]
            jobStatuses[parentClusterId]  = jobStatuses[childClusterId]
            processIds[parentClusterId]   = processIds[childClusterId]

      filteredRecords = []
      for clusterId in dagNodeNames:
         if dagmanJobIds[clusterId] < 0:
            jobStatus = jobStatuses[clusterId]
            filteredRecords.append("%d.%d %s %s %s\n" % (clusterId,processIds[clusterId],
                                                         jobStatusLabels.get(jobStatus,str(jobStatus)),
                                                         dagNodeNames[clusterId],
                                                         runJobMatches[clusterId]))

      return("".join(filteredRecords))


   def _parseJobRecords(self,
                        jobRecords):
      """
      Convert records built by filterCondorQ into a dictionary
      jobId -> (status,stage,runJobMatch).  Records that do not start with
      "<clusterId>.<procId> " or have fewer than four fields are skipped.
      """
      currentJobs = {}
      for jobRecord in jobRecords:
         if self.JOBRECORDPATTERN.match(jobRecord):
            jobState = jobRecord.split()
            if len(jobState) >= 4:
               currentJobs[jobState[0]] = (jobState[1],jobState[2],jobState[3])

      return(currentJobs)


   def _reportUpdates(self):
      """Write the queued updates to stdout as one line and clear the queue."""
      if len(self.updates) > 0:
         updateMessage = str(len(self.updates)) + ' ' + self.siteDesignator + ':' + ':'.join(self.updates)
         sys.stdout.write("%s\n" % (updateMessage))
         sys.stdout.flush()
      self.updates = []


   def monitorQ(self):
      """
      Main loop.  Each cycle waits about sleepTime seconds while accepting
      new job identifiers on stdin, polls the queue, records and reports
      every change and compacts the history file.  The loop ends through
      sys.exit: with status 0 once the queue has been empty for
      maximumConsecutiveEmptyQueues successive polls, or through the
      SIGTERM handler when the parent process has gone away.
      """
      self.openHistory('a')
      consecutiveEmptyQueues = 0
      lastReportedActiveJobCount = 0

      stdinFd = sys.stdin.fileno()
      toCheck = [stdinFd]
      while True:
         activeJobCount = len(self.activeJobs)
         if activeJobCount != lastReportedActiveJobCount:
            log("%d monitored jobs" % (activeJobCount))
         lastReportedActiveJobCount = activeJobCount

         self.updates  = []
         completedJobs = []

         delayTime = 0
         while delayTime <= self.sleepTime:
            if os.getppid() == 1:
               # parent is gone, stop through the SIGTERM handler
               os.kill(os.getpid(),signal.SIGTERM)

            ready = select.select(toCheck,[],[],self.pauseTime) # wait for input
            if stdinFd in ready[0]:
               # NOTE(review): readline may buffer further lines that select
               # cannot see, a second identifier sent in the same burst is
               # then only read when more input arrives
               newJobRecord = sys.stdin.readline()
               if newJobRecord == "":
                  # EOF: stop watching stdin, otherwise select returns at
                  # once on every pass and the delay below never happens
                  toCheck = []
               else:
                  newJob = newJobRecord.strip()
                  if newJob != "":
                     if not newJob in self.activeJobs:
                        self.activeJobs[newJob] = ('N','Job','?')
                        self.recordHistory(newJob)
                        # 'n' spares the job from being marked done by the
                        # poll that follows its announcement
                        self.activeJobs[newJob] = ('n','Job','?')
                     consecutiveEmptyQueues = 0
            delayTime += self.pauseTime

         errStatus,qstatOutput,qstatError = self.executeQstatCommand(self.qstatCommand)
         if errStatus == 0:
            currentJobs = self._parseJobRecords(self.filterCondorQ(qstatOutput.splitlines()).splitlines())

            if len(currentJobs) == 0:
               consecutiveEmptyQueues += 1
            else:
               consecutiveEmptyQueues = 0

            # known jobs that have left the queue are done
            for activeJob in self.activeJobs:
               if self.activeJobs[activeJob][0] == 'n':
                  self.activeJobs[activeJob] = ('N','Job','?')
               else:
                  if not activeJob in currentJobs:
                     self.activeJobs[activeJob] = ('D',self.activeJobs[activeJob][1],self.activeJobs[activeJob][2])
                     self.recordHistory(activeJob)
                     completedJobs.append(activeJob)

            # jobs in the queue that are new or whose state has changed
            for currentJob in currentJobs:
               if   not currentJob in self.activeJobs:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               elif currentJobs[currentJob] != self.activeJobs[currentJob]:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               if self.activeJobs[currentJob][0] == 'D':
                  completedJobs.append(currentJob)

            for completedJob in completedJobs:
               self.activeJobs.pop(completedJob,None)

            self._reportUpdates()

            # compact the history: rewrite it with the active jobs only
            if self.historyFile:
               self.historyFile.close()
               self.historyFile = None
               self.saveHistory()
               self.openHistory('a')

            if consecutiveEmptyQueues >= self.maximumConsecutiveEmptyQueues:
               self.cleanup()
               log("%s monitor stopped" % (self.siteDesignator))
               sys.exit(0)
         else:
            log("Error %d in %s command:\n%s" % (errStatus,self.qstatCommand,qstatError))
            # updates for newly announced jobs are already in the history
            # file, report them even though the poll failed
            self._reportUpdates()


if __name__ == '__main__':

   # Script entry point: set up logging and the environment, then monitor.
   openLog(LOGPATH)

   # With a Condor install root configured, search its bin and sbin
   # directories first. CONDOR_CONFIG is only exported in that case.
   if CONDOR_ROOT != "":
      os.environ['PATH'] = ':'.join([os.path.join(CONDOR_ROOT,'bin'),
                                     os.path.join(CONDOR_ROOT,'sbin'),
                                     os.environ['PATH']])
      if CONDOR_CONFIG != "":
         os.environ['CONDOR_CONFIG'] = CONDOR_CONFIG

   log("%s monitor started" % (SITEDESIGNATOR))

   __queueMonitor__ = QueueMonitor(siteDesignator=SITEDESIGNATOR,
                                   qstatCommand=QSTATCOMMAND,
                                   historyFilePath=HISTORYFILEPATH,
                                   sleepTime=SLEEPTIME,
                                   pauseTime=PAUSETIME,
                                   maximumIdleTime=MAXIMUMIDLETIME)

   # Reload the job states left by a previous run, rewrite the history so it
   # holds only those jobs, then poll until the monitor exits.
   __queueMonitor__.loadHistory()
   __queueMonitor__.saveHistory()
   __queueMonitor__.monitorQ()


