#!/usr/bin/env python
#
# @package      hubzero-submit-monitors
# @file         BatchMonitors/monitorPBS.py
# @author       Steven Clark <clarks@purdue.edu>
# @copyright    Copyright (c) 2004-2013 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2004-2013 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# ----------------------------------------------------------------------
#  monitorPBS.py
#
#  script which monitors the PBS queue and reports changes in job status
#
import sys
import os
import select
import subprocess
import re
import signal

from LogMessage import openLog, log

# Site label used as the prefix of history records and update messages.
SITEDESIGNATOR     = "pbsHost"
# Only jobs whose Job_Owner matches this account are reported.
USERNAME           = "pbsUser"
# Directory that holds the history file for this site.
MONITORROOT        = os.path.join(os.sep,'home','pbsUser','Submit','pbsHost')
# Shell command executed on every poll to list the queue.
QSTATCOMMAND       = "/usr/pbs/bin/qstat -f1"
# Log file location handed to openLog().
MONITORLOGLOCATION = os.path.join(os.sep,'var','log','submit','monitors')
MONITORLOGFILENAME = "monitorPBS.log"
LOGPATH            = os.path.join(MONITORLOGLOCATION,MONITORLOGFILENAME)
# History file in which job state changes are recorded.
HISTORYFILENAME    = "monitorPBS.history"
HISTORYFILEPATH    = os.path.join(MONITORROOT,HISTORYFILENAME)

# SLEEPTIME       - delay, in seconds, accumulated between queue polls; also the qstat read timeout
# PAUSETIME       - timeout, in seconds, of each individual wait for input on stdin
# MAXIMUMIDLETIME - seconds; divided by SLEEPTIME to obtain the number of consecutive
#                   empty queue listings after which the monitor exits
SLEEPTIME       = 10
PAUSETIME       = 5.
MAXIMUMIDLETIME = 30*60


class QueueMonitor:
   def __init__(self,
                siteDesignator,
                userName,
                qstatCommand,
                historyFilePath,
                sleepTime,
                pauseTime,
                maximumIdleTime):
      """
      Store the monitor configuration and install the termination signal handlers.

      siteDesignator  - label prefixed to history records and update messages
      userName        - only jobs owned by this user are reported by filterQstat()
      qstatCommand    - shell command run by monitorQ() to list the queue
      historyFilePath - path of the job history file
      sleepTime       - seconds of delay accumulated between queue polls, also
                        used as the read timeout while qstat is running
      pauseTime       - timeout, in seconds, of each wait for input on stdin
      maximumIdleTime - seconds; converted to the number of consecutive empty
                        queue listings after which monitorQ() exits
      """
      self.siteDesignator  = siteDesignator
      self.userName        = userName
      self.qstatCommand    = qstatCommand
      self.historyFilePath = historyFilePath
      self.sleepTime       = sleepTime
      self.pauseTime       = pauseTime
      # Floor division keeps the threshold a whole number of polls on both
      # Python 2 and Python 3.  True division could yield a fractional value
      # that a counter of polls never equals.
      self.maximumConsecutiveEmptyQueues = maximumIdleTime//sleepTime

      self.historyFile = None
      self.updates     = []
      self.activeJobs  = {}
      self.bufferSize  = 4096

      # Every termination style signal is routed to its own logging handler.
      signalHandlers = ((signal.SIGINT, self.sigINT_handler),
                        (signal.SIGHUP, self.sigHUP_handler),
                        (signal.SIGQUIT,self.sigQUIT_handler),
                        (signal.SIGABRT,self.sigABRT_handler),
                        (signal.SIGTERM,self.sigTERM_handler))
      for signalNumber,signalHandler in signalHandlers:
         signal.signal(signalNumber,signalHandler)


   def cleanup(self):
      """Close the history file if one is open and drop the reference to it."""
      openHistoryFile = self.historyFile
      if not openHistoryFile:
         return
      openHistoryFile.close()
      self.historyFile = None


   def sigGEN_handler(self, signalNumber, frame):
      """
      Shutdown path shared by the signal specific handlers.

      Closes the history file, logs that the monitor stopped and exits the
      process with status 1.  signalNumber and frame are accepted to satisfy
      the signal handler signature but are not examined.
      """
      self.cleanup()
      stopMessage = "%s monitor stopped" % (self.siteDesignator)
      log(stopMessage)
      sys.exit(1)


   def sigINT_handler(self, signalNumber, frame):
      """Log the receipt of SIGINT, then shut down through sigGEN_handler()."""
      log("Received SIGINT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigHUP_handler(self, signalNumber, frame):
      """Log the receipt of SIGHUP, then shut down through sigGEN_handler()."""
      log("Received SIGHUP!")
      self.sigGEN_handler(signalNumber, frame)


   def sigQUIT_handler(self, signalNumber, frame):
      """Log the receipt of SIGQUIT, then shut down through sigGEN_handler()."""
      log("Received SIGQUIT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigABRT_handler(self, signalNumber, frame):
      """Log the receipt of SIGABRT, then shut down through sigGEN_handler()."""
      log("Received SIGABRT!")
      self.sigGEN_handler(signalNumber, frame)


   def sigTERM_handler(self, signalNumber, frame):
      """Log the receipt of SIGTERM, then shut down through sigGEN_handler()."""
      log("Received SIGTERM!")
      self.sigGEN_handler(signalNumber, frame)


   def openHistory(self, accessMode):
      """
      Open the history file in accessMode and keep the handle in self.historyFile.

      A read request ('r') for a history file that does not exist leaves
      self.historyFile set to None instead of raising.  Every other mode is
      passed straight to open().
      """
      readingMissingFile = accessMode == 'r' and not os.path.isfile(self.historyFilePath)
      if readingMissingFile:
         self.historyFile = None
      else:
         self.historyFile = open(self.historyFilePath, accessMode)


   def recordHistory(self, jobId):
      """
      Log the current state of jobId to the history file and queue it for reporting.

      The record "<site>:<jobId> <status> <stage> <queue>" is written and
      flushed immediately.  The same text without the site prefix is appended
      to self.updates.
      """
      jobInfo = self.activeJobs[jobId]
      historyRecord = "%s:%s %s %s %s\n" % (self.siteDesignator, str(jobId),
                                            jobInfo[0], jobInfo[1], jobInfo[2])
      self.historyFile.write(historyRecord)
      self.historyFile.flush()

      updateFields = [str(jobId), jobInfo[0], jobInfo[1], jobInfo[2]]
      self.updates.append(' '.join(updateFields))


   def loadHistory(self):
      """
      Rebuild self.activeJobs by replaying the history file, if it exists.

      Records are processed in file order.  The text after the first colon is
      split on whitespace and interpreted by field count:

         <site>:<jobId> <status> <stage> <queue>   format written by recordHistory()/saveHistory()
         <site>:<jobId> <status> <queue>           stage defaults to 'Simulation'
         <site>:<jobId> <status>                   queue defaults to '?'  (e.g. "lPBS:6760 R")

      A status of 'D' removes the job, any other status stores or replaces it.
      Records without a colon, or without both a job id and a status, are ignored.
      """
      self.openHistory('r')
      if self.historyFile:
         try:
            for record in self.historyFile:
               colon = record.find(':')
               if colon <= 0:
                  continue
               fields = record[colon+1:].split()
               if len(fields) < 2:
                  # incomplete record, nothing usable to restore
                  continue

               jobId,status = fields[:2]
               if   len(fields) >= 4:
                  # the full record carries the stage ahead of the queue
                  stage,queue = fields[2:4]
               elif len(fields) == 3:
                  stage,queue = 'Simulation',fields[2]
               else:
                  stage,queue = 'Simulation','?'

               if status == 'D':
                  self.activeJobs.pop(jobId,None)
               else:
                  self.activeJobs[jobId] = (status,stage,queue)
         finally:
            # release the handle even if reading fails part way through
            self.historyFile.close()
            self.historyFile = None


   def saveHistory(self):
      """
      Rewrite the history file so that it holds one record per job in self.activeJobs.

      The file is opened for writing, which discards its previous contents,
      and is closed again before returning.
      """
      self.openHistory('w')
      if not self.historyFile:
         return

      for jobId,jobInfo in self.activeJobs.items():
         historyRecord = "%s:%s %s %s %s\n" % (self.siteDesignator, str(jobId),
                                               jobInfo[0], jobInfo[1], jobInfo[2])
         self.historyFile.write(historyRecord)

      self.historyFile.close()
      self.historyFile = None


   def executeQstatCommand(self,
                           command):
      """
      Run command through the shell and collect everything it writes.

      stdout and stderr are drained together with select() until both report
      end-of-file.  Whenever nothing arrives within self.sleepTime seconds the
      child is sent SIGTERM and draining continues.

      Returns the tuple (err,stdoutText,stderrText).  err is 0 on success, the
      exit code when the child exited with a non-zero code, and the raw wait
      status when it was terminated by a signal.  Failures are logged together
      with the captured stderr text.
      """
      def asText(chunks):
         # os.read() returns bytes.  On Python 2 bytes is str and is returned
         # untouched, on Python 3 it is decoded so callers can treat it as text.
         data = b"".join(chunks)
         if not isinstance(data,str):
            data = data.decode('utf-8','replace')
         return(data)

      child = subprocess.Popen(command,shell=True,bufsize=self.bufferSize,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
      childPid   = child.pid
      childoutFd = child.stdout.fileno()
      childerrFd = child.stderr.fileno()

      outData = []
      errData = []
      openFds = [childoutFd,childerrFd]

      try:
         while openFds:
            ready = select.select(openFds,[],[],self.sleepTime)[0] # wait for input
            if not ready:
               # nothing within the timeout, ask the child to stop and keep
               # draining until its pipes close
               os.kill(childPid,signal.SIGTERM)
               continue

            for readyFd in ready:
               chunk = os.read(readyFd,self.bufferSize)
               if not chunk:
                  # an empty read marks end-of-file, for bytes and str alike
                  openFds.remove(readyFd)
               elif readyFd == childoutFd:
                  outData.append(chunk)
               else:
                  errData.append(chunk)

         pid,err = os.waitpid(childPid,0)
      finally:
         child.stdout.close()
         child.stderr.close()

      outText = asText(outData)
      errText = asText(errData)

      if err != 0:
         if os.WIFSIGNALED(err):
            log("%s failed w/ signal %d" % (command,os.WTERMSIG(err)))
         else:
            if os.WIFEXITED(err):
               err = os.WEXITSTATUS(err)
            log("%s failed w/ exit code %d" % (command,err))
         log("%s" % (errText))

      return(err,outText,errText)


   def filterQstat(self,
                   qstatRecords):
      """
      Reduce the lines of a long format qstat listing to the jobs of self.userName.

      qstatRecords is an iterable of lines such as

         Job Id: 6600461.pbsHost
             Job_Name = JOB_00146720_01
             Job_Owner = pbsUser@pbsHost
             job_state = R
             queue = standby
             ...

      A 'Job Id:' line starts a job and a blank line ends it.  The final job is
      kept even when the listing does not end with a blank line.  Jobs whose id
      does not start with a positive integer are ignored.

      Returns one "<jobId> <queue> <status>" line, newline terminated, per job
      owned by self.userName.  The result is an empty string if there are none.
      """
      jobRecords = []
      jobRecord  = None
      for qstatRecord in qstatRecords:
         qstatRecord = qstatRecord.strip()
         if   qstatRecord == "":
            jobRecord = None
         elif qstatRecord.startswith('Job Id:'):
            try:
               jobId = int(qstatRecord.split(':')[-1].split('.')[0].strip())
            except ValueError:
               # e.g. an array job id such as 1234[5], it cannot be reported as a number
               jobId = -1
            jobRecord = {'jobId':jobId,'jobOwner':"",'queue':"",'status':""}
            jobRecords.append(jobRecord)
         elif jobRecord is not None:
            if   qstatRecord.startswith('Job_Owner ='):
               jobRecord['jobOwner'] = qstatRecord.split('=')[-1].split('@')[0].strip()
            elif qstatRecord.startswith('job_state ='):
               jobRecord['status'] = qstatRecord.split('=')[-1].strip()
            elif qstatRecord.startswith('queue ='):
               jobRecord['queue'] = qstatRecord.split('=')[-1].strip()

      # keyed by job id so that a job listed twice is reported once, with its latest values
      jobQueues   = {}
      jobStatuses = {}
      for jobRecord in jobRecords:
         if jobRecord['jobId'] > 0 and jobRecord['jobOwner'] == self.userName:
            jobQueues[jobRecord['jobId']]   = jobRecord['queue']
            jobStatuses[jobRecord['jobId']] = jobRecord['status']

      filteredRecords = "".join(["%d %s %s\n" % (jobId,jobQueues[jobId],jobStatuses[jobId]) \
                                 for jobId in jobStatuses])

      return(filteredRecords)


   def monitorQ(self):
      """
      Main loop of the monitor; it only returns by exiting the process.

      Each cycle
        1. logs the number of monitored jobs when it changed,
        2. waits for new job ids on stdin in self.pauseTime slices until more
           than self.sleepTime seconds of delay have accumulated.  A new job
           is recorded with status 'N' and then marked 'n' so that the next
           queue listing does not report it as done before it shows up,
        3. runs the qstat command and compares the result with self.activeJobs:
           jobs that disappeared are recorded as 'D' and dropped, new or
           changed jobs are recorded,
        4. writes all updates as a single line to stdout,
        5. rewrites the history file with the jobs that are still active,
        6. exits with status 0 once the queue listing has been empty for
           self.maximumConsecutiveEmptyQueues consecutive polls.

      The process sends itself SIGTERM when its parent process id becomes 1.
      """
      self.openHistory('a')
      consecutiveEmptyQueues = 0
      lastReportedActiveJobCount = 0

      stdinFd = sys.stdin.fileno()
      toCheck = [stdinFd]
      while 1:
         activeJobCount = len(self.activeJobs)
         if activeJobCount != lastReportedActiveJobCount:
            log("%d monitored jobs" % (activeJobCount))
         lastReportedActiveJobCount = activeJobCount

         self.updates  = []
         currentJobs   = {}
         completedJobs = []

         delayTime = 0
         while delayTime <= self.sleepTime:
            if os.getppid() == 1:
               os.kill(os.getpid(),signal.SIGTERM)

            # With an empty toCheck list select() simply sleeps for pauseTime.
            ready = select.select(toCheck,[],[],self.pauseTime) # wait for input
            if stdinFd in ready[0]:
               # NOTE(review): readline() is buffered while select() watches the raw
               # descriptor, so several ids arriving in a single write may not all
               # be picked up until more input arrives - confirm against the sender.
               submittedLine = sys.stdin.readline()
               if submittedLine == "":
                  # End-of-file: the descriptor would now be reported ready on every
                  # call, turning this delay loop into a busy loop.  Stop watching it.
                  toCheck = []
               else:
                  newJob = submittedLine.strip()
                  if newJob != "":
                     if not newJob in self.activeJobs:
                        self.activeJobs[newJob] = ('N','Job','?')
                        self.recordHistory(newJob)
                        self.activeJobs[newJob] = ('n','Job','?')
                     consecutiveEmptyQueues = 0
            delayTime += self.pauseTime

      # $ qstat
      #  Job id              Name             User            Time Use S Queue
      #  ------------------- ---------------- --------------- -------- - -----
      #  6759.vma111         Nanowire-51394L  biswajit025     00:00:00 R workq
      #  6769.vma111         Nanowire-51411L  mmclennan       00:00:00 R workq

      # $ qstat -u
      #                                                                    Req'd  Req'd   Elap
      # Job ID               Username Queue    Jobname    SessID NDS   TSK Memory Time  S Time
      # -------------------- -------- -------- ---------- ------ ----- --- ------ ----- - -----
      # 8901.vma111.punch.pu saumitra workq    Nanowire-5  12530     1  --    --  08:00 R 05:56

         errStatus,qstatOutput,qstatError = self.executeQstatCommand(self.qstatCommand)
         if errStatus == 0:
            jobs = self.filterQstat(qstatOutput.splitlines()).splitlines()
            for job in jobs:
               jobState = job.split()
               # filterQstat() emits "<jobId> <queue> <status>".  A record with an
               # empty queue or status splits into fewer fields and cannot be
               # interpreted reliably, so it is reported and skipped.
               if len(jobState) >= 3 and jobState[0].isdigit():
                  jobId  = jobState[0]
                  queue  = jobState[1]
                  status = jobState[2]
                  stage  = 'Simulation'
                  currentJobs[jobId] = (status,stage,queue)
               else:
                  log("Ignoring incomplete queue record: %s" % (job))

            if len(currentJobs) == 0:
               consecutiveEmptyQueues += 1
            else:
               consecutiveEmptyQueues = 0

            for activeJob in self.activeJobs:
               if self.activeJobs[activeJob][0] == 'n':
                  # freshly submitted: skip the disappearance check for one poll
                  self.activeJobs[activeJob] = ('N','Job','?')
               else:
                  if not activeJob in currentJobs:
                     self.activeJobs[activeJob] = ('D',self.activeJobs[activeJob][1],self.activeJobs[activeJob][2])
                     self.recordHistory(activeJob)
                     completedJobs.append(activeJob)

            for currentJob in currentJobs:
               if   not currentJob in self.activeJobs:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               elif currentJobs[currentJob] != self.activeJobs[currentJob]:
                  self.activeJobs[currentJob] = currentJobs[currentJob]
                  self.recordHistory(currentJob)
               if self.activeJobs[currentJob][0] == 'D':
                  completedJobs.append(currentJob)

            for completedJob in completedJobs:
               del self.activeJobs[completedJob]

            if len(self.updates) > 0:
               updateMessage = str(len(self.updates)) + ' ' + self.siteDesignator + ':' + ':'.join(self.updates)
               sys.stdout.write("%s\n" % (updateMessage))
               sys.stdout.flush()

            if self.historyFile:
               # replace the appended change log by the list of jobs still active
               self.historyFile.close()
               self.historyFile = None
               self.saveHistory()
               self.openHistory('a')

            # '>=' rather than '==': the exit must trigger even if the threshold
            # is not a value the counter lands on exactly.
            if consecutiveEmptyQueues >= self.maximumConsecutiveEmptyQueues:
               self.cleanup()
               log("%s monitor stopped" % (self.siteDesignator))
               sys.exit(0)
         else:
            log("Error %d in %s command:\n%s" % (errStatus,self.qstatCommand,qstatError))


if __name__ == '__main__':

   # Open the log first so that the start message is written to it.
   openLog(LOGPATH)
   log("%s monitor started" % (SITEDESIGNATOR))

   __queueMonitor__ = QueueMonitor(siteDesignator=SITEDESIGNATOR,
                                   userName=USERNAME,
                                   qstatCommand=QSTATCOMMAND,
                                   historyFilePath=HISTORYFILEPATH,
                                   sleepTime=SLEEPTIME,
                                   pauseTime=PAUSETIME,
                                   maximumIdleTime=MAXIMUMIDLETIME)

   # Replay the recorded history, write back only the jobs that are still
   # active, then enter the monitoring loop.
   __queueMonitor__.loadHistory()
   __queueMonitor__.saveHistory()
   __queueMonitor__.monitorQ()


