CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_4_4_5_patch3/src/Validation/Performance/python/parserPerfsuiteMetadata.py

Go to the documentation of this file.
00001 import re
00002 import os, sys
00003 import time
00004 import parsingRulesHelper
00005 import glob
00006 from commands import getstatusoutput
00007 
00008 class parserPerfsuiteMetadata:
        """ 
                The whole parsing works as follows. We split the file into 3 parts (we keep 3 variables of line lists: self.lines_general, self.lines_timesize, self.lines_other):

                        * General info
                As most of the info are simple one-line strings, we define some regular expressions defining and matching each of those lines. The regular expressions are associated with data which we can get from them. e.g. ^Suite started at (.+) on (.+) by user (.+)$ would match only the line defining the time the suite started and on which machine. It's associated with a tuple of field names for general info which will be filled in. In this way we get info = {'start_time': start-taken-from-regexp, 'host': host, 'user': user}. This is done by calling the simple function _applyParsingRules which checks each line against each rule; if one passes, it fills in the result dictionary with the result.
                Additionally we get the cpu and memory info from /proc/cpuinfo /proc/meminfo

                        * TimeSize test
                We use the same technique a little bit also. But at first we divide the timesize lines by job (individual run of cmssw - per candle, and pileup/not). Then for each of the jobs we apply our parsing rules, and we also find the starting and ending times (i.e. we know that the start timestamp is somewhere after a certain line containing "Written out cmsRelvalreport.py input file at:")

                        * All other tests
                We find the statement that the test is being launched (containing the test name, core and num events). Above we have the thread number, and below the starting time.
                The ending time can be ONLY connected with the starting time by the Thread-ID. The problem is that the file names the same test instance differently, like <Launching "PILE UP Memcheck"> and <"Memcheck" stopped>.
        """
        # separator used when joining multi-line values (tags, candles, parsed
        # command line) into a single string field
        _LINE_SEPARATOR = "|"
00024         def validateSteps(self, steps):
00025                 """ Simple function for error detection. TODO: we could use a list of possible steps also """
00026                 return not (not steps or len(steps) > self._MAX_STEPS)
00027 
00028         def __init__(self, path):
00029                 
00030                 self._MAX_STEPS  = 5 # MAXIMUM NUMBER OF STEPS PER RUN (taskset relvalreport.py...)
00031                 self._DEBUG = False
00032 
00033 
00034                 self._path = path
00035                 
00036                 """ some initialisation to speedup the other functions """
00037                 #for cmsscimark
00038                 self.reCmsScimarkTest = re.compile(r"""^Composite Score:(\s*)([^\s]+)$""")
00039 
00040                 #TimeSize
00041                 """ the separator for beginning of timeSize / end of general statistics """
00042                 self._timeSizeStart = re.compile(r"""^Launching the TimeSize tests \(TimingReport, TimeReport, SimpleMemoryCheck, EdmSize\) with (\d+) events each$""")
00043                 """ (the first timestamp is the start of TimeSize) """
00044 
00045 
00046                 """ the separator for end of timeSize / beginning of IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind tests """
00047                 self._timeSizeEnd = re.compile(r"""^Stopping all cmsScimark jobs now$""")
00048 
00049                 #Other tests:
00050                 self._otherStart = re.compile(r"^Preparing")
00051 
00052                 """ 
00053                 ----- READ THE DATA -----
00054                 """
00055                 lines = self.readInput(path)
00056                 """ split the whole file  into parts """
00057                 #Let's not assume there are ALWAYS TimeSize tests in the runs of the Performance Suite!:
00058                 #Check first:  
00059                 #FIXME: Vidmantas did not think to this case... will need to implement protectionb against it for all the IB tests...
00060                 #To do as soon as possible...
00061                 #Maybe revisit the strategy if it can be done quickly.
00062                 timesize_end= [lines.index(line)  for line in lines if self._timeSizeEnd.match(line)]
00063                 if timesize_end:
00064                         timesize_end_index = timesize_end[0]
00065                 else:
00066                         timesize_end_index=0
00067                 timesize_start=[lines.index(line) for line in lines if self._timeSizeStart.match(line)]
00068                 general_stop=[lines.index(line) for line in lines if self._otherStart.match(line)]
00069                 if timesize_start:
00070                         timesize_start_index = timesize_start[0]
00071                         general_stop_index=timesize_start_index
00072                 elif general_stop:
00073                         timesize_start_index=0
00074                         general_stop_index=general_stop[0]
00075                 else:
00076                         timesize_start_index=0
00077                         general_stop_index=-1
00078 
00079                 """ we split the structure:
00080                         * general
00081                         * timesize
00082                         * all others [igprof etc]
00083                 """
00084         
00085                 """ we get the indexes of spliting """
00086                 #Not OK to use timsize_start_index for the general lines... want to be general, also to cases of no TimeSize tests...
00087                 #self.lines_general = lines[:timesize_start_index]
00088                 self.lines_general = lines[:general_stop_index]
00089                 self.lines_timesize = lines[timesize_start_index:timesize_end_index+1]
00090                 self.lines_other = lines[timesize_end_index:]           
00091         
00092                 """ a list of missing fields """
00093                 self.missing_fields = []
00094 
00095         @staticmethod
00096         def isTimeStamp(line):
00097                 """
00098                 Returns whether the string is a timestamp (if not returns None)
00099 
00100                 >>> parserPerfsuiteMetadata.isTimeStamp("Fri Aug 14 01:16:03 2009")
00101                 True
00102                 >>> parserPerfsuiteMetadata.isTimeStamp("Fri Augx 14 01:16:03 2009")
00103 
00104                 """
00105                 datetime_format = "%a %b %d %H:%M:%S %Y" # we use default date format
00106                 try:
00107                         time.strptime(line, datetime_format)
00108                         return True
00109                 except ValueError:
00110                         return None
00111         
00112         @staticmethod
00113         def findFirstIndex_ofStartsWith(job_lines, start_of_line):
00114                 return [job_lines.index(line) 
00115                         for line in job_lines 
00116                         if line.startswith(start_of_line)][0]
00117         
00118         def findLineBefore(self, line_index, lines, test_condition):
00119                 """ finds a line satisfying the `test_condition` comming before the `line_index` """
00120                 # we're going backwards the lines list
00121                 for line_index in  xrange(line_index -1, -1, -1):
00122                         line = lines[line_index]
00123 
00124                         if test_condition(line):
00125                                 return line
00126                 raise ValueError
00127 
00128 
00129         def findLineAfter(self, line_index, lines, test_condition, return_index = False):
00130                 """ finds a line satisfying the `test_condition` comming after the `line_index` """
00131                 # we're going forward the lines list
00132                 for line_index in xrange(line_index + 1, len(lines)):
00133                         line = lines[line_index]
00134 
00135                         if test_condition(line):        
00136                                 if return_index:
00137                                         return line_index
00138                                 return line
00139 
00140         def firstTimeStampBefore(self, line_index, lines):
00141                 """ returns the first timestamp BEFORE the line with given index """
00142 
00143                 return self.findLineBefore(line_index, lines, test_condition = self.isTimeStamp)
00144 
00145         def firstTimeStampAfter(self, line_index, lines):
00146                 """ returns the first timestamp AFTER the line with given index """
00147 
00148                 return self.findLineAfter(line_index, lines, test_condition = self.isTimeStamp)
00149 
00150         def handleParsingError(self, message):
00151                 if self._DEBUG:
00152                         raise ValueError, message
00153                 print " ======== AND ERROR WHILE PARSING METADATA ===="
00154                 print message
00155                 print " =============== end ========================= "
00156 
00157         #IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind
00158         #TODO: divide the input using separators
00159 
00160         """ reads the input cmsPerfsuite.log file  """
00161         def readInput(self, path, fileName = "cmsPerfSuite.log"):
00162                 try:
00163                         f = open(os.path.join(path, fileName), "r")
00164                         lines =  [s.strip() for s in f.readlines()]
00165                         f.close()
00166                 except IOError:
00167                         lines = []
00168 
00169                 #print self._lines
00170                 return lines
00171 
00172 
00173 
00174 
00175         def getMachineInfo(self):
00176                 """ Returns the cpu and memory info  """
00177 
00178                 """ cpu info """
00179 
00180                 """
00181                 we assume that:
00182                  * num_cores = max(core id+1) [it's counted from 0]
00183                  * 'model name' is processor type [we will return only the first one - we assume others to be same!!??
00184                  * cpu MHz - is the speed of CPU
00185                 """
00186                 #TODO: BUT cpu MHz show not the maximum speed but current, 
00187                 """
00188                 for 
00189                         model name      : Intel(R) Core(TM)2 Duo CPU     L9400  @ 1.86GHz
00190                         cpu MHz         : 800.000
00191                         cache size      : 6144 KB
00192                 """
00193                 cpu_result = {}
00194                 try:
00195                         f= open(os.path.join(self._path, "cpuinfo"), "r")
00196 
00197                         #we split data into a list of tuples = [(attr_name, attr_value), ...]
00198                         cpu_attributes = [l.strip().split(":") for l in f.readlines()]
00199                         #print cpu_attributes
00200                         f.close()
00201                         cpu_result = {
00202                                 "num_cores": max ([int(attr[1].strip())+1 for attr in cpu_attributes if attr[0].strip() == "processor"]), #Bug... Vidmantas used "core id"
00203                                 "cpu_speed_MHZ": max ([attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "cpu MHz"]),
00204                                 "cpu_cache_size": [attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "cache size"][0],
00205                                 "cpu_model_name": [attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "model name"][0]
00206                         }
00207                 except IOError,e:
00208                         print e
00209 
00210                 
00211                 
00212 
00213 
00214                 """ memory info """
00215                 mem_result = {}
00216 
00217                 try:
00218                         f= open(os.path.join(self._path, "meminfo"), "r")
00219 
00220                         #we split data into a list of tuples = [(attr_name, attr_value), ...]
00221                         mem_attributes = [l.strip().split(":") for l in f.readlines()]
00222 
00223                         mem_result = {
00224                                 "memory_total_ram": [attr[1].strip() for attr in mem_attributes if attr[0].strip() == "MemTotal"][0]
00225                         }
00226 
00227                 except IOError,e:
00228                         print e
00229         
00230                 cpu_result.update(mem_result)
00231                 return cpu_result
00232 
00233 
00234         
00235         def _applyParsingRules(self, parsing_rules, lines):
00236                 """ 
00237                         Applies the (provided) regular expression rules (=rule[1] for rule in parsing_rules)
00238                         to each line and if it matches the line,
00239                         puts the mached information to the dictionary as the specified keys (=rule[0]) which is later returned
00240                         Rule[3] contains whether the field is required to be found. If so and it isn't found the exception would be raised.
00241                         rules = [
00242                           ( (field_name_1_to_match, field_name_2), regular expression, /optionaly: is the field required? if so "req"/ )
00243                         ]
00244                  """
00245                 """ we call a shared parsing helper """
00246                 #parsing_rules = map(parsingRulesHelper.rulesRegexpCompileFunction, parsing_rules)
00247                 #print parsing_rules
00248                 (info, missing_fields) = parsingRulesHelper.rulesParser(parsing_rules, lines, compileRules = True)
00249 
00250                 self.missing_fields.extend(missing_fields)
00251 
00252                 return info
00253 
00254 
00255         def parseGeneralInfo(self):
00256                 lines = self.lines_general
00257                 """ we define a simple list (tuple) of rules for parsing, the first part tuple defines the parameters to be fetched from the
00258                         regexp while the second one is the regexp itself """
00259                 #TIP: don't forget that tuple of one ends with ,
00260                 parsing_rules = (
00261                         (("", "num_cores", "run_on_cpus"), r"""^This machine \((.+)\) is assumed to have (\d+) cores, and the suite will be run on cpu \[(.+)\]$"""),
00262                         (("start_time", "host", "local_workdir", "user"), r"""^Performance Suite started running at (.+) on (.+) in directory (.+), run by user (.+)$""", "req"),
00263                         (("architecture",) ,r"""^Current Architecture is (.+)$"""),
00264                         (("test_release_based_on",), r"""^Test Release based on: (.+)$""", "req"),
00265                         (("base_release_path",) , r"""^Base Release in: (.+)$"""),
00266                         (("test_release_local_path",) , r"""^Your Test release in: (.+)$"""),
00267 
00268                         (("castor_dir",) , r"""^The performance suite results tarball will be stored in CASTOR at (.+)$"""),
00269                         
00270                         (("TimeSize_events",) , r"""^(\d+) TimeSize events$"""),
00271                         (("IgProf_events",) , r"""^(\d+) IgProf events$"""),
00272                         (("CallGrind_events",) , r"""^(\d+) Callgrind events$"""),
00273                         (("Memcheck_events",) , r"""^(\d+) Memcheck events$"""), 
00274 
00275                         (("candles_TimeSize",) , r"""^TimeSizeCandles \[(.*)\]$"""),
00276                         (("candles_TimeSizePU",) , r"""^TimeSizePUCandles \[(.*)\]$"""),
00277                         
00278                         (("candles_Memcheck",) , r"""^MemcheckCandles \[(.*)\]$"""),
00279                         (("candles_MemcheckPU",) , r"""^MemcheckPUCandles \[(.*)\]$"""),
00280 
00281                         (("candles_Callgrind",) , r"""^CallgrindCandles \[(.*)\]$"""),
00282                         (("candles_CallgrindPU",) , r"""^CallgrindPUCandles \[(.*)\]$"""),
00283 
00284                         (("candles_IgProfPU",) , r"""^IgProfPUCandles \[(.*)\]$"""),
00285                         (("candles_IgProf",) , r"""^IgProfCandles \[(.*)\]$"""),
00286 
00287 
00288                         (("cmsScimark_before",) , r"""^(\d+) cmsScimark benchmarks before starting the tests$"""),
00289                         (("cmsScimark_after",) , r"""^(\d+) cmsScimarkLarge benchmarks before starting the tests$"""),
00290                         (("cmsDriverOptions",) , r"""^Running cmsDriver.py with user defined options: --cmsdriver="(.+)"$"""),
00291 
00292                         (("HEPSPEC06_SCORE",) ,r"""^This machine's HEPSPEC06 score is: (.+)$"""),
00293 
00294 
00295                 )
00296                 """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
00297                 info = self._applyParsingRules(parsing_rules, lines)
00298 
00299 
00300                 """ postprocess the candles list """
00301                 candles = {}
00302                 for field, value in info.items():
00303                         if field.startswith("candles_"):
00304                                 test = field.replace("candles_", "")
00305                                 value = [v.strip(" '") for v in value.split(",")]
00306                                 #if value:
00307                                 candles[test]=value
00308                                 del info[field]
00309                 #print candles
00310                 info["candles"] = self._LINE_SEPARATOR.join([k+":"+",".join(v) for (k, v) in candles.items()])
00311 
00312 
00313                 """ TAGS """
00314                 """ 
00315                 --- Tag ---    --- RelTag --- -------- Package --------                        
00316                 HEAD           V05-03-06      IgTools/IgProf                                   
00317                 V01-06-05      V01-06-04      Validation/Performance                           
00318                 ---------------------------------------
00319                 total packages: 2 (2 displayed)
00320                 """
00321                 tags_start_index = -1 # set some default
00322                 try:
00323                         tags_start_index = [i for i in xrange(0, len(lines)) if lines[i].startswith("--- Tag ---")][0]
00324                 except:
00325                         pass
00326                 if tags_start_index > -1:
00327                         tags_end_index = [i for i in xrange(tags_start_index + 1, len(lines)) if lines[i].startswith("---------------------------------------")][0]
00328                         # print "tags start index: %s, end index: %s" % (tags_start_index, tags_end_index)
00329                         tags = lines[tags_start_index:tags_end_index+2]
00330                         # print [tag.split("  ") for tag in tags]
00331                         # print "\n".join(tags)
00332                 else: # no tags found, make an empty list ...
00333                         tags = []
00334                 """ we join the tags with separator to store as simple string """
00335                 info["tags"] = self._LINE_SEPARATOR.join(tags)
00336                 #FILES/PATHS
00337         
00338 
00339                 """ get the command line """
00340                 try:
00341                         cmd_index = self.findFirstIndex_ofStartsWith(lines, "Performance suite invoked with command line:") + 1 #that's the next line
00342                         info["command_line"] =  lines[cmd_index]
00343                 except IndexError, e:
00344                         if self._DEBUG:
00345                                 print e
00346                         info["command_line"] =  ""
00347                 
00348                 try:
00349                         cmd_parsed_start = self.findFirstIndex_ofStartsWith(lines, "Initial PerfSuite Arguments:") + 1
00350                         cmd_parsed_end = self.findFirstIndex_ofStartsWith(lines, "Running cmsDriver.py")
00351                         info["command_line_parsed"] = self._LINE_SEPARATOR.join(lines[cmd_parsed_start:cmd_parsed_end])
00352                 except IndexError, e:
00353                         if self._DEBUG:
00354                                 print e
00355                         info["command_line"] =  ""
00356 
00357                 return  info
00358 
00359         
00360         def parseAllOtherTests(self):
00361                 threads = {}
00362                 tests = {
00363                         #"IgProf_Perf": {}, "IgProf_Mem": {}, "Memcheck": {},   "Callgrind": {},
00364                 }
00365 
00366                 lines = self.lines_other
00367                 """
00368 
00369                 for each of IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind tests we have such a structure of input file:
00370                 * beginning ->> and start timestamp- the firstone:
00371                         Adding thread <simpleGenReportThread(Thread-1, started)> to the list of active threads
00372                         Launching the Memcheck tests on cpu 3 with 5 events each
00373                         Fri Aug 14 01:16:03 2009
00374 
00375                         <... whatever might be here, might overlap with other test start/end messages ..>
00376 
00377                         Fri Aug 14 02:13:18 2009
00378                         Memcheck test, in thread <simpleGenReportThread(Thread-1, stopped)> is done running on core 3
00379                 * ending - the last timestamp "before is done running ...."
00380                 """
00381                 # we take the first TimeStamp after the starting message and the first before the finishing message
00382 
00383         
00384                 #TODO: if threads would be changed it would stop working!!!
00385 
00386                 # i.e. Memcheck, cpu, events
00387                 reStart = re.compile(r"""^Launching the (.*) tests on cpu (\d+) with (\d+) events each$""")
00388                 # i.e. Memcheck, thread name,core number
00389                 reEnd = re.compile(r"""^(.*) test, in thread <simpleGenReportThread\((.+), stopped\)> is done running on core (\d+)$""")
00390                 
00391                 #i.e. thread = Thread-1
00392                 reAddThread =  re.compile(r"""^Adding thread <simpleGenReportThread\((.+), started\)> to the list of active threads$""")
00393 
00394                 reExitCode = re.compile(r"""Individual cmsRelvalreport.py ExitCode (\d+)""")
00395                 """ we search for lines being either: (it's a little pascal'ish but we need the index!) """
00396                 for line_index in xrange(0, len(lines)):
00397                         line = lines[line_index]
00398 
00399                         # * starting of test
00400                         if reStart.match(line):
00401                                 #print reStart.match(line).groups()
00402                                 testName, testCore, testEventsNum = reStart.match(line).groups()
00403 
00404                                 time = self.firstTimeStampAfter(line_index, lines)
00405 
00406                                 #find the name of Thread: it's one of the lines before
00407                                 line_thread = self.findLineBefore(line_index, lines, test_condition=lambda l: reAddThread.match(l))
00408                                 (thread_id, ) =  reAddThread.match(line_thread).groups()
00409                                 
00410                                 #we add it to the list of threads as we DO NOT KNOW EXACT NAME OF TEST
00411                                 if not threads.has_key(thread_id):
00412                                         threads[thread_id] = {}
00413                                 # this way we would get an Exception in case of unknown test name! 
00414                                 threads[thread_id].update({"name": testName, "events_num": testEventsNum, "core": testCore, "start": time, "thread_id": thread_id})
00415 
00416                         # * or end of test
00417                         if reEnd.match(line):
00418                                 testName, thread_id, testCore = reEnd.match(line).groups()
00419                                 if not threads.has_key(testName):
00420                                         threads[thread_id] = {}
00421                                 #TODO: we get an exception if we found non existing
00422 
00423                                 time = self.firstTimeStampBefore(line_index, lines)
00424                                 try:
00425                                         exit_code = ""
00426                                         #we search for the exit code
00427                                         line_exitcode = self.findLineBefore(line_index, lines, test_condition=lambda l: reExitCode.match(l))
00428                                         exit_code, = reExitCode.match(line_exitcode).groups()
00429                                 except Exception, e:
00430                                         print "Error while getting exit code (Other test): %s" + str(e)
00431                                         
00432 
00433                                 # this way we would get an Exception in case of unknown test name! So we would be warned if the format have changed
00434                                 threads[thread_id].update({"end": time, "exit_code":exit_code})
00435                         for key, thread in threads.items():
00436                                 tests[thread["name"]] = thread
00437                 return tests
00438 
00439 
00440         def parseTimeSize(self):
00441                 """ parses the timeSize """
00442                 timesize_result = []
00443 
00444                 # TODO: we will use the first timestamp after the "or these tests will use user input file..."
00445                 #TODO: do we have to save the name of input file somewhere?
00446                 """
00447                 the structure of input file:
00448                 * beginning ->> and start timestamp- the firstone:              
00449                         >>> [optional:For these tests will use user input file /build/RAWReference/MinBias_RAW_320_IDEAL.root]
00450                         <...>
00451                         Using user-specified cmsDriver.py options: --conditions FrontierConditions_GlobalTag,MC_31X_V4::All --eventcontent RECOSIM
00452                         Candle MinBias will be PROCESSED
00453                         You defined your own steps to run:
00454                         RAW2DIGI-RECO
00455                         *Candle MinBias
00456                         Written out cmsRelvalreport.py input file at:
00457                         /build/relval/CMSSW_3_2_4/workStep2/MinBias_TimeSize/SimulationCandles_CMSSW_3_2_4.txt
00458                         Thu Aug 13 14:53:37 2009 [start]
00459                         <....>
00460                         Thu Aug 13 16:04:48 2009 [end]
00461                         Individual cmsRelvalreport.py ExitCode 0
00462                 * ending - the last timestamp "... ExitCode ...."
00463                 """
00464                 #TODO: do we need the cmsDriver --conditions? I suppose it would the global per work directory = 1 perfsuite run (so samefor all candles in one work dir)
00465                 # TODO: which candle definition to use?
00466                 """ divide into separate jobs """
00467                 lines = self.lines_timesize
00468                 jobs = []
00469                 start = False
00470                 timesize_start_indicator = re.compile(r"""^taskset -c (\d+) cmsRelvalreportInput.py""")
00471                 for line_index in xrange(0, len(lines)):
00472                         line = lines[line_index]
00473                         # search for start of each TimeSize job (with a certain candle and step)
00474                         if timesize_start_indicator.match(line):
00475                                 if start:
00476                                         jobs.append(lines[start:line_index])
00477                                 start = line_index
00478                 #add the last one
00479                 jobs.append(lines[start:len(lines)])
00480                 #print "\n".join(str(i) for i in jobs)
00481 
00482                 parsing_rules = (
00483                         (("", "candle", ), r"""^(Candle|ONLY) (.+) will be PROCESSED$""", "req"),
00484                         #e.g.: --conditions FrontierConditions_GlobalTag,MC_31X_V4::All --eventcontent RECOSIM
00485                         (("cms_driver_options", ), r"""^Using user-specified cmsDriver.py options: (.+)$"""),
00486                         (("", "conditions", ""), r"""^Using user-specified cmsDriver.py options: (.*)--conditions ([^\s]+)(.*)$""", "req"),
00487                         # for this we cannot guarrantee that it has been found, TODO: we might count the number of pileup candles and compare with arguments
00488                         (("",  "pileup_type", ""), r"""^Using user-specified cmsDriver.py options:(.*)--pileup=([^\s]+)(.*)$"""),
00489                         #not shure if event content is required
00490                         (("",  "event_content", ""), r"""^Using user-specified cmsDriver.py options:(.*)--eventcontent ([^\s]+)(.*)$""", "req"),
00491                         #TODO: after changeing the splitter to "taskset -c ..." this is no longer included into the part of correct job
00492                         #(("input_user_root_file", ), r"""^For these tests will use user input file (.+)$"""),
00493                 )
00494 
00495                 #parse each of the TimeSize jobs: find candles, etc and start-end times
00496 
00497                 reExit_code = re.compile(r"""Individual ([^\s]+) ExitCode (\d+)""")
00498 
00499                 if self._DEBUG:
00500                         print "TimeSize (%d) jobs: %s" % (len(jobs), str(jobs))
00501 
00502                 for job_lines in jobs:
00503                         """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
00504                         info = self._applyParsingRules(parsing_rules, job_lines)
00505                         #Fixing here the compatibility with new cmsdriver.py --conditions option (for which now we have autoconditions and FrontierConditions_GlobalTag is optional):
00506                         if 'auto:' in info['conditions']:
00507                                 from Configuration.PyReleaseValidation.autoCond import autoCond
00508                                 info['conditions'] = autoCond[ info['conditions'].split(':')[1] ].split("::")[0]
00509                         else:
00510                                 if 'FrontierConditions_GlobalTag' in info['conditions']:
00511                                         info['conditions']=info['conditions'].split(",")[1]
00512                                                                                                                                 
00513                         #DEBUG:
00514                         #print "CONDITIONS are: %s"%info['conditions']
00515                         #start time - the index after which comes the time stamp
00516                         """ the following is not available on one of the releases, instead
00517                         use the first timestamp available on our job - that's the starting time :) """ 
00518                         
00519                         #start_time_after = self.findFirstIndex_ofStartsWith(job_lines, "Written out cmsRelvalreport.py input file at:")
00520                         #print start_time_after
00521                         info["start"] = self.firstTimeStampAfter(0, job_lines)
00522 
00523                         #TODO: improve in future (in case of some changes) we could use findBefore instead which uses the regexp as parameter for searching 
00524                         #end time - the index before which comes the time stamp
00525 
00526                         # On older files we have - "Individual Relvalreport.py ExitCode 0" instead of "Individual cmsRelvalreport.py ExitCode"
00527                         end_time_before = self.findLineAfter(0, job_lines, test_condition = reExit_code.match, return_index = True)
00528 
00529                         # on the same line we have the exit Code - so let's get it
00530                         nothing, exit_code = reExit_code.match(job_lines[end_time_before]).groups()
00531 
00532                         info["end"] = self.firstTimeStampBefore(end_time_before, job_lines)
00533                         info["exit_code"] = exit_code
00534 
00535                         steps_start = self.findFirstIndex_ofStartsWith(job_lines, "You defined your own steps to run:")
00536                         steps_end = self.findFirstIndex_ofStartsWith(job_lines, "*Candle ")
00537                         #probably it includes steps until we found *Candle... ?
00538                         steps = job_lines[steps_start + 1:steps_end]
00539                         if not self.validateSteps(steps):
00540                                 self.handleParsingError( "Steps were not found corrently: %s for current job: %s" % (str(steps), str(job_lines)))
00541                                 
00542                                 """ quite nasty - just a work around """
00543                                 print "Trying to recover from this error in case of old cmssw"
00544                                 
00545                                 """ we assume that steps are between the following sentance and a TimeStamp """
00546                                 steps_start = self.findFirstIndex_ofStartsWith(job_lines, "Steps passed to writeCommands")
00547                                 steps_end = self.findLineAfter(steps_start, job_lines, test_condition = self.isTimeStamp, return_index = True)
00548                                 
00549                                 steps = job_lines[steps_start + 1:steps_end]
00550                                 if not self.validateSteps(steps):
00551                                         self.handleParsingError( "EVEN AFTER RECOVERY Steps were not found corrently! : %s for current job: %s" % (str(steps), str(job_lines)))
00552                                 else:
00553                                         print "RECOVERY SEEMS to be successful: %s" % str(steps)
00554 
00555                         info["steps"] = self._LINE_SEPARATOR.join(steps) #!!!! STEPS MIGHT CONTAIN COMMA: ","
00556                         
00557 
00558                         timesize_result.append(info)
00559                 return {"TimeSize": timesize_result}
00560         #TODO:
00561         
00562 
00563 
00564         def readCmsScimarkTest(self, testName, testType, core):
00565                 lines  = self.readInput(self._path, fileName = testName + ".log")
00566                 scores = [{"score": self.reCmsScimarkTest.match(line).groups()[1], "type": testType, "core": core}
00567                                 for line in lines 
00568                                 if self.reCmsScimarkTest.match(line)]
00569                 #add the number of messurment
00570                 i = 0
00571                 for score in scores:
00572                         i += 1
00573                         score.update({"messurement_number": i})
00574                 return scores
00575                 
00576         def readCmsScimark(self, main_cores = [1]):
00577                 main_core = main_cores[0]
00578                 #TODO: WE DO NOT ALWAYS REALLY KNOW THE MAIN CORE NUMBER! but we don't care too much
00579                 #we parse each of the SciMark files and the Composite scores
00580                 csimark = []
00581                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark2", testType = "mainCore", core = main_core))
00582                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark2_large", testType = "mainCore_Large", core = main_core))
00583 
00584 
00585                 #we not always know the number of cores available so we will just search the directory to find out core numbers
00586                 reIsCsiMark_notusedcore = re.compile("^cmsScimark_(\d+).log$")
00587                 scimark_files = [reIsCsiMark_notusedcore.match(f).groups()[0]
00588                                 for f in os.listdir(self._path)
00589                                  if reIsCsiMark_notusedcore.match(f) 
00590                                         and os.path.isfile(os.path.join(self._path, f)) ]
00591 
00592                 for core_number in scimark_files:
00593                         try:
00594                                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark_%s" % str(core_number), testType = "NotUsedCore_%s" %str(core_number), core = core_number))
00595                         except IOError, e:
00596                                 if self._DEBUG:
00597                                         print e
00598                 return csimark
00599                 #print csimark
00600 
00601         #get IgProf summary information from the sql3 files
00602         def getIgSummary(self):
00603                 igresult = []
00604                 globbed = glob.glob(os.path.join(self._path, "../*/IgProfData/*/*/*.sql3"))
00605 
00606                 for f in globbed:
00607                         #print f
00608                         profileInfo = self.getSummaryInfo(f)
00609                         if not profileInfo:
00610                                 continue
00611                         cumCounts, cumCalls = profileInfo
00612                         dump, architecture, release, rest = f.rsplit("/", 3)
00613                         candle, sequence, pileup, conditions, process, counterType, events = rest.split("___")
00614                         events = events.replace(".sql3", "")
00615                         igresult.append({"counter_type": counterType, "event": events, "cumcounts": cumCounts, "cumcalls": cumCalls})
00616 
00617                 return igresult 
00618 
00619         def getSummaryInfo(self, database):
00620                 summary_query="""SELECT counter, total_count, total_freq, tick_period
00621                                  FROM summary;"""
00622                 error, output = self.doQuery(summary_query, database)
00623                 if error or not output or output.count("\n") > 1:
00624                         return None
00625                 counter, total_count, total_freq, tick_period = output.split("@@@")
00626                 if counter == "PERF_TICKS":
00627                         return float(tick_period) * float(total_count), int(total_freq)
00628                 else:
00629                         return int(total_count), int(total_freq)
00630 
00631         def doQuery(self, query, database):
00632                 if os.path.exists("/usr/bin/sqlite3"):
00633                         sqlite="/usr/bin/sqlite3"
00634                 else:
00635                         sqlite="/afs/cern.ch/user/e/eulisse/www/bin/sqlite"
00636                 return getstatusoutput("echo '%s' | %s -separator @@@ %s" % (query, sqlite, database))
00637                     
00638         def parseTheCompletion(self):
00639                 """
00640                  checks if the suite has successfully finished  
00641                         and if the tarball was successfully archived and uploaded to the castor """
00642 
00643                 parsing_rules = (
00644                         (("finishing_time", "", ""), r"""^Performance Suite finished running at (.+) on (.+) in directory (.+)$"""),
00645                         (("castor_md5",) , r"""^The md5 checksum of the tarball: (.+)$"""),     
00646                         (("successfully_archived_tarball", ), r"""^Successfully archived the tarball (.+) in CASTOR!$"""),
00647                         #TODO: WE MUST HAVE THE CASTOR URL, but for some of files it's not included [probably crashed]
00648                         (("castor_file_url",), r"""^The tarball can be found: (.+)$"""),                        
00649                         (("castor_logfile_url",), r"""^The logfile can be found: (.+)$"""),
00650                 )
00651 
00652                 
00653                 """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
00654                 info = self._applyParsingRules(parsing_rules, self.lines_other)
00655 
00656                 """ did we detect any errors in log files ? """
00657                 info["no_errors_detected"] = [line for line in self.lines_other if line == "There were no errors detected in any of the log files!"] and "1" or "0"
00658                 if not info["successfully_archived_tarball"]:
00659                         info["castor_file_url"] = ""
00660 
00661                 if not info["castor_file_url"]:
00662                         #TODO: get the castor file url or abort
00663                         self.handleParsingError( "Castor tarball URL not found. Trying to get from environment")
00664                         lmdb_castor_url_is_valid = lambda url: url.startswith("/castor/")
00665 
00666                         url = ""
00667                         try:
00668                                 print "HERE!"
00669                                 url=self.get_tarball_fromlog()
00670                                 print "Extracted castor tarball full path by re-parsing cmsPerfSuite.log: %s"%url
00671                                 
00672                         except:
00673                                 if os.environ.has_key("PERFDB_CASTOR_FILE_URL"):
00674                                         url = os.environ["PERFDB_CASTOR_FILE_URL"]
00675                                         
00676                                 else: #FIXME: add the possibility to get it directly from the cmsPerfSuite.log file (make sure it is dumped there before doing the tarball itself...)
00677                                         print "Failed to get the tarball location from environment variable PERFDB_CASTOR_FILE_URL" 
00678                                         self.handleParsingError( "Castor tarball URL not found. Provide interactively")
00679 
00680                         while True:
00681                                 
00682                                 if lmdb_castor_url_is_valid(url):
00683                                         info["castor_file_url"] = url
00684                                         break
00685                                 print "Please enter a valid CASTOR url: has to start with /castor/ and should point to the tarball"
00686                                 url = sys.stdin.readline()
00687 
00688 
00689                 return info
00690         def get_tarball_fromlog(self):
00691                 '''Return the tarball castor location by parsing the cmsPerfSuite.log file'''
00692                 print "Getting the url from the cmsPerfSuite.log"
00693                 log=open("cmsPerfSuite.log","r")
00694                 castor_dir="UNKNOWN_CASTOR_DIR"
00695                 tarball="UNKNOWN_TARBALL"
00696                 for line in log.readlines():
00697                         if 'castordir' in line:
00698                                 castor_dir=line.split()[1]
00699                         if 'tgz' in line and tarball=="UNKNOWN_TARBALL": #Pick the first line that contains the tar command...
00700                                 if 'tar' in line:
00701                                         tarball=os.path.basename(line.split()[2])
00702                 castor_tarball=os.path.join(castor_dir,tarball)
00703                 return castor_tarball
00704 
00705         def parseAll(self):
00706                 result = {"General": {}, "TestResults":{}, "cmsSciMark":{}, "IgSummary":{}, 'unrecognized_jobs': []}
00707 
00708                 """ all the general info - start, arguments, host etc """
00709                 result["General"].update(self.parseGeneralInfo())
00710 
00711                 """ machine info - cpu, memmory """
00712                 result["General"].update(self.getMachineInfo())
00713 
00714                 """ we add info about how successfull was the run, when it finished and final castor url to the file! """
00715                 result["General"].update(self.parseTheCompletion())
00716 
00717                 try:
00718                         result["TestResults"].update(self.parseTimeSize())
00719                 except Exception, e:
00720                         print "BAD BAD BAD UNHANDLED ERROR" + str(e)
00721 
00722 
00723                 #TODO:
00724                 #Check what Vidmantas was doing in the parseAllOtherTests, de facto it is not used now, so commenting it for now (to avoid the "BAD BAD BAD...."
00725                 #try:
00726                 #       result["TestResults"].update(self.parseAllOtherTests())
00727                 #except Exception, e:
00728                 #       print "BAD BAD BAD UNHANDLED ERROR" + str(e)
00729 
00730 
00731                 main_cores = [result["General"]["run_on_cpus"]]
00732                 num_cores = result["General"].get("num_cores", 0)
00733                 #DEBUG
00734                 #print "Number of cores was: %s"%num_cores
00735                 #TODO: temporarly - search for cores, use regexp
00736                 main_cores = [1]
00737 
00738                 # THE MAHCINE SCIMARKS
00739                 result["cmsSciMark"] = self.readCmsScimark(main_cores = main_cores)
00740                 result["IgSummary"] = self.getIgSummary()
00741                 
00742 
00743 
00744                 if self.missing_fields:
00745                         self.handleParsingError("========== SOME REQUIRED FIELDS WERE NOT FOUND DURING PARSING ======= "+ str(self.missing_fields))
00746 
00747                 return result
00748                 
00749                 
00750 
if __name__ == "__main__":
        from xml.dom import minidom
        import cmssw_exportdb_xml
        import doctest

        #parse the perfsuite metadata found in the current working directory
        path = os.path.abspath(".") #Better to point to the local dir than to some old Vidmantas' laptop dirs ;)
        metadata_parser = parserPerfsuiteMetadata(path)
        run_info = metadata_parser.parseAll()

        #export the collected run info as XML (printed to stdout)
        xml_doc = minidom.Document()
        cmssw_exportdb_xml.exportRunInfo(xml_doc, run_info, print_out = True)

        #run the module's doctests as a sanity check
        doctest.testmod()
00777