CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_0/src/Validation/Performance/python/parserPerfsuiteMetadata.py

Go to the documentation of this file.
00001 import re
00002 import os, sys
00003 import time
00004 import parsingRulesHelper
00005 import glob
00006 from commands import getstatusoutput
00007 
00008 class parserPerfsuiteMetadata:
    """
    The whole parsing works as follows. We split the file into 3 parts (we keep 3 variables of line lists: self.lines_general, self.lines_timesize, self.lines_other):

        * General info
    As most of the info items are simple one-line strings, we define regular expressions matching each of those lines, each associated with the data we can get from it. E.g. ^Suite started at (.+) on (.+) by user (.+)$ matches only the line giving the suite start time and machine; it is associated with the tuple of field names for the general info which will be filled in. This way we get info = {'start_time': start-taken-from-regexp, 'host': host, 'user': user}. This is done by calling the simple function _applyParsingRules, which checks each line against each rule and, whenever one matches, fills the result dictionary accordingly.
    Additionally we get the cpu and memory info from /proc/cpuinfo and /proc/meminfo.

        * TimeSize test
    We use much the same technique, but first we divide the TimeSize lines by job (an individual run of cmssw - per candle, and pileup/not). Then for each of the jobs we apply our parsing rules, and we also find the starting and ending times (i.e. we know that the start timestamp is somewhere after a certain line containing "Written out cmsRelvalreport.py input file at:").

        * All other tests
    We find the statement that the test is being launched (containing the test name, core and number of events). Above it we have the thread number, and below it the starting time.
    The ending time can ONLY be connected with the starting time via the Thread-ID. The problem is that the log names the same test instance differently, e.g. <Launching "PILE UP Memcheck"> versus <"Memcheck" stopped>.
    """
    _LINE_SEPARATOR = "|" # separator used when joining list-valued fields (tags, candles, steps) into a single string
00024         def validateSteps(self, steps):
00025                 """ Simple function for error detection. TODO: we could use a list of possible steps also """
00026                 return not (not steps or len(steps) > self._MAX_STEPS)
00027 
00028         def __init__(self, path):
00029                 
00030                 self._MAX_STEPS  = 5 # MAXIMUM NUMBER OF STEPS PER RUN (taskset relvalreport.py...)
00031                 self._DEBUG = False
00032 
00033 
00034                 self._path = path
00035                 
00036                 """ some initialisation to speedup the other functions """
00037                 #for cmsscimark
00038                 self.reCmsScimarkTest = re.compile(r"""^Composite Score:(\s*)([^\s]+)$""")
00039 
00040                 #TimeSize
00041                 """ the separator for beginning of timeSize / end of general statistics """
00042                 self._timeSizeStart = re.compile(r"""^Launching the TimeSize tests \(TimingReport, TimeReport, SimpleMemoryCheck, EdmSize\) with (\d+) events each$""")
00043                 """ (the first timestamp is the start of TimeSize) """
00044 
00045 
00046                 """ the separator for end of timeSize / beginning of IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind tests """
00047                 self._timeSizeEnd = re.compile(r"""^Stopping all cmsScimark jobs now$""")
00048 
00049                 #Other tests:
00050                 self._otherStart = re.compile(r"^Preparing")
00051 
00052                 """ 
00053                 ----- READ THE DATA -----
00054                 """
00055                 lines = self.readInput(path)
00056                 """ split the whole file  into parts """
00057                 #Let's not assume there are ALWAYS TimeSize tests in the runs of the Performance Suite!:
00058                 #Check first:  
00059                 #FIXME: Vidmantas did not think to this case... will need to implement protectionb against it for all the IB tests...
00060                 #To do as soon as possible...
00061                 #Maybe revisit the strategy if it can be done quickly.
00062                 timesize_end= [lines.index(line)  for line in lines if self._timeSizeEnd.match(line)]
00063                 if timesize_end:
00064                         timesize_end_index = timesize_end[0]
00065                 else:
00066                         timesize_end_index=0
00067                 timesize_start=[lines.index(line) for line in lines if self._timeSizeStart.match(line)]
00068                 general_stop=[lines.index(line) for line in lines if self._otherStart.match(line)]
00069                 if timesize_start:
00070                         timesize_start_index = timesize_start[0]
00071                         general_stop_index = timesize_start_index
00072                 elif general_stop:
00073                         timesize_start_index=timesize_end_index+1
00074                         general_stop_index=general_stop[0]
00075                 else:
00076                         timesize_start_index=0
00077                         general_stop_index=-1
00078 
00079                 """ we split the structure:
00080                         * general
00081                         * timesize
00082                         * all others [igprof etc]
00083                 """
00084         
00085                 """ we get the indexes of spliting """
00086                 #Not OK to use timsize_start_index for the general lines... want to be general, also to cases of no TimeSize tests...
00087                 #self.lines_general = lines[:timesize_start_index]
00088                 self.lines_general = lines[:general_stop_index]
00089                 self.lines_timesize = lines[timesize_start_index:timesize_end_index+1]
00090                 self.lines_other = lines[timesize_end_index:]           
00091         
00092                 """ a list of missing fields """
00093                 self.missing_fields = []
00094 
00095         @staticmethod
00096         def isTimeStamp(line):
00097                 """
00098                 Returns whether the string is a timestamp (if not returns None)
00099 
00100                 >>> parserPerfsuiteMetadata.isTimeStamp("Fri Aug 14 01:16:03 2009")
00101                 True
00102                 >>> parserPerfsuiteMetadata.isTimeStamp("Fri Augx 14 01:16:03 2009")
00103 
00104                 """
00105                 datetime_format = "%a %b %d %H:%M:%S %Y" # we use default date format
00106                 try:
00107                         time.strptime(line, datetime_format)
00108                         return True
00109                 except ValueError:
00110                         return None
00111         
00112         @staticmethod
00113         def findFirstIndex_ofStartsWith(job_lines, start_of_line):
00114                 return [job_lines.index(line) 
00115                         for line in job_lines 
00116                         if line.startswith(start_of_line)][0]
00117         
00118         def findLineBefore(self, line_index, lines, test_condition):
00119                 """ finds a line satisfying the `test_condition` comming before the `line_index` """
00120                 # we're going backwards the lines list
00121                 for line_index in  xrange(line_index -1, -1, -1):
00122                         line = lines[line_index]
00123 
00124                         if test_condition(line):
00125                                 return line
00126                 raise ValueError
00127 
00128 
00129         def findLineAfter(self, line_index, lines, test_condition, return_index = False):
00130                 """ finds a line satisfying the `test_condition` comming after the `line_index` """
00131                 # we're going forward the lines list
00132                 for line_index in xrange(line_index + 1, len(lines)):
00133                         line = lines[line_index]
00134 
00135                         if test_condition(line):        
00136                                 if return_index:
00137                                         return line_index
00138                                 return line
00139 
00140         def firstTimeStampBefore(self, line_index, lines):
00141                 """ returns the first timestamp BEFORE the line with given index """
00142 
00143                 return self.findLineBefore(line_index, lines, test_condition = self.isTimeStamp)
00144 
00145         def firstTimeStampAfter(self, line_index, lines):
00146                 """ returns the first timestamp AFTER the line with given index """
00147 
00148                 return self.findLineAfter(line_index, lines, test_condition = self.isTimeStamp)
00149 
00150         def handleParsingError(self, message):
00151                 if self._DEBUG:
00152                         raise ValueError, message
00153                 print " ======== AND ERROR WHILE PARSING METADATA ===="
00154                 print message
00155                 print " =============== end ========================= "
00156 
00157         #IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind
00158         #TODO: divide the input using separators
00159 
00160         """ reads the input cmsPerfsuite.log file  """
00161         def readInput(self, path, fileName = "cmsPerfSuite.log"):
00162                 try:
00163                         f = open(os.path.join(path, fileName), "r")
00164                         lines =  [s.strip() for s in f.readlines()]
00165                         f.close()
00166                 except IOError:
00167                         lines = []
00168 
00169                 #print self._lines
00170                 return lines
00171 
00172 
00173 
00174 
00175         def getMachineInfo(self):
00176                 """ Returns the cpu and memory info  """
00177 
00178                 """ cpu info """
00179 
00180                 """
00181                 we assume that:
00182                  * num_cores = max(core id+1) [it's counted from 0]
00183                  * 'model name' is processor type [we will return only the first one - we assume others to be same!!??
00184                  * cpu MHz - is the speed of CPU
00185                 """
00186                 #TODO: BUT cpu MHz show not the maximum speed but current, 
00187                 """
00188                 for 
00189                         model name      : Intel(R) Core(TM)2 Duo CPU     L9400  @ 1.86GHz
00190                         cpu MHz         : 800.000
00191                         cache size      : 6144 KB
00192                 """
00193                 cpu_result = {}
00194                 try:
00195                         f= open(os.path.join(self._path, "cpuinfo"), "r")
00196 
00197                         #we split data into a list of tuples = [(attr_name, attr_value), ...]
00198                         cpu_attributes = [l.strip().split(":") for l in f.readlines()]
00199                         #print cpu_attributes
00200                         f.close()
00201                         cpu_result = {
00202                                 "num_cores": max ([int(attr[1].strip())+1 for attr in cpu_attributes if attr[0].strip() == "processor"]), #Bug... Vidmantas used "core id"
00203                                 "cpu_speed_MHZ": max ([attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "cpu MHz"]),
00204                                 "cpu_cache_size": [attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "cache size"][0],
00205                                 "cpu_model_name": [attr[1].strip() for attr in cpu_attributes if attr[0].strip() == "model name"][0]
00206                         }
00207                 except IOError,e:
00208                         print e
00209 
00210                 
00211                 
00212 
00213 
00214                 """ memory info """
00215                 mem_result = {}
00216 
00217                 try:
00218                         f= open(os.path.join(self._path, "meminfo"), "r")
00219 
00220                         #we split data into a list of tuples = [(attr_name, attr_value), ...]
00221                         mem_attributes = [l.strip().split(":") for l in f.readlines()]
00222 
00223                         mem_result = {
00224                                 "memory_total_ram": [attr[1].strip() for attr in mem_attributes if attr[0].strip() == "MemTotal"][0]
00225                         }
00226 
00227                 except IOError,e:
00228                         print e
00229         
00230                 cpu_result.update(mem_result)
00231                 return cpu_result
00232 
00233 
00234         
00235         def _applyParsingRules(self, parsing_rules, lines):
00236                 """ 
00237                         Applies the (provided) regular expression rules (=rule[1] for rule in parsing_rules)
00238                         to each line and if it matches the line,
00239                         puts the mached information to the dictionary as the specified keys (=rule[0]) which is later returned
00240                         Rule[3] contains whether the field is required to be found. If so and it isn't found the exception would be raised.
00241                         rules = [
00242                           ( (field_name_1_to_match, field_name_2), regular expression, /optionaly: is the field required? if so "req"/ )
00243                         ]
00244                  """
00245                 """ we call a shared parsing helper """
00246                 #parsing_rules = map(parsingRulesHelper.rulesRegexpCompileFunction, parsing_rules)
00247                 #print parsing_rules
00248                 (info, missing_fields) = parsingRulesHelper.rulesParser(parsing_rules, lines, compileRules = True)
00249 
00250                 self.missing_fields.extend(missing_fields)
00251 
00252                 return info
00253 
00254 
00255         def parseGeneralInfo(self):
00256                 lines = self.lines_general
00257                 """ we define a simple list (tuple) of rules for parsing, the first part tuple defines the parameters to be fetched from the
00258                         regexp while the second one is the regexp itself """
00259                 #TIP: don't forget that tuple of one ends with ,
00260                 parsing_rules = (
00261                         (("", "num_cores", "run_on_cpus"), r"""^This machine \((.+)\) is assumed to have (\d+) cores, and the suite will be run on cpu \[(.+)\]$"""),
00262                         (("start_time", "host", "local_workdir", "user"), r"""^Performance Suite started running at (.+) on (.+) in directory (.+), run by user (.+)$""", "req"),
00263                         (("architecture",) ,r"""^Current Architecture is (.+)$"""),
00264                         (("test_release_based_on",), r"""^Test Release based on: (.+)$""", "req"),
00265                         (("base_release_path",) , r"""^Base Release in: (.+)$"""),
00266                         (("test_release_local_path",) , r"""^Your Test release in: (.+)$"""),
00267 
00268                         (("castor_dir",) , r"""^The performance suite results tarball will be stored in CASTOR at (.+)$"""),
00269                         
00270                         (("TimeSize_events",) , r"""^(\d+) TimeSize events$"""),
00271                         (("IgProf_events",) , r"""^(\d+) IgProf events$"""),
00272                         (("CallGrind_events",) , r"""^(\d+) Callgrind events$"""),
00273                         (("Memcheck_events",) , r"""^(\d+) Memcheck events$"""), 
00274 
00275                         (("candles_TimeSize",) , r"""^TimeSizeCandles \[(.*)\]$"""),
00276                         (("candles_TimeSizePU",) , r"""^TimeSizePUCandles \[(.*)\]$"""),
00277                         
00278                         (("candles_Memcheck",) , r"""^MemcheckCandles \[(.*)\]$"""),
00279                         (("candles_MemcheckPU",) , r"""^MemcheckPUCandles \[(.*)\]$"""),
00280 
00281                         (("candles_Callgrind",) , r"""^CallgrindCandles \[(.*)\]$"""),
00282                         (("candles_CallgrindPU",) , r"""^CallgrindPUCandles \[(.*)\]$"""),
00283 
00284                         (("candles_IgProfPU",) , r"""^IgProfPUCandles \[(.*)\]$"""),
00285                         (("candles_IgProf",) , r"""^IgProfCandles \[(.*)\]$"""),
00286 
00287 
00288                         (("cmsScimark_before",) , r"""^(\d+) cmsScimark benchmarks before starting the tests$"""),
00289                         (("cmsScimark_after",) , r"""^(\d+) cmsScimarkLarge benchmarks before starting the tests$"""),
00290                         (("cmsDriverOptions",) , r"""^Running cmsDriver.py with user defined options: --cmsdriver="(.+)"$"""),
00291 
00292                         (("HEPSPEC06_SCORE",) ,r"""^This machine's HEPSPEC06 score is: (.+)$"""),
00293 
00294 
00295                 )
00296                 """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
00297                 info = self._applyParsingRules(parsing_rules, lines)
00298 
00299 
00300                 """ postprocess the candles list """
00301                 candles = {}
00302                 for field, value in info.items():
00303                         if field.startswith("candles_"):
00304                                 test = field.replace("candles_", "")
00305                                 value = [v.strip(" '") for v in value.split(",")]
00306                                 #if value:
00307                                 candles[test]=value
00308                                 del info[field]
00309                 #print candles
00310                 info["candles"] = self._LINE_SEPARATOR.join([k+":"+",".join(v) for (k, v) in candles.items()])
00311 
00312 
00313                 """ TAGS """
00314                 """ 
00315                 --- Tag ---    --- RelTag --- -------- Package --------                        
00316                 HEAD           V05-03-06      IgTools/IgProf                                   
00317                 V01-06-05      V01-06-04      Validation/Performance                           
00318                 ---------------------------------------
00319                 total packages: 2 (2 displayed)
00320                 """
00321                 tags_start_index = -1 # set some default
00322                 try:
00323                         tags_start_index = [i for i in xrange(0, len(lines)) if lines[i].startswith("--- Tag ---")][0]
00324                 except:
00325                         pass
00326                 if tags_start_index > -1:
00327                         tags_end_index = [i for i in xrange(tags_start_index + 1, len(lines)) if lines[i].startswith("---------------------------------------")][0]
00328                         # print "tags start index: %s, end index: %s" % (tags_start_index, tags_end_index)
00329                         tags = lines[tags_start_index:tags_end_index+2]
00330                         # print [tag.split("  ") for tag in tags]
00331                         # print "\n".join(tags)
00332                 else: # no tags found, make an empty list ...
00333                         tags = []
00334                 """ we join the tags with separator to store as simple string """
00335                 info["tags"] = self._LINE_SEPARATOR.join(tags)
00336                 #FILES/PATHS
00337         
00338 
00339                 """ get the command line """
00340                 try:
00341                         cmd_index = self.findFirstIndex_ofStartsWith(lines, "Performance suite invoked with command line:") + 1 #that's the next line
00342                         info["command_line"] =  lines[cmd_index]
00343                 except IndexError, e:
00344                         if self._DEBUG:
00345                                 print e
00346                         info["command_line"] =  ""
00347                 
00348                 try:
00349                         cmd_parsed_start = self.findFirstIndex_ofStartsWith(lines, "Initial PerfSuite Arguments:") + 1
00350                         cmd_parsed_end = self.findFirstIndex_ofStartsWith(lines, "Running cmsDriver.py")
00351                         info["command_line_parsed"] = self._LINE_SEPARATOR.join(lines[cmd_parsed_start:cmd_parsed_end])
00352                 except IndexError, e:
00353                         if self._DEBUG:
00354                                 print e
00355                         info["command_line"] =  ""
00356 
00357                 return  info
00358 
00359         
00360         def parseAllOtherTests(self):
00361                 #make it general, for whatever test comes...
00362                 test = {}
00363 
00364                 parsing_rules = (
00365                         (("", "candle", ), r"""^(Candle|ONLY) (.+) will be PROCESSED$""", "req"),
00366                         #e.g.: --conditions FrontierConditions_GlobalTag,MC_31X_V4::All --eventcontent RECOSIM
00367                         (("cms_driver_options", ), r"""^Using user-specified cmsDriver.py options: (.+)$"""),
00368                         (("", "conditions", ""), r"""^Using user-specified cmsDriver.py options: (.*)--conditions ([^\s]+)(.*)$""", "req"),
00369                         # for this we cannot guarrantee that it has been found, TODO: we might count the number of pileup candles and compare with arguments
00370                         (("",  "pileup_type", ""), r"""^Using user-specified cmsDriver.py options:(.*)--pileup=([^\s]+)(.*)$"""),
00371                         #not shure if event content is required
00372                         (("",  "event_content", ""), r"""^Using user-specified cmsDriver.py options:(.*)--eventcontent ([^\s]+)(.*)$""", "req"),
00373                         #TODO: after changeing the splitter to "taskset -c ..." this is no longer included into the part of correct job
00374                         #(("input_user_root_file", ), r"""^For these tests will use user input file (.+)$"""),
00375                 )
00376 
00377 
00378                 lines = self.lines_other
00379                 """
00380 
00381                 for each of IgProf_Perf, IgProf_Mem,  Memcheck, Callgrind tests we have such a structure of input file:
00382                 * beginning ->> and start timestamp- the firstone:
00383                         Launching the PILE UP IgProf_Mem tests on cpu 4 with 201 events each
00384                         Adding thread <simpleGenReportThread(Thread-1, started -176235632)> to the list of active threads
00385                         Mon Jun 14 20:06:54 2010
00386 
00387                         <... whatever might be here, might overlap with other test start/end messages ..>
00388 
00389                         Mon Jun 14 21:59:33 2010
00390                         IgProf_Mem test, in thread <simpleGenReportThread(Thread-1, stopped -176235632)> is done running on core 4
00391 
00392                 * ending - the last timestamp "before is done running ...."
00393                 """
00394                 # we take the first TimeStamp after the starting message and the first before the finishing message in 2 rounds..
00395         
00396                 #TODO: if threads would be changed it would stop working!!!
00397 
00398                 # i.e. Memcheck, cpu, events
00399                 reSubmit = re.compile(r"""^Let's submit (.+) test on core (\d+)$""")
00400                 
00401                 reStart = re.compile(r"""^Launching the (PILE UP |)(.*) tests on cpu (\d+) with (\d+) events each$""")
00402 
00403                 # i.e. Memcheck, thread name,id,core number
00404                 reEnd = re.compile(r"""^(.*) test, in thread <simpleGenReportThread\((.+), stopped -(\d+)\)> is done running on core (\d+)$""")
00405                 
00406                 reAddThread =  re.compile(r"""^Adding thread <simpleGenReportThread\((.+), started -(\d+)\)> to the list of active threads$""")
00407 
00408                 reWaiting = re.compile(r"""^Waiting for tests to be done...$""")
00409 
00410                 reExitCode = re.compile(r"""Individual cmsRelvalreport.py ExitCode (\d+)""")
00411                 """ we search for lines being either: (it's a little pascal'ish but we need the index!) """
00412 
00413                 jobs = []
00414 
00415                 #can split it into jobs ! just have to reparse it for the exit codes later....
00416                 for line_index in xrange(0, len(lines)):
00417                         line = lines[line_index]
00418                         if reSubmit.match(line):
00419                                 end_index = self.findLineAfter(line_index, lines, test_condition=lambda l: reWaiting.match(l), return_index = True)
00420                                 jobs.append(lines[line_index:end_index])
00421 
00422                 for job_lines in jobs:
00423                         #print job_lines
00424                         info = self._applyParsingRules(parsing_rules, job_lines)
00425                         #Fixing here the compatibility with new cmsdriver.py --conditions option
00426                         #(for which now we have autoconditions and FrontierConditions_GlobalTag is optional):
00427                         if 'auto:' in info['conditions']:
00428                                 from Configuration.AlCa.autoCond import autoCond
00429                                 info['conditions'] = autoCond[ info['conditions'].split(':')[1] ].split("::")[0]
00430                         else:
00431                                 if 'FrontierConditions_GlobalTag' in info['conditions']:
00432                                         info['conditions']=info['conditions'].split(",")[1]
00433 
00434                         steps_start = self.findFirstIndex_ofStartsWith(job_lines, "You defined your own steps to run:")
00435                         steps_end = self.findFirstIndex_ofStartsWith(job_lines, "*Candle ")
00436                         #probably it includes steps until we found *Candle... ?
00437                         steps = job_lines[steps_start + 1:steps_end]
00438                         if not self.validateSteps(steps):
00439                                 self.handleParsingError( "Steps were not found corrently: %s for current job: %s" % (str(steps), str(job_lines)))
00440                                 
00441                                 """ quite nasty - just a work around """
00442                                 print "Trying to recover from this error in case of old cmssw"
00443                                 
00444                                 """ we assume that steps are between the following sentance and a TimeStamp """
00445                                 steps_start = self.findFirstIndex_ofStartsWith(job_lines, "Steps passed to writeCommands")
00446                                 steps_end = self.findLineAfter(steps_start, job_lines, test_condition = self.isTimeStamp, return_index = True)
00447                                 
00448                                 steps = job_lines[steps_start + 1:steps_end]
00449                                 if not self.validateSteps(steps):
00450                                         self.handleParsingError( "EVEN AFTER RECOVERY Steps were not found corrently! : %s for current job: %s" % (str(steps), str(job_lines)))
00451                                 else:
00452                                         print "RECOVERY SEEMS to be successful: %s" % str(steps)
00453 
00454                         info["steps"] = self._LINE_SEPARATOR.join(steps) #!!!! STEPS MIGHT CONTAIN COMMA: ","
00455 
00456                         start_id_index = self.findLineAfter(0, job_lines, test_condition = reStart.match, return_index = True)
00457                         pileUp, testName, testCore, testEventsNum = reStart.match(job_lines[start_id_index]).groups()                   
00458                         info["testname"] = testName
00459 
00460                         thread_id_index = self.findLineAfter(0, job_lines, test_condition = reAddThread.match, return_index = True)
00461                         info["start"] = self.firstTimeStampAfter(thread_id_index, job_lines)
00462 
00463                         thread_id, thread_number = reAddThread.match(job_lines[thread_id_index]).groups()
00464                         info["thread_id"] = thread_id
00465                         
00466                         if not test.has_key(testName):
00467                                 test[testName] = []
00468                         test[testName].append(info)
00469                 
00470                 for line_index in xrange(0, len(lines)):
00471                         line = lines[line_index]
00472 
00473                         if reEnd.match(line):
00474                                 testName, thread_id, thread_num, testCore = reEnd.match(line).groups()
00475                                 time = self.firstTimeStampBefore(line_index, lines)
00476                                 try:
00477                                         exit_code = ""
00478                                         #we search for the exit code
00479                                         line_exitcode = self.findLineBefore(line_index, lines, test_condition=lambda l: reExitCode.match(l))
00480                                         exit_code, = reExitCode.match(line_exitcode).groups()
00481                                 except Exception, e:
00482                                         print "Error while getting exit code (Other test): %s" + str(e)
00483                                         
00484                                 for key, thread in test.items():
00485                                         for i in range(0, len(thread)):
00486                                                 if thread[i]["thread_id"] == thread_id:
00487                                                         thread[i].update({"end": time, "exit_code": exit_code})
00488                                                         break
00489                                 
00490                 return test
00491                                                 
00492 
00493         def parseTimeSize(self):
00494                 """ parses the timeSize """
00495                 timesize_result = []
00496 
00497                 # TODO: we will use the first timestamp after the "or these tests will use user input file..."
00498                 #TODO: do we have to save the name of input file somewhere?
00499                 """
00500                 the structure of input file:
00501                 * beginning ->> and start timestamp- the firstone:              
00502                         >>> [optional:For these tests will use user input file /build/RAWReference/MinBias_RAW_320_IDEAL.root]
00503                         <...>
00504                         Using user-specified cmsDriver.py options: --conditions FrontierConditions_GlobalTag,MC_31X_V4::All --eventcontent RECOSIM
00505                         Candle MinBias will be PROCESSED
00506                         You defined your own steps to run:
00507                         RAW2DIGI-RECO
00508                         *Candle MinBias
00509                         Written out cmsRelvalreport.py input file at:
00510                         /build/relval/CMSSW_3_2_4/workStep2/MinBias_TimeSize/SimulationCandles_CMSSW_3_2_4.txt
00511                         Thu Aug 13 14:53:37 2009 [start]
00512                         <....>
00513                         Thu Aug 13 16:04:48 2009 [end]
00514                         Individual cmsRelvalreport.py ExitCode 0
00515                 * ending - the last timestamp "... ExitCode ...."
00516                 """
00517                 #TODO: do we need the cmsDriver --conditions? I suppose it would the global per work directory = 1 perfsuite run (so samefor all candles in one work dir)
00518                 # TODO: which candle definition to use?
00519                 """ divide into separate jobs """
00520                 lines = self.lines_timesize
00521                 jobs = []
00522                 start = False
00523                 timesize_start_indicator = re.compile(r"""^taskset -c (\d+) cmsRelvalreportInput.py""")
00524                 for line_index in xrange(0, len(lines)):
00525                         line = lines[line_index]
00526                         # search for start of each TimeSize job (with a certain candle and step)
00527                         if timesize_start_indicator.match(line):
00528                                 if start:
00529                                         jobs.append(lines[start:line_index])
00530                                 start = line_index
00531                 #add the last one
00532                 jobs.append(lines[start:len(lines)])
00533                 #print "\n".join(str(i) for i in jobs)
00534 
00535                 parsing_rules = (
00536                         (("", "candle", ), r"""^(Candle|ONLY) (.+) will be PROCESSED$""", "req"),
00537                         #e.g.: --conditions FrontierConditions_GlobalTag,MC_31X_V4::All --eventcontent RECOSIM
00538                         (("cms_driver_options", ), r"""^Using user-specified cmsDriver.py options: (.+)$"""),
00539                         (("", "conditions", ""), r"""^Using user-specified cmsDriver.py options: (.*)--conditions ([^\s]+)(.*)$""", "req"),
00540                         # for this we cannot guarrantee that it has been found, TODO: we might count the number of pileup candles and compare with arguments
00541                         (("",  "pileup_type", ""), r"""^Using user-specified cmsDriver.py options:(.*)--pileup=([^\s]+)(.*)$"""),
00542                         #not shure if event content is required
00543                         (("",  "event_content", ""), r"""^Using user-specified cmsDriver.py options:(.*)--eventcontent ([^\s]+)(.*)$""", "req"),
00544                         #TODO: after changeing the splitter to "taskset -c ..." this is no longer included into the part of correct job
00545                         #(("input_user_root_file", ), r"""^For these tests will use user input file (.+)$"""),
00546                 )
00547 
00548                 #parse each of the TimeSize jobs: find candles, etc and start-end times
00549 
00550                 reExit_code = re.compile(r"""Individual ([^\s]+) ExitCode (\d+)""")
00551 
00552                 if self._DEBUG:
00553                         print "TimeSize (%d) jobs: %s" % (len(jobs), str(jobs))
00554 
00555                 for job_lines in jobs:
00556                         """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
00557                         info = self._applyParsingRules(parsing_rules, job_lines)
00558                         #Fixing here the compatibility with new cmsdriver.py --conditions option (for which now we have autoconditions and FrontierConditions_GlobalTag is optional):
00559                         if 'auto:' in info['conditions']:
00560                                 from Configuration.AlCa.autoCond import autoCond
00561                                 info['conditions'] = autoCond[ info['conditions'].split(':')[1] ].split("::")[0]
00562                         else:
00563                                 if 'FrontierConditions_GlobalTag' in info['conditions']:
00564                                         info['conditions']=info['conditions'].split(",")[1]
00565                                                                                                                                 
00566                         #DEBUG:
00567                         #print "CONDITIONS are: %s"%info['conditions']
00568                         #start time - the index after which comes the time stamp
00569                         """ the following is not available on one of the releases, instead
00570                         use the first timestamp available on our job - that's the starting time :) """ 
00571                         
00572                         #start_time_after = self.findFirstIndex_ofStartsWith(job_lines, "Written out cmsRelvalreport.py input file at:")
00573                         #print start_time_after
00574                         info["start"] = self.firstTimeStampAfter(0, job_lines)
00575 
00576                         #TODO: improve in future (in case of some changes) we could use findBefore instead which uses the regexp as parameter for searching 
00577                         #end time - the index before which comes the time stamp
00578 
00579                         # On older files we have - "Individual Relvalreport.py ExitCode 0" instead of "Individual cmsRelvalreport.py ExitCode"
00580                         end_time_before = self.findLineAfter(0, job_lines, test_condition = reExit_code.match, return_index = True)
00581 
00582                         # on the same line we have the exit Code - so let's get it
00583                         nothing, exit_code = reExit_code.match(job_lines[end_time_before]).groups()
00584 
00585                         info["end"] = self.firstTimeStampBefore(end_time_before, job_lines)
00586                         info["exit_code"] = exit_code
00587 
00588                         steps_start = self.findFirstIndex_ofStartsWith(job_lines, "You defined your own steps to run:")
00589                         steps_end = self.findFirstIndex_ofStartsWith(job_lines, "*Candle ")
00590                         #probably it includes steps until we found *Candle... ?
00591                         steps = job_lines[steps_start + 1:steps_end]
00592                         if not self.validateSteps(steps):
00593                                 self.handleParsingError( "Steps were not found corrently: %s for current job: %s" % (str(steps), str(job_lines)))
00594                                 
00595                                 """ quite nasty - just a work around """
00596                                 print "Trying to recover from this error in case of old cmssw"
00597                                 
00598                                 """ we assume that steps are between the following sentance and a TimeStamp """
00599                                 steps_start = self.findFirstIndex_ofStartsWith(job_lines, "Steps passed to writeCommands")
00600                                 steps_end = self.findLineAfter(steps_start, job_lines, test_condition = self.isTimeStamp, return_index = True)
00601                                 
00602                                 steps = job_lines[steps_start + 1:steps_end]
00603                                 if not self.validateSteps(steps):
00604                                         self.handleParsingError( "EVEN AFTER RECOVERY Steps were not found corrently! : %s for current job: %s" % (str(steps), str(job_lines)))
00605                                 else:
00606                                         print "RECOVERY SEEMS to be successful: %s" % str(steps)
00607 
00608                         info["steps"] = self._LINE_SEPARATOR.join(steps) #!!!! STEPS MIGHT CONTAIN COMMA: ","
00609                         
00610 
00611                         timesize_result.append(info)
00612                 return {"TimeSize": timesize_result}
00613         #TODO:
00614         
00615 
00616 
00617         def readCmsScimarkTest(self, testName, testType, core):
00618                 lines  = self.readInput(self._path, fileName = testName + ".log")
00619                 scores = [{"score": self.reCmsScimarkTest.match(line).groups()[1], "type": testType, "core": core}
00620                                 for line in lines 
00621                                 if self.reCmsScimarkTest.match(line)]
00622                 #add the number of messurment
00623                 i = 0
00624                 for score in scores:
00625                         i += 1
00626                         score.update({"messurement_number": i})
00627                 return scores
00628                 
00629         def readCmsScimark(self, main_cores = [1]):
00630                 main_core = main_cores[0]
00631                 #TODO: WE DO NOT ALWAYS REALLY KNOW THE MAIN CORE NUMBER! but we don't care too much
00632                 #we parse each of the SciMark files and the Composite scores
00633                 csimark = []
00634                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark2", testType = "mainCore", core = main_core))
00635                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark2_large", testType = "mainCore_Large", core = main_core))
00636 
00637 
00638                 #we not always know the number of cores available so we will just search the directory to find out core numbers
00639                 reIsCsiMark_notusedcore = re.compile("^cmsScimark_(\d+).log$")
00640                 scimark_files = [reIsCsiMark_notusedcore.match(f).groups()[0]
00641                                 for f in os.listdir(self._path)
00642                                  if reIsCsiMark_notusedcore.match(f) 
00643                                         and os.path.isfile(os.path.join(self._path, f)) ]
00644 
00645                 for core_number in scimark_files:
00646                         try:
00647                                 csimark.extend(self.readCmsScimarkTest(testName = "cmsScimark_%s" % str(core_number), testType = "NotUsedCore_%s" %str(core_number), core = core_number))
00648                         except IOError, e:
00649                                 if self._DEBUG:
00650                                         print e
00651                 return csimark
00652                 #print csimark
00653 
        def parseTheCompletion(self):
                """ Check whether the suite finished successfully and whether the
                result tarball was archived and uploaded to CASTOR.

                Returns a dict with "finishing_time", "castor_md5",
                "successfully_archived_tarball", "castor_file_url",
                "castor_logfile_url" and "no_errors_detected" ("1"/"0").
                A missing CASTOR tarball URL is recovered by, in order: re-parsing
                cmsPerfSuite.log, reading the PERFDB_CASTOR_FILE_URL environment
                variable and, as a last resort, prompting on stdin (TTY only;
                raises IOError when stdin is closed). """

                # each rule: (field names for the regexp groups, regexp)
                parsing_rules = (
                        (("finishing_time", "", ""), r"""^Performance Suite finished running at (.+) on (.+) in directory (.+)$"""),
                        (("castor_md5",) , r"""^The md5 checksum of the tarball: (.+)$"""),     
                        (("successfully_archived_tarball", ), r"""^Successfully archived the tarball (.+) in CASTOR!$"""),
                        #TODO: WE MUST HAVE THE CASTOR URL, but for some of files it's not included [probably crashed]
                        (("castor_file_url",), r"""^The tarball can be found: (.+)$"""),                        
                        (("castor_logfile_url",), r"""^The logfile can be found: (.+)$"""),
                )

                
                """ we apply the defined parsing rules to extract the required fields of information into the dictionary (as defined in parsing rules) """
                info = self._applyParsingRules(parsing_rules, self.lines_other)

                """ did we detect any errors in log files ? """
                # old-style conditional expression ("x and a or b"): "1" if the
                # marker line is present anywhere in lines_other, "0" otherwise
                info["no_errors_detected"] = [line for line in self.lines_other if line == "There were no errors detected in any of the log files!"] and "1" or "0"
                # without a successful archival the parsed URL is meaningless
                if not info["successfully_archived_tarball"]:
                        info["castor_file_url"] = ""

                if not info["castor_file_url"]:
                        #TODO: get the castor file url or abort
                        self.handleParsingError( "Castor tarball URL not found. Trying to get from environment")
                        # a plausible CASTOR url is expected to start with /castor/
                        lmdb_castor_url_is_valid = lambda url: url.startswith("/castor/")

                        url = ""
                        try:
                                #print "HERE!"
                                # first fall-back: re-parse cmsPerfSuite.log in the current directory
                                url=self.get_tarball_fromlog()
                                print "Extracted castor tarball full path by re-parsing cmsPerfSuite.log: %s"%url
                                
                        except:
                                # second fall-back: the environment variable
                                if os.environ.has_key("PERFDB_CASTOR_FILE_URL"):
                                        url = os.environ["PERFDB_CASTOR_FILE_URL"]
                                        
                                else: #FIXME: add the possibility to get it directly from the cmsPerfSuite.log file (make sure it is dumped there before doing the tarball itself...)
                                        print "Failed to get the tarball location from environment variable PERFDB_CASTOR_FILE_URL" 
                                        self.handleParsingError( "Castor tarball URL not found. Provide interactively")

                        # last resort: keep prompting until a valid url is typed (TTY only)
                        while True:
                                
                                if lmdb_castor_url_is_valid(url):
                                        info["castor_file_url"] = url
                                        break
                                print "Please enter a valid CASTOR url: has to start with /castor/ and should point to the tarball"
                                if os.isatty(0): url = sys.stdin.readline()
                                else: raise IOError("stdin is closed.")


                return info
00707         def get_tarball_fromlog(self):
00708                 '''Return the tarball castor location by parsing the cmsPerfSuite.log file'''
00709                 print "Getting the url from the cmsPerfSuite.log"
00710                 log=open("cmsPerfSuite.log","r")
00711                 castor_dir="UNKNOWN_CASTOR_DIR"
00712                 tarball="UNKNOWN_TARBALL"
00713                 for line in log.readlines():
00714                         if 'castordir' in line:
00715                                 castor_dir=line.split()[1]
00716                         if 'tgz' in line and tarball=="UNKNOWN_TARBALL": #Pick the first line that contains the tar command...
00717                                 if 'tar' in line:
00718                                         tarball=os.path.basename(line.split()[2])
00719                 castor_tarball=os.path.join(castor_dir,tarball)
00720                 return castor_tarball
00721 
        def parseAll(self):
                """ Top-level driver: parse every section of the suite log and
                return one combined result dictionary.

                Returns {"General": {...}, "TestResults": {...},
                "cmsSciMark": [...], "unrecognized_jobs": []}.
                Failures while parsing TimeSize or the other tests are printed
                but do not abort the overall parsing. """
                result = {"General": {}, "TestResults":{}, "cmsSciMark":{}, 'unrecognized_jobs': []}

                """ all the general info - start, arguments, host etc """
                result["General"].update(self.parseGeneralInfo())

                """ machine info - cpu, memmory """
                result["General"].update(self.getMachineInfo())

                """ we add info about how successfull was the run, when it finished and final castor url to the file! """
                result["General"].update(self.parseTheCompletion())

                print "Parsing TimeSize runs..."
                if len(self.lines_timesize) > 0:
                        try:
                                result["TestResults"].update(self.parseTimeSize())
                        except Exception, e:
                                print "BAD BAD BAD UNHANDLED ERROR in parseTimeSize: " + str(e)

                print "Parsing Other(IgProf, Memcheck, ...) runs..."
                try:
                        result["TestResults"].update(self.parseAllOtherTests())
                except Exception, e:
                        print "BAD BAD BAD UNHANDLED ERROR in parseAllOtherTests: " + str(e)

                #print result["TestResults"]


                main_cores = [result["General"]["run_on_cpus"]]
                num_cores = result["General"].get("num_cores", 0)
                #DEBUG
                #print "Number of cores was: %s"%num_cores
                #TODO: temporarly - search for cores, use regexp
                # NOTE(review): the list computed above is immediately overridden here;
                # the main core number is not reliably known, so core 1 is assumed
                main_cores = [1]

                # THE MAHCINE SCIMARKS
                result["cmsSciMark"] = self.readCmsScimark(main_cores = main_cores)

                # report (without aborting) any "req" fields that no rule matched
                if self.missing_fields:
                        self.handleParsingError("========== SOME REQUIRED FIELDS WERE NOT FOUND DURING PARSING ======= "+ str(self.missing_fields))

                return result
00764                 
00765                 
00766 
if __name__ == "__main__":
        # Ad-hoc driver: parse the perfsuite logs found in the current directory
        # and export the run info as XML (printed to stdout).
        from xml.dom import minidom
        import cmssw_exportdb_xml
        #steps do not get parsed corectly
        #path = "/home/vidma/Desktop/CERN_code/cmssw/data/CMSSW_3_1_0_pre7_--usersteps=RAW2DIGI-RECO_lxbuild107.cern.ch_relval/relval/CMSSW_3_1_0_pre7/work2" 
        #path = "/home/vidma/Desktop/CERN_code/cmssw/data/CMSSW_3_2_0_--usersteps=GEN-SIM,DIGI_lxbuild106.cern.ch_relval/relval/CMSSW_3_2_0/workGENSIMDIGI"
        #includes finishing time, succesfully archived tarball etc
        #path = "/home/vidma/Desktop/CERN_code/cmssw/CVS_PerfSuiteDB/COMP/PerfSuiteDB/export_data_to_xml/example_of_files/PileUp"
        path = os.path.abspath(".") #Better to point to the local dir than to some old Vidmantas' laptop dirs ;)
        #p = parserPerfsuiteMetadata("/home/vidma/Desktop/CERN_code/cmssw/CVS_PerfSuiteDB/COMP/PerfSuiteDB/export_data_to_xml/example_of_files/PerfsuiteRun")
        p = parserPerfsuiteMetadata(path)
        run_info = p.parseAll()
        
        #print "======= GENERAL ========= "
        #print "\n".join("%s : %s" % (k, v) for k, v in p.parseAll()["General"].items())
        #print "======= Test results ========= "
        #print "\n".join("%s : %s" % (k, v) for k, v in p.parseAll()["TestResults"].items())

        # serialize the parsed run info to XML (print_out echoes it to stdout)
        xml_doc = minidom.Document()
        cmssw_exportdb_xml.exportRunInfo(xml_doc, run_info, print_out = True)
        #print "General info:" + str(p.parseGeneralInfo())
        # run any doctests defined in this module
        import doctest
        doctest.testmod()
        
        #print p.readCmsScimark()
00793