
/data/refman/pasoursint/CMSSW_4_2_9_HLT1_bphpatch4/src/Validation/Performance/scripts/cmsPerfSuiteHarvest.py

#!/usr/bin/env python
#A script to "harvest" PerfSuite work directories, producing an xml file with all the data ready to be uploaded to the PerfSuiteDB database.
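#
# Overall flow (summarised from the code below): parse the run metadata from
# cmsPerfSuite.log, walk the *_TimeSize work directories collecting TimingReport
# and EdmSize results, then write everything out as a single XML file for the
# PerfSuiteDB dropbox.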
import sys, os, re
import getopt
from xml.dom import minidom  # make the minidom dependency explicit (used for xmldoc below); it may otherwise only be available via the wildcard imports
from Validation.Performance.parserTimingReport import *
from Validation.Performance.cmssw_exportdb_xml import *
import Validation.Performance.cmssw_exportdb_xml as cmssw_exportdb_xml
from Validation.Performance.parserPerfsuiteMetadata import parserPerfsuiteMetadata

from Validation.Performance.FileNamesHelper import *
import Validation.Performance.parserEdmSize as parserEdmSize

""" Indicates whether CMSSW is available: use False when it is, True on a testing machine where it is not. """
_TEST_RUN = False

""" global variables """
test_timing_report_log = re.compile("TimingReport.log$", re.IGNORECASE)

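# Module-level state filled while harvesting: the output XML document, the release
# being harvested, and the steps/candles/pileups seen (used only to build a readable
# output file name at the end of the run).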
xmldoc = minidom.Document()
release = None
steps = {}
candles = {}
pileups = {}

def usage(argv):
    script = argv[0]
    return """
    Usage: %(script)s [-v cmssw_version] [--version=cmssw_version]

    if the cmssw version is in the system's environment (after running cmsenv):
    $ %(script)s

    otherwise one must specify the cmssw version:
    $ %(script)s --version=CMSSW_3_2_0
    $ %(script)s -v CMSSW_3_2_0

    """ % locals()

def get_params(argv):
    """
    Returns the version of CMSSW to be used, taken from:
    * the command line parameter, or
    * the environment variable.
    In case of error returns None.

    Also returns the directory to put the xml files in: if none is given, the default is used.
    """

    """ try to get the version from the command line arguments """
    #print argv
    #FIXME: this should be rewritten using getopt properly
    version = None
    #xml_dir = "cmsperfvm:/data/projects/conf/PerfSuiteDB/xml_dropbox" #Set this as default (assume change in write_xml to write to remote machines)
    #NB write_xml is in Validation/Performance/python/cmssw_exportdb_xml.py
    #Setting the default to write to a local directory:
    xml_dir = "PerfSuiteDBData"
    try:
        opts, args = getopt.getopt(argv[1:], "v:", ["version=", "outdir="])
    except getopt.GetoptError, e:
        print e
        opts = []  # fall back to the environment variable and the default output directory
    for opt, arg in opts:
        if opt in ("-v", "--version"):
            version = arg
        if opt == "--outdir":
            xml_dir = arg

    """ if not given on the command line, get it from the environment """
    if not version:
        try:
            version = os.environ["CMSSW_VERSION"]
        except KeyError:
            pass

    return (version, xml_dir)

def _eventContent_DEBUG(edm_report):
        # for testing / information
        EC_count = {}
        if not _TEST_RUN:
                # count the products in each event content
                for prod in edm_report:
                        ecs = parseEventContent.List_ECs_forProduct(prod)
                        for ec in ecs:
                                if not EC_count.has_key(ec):
                                        EC_count[ec] = []
                                EC_count[ec].append(prod)
                # print out the statistics
                for (ec, prods) in EC_count.items():
                        print "==== %s EVENT CONTENT: has %d items, the listing is: ===" % (ec, len(prods))
                        # list of products
                        print "\n *".join(["%(cpp_type)s_%(module_name)s_%(module_label)s" % prod for prod in prods])

def assign_event_content_for_product(product):
        """ returns the modified product by adding the event content relationship """

        if not _TEST_RUN:
                product["event_content"] = ",".join(parseEventContent.List_ECs_forProduct(product))
        return product

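# Each entry produced below maps a sequence name to a comma-separated list of its
# modules, e.g. {"name": "<sequence>", "modules": "moduleA,moduleB,..."} (illustrative shape only).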
def get_modules_sequences_relationships():
        (sequenceWithModules, sequenceWithModulesString) = ModuleToSequenceAssign.assignModulesToSeqs()
        return [{"name": seq, "modules": ",".join(modules)} for (seq, modules) in sequenceWithModulesString.items()]


def exportTimeSizeJob(path, timeSizeReport, runinfo):
        candleLong = os.path.split(path)[1].replace("_TimeSize", "").replace("_PU", "")
        jobID = timeSizeReport["jobID"]
        print candleLong

        # search for a test run to which our job could belong
        found = False
        if runinfo['TestResults'].has_key('TimeSize'):
                for result in runinfo['TestResults']['TimeSize']:
                        #print result
                        """ If this is the testResult which fits the TimeSize job """
                        #TODO: we do not check the step when assigning because of the different names; check if this is really OK and decide later which step name to use, the long or the short one
                        #and jobID["step"] in result['steps'].split(parserPerfsuiteMetadata._LINE_SEPARATOR)
                        if result['candle'] == candleLong and jobID["pileup_type"] == result['pileup_type'] and jobID["conditions"] == result['conditions'] and jobID["event_content"] == result['event_content']:
                                #print result
                                if not result.has_key("jobs"):
                                        result['jobs'] = []
                                result['jobs'].append(timeSizeReport)
                                found = True
                                break

        if not found:
                print "============ (almost) ERROR: ENTRY NOT FOUND in cmsPerfSuite.log, exporting as separate entry ======== "
                print "JOB ID: %s " % str(jobID)
                print " ====================== "
                runinfo['unrecognized_jobs'].append(timeSizeReport)
                #export_xml(xml_doc = xmldoc, **timeSizeReport)

def process_timesize_dir(path, runinfo):
        global release, event_content, conditions
        """ if the release is not provided explicitly we take it from the Simulation candles file """
        if (not release):
                release_fromlogfile = read_SimulationCandles(path)
                release = release_fromlogfile
                print "release from simulation candles: %s" % release

        if (not release):
                raise Exception("the release was not found!")


        """ process the TimingReport log files """

        # get the file list
        files = os.listdir(path)
        timing_report_files = [os.path.join(path, f) for f in files
                                 if test_timing_report_log.search(f)
                                        and os.path.isfile(os.path.join(path, f)) ]

        # print timing_report_files
        for timelog_f in timing_report_files:
                print "\nProcessing file: %s" % timelog_f
                print "------- "

                jobID = getJobID_fromTimeReportLogName(os.path.join(path, timelog_f))
                print "jobID: %s" % str(jobID)
                (candle, step, pileup_type, conditions, event_content) = jobID
                jobID = dict(zip(("candle", "step", "pileup_type", "conditions", "event_content"), jobID))
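                # e.g. {"candle": "MinBias", "step": "GEN-SIM", "pileup_type": "",
                #       "conditions": "MC_42_V12", "event_content": "RAWSIM"}  (hypothetical values, for illustration only)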
                print "Dictionary based jobID %s: " % str(jobID)

                # if any of the jobID fields except pileup_type is empty we discard the job: all of these are jobID keys and we must have them
                discard = len([key for key, value in jobID.items() if key != "pileup_type" and not value])
                if discard:
                        print " ====================== The job HAS BEEN DISCARDED =============== "
                        print " NOT ALL DATA WAS AVAILABLE "
                        print " JOB ID = %s " % str(jobID)
                        print " ======================= end ===================================== "
                        continue

                num_events = read_ConfigurationFromSimulationCandles(path = path, step = step, is_pileup = pileup_type)["num_events"]
                # TODO: automatically detect the type of report file!!!
                (mod_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)

                mod_timelog = processModuleTimeLogData(mod_timelog, groupBy = "module_name")
                print "Number of modules grouped by (module_label+module_name): %s" % len(mod_timelog)

                # add to the list to generate the readable filename :)
                steps[step] = 1
                candles[candle] = 1
                if pileup_type == "":
                    pileups["NoPileUp"] = 1
                else:
                    pileups[pileup_type] = 1

                # root file size (number)
                root_file_size = getRootFileSize(path = path, candle = candle, step = step)

                #EdmSize
                edm_report = parserEdmSize.getEdmReport(path = path, candle = candle, step = step)
                if edm_report != False:
                        try:
                                # add event content data
                                edm_report = map(assign_event_content_for_product, edm_report)
                                # for testing / information
                                _eventContent_DEBUG(edm_report)
                        except Exception, e:
                                print e

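                # Collect everything harvested for this job; exportTimeSizeJob() above
                # attaches it to the matching TestResults entry in runinfo.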
                timeSizeReport = {
                                "jobID": jobID,
                                "release": release,
                                "timelog_result": (mod_timelog, evt_timelog, rss_data, vsize_data),
                                "metadata": {"root_file_size": root_file_size, "num_events": num_events},
                                "edmSize_result": edm_report
                }

                # export to xml: actually, exporting is deferred; the report is stored in runinfo
                exportTimeSizeJob(path, timeSizeReport, runinfo)

#TimeSize
def searchTimeSizeFiles(runinfo):
        """ so far we will use the current dir to search in """
        path = os.getcwd()
        #print path
        print 'full path =', os.path.abspath(path)

        files = os.listdir(path)

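        # Work directories are expected to end in "_TimeSize",
        # e.g. "MinBias_TimeSize" or "MinBias_PU_TimeSize" (illustrative names).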
        test_timeSizeDirs = re.compile("_TimeSize$", re.IGNORECASE)
        timesize_dirs = [os.path.join(path, f) for f in files if test_timeSizeDirs.search(f) and os.path.isdir(os.path.join(path, f))]

        for timesize_dir in timesize_dirs:
                # print timesize_dir
                process_timesize_dir(timesize_dir, runinfo)

def exportSequences():
    """ Exports the sequences to the XML doc """
    try:
        env_cmssw_version = os.environ["CMSSW_VERSION"]
    except KeyError:
        print "<<<<<  ====== Error: cannot get CMSSW version [just integrity check for sequences]. \
                                         Is the CMSSW environment initialized? (use cmsenv) ==== >>>>"
        env_cmssw_version = None

    print " ==== exporting the sequences. loading files for the currently loaded CMSSW version: %s, while the CMSSW we are currently harvesting is %s ===" % (env_cmssw_version, release)
    xml_export_Sequences(xml_doc = xmldoc, sequences = get_modules_sequences_relationships(), release = release)


if __name__ == "__main__":
    #searchFiles()
    #TODO:
    #Use option parser! This is messy.

    (release, output_dir) = get_params(sys.argv)

    if not release:
        # print usage(sys.argv)
        # sys.exit(2)
        print "The version was not provided explicitly, will try to get one from the SimulationCandles file"


    # Export the metadata from cmsPerfSuite.log (in the current working directory!)
    print "Parsing cmsPerfSuite.log: getting all the metadata concerning the run"
    p = parserPerfsuiteMetadata(os.getcwd())
    run_info = p.parseAll()

    print "Loading Sequences and Event-Content(s). Please wait..."

    Sequences_OK = False
    EventContents_OK = False

    if not _TEST_RUN:
        try:
            import Validation.Performance.ModuleToSequenceAssign as ModuleToSequenceAssign
            Sequences_OK = True
        except Exception, e:
            print e
        try:
            import Validation.Performance.parseEventContent as parseEventContent
            EventContents_OK = True
        except Exception, e:
            print e

    print "Parsing TimeSize report"
    # Search for TimeSize files: EdmSize, TimingReport
    searchTimeSizeFiles(run_info)
    #print run_info

    print "Exporting sequences and event-content rules"
    if not _TEST_RUN:
        """ for testing on a laptop we have no CMSSW """
        # export sequences (for the currently loaded CMSSW)
        if Sequences_OK:
            exportSequences()

        if EventContents_OK:
            # export event content rules
            eventContentRules = parseEventContent.getTxtEventContentRules()
            cmssw_exportdb_xml.exportECRules(xmldoc, eventContentRules)


    cmssw_exportdb_xml.exportRunInfo(xmldoc, run_info, release = release)
    #save the XML file, TODO: change fileName after introducing the JobID
    import datetime
    now = datetime.datetime.now()
    #Changing slightly the XML filename format
    #FIXME: review this convention and archive the xml in a separate CASTOR xml directory for quick recovery of DB...
    file_name = "%s___%s___%s___%s___%s___%s___%s.xml" % (release, "_".join(steps.keys()), "_".join(candles.keys()), "_".join(pileups.keys()), event_content, conditions, now.isoformat())
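    # event_content and conditions hold the values from the last job processed; a resulting
    # name could look like (hypothetical example):
    # CMSSW_4_2_9_HLT1_bphpatch4___GEN-SIM___MinBias___NoPileUp___RAWSIM___MC_42_V12___2011-09-30T12:00:00.xml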
00309     print "Writing the output to: %s " % file_name
00310 
00311     write_xml(xmldoc, output_dir, file_name) #change this function to be able to handle directories in remote machines (via tar pipes for now could always revert to rsync later).
00312     #NB write_xml is in Validation/Performance/python/cmssw_exportdb_xml.py
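
# Typical invocation from a completed PerfSuite work directory (see usage() above);
# the option values here are only illustrative:
#   cmsPerfSuiteHarvest.py -v CMSSW_4_2_9_HLT1_bphpatch4 --outdir=PerfSuiteDBData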