CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
cmsPerfSuiteHarvest.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #A script to "harvest" PerfSuite work directories, producing an xml file with all the data ready to be uploaded to the PerfSuiteDB DB.
3 import sys, os, re
4 import getopt
7 import Validation.Performance.cmssw_exportdb_xml as cmssw_exportdb_xml
8 from Validation.Performance.parserPerfsuiteMetadata import parserPerfsuiteMetadata
9 
11 import Validation.Performance.parserEdmSize as parserEdmSize
12 
# Set to True when CMSSW is NOT available (e.g. on a testing machine); the
# steps guarded by `if not _TEST_RUN` are then skipped. Leave it False when
# running inside a CMSSW environment.
_TEST_RUN = False

# ---- global variables ----

# Matches file names ending in "TimingReport.log" (case-insensitive). The dot
# is escaped so that only a literal "." is accepted in front of "log".
test_timing_report_log = re.compile(r"TimingReport\.log$", re.IGNORECASE)

# XML document that collects everything to be exported.
# NOTE(review): the import providing `minidom` is not visible in this listing
# (presumably xml.dom.minidom) -- confirm against the original file.
xmldoc = minidom.Document()
# CMSSW release being harvested; filled in by the script section or by
# process_timesize_dir.
release = None
# Used as sets (value is always 1): the steps, candles and pile-up types seen
# while harvesting. Their keys end up in the output file name.
steps = {}
candles = {}
pileups = {}
25 
def usage(argv):
    """Return the command-line help text of this script.

    argv -- the argument vector; argv[0] is shown as the script name in
            the usage line and in the examples.
    """
    help_template = """
 Usage: %(script)s [-v cmssw_version] [--version=cmssw_version]

 if the cmssw version is in the system's environment (after running cmsenv):
 $ %(script)s

 otherwise one must specify the cmssw version:
 $ %(script)s --version=CMSSW_3_2_0
 $ %(script)s -v CMSSW_3_2_0

 """
    return help_template % {"script": argv[0]}
39 
def get_params(argv):
    """
    Determine the CMSSW version to use and the directory for the xml files.

    The version is taken from:
     * the -v / --version command line option, or
     * the CMSSW_VERSION environment variable
    and is None when neither provides it.

    The output directory is taken from --outdir and defaults to
    "PerfSuiteDBData".

    An option that getopt rejects is printed and all options are then
    ignored, so the defaults / environment are used instead of crashing.

    Returns the tuple (version, xml_dir).
    """
    version = None
    # Default: write to a local directory.
    # NB write_xml is in Validation/Performance/python/cmssw_exportdb_xml.py
    xml_dir = "PerfSuiteDBData"

    try:
        opts, _args = getopt.getopt(argv[1:], "v:", ["version=", "outdir="])
    except getopt.GetoptError as e:
        print(e)
        # Without this, `opts` would be unbound and the loop below would fail.
        opts = []

    for opt, arg in opts:
        if opt in ("-v", "--version"):
            version = arg
        if opt == "--outdir":
            xml_dir = arg

    # No (or empty) version on the command line: fall back to the
    # environment, keeping the current value when the variable is unset.
    if not version:
        version = os.environ.get("CMSSW_VERSION", version)

    return (version, xml_dir)
76 
def _eventContent_DEBUG(edm_report):
    """Print, for testing / information, the products of each event content.

    edm_report -- iterable of product dicts; each one must provide the
                  "cpp_type", "module_name" and "module_label" keys used
                  to format the listing.

    Nothing is counted or printed when _TEST_RUN is set.
    """
    if _TEST_RUN:
        return

    # event content -> list of the products that belong to it
    EC_count = {}
    for prod in edm_report:
        # NOTE(review): the statement assigning `ecs` (listing line 83) is
        # missing from this extract; it is reconstructed here from the
        # sibling call to parseEventContent.List_ECs_forProduct -- confirm
        # against the original file.
        ecs = parseEventContent.List_ECs_forProduct(prod)
        for ec in ecs:
            EC_count.setdefault(ec, []).append(prod)

    # print out the statistics
    for ec, prods in EC_count.items():
        print("==== %s EVENT CONTENT: have %d items, the listing is: ===" % (ec, len(prods)))
        # list of products
        print("\n *".join(["%(cpp_type)s_%(module_name)s_%(module_label)s" % prod for prod in prods]))
93 
94 
# NOTE(review): the `def` line of this function (listing line 95) is missing
# from this extract. Judging by the reference to
# `assign_event_content_for_product` in process_timesize_dir and by the
# `product` name used below, it is presumably
# `def assign_event_content_for_product(product):` -- confirm against the
# original file.
# Unless _TEST_RUN is set, product["event_content"] is set to the comma-joined
# result of parseEventContent.List_ECs_forProduct(product). The same product
# object is returned (indentation is lost in this listing, so whether the
# `return` sits inside the `if` cannot be told from here).
96  """ returns modified product by adding the event content relationship """
97 
98  if not _TEST_RUN:
99  product["event_content"] = ",".join(parseEventContent.List_ECs_forProduct(product))
100  return product
101 
102 
# NOTE(review): the `def` line of this function (listing line 103) is missing
# from this extract. Judging by the call made when exporting the sequences it
# is presumably `def get_modules_sequences_relationships():` -- confirm
# against the original file.
# Returns a list with one {"name": ..., "modules": ...} dict per item of the
# second value returned by ModuleToSequenceAssign.assignModulesToSeqs(); the
# modules of each sequence are joined with commas. The first returned value
# is not used.
104  (sequenceWithModules, sequenceWithModulesString) =ModuleToSequenceAssign.assignModulesToSeqs()
105  return [{"name": seq, "modules": ",".join(modules)} for (seq, modules) in sequenceWithModulesString.items()]
106 
107 
def exportTimeSizeJob(path, timeSizeReport, runinfo):
    """Attach a TimeSize job report to the matching test result in runinfo.

    path           -- work directory of the job; its last component, with
                      "_TimeSize" and "_PU" removed, is the candle name
                      used for matching.
    timeSizeReport -- dict with (at least) a "jobID" dict providing
                      "pileup_type", "conditions" and "event_content".
    runinfo        -- dict; the results in runinfo['TestResults']['TimeSize']
                      (when that key exists) are searched for a match.

    The report is appended to the "jobs" list of the first result whose
    candle, pileup_type, conditions and event_content all agree with the
    job. When no result matches, a notice is printed and the report is
    appended to runinfo['unrecognized_jobs'] (created if missing).
    """
    candleLong = os.path.split(path)[1].replace("_TimeSize", "").replace("_PU", "")
    jobID = timeSizeReport["jobID"]
    print(candleLong)

    # TODO: the step is not compared because of the different (long / short)
    # step names; decide which step name to use and check it here as well.
    matched_fields = ("pileup_type", "conditions", "event_content")

    # search for a run Test to which our JOB could belong
    for result in runinfo['TestResults'].get('TimeSize', []):
        if result['candle'] == candleLong and all(jobID[field] == result[field] for field in matched_fields):
            result.setdefault("jobs", []).append(timeSizeReport)
            return

    print("============ (almost) ERROR: NOT FOUND THE ENTRY in cmsPerfSuite.log, exporting as separate entry ======== ")
    print("JOB ID: %s " % str(jobID))
    print(" ====================== ")
    runinfo.setdefault('unrecognized_jobs', []).append(timeSizeReport)
135 
136 
def process_timesize_dir(path, runinfo):
    """Harvest every TimingReport log found in one TimeSize directory.

    path    -- the TimeSize work directory to scan.
    runinfo -- run metadata; passed on, together with each job report, to
               exportTimeSizeJob.

    Module globals touched: `release` is read from the SimulationCandles
    file when it is not known yet; `event_content` and `conditions` are
    overwritten with the values of every log's job ID (also for a job that
    is then discarded); `steps`, `candles` and `pileups` record the values
    of the jobs that are kept.

    Raises Exception when the release cannot be determined.
    """
    global release, event_content, conditions

    # if the release is not provided explicitly we take it from the
    # Simulation candles file
    if not release:
        release = read_SimulationCandles(path)
        print("release from simulation candles: %s" % release)

    if not release:
        raise Exception("the release was not found!")

    # process the TimingReport log files found directly in `path`
    timing_report_files = [
        os.path.join(path, f) for f in os.listdir(path)
        if test_timing_report_log.search(f)
        and os.path.isfile(os.path.join(path, f))
    ]

    job_fields = ("candle", "step", "pileup_type", "conditions", "event_content")

    for timelog_f in timing_report_files:
        print("\nProcessing file: %s" % timelog_f)
        print("------- ")

        # timelog_f already contains `path`; joining it with `path` again
        # would duplicate the prefix whenever `path` is relative.
        jobID = getJobID_fromTimeReportLogName(timelog_f)
        print("jobID: %s" % str(jobID))
        (candle, step, pileup_type, conditions, event_content) = jobID
        jobID = dict(zip(job_fields, jobID))
        print("Dictionary based jobID %s: " % str(jobID))

        # every jobID field except pileup_type is a mandatory key: discard
        # the job when any of them is empty
        if any(not value for key, value in jobID.items() if key != "pileup_type"):
            print(" ====================== The job HAS BEEN DISCARDED =============== ")
            print(" NOT ALL DATA WAS AVAILABLE ")
            print(" JOB ID = %s " % str(jobID))
            print(" ======================= end ===================================== ")
            continue

        num_events = read_ConfigurationFromSimulationCandles(path=path, step=step, is_pileup=pileup_type)["num_events"]
        # TODO: automatically detect type of report file!!!
        (mod_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)

        mod_timelog = processModuleTimeLogData(mod_timelog, groupBy="module_name")
        print("Number of modules grouped by (module_label+module_name): %s" % len(mod_timelog))

        # remember what was seen, to generate the readable file name
        steps[step] = 1
        candles[candle] = 1
        if pileup_type == "":
            pileups["NoPileUp"] = 1
        else:
            pileups[pileup_type] = 1

        # root file size (number)
        root_file_size = getRootFileSize(path=path, candle=candle, step=step)

        # EdmSize
        edm_report = parserEdmSize.getEdmReport(path=path, candle=candle, step=step)
        # Kept as `!= False`, exactly as before: what getEdmReport returns on
        # failure is not visible from here.
        if edm_report != False:
            try:
                # add event content data; a real list (not a lazy map) so the
                # report can be iterated again after the debug listing
                edm_report = [assign_event_content_for_product(product) for product in edm_report]
                # for testing / information
                _eventContent_DEBUG(edm_report)
            except Exception as e:
                # best effort: report the problem and export what we have
                print(e)

        timeSizeReport = {
            "jobID": jobID,
            "release": release,
            "timelog_result": (mod_timelog, evt_timelog, rss_data, vsize_data),
            "metadata": {"root_file_size": root_file_size, "num_events": num_events},
            "edmSize_result": edm_report,
        }

        # export to xml: the actual export is deferred, the report is only
        # stored in runinfo here
        exportTimeSizeJob(path, timeSizeReport, runinfo)
218 
219 #TimeSize
def searchTimeSizeFiles(runinfo):
    """Process every TimeSize directory of the current working directory.

    A TimeSize directory is a directory whose name ends in "_TimeSize"
    (case-insensitive). Each one is handed to process_timesize_dir together
    with runinfo.
    """
    # so far the current dir is the only place we search in
    cwd = os.getcwd()
    print("full path = %s" % os.path.abspath(cwd))

    timesize_suffix = re.compile("_TimeSize$", re.IGNORECASE)

    # collect all the candidates first, then process them
    timesize_dirs = []
    for entry in os.listdir(cwd):
        candidate = os.path.join(cwd, entry)
        if timesize_suffix.search(entry) and os.path.isdir(candidate):
            timesize_dirs.append(candidate)

    for timesize_dir in timesize_dirs:
        process_timesize_dir(timesize_dir, runinfo)
234 
# NOTE(review): the `def` line of this function (listing line 235) is missing
# from this extract. Judging by the call made from the `__main__` section it
# is presumably `def exportSequences():` -- confirm against the original file.
# CMSSW_VERSION is read from the environment only to be reported next to
# `release` (an error message is printed and None is used when it is unset).
# xml_export_Sequences is then called with xmldoc, the result of
# get_modules_sequences_relationships() and release.
236  """ Exports the sequences to XML Doc """
237  try:
238  env_cmssw_version = os.environ["CMSSW_VERSION"]
239  except KeyError:
240  print "<<<<< ====== Error: cannot get CMSSW version [just integrity check for sequences]. \
241  Is the CMSSW environment initialized? (use cmsenv) ==== >>>>"
242  env_cmssw_version = None
243 
244  print " ==== exporting the sequences. loading files for currently loaded CMSSW version: %s, while the CMSSW we are currently harversting is %s ===" %(env_cmssw_version, release)
245  xml_export_Sequences(xml_doc = xmldoc, sequences = get_modules_sequences_relationships(), release=release)
246 
247 
248 
if __name__ == "__main__":
    # Script entry point: harvest the PerfSuite results of the current
    # working directory into one XML document and write it to the output
    # directory.
    # TODO: use an option parser, this is messy.

    (release, output_dir) = get_params(sys.argv)

    if not release:
        # Not fatal (no usage message / sys.exit here): the release may
        # still be found in the SimulationCandles file later on.
        print("The version was not provided explicitly, will try to get one from SimulationCandles file ")

    # Export the metadata from cmsPerfSuite.log (in current working directory!)
    print("Parsing cmsPerfSuite.log: getting all the metadata concerning the run")
    p = parserPerfsuiteMetadata(os.getcwd())
    run_info = p.parseAll()

    print("Loading Sequences and Event-Content(s). Please wait...")

    Sequences_OK = False
    EventContents_OK = False

    if not _TEST_RUN:
        # These imports need a CMSSW environment. They stay at module level
        # because other functions of this file use the imported names.
        try:
            import Validation.Performance.ModuleToSequenceAssign as ModuleToSequenceAssign
            Sequences_OK = True
        except Exception as e:
            print(e)
        try:
            import Validation.Performance.parseEventContent as parseEventContent
            EventContents_OK = True
        except Exception as e:
            print(e)

    # process_timesize_dir overwrites these globals for every job it reads.
    # Give them a value first, so that building the file name below does not
    # fail with a NameError when no TimeSize job is found at all.
    event_content = None
    conditions = None

    print("Parsing TimeSize report")
    # Search for TimeSize files: EdmSize, TimingReport
    searchTimeSizeFiles(run_info)

    print("Exporting sequences and event-content rules")
    if not _TEST_RUN:
        # (when testing on a laptop there is no CMSSW)
        # export sequences (for currently loaded CMSSW)
        if Sequences_OK:
            exportSequences()

        if EventContents_OK:
            # export event's content rules
            eventContentRules = parseEventContent.getTxtEventContentRules()
            cmssw_exportdb_xml.exportECRules(xmldoc, eventContentRules)

    cmssw_exportdb_xml.exportRunInfo(xmldoc, run_info, release=release)

    # save the XML file, TODO: change fileName after introducing the JobID
    import datetime
    now = datetime.datetime.now()
    # FIXME: review this file name convention and archive the xml in a
    # separate CASTOR xml directory for quick recovery of DB...
    file_name = "%s___%s___%s___%s___%s___%s___%s.xml" % (
        release,
        "_".join(steps.keys()),
        "_".join(candles.keys()),
        "_".join(pileups.keys()),
        event_content,
        conditions,
        now.isoformat(),
    )
    print("Writing the output to: %s " % file_name)

    # TODO: change write_xml to be able to handle directories in remote
    # machines (via tar pipes for now, could always revert to rsync later).
    # NB write_xml is in Validation/Performance/python/cmssw_exportdb_xml.py
    write_xml(xmldoc, output_dir, file_name)
def getJobID_fromTimeReportLogName
def processModuleTimeLogData
mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_num...
def read_ConfigurationFromSimulationCandles
static std::string join(char **cmd)
Definition: RemoteFile.cc:18