parserTimingReport.py
#!/usr/bin/env python
from __future__ import print_function
from __future__ import absolute_import
import sys
import math
import re
from xml.dom import minidom  # used by perf_profile() below
from .cmssw_exportdb_xml import *
from .FileNamesHelper import *

"""
Performance profiling:
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   71.618   71.618 <stdin>:1(foo)
        1    0.315    0.315   71.933   71.933 <string>:1(<module>)
        1   47.561   47.561   71.618   71.618 parserTimingReport.py:27(loadTimeLog)
     8000    0.191    0.000    0.343    0.000 parserTimingReport.py:8(extractRSS_VSIZE)
        1    0.000    0.000    0.000    0.000 {len}
  2384000    3.239    0.000    3.239    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'close' of 'file' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
    24000    0.041    0.000    0.041    0.000 {method 'partition' of 'str' objects}
  2392000    5.804    0.000    5.804    0.000 {method 'split' of 'str' objects}
 10791332   14.782    0.000   14.782    0.000 {method 'strip' of 'str' objects}
        1    0.000    0.000    0.000    0.000 {method 'xreadlines' of 'file' objects}
        1    0.000    0.000    0.000    0.000 {open}

"""
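
# A minimal sketch (not part of the original module) of how a profile like the
# one above can be regenerated with the standard-library profiler; the helper
# name and the default log path are placeholders, not CMSSW conventions.
def _profile_loadTimeLog(log_filename="TTBAR__RAW2DIGI,RECO_TimingReport.log"):
    """Illustration only: profile loadTimeLog() on a timing-report log."""
    import cProfile
    import pstats
    profiler = cProfile.Profile()
    profiler.runcall(loadTimeLog, log_filename)
    pstats.Stats(profiler).sort_stats('cumulative').print_stats()
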

""" given two lines, returns the VSIZE and RSS values along with the event number """
def extractRSS_VSIZE(line1, line2, record_number):
    """
    >>> extractRSS_VSIZE("%MSG-w MemoryCheck: PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1", \
                         "MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0", 1)
    ((1, '760.25'), (1, '923.07'))
    """

    if ("Run" in line1) and ("Event" in line1):  # the first line
        event_number = line1.split('Event:')[1].strip()
    else:
        return False

    # is line2 the first or the second MemoryCheck line?
    if ("VSIZE" in line2) and ("RSS" in line2):  # the second line
        RSS = line2.split("RSS")[1].strip().split(" ")[0].strip()  # changed partition into split for backward compatibility with py2.3
        VSIZE = line2.split("RSS")[0].strip().split("VSIZE")[1].strip().split(" ")[0].strip()
        # Hack to return the record number instead of the event number for now... can always switch back or add the event number on top
        # return ((event_number, RSS), (event_number, VSIZE))
        return ((record_number, RSS), (record_number, VSIZE))
    else:
        return False


def loadTimeLog(log_filename, maxsize_rad=0):  # TODO: remove maxsize to read, used for debugging
    """ gets the timing data from the logfile
    returns 4 lists:

    * ModuleTime data (event_number, module_label, module_name, seconds) and
    * EventTime data
      - with granularity of event (initial - not processed data)
    * RSS per event
    * VSIZE per event
    """
    # ----- format of logfile ----
    # Report columns headings for modules: eventnum runnum modulelabel modulename timetaken
    # e.g. TimeModule> 1 1 csctfDigis CSCTFUnpacker 0.0624561

    mod_data = []
    evt_data = []
    rss_data = []
    vsize_data = []
    # open the file, read it and fill the structures!
    logfile = open(log_filename, 'r')

    # get only the lines which have time report data
    # TODO: reading and processing line by line might speed up the process!

    memcheck_line1 = False

    record_number = 0
    last_record = 0
    last_event = 0
    for line in logfile:
        if 'TimeModule>' in line.strip():
            line = line.strip()
            line_content_list = line.split(' ')[0:]
            # Hack to avoid issues with the non-consecutive run numbers:
            event_number = int(line_content_list[1])
            if event_number != last_event:
                record_number = record_number + 1
                last_event = event_number
            # module label and name were mixed up in the original doc
            module_label = str(line_content_list[4])
            module_name = str(line_content_list[3])
            seconds = float(line_content_list[5])
            # For now let's try to switch to the record_number... if we need to also have the event_number we can always add it back.
            # mod_data.append((event_number, module_label, module_name, seconds))
            mod_data.append((record_number, module_label, module_name, seconds))
        if 'TimeEvent>' in line.strip():
            line = line.strip()
            line_content_list = line.split(' ')[0:]
            # Hack to avoid issues with the non-consecutive run numbers:
            event_number = int(line_content_list[1])
            if event_number != last_event:
                record_number = record_number + 1
                last_event = event_number
            # module label and name were mixed up in the original doc
            time_seconds = str(line_content_list[3])

            # TODO: what are the other [last two] numbers? Real time? something else? TimeEvent> 1 1 15.3982 13.451 13.451
            # For now let's try to switch to the record_number... if we need to also have the event_number we can always add it back.
            # evt_data.append((event_number, time_seconds))
            evt_data.append((record_number, time_seconds))
        """
        %MSG-w MemoryCheck: PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1
        MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0
        """
        if 'MemoryCheck:' in line.strip():
            # this is the first line out of two
            if (not memcheck_line1):
                memcheck_line1 = line.strip()
            else:
                # FIXME (eventually)
                # Hacking in the record_number extracted from the TimeEvent and TimeModule parsing... NOT ROBUST...
                (rss, vsize) = extractRSS_VSIZE(memcheck_line1, line.strip(), record_number)
                rss_data.append(rss)
                vsize_data.append(vsize)
        else:
            memcheck_line1 = False

    logfile.close()

    return (mod_data, evt_data, rss_data, vsize_data)
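
# A short usage sketch (not part of the original module): parse a TimingReport
# log and print the per-event CPU times. The default path
# "TTBAR__RAW2DIGI,RECO_TimingReport.log" is only a placeholder.
def _example_dump_event_times(log_filename="TTBAR__RAW2DIGI,RECO_TimingReport.log"):
    mod_data, evt_data, rss_data, vsize_data = loadTimeLog(log_filename)
    print("parsed %d module lines and %d event lines" % (len(mod_data), len(evt_data)))
    for record_number, seconds in evt_data:
        print("record %s: %s s" % (record_number, seconds))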


def calcRMS(items, avg):
    """ returns the root-mean-square deviation from avg of the items in a list """
    # not the plain sqrt(sum(x^2)/N) but the "physics" RMS, i.e. the standard deviation: sqrt(sum((x-avg)**2)/N)
    # return math.sqrt(reduce(lambda x: (x - avg)**2, items) / len(items))
    return math.sqrt(sum([(x - avg) ** 2 for x in items]) / len(items))

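
# A tiny worked example (not part of the original module): for [1.0, 2.0, 3.0]
# with average 2.0 the deviations are (-1, 0, 1), so the result is
# sqrt((1 + 0 + 1) / 3) ~= 0.816.
def _example_calcRMS():
    values = [1.0, 2.0, 3.0]
    avg = sum(values) / len(values)   # 2.0
    return calcRMS(values, avg)       # ~0.816496580927726
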
def calc_MinMaxAvgRMS(items, remove_first=True, f_time=lambda x: x[0], f_evt_num=lambda x: x[1]):
    """ returns a dict of avg, min, max, rms """
    # save the cpu time of the first event before removing the first result!
    cpu_time_first = f_time(items[0])

    if len(items) > 1 and remove_first == True:
        items.remove(items[0])  # TODO: if there is only one event - we have a problem -> do we eliminate the whole module?
        # TODO: it removes it completely from all the data because we do not save / do not copy it

    items_time = list(map(f_time, items))  # list() so index()/len() also work on Python 3
    min_value = min(items_time)
    max_value = max(items_time)
    max_index = items_time.index(max_value)
    avg_value = float(sum(items_time)) / float(len(items_time))
    rms_value = calcRMS(items_time, avg_value)

    return {"min": min_value, "max": max_value, "cputime_first": cpu_time_first,
            "rms": rms_value, "avg": avg_value,
            "event_number_of_max": f_evt_num(items[max_index])}

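
# A short usage sketch (not part of the original module): statistics over a
# hand-made list of (seconds, event_number) tuples. With remove_first=True the
# first entry only survives as "cputime_first".
def _example_stats():
    times = [(2.5, 1), (0.9, 2), (1.1, 3), (1.0, 4)]
    stats = calc_MinMaxAvgRMS(items=times)
    # e.g. stats["cputime_first"] == 2.5, stats["max"] == 1.1,
    #      stats["event_number_of_max"] == 3
    return stats
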


def processModuleTimeLogData(modules_timelog, groupBy="module_name"):
    """ Processes the timelog data, grouping events by module, and calculates min, max, avg, rms
    Returns the data as a dict of dicts like:

    {
        <module_name>:
            {name:, label:,
             stats: {num_events, avg, min, max, rms}
            }
    }
    """
    # group by module_name, we save a list for each module name
    times_bymod = {}

    # print "Num of useful TimeLog lines: %s" % len(modules_timelog)

    for time_data in modules_timelog:
        (event_number, module_label, module_name, seconds) = time_data

        # group times of modules by label or name, TODO: maybe both
        if groupBy == "module_label":
            key = module_label
        else:
            if groupBy == "name+label":
                key = module_name + "_" + module_label
            else:
                key = module_name

        try:
            # is the list for the current module initialized?
            times_bymod[key]
        except KeyError:
            # Changing this from a list to a dict (see comments below):
            # times_bymod[key] = []
            times_bymod[key] = {}
            # Running out of memory!
            # times_bymod[key].append({"label": module_label, "name": module_name, "time": seconds, "event_number": event_number})
            # Let's do it right:
            # Instead of times_bymod[key]=[{"label": module_label, "name": module_name, "time": seconds, "event_number": event_number}]
            # let's do times_bymod[key]={"module_label":{"module_name":[(seconds,event_number)]}} so we do not repeat label and name and especially they are not a pair of key/value
            # During the first event all the keys will be initialized, then from event 2 on it will just be appending the (seconds,event_number) tuple to the list with the appropriate keys:

        # Check/set up the module label dict:
        try:
            times_bymod[key][module_label]
        except KeyError:
            times_bymod[key].update({module_label: {}})

        # Check/set up the module name dict:
        try:
            times_bymod[key][module_label][module_name]
        except KeyError:
            times_bymod[key][module_label].update({module_name: []})

        # We're now ready to add the info as a tuple in the list!
        times_bymod[key][module_label][module_name].append((seconds, event_number))

    # calculate Min, Max, Avg, RMS for each module and in this way get the final data to be imported
    ## for mod_name in times_bymod.keys():
    ##     # copy needed data
    ##     # mod_data = {"label": times_bymod[mod_name][0]["label"], "name": times_bymod[mod_name][0]["name"]}
    ##     # New data structure:
    ##     mod_data = {"label": times_bymod[mod_name].keys()[0], "name": times_bymod[mod_name][times_bymod[mod_name].keys()[0]].keys()[0]}
    ##     # add statistical data

    for key in times_bymod.keys():
        for label in times_bymod[key].keys():
            mod_data = {'label': label}
            for name in times_bymod[key][label].keys():
                mod_data.update({'name': name})
                mod_data['stats'] = calc_MinMaxAvgRMS(f_time=lambda x: x[0], f_evt_num=lambda x: x[1], items=times_bymod[key][label][name])
                mod_data['stats']['num_events'] = len(times_bymod[key][label][name])
                times_bymod[key] = mod_data
    return times_bymod
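
# A short usage sketch (not part of the original module): feed a few hand-made
# (record_number, module_label, module_name, seconds) tuples, as produced by
# loadTimeLog(), and inspect the per-module statistics.
def _example_group_by_label():
    mod_timelog = [
        (1, "csctfDigis", "CSCTFUnpacker", 0.0624561),
        (2, "csctfDigis", "CSCTFUnpacker", 0.0131760),
        (3, "csctfDigis", "CSCTFUnpacker", 0.0129210),
    ]
    by_label = processModuleTimeLogData(mod_timelog, groupBy="module_label")
    # by_label["csctfDigis"] -> {'label': 'csctfDigis', 'name': 'CSCTFUnpacker',
    #                            'stats': {..., 'cputime_first': 0.0624561, 'num_events': 2}}
    # (num_events is 2 because calc_MinMaxAvgRMS drops the first event from the
    #  list and only keeps it as "cputime_first")
    return by_label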


def manual_run():
    timelog_f = "TTBAR__RAW2DIGI,RECO_TimingReport.log"
    timelog_f = "TTBAR__GEN,SIM,DIGI,L1,DIGI2RAW,HLT_TimingReport.log"
    # TODO: get STEP name from filename
    release_files = {

        "CMSSW_3_1_0_pre9":
        (
            "CMSSW_3_1_0_pre9/MINBIAS__RAW2DIGI,RECO_TimingReport.log",
            "CMSSW_3_1_0_pre9/TTBAR__RAW2DIGI,RECO_TimingReport.log")
        ## "CMSSW_3_1_0_pre10":
    }
    for release, files in release_files.items():
        print("Processing release: %s" % release)
        for timelog_f in files:
            print("Processing file: %s" % timelog_f)

            # TODO: automatically detect type of report file!!!
            (mod_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)

            mod_timelog = processModuleTimeLogData(mod_timelog, groupBy="module_label")
            print("Number of modules grouped by (module_label): %s" % len(mod_timelog))

            # jobID: the (candle, step, pileup_type, conditions, event_content) tuple identifying the job
            jobID = getJobID_fromTimeReportLogName(timelog_f)
            (candle, step, pileup_type, conditions, event_content) = jobID

            """ We could get the release from the path but that's quite ugly! """
            export_xml(jobID=jobID, release=release, timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data))


""" use to run performance profiling """
def perf_profile():
    timelog_f = "test_data/TTBAR__RAW2DIGI,RECO_TimingReport.log"
    (modules_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)

    mod_timelog = processModuleTimeLogData(modules_timelog, groupBy="module_label")

    (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)

    xmldoc = minidom.Document()
    export_xml(step=step, candle=candle, release="test", timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data), xml_doc=xmldoc)
    write_xml(xmldoc, "test_xml_output.xml")


if (__name__ == "__main__"):
    perf_profile()