CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
parserTimingReport.py
Go to the documentation of this file.
1 #!/usr/bin/env python2.4
2 import sys
3 import math
4 import re
5 from cmssw_exportdb_xml import *
6 from FileNamesHelper import *
7 
8 """
9 Performance profiling:
10  ncalls tottime percall cumtime percall filename:lineno(function)
11  1 0.000 0.000 71.618 71.618 <stdin>:1(foo)
12  1 0.315 0.315 71.933 71.933 <string>:1(<module>)
13  1 47.561 47.561 71.618 71.618 parserTimingReport.py:27(loadTimeLog)
14  8000 0.191 0.000 0.343 0.000 parserTimingReport.py:8(extractRSS_VSIZE)
15  1 0.000 0.000 0.000 0.000 {len}
16  2384000 3.239 0.000 3.239 0.000 {method 'append' of 'list' objects}
17  1 0.000 0.000 0.000 0.000 {method 'close' of 'file' objects}
18  1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
19  24000 0.041 0.000 0.041 0.000 {method 'partition' of 'str' objects}
20  2392000 5.804 0.000 5.804 0.000 {method 'split' of 'str' objects}
21  10791332 14.782 0.000 14.782 0.000 {method 'strip' of 'str' objects}
22  1 0.000 0.000 0.000 0.000 {method 'xreadlines' of 'file' objects}
23  1 0.000 0.000 0.000 0.000 {open}
24 
25 """
26 
27 
28 """ given two lines returns the VSIZE and RSS values along with event number """
29 def extractRSS_VSIZE(line1, line2):
30  """
31  >>> extractRSS_VSIZE("%MSG-w MemoryCheck: PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1", \
32  "MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0")
33  (('1', '760.25'), ('1', '923.07'))
34  """
35 
36  if ("Run" in line1) and ("Event" in line1): # the first line
37  event_number = line1.split('Event:')[1].strip()
38  else: return False
39 
40  """ it's first or second MemoryCheck line """
41  if ("VSIZE" in line2) and ("RSS" in line2): # the second line
42  RSS = line2.split("RSS")[1].strip().split(" ")[0].strip() #changed partition into split for backward compatability with py2.3
43  VSIZE = line2.split("RSS")[0].strip().split("VSIZE")[1].strip().split(" ")[0].strip()
44  return ((event_number, RSS), (event_number, VSIZE))
45  else: return False
46 
47 
48 def loadTimeLog(log_filename, maxsize_rad = 0): #TODO: remove maxsize to read, used for debugging
49  """ gets the timing data from the logfile
50  returns 4 lists:
51 
52  * ModuleTime data (event_number, module_label, module_name, seconds) and
53  * EventTime data
54  - with granularity of event (initial - not processed data)
55  * RSS per event
56  * VSIZE per event
57  """
58  # ----- format of logfile ----
59  # Report columns headings for modules: eventnum runnum modulelabel modulename timetakeni"
60  # e.g. TimeModule> 1 1 csctfDigis CSCTFUnpacker 0.0624561
61 
62  mod_data = []
63  evt_data = []
64  rss_data = []
65  vsize_data = []
66  # open file and read it and fill the structure!
67  logfile = open(log_filename, 'r')
68 
69  # get only the lines which have time report data
70  #TODO: reading and processing line by line might speed up the process!
71 
72  memcheck_line1 = False
73 
74 
75  for line in logfile.xreadlines():
76  if 'TimeModule>' in line.strip():
77  line = line.strip()
78  line_content_list = line.split(' ')[0:]
79 
80  event_number = int(line_content_list[1])
81  # module label and name were mixed up in the original doc
82  module_label = str(line_content_list[4])
83  module_name = str(line_content_list[3])
84  seconds = float(line_content_list[5])
85 
86  mod_data.append((event_number, module_label, module_name, seconds))
87 
88  if 'TimeEvent>' in line.strip():
89  line = line.strip()
90  line_content_list = line.split(' ')[0:]
91 
92  event_number = int(line_content_list[1])
93  time_seconds = str(line_content_list[3])
94 
95  #TODO: what are the other [last two] numbers? Real time? smf else? TimeEvent> 1 1 15.3982 13.451 13.451
96  evt_data.append((event_number, time_seconds))
97  """
98  %MSG-w MemoryCheck: PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1
99  MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0
100  """
101  if 'MemoryCheck:' in line.strip():
102  # this is the first line out of two
103  if (not memcheck_line1):
104  memcheck_line1 = line.strip()
105  else:
106  (rss, vsize) = extractRSS_VSIZE(memcheck_line1, line.strip())
107  rss_data.append(rss)
108  vsize_data.append(vsize)
109  else:
110  memcheck_line1 = False
111 
112  logfile.close()
113 
114  return (mod_data, evt_data, rss_data, vsize_data)
115 
116 
117 
118 
119 def calcRMS(items,avg):
120  """ returns RootMeanSquare of items in a list """
121  # sqrt(sum(x^2))
122  # Not statistics RMS... "physics" RMS, i.e. standard deviation: sqrt(sum((x-avg)**2)/N)
123  # return math.sqrt(reduce(lambda x: (x - avg)**2, items) / len(items))
124  return math.sqrt(sum([(x-avg)**2 for x in items])/len(items))
125 
126 def calc_MinMaxAvgRMS(items, remove_first = True, f_time = lambda x: x[0], f_evt_num = lambda x: x[1],):
127  """ returns a dict of avg, min, max, rms """
128  # save the cpu time of first event before removing the first result!
129  cpu_time_first = f_time(items[0])
130 
131  if len(items) > 1 and remove_first == True:
132  items.remove(items[0]) #TODO: if there is only one event - we have a problem -> do we eliminate the whole module?
133  # TODO: it removes it completely from all the data because we do not save/ do not copy it
134 
135  items_time = map(f_time, items)
136  min_value = min(items_time)
137  max_value = max(items_time)
138  max_index = items_time.index(max_value)
139  avg_value = float(sum(items_time)) / float(len(items_time))
140  rms_value = calcRMS(items_time,avg_value)
141 
142  return {"min": min_value, "max": max_value, "cputime_first": cpu_time_first,
143  "rms": rms_value, "avg": avg_value,
144  "event_number_of_max": f_evt_num(items[max_index])}
145 
146 
147 def processModuleTimeLogData(modules_timelog, groupBy = "module_name"):
148  """ Processes the timelog data grouping events by module and calculates min, max, avg, rms
149  Returns data as a list of dicts like: !
150 
151  {
152  <module_name>:
153  {name:, label:,
154  stats: {num_events, avg, min, max, rms}
155  }
156 
157  """
158  # group by module_name, we save a list for each module name
159  times_bymod = {}
160 
161  # print "Num of useful TimeLog lines: %s" % len(modules_timelog)
162 
163  for time_data in modules_timelog:
164  (event_number, module_label, module_name, seconds) = time_data
165 
166  # group times of modules By label or name, TODO: maybe both
167  if groupBy == "module_label":
168  key = module_label
169  else:
170  if groupBy =="name+label":
171  key = module_name + "_" + module_label
172  else:
173  key = module_name
174 
175 
176  try:
177  # is the list for current module initialized?
178  times_bymod[key]
179  except KeyError:
180  #Changing this from a list to a dict (see comments below):
181  #times_bymod[key] = []
182  times_bymod[key] = {}
183  #Running out of memory!
184  #times_bymod[key].append({"label": module_label, "name": module_name, "time": seconds, "event_number": event_number})
185  #Let's do it right:
186  #Instead of times_bymod[key]=[{"label": module_label, "name": module_name, "time": seconds, "event_number": event_number}]
187  #let's do times_bymod[key]={"module_label":{"module_name":[(seconds,event_number)]}} so we do not repeat label and name and especially they are not a pair of key/value
188  #During the first event all the keys will be initialized, then from event 2 on it will be just appending the (seconds,event_number) tuple to the list with the appropriate keys:
189 
190  #Check/Set up the module label dict:
191  try:
192  times_bymod[key][module_label]
193  except KeyError:
194  times_bymod[key].update({module_label:{}})
195 
196  #Check/Set up the module name dict:
197  try:
198  times_bymod[key][module_label][module_name]
199  except KeyError:
200  times_bymod[key][module_label].update({module_name:[]})
201 
202  #We're now ready to add the info as a tuple in the list!
203  times_bymod[key][module_label][module_name].append((seconds,event_number))
204 
205 
206  # calculate Min,Max, Avg, RMS for each module and in this way get the final data to be imported
207  ##for mod_name in times_bymod.keys():
208  ## #copy needed data
209  ## #mod_data = {"label": times_bymod[mod_name][0]["label"], "name": times_bymod[mod_name][0]["name"]}
210  ## #New data structure:
211  ## mod_data = {"label":times_bymod[mod_name].keys()[0],"name":times_bymod[mod_name][times_bymod[mod_name].keys()[0]].keys()[0]}
212  ## # add statistical data
213  ##
214  ## mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_number"], items = times_bymod[mod_name])
215  ##
216  ## mod_data["stats"]["num_events"] = len(times_bymod[mod_name])
217  ##
218  ## times_bymod[mod_name] = mod_data
219  #Let's rewrite this using the dictionary we now have without any logical change (could do with some...):
220  for key in times_bymod.keys():
221  for label in times_bymod[key].keys():
222  mod_data={'label':label}
223  for name in times_bymod[key][label].keys():
224  mod_data.update({'name':name})
225  mod_data['stats']= calc_MinMaxAvgRMS(f_time= lambda x:x[0],f_evt_num=lambda x:x[1],items=times_bymod[key][label][name])
226  mod_data['stats']['num_events']=len(times_bymod[key][label][name])
227  times_bymod[key]=mod_data
228  return times_bymod
229 
231  timelog_f = "TTBAR__RAW2DIGI,RECO_TimingReport.log"
232  timelog_f = "TTBAR__GEN,SIM,DIGI,L1,DIGI2RAW,HLT_TimingReport.log"
233  #TODO: get STEP name from filename
234  release_files = {
235 
236  "CMSSW_3_1_0_pre9":
237  (
238  "CMSSW_3_1_0_pre9/MINBIAS__RAW2DIGI,RECO_TimingReport.log",
239  "CMSSW_3_1_0_pre9/TTBAR__RAW2DIGI,RECO_TimingReport.log")
240  ## "CMSSW_3_1_0_pre10":
241  }
242  for release, files in release_files.items():
243  print "Processing release: %s" % release
244  for timelog_f in files:
245  print "Processing file: %s" % timelog_f
246 
247  # TODO: automaticaly detect type of report file!!!
248  (mod_timelog, evt_timelog, rss_data, vsize_data) =loadTimeLog(timelog_f)
249 
250  mod_timelog= processModuleTimeLogData(mod_timelog, groupBy = "module_label")
251  print "Number of modules grouped by (module_label): %s" % len(mod_timelog)
252 
253  (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
254 
255  """ We could get release from the path but that's quite ugly! """
256  export_xml(jobID = jobID, release=release, timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data))
257 
258 """ use to run performance profiling """
260  timelog_f = "test_data/TTBAR__RAW2DIGI,RECO_TimingReport.log"
261  (modules_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)
262 
263  mod_timelog= processModuleTimeLogData(modules_timelog, groupBy = "module_label")
264 
265  (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
266 
267  xmldoc = minidom.Document()
268  export_xml(step = step, candle = candle, release="test", timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data), xml_doc = xmldoc)
269  write_xml(xmldoc, "test_xml_output.xml")
270 
271 if (__name__ == "__main__"):
272  perf_profile()
def getJobID_fromTimeReportLogName
void strip(std::string &input, const std::string &blanks=" \n\t")
Definition: stringTools.cc:16
def processModuleTimeLogData
mod_data[&quot;stats&quot;] =calc_MinMaxAvgRMS(f_time = lambda x: x[&quot;time&quot;], f_evt_num = lambda x: x[&quot;event_num...
#define min(a, b)
Definition: mlp_lapack.h:161
const T & max(const T &a, const T &b)
#define update(a, b)
double split
Definition: MVATrainer.cc:139