CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_4_1_8_patch9/src/Validation/Performance/python/parserTimingReport.py

Go to the documentation of this file.
00001 #!/usr/bin/env python2.4
00002 import sys
00003 import math
00004 import re
00005 from cmssw_exportdb_xml import *
00006 from FileNamesHelper import *
00007 
00008 """
00009 Performance profiling:
00010    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
00011         1    0.000    0.000   71.618   71.618 <stdin>:1(foo)
00012         1    0.315    0.315   71.933   71.933 <string>:1(<module>)
00013         1   47.561   47.561   71.618   71.618 parserTimingReport.py:27(loadTimeLog)
00014      8000    0.191    0.000    0.343    0.000 parserTimingReport.py:8(extractRSS_VSIZE)
00015         1    0.000    0.000    0.000    0.000 {len}
00016   2384000    3.239    0.000    3.239    0.000 {method 'append' of 'list' objects}
00017         1    0.000    0.000    0.000    0.000 {method 'close' of 'file' objects}
00018         1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
00019     24000    0.041    0.000    0.041    0.000 {method 'partition' of 'str' objects}
00020   2392000    5.804    0.000    5.804    0.000 {method 'split' of 'str' objects}
00021  10791332   14.782    0.000   14.782    0.000 {method 'strip' of 'str' objects}
00022         1    0.000    0.000    0.000    0.000 {method 'xreadlines' of 'file' objects}
00023         1    0.000    0.000    0.000    0.000 {open}
00024 
00025 """
00026 
00027 
00028 """ given two lines returns the VSIZE and RSS values along with event number """
00029 def extractRSS_VSIZE(line1, line2):
00030         """ 
00031         >>> extractRSS_VSIZE("%MSG-w MemoryCheck:  PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1", \
00032                              "MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0")
00033         (('1', '760.25'), ('1', '923.07'))
00034         """
00035 
00036         if ("Run" in line1) and  ("Event" in line1): # the first line
00037                 event_number = line1.split('Event:')[1].strip() 
00038         else: return False
00039 
00040         """ it's first or second MemoryCheck line """
00041         if ("VSIZE" in line2) and ("RSS" in line2): # the second line
00042                 RSS = line2.split("RSS")[1].strip().split(" ")[0].strip() #changed partition into split for backward compatability with py2.3
00043                 VSIZE = line2.split("RSS")[0].strip().split("VSIZE")[1].strip().split(" ")[0].strip()
00044                 return ((event_number, RSS), (event_number, VSIZE))
00045         else: return False
00046 
00047 
00048 def loadTimeLog(log_filename, maxsize_rad = 0): #TODO: remove maxsize to read, used for debugging
00049         """ gets the timing data from the logfile
00050          returns 4 lists:
00051 
00052                 * ModuleTime data (event_number, module_label, module_name, seconds) and
00053                 * EventTime data
00054                         - with granularity of event (initial - not processed data)
00055                 * RSS per event
00056                 * VSIZE per event
00057          """
00058         # ----- format of logfile ----
00059         # Report columns headings for modules: eventnum runnum modulelabel modulename timetakeni"
00060         # e.g. TimeModule> 1 1 csctfDigis CSCTFUnpacker 0.0624561
00061 
00062         mod_data = []
00063         evt_data = []
00064         rss_data = []
00065         vsize_data = []
00066         # open file and read it and fill the structure!
00067         logfile = open(log_filename, 'r')
00068         
00069         # get only the lines which have time report data
00070         #TODO: reading and processing line by line might speed up the process!
00071                                 
00072         memcheck_line1 = False
00073 
00074 
00075         for line in logfile.xreadlines():
00076                 if 'TimeModule>' in line.strip():
00077                         line = line.strip()
00078                         line_content_list = line.split(' ')[0:]
00079         
00080                         event_number = int(line_content_list[1])
00081                                 # module label and name were mixed up in the original doc
00082                         module_label = str(line_content_list[4])
00083                         module_name = str(line_content_list[3])
00084                         seconds = float(line_content_list[5])
00085         
00086                         mod_data.append((event_number, module_label, module_name, seconds))
00087 
00088                 if 'TimeEvent>' in line.strip():
00089                         line = line.strip()
00090                         line_content_list = line.split(' ')[0:]
00091         
00092                         event_number = int(line_content_list[1])
00093                         time_seconds = str(line_content_list[3])
00094                         
00095                         #TODO: what are the other [last two] numbers? Real time? smf else? TimeEvent> 1 1 15.3982 13.451 13.451
00096                         evt_data.append((event_number, time_seconds))
00097                 """ 
00098                         %MSG-w MemoryCheck:  PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1
00099                         MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0
00100                 """
00101                 if 'MemoryCheck:' in line.strip():
00102                         # this is the first line out of two
00103                         if (not memcheck_line1):
00104                                 memcheck_line1 = line.strip()
00105                         else: 
00106                                 (rss, vsize) = extractRSS_VSIZE(memcheck_line1, line.strip())
00107                                 rss_data.append(rss)
00108                                 vsize_data.append(vsize)
00109                 else: 
00110                         memcheck_line1 = False                                                                                                                                                                                                                                                                                          
00111 
00112         logfile.close()
00113         
00114         return (mod_data, evt_data, rss_data, vsize_data)
00115 
00116 
00117 
00118 
00119 def calcRMS(items,avg):
00120         """ returns RootMeanSquare  of items in a list """ 
00121         # sqrt(sum(x^2))
00122         # Not statistics RMS... "physics" RMS, i.e. standard deviation: sqrt(sum((x-avg)**2)/N)
00123         # return math.sqrt(reduce(lambda x: (x - avg)**2, items) / len(items))  
00124         return math.sqrt(sum([(x-avg)**2 for x in items])/len(items))
00125 
00126 def calc_MinMaxAvgRMS(items, remove_first = True, f_time = lambda x: x[0], f_evt_num = lambda x: x[1],):
00127         """ returns a dict of avg, min, max, rms """
00128         # save the cpu time of first event before removing the first result!
00129         cpu_time_first = f_time(items[0])
00130 
00131         if len(items) > 1 and remove_first == True:
00132                 items.remove(items[0]) #TODO: if there is only one event - we have a problem -> do we eliminate the whole module?
00133                 # TODO: it removes it completely from all the data because we do not save/ do not copy it
00134 
00135         items_time = map(f_time, items)
00136         min_value = min(items_time)
00137         max_value = max(items_time)
00138         max_index = items_time.index(max_value)
00139         avg_value = float(sum(items_time)) / float(len(items_time))
00140         rms_value = calcRMS(items_time,avg_value)
00141 
00142         return {"min":  min_value, "max": max_value, "cputime_first": cpu_time_first,
00143                         "rms": rms_value, "avg": avg_value,
00144                         "event_number_of_max": f_evt_num(items[max_index])}
00145         
00146         
00147 def processModuleTimeLogData(modules_timelog, groupBy = "module_name"):
00148         """ Processes the timelog data grouping events by module and calculates min, max, avg, rms 
00149         Returns data as a list of dicts like: !
00150         
00151          {
00152                 <module_name>: 
00153                         {name:, label:, 
00154                                 stats: {num_events, avg, min, max, rms} 
00155          } 
00156          
00157          """
00158         # group by module_name, we save a list for each module name
00159         times_bymod = {}
00160         
00161         # print "Num of useful TimeLog lines: %s" % len(modules_timelog)
00162         
00163         for time_data in modules_timelog:
00164                 (event_number, module_label, module_name, seconds) = time_data
00165                 
00166                 # group times of modules By label or name, TODO: maybe both 
00167                 if groupBy == "module_label":
00168                         key = module_label
00169                 else:
00170                         if groupBy =="name+label": 
00171                                 key = module_name + "_" + module_label
00172                         else:
00173                                 key = module_name
00174 
00175                         
00176                 try:
00177                         # is the list for current module initialized?
00178                         times_bymod[key]
00179                 except KeyError:
00180                         #Changing this from a list to a dict (see comments below):
00181                         #times_bymod[key] = []
00182                         times_bymod[key] = {}
00183                 #Running out of memory!
00184                 #times_bymod[key].append({"label": module_label, "name": module_name, "time": seconds, "event_number": event_number})
00185                 #Let's do it right:
00186                 #Instead of times_bymod[key]=[{"label": module_label, "name": module_name, "time": seconds, "event_number": event_number}]
00187                 #let's do times_bymod[key]={"module_label":{"module_name":[(seconds,event_number)]}} so we do not repeat label and name and especially they are not a pair of key/value
00188                 #During the first event all the keys will be initialized, then from event 2 on it will be just appending the (seconds,event_number) tuple to the list with the appropriate keys:
00189 
00190                 #Check/Set up the module label dict:
00191                 try:
00192                         times_bymod[key][module_label]
00193                 except KeyError:
00194                         times_bymod[key].update({module_label:{}})
00195                 
00196                 #Check/Set up the module name dict:
00197                 try:
00198                         times_bymod[key][module_label][module_name]
00199                 except KeyError:
00200                         times_bymod[key][module_label].update({module_name:[]})
00201                 
00202                 #We're now ready to add the info as a tuple in the list!
00203                 times_bymod[key][module_label][module_name].append((seconds,event_number))
00204                 
00205                 
00206         # calculate Min,Max, Avg, RMS for each module and in this way get the final data to be imported
00207         ##for mod_name in times_bymod.keys():
00208         ##      #copy needed data
00209         ##      #mod_data = {"label": times_bymod[mod_name][0]["label"], "name": times_bymod[mod_name][0]["name"]}
00210         ##      #New data structure:
00211         ##      mod_data = {"label":times_bymod[mod_name].keys()[0],"name":times_bymod[mod_name][times_bymod[mod_name].keys()[0]].keys()[0]}
00212         ##      # add statistical data
00213         ##
00214         ##      mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_number"], items = times_bymod[mod_name])
00215         ##
00216         ##      mod_data["stats"]["num_events"] = len(times_bymod[mod_name])
00217         ##      
00218         ##      times_bymod[mod_name] = mod_data
00219         #Let's rewrite this using the dictionary we now have without any logical change (could do with some...):
00220         for key in times_bymod.keys():
00221                 for label in times_bymod[key].keys():
00222                         mod_data={'label':label}
00223                         for name in times_bymod[key][label].keys():
00224                                 mod_data.update({'name':name})
00225                                 mod_data['stats']= calc_MinMaxAvgRMS(f_time= lambda x:x[0],f_evt_num=lambda x:x[1],items=times_bymod[key][label][name])
00226                                 mod_data['stats']['num_events']=len(times_bymod[key][label][name])
00227                 times_bymod[key]=mod_data
00228         return times_bymod
00229 
00230 def manual_run():
00231         timelog_f = "TTBAR__RAW2DIGI,RECO_TimingReport.log"
00232         timelog_f = "TTBAR__GEN,SIM,DIGI,L1,DIGI2RAW,HLT_TimingReport.log"      
00233         #TODO: get STEP name from filename
00234         release_files = {
00235                          
00236                          "CMSSW_3_1_0_pre9": 
00237                          (
00238                          "CMSSW_3_1_0_pre9/MINBIAS__RAW2DIGI,RECO_TimingReport.log", 
00239                           "CMSSW_3_1_0_pre9/TTBAR__RAW2DIGI,RECO_TimingReport.log")
00240                          ## "CMSSW_3_1_0_pre10": 
00241                          }
00242         for release, files in release_files.items():
00243                 print "Processing release: %s" % release
00244                 for timelog_f in files:
00245                         print "Processing file: %s" % timelog_f
00246                         
00247                         # TODO: automaticaly detect type of report file!!!
00248                         (mod_timelog, evt_timelog, rss_data, vsize_data) =loadTimeLog(timelog_f)
00249                         
00250                         mod_timelog= processModuleTimeLogData(mod_timelog, groupBy = "module_label")
00251                         print "Number of modules grouped by (module_label): %s" % len(mod_timelog)
00252 
00253                         (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
00254                         
00255                         """ We could get release from the path but that's quite ugly! """
00256                         export_xml(jobID = jobID, release=release, timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data))     
00257 
00258 """ use to run performance profiling """
00259 def perf_profile():
00260         timelog_f = "test_data/TTBAR__RAW2DIGI,RECO_TimingReport.log"
00261         (modules_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)   
00262 
00263         mod_timelog= processModuleTimeLogData(modules_timelog, groupBy = "module_label")
00264 
00265         (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
00266         
00267         xmldoc = minidom.Document()
00268         export_xml(step = step, candle = candle, release="test", timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data), xml_doc = xmldoc)     
00269         write_xml(xmldoc, "test_xml_output.xml")
00270         
00271 if (__name__ == "__main__"):
00272         perf_profile()