CMS 3D CMS Logo

Functions

parserTimingReport Namespace Reference

Functions

def calc_MinMaxAvgRMS
def calcRMS
def extractRSS_VSIZE
def loadTimeLog
def manual_run
def perf_profile
def processModuleTimeLogData
 mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_number"], items = times_bymod[mod_name])

Function Documentation

def parserTimingReport::calc_MinMaxAvgRMS (   items,
  remove_first = True,
  f_time = lambda x: x[0],
  f_evt_num = lambda x: x[1] 
)
returns a dict of avg, min, max, rms 

Definition at line 141 of file parserTimingReport.py.

00141                                                                    : x[0], f_evt_num = lambda x: x[1],):
00142         """ returns a dict of avg, min, max, rms """
00143         # save the cpu time of first event before removing the first result!
00144         cpu_time_first = f_time(items[0])
00145 
00146         if len(items) > 1 and remove_first == True:
00147                 items.remove(items[0]) #TODO: if there is only one event - we have a problem -> do we eliminate the whole module?
00148                 # TODO: it removes it completely from all the data because we do not save/ do not copy it
00149 
00150         items_time = map(f_time, items)
00151         min_value = min(items_time)
00152         max_value = max(items_time)
00153         max_index = items_time.index(max_value)
00154         avg_value = float(sum(items_time)) / float(len(items_time))
00155         rms_value = calcRMS(items_time,avg_value)
00156 
00157         return {"min":  min_value, "max": max_value, "cputime_first": cpu_time_first,
00158                         "rms": rms_value, "avg": avg_value,
00159                         "event_number_of_max": f_evt_num(items[max_index])}
00160         
00161         
def parserTimingReport::calcRMS (   items,
  avg 
)
returns RootMeanSquare  of items in a list 

Definition at line 134 of file parserTimingReport.py.

00135                       :
00136         """ returns RootMeanSquare  of items in a list """ 
00137         # sqrt(sum(x^2))
00138         # Not statistics RMS... "physics" RMS, i.e. standard deviation: sqrt(sum((x-avg)**2)/N)
00139         # return math.sqrt(reduce(lambda x: (x - avg)**2, items) / len(items))  
00140         return math.sqrt(sum([(x-avg)**2 for x in items])/len(items))

def parserTimingReport::extractRSS_VSIZE (   line1,
  line2,
  record_number 
)
>>> extractRSS_VSIZE("%MSG-w MemoryCheck:  PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1", \
                     "MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0")
(('1', '760.25'), ('1', '923.07'))

Definition at line 29 of file parserTimingReport.py.

00030                                                  :
00031         """ 
00032         >>> extractRSS_VSIZE("%MSG-w MemoryCheck:  PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1", \
00033                              "MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0")
00034         (('1', '760.25'), ('1', '923.07'))
00035         """
00036 
00037         if ("Run" in line1) and  ("Event" in line1): # the first line
00038                 event_number = line1.split('Event:')[1].strip() 
00039         else: return False
00040 
00041         """ it's first or second MemoryCheck line """
00042         if ("VSIZE" in line2) and ("RSS" in line2): # the second line
00043                 RSS = line2.split("RSS")[1].strip().split(" ")[0].strip() #changed partition into split for backward compatability with py2.3
00044                 VSIZE = line2.split("RSS")[0].strip().split("VSIZE")[1].strip().split(" ")[0].strip()
00045                 #Hack to return the record number instea of event number for now... can always switch back of add event number on top
00046                 #return ((event_number, RSS), (event_number, VSIZE))
00047                 return ((record_number, RSS), (record_number, VSIZE))
00048         else: return False
00049 

def parserTimingReport::loadTimeLog (   log_filename,
  maxsize_rad = 0 
)
gets the timing data from the logfile
 returns 4 lists:

        * ModuleTime data (event_number, module_label, module_name, seconds) and
        * EventTime data
                - with granularity of event (initial - not processed data)
        * RSS per event
        * VSIZE per event
 

Definition at line 50 of file parserTimingReport.py.

00050                                               : #TODO: remove maxsize to read, used for debugging
00051         """ gets the timing data from the logfile
00052          returns 4 lists:
00053 
00054                 * ModuleTime data (event_number, module_label, module_name, seconds) and
00055                 * EventTime data
00056                         - with granularity of event (initial - not processed data)
00057                 * RSS per event
00058                 * VSIZE per event
00059          """
00060         # ----- format of logfile ----
00061         # Report columns headings for modules: eventnum runnum modulelabel modulename timetakeni"
00062         # e.g. TimeModule> 1 1 csctfDigis CSCTFUnpacker 0.0624561
00063 
00064         mod_data = []
00065         evt_data = []
00066         rss_data = []
00067         vsize_data = []
00068         # open file and read it and fill the structure!
00069         logfile = open(log_filename, 'r')
00070         
00071         # get only the lines which have time report data
00072         #TODO: reading and processing line by line might speed up the process!
00073                                 
00074         memcheck_line1 = False
00075 
00076         record_number=0
00077         last_record=0
00078         last_event=0
00079         for line in logfile.xreadlines():
00080                 if 'TimeModule>' in line.strip():
00081                         line = line.strip()
00082                         line_content_list = line.split(' ')[0:]
00083                         #Hack to avoid issues with the non-consecutive run numbers:
00084                         event_number = int(line_content_list[1])
00085                         if event_number != last_event:
00086                                 record_number=record_number+1
00087                                 last_event=event_number
00088                                 # module label and name were mixed up in the original doc
00089                         module_label = str(line_content_list[4])
00090                         module_name = str(line_content_list[3])
00091                         seconds = float(line_content_list[5])
00092                         #For now let's try to switch to the record_number... if we need to also have the event_number we can always add it back.
00093                         #mod_data.append((event_number, module_label, module_name, seconds))
00094                         mod_data.append((record_number, module_label, module_name, seconds))
00095                 if 'TimeEvent>' in line.strip():
00096                         line = line.strip()
00097                         line_content_list = line.split(' ')[0:]
00098                         #Hack to avoid issues with the non-consecutive run numbers:
00099                         event_number = int(line_content_list[1])
00100                         if event_number != last_event:
00101                                 record_number=record_number+1
00102                                 last_event=event_number
00103                                 # module label and name were mixed up in the original doc
00104                         time_seconds = str(line_content_list[3])
00105                         
00106                         #TODO: what are the other [last two] numbers? Real time? smf else? TimeEvent> 1 1 15.3982 13.451 13.451
00107                         #For now let's try to switch to the record_number... if we need to also have the event_number we can always add it back.
00108                         #evt_data.append((event_number, time_seconds))
00109                         evt_data.append((record_number, time_seconds))
00110                 """ 
00111                         %MSG-w MemoryCheck:  PostModule 19-Jun-2009 13:06:08 CEST Run: 1 Event: 1
00112                         MemoryCheck: event : VSIZE 923.07 0 RSS 760.25 0
00113                 """
00114                 if 'MemoryCheck:' in line.strip():
00115                         # this is the first line out of two
00116                         if (not memcheck_line1):
00117                                 memcheck_line1 = line.strip()
00118                         else:
00119                                 #FIXME (eventually)
00120                                 #Hacking in the record_number extracted from the TimeEvent and TimeModule parsing... NOT ROBUST...
00121                                 (rss, vsize) = extractRSS_VSIZE(memcheck_line1, line.strip(), record_number)
00122                                 rss_data.append(rss)
00123                                 vsize_data.append(vsize)
00124                 else: 
00125                         memcheck_line1 = False                                                                                                                                                                                                                                                                                          
00126 
00127         logfile.close()
00128         
00129         return (mod_data, evt_data, rss_data, vsize_data)
00130 
00131 
00132 
00133 
def parserTimingReport::manual_run ( )

Definition at line 245 of file parserTimingReport.py.

00246                 :
00247         timelog_f = "TTBAR__RAW2DIGI,RECO_TimingReport.log"
00248         timelog_f = "TTBAR__GEN,SIM,DIGI,L1,DIGI2RAW,HLT_TimingReport.log"      
00249         #TODO: get STEP name from filename
00250         release_files = {
00251                          
00252                          "CMSSW_3_1_0_pre9": 
00253                          (
00254                          "CMSSW_3_1_0_pre9/MINBIAS__RAW2DIGI,RECO_TimingReport.log", 
00255                           "CMSSW_3_1_0_pre9/TTBAR__RAW2DIGI,RECO_TimingReport.log")
00256                          ## "CMSSW_3_1_0_pre10": 
00257                          }
00258         for release, files in release_files.items():
00259                 print "Processing release: %s" % release
00260                 for timelog_f in files:
00261                         print "Processing file: %s" % timelog_f
00262                         
00263                         # TODO: automaticaly detect type of report file!!!
00264                         (mod_timelog, evt_timelog, rss_data, vsize_data) =loadTimeLog(timelog_f)
00265                         
00266                         mod_timelog= processModuleTimeLogData(mod_timelog, groupBy = "module_label")
00267                         print "Number of modules grouped by (module_label): %s" % len(mod_timelog)
00268 
00269                         (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
00270                         
00271                         """ We could get release from the path but that's quite ugly! """
00272                         export_xml(jobID = jobID, release=release, timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data))     
00273 
""" use to run performance profiling """
def parserTimingReport::perf_profile ( )

Definition at line 274 of file parserTimingReport.py.

00275                   :
00276         timelog_f = "test_data/TTBAR__RAW2DIGI,RECO_TimingReport.log"
00277         (modules_timelog, evt_timelog, rss_data, vsize_data) = loadTimeLog(timelog_f)   
00278 
00279         mod_timelog= processModuleTimeLogData(modules_timelog, groupBy = "module_label")
00280 
00281         (candle, step, pileup_type, conditions, event_content) = getJobID_fromTimeReportLogName(timelog_f)
00282         
00283         xmldoc = minidom.Document()
00284         export_xml(step = step, candle = candle, release="test", timelog_result=(mod_timelog, evt_timelog, rss_data, vsize_data), xml_doc = xmldoc)     
00285         write_xml(xmldoc, "test_xml_output.xml")
        
def parserTimingReport::processModuleTimeLogData (   modules_timelog,
  groupBy = "module_name" 
)

mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_number"], items = times_bymod[mod_name])

Processes the timelog data grouping events by module and calculates min, max, avg, rms 
Returns data as a list of dicts like: !

 {
        <module_name>: 
                {name:, label:, 
                        stats: {num_events, avg, min, max, rms} 
 } 
 
 

mod_data["stats"]["num_events"] = len(times_bymod[mod_name])

times_bymod[mod_name] = mod_data Let's rewrite this using the dictionary we now have without any logical change (could do with some...):

Definition at line 162 of file parserTimingReport.py.

00163                                                                       :
00164         """ Processes the timelog data grouping events by module and calculates min, max, avg, rms 
00165         Returns data as a list of dicts like: !
00166         
00167          {
00168                 <module_name>: 
00169                         {name:, label:, 
00170                                 stats: {num_events, avg, min, max, rms} 
00171          } 
00172          
00173          """
00174         # group by module_name, we save a list for each module name
00175         times_bymod = {}
00176         
00177         # print "Num of useful TimeLog lines: %s" % len(modules_timelog)
00178         
00179         for time_data in modules_timelog:
00180                 (event_number, module_label, module_name, seconds) = time_data
00181                 
00182                 # group times of modules By label or name, TODO: maybe both 
00183                 if groupBy == "module_label":
00184                         key = module_label
00185                 else:
00186                         if groupBy =="name+label": 
00187                                 key = module_name + "_" + module_label
00188                         else:
00189                                 key = module_name
00190 
00191                         
00192                 try:
00193                         # is the list for current module initialized?
00194                         times_bymod[key]
00195                 except KeyError:
00196                         #Changing this from a list to a dict (see comments below):
00197                         #times_bymod[key] = []
00198                         times_bymod[key] = {}
00199                 #Running out of memory!
00200                 #times_bymod[key].append({"label": module_label, "name": module_name, "time": seconds, "event_number": event_number})
00201                 #Let's do it right:
00202                 #Instead of times_bymod[key]=[{"label": module_label, "name": module_name, "time": seconds, "event_number": event_number}]
00203                 #let's do times_bymod[key]={"module_label":{"module_name":[(seconds,event_number)]}} so we do not repeat label and name and especially they are not a pair of key/value
00204                 #During the first event all the keys will be initialized, then from event 2 on it will be just appending the (seconds,event_number) tuple to the list with the appropriate keys:
00205 
00206                 #Check/Set up the module label dict:
00207                 try:
00208                         times_bymod[key][module_label]
00209                 except KeyError:
00210                         times_bymod[key].update({module_label:{}})
00211                 
00212                 #Check/Set up the module name dict:
00213                 try:
00214                         times_bymod[key][module_label][module_name]
00215                 except KeyError:
00216                         times_bymod[key][module_label].update({module_name:[]})
00217                 
00218                 #We're now ready to add the info as a tuple in the list!
00219                 times_bymod[key][module_label][module_name].append((seconds,event_number))
00220                 
00221                 
00222         # calculate Min,Max, Avg, RMS for each module and in this way get the final data to be imported
00223         ##for mod_name in times_bymod.keys():
00224         ##      #copy needed data
00225         ##      #mod_data = {"label": times_bymod[mod_name][0]["label"], "name": times_bymod[mod_name][0]["name"]}
00226         ##      #New data structure:
00227         ##      mod_data = {"label":times_bymod[mod_name].keys()[0],"name":times_bymod[mod_name][times_bymod[mod_name].keys()[0]].keys()[0]}
00228         ##      # add statistical data
00229         ##
00230         ##      mod_data["stats"] =calc_MinMaxAvgRMS(f_time = lambda x: x["time"], f_evt_num = lambda x: x["event_number"], items = times_bymod[mod_name])
00231         ##
00232         ##      mod_data["stats"]["num_events"] = len(times_bymod[mod_name])
00233         ##      
00234         ##      times_bymod[mod_name] = mod_data
00235         #Let's rewrite this using the dictionary we now have without any logical change (could do with some...):
00236         for key in times_bymod.keys():
00237                 for label in times_bymod[key].keys():
00238                         mod_data={'label':label}
00239                         for name in times_bymod[key][label].keys():
00240                                 mod_data.update({'name':name})
00241                                 mod_data['stats']= calc_MinMaxAvgRMS(f_time= lambda x:x[0],f_evt_num=lambda x:x[1],items=times_bymod[key][label][name])
00242                                 mod_data['stats']['num_events']=len(times_bymod[key][label][name])
00243                 times_bymod[key]=mod_data
00244         return times_bymod