CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
edmStreamStallGrapher.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 #----------------------------------------------
4 def printHelp():
5  s = """Purpose: Convert a cmsRun log with Tracer info into a stream stall graph.
6 
7 edmStreamStallGrapher [-g] <log file name>
8 
9 Option: -g instead of ascii art, create a pdf file showing the work being done on each stream
10 
11 To Use: Add the Tracer Service to the cmsRun job you want to check for stream stalls.
12  Make sure to use the 'printTimstamps' option
13  cms.Service("Tracer", printTimestamps = cms.untracked.bool(True))
14  After running the job, execute this script and pass the name of the log file to the
15  script as the only command line argument.
16 
17 To Read: The script will then print an 'ASCII art' stall graph which consists of the name of
18  the module which either started or stopped running on a stream, and the state of each
19  stream at that moment in time and if the module just started, you will also see the
20  amount of time on that stream between the previous module finishing and this module starting.
21  The state of a stream is represented by a symbol:
22  blank (" ") the stream is currently running a module
23  line ("|") the stream is waiting to run a module
24  minus ("-") the stream has just finished waiting and is starting a module
25  plus ("+") the stream just finished running a module
26  If a module had to wait more than 0.1 seconds, the end of the line will have "STALLED".
27  Once the first 4 events have finished processing, the program prints "FINISH INIT".
28  This is useful if one wants to ignore stalled caused by startup actions, e.g. reading
29  conditions.
30 
31  Once the graph is completed, the program outputs the list of modules which had
32  the greatest total stall times. The list is sorted by total stall time and
33  written in descending order. In addition, the list of all stall times for the
34  module is given.
35 """
36  print s
37 
38 
39 #----------------------------------------------
40 def getTime(line):
41  time = line.split(" ")[1]
42  time = time.split(":")
43  time = int(time[0])*60*60+int(time[1])*60+float(time[2])
44  return time
45 
46 #Stream states
47 kStarted=0
48 kFinished=1
49 
50 #Special names
51 kSourceFindEvent = "sourceFindEvent"
52 
53 #----------------------------------------------
54 def readLogFile(fileName):
55  f = open(fileName,"r")
56 
57  processingSteps = list()
58  numStreams = 0
59  maxNameSize = 0
60  startTime = 0
61  foundEventToStartFrom = False
62  for l in f:
63  if not foundEventToStartFrom:
64  if l.find("event = 5") != -1:
65  foundEventToStartFrom = True
66  stream = int( l[l.find("stream = ")+9])
67  processingSteps.append(("FINISH INIT",kFinished,stream,getTime(l)-startTime,False))
68  if l.find("processing event :") != -1:
69  time = getTime(l)
70  if startTime == 0:
71  startTime = time
72  time = time - startTime
73  delayed = False
74  streamIndex = l.find("stream = ")
75  stream = int( l[streamIndex+9:l.find(" ",streamIndex+10)])
76  name = kSourceFindEvent
77  trans = kStarted
78  #the start of an event is the end of the framework part
79  if l.find("starting:") != -1:
80  trans = kFinished
81  processingSteps.append((name,trans,stream,time,delayed))
82  if stream > numStreams:
83  numStreams = stream
84  if l.find("processing event for module") != -1:
85  time = getTime(l)
86  if startTime == 0:
87  startTime = time
88  time = time - startTime
89  trans = kStarted
90  stream = 0
91  delayed = False
92  if l.find("finished:") != -1:
93  trans = kFinished
94  streamIndex = l.find("stream = ")
95  stream = int( l[streamIndex+9:l.find(" ",streamIndex+10)])
96  name = l.split("'")[1]
97  if l.find("delayed") != -1:
98  delayed = True
99  if len(name) > maxNameSize:
100  maxNameSize = len(name)
101  processingSteps.append((name,trans,stream,time,delayed))
102  if stream > numStreams:
103  numStreams = stream
104  f.close()
105  return (processingSteps,numStreams,maxNameSize)
106 
107 
108 #----------------------------------------------
109 # Patterns:
110 #
111 # source: The source is identified by having a 'starting: delayed' followed immediately by a 'finished: delayed'
112 # the total time is actually the wait time plus the time the source was doing work
113 # scheduled module: This is identified by no delays before call to 'starting' and then immediate call to 'finished'
114 # unscheduled module: Has a 'starting: delayed' followed by a 'starting' followed by a 'finished' the a 'finished: delayed'
115 #
116 #
117 def findStalledModules(processingSteps, numStreams):
118  streamTime = [0]*(numStreams+1)
119  stalledModules = {}
120  previousStartWasADelayed = [False]*(numStreams+1)
121  for n,trans,s,time,delayed in processingSteps:
122  waitTime = None
123  if trans == kStarted:
124  if delayed or (n == kSourceFindEvent):
125  previousStartWasADelayed[s] = True
126  streamTime[s] = time
127  else:
128  previousStartWasADelayed[s] = False
129  waitTime = time - streamTime[s]
130  else:
131  if (delayed or (n == kSourceFindEvent)) and previousStartWasADelayed[s]:
132  #for a source we only know the combined time for waiting and running
133  if n != kSourceFindEvent:
134  n = "source"
135  waitTime = time - streamTime[s]
136  previousStartWasADelayed[s] = False
137  streamTime[s] = time
138  if waitTime is not None:
139  if waitTime > 0.1:
140  t = stalledModules.setdefault(n,[])
141  t.append(waitTime)
142  return stalledModules
143 
144 
145 #----------------------------------------------
146 def createAsciiImage(processingSteps, numStreams, maxNameSize):
147  #print processingSteps
148  #exit(1)
149  streamState = [1]*(numStreams+1)
150  streamTime = [0]*(numStreams+1)
151  #lastTime = 0
152  seenInit = False
153  previousStartWasADelayed = [False]*(numStreams+1)
154  for n,trans,s,time,delayed in processingSteps:
155  if n == "FINISH INIT":
156  seenInit = True
157  continue
158  oldState = streamState[s]
159  streamState[s]=trans
160  waitTime = None
161  if trans == kStarted:
162  if delayed or (n == kSourceFindEvent):
163  previousStartWasADelayed[s] = True
164  streamTime[s] = time
165  continue
166  else:
167  previousStartWasADelayed[s] = False
168  waitTime = time - streamTime[s]
169  streamState[s]=2
170  else:
171  if delayed or (n == kSourceFindEvent):
172  if previousStartWasADelayed[s]:
173  if n != kSourceFindEvent:
174  n="source"
175  waitTime = time - streamTime[s]
176  streamState[s]=2
177  previousStartWasADelayed[s] = False
178  else:
179  streamTime[s] = time
180  continue
181  if oldState != trans:
182  streamState[s]=3
183  streamTime[s] = time
184  states = "%-*s: " % (maxNameSize,n)
185  for state in streamState:
186  if state == 0:
187  states +=" "
188  elif state == 1:
189  states +="|"
190  elif state == 2:
191  states +="-"
192  elif state == 3:
193  states +="+"
194  if waitTime is not None:
195  states += " %.2f"% waitTime
196  if waitTime > 0.1 and seenInit:
197  states += " STALLED "+str(time)+" "+str(s)
198 
199  print states
200  streamState[s]=trans
201  return stalledModules
202 
203 #----------------------------------------------
204 def printStalledModulesInOrder(stalledModules):
205  priorities = list()
206  maxNameSize = 0
207  for n,t in stalledModules.iteritems():
208  nameLength = len(n)
209  if nameLength > maxNameSize:
210  maxNameSize = nameLength
211  t.sort(reverse=True)
212  priorities.append((n,sum(t),t))
213 
214  def sumSort(i,j):
215  return cmp(i[1],j[1])
216  priorities.sort(cmp=sumSort, reverse=True)
217 
218  nameColumn = "Stalled Module"
219  if len(nameColumn) > maxNameSize:
220  maxNameSize = len(nameColumn)
221 
222  stallColumn = "Tot Stall Time"
223  stallColumnLength = len(stallColumn)
224 
225  print "%-*s" % (maxNameSize, nameColumn), "%-*s"%(stallColumnLength,stallColumn), " Stall Times"
226  for n,s,t in priorities:
227  paddedName = "%-*s:" % (maxNameSize,n)
228  print paddedName, "%-*.2f"%(stallColumnLength,s), ", ".join([ "%.2f"%x for x in t])
229 
230 
231 def createPDFImage(processingSteps, numStreams, stalledModuleInfo):
232  import matplotlib.pyplot as plt
233 
234  previousStartWasADelayed=[False]*(numStreams+1)
235  streamTime = [0]*(numStreams+1)
236  streamStartTimes = [ [] for x in xrange(numStreams+1)]
237  streamColors = [[] for x in xrange(numStreams+1)]
238 
239  stalledModuleNames = [ x for x in stalledModuleInfo.iterkeys()]
240 
241  streamStartTimes = [ [] for x in xrange(numStreams+1)]
242  streamColors = [[] for x in xrange(numStreams+1)]
243 
244  for n,trans,s,time,delayed in processingSteps:
245  if n == "FINISH INIT":
246  continue
247  if trans == kStarted:
248  previousStartWasADelayed[s] = delayed
249  streamTime[s] = time
250  else:
251  c="green"
252  if delayed:
253  if previousStartWasADelayed[s]:
254  #this was time for a source
255  c="orange"
256  previousStartWasADelayed[s] = False
257  else:
258  continue
259  if n == kSourceFindEvent:
260  c = "orange"
261  streamStartTimes[s].append((streamTime[s],time-streamTime[s]))
262  if c == "green" and n in stalledModuleNames:
263  c="red"
264  #elif len(streamColors[s]) %2:
265  # c="blue"
266  streamColors[s].append(c)
267 
268  #consolodate contiguous blocks with the same color
269  # this drastically reduces the size of the pdf file
270  oldStreamTimes = streamStartTimes
271  oldStreamColors = streamColors
272 
273  streamStartTimes = [ [] for x in xrange(numStreams+1)]
274  streamColors = [[] for x in xrange(numStreams+1)]
275 
276  for s in xrange(numStreams+1):
277  streamStartTimes[s].append(oldStreamTimes[s][0])
278  streamColors[s].append(oldStreamColors[s][0])
279  lastStartTime,lastTimeLength = oldStreamTimes[s][0]
280  lastColor = oldStreamColors[s][0]
281  for i in xrange(1, len(oldStreamTimes[s])):
282  start,length = oldStreamTimes[s][i]
283  color = oldStreamColors[s][i]
284  #use a millisecond tolerance to avoid rounding
285  if color == lastColor and abs(lastStartTime+lastTimeLength-start)<0.001:
286  lastTimeLength += length
287  else:
288  streamStartTimes[s].append((lastStartTime,lastTimeLength))
289  streamColors[s].append(lastColor)
290  lastStartTime = start
291  lastTimeLength = length
292  lastColor = color
293  streamStartTimes[s].append((lastStartTime,lastTimeLength))
294  streamColors[s].append(lastColor)
295 
296  fig, ax = plt.subplots()
297  ax.set_xlabel("Time (sec)")
298  ax.set_ylabel("Stream ID")
299 
300  i=1
301  for s in xrange(numStreams+1):
302  t = streamStartTimes[s]
303  ax.broken_barh(t,(i-0.4,0.8),facecolors=streamColors[s],edgecolors=streamColors[s],linewidth=0)
304  i=i+1
305 
306  #add key .1, .3, .7
307  fig.text(0.1, 0.95, "modules running", color = "green", horizontalalignment = 'left')
308  fig.text(0.5, 0.95, "stalled module running", color = "red", horizontalalignment = 'center')
309  fig.text(0.9, 0.95, "read from input", color = "orange", horizontalalignment = 'right')
310  plt.savefig("stall.pdf")
311 
312 
313 
314 #=======================================
315 if __name__=="__main__":
316  import sys
317 
318  if len(sys.argv) == 1:
319  printHelp()
320  exit(0)
321 
322  doGraphic = False
323  if len(sys.argv) == 3:
324  if sys.argv[1] == '-g':
325  doGraphic = True
326  else:
327  print "unknown argument ",sys.argv[1]
328  exit(-1)
329  fileName =sys.argv[-1]
330 
331  processingSteps,numStreams,maxNameSize = readLogFile(sys.argv[-1])
332  stalledModules = findStalledModules(processingSteps, numStreams)
333  if not doGraphic:
334  createAsciiImage(processingSteps, numStreams, maxNameSize)
335  else:
336  createPDFImage(processingSteps, numStreams, stalledModules)
337  printStalledModulesInOrder(stalledModules)
338 
boost::dynamic_bitset append(const boost::dynamic_bitset<> &bs1, const boost::dynamic_bitset<> &bs2)
this method takes two bitsets bs1 and bs2 and returns result of bs2 appended to the end of bs1 ...
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run