CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
edmStreamStallGrapher.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 #----------------------------------------------
4 def printHelp():
5  s = """Purpose: Convert a cmsRun log with Tracer info into a stream stall graph.
6 
7 edmStreamStallGrapher [-g] <log file name>
8 
9 Option: -g instead of ascii art, create a pdf file showing the work being done on each stream
10 
11 To Use: Add the Tracer Service to the cmsRun job you want to check for stream stalls.
12  Make sure to use the 'printTimstamps' option
13  cms.Service("Tracer", printTimestamps = cms.untracked.bool(True))
14  After running the job, execute this script and pass the name of the log file to the
15  script as the only command line argument.
16 
17 To Read: The script will then print an 'ASCII art' stall graph which consists of the name of
18  the module which either started or stopped running on a stream, and the state of each
19  stream at that moment in time and if the module just started, you will also see the
20  amount of time on that stream between the previous module finishing and this module starting.
21  The state of a stream is represented by a symbol:
22  blank (" ") the stream is currently running a module
23  line ("|") the stream is waiting to run a module
24  minus ("-") the stream has just finished waiting and is starting a module
25  plus ("+") the stream just finished running a module
26  If a module had to wait more than 0.1 seconds, the end of the line will have "STALLED".
27  Once the first 4 events have finished processing, the program prints "FINISH INIT".
28  This is useful if one wants to ignore stalled caused by startup actions, e.g. reading
29  conditions.
30 
31  Once the graph is completed, the program outputs the list of modules which had
32  the greatest total stall times. The list is sorted by total stall time and
33  written in descending order. In addition, the list of all stall times for the
34  module is given.
35 """
36  print s
37 
38 
39 #----------------------------------------------
40 def getTime(line):
41  time = line.split(" ")[1]
42  time = time.split(":")
43  time = int(time[0])*60*60+int(time[1])*60+float(time[2])
44  return time
45 
46 #Stream states
47 kStarted=0
48 kFinished=1
49 
50 #----------------------------------------------
51 def readLogFile(fileName):
52  f = open(fileName,"r")
53 
54  processingSteps = list()
55  numStreams = 0
56  maxNameSize = 0
57  startTime = 0
58  foundEventToStartFrom = False
59  for l in f:
60  if not foundEventToStartFrom:
61  if l.find("event = 5") != -1:
62  foundEventToStartFrom = True
63  stream = int( l[l.find("stream = ")+9])
64  processingSteps.append(("FINISH INIT",kFinished,stream,getTime(l)-startTime,False))
65  if l.find("processing event for module") != -1:
66  time = getTime(l)
67  if startTime == 0:
68  startTime = time
69  time = time - startTime
70  trans = kStarted
71  stream = 0
72  delayed = False
73  if l.find("finished:") != -1:
74  trans = kFinished
75  stream = int( l[l.find("stream = ")+9])
76  name = l.split("'")[1]
77  if l.find("delayed") != -1:
78  delayed = True
79  if len(name) > maxNameSize:
80  maxNameSize = len(name)
81  processingSteps.append((name,trans,stream,time,delayed))
82  if stream > numStreams:
83  numStreams = stream
84  f.close()
85  return (processingSteps,numStreams,maxNameSize)
86 
87 
88 #----------------------------------------------
89 def findStalledModules(processingSteps, numStreams):
90  streamTime = [0]*(numStreams+1)
91  stalledModules = {}
92  for n,trans,s,time,delayed in processingSteps:
93  waitTime = None
94  if delayed:
95  n = "source"
96  if trans == kStarted:
97  waitTime = time - streamTime[s]
98  else:
99  streamTime[s] = time
100  if waitTime is not None:
101  if waitTime > 0.1:
102  t = stalledModules.setdefault(n,[])
103  t.append(waitTime)
104  return stalledModules
105 
106 
107 #----------------------------------------------
108 def createAsciiImage(processingSteps, numStreams, maxNameSize):
109  #print processingSteps
110  #exit(1)
111  streamState = [1]*(numStreams+1)
112  streamTime = [0]*(numStreams+1)
113  #lastTime = 0
114  seenInit = False
115  for n,trans,s,time,delayed in processingSteps:
116  if n == "FINISH INIT":
117  seenInit = True
118  continue
119  oldState = streamState[s]
120  streamState[s]=trans
121  waitTime = None
122  if delayed:
123  n = "source"
124  if trans == kStarted:
125  waitTime = time - streamTime[s]
126  streamState[s]=2
127  else:
128  if oldState != trans:
129  streamState[s]=3
130  streamTime[s] = time
131  states = "%-*s: " % (maxNameSize,n)
132  for state in streamState:
133  if state == 0:
134  states +=" "
135  elif state == 1:
136  states +="|"
137  elif state == 2:
138  states +="-"
139  elif state == 3:
140  states +="+"
141  if waitTime is not None:
142  states += " %.2f"% waitTime
143  if waitTime > 0.1 and seenInit:
144  states += " STALLED"
145 
146  print states
147  streamState[s]=trans
148  return stalledModules
149 
150 #----------------------------------------------
151 def printStalledModulesInOrder(stalledModules):
152  priorities = list()
153  maxNameSize = 0
154  for n,t in stalledModules.iteritems():
155  nameLength = len(n)
156  if nameLength > maxNameSize:
157  maxNameSize = nameLength
158  t.sort(reverse=True)
159  priorities.append((n,sum(t),t))
160 
161  def sumSort(i,j):
162  return cmp(i[1],j[1])
163  priorities.sort(cmp=sumSort, reverse=True)
164 
165  nameColumn = "Stalled Module"
166  if len(nameColumn) > maxNameSize:
167  maxNameSize = len(nameColumn)
168 
169  stallColumn = "Tot Stall Time"
170  stallColumnLength = len(stallColumn)
171 
172  print "%-*s" % (maxNameSize, nameColumn), "%-*s"%(stallColumnLength,stallColumn), " Stall Times"
173  for n,s,t in priorities:
174  paddedName = "%-*s:" % (maxNameSize,n)
175  print paddedName, "%-*.2f"%(stallColumnLength,s), ", ".join([ "%.2f"%x for x in t])
176 
177 
178 def createPDFImage(processingSteps, numStreams, stalledModuleInfo):
179  import matplotlib.pyplot as plt
180 
181  streamStartDepth = [0]*(numStreams+1)
182  streamTime = [0]*(numStreams+1)
183  streamStartTimes = [ [] for x in xrange(numStreams+1)]
184  streamColors = [[] for x in xrange(numStreams+1)]
185 
186  stalledModuleNames = [ x for x in stalledModuleInfo.iterkeys()]
187 
188  streamStartTimes = [ [] for x in xrange(numStreams+1)]
189  streamColors = [[] for x in xrange(numStreams+1)]
190 
191  for n,trans,s,time,delayed in processingSteps:
192  if trans == kStarted:
193  streamStartDepth[s] +=1
194  streamTime[s] = time
195  else:
196  streamStartDepth[s] -=1
197  if 0 == streamStartDepth[s]:
198  streamStartTimes[s].append((streamTime[s],time-streamTime[s]))
199  c="green"
200  if delayed:
201  c="orange"
202  elif n in stalledModuleNames:
203  c="red"
204  #elif len(streamColors[s]) %2:
205  # c="blue"
206  streamColors[s].append(c)
207 
208  #consolodate contiguous blocks with the same color
209  # this drastically reduces the size of the pdf file
210  oldStreamTimes = streamStartTimes
211  oldStreamColors = streamColors
212 
213  streamStartTimes = [ [] for x in xrange(numStreams+1)]
214  streamColors = [[] for x in xrange(numStreams+1)]
215 
216  for s in xrange(numStreams+1):
217  streamStartTimes[s].append(oldStreamTimes[s][0])
218  streamColors[s].append(oldStreamColors[s][0])
219  lastStartTime,lastTimeLength = oldStreamTimes[s][0]
220  lastColor = oldStreamColors[s][0]
221  for i in xrange(1, len(oldStreamTimes[s])):
222  start,length = oldStreamTimes[s][i]
223  color = oldStreamColors[s][i]
224  #use a millisecond tolerance to avoid rounding
225  if color == lastColor and abs(lastStartTime+lastTimeLength-start)<0.001:
226  lastTimeLength += length
227  else:
228  streamStartTimes[s].append((lastStartTime,lastTimeLength))
229  streamColors[s].append(lastColor)
230  lastStartTime = start
231  lastTimeLength = length
232  lastColor = color
233  streamStartTimes[s].append((lastStartTime,lastTimeLength))
234  streamColors[s].append(lastColor)
235 
236  fig, ax = plt.subplots()
237  ax.set_xlabel("Time (sec)")
238  ax.set_ylabel("Stream ID")
239 
240  i=1
241  for s in xrange(numStreams+1):
242  t = streamStartTimes[s]
243  ax.broken_barh(t,(i-0.4,0.8),facecolors=streamColors[s],edgecolors=streamColors[s],linewidth=0)
244  i=i+1
245 
246  #add key .1, .3, .7
247  fig.text(0.1, 0.95, "modules running", color = "green", horizontalalignment = 'left')
248  fig.text(0.5, 0.95, "stalled module running", color = "red", horizontalalignment = 'center')
249  fig.text(0.9, 0.95, "read from input", color = "orange", horizontalalignment = 'right')
250  plt.savefig("stall.pdf")
251 
252 
253 
254 #=======================================
255 if __name__=="__main__":
256  import sys
257 
258  if len(sys.argv) == 1:
259  printHelp()
260  exit(0)
261 
262  doGraphic = False
263  if len(sys.argv) == 3:
264  if sys.argv[1] == '-g':
265  doGraphic = True
266  else:
267  print "unknown argument ",sys.argv[1]
268  exit(-1)
269  fileName =sys.argv[-1]
270 
271  processingSteps,numStreams,maxNameSize = readLogFile(sys.argv[-1])
272  stalledModules = findStalledModules(processingSteps, numStreams)
273  if not doGraphic:
274  createAsciiImage(processingSteps, numStreams, maxNameSize)
275  else:
276  createPDFImage(processingSteps, numStreams, stalledModules)
277  printStalledModulesInOrder(stalledModules)
278 
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run