CMS 3D CMS Logo

Page1Parser.py
Go to the documentation of this file.
1 from HTMLParser import HTMLParser
2 from urllib2 import urlopen
3 import cPickle as pickle
4 import sys
5 import re
# Pattern matched at the '<' of a start tag by check_for_whole_start_tag();
# it consumes the tag name, any attributes and trailing whitespace, leaving
# the caller to inspect the character that follows (normally '>' or '/').
# The "hack" alternative additionally accepts an unquoted value of the form
# this.src='...' (note the '.' is unescaped, so it matches any character).
locatestarttagend = re.compile(r"""
 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
 (?:\s+ # whitespace before attribute name
 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
 (?:\s*=\s* # value indicator
 (?:'[^']*' # LITA-enclosed value
 |\"[^\"]*\" # LIT-enclosed value
 |this.src='[^']*' # hack
 |[^'\">\s]+ # bare value
 )
 )?
 )
 )*
 \s* # trailing whitespace
 """, re.VERBOSE)

# Matches a tag name; parse_starttag() applies it just after the '<'.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Matches one attribute: group 1 is the name, group 2 the optional
# "= value" part, group 3 the value itself (still quoted if it was quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
26 
class Page1Parser(HTMLParser):
    """HTML table scraper built on top of the standard HTMLParser.

    Feeding a page fills ``self.table`` (one list of cell strings per
    ``<tr>``) and ``self.hyperlinks`` (one entry per ``<a>`` tag).  The
    ``Parse*`` methods then interpret whatever page was fed last and store
    their results on the instance.

    NOTE(review): the column positions used by the ``Parse*`` methods are
    taken as-is from the original code; the layout of the remote pages is
    not visible here and should be confirmed against the live pages.
    """

    def __init__(self):
        HTMLParser.__init__(self)

        # State of the table walk.
        self.InRow = 0
        self.InEntry = 0
        self.table = []
        self.tmpRow = []
        self.hyperlinks = []

        # Results filled in by the Parse* methods.
        self.RunNumber = 0
        self.TriggerRates = []
        self.Nevts = []
        self.LumiByLS = []
        self.FirstLS = -1
        self.AvLumi = []
        # These four are appended to by ParseLumiPage / ParseTrigModePage /
        # ComputeTotalPrescales, so they must exist before those run.
        self.PrescaleColumn = []
        self.L1PrescaleTable = []
        self.HLTPrescaleTable = []
        self.TotalPrescaleTable = []
        self.ColumnLumi = []
        self.L1Prescales = []
        self.RunPage = ''
        self.RatePage = ''
        self.LumiPage = ''
        self.L1Page = ''
        self.TrigModePage = ''
        self.SeedMap = []

    def parse_starttag(self, i):
        """Parse the start tag beginning at ``rawdata[i]``.

        Variant of the standard-library implementation that uses the
        module-level ``tagfind`` / ``attrfind`` patterns and skips ``<img>``
        tags entirely (no attribute parsing, no handler call).

        Returns the index just past the tag, or a negative value when the
        tag is not yet complete in the buffer.
        """
        # Written with the base-class mangled name so that the inherited
        # get_starttag_text() sees the value; a plain ``self.__starttag_text``
        # inside this subclass would be mangled to a different attribute.
        self._HTMLParser__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self._HTMLParser__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs.
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        if tag == 'img':
            return endpos

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:]
                  or attrvalue[:1] == '"' == attrvalue[-1:]):
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # Newer HTMLParser releases require the element name here,
                # older ones take no argument; support both.
                try:
                    self.set_cdata_mode(tag)
                except TypeError:
                    self.set_cdata_mode()
        return endpos

    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at ``rawdata[i]``.

        Returns -1 when more input is needed; reports malformed tags through
        ``self.error``.  Uses the module-level ``locatestarttagend`` pattern.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nxt = rawdata[j:j + 1]
            if nxt == ">":
                return j + 1
            if nxt == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nxt == "":
                # end of input
                return -1
            if nxt in ("abcdefghijklmnopqrstuvwxyz=/"
                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    def _Parse(self, url):
        """Fetch ``url`` and feed its body to the parser.

        ``self.table`` and ``self.hyperlinks`` are emptied first.  Parsing is
        best-effort: an exception raised while reading or feeding is printed
        and swallowed.  An error from ``urlopen`` itself propagates.
        """
        self.table = []
        self.hyperlinks = []
        req = urlopen(url)
        try:
            self.feed(req.read())
        except Exception as inst:
            print(inst)
        finally:
            # Release the connection whether or not parsing succeeded.
            req.close()

    def handle_starttag(self, tag, attrs):
        """Track ``<tr>``/``<td>`` state and collect ``<a>`` targets.

        For an ``<a>`` tag the value of its *first* attribute is recorded
        (presumably the href; attribute order on the source pages is not
        checked).  All other tags are ignored.
        """
        if tag == 'a':
            if attrs:
                try:
                    self.hyperlinks.append(attrs[0][1])
                except (IndexError, TypeError):
                    # Malformed attribute list: report it and carry on.
                    print(tag)
                    print(attrs)
        elif tag == 'tr':
            self.InRow = 1
        elif tag == 'td':
            self.InEntry = 1

    def handle_endtag(self, tag):
        """Close the current row on ``</tr>`` and the current cell on ``</td>``."""
        if tag == 'tr':
            if self.InRow == 1:
                self.InRow = 0
                self.table.append(self.tmpRow)
                self.tmpRow = []
        if tag == 'td':
            self.InEntry = 0

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags."""
        pass

    def handle_data(self, data):
        """Append text found inside a ``<td>`` to the row being built."""
        if self.InEntry:
            self.tmpRow.append(data)

    def ParsePage1(self):
        """Locate the run page link for the first non-empty table row.

        Entry 3 of that row must contain ``'l1_hlt_collisions'``; entry 0 is
        stored as ``self.RunNumber``.  Returns the first hyperlink containing
        ``'RUN=<RunNumber>'`` (also stored in ``self.RunPage``), or ``''`` when
        the table is empty/too short, the row is not a collisions run, or no
        matching link exists.
        """
        MostRecent = None
        for line in self.table:
            if line:  # empty rows do show up; skip them
                MostRecent = line
                break
        if MostRecent is None or len(MostRecent) < 4:
            return ''

        TriggerMode = MostRecent[3]
        if 'l1_hlt_collisions' not in TriggerMode:
            return ''

        self.RunNumber = MostRecent[0]
        wanted = 'RUN=' + self.RunNumber
        for link in self.hyperlinks:
            if link and wanted in link:
                self.RunPage = link
                return link
        return ''

    def ParseRunPage(self):
        """Pick the summary page URLs out of ``self.hyperlinks``.

        Returns ``[RatePage, LumiPage, L1Page, TrigModePage]``; each is also
        stored on the instance.  ``'../../'`` is rewritten to the
        ``http://cmswbm/`` root, and the lumi and trigger-mode links get the
        servlet prefix unless they already carry it.
        """
        servlet = "http://cmswbm/cmsdb/servlet/"
        for entry in self.hyperlinks:
            if not entry:
                continue  # <a> tag whose first attribute had no value
            entry = entry.replace('../../', 'http://cmswbm/')
            if 'HLTSummary' in entry:
                self.RatePage = entry
            if 'L1Summary' in entry:
                self.L1Page = entry
            if 'LumiSections' in entry:
                # Same rule as for TriggerMode below: never double the prefix.
                self.LumiPage = entry if entry.startswith(servlet) else servlet + entry
            if 'TriggerMode' in entry:
                if not entry.startswith(servlet):
                    entry = servlet + entry
                self.TrigModePage = entry
        return [self.RatePage, self.LumiPage, self.L1Page, self.TrigModePage]

    def ParseRunSummaryPage(self):
        """Extract per-trigger counts and rates from rows whose entry 1 starts with ``HLT_``.

        Appends ``[name, int(c3), int(c4), int(c5), c9]`` to ``self.Nevts`` and
        ``[name, rate, PS, c9]`` to ``self.TriggerRates``, where ``rate`` is
        column 6 with thousands separators removed and ``PS`` is
        ``c3 / c4`` (0 when ``c4`` is 0).  Rows too short to hold column 9 are
        skipped.
        """
        for line in self.table:
            if len(line) <= 9:  # column 9 (the L1 seed name) is read below
                continue
            if not line[1].startswith('HLT_'):
                continue
            # Format is "HLT_... (####)"; keep only the part before the space.
            TriggerName = line[1].split(' ', 1)[0]
            # float() cannot parse the "," thousands separators.
            TriggerRate = float(line[6].replace(',', ''))
            # 3-5 are the accept columns, 9 is the L1 seed name.
            self.Nevts.append([TriggerName, int(line[3]), int(line[4]),
                               int(line[5]), line[9]])
            PS = 0
            if int(line[4]) > 0:
                PS = float(line[3]) / float(line[4])
            self.TriggerRates.append([TriggerName, TriggerRate, PS, line[9]])

    def ParseLumiPage(self):
        """Read the per-lumisection table (first row skipped as header).

        For each accepted row, column 2 is appended to ``PrescaleColumn`` and
        column 4 to ``LumiByLS``.  The first row whose column 6 is positive
        sets ``FirstLS`` (from column 0) and rewrites ``RatePage`` to start at
        that lumisection.  Finally ``AvLumi`` is the mean of
        ``LumiByLS[FirstLS:]``.

        Raises ZeroDivisionError (after printing diagnostics) when that slice
        is empty.

        NOTE(review): ``FirstLS`` is a lumisection number used directly as a
        list index -- confirm this matches the page's numbering.
        """
        for line in self.table[1:]:
            if len(line) < 5 or len(line) > 12:  # need columns 0..4
                continue
            self.PrescaleColumn.append(int(line[2]))
            self.LumiByLS.append(float(line[4]))  # inst lumi is in position 4
            # The first lumiblock with a positive value in position 6 is
            # recorded as the starting point.
            if self.FirstLS == -1 and len(line) > 6 and float(line[6]) > 0:
                self.FirstLS = int(line[0])
                self.RatePage = self.RatePage.replace(
                    'HLTSummary?', 'HLTSummary?fromLS=' + line[0] + '&toLS=&')
        try:
            usedLumi = self.LumiByLS[self.FirstLS:]
            self.AvLumi = sum(usedLumi) / len(usedLumi)
        except ZeroDivisionError:
            print("Cannot calculate average lumi -- something is wrong!")
            print(self.table[:10])
            raise

    def ParseL1Page(self):
        """Debug stub: print every table row; nothing is stored."""
        for line in self.table:
            print(line)

    def ParseTrigModePage(self):
        """Read prescale-column luminosities and the L1/HLT prescale tables.

        * Rows whose entry 0 is the next expected column index contribute an
          entry to ``ColumnLumi`` (entry 2, "<mantissa>E<exp>", rescaled by
          1e-30 and rounded to one decimal).  A value not of that form stops
          further column parsing.
        * Rows whose entry 1 starts with ``L1_``/``HLT_`` contribute a
          ``[name, ps0, ps1, ...]`` row to ``L1PrescaleTable`` /
          ``HLTPrescaleTable`` and, when exactly one ``L1_`` seed is listed,
          a ``[name, seed]`` pair to ``SeedMap``.
        * Once the whole table is read, and if ``PrescaleColumn`` is filled,
          the mean prescale over ``PrescaleColumn[FirstLS:]`` is appended to
          ``L1Prescales`` once per L1 row.
        """
        ColIndex = 0  # index of the next column that we look for
        for line in self.table:
            if len(line) < 2:
                continue

            # Get the column usage.
            if line[0].isdigit() and len(line) >= 3 and int(line[0]) == ColIndex:
                ColIndex += 1
                StrLumiSplit = line[2].split('E')
                if len(StrLumiSplit) != 2:
                    ColIndex = -99999999  # no further row can match
                else:
                    lumi = float(StrLumiSplit[0])
                    lumi *= pow(10, int(StrLumiSplit[1]) - 30)
                    self.ColumnLumi.append(round(lumi, 1))

            # Get the actual prescale tables.
            isL1 = line[1].startswith('L1_')
            if not (isL1 or line[1].startswith('HLT_')):
                continue
            tmp = [line[1]]
            seedtmp = [line[1]]
            for entry in line[2:]:
                if entry.isdigit():
                    tmp.append(entry)
                if entry.startswith('L1_'):
                    seedtmp.append(entry)

            # Truncate the list (TT seeds look like prescale entries).
            del tmp[len(self.ColumnLumi) + 1:]

            if isL1:
                self.L1PrescaleTable.append(tmp)
            else:
                self.HLTPrescaleTable.append(tmp)
            if len(seedtmp) == 2:
                self.SeedMap.append(seedtmp)

        # Average the L1 prescales once, after the table is complete, so each
        # L1 row yields exactly one entry.
        if not self.PrescaleColumn:
            return
        usedColumns = self.PrescaleColumn[self.FirstLS:]
        if not usedColumns:
            return  # nothing to average over
        for L1Row in self.L1PrescaleTable:
            total = 0.0
            for prescaleThisLS in usedColumns:
                total += float(L1Row[prescaleThisLS + 1])
            self.L1Prescales.append([L1Row[0], total / len(usedColumns)])

    def ComputeTotalPrescales(self):
        """Combine HLT and L1 prescales into ``TotalPrescaleTable``.

        Each appended row is ``[hltName, l1Name, total0, total1, ...]`` with
        ``total = hltPS * l1PS``.  Special values: ``-3`` when no seed is
        known for the HLT path, ``-4`` when the seed has no prescale row.  A
        seed expression containing ``' OR '`` is not parsed; its L1 prescale
        is taken as 1.  Does nothing unless the L1 table, the HLT table and
        the seed map are all non-empty.
        """
        if not (self.L1PrescaleTable and self.HLTPrescaleTable and self.SeedMap):
            return

        for hltLine in self.HLTPrescaleTable:
            hltName = hltLine[0]
            nColumns = len(hltLine) - 1

            # Figure out the L1 seed (first match wins).
            l1Name = ""
            for hlt, l1 in self.SeedMap:
                if hltName == hlt:
                    l1Name = l1
                    break

            if l1Name == "":
                # Couldn't figure out the L1 seed (error -3).
                self.TotalPrescaleTable.append([hltName, l1Name] + [-3] * nColumns)
                continue

            # Get the L1 prescales.
            l1Line = []
            if ' OR ' in l1Name:
                # ORs are not parsed for the moment: L1 prescale set to 1.
                l1Line = [l1Name] + [1] * nColumns
            else:
                for thisl1Line in self.L1PrescaleTable:
                    if thisl1Line[0] == l1Name:
                        l1Line = thisl1Line
                        break

            if len(l1Line) == 0:
                # Found the L1 name, but no prescale info for it (error -4).
                totalLine = [hltName, l1Name] + [-4] * nColumns
            else:
                totalLine = [hltName, l1Name]
                for hltPS, l1PS in zip(hltLine[1:], l1Line[1:]):
                    try:
                        totalLine.append(int(hltPS) * int(l1PS))
                    except Exception:
                        # Show the offending pair, then let the error surface.
                        print(hltPS)
                        print(l1PS)
                        raise
            self.TotalPrescaleTable.append(totalLine)

    def Save(self, fileName):
        """Pickle this parser to ``fileName``."""
        with open(fileName, 'wb') as out:
            pickle.dump(self, out)

    def Load(self, fileName):
        """Replace this parser's attributes with those pickled in ``fileName``.

        WARNING: unpickling executes code chosen by the file's author; only
        load files you created yourself.
        """
        with open(fileName, 'rb') as src:
            loaded = pickle.load(src)
        # Rebinding ``self`` would only change the local name, so copy the
        # stored attributes onto this instance instead.
        self.__dict__.update(loaded.__dict__)

    def ComputePU(self, nBunches):
        """Return ``LumiByLS`` scaled by ``71e-27 / 11.2e3 / nBunches``.

        NOTE(review): the constants look like a cross-section and a
        revolution frequency, which would make the result a pile-up estimate
        per lumisection -- units are not stated in the code; confirm.
        """
        ScaleFactor = 71e-27 / 11.2e3 / nBunches
        return [lumi * ScaleFactor for lumi in self.LumiByLS]
def Load(self, fileName)
Definition: Page1Parser.py:349
def ParseRunSummaryPage(self)
Definition: Page1Parser.py:220
def replace(string, replacements)
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:20
def handle_data(self, data)
Definition: Page1Parser.py:181
def handle_endtag(self, tag)
Definition: Page1Parser.py:169
def _Parse(self, url)
Definition: Page1Parser.py:143
def Save(self, fileName)
Definition: Page1Parser.py:346
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def parse_starttag(self, i)
Definition: Page1Parser.py:57
def handle_startendtag(self, tag, attrs)
Definition: Page1Parser.py:178
def ComputePU(nBunches)
Definition: Page1Parser.py:352
def handle_starttag(self, tag, attrs)
Definition: Page1Parser.py:152
def ComputeTotalPrescales(self)
Definition: Page1Parser.py:306
def check_for_whole_start_tag(self, i)
Definition: Page1Parser.py:109
double split
Definition: MVATrainer.cc:139
Power< A, B >::type pow(const A &a, const B &b)
Definition: Power.h:40