CMS 3D CMS Logo

Page1Parser.py
Go to the documentation of this file.
from __future__ import print_function

import cPickle as pickle
import re
import sys
from HTMLParser import HTMLParser
from urllib2 import urlopen
# Matches one complete start tag, from "<" up to (but not including) the
# closing ">" or "/>".  Whitespace and "#" comments inside the pattern are
# ignored because it is compiled with re.VERBOSE.
# NOTE(review): this looks like a local copy of the pattern of the same name
# in the standard HTMLParser module, extended with the "hack" alternative
# below -- confirm against the interpreter version in use.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |this.src='[^']*'          # hack
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Matches a tag name at the current position.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')

# Matches one attribute: group 1 is the name, group 2 the optional "=value"
# part (including the "="), group 3 the value with its quotes still attached.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
27 
class Page1Parser(HTMLParser):
    """Collect HTML table cells and hyperlinks, then interpret them.

    ``_Parse(url)`` downloads a page and fills ``self.table`` (a list of
    rows, each a list of the text chunks found inside ``<td>`` elements) and
    ``self.hyperlinks`` (one entry per ``<a>`` tag).  The ``Parse*`` methods
    then read those two lists and store their results on the instance; each
    of them expects ``_Parse`` to have been called on the matching page
    first.

    The parser is written against the Python 2 ``HTMLParser`` API (it uses
    ``self.error`` and ``self.unescape``).
    """

    # Prefix put in front of page names that are linked without a host.
    _SERVLET_BASE = "http://cmswbm/cmsdb/servlet/"

    def __init__(self):
        HTMLParser.__init__(self)

        # State of the table scan performed by the handle_* callbacks.
        self.InRow = 0
        self.InEntry = 0
        self.table = []
        self.tmpRow = []
        self.hyperlinks = []

        # Results filled by the Parse*/Compute* methods.
        self.RunNumber = 0
        self.TriggerRates = []
        self.Nevts = []
        self.LumiByLS = []
        self.FirstLS = -1
        self.AvLumi = []
        # These four lists are appended to by the methods below; they have to
        # exist before any of those methods runs.
        self.PrescaleColumn = []
        self.L1PrescaleTable = []
        self.HLTPrescaleTable = []
        self.TotalPrescaleTable = []
        self.ColumnLumi = []
        self.L1Prescales = []

        # URLs discovered while walking from page to page.
        self.RunPage = ''
        self.RatePage = ''
        self.LumiPage = ''
        self.L1Page = ''
        self.TrigModePage = ''
        self.SeedMap = []

    def parse_starttag(self, i):
        """Parse the start tag beginning at ``self.rawdata[i]``.

        Dispatches to ``handle_starttag`` / ``handle_startendtag`` and
        returns the index just past the tag, or a negative value when the
        tag is not complete yet.  ``<img>`` tags are skipped without parsing
        their attributes.
        """
        # NOTE(review): inside this class the name is mangled to
        # _Page1Parser__starttag_text, so the base class's
        # get_starttag_text() will not see this value -- confirm nobody
        # relies on it.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs.
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        # Image tags are ignored entirely; their attributes are not parsed.
        if tag == 'img':
            return endpos

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Only quoted values are unquoted and unescaped.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # set_cdata_mode() gained a mandatory element argument in
                # Python 2.7.3; fall back to the older no-argument form.
                try:
                    self.set_cdata_mode(tag)
                except TypeError:
                    self.set_cdata_mode()
        return endpos

    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at ``rawdata[i]``.

        Returns -1 when the buffer ends before the tag is complete, and
        reports malformed tags through ``self.error``.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            following = rawdata[j:j + 1]
            if following == ">":
                return j + 1
            if following == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if following == "":
                # end of input
                return -1
            if following in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    def _Parse(self, url):
        """Download ``url`` and fill ``self.table`` and ``self.hyperlinks``.

        Parsing is best effort: an exception raised while feeding the page
        is printed and swallowed, leaving whatever was collected so far.
        Errors from ``urlopen`` itself propagate to the caller.
        """
        # Drop any input left over from a previous page.  Without this, a
        # page whose parse aborted would be parsed again (and abort again)
        # in front of the new one.
        self.reset()
        self.InRow = 0
        self.InEntry = 0
        self.tmpRow = []
        self.table = []
        self.hyperlinks = []
        req = urlopen(url)
        try:
            self.feed(req.read())
        except Exception as inst:
            print(inst)
        finally:
            req.close()

    def handle_starttag(self, tag, attrs):
        """Track ``<tr>``/``<td>`` nesting and record ``<a>`` targets.

        For an ``<a>`` tag the ``href`` value is stored; when there is no
        ``href`` the value of the first attribute is used instead.
        Attributes without a value are not stored.
        """
        if tag == 'a':
            if attrs:
                target = dict(attrs).get('href')
                if target is None:
                    target = attrs[0][1]
                if target is not None:
                    self.hyperlinks.append(target)
        elif tag == 'tr':
            self.InRow = 1
        elif tag == 'td':
            self.InEntry = 1

    def handle_endtag(self, tag):
        """Close the current row or cell."""
        if tag == 'tr':
            if self.InRow == 1:
                self.InRow = 0
                self.table.append(self.tmpRow)
                self.tmpRow = []
        if tag == 'td':
            self.InEntry = 0

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags."""
        pass

    def handle_data(self, data):
        """Append text found inside a ``<td>`` to the row being built."""
        if self.InEntry:
            self.tmpRow.append(data)

    def ParsePage1(self):
        """Find the link to the most recent run's page.

        Uses the first non-empty row of ``self.table``: column 0 is the run
        number and column 3 the trigger mode.  Returns the first hyperlink
        containing ``RUN=<run number>`` and stores it in ``self.RunPage``.
        Returns ``''`` when the table has no usable row, when the trigger
        mode does not contain ``l1_hlt_collisions``, or when no hyperlink
        matches.
        """
        MostRecent = None
        for line in self.table:
            if line:  # empty rows do show up; skip them
                MostRecent = line
                break
        if MostRecent is None or len(MostRecent) < 4:
            return ''

        TriggerMode = MostRecent[3]
        if 'l1_hlt_collisions' not in TriggerMode:
            return ''

        self.RunNumber = MostRecent[0]
        runKey = 'RUN=' + self.RunNumber
        for link in self.hyperlinks:
            if runKey in link:
                self.RunPage = link
                return link
        return ''

    @classmethod
    def _ServletUrl(cls, entry):
        """Return ``entry`` prefixed with the servlet base unless present."""
        if entry.startswith(cls._SERVLET_BASE):
            return entry
        return cls._SERVLET_BASE + entry

    def ParseRunPage(self):
        """Pick the rate, lumi, L1 and trigger-mode page URLs from the links.

        Returns ``[RatePage, LumiPage, L1Page, TrigModePage]``; entries that
        were not found keep their previous value.
        """
        for entry in self.hyperlinks:
            entry = entry.replace('../../', 'http://cmswbm/')
            if 'HLTSummary' in entry:
                self.RatePage = entry
            if 'L1Summary' in entry:
                self.L1Page = entry
            if 'LumiSections' in entry:
                self.LumiPage = self._ServletUrl(entry)
            if 'TriggerMode' in entry:
                self.TrigModePage = self._ServletUrl(entry)
        return [self.RatePage, self.LumiPage, self.L1Page, self.TrigModePage]

    @staticmethod
    def _ToInt(text):
        """Convert a counter such as ``'1,234'`` to an int."""
        return int(text.replace(',', ''))

    def ParseRunSummaryPage(self):
        """Extract per-trigger counts and rates from the HLT summary table.

        For every row whose column 1 starts with ``HLT_`` this appends
        ``[name, col3, col4, col5, seed]`` to ``self.Nevts`` and
        ``[name, rate, prescale, seed]`` to ``self.TriggerRates``, where the
        prescale is column 3 divided by column 4 (0 when column 4 is 0).
        """
        for line in self.table:
            # Column 9 (the L1 seed name) is the last one read.
            if len(line) < 10:
                continue
            if not line[1].startswith('HLT_'):
                continue
            # Format is "HLT_... (####)"; keep only the part before the
            # first space (the whole string when there is no space).
            TriggerName = line[1].split(' ', 1)[0]
            # Remove the ","s, since float() can't parse them.
            TriggerRate = float(line[6].replace(',', ''))
            # Columns 3-5 are the accept columns.
            accepts = [self._ToInt(line[col]) for col in (3, 4, 5)]
            seed = line[9]
            self.Nevts.append([TriggerName] + accepts + [seed])
            PS = 0
            if accepts[1] > 0:
                PS = float(accepts[0]) / float(accepts[1])
            self.TriggerRates.append([TriggerName, TriggerRate, PS, seed])

    def ParseLumiPage(self):
        """Read per-lumi-section prescale column and luminosity.

        Fills ``self.PrescaleColumn`` (column 2) and ``self.LumiByLS``
        (column 4), records in ``self.FirstLS`` the first section whose
        column 6 is positive, rewrites ``self.RatePage`` to start at that
        section, and stores the mean of ``LumiByLS[FirstLS:]`` in
        ``self.AvLumi``.

        Raises ZeroDivisionError (after printing the first table rows) when
        there is nothing to average.
        """
        for line in self.table[1:]:
            # Column 4 is read from every accepted row.
            if len(line) < 5 or len(line) > 12:
                continue
            self.PrescaleColumn.append(int(line[2]))
            self.LumiByLS.append(float(line[4]))  # inst lumi is in column 4
            # The first lumi block with a positive value in column 6 is
            # recorded.  (An earlier comment called this "live lumi in
            # position 5"; the code reads index 6.)
            if (self.FirstLS == -1 and len(line) > 6
                    and float(line[6]) > 0):
                self.FirstLS = int(line[0])
                self.RatePage = self.RatePage.replace(
                    'HLTSummary?',
                    'HLTSummary?fromLS=' + line[0] + '&toLS=&')
        # NOTE(review): FirstLS is a lumi-section number used here as a list
        # index, and -1 (nothing found) selects the last entry only --
        # confirm both are intended.
        selected = self.LumiByLS[self.FirstLS:]
        try:
            self.AvLumi = sum(selected) / len(selected)
        except ZeroDivisionError:
            print("Cannot calculate average lumi -- something is wrong!")
            print(self.table[:10])
            raise

    def ParseL1Page(self):
        """Print every row of the table; no L1 data is extracted yet."""
        for line in self.table:
            print(line)

    def ParseTrigModePage(self):
        """Read the prescale-column luminosities and the prescale tables.

        Fills ``self.ColumnLumi``, ``self.L1PrescaleTable``,
        ``self.HLTPrescaleTable`` and ``self.SeedMap`` from the table, then
        computes ``self.L1Prescales`` as the average prescale of each L1 row
        over the lumi sections in ``self.PrescaleColumn[self.FirstLS:]``.
        """
        ColIndex = 0  # index of the next prescale column we look for
        for line in self.table:
            if len(line) < 2:
                continue

            # Column usage: rows "<index> ... <lumi as x.yEnn>".
            if line[0].isdigit() and len(line) >= 3:
                if int(line[0]) == ColIndex:
                    ColIndex += 1
                    StrLumiSplit = line[2].split('E')
                    if len(StrLumiSplit) != 2:
                        # Not a luminosity: stop matching column rows.
                        ColIndex = -99999999
                    else:
                        lumi = float(StrLumiSplit[0])
                        lumi *= pow(10, int(StrLumiSplit[1]) - 30)
                        self.ColumnLumi.append(round(lumi, 1))

            # The actual prescale tables.
            isL1 = line[1].startswith('L1_')
            if isL1 or line[1].startswith('HLT_'):
                tmp = [line[1]]
                seedtmp = [line[1]]
                for entry in line[2:]:
                    if entry.isdigit():
                        tmp.append(entry)
                    if entry.startswith('L1_'):
                        seedtmp.append(entry)

                # Truncate the list (TT seeds look like prescale entries).
                del tmp[len(self.ColumnLumi) + 1:]

                if isL1:
                    self.L1PrescaleTable.append(tmp)
                else:
                    self.HLTPrescaleTable.append(tmp)
                    # NOTE(review): the seed map is recorded for HLT rows
                    # only; the flattened listing does not show whether L1
                    # rows were meant to be included -- confirm.
                    if len(seedtmp) == 2:
                        self.SeedMap.append(seedtmp)

        # NOTE(review): the averaging is done once, after the whole table has
        # been read.  The listing this was recovered from had lost its
        # indentation; running it per table row would append the same
        # averages repeatedly.
        self._AverageL1Prescales()

    def _AverageL1Prescales(self):
        """Append ``[name, average prescale]`` per L1 row to L1Prescales."""
        columns = self.PrescaleColumn[self.FirstLS:]
        if not columns:
            # No lumi sections to average over.
            return
        nLS = len(columns)
        for L1Row in self.L1PrescaleTable:
            total = 0.0
            for prescaleThisLS in columns:
                # Entry 0 of a row is the name, so column c is at c + 1.
                total += float(L1Row[prescaleThisLS + 1])
            self.L1Prescales.append([L1Row[0], total / nLS])

    def ComputeTotalPrescales(self):
        """Combine HLT and L1 prescales into ``self.TotalPrescaleTable``.

        Each appended row is ``[hlt name, l1 seed, total per column...]``.
        Error codes replace the totals when they cannot be computed: -3 when
        the L1 seed of the path is unknown, -4 when the seed has no row in
        the L1 prescale table.  Seeds containing `` OR `` are not parsed and
        are treated as having an L1 prescale of 1.
        """
        if not (self.L1PrescaleTable and self.HLTPrescaleTable
                and self.SeedMap):
            return

        # The first occurrence of a name wins, as in a linear search.
        seedOf = {}
        for hlt, l1 in self.SeedMap:
            seedOf.setdefault(hlt, l1)
        l1RowOf = {}
        for row in self.L1PrescaleTable:
            l1RowOf.setdefault(row[0], row)

        for hltLine in self.HLTPrescaleTable:
            hltName = hltLine[0]
            nColumns = len(hltLine) - 1
            l1Name = seedOf.get(hltName, "")

            if l1Name == "":
                # Couldn't figure out the L1 seed (error -3).
                totalLine = [hltName, l1Name] + [-3] * nColumns
            else:
                if ' OR ' in l1Name:
                    # ORs are not parsed; for now the L1 prescale is 1.
                    l1Line = [l1Name] + [1] * nColumns
                else:
                    l1Line = l1RowOf.get(l1Name, [])

                if len(l1Line) == 0:
                    # L1 name known, but no prescale info (error -4).
                    totalLine = [hltName, l1Name] + [-4] * nColumns
                else:
                    totalLine = [hltName, l1Name]
                    for hltPS, l1PS in zip(hltLine[1:], l1Line[1:]):
                        try:
                            totalLine.append(int(hltPS) * int(l1PS))
                        except (TypeError, ValueError):
                            # Show the offending pair before failing.
                            print(hltPS)
                            print(l1PS)
                            raise
            self.TotalPrescaleTable.append(totalLine)

    def Save(self, fileName):
        """Pickle this parser, including all collected data, to a file."""
        with open(fileName, 'wb') as out:
            pickle.dump(self, out)

    def Load(self, fileName):
        """Replace this parser's state with the one pickled in a file.

        SECURITY: unpickling executes code chosen by whoever wrote the file;
        only load files produced by ``Save`` from a trusted source.
        """
        with open(fileName, 'rb') as src:
            loaded = pickle.load(src)
        # Rebinding ``self`` would only change a local name, so copy the
        # loaded attributes onto this instance instead.
        self.__dict__.update(loaded.__dict__)

    def ComputePU(self, nBunches):
        """Return ``self.LumiByLS`` scaled by ``71e-27 / 11.2e3 / nBunches``.

        NOTE(review): the constants look like an inelastic cross section and
        the LHC revolution frequency, which would make the result a pile-up
        estimate per lumi section -- confirm units with the author.
        """
        ScaleFactor = 71e-27 / 11.2e3 / nBunches
        return [lumi * ScaleFactor for lumi in self.LumiByLS]
359 
def Load(self, fileName)
Definition: Page1Parser.py:350
def ParseRunSummaryPage(self)
Definition: Page1Parser.py:221
def replace(string, replacements)
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:66
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:20
def handle_data(self, data)
Definition: Page1Parser.py:182
def handle_endtag(self, tag)
Definition: Page1Parser.py:170
def _Parse(self, url)
Definition: Page1Parser.py:144
def Save(self, fileName)
Definition: Page1Parser.py:347
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def parse_starttag(self, i)
Definition: Page1Parser.py:58
def handle_startendtag(self, tag, attrs)
Definition: Page1Parser.py:179
def ComputePU(nBunches)
Definition: Page1Parser.py:353
def handle_starttag(self, tag, attrs)
Definition: Page1Parser.py:153
def ComputeTotalPrescales(self)
Definition: Page1Parser.py:307
def check_for_whole_start_tag(self, i)
Definition: Page1Parser.py:110
double split
Definition: MVATrainer.cc:139
Power< A, B >::type pow(const A &a, const B &b)
Definition: Power.h:40