CMS 3D CMS Logo

TableParser.py
Go to the documentation of this file.
1 # email: cmsdoxy@cern.ch, ali.mehmet.altundag@cern.ch
2 
3 # please have a look at the namespaces.html (namespace list) and annotated.html
4 # (~class list) html files to understand the tags/attributes that we use in
5 # this script.
6 
7 from BeautifulSoup import *
8 import sys, os, copy
9 
# Module-level state shared by the functions of this script. Everything except
# tableClassName starts as None and is filled in by the script entry point.
htmlFullPath = None            # path of the input html file (first command line argument)
htmlFilePath = None            # directory part of htmlFullPath
htmlFileName = None            # file name part of htmlFullPath
fileNameTemplate = None        # html file name template
htmlPage = None                # parsed content of the input html file
tableClassName = 'directory'   # class attribute of the table whose rows are processed
16 
17 # load rows from the table in [C]lass and [N]amespace list pages and prepare
18 # pages in the following structure: pages = {'A' : [...], 'B' : [...]}
def extractPages(configFileFlag = False):
    """Group the rows of the class/namespace table by their first letter.

    Reads the module-level ``htmlPage`` and looks up the table whose class
    attribute equals ``tableClassName``.

    configFileFlag -- when True, only rows whose first link points to a url
                      containing '_cff', '_cfi' or '_cfg' are kept.

    Returns a dict of the form {'A' : [row, ...], 'B' : [row, ...]}. The
    page 'A' is always present, even when it ends up empty.
    """
    # initial page, A
    pages = {'A': []}
    # page that hidden (child) rows are attached to. It starts at the initial
    # page so that a hidden row appearing before any visible row no longer
    # fails with a NameError.
    firstLetter = 'A'
    # find all class/namespace table rows.
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        # please see the related html file (annotated.html) to understand the
        # approach here. you will see that, only hidden rows have style
        # attribute and these hidden rows must be added to pages of their
        # parents. This is why we need to check whether row has a style
        # attribute or not.
        if not row.has_key('style'):
            # change the first letter if row is not hidden (child) one. Rows
            # without a cell or with an empty first cell keep the current
            # letter instead of raising an IndexError.
            cells = row.findAll('td')
            label = cells[0].text if cells else ''
            if label:
                firstLetter = label[0].upper()
        # if pages dict doesn't have the page yet, create it. This is done
        # before filtering, so a letter gets a page even if all of its rows
        # are filtered out below.
        pageRows = pages.setdefault(firstLetter, [])
        if configFileFlag:
            # keep configuration file rows only; rows without a link cannot
            # be classified and are dropped.
            link = row.find('a')
            if link is None:
                continue
            url = link.get('href') or ''
            if not ('_cff' in url or '_cfi' in url or '_cfg' in url):
                continue
        # insert the row into the related page
        pageRows.append(row)
    return pages
45 
46 # load rows from the package documentation page. output structure:
47 # pages = {'PackageA' : [..], 'PackageB' : [...]}
def extractPagesForPackage():
    """Group the rows of the package documentation table by package name.

    Reads the module-level ``htmlPage`` and looks up the table whose class
    attribute equals ``tableClassName``.

    Returns a dict of the form {'PackageA' : [row, ...], 'PackageB' : [...]}.
    Rows that have no 'td' cell are not assigned to any page.
    """
    pages = {}
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if not cells:
            # nothing to read a name from (the original code raised an
            # IndexError here)
            continue
        # first cell contains name of the package...
        name = cells[0].text
        # parse package names --please have a look at the pages.html file.
        # The name is the text between the first blank and the first '/'.
        # str.find returns -1 when the separator is missing; using -1 as a
        # slice bound would silently cut the text in the wrong place, so fall
        # back to the beginning/end of the text instead.
        # NOTE(review): the fallback assumes such cells hold a bare name --
        # confirm against pages.html.
        start = name.find(' ')
        if start < 0:
            start = 0
        end = name.find('/')
        if end < 0:
            end = len(name)
        name = name[start:end].strip()
        # create the page if the package is not added yet
        pages.setdefault(name, []).append(row)
    return pages
61 
62 # generate alphabetic tab for html pages that will be generated by this script
def generateTab(items, curr, tabClass = 'tabs3'):
    """Return the html of the index tab for the generated pages.

    items    -- page names, one tab entry is generated for each of them in
                the given order.
    curr     -- name of the page the tab is generated for; its entry gets
                the class 'current'.
    tabClass -- class attribute of the enclosing div.

    The link target of an entry is the module-level ``fileNameTemplate``
    filled with the page name, blanks replaced by '_'.
    """
    entries = []
    for item in items:
        fn = fileNameTemplate % item.replace(' ', '_') # generate file name
        if item != curr:
            entries.append('<li><a href="%s">%s</a></li>' % (fn, item))
        else:
            entries.append('<li class="current"><a href="%s">%s</a></li>' % (fn, item))
    # join once instead of growing a string inside the loop
    tab = ''.join(entries)
    return '<div class="%s"><ul class="tablist">%s</ul></div>' % (tabClass, tab)
70 
if __name__ == "__main__":
    # usage: TableParser.py <path of the html file>
    # Splits the table of the given html file into one html file per page
    # (plus an 'All' page) and writes them next to the input file.
    if len(sys.argv) < 2:
        sys.stderr.write("not enough parameter!\n")
        sys.exit(1)

    # initialize variables
    htmlFullPath = sys.argv[1]
    htmlFilePath, htmlFileName = os.path.split(htmlFullPath)
    fileNameTemplate = htmlFileName.replace('.html', '_%s.html')

    # load the html page
    with open(htmlFullPath) as f:
        htmlPage = f.read()
    htmlPage = BeautifulSoup(htmlPage)

    # please have a look at the pages.html page. You will see that class name
    # of the related tab, which we will use to put 'index tab' by using this
    # tab, is different for pages.html file. For namespaces.html (namespace
    # list) and annotated.html (~class list) files, class names are the same
    # tabs2. this is why we are setting 'the destination tab class name' up
    # differently depending on the html file name.
    if htmlFileName == 'packageDocumentation.html':
        pages = extractPagesForPackage()
        destTabClassName = 'tabs'
    elif htmlFileName == 'configfiles.html':
        pages = extractPages(configFileFlag = True)
        destTabClassName = 'tabs2'
    else:
        pages = extractPages()
        destTabClassName = 'tabs2'

    # sorted page names; the 'All' page collects the rows of every page in
    # that order and is listed last.
    pageNames = sorted(pages)
    allRows = []
    for page in pageNames:
        allRows.extend(pages[page])
    pages['All'] = allRows
    pageNames.append('All')

    # prepare the template
    table = htmlPage.find('table', {'class' : tableClassName})
    # generate template (clean whole table content)
    for row in table.findAll('tr'):
        row.extract()

    # generate pages
    for page in pageNames:
        print('generating %s...' % (fileNameTemplate % page))
        # work on a fresh copy of the template for each page
        temp = BeautifulSoup(str(htmlPage))
        table = temp.find('table', {'class' : tableClassName})
        oldTab = temp.find('div', {'class' : destTabClassName})
        newTab = generateTab(pageNames, page)
        # keep the existing tab and put the index tab right after it
        oldTab.replaceWith(BeautifulSoup(oldTab.prettify() + str(newTab)))
        for row in pages[page]:
            table.append(row)
        # replace blank character with '_' to get the output file name.
        outFileName = fileNameTemplate % page.replace(' ', '_')
        # os.path.join keeps the output next to the input file; building the
        # path with '%s/%s' pointed to the filesystem root whenever the input
        # path had no directory part.
        with open(os.path.join(htmlFilePath, outFileName), 'w') as f:
            f.write(str(temp))
def extractPagesForPackage()
Definition: TableParser.py:48
def extractPages(configFileFlag=False)
Definition: TableParser.py:19
def generateTab(items, curr, tabClass='tabs3')
Definition: TableParser.py:63
#define str(s)