CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
fetchall_from_DQM_v2.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 '''
3 Script fetches files matching specified RegExps from DQM GUI.
4 
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
7 '''
8 ################################################################################
9 # Change logs:
10 # 2012-10-22 11:31 - Also download files smaller than 1MB (like GEN samples).
11 # 2012-07-09 16:10 - BugFix: RELEASE has to be in selected file names.
12 # 2012-07-09 16:10 - Added How-To examples and command line option
13 # explanations for -h option.
14 # 2012-07-08 17:23 - Added file download in bunches.
15 # 2012-07-06 14:30 - Added multiprocessing for file download:
16 # http://docs.python.org/library/multiprocessing
17 # 2012-07-06 14:09 - Added new commandline options implementation.
18 # 2012-07-06 09:48 - Fixed a small bug in the ``--data`` commandline option. Now it
19 # no longer requires a value to be specified.
20 ################################################################################
21 
22 import re
23 import sys
24 import os
25 
26 from multiprocessing import Pool, Queue, Process
27 from Queue import Empty
28 from os.path import basename, isfile
29 from optparse import OptionParser
30 from urllib2 import build_opener, Request
31 
32 try:
33  from Utilities.RelMon.authentication import X509CertOpen
34 except ImportError:
35  from authentication import X509CertOpen
36 
37 
def auth_wget(url, chunk_size=1048576):
    """Fetch ``url`` through an opener built around ``X509CertOpen``.

    Behaviour depends on the URL and on the reported response size:

    * URL with an empty last path component (e.g. a directory URL ending
      in ``/``): the whole body is read and returned, nothing is written
      to disk.
    * Response smaller than 1MB: the body is read into memory, written to
      a local file named after the last URL path component, and returned.
    * Anything else (1MB or more, or no ``Content-Length`` header): the
      body is streamed to the local file in pieces of ``chunk_size`` bytes
      and ``None`` is returned. If the file already exists on disk, the
      download is skipped.

    Progress lines are prefixed with the 1-based position of the file in
    the module-level ``selected_files`` list when that list exists and
    contains the file name; otherwise the file name itself is used.

    :param url: address of the file or directory listing to fetch.
    :param chunk_size: number of bytes read per iteration when streaming.
    :returns: the response body for directory URLs and small files,
        ``None`` otherwise.
    """
    small_file_limit = 1048576  # 1MB
    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        filename = basename(url)
        if filename == '':
            # Directory listing: there is no file name to save under, so
            # always hand the content back, whatever its size.
            return url_file.read()

        content_length = url_file.headers.get("Content-Length")
        if content_length is not None and int(content_length) < small_file_limit:
            content = url_file.read()
            with open(filename, 'wb') as outfile:
                outfile.write(content)
            return content

        # ``selected_files`` is only defined by the script part of this
        # module; fall back to the file name when it is not available.
        known_files = globals().get('selected_files') or []
        if filename in known_files:
            label = '%d' % (known_files.index(filename) + 1)
        else:
            label = filename

        if isfile("./%s" % filename):
            print('%s. Exsits on disk. Skipping.' % label)
            return None

        print('%s. Downloading...' % label)
        # Stream into a temporary name first, so that an interrupted
        # download is not mistaken for a complete file on the next run.
        partial_name = filename + '.part'
        with open(partial_name, 'wb') as outfile:
            chunk = url_file.read(chunk_size)
            while chunk:
                outfile.write(chunk)
                chunk = url_file.read(chunk_size)
        os.rename(partial_name, filename)
        print('%s. Done.' % label)
        return None
    finally:
        url_file.close()
71 
72 
## Define options
parser = OptionParser(usage='usage: %prog [options]')
parser.add_option('-d', '--data', action='store_true', dest='is_from_data',
                  help='Fetch data relvals.')
parser.add_option('-m', '--mc', action='store_false', dest='is_from_data',
                  help='Fetch Monte Carlo relvals.')
parser.add_option('-r', '--release', action='store', dest='release',
                  help='Release to fetch from. RELEASE format "CMSSW_x_x_x", e.g. CMSSW_5_3_2.')
parser.add_option('-e', '--re', '--regexp', action='store', dest='regexp', default='',
                  help='Comma separated regular expresions for file names. e.g. to fetch '+
                  'files, which names contain "cos" or "jet" and does not contain "2010", use: '+
                  '"cos,jet,^((?!2010).)*$".')
parser.add_option('--mthreads', action='store', default='3', dest='mthreads',
                  help='Number of threads for file download. Default is 3.')
parser.add_option('--dry', action='store_true', default=False, dest='dry_run',
                  help='Show files matched by regular expresion, but do not download them.')

## Parse sys.argv
(options, args) = parser.parse_args()

## Check for option errors
# Validation runs before any string clean-up: ``options.release`` is None
# when --release is omitted, and calling .strip() on it would raise
# AttributeError instead of showing the usage error below.
if options.is_from_data is None:
    parser.error('You have to specify the directory, use --mc for "RelVal" or ' +
                 '--data for "RelValData"')
elif options.release is None:
    parser.error('You have to specify the CMSSW release, use --release option. ' +
                 'E.g. --release CMSSW_5_3_2')
elif not options.mthreads.isdigit():
    parser.error('Bad --mthreads argument format. It has to be integer. E.g. ' +
                 '--mthreads 3')

## Remove quotes and '=' characters left around the option values
options.release = options.release.strip('"\'=')
options.regexp = options.regexp.strip('"\'=')
104 
## Use options
# Data relvals are stored under "RelValData", Monte Carlo ones under "RelVal".
relvaldir = "RelValData" if options.is_from_data else "RelVal"

# Keep only the "CMSSW_<a>_<b>_" prefix of the release; the remote directory
# is named with a trailing "x" (e.g. CMSSW_5_3_2 -> CMSSW_5_3_x).
release = re.findall(r'(CMSSW_\d*_\d*_)\d*(?:_[\w\d]*)?', options.release)
if not release:
    parser.error('No such CMSSW release found. Please check the ``--release`` commandline option value.')
releasedir = release[0] + "x"

base_url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/'
filedir_url = base_url + relvaldir + '/' + releasedir + '/'
filedir_html = auth_wget(filedir_url)

#auth_wget("https://cmsweb.cern.ch/dqm/offline/data/browse/ROOT/OfflineData/Run2012/JetHT/0002029xx/DQM_V0001_R000202950__JetHT__Run2012C-PromptReco-v2__DQM.root")
#auth_wget("https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/CMSSW_5_3_x/DQM_V0001_R000205921__JetHT__CMSSW_5_3_3_patch1-PR_newconditions_RelVal_R205921_121105-v2__DQM.root")

# Extract the link texts from the directory listing. The first match is
# dropped (presumably the parent-directory link -- verify against the page).
file_list_re = re.compile(r"<a href='[-./\w]*'>([-./\w]*)<")
all_files = file_list_re.findall(filedir_html)[1:]  # list of file names

# The number of download processes is limited to the range 1..3; any value
# outside of it falls back to 3.
options.mthreads = int(options.mthreads)
if options.mthreads > 3 or options.mthreads < 1:
    options.mthreads = 3

### Fetch the files, using multi-processing
# A file is selected only when every user-supplied expression AND the
# release name are found in its name.
try:
    file_res = [re.compile(r) for r in options.regexp.split(',') + [options.release]]
except re.error as err:
    parser.error('Bad --regexp argument: %s' % err)
selected_files = [f for f in all_files if all(r.search(f) for r in file_res)]

print('Downloading files:')
for i, name in enumerate(selected_files):
    print('%d. %s' % (i + 1, name))

if not options.dry_run:
    print('\nProgress:')
    pool = Pool(options.mthreads)
    try:
        pool.map(auth_wget, [filedir_url + name for name in selected_files])
    finally:
        # Release the worker processes even when a download fails.
        pool.close()
        pool.join()