CMS 3D CMS Logo

fetchall_from_DQM_v2.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 '''
3 Script fetches files matching specified RegExps from DQM GUI.
4 
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
7 '''
8 from __future__ import print_function
9 ################################################################################
10 # Change logs:
11 # 2012-10-22 11:31 - Checking to Download also files <1MB (like GEN samples)
12 # 2012-07-09 16:10 - BugFix: RELEASE has to be in selected file names.
13 # 2012-07-09 16:10 - Added How-To examples and command line option
14 # explanations for -h option.
15 # 2012-07-08 17:23 - Added file download in bunches.
16 # 2012-07-06 14:30 - Added multiprocessing for file download:
17 # http://docs.python.org/library/multiprocessing
18 # 2012-07-06 14:09 - Added new commandline options implementation.
19 # 2012-07-06 09:48 - Fixed a small bug in the ``--data`` commandline option. Now it
20 # does not require specifying its value.
21 ################################################################################
22 
23 import re
24 import sys
25 import os
26 
27 from multiprocessing import Pool, Queue, Process
28 from Queue import Empty
29 from os.path import basename, isfile
30 from optparse import OptionParser
31 from urllib2 import build_opener, Request
32 
33 try:
34  from Utilities.RelMon.authentication import X509CertOpen
35 except ImportError:
36  from authentication import X509CertOpen
37 
38 
def auth_wget(url, chunk_size=1048576):
    """Fetch ``url`` through an opener built with ``X509CertOpen``.

    Behaviour depends on the ``Content-Length`` reported by the server and on
    the last path component of ``url``:

    * Responses smaller than 1MB, and URLs whose basename is empty (e.g. a
      directory listing ending in ``/``), are read fully into memory and
      returned. If the basename is not empty the content is also written to
      a file of that name in the current directory.
    * Anything else is streamed to a file named after the URL basename in
      blocks of ``chunk_size`` bytes, unless that file already exists, in
      which case the download is skipped. Progress messages are numbered by
      the position of the file name in the module-level ``selected_files``
      list, which therefore has to be defined before such a download.

    :param url: address of the resource to fetch.
    :param chunk_size: number of bytes read per iteration when streaming.
    :returns: the response body for small/directory responses, else ``None``.
    """
    small_file_limit = 1048576  # 1MB: below this the body is read in one go

    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        size = int(url_file.headers["Content-Length"])
        filename = basename(url)

        # An empty basename means there is nothing to save to disk and no
        # entry to look up in ``selected_files``, so the content is always
        # returned to the caller, whatever its size.
        if size < small_file_limit or not filename:
            readed = url_file.read()
            if filename:
                with open(filename, 'wb') as outfile:
                    outfile.write(readed)
            return readed

        file_id = selected_files.index(filename)

        if isfile("./%s" % filename):
            print('%d. Exsits on disk. Skipping.' % (file_id + 1))
            return

        print('%d. Downloading...' % (file_id + 1))
        with open(filename, 'wb') as outfile:
            chunk = url_file.read(chunk_size)
            while chunk:
                outfile.write(chunk)
                chunk = url_file.read(chunk_size)
        print('%d. Done.' % (file_id + 1))
    finally:
        # Release the connection on every path, including errors.
        url_file.close()
72 
73 
## Define options
parser = OptionParser(usage='usage: %prog [options]')
parser.add_option('-d', '--data', action='store_true', dest='is_from_data',
                  help='Fetch data relvals.')
parser.add_option('-m', '--mc', action='store_false', dest='is_from_data',
                  help='Fetch Monte Carlo relvals.')
parser.add_option('-r', '--release', action='store', dest='release',
                  help='Release to fetch from. RELEASE format "CMSSW_x_x_x", e.g. CMSSW_5_3_2.')
parser.add_option('-e', '--re', '--regexp', action='store', dest='regexp', default='',
                  help='Comma separated regular expresions for file names. e.g. to fetch '+
                  'files, which names contain "cos" or "jet" and does not contain "2010", use: '+
                  '"cos,jet,^((?!2010).)*$".')
parser.add_option('--mthreads', action='store', default='3', dest='mthreads',
                  help='Number of threads for file download. Default is 3.')
parser.add_option('--dry', action='store_true', default=False, dest='dry_run',
                  help='Show files matched by regular expresion, but do not download them.')
## Parse sys.argv
(options, args) = parser.parse_args()
# --release has no default, so it is None when the option is omitted. Only
# strip stray quote/'=' characters when a value was given; otherwise leave it
# as None so that option validation can report a usage error instead of this
# line failing with an AttributeError.
if options.release is not None:
    options.release = options.release.strip('"\'=')
options.regexp = options.regexp.strip('"\'=')
94 
## Check for option errors
# parser.error() prints the usage message and exits, so only the first
# failing check below is ever reported.

# Neither --data nor --mc was given.
if options.is_from_data is None:
    parser.error('You have to specify the directory, use --mc for "RelVal" or '
                 '--data for "RelValData"')

# No --release value is available.
if options.release is None:
    parser.error('You have to specify the CMSSW release, use --release option. '
                 'E.g. --release CMSSW_5_3_2')

# --mthreads must consist of digits only.
if not options.mthreads.isdigit():
    parser.error('Bad --mthreads argument format. It has to be integer. E.g. '
                 '--mthreads 3')
105 
## Use options
relvaldir = "RelVal"
if options.is_from_data:
    relvaldir = "RelValData"

# Keep only the "CMSSW_<major>_<minor>_" prefix of the requested release; the
# remote directory name is that prefix followed by "x" (e.g. CMSSW_5_3_x).
release = re.findall(r'(CMSSW_\d*_\d*_)\d*(?:_[\w\d]*)?', options.release)
if not release:
    parser.error('No such CMSSW release found. Please check the ``--release`` commandline option value.')
releasedir = release[0] + "x"

base_url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/'
filedir_url = base_url + relvaldir + '/' + releasedir + '/'
filedir_html = auth_wget(filedir_url)

#auth_wget("https://cmsweb.cern.ch/dqm/offline/data/browse/ROOT/OfflineData/Run2012/JetHT/0002029xx/DQM_V0001_R000202950__JetHT__Run2012C-PromptReco-v2__DQM.root")
#auth_wget("https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/CMSSW_5_3_x/DQM_V0001_R000205921__JetHT__CMSSW_5_3_3_patch1-PR_newconditions_RelVal_R205921_121105-v2__DQM.root")

# Collect the link texts of the directory listing. The first match is dropped
# (presumably the parent-directory link -- confirm against the listing HTML).
file_list_re = re.compile(r"<a href='[-./\w]*'>([-./\w]*)<")
all_files = file_list_re.findall(filedir_html)[1:]  # list of file names

# Any worker count outside 1..3 silently falls back to 3.
options.mthreads = int(options.mthreads)
if options.mthreads > 3 or options.mthreads < 1:
    options.mthreads = 3

### Fetch the files, using multi-processing
# A file is selected only if every user supplied pattern AND the release
# string itself are found in its name.
file_res = [re.compile(r) for r in options.regexp.split(',') + [options.release]]
selected_files = [f for f in all_files if all(r.search(f) for r in file_res)]

print('Downloading files:')
for i, name in enumerate(selected_files):
    print('%d. %s' % (i+1, name))

if not options.dry_run:
    print('\nProgress:')
    pool = Pool(options.mthreads)
    try:
        pool.map(auth_wget, [filedir_url + name for name in selected_files])
    except BaseException:
        # Stop the remaining workers right away on failure or interruption.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Wait for the worker processes to exit before leaving the script.
        pool.join()
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:66
def auth_wget(url, chunk_size=1048576)