CMS 3D CMS Logo

electronDataDiscovery.py
Go to the documentation of this file.
1 from __future__ import print_function
2 
3 # ===================================================================
4 # To get the list of input files, one must call:
5 # search(), to get the list of primary files
6 # search2(), to get the list of possible secondary files
7 #
8 # The selection of files is configured thanks to shell
9 # environment variables:
10 #
11 # DD_RELEASE, for example CMSSW_2_2_0_pre1
12 # DD_SAMPLE, for example RelValSingleElectronPt35
13 # DD_RUN, for example ''
14 # DD_COND , for example MC_31X_V2-v1
15 # DD_TIER , for example RECO
16 # DD_TIER_SECONDARY, for eventual secondary files
17 #
18 # DD_SOURCE:
19 # das: use das
20 # dbs: use dbs search
21 # lsf: use dbs lsf
22 # /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
23 # for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
24 # for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
25 # /eos/cms/...: assumed to be the path of an eos directory containing the input data files
26 # for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
27 # for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
28 # /...: assumed to be the path of a text file containing the list of input data files
29 #
30 # All except DD_SOURCE can use wildcard *.
31 # ===================================================================
32 
33 import os, sys, re # , das_client
34 import httplib, urllib, urllib2, types, string # , os, sys
35 import Utilities.General.cmssw_das_client as das_client
36 import json
37 from json import loads, dumps
38 
# Give every selection variable a value, so that the lookups below and in the
# search functions never raise KeyError. Values already set are left untouched.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')

# The shell-style wildcard '*' is turned into the regular expression '.*'.
# The rest of each value is handed to the regex engine unchanged, and an empty
# value gives a pattern that matches every line.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
58 
59 
def _dd_read_command(command):
    """Run `command` through the shell and return its output as a list of lines.

    The pipe is closed even if reading fails.
    """
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _dd_absolute_paths(lines):
    """Return the right-stripped lines that start with '/'; drop all others."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith('/')]


def _dd_filter(lines, dd_tier_re):
    """Keep the non-empty lines matching the sample, conditions, tier and run patterns."""
    patterns = (dd_sample_re, dd_cond_re, dd_tier_re, dd_run_re)
    return [line for line in lines
            if line != "" and all(pattern.search(line) is not None for pattern in patterns)]


def _dd_dbs_query(dd_tier):
    """Build the DBS 'find file where ...' query from the DD_* environment variables."""
    conditions = []
    for field, variable in (('release', 'DD_RELEASE'), ('primds', 'DD_SAMPLE'), ('run', 'DD_RUN')):
        if os.environ[variable] != "":
            conditions.append(field + " = " + os.environ[variable])
    # The dataset condition is always present, so the 'where' clause is never empty.
    conditions.append("dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*")
    return "find file where " + " and ".join(conditions)


def _dd_das_search(dd_tier):
    """Look up the single matching dataset in DAS and return the names of its files.

    Returns None, after printing a diagnostic, when no dataset matches, when
    several different datasets match, or when the dataset has no file.
    """
    host = 'https://cmsweb.cern.ch'

    query = "dataset instance=cms_dbs_prod_global"
    if os.environ['DD_RELEASE'] != "":
        query += " release=" + os.environ['DD_RELEASE']
    if os.environ['DD_SAMPLE'] != "":
        query += " primary_dataset=" + os.environ['DD_SAMPLE']
    if dd_tier != "":
        query += " tier=" + dd_tier
    if os.environ['DD_COND'] != "":
        query += " dataset=*" + os.environ['DD_COND'] + "*"
    if os.environ['DD_RUN'] != "":
        query += " run=" + os.environ['DD_RUN']

    data = das_client.json.loads(das_client.get_data(host, query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query:', query)
        return None

    # The same dataset may be reported several times: drop leading duplicates,
    # and give up as soon as two different dataset names show up.
    while data['nresults'] > 1:
        if data['data'][0]['dataset'][0]['name'] == data['data'][1]['dataset'][0]['name']:
            data['data'].pop(0)
            data['nresults'] -= 1
            continue
        print('[electronDataDiscovery.py] Several DAS datasets for query:', query)
        for i in range(data['nresults']):
            print('[electronDataDiscovery.py] dataset[' + str(i) + ']: ' + data['data'][i]['dataset'][0]['name'])
        return None

    dataset = data['data'][0]['dataset'][0]['name']
    query = "file instance=cms_dbs_prod_global dataset=" + dataset
    data = das_client.json.loads(das_client.get_data(host, query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset:', dataset)
        return None
    # Was formatting the undefined name `nresults`, which raised NameError.
    print('there is %d results' % data['nresults'])

    return [str(entry['file'][0]['name']) for entry in data['data'][:data['nresults']]]


def _dd_http_search(dd_tier):
    """Send the DBS query to the dbs_discovery web page and return the file paths found."""
    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    headers = {'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
               'Accept': "text/plain"}
    params = {'dbsInst': 'cms_dbs_prod_global',
              'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
              'userInput': urllib.quote(_dd_dbs_query(dd_tier)),
              'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
    request = urllib2.Request(url, urllib.urlencode(params, doseq=True), headers)

    body = ""
    try:
        body = urllib2.urlopen(request).read()
    except urllib2.HTTPError as error:
        # Status 201 is only reported; the search then yields no file.
        if error.code != 201:
            raise
        print(error.headers)
        print(error.msg)

    # `body` is a string, not a file object: split it rather than calling
    # readlines()/close() on it.
    return _dd_absolute_paths(body.splitlines())


def _dd_rfdir_names(path):
    """Return the entry names printed by `rfdir path` (9th whitespace-separated column)."""
    names = []
    for line in _dd_read_command('rfdir ' + path):
        fields = line.split()
        # Skip blank or truncated lines instead of failing with IndexError.
        if len(fields) > 8:
            names.append(fields[8])
    return names


def _dd_castor_search():
    """List the files found one directory level below the castor directory DD_SOURCE."""
    castor_dir = os.environ['DD_SOURCE'].replace('/castor/cern.ch/cms/', '/', 1)
    root = '/castor/cern.ch/cms' + castor_dir
    result = []
    for subdir in _dd_rfdir_names(root):
        for name in _dd_rfdir_names(root + '/' + subdir):
            result.append(castor_dir + '/' + subdir + '/' + name)
    return result


def common_search(dd_tier):
    """Return the list of input files for data tier `dd_tier`.

    The lookup method is selected by the environment variable DD_SOURCE:
      "das":  query DAS for the single matching dataset, then for its files;
      "dbs":  run the `dbs search` command;
      "http": send the DBS query to the dbs_discovery web page;
      "lsf":  run the `dbs lsf` command;
      "/castor/cern.ch/cms/...": list a castor directory with `rfdir`;
      "/eos/cms/...": list an eos directory with `eos find -f`;
      anything else: path of a text file holding one input file per line.
    The other DD_* variables select the release, sample, conditions and run.

    dd_tier -- data tier to select, for example 'RECO'; '*' is a wildcard.

    Returns a list of file names, empty when nothing is found. A diagnostic
    is printed whenever the list is empty.
    """
    dd_tier_re = re.compile(dd_tier.replace('*', '.*'))
    source = os.environ['DD_SOURCE']

    if source == "das":

        result = _dd_das_search(dd_tier)
        if result is None:  # failure already reported by the helper
            return []

    elif source == "dbs":

        command = ('dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "'
                   + _dd_dbs_query(dd_tier) + '"')
        result = _dd_absolute_paths(_dd_read_command(command))

    elif source == "http":

        result = _dd_http_search(dd_tier)

    elif source == "lsf":

        # Uses the requested tier, so that search2() lists the secondary tier.
        # No quote is appended here: the command below already closes the one it opens.
        dbs_path = ('/' + os.environ['DD_SAMPLE'] + '/' + os.environ['DD_RELEASE']
                    + '-' + os.environ['DD_COND'] + '/' + dd_tier)
        if __name__ == "__main__":
            print('dbs path:', dbs_path)
        result = _dd_absolute_paths(_dd_read_command('dbs lsf --path="' + dbs_path + '"'))

    elif source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir

        result = _dd_castor_search()

    elif source.startswith('/eos/cms/'):  # assumed to be an eos dir

        lines = _dd_read_command('eos find -f ' + source)
        candidates = [line.strip().replace('/eos/cms/', '/', 1) for line in lines]
        result = _dd_filter(candidates, dd_tier_re)

    else:  # DD_SOURCE is assumed to be a file name

        with open(source) as listing:
            candidates = [os.path.expandvars(line.strip()) for line in listing]
        result = _dd_filter(candidates, dd_tier_re)

    if not result:
        diag = '[electronDataDiscovery.py] No more files after filtering with :'
        for value in (os.environ['DD_SAMPLE'], os.environ['DD_COND'], dd_tier, os.environ['DD_RUN']):
            if value != '':
                diag += ' ' + value
        print(diag)

    return result
273 
274 
def search():
    """Return the list of primary input files.

    The data tier is read from the environment variable DD_TIER and the
    lookup itself is done by common_search().
    """
    primary_tier = os.environ['DD_TIER']
    # Report the tier actually searched; the literal text 'DD_TIER' used to
    # be printed instead of the variable's value.
    print('search in %s' % primary_tier)
    return common_search(primary_tier)
278 
279 
def search2():
    """Return the list of secondary input files.

    The data tier is read from the environment variable DD_TIER_SECONDARY
    and the lookup itself is done by common_search().
    """
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
282 
283 
def getCMSdata(data, dbs="prod/global"):
    """Return the sorted list of files of dataset `data`, as reported by dasgoclient.

    data -- dataset name, inserted after 'dataset=' in the query.
    dbs  -- DBS instance, inserted after 'instance=' in the query.

    Returns a list of file names, empty when the command prints nothing.
    """
    # Both values are inserted in one step: chained str.replace() calls would
    # also rewrite any 'DBS' found inside the dataset name.
    command = 'dasgoclient --query="file dataset=%s instance=%s" | sort' % (data, dbs)
    pipe = os.popen(command)
    try:
        output = pipe.read()
    finally:
        pipe.close()
    # splitlines() leaves no empty trailing entry and keeps the last file
    # even when the output does not end with a newline.
    return output.splitlines()
def get_data(host, query, idx, limit, debug, threshold=300, ckey=None, cert=None, capath=None, qcache=0, das_headers=True)
Definition: das_client.py:276
def replace(string, replacements)
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def getCMSdata(data, dbs="prod/global")
#define str(s)