CMS 3D CMS Logo

electronDataDiscovery.py
Go to the documentation of this file.
1 from __future__ import print_function
2 
3 #===================================================================
4 # To get the list of input files, one must call:
5 # search(), to get the list of primary files
6 # search2(), to get the list of eventual secondary files
7 #
8 # The selection of files is configured thanks to shell
9 # environment variables:
10 #
11 # DD_RELEASE, for example CMSSW_2_2_0_pre1
12 # DD_SAMPLE, for example RelValSingleElectronPt35
13 # DD_RUN, for example ''
14 # DD_COND , for example MC_31X_V2-v1
15 # DD_TIER , for example RECO
16 # DD_TIER_SECONDARY, for eventual secondary files
17 #
18 # DD_SOURCE:
19 # das: use das
20 # dbs: use dbs search
21 # lsf: use dbs lsf
22 # /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
23 # for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
24 # for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
25 # /eos/cms/...: assumed to be the path of an eos directory containing the input data files
26 # for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
27 # for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
28 # /...: assumed to be the path of a text file containing the list of input data files
29 #
30 # All except DD_SOURCE can use wildcard *.
31 #===================================================================
32 
33 #import httplib, urllib, urllib2, types, string, os, sys
34 import os, sys, re, das_client
35 
# Give every selection variable a default, so that the os.environ[...] lookups
# made below and in common_search() never raise KeyError. Values already set
# in the calling shell are left untouched.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')

# Selection patterns, compiled once at import time. The shell-style wildcard
# '*' is translated to the regex '.*'; every other character is handed to the
# regex engine as-is. An empty value compiles to a pattern matching anything.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
55 
def _command_lines(command):
    """Run *command* through the shell and return its output as a list of lines."""
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _absolute_paths(lines):
    """Return the right-stripped *lines* that start with "/", dropping the rest."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith('/')]


def _dbs_query(dd_tier):
    """Build the DBS "find file" query used by both the dbs and http sources."""
    conditions = []
    if os.environ['DD_RELEASE'] != "":
        conditions.append("release = " + os.environ['DD_RELEASE'])
    if os.environ['DD_SAMPLE'] != "":
        conditions.append("primds = " + os.environ['DD_SAMPLE'])
    if os.environ['DD_RUN'] != "":
        conditions.append("run = " + os.environ['DD_RUN'])
    # The dataset condition is always present, so there is always a "where".
    conditions.append("dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*")
    return "find file where " + " and ".join(conditions)


def _das_reply(query):
    """Send *query* to DAS and return the decoded JSON reply."""
    return das_client.json.loads(
        das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))


def _das_files(dd_tier):
    """Resolve the single DAS dataset selected by the environment, list its files.

    Returns [] (after printing a diagnostic) when no dataset matches, when
    several different datasets match, or when the dataset has no file.
    """
    query = "dataset instance=cms_dbs_prod_global"
    if os.environ['DD_RELEASE'] != "":
        query = query + " release=" + os.environ['DD_RELEASE']
    if os.environ['DD_SAMPLE'] != "":
        query = query + " primary_dataset=" + os.environ['DD_SAMPLE']
    if dd_tier != "":
        query = query + " tier=" + dd_tier
    if os.environ['DD_COND'] != "":
        query = query + " dataset=*" + os.environ['DD_COND'] + "*"
    if os.environ['DD_RUN'] != "":
        query = query + " run=" + os.environ['DD_RUN']

    data = _das_reply(query)
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query:', query)
        return []

    # Collapse leading duplicates; give up as soon as the first two entries
    # name different datasets, since the selection is then ambiguous.
    entries = data['data']
    while data['nresults'] > 1:
        if entries[0]['dataset'][0]['name'] == entries[1]['dataset'][0]['name']:
            entries.pop(0)
            data['nresults'] -= 1
        else:
            print('[electronDataDiscovery.py] Several DAS datasets for query:', query)
            for i in range(data['nresults']):
                print('[electronDataDiscovery.py] dataset['+str(i)+']: '+entries[i]['dataset'][0]['name'])
            return []

    dataset = entries[0]['dataset'][0]['name']
    data = _das_reply("file instance=cms_dbs_prod_global dataset=" + dataset)
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset:', dataset)
        return []

    return [str(data['data'][i]['file'][0]['name'])
            for i in range(0, data['nresults'])]


def _http_files(dd_tier):
    """Query the DBS discovery web page and return the file paths it lists."""
    # The module does not import urllib/urllib2 at top level (that import is
    # commented out), so the names are brought in here, for Python 3 or 2.
    try:
        from urllib.error import HTTPError
        from urllib.parse import quote, urlencode
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2
        from urllib import quote, urlencode
        from urllib2 import HTTPError, Request, urlopen

    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    ctypes = "text/plain"
    headers = {'User-Agent': agent, 'Accept': ctypes}
    params = {'dbsInst': 'cms_dbs_prod_global',
              'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
              'userInput': quote(_dbs_query(dd_tier)),
              'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
    # Python 3 requires the POST body to be bytes; the encoded form is ASCII.
    body = urlencode(params, doseq=True).encode('ascii')
    request = Request(url, body, headers)

    text = ""
    try:
        response = urlopen(request)
        try:
            payload = response.read()
        finally:
            response.close()
        if not isinstance(payload, str):  # bytes under Python 3
            payload = payload.decode('utf-8', 'replace')
        text = payload
    except HTTPError as e:
        if e.code == 201:
            # Reported for information only; the search then yields no file.
            print(e.headers)
            print(e.msg)
        else:
            raise

    # read() returns the whole reply as one string: split it into lines here
    # (the string has neither readlines() nor close()).
    return _absolute_paths(text.splitlines())


def _rfdir_names(path):
    """Return the 9th whitespace-separated field of each `rfdir` output line."""
    names = []
    for line in _command_lines('rfdir ' + path):
        fields = line.split()
        if len(fields) > 8:  # skip blank or short lines instead of raising IndexError
            names.append(fields[8])
    return names


def _castor_files():
    """List the files found one directory level below the castor dir DD_SOURCE."""
    castor_dir = os.environ['DD_SOURCE'].replace('/castor/cern.ch/cms/', '/', 1)
    found = []
    for subdir in _rfdir_names('/castor/cern.ch/cms' + castor_dir):
        for name in _rfdir_names('/castor/cern.ch/cms' + castor_dir + '/' + subdir):
            found.append(castor_dir + '/' + subdir + '/' + name)
    return found


def _passes_filters(path, dd_tier_re):
    """Tell whether *path* matches the sample, cond, tier and run patterns."""
    patterns = (dd_sample_re, dd_cond_re, dd_tier_re, dd_run_re)
    return all(pattern.search(path) is not None for pattern in patterns)


def common_search(dd_tier):
    """Return the list of input file names for the data tier *dd_tier*.

    The lookup method is chosen by the DD_SOURCE environment variable:
    "das", "dbs", "http" or "lsf" query the corresponding service or command,
    a value starting with /castor/cern.ch/cms/ or /eos/cms/ is listed as a
    directory, and anything else is opened as a text file holding one file
    name per line. The other DD_* variables narrow the selection.

    dd_tier may contain the wildcard '*'. For the eos and text-file sources
    it is compiled to a regex and applied together with the module-level
    sample, cond and run patterns.

    Returns a list of strings, possibly empty; a diagnostic is printed when
    nothing is left.
    """
    dd_tier_re = re.compile(dd_tier.replace('*', '.*'))
    source = os.environ['DD_SOURCE']

    if source == "das":
        # _das_files prints its own diagnostics, and a successful lookup is
        # never empty, so the generic "No more files" message cannot apply.
        return _das_files(dd_tier)

    elif source == "dbs":
        result = _absolute_paths(_command_lines(
            'dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "'
            + _dbs_query(dd_tier) + '"'))

    elif source == "http":
        result = _http_files(dd_tier)

    elif source == "lsf":
        # Use the requested tier (so that search2() looks at the secondary
        # tier), and do not append a quote to the path: the command below
        # already wraps it in a balanced pair.
        dbs_path = '/'+os.environ['DD_SAMPLE']+'/'+os.environ['DD_RELEASE']+'-'+os.environ['DD_COND']+'/'+dd_tier
        if __name__ == "__main__":
            print('dbs path:', dbs_path)
        result = _absolute_paths(_command_lines('dbs lsf --path="'+dbs_path+'"'))

    elif source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir
        result = _castor_files()

    elif source.startswith('/eos/cms/'):  # assumed to be an eos dir
        result = []
        for line in _command_lines('eos find -f ' + source):
            line = line.strip().replace('/eos/cms/', '/', 1)
            if line != "" and _passes_filters(line, dd_tier_re):
                result.append(line)

    else:  # DD_SOURCE is assumed to be a file name
        with open(source) as listing:
            candidates = [os.path.expandvars(line.strip()) for line in listing]
        result = [line for line in candidates
                  if line != "" and _passes_filters(line, dd_tier_re)]

    if len(result) == 0:
        diag = '[electronDataDiscovery.py] No more files after filtering with :'
        if os.environ['DD_SAMPLE'] != '':
            diag += ' ' + os.environ['DD_SAMPLE']
        if os.environ['DD_COND'] != '':
            diag += ' ' + os.environ['DD_COND']
        if dd_tier != '':
            diag += ' ' + dd_tier
        if os.environ['DD_RUN'] != '':
            diag += ' ' + os.environ['DD_RUN']
        print(diag)

    return result
265 
def search():
    """Return the primary input files: those of the tier named by DD_TIER."""
    primary_tier = os.environ['DD_TIER']
    return common_search(primary_tier)
268 
def search2():
    """Return the secondary input files: those of the tier named by DD_TIER_SECONDARY."""
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
271 
272 
273 
274 
def replace(string, replacements)
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:66
#define str(s)