CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
electronDataDiscovery.py
Go to the documentation of this file.
1 
2 #===================================================================
3 # To get the list of input files, one must call:
4 # search(), to get the list of primary files
5 # search2(), to get the list of secondary files, if any
6 #
7 # The selection of files is configured thanks to shell
8 # environment variables:
9 #
10 # DD_RELEASE, for example CMSSW_2_2_0_pre1
11 # DD_SAMPLE, for example RelValSingleElectronPt35
12 # DD_RUN, for example ''
13 # DD_COND , for example MC_31X_V2-v1
14 # DD_TIER , for example RECO
15 # DD_TIER_SECONDARY, for the secondary files, if any
16 #
17 # DD_SOURCE:
18 # das: use das
19 # dbs: use dbs search
20 # lsf: use dbs lsf
21 # /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
22 # for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
23 # for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
24 # /eos/cms/...: assumed to be the path of an eos directory containing the input data files
25 # for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
26 # for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
27 # /...: assumed to be the path of a text file containing the list of input data files
28 #
29 # All except DD_SOURCE can use wildcard *.
30 #===================================================================
31 
32 #import httplib, urllib, urllib2, types, string, os, sys
33 import os, sys, re, das_client
34 
# Give every DD_* selection variable a default value, so that the rest of the
# module can index os.environ directly without guarding against a KeyError.
# setdefault() leaves a value already present in the environment untouched.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')

# Selection patterns, compiled once when the module is imported. The only
# translation applied is the shell-like wildcard '*' -> regex '.*'; any other
# regex metacharacter present in a variable is passed to re.compile() as is.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
54 
def _read_command_lines(command):
    """Run *command* through os.popen() and return its output lines."""
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _absolute_paths(lines):
    """Return the right-stripped *lines* that start with '/'."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith('/')]


def _dbs_file_query(dd_tier):
    """Build the 'find file where ...' DBS query from the DD_* variables."""
    clauses = []
    for field, variable in (('release', 'DD_RELEASE'),
                            ('primds', 'DD_SAMPLE'),
                            ('run', 'DD_RUN')):
        value = os.environ[variable]
        if value != "":
            clauses.append(field + " = " + value)
    # The dataset clause is always present, even when DD_COND and the tier
    # are both empty.
    clauses.append("dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*")
    return "find file where " + " and ".join(clauses)


def _das_search(dd_tier):
    """Query DAS for the single matching dataset and return its file names.

    Returns an empty list, after printing a diagnostic, when no dataset
    matches, when several distinct datasets match, or when the dataset
    contains no file.
    """
    das_host = 'https://cmsweb.cern.ch'

    query = "dataset instance=cms_dbs_prod_global"
    if os.environ['DD_RELEASE'] != "":
        query += " release=" + os.environ['DD_RELEASE']
    if os.environ['DD_SAMPLE'] != "":
        query += " primary_dataset=" + os.environ['DD_SAMPLE']
    if dd_tier != "":
        query += " tier=" + dd_tier
    if os.environ['DD_COND'] != "":
        query += " dataset=*" + os.environ['DD_COND'] + "*"
    if os.environ['DD_RUN'] != "":
        query += " run=" + os.environ['DD_RUN']
    # NOTE(review): a trailing " | unique" filter was left disabled in the
    # original, apparently because it was too slow; duplicates are therefore
    # removed by hand below.

    data = das_client.json.loads(das_client.get_data(das_host, query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query: ' + query)
        return []

    # Drop leading duplicates; give up as soon as two different names remain.
    while data['nresults'] > 1:
        first, second = data['data'][0], data['data'][1]
        if first['dataset'][0]['name'] == second['dataset'][0]['name']:
            data['data'].pop(0)
            data['nresults'] -= 1
        else:
            print('[electronDataDiscovery.py] Several DAS datasets for query: ' + query)
            for i in range(data['nresults']):
                print('[electronDataDiscovery.py] dataset[' + str(i) + ']: '
                      + data['data'][i]['dataset'][0]['name'])
            return []

    dataset = data['data'][0]['dataset'][0]['name']
    query = "file instance=cms_dbs_prod_global dataset=" + dataset
    data = das_client.json.loads(das_client.get_data(das_host, query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset: ' + dataset)
        return []

    return [str(data['data'][i]['file'][0]['name'])
            for i in range(data['nresults'])]


def _http_dbs_lines(query):
    """POST *query* to the DBS discovery page and return the reply lines.

    An HTTP 201 error is reported on stdout and yields an empty list; any
    other HTTPError is re-raised.
    """
    # Imported here because the module-level import of urllib/urllib2 is
    # commented out; the fallback covers the Python 3 module layout.
    try:
        from urllib import quote, urlencode
        from urllib2 import HTTPError, Request, urlopen
    except ImportError:
        from urllib.error import HTTPError
        from urllib.parse import quote, urlencode
        from urllib.request import Request, urlopen

    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    headers = {'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
               'Accept': "text/plain"}
    params = {'dbsInst': 'cms_dbs_prod_global',
              'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
              'userInput': quote(query),
              'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
    payload = urlencode(params, doseq=True)
    if not isinstance(payload, bytes):
        payload = payload.encode('ascii')  # Python 3 wants a bytes body

    try:
        response = urlopen(Request(url, payload, headers))
    except HTTPError as error:
        if error.code != 201:
            raise
        print(error.headers)
        print(error.msg)
        return []
    try:
        body = response.read()
    finally:
        response.close()

    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    # The reply is a plain string: split it, instead of treating it as a file.
    return body.splitlines()


def _rfdir_names(path):
    """Return the ninth whitespace-separated field of each `rfdir` line.

    Lines with fewer than nine fields (blank lines, messages) are skipped
    instead of raising IndexError.
    """
    names = []
    for line in _read_command_lines('rfdir ' + path):
        fields = line.split()
        if len(fields) > 8:
            names.append(fields[8])
    return names


def _passes_filters(path, dd_tier_re):
    """Tell whether *path* matches the sample, cond, tier and run patterns."""
    patterns = (dd_sample_re, dd_cond_re, dd_tier_re, dd_run_re)
    return all(pattern.search(path) is not None for pattern in patterns)


def common_search(dd_tier):
    """Return the list of input data files for the data tier *dd_tier*.

    The lookup method is selected by the DD_SOURCE environment variable:

    * ``das``: DAS query through das_client;
    * ``dbs``: ``dbs search`` command line;
    * ``http``: POST to the DBS discovery web page;
    * ``lsf``: ``dbs lsf`` command line;
    * ``/castor/cern.ch/cms/...``: two-level ``rfdir`` listing of that
      directory (no pattern filtering);
    * ``/eos/cms/...``: ``eos.select find -f`` on that directory, filtered
      with the sample, cond, tier and run patterns;
    * anything else: path of a text file with one file name per line
      (environment variables expanded), filtered the same way.

    Args:
        dd_tier: data tier to select; the wildcard ``*`` is allowed.

    Returns:
        A list of file names, possibly empty. Except on the early DAS
        failures, which print their own message, an empty result is
        reported on stdout.
    """
    dd_tier_re = re.compile(dd_tier.replace('*', '.*'))
    source = os.environ['DD_SOURCE']

    if source == "das":
        # _das_search() prints its own diagnostics and never returns an
        # empty list without one, so the generic message below is skipped.
        return _das_search(dd_tier)

    if source == "dbs":
        command = ('dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "'
                   + _dbs_file_query(dd_tier) + '"')
        result = _absolute_paths(_read_command_lines(command))

    elif source == "http":
        result = _absolute_paths(_http_dbs_lines(_dbs_file_query(dd_tier)))

    elif source == "lsf":
        # Use the requested tier (not DD_TIER), so that search2() really
        # looks up the secondary tier; no stray quote at the end of the path.
        dbs_path = ('/' + os.environ['DD_SAMPLE'] + '/' + os.environ['DD_RELEASE']
                    + '-' + os.environ['DD_COND'] + '/' + dd_tier)
        if __name__ == "__main__":
            print('dbs path: ' + dbs_path)
        result = _absolute_paths(_read_command_lines('dbs lsf --path="' + dbs_path + '"'))

    elif source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir
        castor_dir = source.replace('/castor/cern.ch/cms/', '/', 1)
        castor_root = '/castor/cern.ch/cms' + castor_dir
        result = []
        for subdir in _rfdir_names(castor_root):
            for name in _rfdir_names(castor_root + '/' + subdir):
                result.append(castor_dir + '/' + subdir + '/' + name)

    elif source.startswith('/eos/cms/'):  # assumed to be an eos dir
        listing = _read_command_lines(
            '/afs/cern.ch/project/eos/installation/pro/bin/eos.select find -f ' + source)
        result = []
        for line in listing:
            path = line.strip().replace('/eos/cms/', '/', 1)
            if path != "" and _passes_filters(path, dd_tier_re):
                result.append(path)

    else:  # DD_SOURCE is assumed to be the name of a text file
        result = []
        with open(source) as listing:
            for line in listing:
                path = os.path.expandvars(line.strip())
                if path != "" and _passes_filters(path, dd_tier_re):
                    result.append(path)

    if not result:
        diag = '[electronDataDiscovery.py] No more files after filtering with :'
        for value in (os.environ['DD_SAMPLE'], os.environ['DD_COND'],
                      dd_tier, os.environ['DD_RUN']):
            if value != '':
                diag += ' ' + value
        print(diag)

    return result
264 
def search():
    """Return the primary input files, i.e. those of the DD_TIER data tier."""
    primary_tier = os.environ['DD_TIER']
    return common_search(primary_tier)
267 
def search2():
    """Return the secondary input files, i.e. those of the DD_TIER_SECONDARY data tier."""
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
270 
271 
272 
273