CMS 3D CMS Logo

makeListRunsInFiles.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 import Utilities.General.cmssw_das_client as das_client
4 import json
5 import os
6 import sys
7 import subprocess
8 import argparse
9 
10 
def parseArguments(argv=None):
    """Parse the command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When None
            (the default), argparse reads the arguments from sys.argv.

    Returns:
        The argparse namespace with two attributes: ``input`` (required,
        name of the input file list) and ``output`` (name of the output
        file, defaults to "myFileListWithRuns.txt").
    """

    parser = argparse.ArgumentParser(
        description="Tool to find which runs are included in files. Used to generate input dataset for JetHT validation tool in case of run based splitting for condor jobs.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "-i", "--input", action="store", required=True,
        help="Name of the input file list. Has one file name in each line.",
    )
    parser.add_argument(
        "-o", "--output", action="store", default="myFileListWithRuns.txt",
        help="Name of the output file in which the produced file list is stored",
    )

    return parser.parse_args(argv)
20 
21 
22 
def check_proxy():
    """Check if GRID proxy has been initialized.

    Runs ``voms-proxy-info --exists`` with its standard output and
    standard error discarded.

    Returns:
        True if the command exits with status zero. False if it exits
        with a non-zero status or if it cannot be launched at all.
    """

    try:
        subprocess.check_call(
            ["voms-proxy-info", "--exists"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: the command ran and returned non-zero.
        # OSError: the command could not be started (for example it is not
        # on PATH); report "no proxy" rather than crash with a traceback.
        return False
    return True
34 
35 
def findInJson(jsondict, strings):
    """Find a value in decoded JSON by following a path of keys.

    Code originally copy-pasted from dataset.py.

    Args:
        jsondict: A dict, or an iterable whose dict elements are searched
            in order.
        strings: A single key, or a sequence of keys to follow one after
            another. An empty sequence returns ``jsondict`` itself.

    Returns:
        The value reached by following the keys. When ``jsondict`` is not
        a dict, the first dict element for which the whole remaining path
        resolves is used.

    Raises:
        KeyError: If the path cannot be resolved. The message names the
            first key of the path passed to this call.
    """

    if isinstance(strings, str):
        strings = [strings]

    if len(strings) == 0:
        return jsondict

    key, remaining = strings[0], strings[1:]

    if isinstance(jsondict, dict):
        candidates = [jsondict]
    else:
        # Only dict elements can be indexed by a string key. Scalars, strings
        # and nested lists are skipped, and a non-iterable node simply yields
        # no candidates, so every failure surfaces as KeyError (never as a
        # TypeError leaking out of the membership test or the iteration).
        try:
            candidates = [item for item in jsondict if isinstance(item, dict)]
        except TypeError:
            candidates = []

    for candidate in candidates:
        if key in candidate:
            try:
                return findInJson(candidate[key], remaining)
            except KeyError:
                # The rest of the path is not below this candidate; try the next.
                continue

    # if it's not found
    raise KeyError("Can't find " + key)
60 
61 
def getData(dasQuery, dasLimit=0):
    """Get data from a DAS query.

    Code originally copy-pasted from dataset.py.

    Args:
        dasQuery: Query string passed to ``das_client.get_data``.
        dasLimit: Limit passed as the second argument to
            ``das_client.get_data``.

    Returns:
        The value found under the "data" key of the reply.

    Raises:
        KeyError: If the reply carries an error entry under "data", its
            "status" is missing or not 'ok', or it has no "data" key. The
            message contains the "reason" field if present, otherwise the
            whole reply. Output longer than 10000 characters is written to
            a new ``das_query_output_<i>.txt`` file and the message names
            that file instead.
    """

    dasData = das_client.get_data(dasQuery, dasLimit)
    # The reply is decoded only when it arrives as a JSON string.
    if isinstance(dasData, str):
        jsondict = json.loads(dasData)
    else:
        jsondict = dasData

    # Check, if the DAS query fails
    try:
        error = findInJson(jsondict, ["data", "error"])
    except KeyError:
        error = None
    # A reply without any "status" is treated as a failure, so that the full
    # diagnostic below is produced instead of a bare "Can't find status".
    try:
        status = findInJson(jsondict, "status")
    except KeyError:
        status = None

    if error or status != 'ok' or "data" not in jsondict:
        try:
            jsonstr = findInJson(jsondict, "reason")
        except KeyError:
            jsonstr = str(jsondict)
        if not isinstance(jsonstr, str):
            # len() and the string concatenations below need text.
            jsonstr = str(jsonstr)
        if len(jsonstr) > 10000:
            # Pick the first numbered file name that does not exist yet.
            jsonfile = "das_query_output_%i.txt"
            i = 0
            while os.path.lexists(jsonfile % i):
                i += 1
            jsonfile = jsonfile % i
            with open(jsonfile, "w") as theFile:
                theFile.write(jsonstr)
            msg = "The DAS query returned an error. The output is very long, and has been stored in:\n" + jsonfile
        else:
            msg = "The DAS query returned a error. Here is the output\n" + jsonstr
        msg += "\nIt's possible that this was a server error. If so, it may work if you try again later"
        raise KeyError(msg)
    return findInJson(jsondict, "data")
96 
97 
def main():
    """Main program.

    Checks for a grid proxy, reads the list of file names given with
    ``--input``, asks DAS which runs each file contains, and writes one
    "<run number> <file name>" line per (run, file) pair to the file
    given with ``--output``. Exits with status 1 if no proxy is found.
    """

    # Before doing anything, check that the grid proxy exists
    if not check_proxy():
        print("Grid proxy is required to connect to DAS. Cannot run the tool without it.")
        print("Please create a proxy via 'voms-proxy-init -voms cms'.")
        sys.exit(1)

    # Read the command line argument
    commandLineArguments = parseArguments()

    # Read the file list from the input file
    with open(commandLineArguments.input, "r") as inputFile:
        inputFileList = [rawLine.rstrip() for rawLine in inputFile]

    # Find which runs are included in each of the files in the file list
    runDictionary = {}  # Dictionary telling which files contain each run
    for fileName in inputFileList:

        # Blank lines (e.g. a trailing newline at the end of the list)
        # would otherwise be sent to DAS as an empty "run file=" query.
        if not fileName:
            continue

        myData = getData("run file={}".format(fileName))

        # Each DAS record holds a list of run dictionaries under "run".
        for dasInstance in myData:
            for jsonDictionary in findInJson(dasInstance, "run"):
                runNumber = jsonDictionary["run_number"]
                runDictionary.setdefault(runNumber, []).append(fileName)

    # Create an output file indicating which runs can be found from each file
    with open(commandLineArguments.output, "w") as outputFile:
        for runNumber, fileNames in runDictionary.items():
            for fileName in fileNames:
                outputFile.write("{} {}\n".format(runNumber, fileName))
145 
146 
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
def get_data(host, query, idx, limit, debug, threshold=300, ckey=None, cert=None, capath=None, qcache=0, das_headers=True)
Definition: das_client.py:276
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def findInJson(jsondict, strings)
def getData(dasQuery, dasLimit=0)
Definition: main.py:1
#define str(s)