9 from bs4
import BeautifulSoup
16 base_cert_url =
"https://cms-service-dqmdc.web.cern.ch/CAF/certification/" 17 base_cert_path =
"/eos/user/c/cmsdqm/www/CAF/certification/" 24 c.setopt(c.WRITEDATA, buffer)
28 return BeautifulSoup(buffer.getvalue(),
"lxml").text
31 out = subprocess.check_output(cmd, shell=
True, executable=
"/bin/bash").
decode(
'utf8')
32 return out.split(
"\n")
35 cmd =
"dasgoclient --query='file dataset=%s site=%s'"%(dataset,site)
37 df = pd.DataFrame(out,columns=[
"file"])
42 cmd =
"dasgoclient --query='file dataset=%s %s| grep file.name, file.nevents'"%(dataset,opt)
44 out = [np.array(r.split(
" "))[[0,3]]
for r
in out
if len(r) > 0]
46 df = pd.DataFrame(out,columns=[
"file",
"events"])
47 df.events = df.events.values.astype(int)
52 cmd =
"dasgoclient --query='file,lumi,run dataset=%s %s'"%(dataset,opt)
55 out = [r.split(
" ")
for r
in out
if len(r)>0]
57 df = pd.DataFrame(out,columns=[
"file",
"run",
"lumis"])
62 cmd =
"dasgoclient --query='file dataset=%s run=%s %s | sum(file.nevents) '"%(dataset,run,opt)
65 out = [o
for o
in out.split(
" ")
if "sum" not in o]
66 out =
int([r.split(
" ")
for r
in out
if len(r)>0][0][0])
71 cmd =
"dasgoclient --query='run dataset=%s %s '"%(dataset,opt)
77 print(
"No intersection between:")
78 print(
" - json : ", best_json)
79 print(
" - dataset: ", dataset)
83 if __name__ ==
'__main__':
85 parser = argparse.ArgumentParser()
86 parser.add_argument(
'--dataset',
'-d', default=
None, help=
"Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW' )",type=str,required=
True)
87 parser.add_argument(
'--threshold',
'-t', help =
"Event threshold per file",type=int,default=-1)
88 parser.add_argument(
'--events',
'-e', help =
"Tot number of events targeted",type=int,default=-1)
89 parser.add_argument(
'--outfile',
'-o', help=
'Dump results to file', type=str, default=
None)
90 parser.add_argument(
'--pandas',
'-pd',action=
'store_true',help=
"Store the whole dataset (no event or threshold cut) in a csv")
91 parser.add_argument(
'--proxy',
'-p', help=
'Allow to parse a x509 proxy if needed', type=str, default=
None)
92 parser.add_argument(
'--site',
'-s', help=
'Only data at specific site', type=str, default=
None)
93 parser.add_argument(
'--precheck',
'-pc', action=
'store_true', help=
'Check run per run before building the dataframes, to avoid huge caching.')
94 args = parser.parse_args()
96 if args.proxy
is not None:
97 os.environ[
"X509_USER_PROXY"] = args.proxy
98 elif "X509_USER_PROXY" not in os.environ:
99 print(
"No X509 proxy set. Exiting.")
103 testing =
"JENKINS_PREFIX" in os.environ
104 dataset = args.dataset
106 threshold = args.threshold
107 outfile = args.outfile
111 year = dataset.split(
"Run")[1][2:4]
112 PD = dataset.split(
"/")[1]
113 cert_type =
"Collisions" +
str(year)
114 if "Cosmics" in dataset:
115 cert_type =
"Cosmics" +
str(year)
116 elif "Commisioning" in dataset:
117 cert_type =
"Commisioning2020" 119 cert_type =
"Collisions" +
str(year) +
"HI" 121 cert_path = base_cert_path + cert_type +
"/" 125 if os.path.isdir(cert_path):
126 json_list = os.listdir(cert_path)
127 if len(json_list) == 0:
129 json_list = [c
for c
in json_list
if "Golden" in c
and "era" not in c]
130 json_list = [c
for c
in json_list
if c.startswith(
"Cert_C")
and c.endswith(
"json")]
135 cert_url = base_cert_url + cert_type +
"/" 137 json_list = [c
for c
in json_list
if "Golden" in c
and "era" not in c
and "Cert_C" in c]
138 json_list = [[cc
for cc
in c.split(
" ")
if cc.startswith(
"Cert_C")
and cc.endswith(
"json")][0]
for c
in json_list]
142 run_ranges = [
int(c.split(
"_")[3]) -
int(c.split(
"_")[2])
for c
in json_list]
143 latest_json = np.array(json_list[np.argmax(run_ranges)]).reshape(1,-1)[0].astype(str)
144 best_json =
str(latest_json[0])
146 with open(cert_path +
"/" + best_json)
as js:
147 golden = json.load(js)
150 golden = ast.literal_eval(golden)
157 R = R + [f
for f
in range(r[0],r[1]+1)]
163 golden_data_runs = [r
for r
in data_runs
if r
in golden_flat]
165 if (len(golden_data_runs)==0):
169 golden_data_runs_tocheck = golden_data_runs
171 if testing
or args.precheck:
172 golden_data_runs_tocheck = []
180 for r
in golden_data_runs:
182 golden_data_runs_tocheck.append(r)
183 if events > 0
and sum_events > events:
186 das_opt =
"run in %s"%(
str([
int(g)
for g
in golden_data_runs_tocheck]))
190 df[
"lumis"] = [[
int(ff)
for ff
in f.replace(
"[",
"").
replace(
"]",
"").
split(
",")]
for f
in df.lumis.values]
192 for r
in golden_data_runs_tocheck:
193 cut = (df[
"run"] == r)
200 if df_r[
"events"].sum() < threshold:
203 good_lumis = np.array([len([ll
for ll
in l
if ll
in golden_flat[r]])
for l
in df_r.lumis])
204 n_lumis = np.array([len(l)
for l
in df_r.lumis])
205 df_rs.append(df_r[good_lumis==n_lumis])
210 df = pd.concat(df_rs)
211 df.loc[:,
"min_lumi"] = [
min(f)
for f
in df.lumis]
212 df.loc[:,
"max_lumi"] = [
max(f)
for f
in df.lumis]
213 df = df.sort_values([
"run",
"min_lumi",
"max_lumi"])
216 df = df.merge(
das_file_site(dataset,site),on=
"file",how=
"inner")
219 df.to_csv(dataset.replace(
"/",
"")+
".csv")
222 df = df[df[
"events"] <= events]
223 df.loc[:,
"sum_evs"] = df.loc[:,
"events"].cumsum()
224 df = df[df[
"sum_evs"] < events]
228 if outfile
is not None:
229 with open(outfile,
'w')
as f:
bool any(const std::vector< T > &v, const T &what)
def das_file_site(dataset, site)
def replace(string, replacements)
def das_run_events_data(dataset, run, opt="")
def das_lumi_data(dataset, opt="")
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
def split(sequence, size)
static std::string join(char **cmd)
bool decode(bool &, std::string_view)
def das_run_data(dataset, opt="")
def das_file_data(dataset, opt="")