CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
dqmdata_cleaner.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 # ToDo LIST:
4 # new option: -f FILE, --file FILE: write the list of selected files to a user-defined file
5 
6 
7 import os
8 import time
9 import datetime
10 import re
11 import sys
12 from optparse import OptionParser
13 
14 
16  def __init__(self, path, referenceDate, versionsToKeep, noOutput):
17  self.pathToAnalyse = path
18 
19  # convert the given date to epoch time
20  if referenceDate != None:
21  self.referenceTimestamp = time.mktime(referenceDate.timetuple())
22  else:
23  self.referenceTimestamp = None
24 
25  self.versionsToKeep = versionsToKeep
26  self.noOutput = noOutput
27 
28  self.OutdatedFiles = {}
29  self.OutdatedFilesSize = 0 # in kBytes
30 
31  self.VersionedFiles = {}
32  self.VersionedFilesSize = 0 # in kBytes
33 
34  self.RootFilesExtensions = ('.ROOT', '.root')
35 
36 
37  def find_files(self):
38  self.OutdatedFiles = {}
39  self.VersionedFiles = {}
40 
41  self._walk(self.pathToAnalyse)
42 
43 
44  def _walk(self, path):
45  for currentDir, directories, files in os.walk(path):
46  # filter the ROOT files and work only with them
47  files = self._select_root_files_only(files)
48 
49  # apply all filters specified by the user
50 
51  # filter the outdated files and update the list of files to be processed by the other filter
52  if self.referenceTimestamp != None:
53  files = self._select_outdated_files(currentDir, files)
54 
55  # filter versioned files
56  if self.versionsToKeep != None:
57  self._select_versioned_files(currentDir, files)
58 
59 
60  def _select_root_files_only(self, files):
61  rootFiles = []
62 
63  for file in files:
64  if os.path.splitext(file)[1] in self.RootFilesExtensions:
65  rootFiles.append(file)
66 
67  return rootFiles
68 
69 
70  def _select_outdated_files(self, currentDir, rootFiles):
71  #self.OutdatedFiles[currentDir] = []
72  upToDateFiles = []
73 
74  for file in rootFiles:
75  fullFilePath = os.path.join(currentDir, file)
76  if self.referenceTimestamp > os.path.getmtime(fullFilePath):
77  # file is older than the date specified os it should be marked for delete
78  self.OutdatedFiles.setdefault(currentDir, []).append(file)
79  self.OutdatedFilesSize += os.path.getsize(fullFilePath) / 1024.
80  else:
81  upToDateFiles.append(file)
82 
83  # if there are some outdated files just sort them
84  if self.OutdatedFiles.has_key(currentDir):
85  self.OutdatedFiles[currentDir].sort()
86 
87  return upToDateFiles
88 
89 
90  def _select_versioned_files(self, currentDir, rootFiles):
91  subsystemRunNumberGroups = {}
92 
93  for file in rootFiles:
94 #MARCO: Involuted, I would prefer here a real regular expression with matching. Direct index addressing is cryptic and bound to a specific file format.
95  # separate files by sub-systems and run-numbers
96  fileNameSplit = re.split('_', file)
97  # the key consist of the sub-system and run-number concatenated with '_' - e.g. EcalPreshower_R000179816
98  key = fileNameSplit[2] + '_' + fileNameSplit[3][:10]
99  subsystemRunNumberGroups.setdefault(key, []).append(file) # put the file in the appropriate group
100 
101  self.VersionedFiles[currentDir] = {}
102  for key in subsystemRunNumberGroups.iterkeys():
103  # process only files that have more than "versionsToKeep" versions for a given set of sub-system_run-number
104  if len(subsystemRunNumberGroups[key]) > self.versionsToKeep:
105  # the individual sub-systems and run-numbers are separated so the list of version files can be sorted
106 
107 
108 #MARCO: What does the comment mean? The sorting, I guess, is alphabetical, so it works as expected for all version numbers. The fact that the
109 ### sorting does not do what you want does not mean that sorting is not working. can you think of a way to improve it?
110  subsystemRunNumberGroups[key].sort() # DOES NOT WORK CORRECTLY FOR VERSION NUMBERS HIGHER THAN 9999
111 
112  # the list of sorted files is divided into two lists:
113  # to be deleted - all the files with the exception of the last "versionsToKeep" files
114  # to be kept - only the most recent "versionsToKeep" files
115  self.VersionedFiles[currentDir][key] = [[],[]]
116  self.VersionedFiles[currentDir][key][0] = subsystemRunNumberGroups[key][:-self.versionsToKeep]
117  self.VersionedFiles[currentDir][key][1] = subsystemRunNumberGroups[key][-self.versionsToKeep:]
118 
119  # calculate the size of the files marked to be deleted
120  for fileToBeDeleted in self.VersionedFiles[currentDir][key][0]:
121  self.VersionedFilesSize += os.path.getsize(os.path.join(currentDir, fileToBeDeleted)) / 1024.
122 
123  # if no versioned files are found remove the directory from the dictionary
124  if len(self.VersionedFiles[currentDir]) == 0:
125  del self.VersionedFiles[currentDir]
126 
127 
129  if not self.noOutput:
130  # join the two sets of directories with files to be deleted and sort them
131  directories = sorted(self.OutdatedFiles.keys() + self.VersionedFiles.keys())
132  for directory in directories:
133  print('DIR: ' + '"' + directory + '"')
134 
135  # print the outdated files that are to be deleted if any
136  if self.OutdatedFiles.has_key(directory):
137  print('\t' + 'Outdated files to be deleted:')
138  for file in self.OutdatedFiles[directory]:
139  print('\t\t' + file)
140  print('')
141 
142  # print the versioned files that are to be deleted and also that are to be kept
143  if self.VersionedFiles.has_key(directory):
144  print('\t' + 'Versioned files:')
145  for key in sorted(self.VersionedFiles[directory].iterkeys()):
146  print('\t\t' + 'ToBe Deleted:')
147  for file in self.VersionedFiles[directory][key][0]:
148  print('\t\t\t' + file)
149  print('\t\t' + 'ToBe Kept:')
150  for file in self.VersionedFiles[directory][key][1]:
151  print('\t\t\t' + file)
152  print('')
153 
154 
156  print('The space freed by outdated files is: ' + '"' +
157  str( round( self.OutdatedFilesSize/(1024.*1024), 2)) + ' GB"')
158 
159  print('The space freed by versioned files is: ' + '"' +
160  str( round( self.VersionedFilesSize/(1024.*1024), 2)) + ' GB"')
161 
162  print('The total space freed is: ' + '"' +
163  str( round( (self.OutdatedFilesSize + self.VersionedFilesSize)/(1024.*1024), 2)) + ' GB"\n')
164 
165 
167 
168  def __init__(self):
169  usage = sys.argv[0] + ' [options] PATH_TO_ANALYSE'
170  parser = OptionParser(usage=usage)
171 
172  parser.add_option('-d',
173  '--date',
174  type='string',
175  dest='ReferenceDate',
176  metavar='YYYY-MM-DD',
177  help='All the ROOT files older than [YYYY-MM-DD] will be marked for deletion. If the '
178  'user does not specify this option no date filter will be applied at all')
179  parser.add_option('-v',
180  '--versions_to_keep',
181  type='int',
182  dest='VersionsToKeep',
183  metavar='VERSIONS_TO_KEEP',
184  help='Specify number of versions to keep. If a ROOT file has many versions only the most '
185  'recent [VERSIONS_TO_KEEP] of them will be kept. The others will be marked for '
186  'deletion. It the user does not specify this option no version filter will be applied '
187  'at all')
188  parser.add_option('-q',
189  '--quiet',
190  dest='Quiet',
191  action='store_true',
192  default=False,
193  help='If this flag is specified no output is printed to STDOUT.')
194  parser.add_option('-f',
195  '--file',
196  type='string',
197  dest='LogFile',
198  metavar='LOG_FILE',
199  default=None,
200  help='Print all ROOT files selected for deletion to a [LOG_FILE]. If [LOG_FILE] already '
201  'exists it will be deleted.')
202 
203  # parse the user specified arguments
204  (options, args) = parser.parse_args()
205  self.ReferenceDate = options.ReferenceDate
206  self.VersionsToKeep = options.VersionsToKeep
207  self.Quiet = options.Quiet
208 
209  self.ArgumentsOK = self._check_arguments(parser, args)
210 
211 
212  def _check_arguments(self, parser, args):
213 
214  # check self.PathToAnalyse
215  if len(args) == 1:
216  self.PathToAnalyse = args[0]
217  else:
218  print('Wrong number of positional arguments. You have to specify only PATH_TO_ANALYSE!\n')
219  parser.print_help()
220  return False
221 
222  if not os.path.exists(self.PathToAnalyse): # check whether self.PathToAnalyse exists
223  print('The path "' + self.PathToAnalyse + '" does not exists or in not readable!')
224  return False
225 
226  # check self.ReferenceDate - it should be a valid date string
227  if self.ReferenceDate != None:
228  dateSplit = self.ReferenceDate.split('-')
229  try: # convert self.ReferenceDate to datetime.date object
230  self.ReferenceDate = datetime.date(int(dateSplit[0]), int(dateSplit[1]), int(dateSplit[2]))
231  except:
232  print('"' + self.ReferenceDate + '" - Wrong date format (please use YYYY-MM-DD) or nonexistent date!')
233  return False
234 
235  # check self.VersionsToKeep
236  if (self.VersionsToKeep != None) and (self.VersionsToKeep < 1):
237  print('Number of versions to keep should be a positive integer. '
238  'The value you specified is "' + str(self.VersionsToKeep) + '"')
239  return False
240 
241  # if this is reached the argumnts are OK
242  return True
243 
244 
245 if __name__ == '__main__':
246 
248  if args.ArgumentsOK:
249  rootFilesFilter = RootFilesFilter(args.PathToAnalyse, args.ReferenceDate, args.VersionsToKeep, args.Quiet)
250  rootFilesFilter.find_files()
251  rootFilesFilter.show_selected_files()
252  rootFilesFilter.show_some_statistics()
253  sys.exit(0)
254  else:
255  sys.exit(1)
256 
std::string print(const Track &, edm::Verbosity=edm::Concise)
Track print utility.
Definition: print.cc:10
def show_selected_files
sorting does not do what you want does not mean that sorting is not working.