CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
ResourceMonitorCollection.cc
Go to the documentation of this file.
1 // $Id: ResourceMonitorCollection.cc,v 1.41 2011/04/18 15:18:57 mommsen Exp $
3 
4 #include <stdio.h>
5 #include <string>
6 #include <sstream>
7 #include <iomanip>
8 #include <sys/types.h>
9 #include <sys/stat.h>
10 #ifdef __APPLE__
11 #include <sys/param.h>
12 #include <sys/mount.h>
13 #else
14 #include <sys/statfs.h>
15 #endif
16 #include <fcntl.h>
17 #include <dirent.h>
18 #include <fnmatch.h>
19 #include <pwd.h>
20 #include <fstream>
21 #include <algorithm>
22 
23 #include <boost/bind.hpp>
24 #include <boost/regex.hpp>
25 
30 
31 
32 namespace stor {
33 
35  (
36  const utils::Duration_t& updateInterval,
38  ) :
39  MonitorCollection(updateInterval),
40  updateInterval_(updateInterval),
41  alarmHandler_(ah),
42  numberOfCopyWorkers_(-1),
43  numberOfInjectWorkers_(-1),
44  nLogicalDisks_(0),
45  latchedSataBeastStatus_(-1)
46  {}
47 
48 
50  {
51  boost::mutex::scoped_lock sl(diskUsageListMutex_);
52 
53  dwParams_ = dwParams;
54 
55  nLogicalDisks_ = std::max(dwParams.nLogicalDisk_, 1);
56  diskUsageList_.clear();
57  diskUsageList_.reserve(nLogicalDisks_+dwParams.otherDiskPaths_.size()+1);
58 
59  for (unsigned int i=0; i<nLogicalDisks_; ++i) {
60 
61  std::ostringstream pathName;
62  pathName << dwParams.filePath_;
63  if( dwParams.nLogicalDisk_ > 0 ) {
64  pathName << "/" << std::setfill('0') << std::setw(2) << i;
65  }
66  addDisk(pathName.str());
67  }
68  addDisk(dwParams.dbFilePath_);
69 
71  {
72  addOtherDisks();
73  }
74  }
75 
76 
77  void ResourceMonitorCollection::addDisk(const std::string& pathname)
78  {
79  if ( pathname.empty() ) return;
80 
81  DiskUsagePtr diskUsage( new DiskUsage() );
82  diskUsage->pathName = pathname;
83  retrieveDiskSize(diskUsage);
84  diskUsageList_.push_back(diskUsage);
85  }
86 
87 
89  {
90  for ( DiskWritingParams::OtherDiskPaths::const_iterator
91  it = dwParams_.otherDiskPaths_.begin(),
92  itEnd = dwParams_.otherDiskPaths_.end();
93  it != itEnd;
94  ++it)
95  {
96  addDisk(*it);
97  }
98  }
99 
100 
102  (
103  ResourceMonitorParams const& rmParams
104  )
105  {
106  rmParams_ = rmParams;
107  }
108 
109 
111  (
112  AlarmParams const& alarmParams
113  )
114  {
115  alarmParams_ = alarmParams;
116  }
117 
118 
120  {
121  getDiskStats(stats);
122 
125 
127  }
128 
129 
131  {
132  boost::mutex::scoped_lock sl(diskUsageListMutex_);
133 
134  stats.diskUsageStatsList.clear();
135  stats.diskUsageStatsList.reserve(diskUsageList_.size());
136  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
137  itEnd = diskUsageList_.end();
138  it != itEnd;
139  ++it)
140  {
141  DiskUsageStatsPtr diskUsageStats(new DiskUsageStats);
142  diskUsageStats->diskSize = (*it)->diskSize;
143  diskUsageStats->absDiskUsage = (*it)->absDiskUsage;
144  diskUsageStats->relDiskUsage = (*it)->relDiskUsage;
145  diskUsageStats->pathName = (*it)->pathName;
146  diskUsageStats->alarmState = (*it)->alarmState;
147  stats.diskUsageStatsList.push_back(diskUsageStats);
148  }
149  }
150 
151 
153  {
154  calcDiskUsage();
157  checkSataBeasts();
158  }
159 
160 
162  {
166 
167  boost::mutex::scoped_lock sl(diskUsageListMutex_);
168  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
169  itEnd = diskUsageList_.end();
170  it != itEnd;
171  ++it)
172  {
173  (*it)->absDiskUsage = -1;
174  (*it)->relDiskUsage = -1;
175  (*it)->alarmState = AlarmHandler::OKAY;
176  }
177  }
178 
179 
181  {
182  infoSpaceItems.push_back(std::make_pair("copyWorkers", &copyWorkers_));
183  infoSpaceItems.push_back(std::make_pair("injectWorkers", &injectWorkers_));
184  infoSpaceItems.push_back(std::make_pair("sataBeastStatus", &sataBeastStatus_));
185  infoSpaceItems.push_back(std::make_pair("numberOfDisks", &numberOfDisks_));
186  infoSpaceItems.push_back(std::make_pair("diskPaths", &diskPaths_));
187  infoSpaceItems.push_back(std::make_pair("totalDiskSpace", &totalDiskSpace_));
188  infoSpaceItems.push_back(std::make_pair("usedDiskSpace", &usedDiskSpace_));
189  }
190 
191 
193  {
194  Stats stats;
195  getStats(stats);
196 
197  if (stats.numberOfCopyWorkers > 0)
198  copyWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfCopyWorkers);
199  else
200  copyWorkers_ = 0;
201 
202  if (stats.numberOfInjectWorkers > 0)
203  injectWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfInjectWorkers);
204  else
205  injectWorkers_ = 0;
206 
209 
210  const size_t statsCount = stats.diskUsageStatsList.size();
211  const size_t infospaceCount = diskPaths_.size();
212 
213  if ( statsCount != infospaceCount )
214  {
215  diskPaths_.resize(statsCount);
216  totalDiskSpace_.resize(statsCount);
217  usedDiskSpace_.resize(statsCount);
218  }
219 
220  for (size_t i=0; i < statsCount; ++i)
221  {
222  diskPaths_.at(i) = static_cast<xdata::String>(stats.diskUsageStatsList.at(i)->pathName);
223  totalDiskSpace_.at(i) = static_cast<xdata::UnsignedInteger32>(
224  static_cast<unsigned int>(stats.diskUsageStatsList.at(i)->diskSize * 1024));
225  usedDiskSpace_.at(i) = static_cast<xdata::UnsignedInteger32>(
226  static_cast<unsigned int>(stats.diskUsageStatsList.at(i)->absDiskUsage * 1024));
227  }
228 
229  calcDiskUsage();
230  }
231 
232 
234  {
235  boost::mutex::scoped_lock sl(diskUsageListMutex_);
236 
237  for ( DiskUsagePtrList::iterator it = diskUsageList_.begin(),
238  itEnd = diskUsageList_.end();
239  it != itEnd;
240  ++it)
241  {
242  retrieveDiskSize(*it);
243  }
244  }
245 
247  {
248  #if __APPLE__
249  struct statfs buf;
250  int retVal = statfs(diskUsage->pathName.c_str(), &buf);
251  #else
252  struct statfs64 buf;
253  int retVal = statfs64(diskUsage->pathName.c_str(), &buf);
254  #endif
255  if(retVal==0) {
256  unsigned int blksize = buf.f_bsize;
257  diskUsage->diskSize =
258  static_cast<double>(buf.f_blocks * blksize) / 1024 / 1024 / 1024;
259  diskUsage->absDiskUsage =
260  diskUsage->diskSize -
261  static_cast<double>(buf.f_bavail * blksize) / 1024 / 1024 / 1024;
262  diskUsage->relDiskUsage = (100 * (diskUsage->absDiskUsage / diskUsage->diskSize));
263  if ( diskUsage->relDiskUsage > dwParams_.highWaterMark_ )
264  {
265  emitDiskSpaceAlarm(diskUsage);
266  }
267  else if ( diskUsage->relDiskUsage < dwParams_.highWaterMark_*0.95 )
268  // do not change alarm level if we are close to the high water mark
269  {
270  revokeDiskAlarm(diskUsage);
271  }
272  }
273  else
274  {
275  emitDiskAlarm(diskUsage, errno);
276  diskUsage->diskSize = -1;
277  diskUsage->absDiskUsage = -1;
278  diskUsage->relDiskUsage = -1;
279  }
280  }
281 
282 
284  // do NOT use errno here
285  {
286  std::string msg;
287 
288  switch(e)
289  {
290  case ENOENT :
291  diskUsage->alarmState = AlarmHandler::ERROR;
292  msg = "Cannot access " + diskUsage->pathName + ". Is it mounted?";
293  break;
294 
295  default :
296  diskUsage->alarmState = AlarmHandler::WARNING;
297  msg = "Failed to retrieve disk space information for " + diskUsage->pathName + ":"
298  + strerror(e);
299  }
300 
301  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, msg);
302  alarmHandler_->notifySentinel(diskUsage->alarmState, ex);
303  }
304 
305 
307  {
308  if ( diskUsage->relDiskUsage > dwParams_.failHighWaterMark_ )
309  {
310  failIfImportantDisk(diskUsage);
311  }
312 
313  diskUsage->alarmState = AlarmHandler::WARNING;
314 
315  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, diskUsage->toString());
316  alarmHandler_->raiseAlarm(diskUsage->pathName, diskUsage->alarmState, ex);
317  }
318 
319 
321  {
322  // do not fail if the disk is one of the other disks
323  DiskWritingParams::OtherDiskPaths::const_iterator begin =
324  dwParams_.otherDiskPaths_.begin();
325  DiskWritingParams::OtherDiskPaths::const_iterator end =
327  if ( std::find(begin, end, diskUsage->pathName) != end ) return;
328 
329  diskUsage->alarmState = AlarmHandler::FATAL;
330  XCEPT_RAISE(stor::exception::DiskSpaceAlarm, diskUsage->toString());
331  }
332 
333 
335  {
336  diskUsage->alarmState = AlarmHandler::OKAY;
337 
338  alarmHandler_->revokeAlarm(diskUsage->pathName);
339  }
340 
341 
343  {
344  struct passwd* passwd = getpwnam(rmParams_.copyWorkers_.user_.c_str());
345  if (passwd)
346  {
349  }
350  else
351  {
353  }
354 
356  {
358  }
359  }
360 
361 
363  {
364  const std::string alarmName = "CopyWorkers";
365 
367  {
368  std::ostringstream msg;
369  msg << "Expected " << rmParams_.copyWorkers_.expectedCount_ <<
370  " running CopyWorkers, but found " <<
371  numberOfCopyWorkers_ << ".";
372  XCEPT_DECLARE(stor::exception::CopyWorkers, ex, msg.str());
373  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
374  }
375  else
376  {
377  alarmHandler_->revokeAlarm(alarmName);
378  }
379  }
380 
381 
383  {
384  struct passwd* passwd = getpwnam(rmParams_.injectWorkers_.user_.c_str());
385  if (passwd)
386  {
388  }
389  else
390  {
392  }
393 
394  if (
397  )
398  {
400  }
401  }
402 
403 
405  {
406  const std::string alarmName = "InjectWorkers";
407 
409  {
410  std::ostringstream msg;
411  msg << "Expected " << rmParams_.injectWorkers_.expectedCount_ <<
412  " running InjectWorkers, but found " <<
413  numberOfInjectWorkers_ << ".";
414  XCEPT_DECLARE(stor::exception::InjectWorkers, ex, msg.str());
415  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
416  }
417  else
418  {
419  alarmHandler_->revokeAlarm(alarmName);
420  }
421  }
422 
423 
425  {
426  SATABeasts sataBeasts;
427  if ( getSataBeasts(sataBeasts) )
428  {
429  for (
430  SATABeasts::const_iterator it = sataBeasts.begin(),
431  itEnd= sataBeasts.end();
432  it != itEnd;
433  ++it
434  )
435  {
436  checkSataBeast(*it);
437  }
438  }
439  else
440  {
442  }
443  }
444 
445 
447  {
448  if (! alarmParams_.isProductionSystem_) return false;
449 
450  std::ifstream in;
451  in.open( "/proc/mounts" );
452 
453  if ( ! in.is_open() ) return false;
454 
455  std::string line;
456  while( getline(in,line) )
457  {
458  size_t pos = line.find("sata");
459  if ( pos != std::string::npos )
460  {
461  std::ostringstream host;
462  host << "satab-c2c"
463  << std::setw(2) << std::setfill('0')
464  << line.substr(pos+4,1)
465  << "-"
466  << std::setw(2) << std::setfill('0')
467  << line.substr(pos+5,1);
468  sataBeasts.insert(host.str());
469  }
470  }
471  return !sataBeasts.empty();
472  }
473 
474 
475  void ResourceMonitorCollection::checkSataBeast(const std::string& sataBeast)
476  {
477  if ( ! (checkSataDisks(sataBeast,"-00.cms") || checkSataDisks(sataBeast,"-10.cms")) )
478  {
479  XCEPT_DECLARE(stor::exception::SataBeast, ex,
480  "Failed to connect to SATA beast " + sataBeast);
481  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
482 
483  latchedSataBeastStatus_ = 99999;
484  }
485  }
486 
487 
489  (
490  const std::string& sataBeast,
491  const std::string& hostSuffix
492  )
493  {
494  CurlInterface curlInterface;
495  CurlInterface::Content content;
496 
497  // Do not try to connect if we have no user name
498  if ( rmParams_.sataUser_.empty() ) return true;
499 
500  const CURLcode returnCode =
501  curlInterface.getContent(
502  "http://" + sataBeast + hostSuffix + "/status.asp",rmParams_.sataUser_,
503  content
504  );
505 
506  if (returnCode == CURLE_OK)
507  {
508  updateSataBeastStatus(sataBeast, std::string(&content[0]));
509  return true;
510  }
511  else
512  {
513  std::ostringstream msg;
514  msg << "Failed to connect to SATA controller "
515  << sataBeast << hostSuffix
516  << " with user name '" << rmParams_.sataUser_
517  << "': " << std::string(&content[0]);
518  XCEPT_DECLARE(stor::exception::SataBeast, ex, msg.str());
519  alarmHandler_->notifySentinel(AlarmHandler::WARNING, ex);
520 
521  return false;
522  }
523  }
524 
525 
527  (
528  const std::string& sataBeast,
529  const std::string& content
530  )
531  {
532  boost::regex failedEntry(">([^<]* has failed[^<]*)");
533  boost::regex failedDisk("Hard disk([[:digit:]]+)");
534  boost::regex failedController("RAID controller ([[:digit:]]+)");
535  boost::match_results<std::string::const_iterator> matchedEntry, matchedCause;
536  boost::match_flag_type flags = boost::match_default;
537 
538  std::string::const_iterator start = content.begin();
539  std::string::const_iterator end = content.end();
540 
541  unsigned int newSataBeastStatus = 0;
542 
543  while( regex_search(start, end, matchedEntry, failedEntry, flags) )
544  {
545  std::string errorMsg = matchedEntry[1];
546  XCEPT_DECLARE(stor::exception::SataBeast, ex, sataBeast+": "+errorMsg);
547  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
548 
549  // find what failed
550  if ( regex_search(errorMsg, matchedCause, failedDisk) )
551  {
552  // Update the number of failed disks
553  ++newSataBeastStatus;
554  }
555  else if ( regex_search(errorMsg, matchedCause, failedController) )
556  {
557  // Update the number of failed controllers
558  newSataBeastStatus += 100;
559  }
560  else
561  {
562  // Unknown failure
563  newSataBeastStatus += 1000;
564  }
565 
566  // update search position:
567  start = matchedEntry[0].second;
568  // update flags:
569  flags |= boost::match_prev_avail;
570  flags |= boost::match_not_bob;
571  }
572 
573  latchedSataBeastStatus_ = newSataBeastStatus;
574 
575  if (latchedSataBeastStatus_ == 0) // no more problems
576  alarmHandler_->revokeAlarm(sataBeast);
577 
578  }
579 
580 
namespace {

  // Selector for scandir(): returns non-zero for directory entries whose
  // name matches the shell pattern "[1-9]*" (i.e. starts with a digit
  // 1-9), and zero for everything else.
  int filter(const struct dirent *dir)
  {
    return !fnmatch("[1-9]*", dir->d_name, 0);
  }

  // Returns true if stat() succeeds on filename and the file's owner
  // equals uid; returns false if stat() fails or the owner differs.
  bool matchUid(const std::string& filename, const uid_t& uid)
  {
    struct stat filestat;
    const int result = stat(filename.c_str(), &filestat);
    return (result == 0 && filestat.st_uid == uid);
  }

  // Returns true if /proc/<pid>/stat can be read and reports a parent
  // process id of 1. Returns false for a null pid, an unreadable file,
  // a file shorter than 80 bytes, or content that cannot be parsed.
  //
  // Adapted from procps::minimal::stat2proc.
  bool isMaster(const char* pid)
  {
    if ( ! pid ) return false;

    std::string statfile("/proc/");
    statfile += pid;
    statfile += "/stat";

    // The stream closes the file on every return path.
    std::ifstream in( statfile.c_str() );
    if ( ! in.is_open() ) return false;

    char buf[800]; // about 40 fields, 64-bit decimal is about 20 chars
    in.read(buf, sizeof buf - 1);
    const std::streamsize num = in.gcount();
    if ( num < 80 ) return false;
    buf[num] = '\0';

    // split into "PID (cmd" and "<rest>"; the command name may itself
    // contain ')', hence the search from the end
    const char* tmp = strrchr(buf, ')');
    if ( ! tmp ) return false;
    // ') %c ' must fit into what was actually read
    if ( (buf + num) - tmp < 4 ) return false;

    int ppid = 0;
    const int scanned = sscanf(tmp + 4, // skip ') %c '
      "%d", &ppid);
    return ( scanned == 1 && ppid == 1 ); // scan succeeded and parent pid is 1
  }

  // Reads the file named by cmdline as a sequence of NUL-separated
  // fields, joins them with a single space after each field, and returns
  // true if name occurs in the joined text. If the file cannot be opened
  // the joined text is empty.
  bool grep(const std::string& cmdline, const std::string& name)
  {
    std::ifstream in;
    in.open( cmdline.c_str() );

    std::string line;
    if ( in.is_open() )
    {
      std::string tmp;
      while( getline(in,tmp,'\0') )
      {
        line.append(tmp);
        line.append(" ");
      }
      in.close();
    }

    return ( line.find(name) != std::string::npos );
  }
}
635 
636 
638  (
639  const std::string& processName,
640  const int& uid
641  )
642  {
643  int count(0);
644  struct dirent **namelist;
645  int n;
646 
647  #if __APPLE__
648  return -1;
649  #else
650  n = scandir("/proc", &namelist, filter, 0);
651  #endif
652  if (n < 0) return -1;
653 
654  while(n--)
655  {
656  std::ostringstream cmdline;
657  cmdline << "/proc/" << namelist[n]->d_name << "/cmdline";
658 
659  if ( grep(cmdline.str(), processName) &&
660  (uid < 0 || matchUid(cmdline.str(), uid)) &&
661  isMaster(namelist[n]->d_name) )
662  {
663  ++count;
664  }
665  free(namelist[n]);
666  }
667  free(namelist);
668 
669  return count;
670  }
671 
672 
674  {
675  std::ostringstream msg;
676  msg << std::fixed << std::setprecision(1) <<
677  "Disk space usage for " << pathName <<
678  " is " << relDiskUsage << "% (" <<
679  absDiskUsage << "GB of " <<
680  diskSize << "GB).";
681  return msg.str();
682  }
683 
684 } // namespace stor
685 
int i
Definition: DBlmapReader.cc:9
ResourceMonitorCollection(const utils::Duration_t &updateInterval, AlarmHandlerPtr)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
OtherDiskPaths otherDiskPaths_
Definition: Configuration.h:48
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:7
boost::shared_ptr< DiskUsage > DiskUsagePtr
const std::string * pathName() const
Definition: HLTadd.h:31
boost::posix_time::time_duration Duration_t
Definition: Utils.h:41
const T & max(const T &a, const T &b)
xdata::Vector< xdata::UnsignedInteger32 > usedDiskSpace_
void configureDisks(DiskWritingParams const &)
tuple result
Definition: query.py:137
bool checkSataDisks(const std::string &sataBeast, const std::string &hostSuffix)
#define end
Definition: vmac.h:38
xdata::Vector< xdata::UnsignedInteger32 > totalDiskSpace_
CURLcode getContent(const std::string &url, const std::string &user, Content &content)
boost::shared_ptr< AlarmHandler > AlarmHandlerPtr
Definition: AlarmHandler.h:91
virtual void do_appendInfoSpaceItems(InfoSpaceItems &)
string host
Definition: query.py:114
void checkSataBeast(const std::string &sataBeast)
xdata::Vector< xdata::String > diskPaths_
tuple filter
USE THIS FOR SKIMMED TRACKS process.p = cms.Path(process.hltLevel1GTSeed*process.skimming*process.offlineBeamSpot*process.TrackRefitter2) OTHERWISE USE THIS.
Definition: align_tpl.py:86
bool getSataBeasts(SATABeasts &sataBeasts)
boost::shared_ptr< DiskUsageStats > DiskUsageStatsPtr
long long int num
Definition: procUtils.cc:71
std::vector< std::pair< std::string, xdata::Serializable * > > InfoSpaceItems
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
#define begin
Definition: vmac.h:31
void updateSataBeastStatus(const std::string &sataBeast, const std::string &content)
tuple filename
Definition: lut2db_cfg.py:20
void emitDiskAlarm(DiskUsagePtr, error_t)
void configureResources(ResourceMonitorParams const &)
dbl *** dir
Definition: mlp_gen.cc:35
int getProcessCount(const std::string &processName, const int &uid=-1)
std::vector< char > Content
Definition: CurlInterface.h:27