CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
ResourceMonitorCollection.cc
Go to the documentation of this file.
1 // $Id: ResourceMonitorCollection.cc,v 1.43 2011/04/21 14:00:05 mommsen Exp $
3 
4 #include <stdio.h>
5 #include <string>
6 #include <sstream>
7 #include <iomanip>
8 #include <sys/types.h>
9 #include <sys/stat.h>
10 #ifdef __APPLE__
11 #include <sys/param.h>
12 #include <sys/mount.h>
13 #else
14 #include <sys/statfs.h>
15 #endif
16 #include <fcntl.h>
17 #include <dirent.h>
18 #include <fnmatch.h>
19 #include <pwd.h>
20 #include <fstream>
21 #include <algorithm>
22 
23 #include <boost/bind.hpp>
24 #include <boost/regex.hpp>
25 
30 
31 
32 namespace stor {
33 
35  (
36  const utils::Duration_t& updateInterval,
38  ) :
39  MonitorCollection(updateInterval),
40  updateInterval_(updateInterval),
41  alarmHandler_(ah),
42  numberOfCopyWorkers_(-1),
43  numberOfInjectWorkers_(-1),
44  nLogicalDisks_(0),
45  latchedSataBeastStatus_(-1)
46  {}
47 
48 
50  {
51  boost::mutex::scoped_lock sl(diskUsageListMutex_);
52 
53  dwParams_ = dwParams;
54 
55  nLogicalDisks_ = std::max(dwParams.nLogicalDisk_, 1);
56  diskUsageList_.clear();
57  diskUsageList_.reserve(nLogicalDisks_+dwParams.otherDiskPaths_.size()+1);
58 
59  for (unsigned int i=0; i<nLogicalDisks_; ++i) {
60 
61  std::ostringstream pathName;
62  pathName << dwParams.filePath_;
63  if( dwParams.nLogicalDisk_ > 0 ) {
64  pathName << "/" << std::setfill('0') << std::setw(2) << i;
65  }
66  addDisk(pathName.str());
67  }
68  addDisk(dwParams.dbFilePath_);
69 
71  {
72  addOtherDisks();
73  }
74  }
75 
76 
77  void ResourceMonitorCollection::addDisk(const std::string& pathname)
78  {
79  if ( pathname.empty() ) return;
80 
81  DiskUsagePtr diskUsage( new DiskUsage() );
82  diskUsage->pathName = pathname;
83  retrieveDiskSize(diskUsage);
84  diskUsageList_.push_back(diskUsage);
85  }
86 
87 
89  {
90  for ( DiskWritingParams::OtherDiskPaths::const_iterator
91  it = dwParams_.otherDiskPaths_.begin(),
92  itEnd = dwParams_.otherDiskPaths_.end();
93  it != itEnd;
94  ++it)
95  {
96  addDisk(*it);
97  }
98  }
99 
100 
102  (
103  ResourceMonitorParams const& rmParams
104  )
105  {
106  rmParams_ = rmParams;
107  }
108 
109 
111  (
112  AlarmParams const& alarmParams
113  )
114  {
115  alarmParams_ = alarmParams;
116  }
117 
118 
120  {
121  getDiskStats(stats);
122 
125 
127  }
128 
129 
131  {
132  boost::mutex::scoped_lock sl(diskUsageListMutex_);
133 
134  stats.diskUsageStatsList.clear();
135  stats.diskUsageStatsList.reserve(diskUsageList_.size());
136  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
137  itEnd = diskUsageList_.end();
138  it != itEnd;
139  ++it)
140  {
141  DiskUsageStatsPtr diskUsageStats(new DiskUsageStats);
142  diskUsageStats->diskSize = (*it)->diskSize;
143  diskUsageStats->absDiskUsage = (*it)->absDiskUsage;
144  diskUsageStats->relDiskUsage = (*it)->relDiskUsage;
145  diskUsageStats->pathName = (*it)->pathName;
146  diskUsageStats->alarmState = (*it)->alarmState;
147  stats.diskUsageStatsList.push_back(diskUsageStats);
148  }
149  }
150 
151 
153  {
154  calcDiskUsage();
157  checkSataBeasts();
158  }
159 
160 
162  {
166 
167  boost::mutex::scoped_lock sl(diskUsageListMutex_);
168  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
169  itEnd = diskUsageList_.end();
170  it != itEnd;
171  ++it)
172  {
173  (*it)->absDiskUsage = -1;
174  (*it)->relDiskUsage = -1;
175  (*it)->alarmState = AlarmHandler::OKAY;
176  }
177  }
178 
179 
181  {
182  infoSpaceItems.push_back(std::make_pair("copyWorkers", &copyWorkers_));
183  infoSpaceItems.push_back(std::make_pair("injectWorkers", &injectWorkers_));
184  infoSpaceItems.push_back(std::make_pair("sataBeastStatus", &sataBeastStatus_));
185  infoSpaceItems.push_back(std::make_pair("numberOfDisks", &numberOfDisks_));
186  infoSpaceItems.push_back(std::make_pair("diskPaths", &diskPaths_));
187  infoSpaceItems.push_back(std::make_pair("totalDiskSpace", &totalDiskSpace_));
188  infoSpaceItems.push_back(std::make_pair("usedDiskSpace", &usedDiskSpace_));
189  }
190 
191 
193  {
194  Stats stats;
195  getStats(stats);
196 
197  if (stats.numberOfCopyWorkers > 0)
198  copyWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfCopyWorkers);
199  else
200  copyWorkers_ = 0;
201 
202  if (stats.numberOfInjectWorkers > 0)
203  injectWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfInjectWorkers);
204  else
205  injectWorkers_ = 0;
206 
209 
210  diskPaths_.clear();
211  totalDiskSpace_.clear();
212  usedDiskSpace_.clear();
213 
214  diskPaths_.reserve(stats.diskUsageStatsList.size());
215  totalDiskSpace_.reserve(stats.diskUsageStatsList.size());
216  usedDiskSpace_.reserve(stats.diskUsageStatsList.size());
217 
218  for (DiskUsageStatsPtrList::const_iterator
219  it = stats.diskUsageStatsList.begin(),
220  itEnd = stats.diskUsageStatsList.end();
221  it != itEnd;
222  ++it)
223  {
224  diskPaths_.push_back(
225  static_cast<xdata::String>( (*it)->pathName )
226  );
227  totalDiskSpace_.push_back(
228  static_cast<xdata::UnsignedInteger32>(
229  static_cast<unsigned int>( (*it)->diskSize * 1024 )
230  )
231  );
232  usedDiskSpace_.push_back(
233  static_cast<xdata::UnsignedInteger32>(
234  static_cast<unsigned int>( (*it)->absDiskUsage * 1024 )
235  )
236  );
237  }
238 
239  calcDiskUsage();
240  }
241 
242 
244  {
245  boost::mutex::scoped_lock sl(diskUsageListMutex_);
246 
247  for ( DiskUsagePtrList::iterator it = diskUsageList_.begin(),
248  itEnd = diskUsageList_.end();
249  it != itEnd;
250  ++it)
251  {
252  retrieveDiskSize(*it);
253  }
254  }
255 
257  {
258  #if __APPLE__
259  struct statfs buf;
260  int retVal = statfs(diskUsage->pathName.c_str(), &buf);
261  #else
262  struct statfs64 buf;
263  int retVal = statfs64(diskUsage->pathName.c_str(), &buf);
264  #endif
265  if(retVal==0) {
266  unsigned int blksize = buf.f_bsize;
267  diskUsage->diskSize =
268  static_cast<double>(buf.f_blocks * blksize) / 1024 / 1024 / 1024;
269  diskUsage->absDiskUsage =
270  diskUsage->diskSize -
271  static_cast<double>(buf.f_bavail * blksize) / 1024 / 1024 / 1024;
272  diskUsage->relDiskUsage = (100 * (diskUsage->absDiskUsage / diskUsage->diskSize));
273  if ( diskUsage->relDiskUsage > dwParams_.highWaterMark_ )
274  {
275  emitDiskSpaceAlarm(diskUsage);
276  }
277  else if ( diskUsage->relDiskUsage < dwParams_.highWaterMark_*0.95 )
278  // do not change alarm level if we are close to the high water mark
279  {
280  revokeDiskAlarm(diskUsage);
281  }
282  }
283  else
284  {
285  emitDiskAlarm(diskUsage, errno);
286  diskUsage->diskSize = -1;
287  diskUsage->absDiskUsage = -1;
288  diskUsage->relDiskUsage = -1;
289  }
290  }
291 
292 
294  // do NOT use errno here
295  {
296  std::string msg;
297 
298  switch(e)
299  {
300  case ENOENT :
301  diskUsage->alarmState = AlarmHandler::ERROR;
302  msg = "Cannot access " + diskUsage->pathName + ". Is it mounted?";
303  break;
304 
305  default :
306  diskUsage->alarmState = AlarmHandler::WARNING;
307  msg = "Failed to retrieve disk space information for " + diskUsage->pathName + ":"
308  + strerror(e);
309  }
310 
311  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, msg);
312  alarmHandler_->notifySentinel(diskUsage->alarmState, ex);
313  }
314 
315 
317  {
318  if ( diskUsage->relDiskUsage > dwParams_.failHighWaterMark_ )
319  {
320  failIfImportantDisk(diskUsage);
321  }
322 
323  diskUsage->alarmState = AlarmHandler::WARNING;
324 
325  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, diskUsage->toString());
326  alarmHandler_->raiseAlarm(diskUsage->pathName, diskUsage->alarmState, ex);
327  }
328 
329 
331  {
332  // do not fail if the disk is one of the other disks
333  DiskWritingParams::OtherDiskPaths::const_iterator begin =
334  dwParams_.otherDiskPaths_.begin();
335  DiskWritingParams::OtherDiskPaths::const_iterator end =
337  if ( std::find(begin, end, diskUsage->pathName) != end ) return;
338 
339  diskUsage->alarmState = AlarmHandler::FATAL;
340  XCEPT_RAISE(stor::exception::DiskSpaceAlarm, diskUsage->toString());
341  }
342 
343 
345  {
346  diskUsage->alarmState = AlarmHandler::OKAY;
347 
348  alarmHandler_->revokeAlarm(diskUsage->pathName);
349  }
350 
351 
353  {
354  struct passwd* passwd = getpwnam(rmParams_.copyWorkers_.user_.c_str());
355  if (passwd)
356  {
359  }
360  else
361  {
363  }
364 
366  {
368  }
369  }
370 
371 
373  {
374  const std::string alarmName = "CopyWorkers";
375 
377  {
378  std::ostringstream msg;
379  msg << "Expected " << rmParams_.copyWorkers_.expectedCount_ <<
380  " running CopyWorkers, but found " <<
381  numberOfCopyWorkers_ << ".";
382  XCEPT_DECLARE(stor::exception::CopyWorkers, ex, msg.str());
383  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
384  }
385  else
386  {
387  alarmHandler_->revokeAlarm(alarmName);
388  }
389  }
390 
391 
393  {
394  struct passwd* passwd = getpwnam(rmParams_.injectWorkers_.user_.c_str());
395  if (passwd)
396  {
398  }
399  else
400  {
402  }
403 
404  if (
407  )
408  {
410  }
411  }
412 
413 
415  {
416  const std::string alarmName = "InjectWorkers";
417 
419  {
420  std::ostringstream msg;
421  msg << "Expected " << rmParams_.injectWorkers_.expectedCount_ <<
422  " running InjectWorkers, but found " <<
423  numberOfInjectWorkers_ << ".";
424  XCEPT_DECLARE(stor::exception::InjectWorkers, ex, msg.str());
425  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
426  }
427  else
428  {
429  alarmHandler_->revokeAlarm(alarmName);
430  }
431  }
432 
433 
435  {
436  SATABeasts sataBeasts;
437  if ( getSataBeasts(sataBeasts) )
438  {
439  for (
440  SATABeasts::const_iterator it = sataBeasts.begin(),
441  itEnd= sataBeasts.end();
442  it != itEnd;
443  ++it
444  )
445  {
446  checkSataBeast(*it);
447  }
448  }
449  else
450  {
452  }
453  }
454 
455 
457  {
458  if (! alarmParams_.isProductionSystem_) return false;
459 
460  std::ifstream in;
461  in.open( "/proc/mounts" );
462 
463  if ( ! in.is_open() ) return false;
464 
465  std::string line;
466  while( getline(in,line) )
467  {
468  size_t pos = line.find("sata");
469  if ( pos != std::string::npos )
470  {
471  std::ostringstream host;
472  host << "satab-c2c"
473  << std::setw(2) << std::setfill('0')
474  << line.substr(pos+4,1)
475  << "-"
476  << std::setw(2) << std::setfill('0')
477  << line.substr(pos+5,1);
478  sataBeasts.insert(host.str());
479  }
480  }
481  return !sataBeasts.empty();
482  }
483 
484 
485  void ResourceMonitorCollection::checkSataBeast(const std::string& sataBeast)
486  {
487  if ( ! (checkSataDisks(sataBeast,"-00.cms") || checkSataDisks(sataBeast,"-10.cms")) )
488  {
489  XCEPT_DECLARE(stor::exception::SataBeast, ex,
490  "Failed to connect to SATA beast " + sataBeast);
491  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
492 
493  latchedSataBeastStatus_ = 99999;
494  }
495  }
496 
497 
499  (
500  const std::string& sataBeast,
501  const std::string& hostSuffix
502  )
503  {
504  CurlInterface curlInterface;
505  CurlInterface::Content content;
506 
507  // Do not try to connect if we have no user name
508  if ( rmParams_.sataUser_.empty() ) return true;
509 
510  const CURLcode returnCode =
511  curlInterface.getContent(
512  "http://" + sataBeast + hostSuffix + "/status.asp",rmParams_.sataUser_,
513  content
514  );
515 
516  if (returnCode == CURLE_OK)
517  {
518  updateSataBeastStatus(sataBeast, std::string(&content[0]));
519  return true;
520  }
521  else
522  {
523  std::ostringstream msg;
524  msg << "Failed to connect to SATA controller "
525  << sataBeast << hostSuffix
526  << ": " << std::string(&content[0]);
527  XCEPT_DECLARE(stor::exception::SataBeast, ex, msg.str());
528  alarmHandler_->notifySentinel(AlarmHandler::WARNING, ex);
529 
530  return false;
531  }
532  }
533 
534 
536  (
537  const std::string& sataBeast,
538  const std::string& content
539  )
540  {
541  boost::regex failedEntry(">([^<]* has failed[^<]*)");
542  boost::regex failedDisk("Hard disk([[:digit:]]+)");
543  boost::regex failedController("RAID controller ([[:digit:]]+)");
544  boost::match_results<std::string::const_iterator> matchedEntry, matchedCause;
545  boost::match_flag_type flags = boost::match_default;
546 
547  std::string::const_iterator start = content.begin();
548  std::string::const_iterator end = content.end();
549 
550  unsigned int newSataBeastStatus = 0;
551 
552  while( regex_search(start, end, matchedEntry, failedEntry, flags) )
553  {
554  std::string errorMsg = matchedEntry[1];
555  XCEPT_DECLARE(stor::exception::SataBeast, ex, sataBeast+": "+errorMsg);
556  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
557 
558  // find what failed
559  if ( regex_search(errorMsg, matchedCause, failedDisk) )
560  {
561  // Update the number of failed disks
562  ++newSataBeastStatus;
563  }
564  else if ( regex_search(errorMsg, matchedCause, failedController) )
565  {
566  // Update the number of failed controllers
567  newSataBeastStatus += 100;
568  }
569  else
570  {
571  // Unknown failure
572  newSataBeastStatus += 1000;
573  }
574 
575  // update search position:
576  start = matchedEntry[0].second;
577  // update flags:
578  flags |= boost::match_prev_avail;
579  flags |= boost::match_not_bob;
580  }
581 
582  latchedSataBeastStatus_ = newSataBeastStatus;
583 
584  if (latchedSataBeastStatus_ == 0) // no more problems
585  alarmHandler_->revokeAlarm(sataBeast);
586 
587  }
588 
589 
namespace {

  /**
   * scandir() selector: accept only directory entries whose name starts
   * with a digit 1-9 (the glob "[1-9]*").
   *
   * @return non-zero if the entry name matches, 0 otherwise
   */
  int filter(const struct dirent *dir)
  {
    return !fnmatch("[1-9]*", dir->d_name, 0);
  }

  /**
   * Check whether the file is owned by the given user.
   *
   * @param filename  path handed to stat()
   * @param uid       user id to compare against the file's st_uid
   * @return true only if stat() succeeds and the owner equals uid
   */
  bool matchUid(const std::string& filename, const uid_t& uid)
  {
    struct stat filestat;
    int result = stat(filename.c_str(), &filestat);
    return (result == 0 && filestat.st_uid == uid);
  }

  /**
   * Check whether the process with the given pid has parent pid 1.
   * Adapted from procps::minimal::stat2proc.
   *
   * Reads /proc/<pid>/stat and parses the field that follows the
   * "(cmd) <state> " prefix as the parent pid.
   *
   * @param pid  process id as a string (a /proc directory name)
   * @return true if the stat file could be read and parsed and the parent
   *         pid is 1; false on any failure
   */
  bool isMaster(const char* pid)
  {
    if ( !pid ) return false;

    // Build the path directly; it must never be interpreted as a format string
    const std::string statPath = std::string("/proc/") + pid + "/stat";

    const int fd = open(statPath.c_str(), O_RDONLY, 0);
    if ( fd == -1 ) return false;

    char buf[800]; // about 40 fields, 64-bit decimal is about 20 chars
    const ssize_t num = read(fd, buf, sizeof buf - 1);
    // Close right after the read so no return path can leak the descriptor
    close(fd);

    if ( num < 80 ) return false; // read error or implausibly short content
    buf[num] = '\0';

    // split into "PID (cmd" and "<rest>"
    const char* tmp = strrchr(buf, ')');
    if ( !tmp ) return false;

    // skip ') %c ' -- but only if that stays within the data that was read
    if ( (buf + num) - tmp < 4 ) return false;

    int ppid = 0;
    const int scanned = sscanf(tmp + 4, "%d", &ppid);
    return ( scanned == 1 && ppid == 1 ); // scan succeeded and parent pid is 1
  }

  /**
   * Check whether a NUL-separated file (e.g. /proc/<pid>/cmdline) contains
   * the given text.
   *
   * The NUL-separated tokens are joined with a single blank appended after
   * each token, and the result is searched for name. If the file cannot be
   * opened, the search runs on an empty string.
   *
   * @param cmdline  path of the file to read
   * @param name     text to look for
   * @return true if name occurs in the joined content
   */
  bool grep(const std::string& cmdline, const std::string& name)
  {
    std::ifstream in;
    in.open( cmdline.c_str() );

    std::string line;
    if ( in.is_open() )
    {
      std::string tmp;
      while( getline(in,tmp,'\0') )
      {
        line.append(tmp);
        line.append(" ");
      }
      in.close();
    }

    return ( line.find(name) != std::string::npos );
  }
}
644 
645 
647  (
648  const std::string& processName,
649  const int& uid
650  )
651  {
652  int count(0);
653  struct dirent **namelist;
654  int n;
655 
656  #if __APPLE__
657  return -1;
658  #else
659  n = scandir("/proc", &namelist, filter, 0);
660  #endif
661  if (n < 0) return -1;
662 
663  while(n--)
664  {
665  std::ostringstream cmdline;
666  cmdline << "/proc/" << namelist[n]->d_name << "/cmdline";
667 
668  if ( grep(cmdline.str(), processName) &&
669  (uid < 0 || matchUid(cmdline.str(), uid)) &&
670  isMaster(namelist[n]->d_name) )
671  {
672  ++count;
673  }
674  free(namelist[n]);
675  }
676  free(namelist);
677 
678  return count;
679  }
680 
681 
683  {
684  std::ostringstream msg;
685  msg << std::fixed << std::setprecision(1) <<
686  "Disk space usage for " << pathName <<
687  " is " << relDiskUsage << "% (" <<
688  absDiskUsage << "GB of " <<
689  diskSize << "GB).";
690  return msg.str();
691  }
692 
693 } // namespace stor
694 
int i
Definition: DBlmapReader.cc:9
ResourceMonitorCollection(const utils::Duration_t &updateInterval, AlarmHandlerPtr)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
OtherDiskPaths otherDiskPaths_
Definition: Configuration.h:48
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:7
boost::shared_ptr< DiskUsage > DiskUsagePtr
const std::string * pathName() const
Definition: HLTadd.h:31
boost::posix_time::time_duration Duration_t
Definition: Utils.h:41
const T & max(const T &a, const T &b)
xdata::Vector< xdata::UnsignedInteger32 > usedDiskSpace_
void configureDisks(DiskWritingParams const &)
tuple result
Definition: query.py:137
bool checkSataDisks(const std::string &sataBeast, const std::string &hostSuffix)
#define end
Definition: vmac.h:38
xdata::Vector< xdata::UnsignedInteger32 > totalDiskSpace_
CURLcode getContent(const std::string &url, const std::string &user, Content &content)
boost::shared_ptr< AlarmHandler > AlarmHandlerPtr
Definition: AlarmHandler.h:91
virtual void do_appendInfoSpaceItems(InfoSpaceItems &)
string host
Definition: query.py:114
void checkSataBeast(const std::string &sataBeast)
xdata::Vector< xdata::String > diskPaths_
tuple filter
USE THIS FOR SKIMMED TRACKS process.p = cms.Path(process.hltLevel1GTSeed*process.skimming*process.offlineBeamSpot*process.TrackRefitter2) OTHERWISE USE THIS.
Definition: align_tpl.py:86
bool getSataBeasts(SATABeasts &sataBeasts)
boost::shared_ptr< DiskUsageStats > DiskUsageStatsPtr
long long int num
Definition: procUtils.cc:71
std::vector< std::pair< std::string, xdata::Serializable * > > InfoSpaceItems
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
#define begin
Definition: vmac.h:31
void updateSataBeastStatus(const std::string &sataBeast, const std::string &content)
tuple filename
Definition: lut2db_cfg.py:20
void emitDiskAlarm(DiskUsagePtr, error_t)
void configureResources(ResourceMonitorParams const &)
dbl *** dir
Definition: mlp_gen.cc:35
int getProcessCount(const std::string &processName, const int &uid=-1)
std::vector< char > Content
Definition: CurlInterface.h:27