CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
ResourceMonitorCollection.cc
Go to the documentation of this file.
1 // $Id: ResourceMonitorCollection.cc,v 1.47 2011/11/16 14:32:22 mommsen Exp $
3 
4 #include <stdio.h>
5 #include <string>
6 #include <sstream>
7 #include <iomanip>
8 #include <sys/types.h>
9 #include <sys/stat.h>
10 #include <fcntl.h>
11 #include <dirent.h>
12 #include <fnmatch.h>
13 #include <pwd.h>
14 #include <fstream>
15 #include <algorithm>
16 
17 #include <boost/bind.hpp>
18 #include <boost/regex.hpp>
19 
25 
26 
27 namespace stor {
28 
30  (
31  const utils::Duration_t& updateInterval,
33  ) :
34  MonitorCollection(updateInterval),
35  updateInterval_(updateInterval),
36  alarmHandler_(ah),
37  numberOfCopyWorkers_(-1),
38  numberOfInjectWorkers_(-1),
39  nLogicalDisks_(0),
40  latchedSataBeastStatus_(-1)
41  {}
42 
43 
45  {
46  boost::mutex::scoped_lock sl(diskUsageListMutex_);
47 
48  dwParams_ = dwParams;
49 
50  nLogicalDisks_ = std::max(dwParams.nLogicalDisk_, 1);
51  diskUsageList_.clear();
52  diskUsageList_.reserve(nLogicalDisks_+dwParams.otherDiskPaths_.size()+1);
53 
54  for (unsigned int i=0; i<nLogicalDisks_; ++i) {
55 
56  std::ostringstream pathName;
57  pathName << dwParams.filePath_;
58  if( dwParams.nLogicalDisk_ > 0 ) {
59  pathName << "/" << std::setfill('0') << std::setw(2) << i;
60  }
61  addDisk(pathName.str());
62  }
63  addDisk(dwParams.dbFilePath_);
64 
66  {
67  addOtherDisks();
68  }
69  }
70 
71 
73  {
74  if ( pathname.empty() ) return;
75 
76  DiskUsagePtr diskUsage( new DiskUsage(pathname) );
77  retrieveDiskSize(diskUsage);
78  diskUsageList_.push_back(diskUsage);
79  }
80 
81 
83  {
84  for ( DiskWritingParams::OtherDiskPaths::const_iterator
85  it = dwParams_.otherDiskPaths_.begin(),
86  itEnd = dwParams_.otherDiskPaths_.end();
87  it != itEnd;
88  ++it)
89  {
90  addDisk(*it);
91  }
92  }
93 
94 
96  (
97  ResourceMonitorParams const& rmParams
98  )
99  {
100  rmParams_ = rmParams;
101  }
102 
103 
105  (
106  AlarmParams const& alarmParams
107  )
108  {
109  alarmParams_ = alarmParams;
110  }
111 
112 
114  {
115  getDiskStats(stats);
116 
119 
121  }
122 
123 
125  {
126  boost::mutex::scoped_lock sl(diskUsageListMutex_);
127 
128  stats.diskUsageStatsList.clear();
129  stats.diskUsageStatsList.reserve(diskUsageList_.size());
130  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
131  itEnd = diskUsageList_.end();
132  it != itEnd;
133  ++it)
134  {
135  DiskUsageStatsPtr diskUsageStats(new DiskUsageStats);
136  diskUsageStats->diskSize = (*it)->diskSize_;
137  diskUsageStats->absDiskUsage = (*it)->absDiskUsage_;
138  diskUsageStats->relDiskUsage = (*it)->relDiskUsage_;
139  diskUsageStats->pathName = (*it)->pathName_;
140  diskUsageStats->alarmState = (*it)->alarmState_;
141  stats.diskUsageStatsList.push_back(diskUsageStats);
142  }
143  }
144 
145 
147  {
148  calcDiskUsage();
151  checkSataBeasts();
152  }
153 
154 
156  {
160 
161  boost::mutex::scoped_lock sl(diskUsageListMutex_);
162  for ( DiskUsagePtrList::const_iterator it = diskUsageList_.begin(),
163  itEnd = diskUsageList_.end();
164  it != itEnd;
165  ++it)
166  {
167  if ( ! (*it)->retrievingDiskSize_ )
168  {
169  (*it)->diskSize_ = -1;
170  (*it)->absDiskUsage_ = -1;
171  (*it)->relDiskUsage_ = -1;
172  (*it)->retVal_ = 0;
173  (*it)->alarmState_ = AlarmHandler::OKAY;
174  }
175  }
176  }
177 
178 
180  {
181  infoSpaceItems.push_back(std::make_pair("copyWorkers", &copyWorkers_));
182  infoSpaceItems.push_back(std::make_pair("injectWorkers", &injectWorkers_));
183  infoSpaceItems.push_back(std::make_pair("sataBeastStatus", &sataBeastStatus_));
184  infoSpaceItems.push_back(std::make_pair("numberOfDisks", &numberOfDisks_));
185  infoSpaceItems.push_back(std::make_pair("diskPaths", &diskPaths_));
186  infoSpaceItems.push_back(std::make_pair("totalDiskSpace", &totalDiskSpace_));
187  infoSpaceItems.push_back(std::make_pair("usedDiskSpace", &usedDiskSpace_));
188  }
189 
190 
192  {
193  Stats stats;
194  getStats(stats);
195 
196  if (stats.numberOfCopyWorkers > 0)
197  copyWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfCopyWorkers);
198  else
199  copyWorkers_ = 0;
200 
201  if (stats.numberOfInjectWorkers > 0)
202  injectWorkers_ = static_cast<xdata::UnsignedInteger32>(stats.numberOfInjectWorkers);
203  else
204  injectWorkers_ = 0;
205 
208 
209  diskPaths_.clear();
210  totalDiskSpace_.clear();
211  usedDiskSpace_.clear();
212 
213  diskPaths_.reserve(stats.diskUsageStatsList.size());
214  totalDiskSpace_.reserve(stats.diskUsageStatsList.size());
215  usedDiskSpace_.reserve(stats.diskUsageStatsList.size());
216 
217  for (DiskUsageStatsPtrList::const_iterator
218  it = stats.diskUsageStatsList.begin(),
219  itEnd = stats.diskUsageStatsList.end();
220  it != itEnd;
221  ++it)
222  {
223  diskPaths_.push_back(
224  static_cast<xdata::String>( (*it)->pathName )
225  );
226  totalDiskSpace_.push_back(
227  static_cast<xdata::UnsignedInteger32>(
228  static_cast<unsigned int>( (*it)->diskSize * 1024 )
229  )
230  );
231  usedDiskSpace_.push_back(
232  static_cast<xdata::UnsignedInteger32>(
233  static_cast<unsigned int>( (*it)->absDiskUsage * 1024 )
234  )
235  );
236  }
237 
238  calcDiskUsage();
239  }
240 
241 
243  {
244  boost::mutex::scoped_lock sl(diskUsageListMutex_);
245 
246  for ( DiskUsagePtrList::iterator it = diskUsageList_.begin(),
247  itEnd = diskUsageList_.end();
248  it != itEnd;
249  ++it)
250  {
251  retrieveDiskSize(*it);
252  }
253  }
254 
255 
257  {
258  if ( ! diskUsage->retrievingDiskSize_ )
259  // don't start another thread if there's already one
260  {
261  boost::thread thread(
262  boost::bind( &ResourceMonitorCollection::doStatFs, this, diskUsage)
263  );
264  if (
265  ( ! thread.timed_join( boost::posix_time::milliseconds(500) ) )
266  || (diskUsage->retVal_ != 0)
267  )
268  {
269  emitDiskAlarm(diskUsage);
270  }
271  else
272  {
273  const unsigned int blksize = diskUsage->statfs_.f_bsize;
274  diskUsage->diskSize_ =
275  static_cast<double>(diskUsage->statfs_.f_blocks * blksize) / 1024 / 1024 / 1024;
276  diskUsage->absDiskUsage_ =
277  diskUsage->diskSize_ -
278  static_cast<double>(diskUsage->statfs_.f_bavail * blksize) / 1024 / 1024 / 1024;
279  diskUsage->relDiskUsage_ = (100 * (diskUsage->absDiskUsage_ / diskUsage->diskSize_));
280  if ( diskUsage->relDiskUsage_ > dwParams_.highWaterMark_ )
281  {
282  emitDiskSpaceAlarm(diskUsage);
283  }
284  else if ( diskUsage->relDiskUsage_ < dwParams_.highWaterMark_*0.95 )
285  // do not change alarm level if we are close to the high water mark
286  {
287  revokeDiskAlarm(diskUsage);
288  }
289  }
290  }
291  }
292 
293 
295  {
296  diskUsage->retrievingDiskSize_ = true;
297 
298  #if __APPLE__
299  diskUsage->retVal_ = statfs(diskUsage->pathName_.c_str(), &(diskUsage->statfs_));
300  #else
301  diskUsage->retVal_ = statfs64(diskUsage->pathName_.c_str(), &(diskUsage->statfs_));
302  #endif
303  if (diskUsage->pathName_ == "/aSlowDiskForUnitTests") ::sleep(5);
304 
305  diskUsage->retrievingDiskSize_ = false;
306  }
307 
308 
310  {
311  const std::string msg = "Cannot access " + diskUsage->pathName_ + ". Is it mounted?";
312 
313  diskUsage->diskSize_ = -1;
314  diskUsage->absDiskUsage_ = -1;
315  diskUsage->relDiskUsage_ = -1;
316 
317  if ( isImportantDisk(diskUsage->pathName_) )
318  {
319  diskUsage->alarmState_ = AlarmHandler::FATAL;
320  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, msg);
321  alarmHandler_->moveToFailedState(ex);
322  }
323  else
324  {
325  diskUsage->alarmState_ = AlarmHandler::ERROR;
326  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, msg);
327  alarmHandler_->raiseAlarm(diskUsage->pathName_, diskUsage->alarmState_, ex);
328  }
329  }
330 
331 
333  {
334  if (
335  isImportantDisk(diskUsage->pathName_) &&
336  (diskUsage->relDiskUsage_ > dwParams_.failHighWaterMark_)
337  )
338  {
339  diskUsage->alarmState_ = AlarmHandler::FATAL;
340  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, diskUsage->toString());
341  alarmHandler_->moveToFailedState(ex);
342  }
343  else
344  {
345  diskUsage->alarmState_ = AlarmHandler::WARNING;
346  XCEPT_DECLARE(stor::exception::DiskSpaceAlarm, ex, diskUsage->toString());
347  alarmHandler_->raiseAlarm(diskUsage->pathName_, diskUsage->alarmState_, ex);
348  }
349  }
350 
351 
353  {
354  DiskWritingParams::OtherDiskPaths::const_iterator begin =
355  dwParams_.otherDiskPaths_.begin();
356  DiskWritingParams::OtherDiskPaths::const_iterator end =
358  return ( std::find(begin, end, pathName) == end );
359  }
360 
361 
363  {
364  diskUsage->alarmState_ = AlarmHandler::OKAY;
365 
366  alarmHandler_->revokeAlarm(diskUsage->pathName_);
367  }
368 
369 
371  {
372  struct passwd* passwd = getpwnam(rmParams_.copyWorkers_.user_.c_str());
373  if (passwd)
374  {
377  }
378  else
379  {
381  }
382 
384  {
386  }
387  }
388 
389 
391  {
392  const std::string alarmName = "CopyWorkers";
393 
395  {
396  std::ostringstream msg;
397  msg << "Expected " << rmParams_.copyWorkers_.expectedCount_ <<
398  " running CopyWorkers, but found " <<
399  numberOfCopyWorkers_ << ".";
400  XCEPT_DECLARE(stor::exception::CopyWorkers, ex, msg.str());
401  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
402  }
403  else
404  {
405  alarmHandler_->revokeAlarm(alarmName);
406  }
407  }
408 
409 
411  {
412  struct passwd* passwd = getpwnam(rmParams_.injectWorkers_.user_.c_str());
413  if (passwd)
414  {
416  }
417  else
418  {
420  }
421 
422  if (
425  )
426  {
428  }
429  }
430 
431 
433  {
434  const std::string alarmName = "InjectWorkers";
435 
437  {
438  std::ostringstream msg;
439  msg << "Expected " << rmParams_.injectWorkers_.expectedCount_ <<
440  " running InjectWorkers, but found " <<
441  numberOfInjectWorkers_ << ".";
442  XCEPT_DECLARE(stor::exception::InjectWorkers, ex, msg.str());
443  alarmHandler_->raiseAlarm(alarmName, AlarmHandler::WARNING, ex);
444  }
445  else
446  {
447  alarmHandler_->revokeAlarm(alarmName);
448  }
449  }
450 
451 
453  {
454  SATABeasts sataBeasts;
455  if ( getSataBeasts(sataBeasts) )
456  {
457  for (
458  SATABeasts::const_iterator it = sataBeasts.begin(),
459  itEnd= sataBeasts.end();
460  it != itEnd;
461  ++it
462  )
463  {
464  checkSataBeast(*it);
465  }
466  }
467  else
468  {
470  }
471  }
472 
473 
475  {
476  if (! alarmParams_.isProductionSystem_) return false;
477 
478  std::ifstream in;
479  in.open( "/proc/mounts" );
480 
481  if ( ! in.is_open() ) return false;
482 
484  while( getline(in,line) )
485  {
486  size_t pos = line.find("sata");
487  if ( pos != std::string::npos )
488  {
489  std::ostringstream host;
490  host << "satab-c2c"
491  << std::setw(2) << std::setfill('0')
492  << line.substr(pos+4,1)
493  << "-"
494  << std::setw(2) << std::setfill('0')
495  << line.substr(pos+5,1);
496  sataBeasts.insert(host.str());
497  }
498  }
499  return !sataBeasts.empty();
500  }
501 
502 
504  {
505  if ( ! (checkSataDisks(sataBeast,"-00.cms") || checkSataDisks(sataBeast,"-10.cms")) )
506  {
507  XCEPT_DECLARE(stor::exception::SataBeast, ex,
508  "Failed to connect to SATA beast " + sataBeast);
509  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
510 
511  latchedSataBeastStatus_ = 99999;
512  }
513  }
514 
515 
517  (
518  const std::string& sataBeast,
519  const std::string& hostSuffix
520  )
521  {
524 
525  // Do not try to connect if we have no user name
526  if ( rmParams_.sataUser_.empty() ) return true;
527 
528  const CURLcode returnCode =
529  curlInterface->getContent(
530  "http://" + sataBeast + hostSuffix + "/status.asp",rmParams_.sataUser_,
531  content
532  );
533 
534  if (returnCode == CURLE_OK)
535  {
536  updateSataBeastStatus(sataBeast, std::string(&content[0]));
537  return true;
538  }
539  else
540  {
541  std::ostringstream msg;
542  msg << "Failed to connect to SATA controller "
543  << sataBeast << hostSuffix
544  << ": " << std::string(&content[0]);
545  XCEPT_DECLARE(stor::exception::SataBeast, ex, msg.str());
546  alarmHandler_->notifySentinel(AlarmHandler::WARNING, ex);
547 
548  return false;
549  }
550  }
551 
552 
554  (
555  const std::string& sataBeast,
556  const std::string& content
557  )
558  {
559  boost::regex failedEntry(">([^<]* has failed[^<]*)");
560  boost::regex failedDisk("Hard disk([[:digit:]]+)");
561  boost::regex failedController("RAID controller ([[:digit:]]+)");
562  boost::match_results<std::string::const_iterator> matchedEntry, matchedCause;
563  boost::match_flag_type flags = boost::match_default;
564 
565  std::string::const_iterator start = content.begin();
566  std::string::const_iterator end = content.end();
567 
568  unsigned int newSataBeastStatus = 0;
569 
570  while( regex_search(start, end, matchedEntry, failedEntry, flags) )
571  {
572  std::string errorMsg = matchedEntry[1];
573  XCEPT_DECLARE(stor::exception::SataBeast, ex, sataBeast+": "+errorMsg);
574  alarmHandler_->raiseAlarm(sataBeast, AlarmHandler::ERROR, ex);
575 
576  // find what failed
577  if ( regex_search(errorMsg, matchedCause, failedDisk) )
578  {
579  // Update the number of failed disks
580  ++newSataBeastStatus;
581  }
582  else if ( regex_search(errorMsg, matchedCause, failedController) )
583  {
584  // Update the number of failed controllers
585  newSataBeastStatus += 100;
586  }
587  else
588  {
589  // Unknown failure
590  newSataBeastStatus += 1000;
591  }
592 
593  // update search position:
594  start = matchedEntry[0].second;
595  // update flags:
596  flags |= boost::match_prev_avail;
597  flags |= boost::match_not_bob;
598  }
599 
600  latchedSataBeastStatus_ = newSataBeastStatus;
601 
602  if (latchedSataBeastStatus_ == 0) // no more problems
603  alarmHandler_->revokeAlarm(sataBeast);
604 
605  }
606 
607 
608  namespace {
// Directory-entry selector, used as the scandir() filter on "/proc" in
// getProcessCount().
// Returns 1 if the entry name matches the glob "[1-9]*" (i.e. it starts
// with a non-zero digit), and 0 otherwise.
int filter(const struct dirent *dir)
{
  // fnmatch() returns 0 on a match, so the result has to be inverted
  const int matchResult = fnmatch("[1-9]*", dir->d_name, 0);
  return ( matchResult == 0 ) ? 1 : 0;
}
613 
// Checks the owner of a file.
// Returns true only if stat() succeeds on 'filename' and the reported
// owner (st_uid) equals 'uid'. Any stat() failure yields false.
bool matchUid(const std::string& filename, const uid_t& uid)
{
  struct stat fileInfo;
  if ( stat(filename.c_str(), &fileInfo) != 0 ) return false;

  return ( fileInfo.st_uid == uid );
}
620 
// Reads "/proc/<pid>/stat" and reports whether the parent pid found there is 1.
//
// 'pid' is the name of the /proc entry (e.g. a d_name returned by scandir).
// Returns true only if the file could be opened, at least 80 bytes were
// read, and the integer following the last ')' plus 4 characters parses
// as 1. Every failure (missing file, short or failed read, malformed
// content) yields false.
bool isMaster(const char* pid)
{
  // Adapted from procps::minimal::stat2proc
  char buf[800]; // about 40 fields, 64-bit decimal is about 20 chars

  std::ostringstream statfile;
  statfile << "/proc/" << pid << "/stat";

  // Open the assembled path directly: it must not be passed through
  // snprintf() as a format string, nor be truncated to a fixed length.
  const int fd = open(statfile.str().c_str(), O_RDONLY, 0);
  if ( fd == -1 ) return false;

  const int num = read(fd, buf, sizeof buf - 1);
  // Close before any early return so that a failed or short read
  // does not leak the descriptor.
  close(fd);

  // Also covers read() errors (-1).
  // NOTE(review): the 80 byte minimum is kept from the original code,
  // presumably a sanity check against a truncated stat line - confirm.
  if ( num < 80 ) return false;
  buf[num] = '\0';

  char* tmp = strrchr(buf, ')'); // split into "PID (cmd" and "<rest>"
  // No ')' at all, or nothing left after skipping ') %c ': malformed entry
  if ( !tmp || (buf + num) - tmp <= 4 ) return false;

  int ppid = 0;
  const int nParsed = sscanf(tmp + 4, // skip ') %c '
    "%d", &ppid);
  return ( nParsed == 1 && ppid == 1 ); // scan succeeded and parent pid is 1
}
640 
// Searches the content of the file 'cmdline' for the substring 'name'.
//
// The file is read as a sequence of NUL-terminated tokens (the loop
// splits on '\0'); each token is appended to one string followed by a
// single blank, and 'name' is then looked up in that string.
// Returns true if 'name' occurs in the joined text. If the file cannot
// be opened the joined text stays empty, so only an empty 'name' matches.
bool grep(const std::string& cmdline, const std::string& name)
{
  std::ifstream in( cmdline.c_str() );

  // All tokens of the file, blank separated
  std::string line;
  if ( in.is_open() )
  {
    std::string tmp;
    while ( getline(in, tmp, '\0') )
    {
      line.append(tmp);
      line.append(" ");
    }
  }
  // The stream is closed by its destructor

  return ( line.find(name) != std::string::npos );
}
661  }
662 
663 
665  (
666  const std::string& processName,
667  const int& uid
668  )
669  {
670  int count(0);
671  struct dirent **namelist;
672  int n;
673 
674  #if __APPLE__
675  return -1;
676  #else
677  n = scandir("/proc", &namelist, filter, 0);
678  #endif
679  if (n < 0) return -1;
680 
681  while(n--)
682  {
683  std::ostringstream cmdline;
684  cmdline << "/proc/" << namelist[n]->d_name << "/cmdline";
685 
686  if ( grep(cmdline.str(), processName) &&
687  (uid < 0 || matchUid(cmdline.str(), uid)) &&
688  isMaster(namelist[n]->d_name) )
689  {
690  ++count;
691  }
692  free(namelist[n]);
693  }
694  free(namelist);
695 
696  return count;
697  }
698 
699 
701  : pathName_(path), absDiskUsage_(-1), relDiskUsage_(-1), diskSize_(-1),
702  retrievingDiskSize_(false), alarmState_(AlarmHandler::OKAY), retVal_(0)
703  {}
704 
705 
707  {
708  std::ostringstream msg;
709  msg << std::fixed << std::setprecision(1) <<
710  "Disk space usage for " << pathName_ <<
711  " is " << relDiskUsage_ << "% (" <<
712  absDiskUsage_ << "GB of " <<
713  diskSize_ << "GB).";
714  return msg.str();
715  }
716 
717 } // namespace stor
718 
int i
Definition: DBlmapReader.cc:9
tuple start
Check for commandline option errors.
Definition: dqm_diff.py:58
ResourceMonitorCollection(const utils::Duration_t &updateInterval, AlarmHandlerPtr)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
OtherDiskPaths otherDiskPaths_
Definition: Configuration.h:48
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:7
boost::shared_ptr< DiskUsage > DiskUsagePtr
void sleep(Duration_t)
Definition: Utils.h:163
boost::posix_time::time_duration Duration_t
Definition: Utils.h:41
static boost::shared_ptr< CurlInterface > getInterface()
const T & max(const T &a, const T &b)
xdata::Vector< xdata::UnsignedInteger32 > usedDiskSpace_
void configureDisks(DiskWritingParams const &)
tuple result
Definition: query.py:137
bool checkSataDisks(const std::string &sataBeast, const std::string &hostSuffix)
#define end
Definition: vmac.h:38
xdata::Vector< xdata::UnsignedInteger32 > totalDiskSpace_
boost::shared_ptr< AlarmHandler > AlarmHandlerPtr
Definition: AlarmHandler.h:116
boost::shared_ptr< CurlInterface > CurlInterfacePtr
Definition: CurlInterface.h:71
virtual void do_appendInfoSpaceItems(InfoSpaceItems &)
string host
Definition: query.py:114
void checkSataBeast(const std::string &sataBeast)
xdata::Vector< xdata::String > diskPaths_
bool getSataBeasts(SATABeasts &sataBeasts)
boost::shared_ptr< DiskUsageStats > DiskUsageStatsPtr
long long int num
Definition: procUtils.cc:71
std::vector< std::pair< std::string, xdata::Serializable * > > InfoSpaceItems
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
#define begin
Definition: vmac.h:31
void updateSataBeastStatus(const std::string &sataBeast, const std::string &content)
tuple filename
Definition: lut2db_cfg.py:20
void configureResources(ResourceMonitorParams const &)
dbl *** dir
Definition: mlp_gen.cc:35
int getProcessCount(const std::string &processName, const int &uid=-1)
std::vector< char > Content
Definition: CurlInterface.h:31