CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
Vulture.cc
Go to the documentation of this file.
1 
5 
6 // to handle pt file descriptors left open at fork
7 #include "pt/PeerTransportReceiver.h"
8 #include "pt/PeerTransportAgent.h"
9 
10 #include "toolbox/task/Action.h"
11 #include "toolbox/task/WorkLoop.h"
12 #include "toolbox/task/WorkLoopFactory.h"
13 
14 #include <unistd.h>
15 #ifdef linux
16 #include <sys/prctl.h>
17 #endif
18 #include <signal.h>
19 #include <string.h>
20 #include <sys/wait.h>
21 #include <sys/time.h>
22 
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <stdlib.h>
26 #include <stdio.h>
27 
28 #ifdef __APPLE__
29 /* getline implementation is copied from glibc. */
30 
31 #ifndef SIZE_MAX
32 # define SIZE_MAX ((size_t) -1)
33 #endif
34 #ifndef SSIZE_MAX
35 # define SSIZE_MAX ((ssize_t) (SIZE_MAX / 2))
36 #endif
37 
/*
 * Fallback getline() for platforms that lack it (compiled only under
 * __APPLE__).
 *
 * Reads characters from fp up to and including the next '\n', storing them
 * NUL-terminated in *lineptr.  When *lineptr is NULL or *n is 0 a 120-byte
 * buffer is malloc'ed; the buffer is grown with realloc as needed and
 * *lineptr / *n are updated to describe it.
 *
 * Returns the number of characters stored (excluding the NUL), or -1 when
 * nothing could be read before EOF, when an allocation fails, or when any
 * argument is NULL (errno is set to EINVAL in that last case).
 */
ssize_t getline (char **lineptr, size_t *n, FILE *fp)
{
  if (lineptr == NULL || n == NULL || fp == NULL)
    {
      errno = EINVAL;
      return -1;
    }

  /* No usable buffer supplied by the caller: start with a small one. */
  if (*lineptr == NULL || *n == 0)
    {
      *n = 120;
      *lineptr = (char *) malloc (*n);
      if (*lineptr == NULL)
        return -1;
    }

  /* Largest buffer size we are willing to request. */
  const size_t capacity_limit =
    SSIZE_MAX < SIZE_MAX ? (size_t) SSIZE_MAX + 1 : SIZE_MAX;

  size_t used = 0;
  int ch;

  while ((ch = getc (fp)) != EOF)
    {
      /* Keep room for this character plus the terminating NUL. */
      if (used + 1 >= *n)
        {
          size_t wanted = 2 * *n + 1;   /* Be generous. */

          if (capacity_limit < wanted)
            wanted = capacity_limit;
          if (used + 1 >= wanted)
            return -1;

          char *grown = (char *) realloc (*lineptr, wanted);
          if (grown == NULL)
            return -1;

          *lineptr = grown;
          *n = wanted;
        }

      (*lineptr)[used] = ch;
      used++;

      if (ch == '\n')
        break;
    }

  (*lineptr)[used] = '\0';

  /* An empty read can only happen at EOF, which is reported as -1. */
  return used ? (ssize_t) used : -1;
}
110 #endif
111 
112 namespace evf{
113 
114  const std::string Vulture::FS="/tmp";
115 
116  Vulture::Vulture(bool push)
117  : wlCtrl_(0)
118  , asCtrl_(0)
119  , running_(false)
120  , wlProwl_(0)
121  , asProwl_(0)
122  , prowling_(false)
123  , iDieUrl_("")
124  , updateMode_(push)
125  , vulturePid_(0)
126  , tmp_(0)
127  , newCores_(0)
128  , poster_(0)
129  , mq_(new MasterQueue(vulture_queue_offset))
130  , sq_(0) // this is only defined in the forked process
131  , started_(-1)
132  , stopped_(-1)
133  , handicapped_(false)
134  {
135  // create command file for gdb
136  FILE *outf = fopen("/tmp/vulture.cmd","w");
137  fprintf(outf,"where\n");
138  fclose(outf);
139  }
140 
142  {
143  delete mq_;
144  if(sq_ != 0) delete sq_;
145  if(poster_ != 0) delete poster_;
146  }
147 
149 
150  pid_t retval = fork();
151  if(retval==0){ // we are in the forked process
152  int success = -1;
153 #ifdef linux
154  success = prctl( PR_SET_DUMPABLE, 0 );
155 #endif
156  if(success != 0){
157  std::cout << "Vulture::could not set process undumpable" << std::endl;
158  handicapped_ = true;
159  }
160 #ifdef linux
161  success = prctl( PR_SET_PDEATHSIG, SIGKILL );
162 #endif
163  if(success != 0){
164  std::cout << "Vulture::could not set process death signal" << std::endl;
165  handicapped_ = true;
166  }
167  tmp_ = opendir(FS.c_str());
168 #ifdef linux
169  success = prctl ( PR_SET_NAME , "vulture");
170 #endif
171  if(success != 0){
172  std::cout << "Vulture::could not set process name" << std::endl;
173  handicapped_ = true;
174  }
175 
176  try{
177  pt::PeerTransport * ptr =
178  pt::getPeerTransportAgent()->getPeerTransport("http","soap",pt::Receiver);
179  delete ptr;
180  }
181  catch (pt::exception::PeerTransportNotFound & e ){
182  //do nothing here since we don't know what to do... ?
183  }
184  // freopen("/dev/null","w",stderr);
186  // start the ctrl workloop
187  try {
188  wlCtrl_=
189  toolbox::task::getWorkLoopFactory()->getWorkLoop("Ctrll",
190  "waiting");
191  if (!wlCtrl_->isActive()) wlCtrl_->activate();
192 
193  asCtrl_ = toolbox::task::bind(this,&Vulture::control,
194  "Ctrl");
195  wlCtrl_->submit(asCtrl_);
196  }
197  catch (xcept::Exception& e) {
198  std::cout << "Vulture:constructor - could not start workloop 'Ctrl' for process " << retval << std::endl;
199  }
200  }
201  else{
202  vulturePid_ = retval;
203  }
204  return retval;
205 
206 
207  }
208 
210  if(started_<0){
212  try{
213  mq_->rcvNonBlocking(msg2);
214  started_ = 0;
215  }
216  catch(evf::Exception &e){
217  }
218  } else {started_ = 1;}
219  return started_;
220  }
221 
223  if(stopped_<0){
225  try{
226  mq_->rcvNonBlocking(msg2);
227  stopped_ = 0;
228  }
229  catch(evf::Exception &e){
230  }
231  } else {stopped_ = 1;}
232  return stopped_;
233  }
234 
235  pid_t Vulture::start(std::string url, int run){
236 
237  //communicate start-of-run to Vulture
238  vulture_start_message stamsg;
239  strcpy(stamsg.url_,url.c_str());
240  stamsg.run_ = run;
242  memcpy(msg1->mtext,&stamsg,sizeof(vulture_start_message));
243  mq_->post(msg1);
244  stopped_ = -1;
245  return vulturePid_;
246  }
247 
249  {
250 
252  mq_->post(msg1);
253  started_ = -1;
254  return vulturePid_;
255  }
256 
257  pid_t Vulture::kill() // eventually *could* be called by master app - it isn't now
258  {
259  ::kill (vulturePid_, SIGKILL);
260  int sl;
261  pid_t killedOrNot = waitpid(vulturePid_,&sl,WNOHANG);
262  vulturePid_ = 0;
263  return killedOrNot;
264  }
265 
267  {
268  timeval now;
269  gettimeofday(&now,0);
270  lastUpdate_ = now.tv_sec;
271  prowling_ = true;
272  try {
273  wlProwl_=
274  toolbox::task::getWorkLoopFactory()->getWorkLoop("Prowl",
275  "waiting");
276  if (!wlProwl_->isActive()) wlProwl_->activate();
277 
278  asProwl_ = toolbox::task::bind(this,&Vulture::prowling,
279  "Prowl");
280  wlProwl_->submit(asProwl_);
281  }
282  catch (xcept::Exception& e) {
283  std::string msg = "Failed to start workloop 'Prowl'.";
284  XCEPT_RETHROW(evf::Exception,msg,e);
285  }
286 
287  }
288 
289  bool Vulture::control(toolbox::task::WorkLoop*wl)
290  {
291 
292  MsgBuf msg;
293  unsigned long mtype = MSQM_MESSAGE_TYPE_NOP;
294  try{mtype = sq_->rcv(msg);}catch(evf::Exception &e){
295  std::cout << "Vulture::exception on msgrcv for control, bailing out of control workloop - good bye" << std::endl;
296  return false;
297  }
298  mtype = msg->mtype;
299  switch(mtype){
301  {
302 
303  vulture_start_message *sta = (vulture_start_message*)msg->mtext;
304  if(poster_ == 0) poster_ = new CurlPoster(sta->url_);
305  if(poster_->check(sta->run_)){
306  try{
307  startProwling();
308  MsgBuf msg1(0,MSQS_VULTURE_TYPE_ACK) ;
309  sq_->post(msg1);
310  }
311  catch(evf::Exception &e)
312  {
313  std::cout << "Vulture::start - exception in starting prowling workloop " << e.what() << std::endl;
314  //@EM ToDo generate some message here
315  }
316  }else{
317  std::cout << "Vulture::start - could not contact iDie - chech Url - will not start prowling loop" << std::endl;
318  prowling_ = false;
319  }
320 
321  break;
322  }
324  {
325  prowling_ = false;
326  break;
327  }
328  default:
329  {
330  // do nothing @EM ToDo generate an appropriate error message
331  }
332  }
333  return true;
334 
335  }
336 
337  bool Vulture::prowling(toolbox::task::WorkLoop*wl)
338  {
339 
340  if(!prowling_){
341  char messageDie[5];
342  sprintf(messageDie,"Dead");
343 
344  try{
345  poster_->postString(messageDie,5,0,CurlPoster::stack);
346  }
347  catch(evf::Exception &e){
348  //do nothing just swallow the exception
349  }
350  delete poster_;
351  poster_=0;
352 
353  return false;
354  }
355 
356  newCores_ = 0;
357 
358  struct stat filestat;
359 
360  timeval now;
361  gettimeofday(&now,0);
362 
363  // examine /tmp looking for new coredumps
364  dirent *dirp;
365  while((dirp = readdir(tmp_))!=0){
366  if(strncmp(dirp->d_name,"core",4)==0){
367  stat(dirp->d_name,&filestat);
368  if(filestat.st_mtime > lastUpdate_){
369  currentCoreList_.push_back(dirp->d_name);
370  newCores_++;
371  }
372  }
373  }
374  rewinddir(tmp_);
375  lastUpdate_ = now.tv_sec;
376  try{
377  analyze();
378  }
379  catch(evf::Exception &e){
380  std::cout << "Vulture cannot send to iDie server, bail out " << std::endl;
381  return false;
382  }
383  ::sleep(60);
384  return true;
385  }
386 
388  {
389  // do a first analysis of the coredump
390  if(newCores_==0) return;
391  for(unsigned int i = currentCoreList_.size()-newCores_;
392  i < currentCoreList_.size();
393  i++){
394  std::string command = "gdb /opt/xdaq/bin/xdaq.exe -batch -x /tmp/vulture.cmd -c /tmp/";
395  std::string cmdout;
396  command += currentCoreList_[i];
397  std::string filePathAndName = FS + "/";
398  filePathAndName += currentCoreList_[i];
399  std::string pid =
400  currentCoreList_[i].substr(currentCoreList_[i].find_first_of(".")+1,
401  currentCoreList_[i].length());
402 
403  FILE *ps = popen(command.c_str(),"r");
404  size_t s = 256;
405  char *p=new char[s];
406  bool filter = false;
407  while(getline(&p,&s,ps) != -1){
408  if(strncmp("Core",p,4)==0) filter = true;
409  if(filter)cmdout += p;
410  }
411  delete[] p;
412  pclose(ps);
413  int errsv = 0;
414  int rch = chmod(filePathAndName.c_str(),0777);
415  if(rch != 0){
416  errsv = errno;
417  std::cout << "ERROR: couldn't change corefile access privileges -"
418  << strerror(errsv)<< std::endl;
419  }
420  unsigned int ipid = (unsigned int)atoi(pid.c_str());
421  poster_->postString(cmdout.c_str(),cmdout.length(),ipid, CurlPoster::stack);
422 
423  }
424  }
425 }
426 
bool check(int)
Definition: CurlPoster.cc:117
int stop()
Definition: Vulture.cc:248
char url_[VULTURE_START_MESSAGE_URL_SIZE]
Definition: Vulture.h:32
int i
Definition: DBlmapReader.cc:9
pid_t makeProcess()
Definition: Vulture.cc:148
toolbox::task::WorkLoop * wlProwl_
Definition: Vulture.h:63
#define MSQS_VULTURE_TYPE_ACK
Definition: queue_defs.h:32
virtual ~Vulture()
Definition: Vulture.cc:141
pid_t kill()
Definition: Vulture.cc:257
time_t lastUpdate_
Definition: Vulture.h:71
SlaveQueue * sq_
Definition: Vulture.h:75
static const std::string FS
Definition: Vulture.h:59
#define NULL
Definition: scimark2.h:8
bool prowling(toolbox::task::WorkLoop *)
Definition: Vulture.cc:337
#define MAX_MSG_SIZE
Definition: queue_defs.h:10
void postString(const char *, size_t, unsigned int, mode, const std::string &=standard_post_method_)
Definition: CurlPoster.cc:104
void sleep(Duration_t)
Definition: Utils.h:163
std::vector< std::string > currentCoreList_
Definition: Vulture.h:70
unsigned long rcvNonBlocking(MsgBuf &ptr)
Definition: MasterQueue.h:68
DIR * tmp_
Definition: Vulture.h:69
#define MSQM_MESSAGE_TYPE_NOP
Definition: queue_defs.h:15
toolbox::task::ActionSignature * asCtrl_
Definition: Vulture.h:61
tuple result
Definition: query.py:137
int start(std::string, int=0)
Definition: Vulture.cc:235
toolbox::task::ActionSignature * asProwl_
Definition: Vulture.h:64
#define end
Definition: vmac.h:38
toolbox::task::WorkLoop * wlCtrl_
Definition: Vulture.h:60
bool control(toolbox::task::WorkLoop *)
Definition: Vulture.cc:289
#define NUMERIC_MESSAGE_SIZE
Definition: queue_defs.h:35
bool prowling_
Definition: Vulture.h:65
void startProwling()
Definition: Vulture.cc:266
void analyze()
Definition: Vulture.cc:387
int post(MsgBuf &ptr)
Definition: SlaveQueue.h:35
CurlPoster * poster_
Definition: Vulture.h:73
#define MSQM_VULTURE_TYPE_STP
Definition: queue_defs.h:22
unsigned long rcv(MsgBuf &ptr)
Definition: SlaveQueue.h:45
static const int vulture_queue_offset
Definition: Vulture.h:53
int hasStopped()
Definition: Vulture.cc:222
MasterQueue * mq_
Definition: Vulture.h:74
int hasStarted()
Definition: Vulture.cc:209
tuple cout
Definition: gather_cfg.py:41
Vulture(bool)
Definition: Vulture.cc:116
#define MSQM_VULTURE_TYPE_STA
Definition: queue_defs.h:21
string s
Definition: asciidump.py:422
unsigned int newCores_
Definition: Vulture.h:72
int post(MsgBuf &ptr)
Definition: MasterQueue.h:45
bool handicapped_
Definition: Vulture.h:78
int stopped_
Definition: Vulture.h:77
int started_
Definition: Vulture.h:76
pid_t vulturePid_
Definition: Vulture.h:68