00001
00002 #include "EventFilter/Utilities/interface/Vulture.h"
00003 #include "EventFilter/Utilities/interface/Exception.h"
00004 #include "EventFilter/Utilities/interface/CurlPoster.h"
00005
00006
00007 #include "pt/PeerTransportReceiver.h"
00008 #include "pt/PeerTransportAgent.h"
00009
00010 #include "toolbox/task/Action.h"
00011 #include "toolbox/task/WorkLoop.h"
00012 #include "toolbox/task/WorkLoopFactory.h"
00013
00014 #include <unistd.h>
00015 #ifdef linux
00016 #include <sys/prctl.h>
00017 #endif
00018 #include <signal.h>
00019 #include <string.h>
00020 #include <sys/wait.h>
00021 #include <sys/time.h>
00022
00023 #include <sys/stat.h>
00024 #include <unistd.h>
00025 #include <stdlib.h>
00026 #include <stdio.h>
00027 #include <fstream>
00028
00029 #ifdef __APPLE__
00030
00031
00032 #ifndef SIZE_MAX
00033 # define SIZE_MAX ((size_t) -1)
00034 #endif
00035 #ifndef SSIZE_MAX
00036 # define SSIZE_MAX ((ssize_t) (SIZE_MAX / 2))
00037 #endif
00038
// Fallback getline() for C libraries that do not ship one.
//
// Reads characters from fp up to and including the next '\n' (or until
// getc() reports EOF) into *lineptr, growing the buffer with realloc() when
// it gets too small, and NUL-terminates what was read.
//
//   lineptr  in/out: address of the line buffer. The buffer is resized with
//            realloc(), so it must be NULL or come from malloc()/realloc().
//   n        in/out: capacity of *lineptr in bytes.
//   fp       stream to read from.
//
// Returns the number of characters stored (terminating NUL not counted), or
// -1 when nothing could be read before EOF or when an error occurred.
// errno is set to EINVAL for NULL arguments and to EOVERFLOW when the line
// cannot grow any further.
ssize_t getline (char **lineptr, size_t *n, FILE *fp)
{
  if (lineptr == NULL || n == NULL || fp == NULL)
    {
      errno = EINVAL;
      return -1;
    }

  if (*lineptr == NULL || *n == 0)
    {
      // realloc(NULL, size) behaves like malloc(size); going through realloc
      // also hands back a caller buffer that was passed with *n == 0 instead
      // of leaking it.
      const size_t initial_size = 120;
      char *initial = (char *) realloc (*lineptr, initial_size);
      if (initial == NULL)
        return -1;
      *lineptr = initial;
      *n = initial_size;
    }

  size_t cur_len = 0;
  for (;;)
    {
      const int c = getc (fp);
      if (c == EOF)
        break;

      // Keep room for this character plus the terminating NUL.
      if (cur_len + 1 >= *n)
        {
          const size_t needed_max =
            SSIZE_MAX < SIZE_MAX ? (size_t) SSIZE_MAX + 1 : SIZE_MAX;
          size_t needed = 2 * *n + 1;

          if (needed_max < needed)
            needed = needed_max;
          if (cur_len + 1 >= needed)
            {
              // The length could no longer be reported through ssize_t.
              errno = EOVERFLOW;
              return -1;
            }

          char *grown = (char *) realloc (*lineptr, needed);
          if (grown == NULL)
            return -1;

          *lineptr = grown;
          *n = needed;
        }

      (*lineptr)[cur_len] = (char) c;
      cur_len++;

      if (c == '\n')
        break;
    }

  (*lineptr)[cur_len] = '\0';
  // EOF with nothing read is reported as -1; a final line without a trailing
  // newline is still returned with its length.
  return cur_len ? (ssize_t) cur_len : -1;
}
00111 #endif
00112
00113 namespace evf{
00114
00115 const std::string Vulture::FS="/tmp";
00116
00117 Vulture::Vulture(bool push)
00118 : wlCtrl_(0)
00119 , asCtrl_(0)
00120 , running_(false)
00121 , wlProwl_(0)
00122 , asProwl_(0)
00123 , prowling_(false)
00124 , iDieUrl_("")
00125 , updateMode_(push)
00126 , vulturePid_(0)
00127 , tmp_(0)
00128 , newCores_(0)
00129 , poster_(0)
00130 , mq_(new MasterQueue(vulture_queue_offset))
00131 , sq_(0)
00132 , started_(-1)
00133 , stopped_(-1)
00134 , handicapped_(false)
00135 {
00136
00137 std::ifstream vulture("/tmp/vulture.cmd");
00138 if (!vulture.good())
00139 {
00140 FILE *outf = fopen("/tmp/vulture.cmd","w");
00141 fprintf(outf,"where\n");
00142 fclose(outf);
00143 }
00144
00145 }
00146
00147 Vulture::~Vulture()
00148 {
00149 delete mq_;
00150 if(sq_ != 0) delete sq_;
00151 if(poster_ != 0) delete poster_;
00152 }
00153
00154 pid_t Vulture::makeProcess(){
00155
00156 pid_t retval = fork();
00157 if(retval==0){
00158 int success = -1;
00159
00160
00161
00162 if(success != 0){
00163 std::cout << "Vulture::could not set process undumpable" << std::endl;
00164 handicapped_ = true;
00165 }
00166 #ifdef linux
00167 success = prctl( PR_SET_PDEATHSIG, SIGKILL );
00168 #endif
00169 if(success != 0){
00170 std::cout << "Vulture::could not set process death signal" << std::endl;
00171 handicapped_ = true;
00172 }
00173 tmp_ = opendir(FS.c_str());
00174 #ifdef linux
00175 success = prctl ( PR_SET_NAME , "vulture");
00176 #endif
00177 if(success != 0){
00178 std::cout << "Vulture::could not set process name" << std::endl;
00179 handicapped_ = true;
00180 }
00181
00182 try{
00183 pt::PeerTransport * ptr =
00184 pt::getPeerTransportAgent()->getPeerTransport("http","soap",pt::Receiver);
00185 delete ptr;
00186 }
00187 catch (pt::exception::PeerTransportNotFound & e ){
00188
00189 }
00190
00191 sq_ = new SlaveQueue(vulture_queue_offset);
00192
00193 try {
00194 wlCtrl_=
00195 toolbox::task::getWorkLoopFactory()->getWorkLoop("Ctrll",
00196 "waiting");
00197 if (!wlCtrl_->isActive()) wlCtrl_->activate();
00198
00199 asCtrl_ = toolbox::task::bind(this,&Vulture::control,
00200 "Ctrl");
00201 wlCtrl_->submit(asCtrl_);
00202 }
00203 catch (xcept::Exception& e) {
00204 std::cout << "Vulture:constructor - could not start workloop 'Ctrl' for process " << retval << std::endl;
00205 }
00206 }
00207 else{
00208 vulturePid_ = retval;
00209 }
00210 return retval;
00211
00212
00213 }
00214
00215 int Vulture::hasStarted(){
00216 if(started_<0){
00217 MsgBuf msg2(MAX_MSG_SIZE,MSQS_VULTURE_TYPE_ACK);
00218 try{
00219 mq_->rcvNonBlocking(msg2);
00220 started_ = 0;
00221 }
00222 catch(evf::Exception &e){
00223 }
00224 } else {started_ = 1;}
00225 return started_;
00226 }
00227
00228 int Vulture::hasStopped(){
00229 if(stopped_<0){
00230 MsgBuf msg2(MAX_MSG_SIZE,MSQS_VULTURE_TYPE_ACK);
00231 try{
00232 mq_->rcvNonBlocking(msg2);
00233 stopped_ = 0;
00234 }
00235 catch(evf::Exception &e){
00236 }
00237 } else {stopped_ = 1;}
00238 return stopped_;
00239 }
00240
00241 pid_t Vulture::start(std::string url, int run){
00242
00243
00244 vulture_start_message stamsg;
00245 strcpy(stamsg.url_,url.c_str());
00246 stamsg.run_ = run;
00247 MsgBuf msg1(sizeof(vulture_start_message),MSQM_VULTURE_TYPE_STA);
00248 memcpy(msg1->mtext,&stamsg,sizeof(vulture_start_message));
00249 mq_->post(msg1);
00250 stopped_ = -1;
00251 return vulturePid_;
00252 }
00253
00254 pid_t Vulture::stop()
00255 {
00256
00257 MsgBuf msg1(NUMERIC_MESSAGE_SIZE,MSQM_VULTURE_TYPE_STP);
00258 mq_->post(msg1);
00259 started_ = -1;
00260 return vulturePid_;
00261 }
00262
00263 pid_t Vulture::kill()
00264 {
00265 ::kill (vulturePid_, SIGKILL);
00266 int sl;
00267 pid_t killedOrNot = waitpid(vulturePid_,&sl,WNOHANG);
00268 vulturePid_ = 0;
00269 return killedOrNot;
00270 }
00271
00272 void Vulture::startProwling()
00273 {
00274 timeval now;
00275 gettimeofday(&now,0);
00276 lastUpdate_ = now.tv_sec;
00277 prowling_ = true;
00278 try {
00279 wlProwl_=
00280 toolbox::task::getWorkLoopFactory()->getWorkLoop("Prowl",
00281 "waiting");
00282 if (!wlProwl_->isActive()) wlProwl_->activate();
00283
00284 asProwl_ = toolbox::task::bind(this,&Vulture::prowling,
00285 "Prowl");
00286 wlProwl_->submit(asProwl_);
00287 }
00288 catch (xcept::Exception& e) {
00289 std::string msg = "Failed to start workloop 'Prowl'.";
00290 XCEPT_RETHROW(evf::Exception,msg,e);
00291 }
00292
00293 }
00294
00295 bool Vulture::control(toolbox::task::WorkLoop*wl)
00296 {
00297
00298 MsgBuf msg;
00299 unsigned long mtype = MSQM_MESSAGE_TYPE_NOP;
00300 try{mtype = sq_->rcv(msg);}catch(evf::Exception &e){
00301 std::cout << "Vulture::exception on msgrcv for control, bailing out of control workloop - good bye" << std::endl;
00302 return false;
00303 }
00304 mtype = msg->mtype;
00305 switch(mtype){
00306 case MSQM_VULTURE_TYPE_STA:
00307 {
00308
00309 vulture_start_message *sta = (vulture_start_message*)msg->mtext;
00310 if(poster_ == 0) poster_ = new CurlPoster(sta->url_);
00311 if(poster_->check(sta->run_)){
00312 try{
00313 startProwling();
00314 MsgBuf msg1(0,MSQS_VULTURE_TYPE_ACK) ;
00315 sq_->post(msg1);
00316 }
00317 catch(evf::Exception &e)
00318 {
00319 std::cout << "Vulture::start - exception in starting prowling workloop " << e.what() << std::endl;
00320
00321 }
00322 }else{
00323 std::cout << "Vulture::start - could not contact iDie - chech Url - will not start prowling loop" << std::endl;
00324 prowling_ = false;
00325 }
00326
00327 break;
00328 }
00329 case MSQM_VULTURE_TYPE_STP:
00330 {
00331 prowling_ = false;
00332 break;
00333 }
00334 default:
00335 {
00336
00337 }
00338 }
00339 return true;
00340
00341 }
00342
00343 bool Vulture::prowling(toolbox::task::WorkLoop*wl)
00344 {
00345
00346 if(!prowling_){
00347 char messageDie[5];
00348 sprintf(messageDie,"Dead");
00349 if(poster_==0){
00350 std::cout << "Vulture: asked to stop prowling but no poster "
00351 << std::endl;
00352 return false;
00353 }
00354 try{
00355 poster_->postString(messageDie,5,0,CurlPoster::stack);
00356 }
00357 catch(evf::Exception &e){
00358
00359 }
00360 std::cout << "Received STOP message, going to delete poster " << std::endl;
00361
00362
00363
00364 return false;
00365 }
00366
00367 newCores_ = 0;
00368
00369 struct stat filestat;
00370
00371 timeval now;
00372 gettimeofday(&now,0);
00373
00374
00375 dirent *dirp;
00376 while((dirp = readdir(tmp_))!=0){
00377 if(strncmp(dirp->d_name,"core",4)==0){
00378 stat(dirp->d_name,&filestat);
00379 if(filestat.st_mtime > lastUpdate_){
00380 currentCoreList_.push_back(dirp->d_name);
00381 newCores_++;
00382 }
00383 }
00384 }
00385 rewinddir(tmp_);
00386 lastUpdate_ = now.tv_sec;
00387 try{
00388 analyze();
00389 }
00390 catch(evf::Exception &e){
00391 std::cout << "Vulture cannot send to iDie server, bail out " << std::endl;
00392 return false;
00393 }
00394 ::sleep(60);
00395 return true;
00396 }
00397
00398 void Vulture::analyze()
00399 {
00400
00401 if(newCores_==0) return;
00402 for(unsigned int i = currentCoreList_.size()-newCores_;
00403 i < currentCoreList_.size();
00404 i++){
00405 std::string command = "gdb /opt/xdaq/bin/xdaq.exe -batch -x /tmp/vulture.cmd -c /tmp/";
00406 std::string cmdout;
00407 command += currentCoreList_[i];
00408 std::string filePathAndName = FS + "/";
00409 filePathAndName += currentCoreList_[i];
00410 std::string pid =
00411 currentCoreList_[i].substr(currentCoreList_[i].find_first_of(".")+1,
00412 currentCoreList_[i].length());
00413
00414 FILE *ps = popen(command.c_str(),"r");
00415 size_t s = 256;
00416 char *p=new char[s];
00417 bool filter = false;
00418 while(getline(&p,&s,ps) != -1){
00419 if(strncmp("Core",p,4)==0) filter = true;
00420 if(filter)cmdout += p;
00421 }
00422 delete[] p;
00423 pclose(ps);
00424 int errsv = 0;
00425 int rch = chmod(filePathAndName.c_str(),0777);
00426 if(rch != 0){
00427 errsv = errno;
00428 std::cout << "ERROR: couldn't change corefile access privileges -"
00429 << strerror(errsv)<< std::endl;
00430 }
00431 unsigned int ipid = (unsigned int)atoi(pid.c_str());
00432 poster_->postString(cmdout.c_str(),cmdout.length(),ipid, CurlPoster::stack);
00433
00434 }
00435 }
00436 }
00437