manager.c

Go to the documentation of this file.
00001 /*
00002  * Changes:
00003  *   Jul 22, 2005:      Created  (Jorrit N. Herder)
00004  */
00005 
00006 #include "inc.h"
00007 #include <unistd.h>
00008 #include <sys/types.h>
00009 #include <sys/wait.h>
00010 #include <minix/dmap.h>
00011 #include <minix/endpoint.h>
00012 
00013 /* Allocate variables. */
00014 struct rproc rproc[NR_SYS_PROCS];               /* system process table */
00015 struct rproc *rproc_ptr[NR_PROCS];              /* mapping for fast access */
00016 int nr_in_use;                                  /* number of services */
00017 extern int errno;                               /* error status */
00018 
00019 /* Prototypes for internal functions that do the hard work. */
00020 FORWARD _PROTOTYPE( int start_service, (struct rproc *rp) );
00021 FORWARD _PROTOTYPE( int stop_service, (struct rproc *rp,int how) );
00022 
00023 PRIVATE int shutting_down = FALSE;
00024 
00025 #define EXEC_FAILED     49                      /* recognizable status */
00026 
00027 /*===========================================================================*
00028  *                                      do_up                                *
00029  *===========================================================================*/
00030 PUBLIC int do_up(m_ptr)
00031 message *m_ptr;                                 /* request message pointer */
00032 {
00033 /* A request was made to start a new system service. Dismember the request 
00034  * message and gather all information needed to start the service. Starting
00035  * is done by a helper routine.
00036  */
00037   register struct rproc *rp;                    /* system process table */
00038   int slot_nr;                                  /* local table entry */
00039   int arg_count;                                /* number of arguments */
00040   char *cmd_ptr;                                /* parse command string */
00041   enum dev_style dev_style;                     /* device style */
00042   int s;                                        /* status variable */
00043 
00044   /* See if there is a free entry in the table with system processes. */
00045   if (nr_in_use >= NR_SYS_PROCS) return(EAGAIN); 
00046   for (slot_nr = 0; slot_nr < NR_SYS_PROCS; slot_nr++) {
00047       rp = &rproc[slot_nr];                     /* get pointer to slot */
00048       if (! rp->r_flags & RS_IN_USE)            /* check if available */
00049           break;
00050   }
00051   nr_in_use ++;                                 /* update administration */
00052 
00053   /* Obtain command name and parameters. This is a space-separated string
00054    * that looks like "/sbin/service arg1 arg2 ...". Arguments are optional.
00055    */
00056   if (m_ptr->RS_CMD_LEN > MAX_COMMAND_LEN) return(E2BIG);
00057   if (OK!=(s=sys_datacopy(m_ptr->m_source, (vir_bytes) m_ptr->RS_CMD_ADDR, 
00058         SELF, (vir_bytes) rp->r_cmd, m_ptr->RS_CMD_LEN))) return(s);
00059   rp->r_cmd[m_ptr->RS_CMD_LEN] = '\0';          /* ensure it is terminated */
00060   if (rp->r_cmd[0] != '/') return(EINVAL);      /* insist on absolute path */
00061 
00062   /* Build argument vector to be passed to execute call. The format of the
00063    * arguments vector is: path, arguments, NULL. 
00064    */
00065   arg_count = 0;                                /* initialize arg count */
00066   rp->r_argv[arg_count++] = rp->r_cmd;          /* start with path */
00067   cmd_ptr = rp->r_cmd;                          /* do some parsing */ 
00068   while(*cmd_ptr != '\0') {                     /* stop at end of string */
00069       if (*cmd_ptr == ' ') {                    /* next argument */
00070           *cmd_ptr = '\0';                      /* terminate previous */
00071           while (*++cmd_ptr == ' ') ;           /* skip spaces */
00072           if (*cmd_ptr == '\0') break;          /* no arg following */
00073           if (arg_count>MAX_NR_ARGS+1) break;   /* arg vector full */
00074           rp->r_argv[arg_count++] = cmd_ptr;    /* add to arg vector */
00075       }
00076       cmd_ptr ++;                               /* continue parsing */
00077   }
00078   rp->r_argv[arg_count] = NULL;                 /* end with NULL pointer */
00079   rp->r_argc = arg_count;
00080 
00081   /* Initialize some fields. */
00082   rp->r_period = m_ptr->RS_PERIOD;
00083   rp->r_dev_nr = m_ptr->RS_DEV_MAJOR;
00084   rp->r_dev_style = STYLE_DEV; 
00085   rp->r_restarts = -1;                          /* will be incremented */
00086   
00087   /* All information was gathered. Now try to start the system service. */
00088   return(start_service(rp));
00089 }
00090 
00091 
00092 /*===========================================================================*
00093  *                              do_down                                      *
00094  *===========================================================================*/
00095 PUBLIC int do_down(message *m_ptr)
00096 {
00097   register struct rproc *rp;
00098   pid_t pid = (pid_t) m_ptr->RS_PID;
00099 
00100   for (rp=BEG_RPROC_ADDR; rp<END_RPROC_ADDR; rp++) {
00101       if (rp->r_flags & RS_IN_USE && rp->r_pid == pid) {
00102 #if VERBOSE
00103           printf("stopping %d (%d)\n", pid, m_ptr->RS_PID);
00104 #endif
00105           stop_service(rp,RS_EXITING);
00106           return(OK);
00107       }
00108   }
00109 #if VERBOSE
00110   printf("not found %d (%d)\n", pid, m_ptr->RS_PID);
00111 #endif
00112   return(ESRCH);
00113 }
00114 
00115 
00116 /*===========================================================================*
00117  *                              do_refresh                                   *
00118  *===========================================================================*/
00119 PUBLIC int do_refresh(message *m_ptr)
00120 {
00121   register struct rproc *rp;
00122   pid_t pid = (pid_t) m_ptr->RS_PID;
00123 
00124   for (rp=BEG_RPROC_ADDR; rp<END_RPROC_ADDR; rp++) {
00125       if (rp->r_flags & RS_IN_USE && rp->r_pid == pid) {
00126 #if VERBOSE
00127           printf("refreshing %d (%d)\n", pid, m_ptr->RS_PID);
00128 #endif
00129           stop_service(rp,RS_REFRESHING);
00130           return(OK);
00131       }
00132   }
00133 #if VERBOSE
00134   printf("not found %d (%d)\n", pid, m_ptr->RS_PID);
00135 #endif
00136   return(ESRCH);
00137 }
00138 
00139 /*===========================================================================*
00140  *                              do_rescue                                    *
00141  *===========================================================================*/
00142 PUBLIC int do_rescue(message *m_ptr)
00143 {
00144   char rescue_dir[MAX_RESCUE_DIR_LEN];
00145   int s;
00146 
00147   /* Copy rescue directory from user. */
00148   if (m_ptr->RS_CMD_LEN > MAX_RESCUE_DIR_LEN) return(E2BIG);
00149   if (OK!=(s=sys_datacopy(m_ptr->m_source, (vir_bytes) m_ptr->RS_CMD_ADDR, 
00150         SELF, (vir_bytes) rescue_dir, m_ptr->RS_CMD_LEN))) return(s);
00151   rescue_dir[m_ptr->RS_CMD_LEN] = '\0';         /* ensure it is terminated */
00152   if (rescue_dir[0] != '/') return(EINVAL);     /* insist on absolute path */
00153 
00154   /* Change RS' directory to the rescue directory. Provided that the needed
00155    * binaries are in the rescue dir, this makes recovery possible even if the 
00156    * (root) file system is no longer available, because no directory lookups
00157    * are required. Thus if an absolute path fails, we can try to strip the 
00158    * path an see if the command is in the rescue dir. 
00159    */
00160   if (chdir(rescue_dir) != 0) return(errno);
00161   return(OK);
00162 }
00163 
00164 /*===========================================================================*
00165  *                              do_shutdown                                  *
00166  *===========================================================================*/
00167 PUBLIC int do_shutdown(message *m_ptr)
00168 {
00169   /* Set flag so that RS server knows services shouldn't be restarted. */
00170   shutting_down = TRUE;
00171   return(OK);
00172 }
00173 
00174 /*===========================================================================*
00175  *                              do_exit                                      *
00176  *===========================================================================*/
00177 PUBLIC void do_exit(message *m_ptr)
00178 {
00179   register struct rproc *rp;
00180   pid_t exit_pid;
00181   int exit_status;
00182 
00183 #if VERBOSE
00184   printf("RS: got SIGCHLD signal, doing wait to get exited child.\n");
00185 #endif
00186 
00187   /* See which child exited and what the exit status is. This is done in a
00188    * loop because multiple childs may have exited, all reported by one 
00189    * SIGCHLD signal. The WNOHANG options is used to prevent blocking if, 
00190    * somehow, no exited child can be found. 
00191    */
00192   while ( (exit_pid = waitpid(-1, &exit_status, WNOHANG)) != 0 ) {
00193 
00194 #if VERBOSE
00195       printf("RS: proc %d, pid %d, ", rp->r_proc_nr_e, exit_pid); 
00196       if (WIFSIGNALED(exit_status)) {
00197           printf("killed, signal number %d\n", WTERMSIG(exit_status));
00198       } 
00199       else if (WIFEXITED(exit_status)) {
00200           printf("normal exit, status %d\n", WEXITSTATUS(exit_status));
00201       }
00202 #endif
00203 
00204       /* Search the system process table to see who exited. 
00205        * This should always succeed. 
00206        */
00207       for (rp=BEG_RPROC_ADDR; rp<END_RPROC_ADDR; rp++) {
00208           if ((rp->r_flags & RS_IN_USE) && rp->r_pid == exit_pid) {
00209               int proc;
00210               proc = _ENDPOINT_P(rp->r_proc_nr_e);
00211 
00212               rproc_ptr[proc] = NULL;           /* invalidate */
00213 
00214               if ((rp->r_flags & RS_EXITING) || shutting_down) {
00215                   rp->r_flags = 0;                      /* release slot */
00216                   rproc_ptr[proc] = NULL;
00217               }
00218               else if(rp->r_flags & RS_REFRESHING) {
00219                       rp->r_restarts = -1;              /* reset counter */
00220                       start_service(rp);                /* direct restart */
00221               }
00222               else if (WIFEXITED(exit_status) &&
00223                       WEXITSTATUS(exit_status) == EXEC_FAILED) {
00224                   rp->r_flags = 0;                      /* release slot */
00225               }
00226               else {
00227 #if VERBOSE
00228                   printf("Unexpected exit. Restarting %s\n", rp->r_cmd);
00229 #endif
00230                   /* Determine what to do. If this is the first unexpected 
00231                    * exit, immediately restart this service. Otherwise use
00232                    * a binary exponetial backoff.
00233                    */
00234                   if (rp->r_restarts > 0) {
00235                       rp->r_backoff = 1 << MIN(rp->r_restarts,(BACKOFF_BITS-1));
00236                       rp->r_backoff = MIN(rp->r_backoff,MAX_BACKOFF); 
00237                   }
00238                   else {
00239                       start_service(rp);                /* direct restart */
00240                   }
00241               }
00242               break;
00243           }
00244       }
00245   }
00246 }
00247 
00248 /*===========================================================================*
00249  *                              do_period                                    *
00250  *===========================================================================*/
00251 PUBLIC void do_period(m_ptr)
00252 message *m_ptr;
00253 {
00254   register struct rproc *rp;
00255   clock_t now = m_ptr->NOTIFY_TIMESTAMP;
00256   int s;
00257 
00258   /* Search system services table. Only check slots that are in use. */
00259   for (rp=BEG_RPROC_ADDR; rp<END_RPROC_ADDR; rp++) {
00260       if (rp->r_flags & RS_IN_USE) {
00261 
00262           /* If the service is to be revived (because it repeatedly exited, 
00263            * and was not directly restarted), the binary backoff field is  
00264            * greater than zero. 
00265            */
00266           if (rp->r_backoff > 0) {
00267               rp->r_backoff -= 1;
00268               if (rp->r_backoff == 0) {
00269                   start_service(rp);
00270               }
00271           }
00272 
00273           /* If the service was signaled with a SIGTERM and fails to respond,
00274            * kill the system service with a SIGKILL signal.
00275            */
00276           else if (rp->r_stop_tm > 0 && now - rp->r_stop_tm > 2*RS_DELTA_T
00277            && rp->r_pid > 0) {
00278               kill(rp->r_pid, SIGKILL);         /* terminate */
00279           }
00280         
00281           /* There seems to be no special conditions. If the service has a 
00282            * period assigned check its status. 
00283            */
00284           else if (rp->r_period > 0) {
00285 
00286               /* Check if an answer to a status request is still pending. If 
00287                * the driver didn't respond within time, kill it to simulate 
00288                * a crash. The failure will be detected and the service will 
00289                * be restarted automatically.
00290                */
00291               if (rp->r_alive_tm < rp->r_check_tm) { 
00292                   if (now - rp->r_alive_tm > 2*rp->r_period &&
00293                       rp->r_pid > 0) { 
00294 #if VERBOSE
00295                       printf("RS: service %d reported late\n", rp->r_proc_nr_e); 
00296 #endif
00297                       kill(rp->r_pid, SIGKILL);         /* simulate crash */
00298                   }
00299               }
00300 
00301               /* No answer pending. Check if a period expired since the last
00302                * check and, if so request the system service's status.
00303                */
00304               else if (now - rp->r_check_tm > rp->r_period) {
00305 #if VERBOSE
00306                   printf("RS: status request sent to %d\n", rp->r_proc_nr_e); 
00307 #endif
00308                   notify(rp->r_proc_nr_e);              /* request status */
00309                   rp->r_check_tm = now;                 /* mark time */
00310               }
00311           }
00312       }
00313   }
00314 
00315   /* Reschedule a synchronous alarm for the next period. */
00316   if (OK != (s=sys_setalarm(RS_DELTA_T, 0)))
00317       panic("RS", "couldn't set alarm", s);
00318 }
00319 
00320 
00321 /*===========================================================================*
00322  *                              start_service                                *
00323  *===========================================================================*/
00324 PRIVATE int start_service(rp)
00325 struct rproc *rp;
00326 {
00327 /* Try to execute the given system service. Fork a new process. The child
00328  * process will be inhibited from running by the NO_PRIV flag. Only let the
00329  * child run once its privileges have been set by the parent.
00330  */
00331   int child_proc_nr_e, child_proc_nr_n;         /* child process slot */
00332   pid_t child_pid;                              /* child's process id */
00333   char *file_only;
00334   int s;
00335   message m;
00336 
00337   /* Now fork and branch for parent and child process (and check for error). */
00338   child_pid = fork();
00339   switch(child_pid) {                                   /* see fork(2) */
00340   case -1:                                              /* fork failed */
00341       report("RS", "warning, fork() failed", errno);    /* shouldn't happen */
00342       return(errno);                                    /* return error */
00343 
00344   case 0:                                               /* child process */
00345       /* Try to execute the binary that has an absolute path. If this fails, 
00346        * e.g., because the root file system cannot be read, try to strip of
00347        * the path, and see if the command is in RS' current working dir.
00348        */
00349       execve(rp->r_argv[0], rp->r_argv, NULL);          /* POSIX execute */
00350       file_only = strrchr(rp->r_argv[0], '/') + 1;
00351       execve(file_only, rp->r_argv, NULL);              /* POSIX execute */
00352       printf("RS: exec failed for %s: %d\n", rp->r_argv[0], errno);
00353       exit(EXEC_FAILED);                                /* terminate child */
00354 
00355   default:                                              /* parent process */
00356       child_proc_nr_e = getnprocnr(child_pid);          /* get child slot */ 
00357       break;                                            /* continue below */
00358   }
00359 
00360   /* Only the parent process (the RS server) gets to this point. The child
00361    * is still inhibited from running because it's privilege structure is
00362    * not yet set. First try to set the device driver mapping at the FS.
00363    */
00364   if (rp->r_dev_nr > 0) {                               /* set driver map */
00365       if ((s=mapdriver(child_proc_nr_e, rp->r_dev_nr, rp->r_dev_style)) < 0) {
00366           report("RS", "couldn't map driver", errno);
00367           rp->r_flags |= RS_EXITING;                    /* expect exit */
00368           if(child_pid > 0) kill(child_pid, SIGKILL);   /* kill driver */
00369           else report("RS", "didn't kill pid", child_pid);
00370           return(s);                                    /* return error */
00371       }
00372   }
00373 
00374   /* The device driver mapping has been set, or the service was not a driver.
00375    * Now, set the privilege structure for the child process to let is run.
00376    * This should succeed: we tested number in use above.
00377    */
00378   if ((s = sys_privctl(child_proc_nr_e, SYS_PRIV_INIT, 0, NULL)) < 0) {
00379       report("RS","call to SYSTEM failed", s);          /* to let child run */
00380       rp->r_flags |= RS_EXITING;                        /* expect exit */
00381       if(child_pid > 0) kill(child_pid, SIGKILL);       /* kill driver */
00382       else report("RS", "didn't kill pid", child_pid);
00383       return(s);                                        /* return error */
00384   }
00385 
00386 #if VERBOSE
00387       printf("RS: started '%s', major %d, pid %d, endpoint %d, proc %d\n", 
00388           rp->r_cmd, rp->r_dev_nr, child_pid,
00389           child_proc_nr_e, child_proc_nr_n);
00390 #endif
00391 
00392   /* The system service now has been successfully started. Update the rest
00393    * of the system process table that is maintain by the RS server. The only 
00394    * thing that can go wrong now, is that execution fails at the child. If 
00395    * that's the case, the child will exit. 
00396    */
00397   child_proc_nr_n = _ENDPOINT_P(child_proc_nr_e);
00398   rp->r_flags = RS_IN_USE;                      /* mark slot in use */
00399   rp->r_restarts += 1;                          /* raise nr of restarts */
00400   rp->r_proc_nr_e = child_proc_nr_e;            /* set child details */
00401   rp->r_pid = child_pid;
00402   rp->r_check_tm = 0;                           /* not check yet */
00403   getuptime(&rp->r_alive_tm);                   /* currently alive */
00404   rp->r_stop_tm = 0;                            /* not exiting yet */
00405   rproc_ptr[child_proc_nr_n] = rp;              /* mapping for fast access */
00406   return(OK);
00407 }
00408 
00409 /*===========================================================================*
00410  *                              stop_service                                 *
00411  *===========================================================================*/
00412 PRIVATE int stop_service(rp,how)
00413 struct rproc *rp;
00414 int how;
00415 {
00416   /* Try to stop the system service. First send a SIGTERM signal to ask the
00417    * system service to terminate. If the service didn't install a signal 
00418    * handler, it will be killed. If it did and ignores the signal, we'll
00419    * find out because we record the time here and send a SIGKILL.
00420    */
00421 #if VERBOSE
00422   printf("RS tries to stop %s (pid %d)\n", rp->r_cmd, rp->r_pid);
00423 #endif
00424 
00425   rp->r_flags |= how;                           /* what to on exit? */
00426   if(rp->r_pid > 0) kill(rp->r_pid, SIGTERM);   /* first try friendly */
00427   else report("RS", "didn't kill pid", rp->r_pid);
00428   getuptime(&rp->r_stop_tm);                    /* record current time */
00429 }
00430 
00431 
00432 /*===========================================================================*
00433  *                              do_getsysinfo                                *
00434  *===========================================================================*/
00435 PUBLIC int do_getsysinfo(m_ptr)
00436 message *m_ptr;
00437 {
00438   vir_bytes src_addr, dst_addr;
00439   int dst_proc;
00440   size_t len;
00441   int s;
00442 
00443   switch(m_ptr->m1_i1) {
00444   case SI_PROC_TAB:
00445         src_addr = (vir_bytes) rproc;
00446         len = sizeof(struct rproc) * NR_SYS_PROCS;
00447         break; 
00448   default:
00449         return(EINVAL);
00450   }
00451 
00452   dst_proc = m_ptr->m_source;
00453   dst_addr = (vir_bytes) m_ptr->m1_p1;
00454   if (OK != (s=sys_datacopy(SELF, src_addr, dst_proc, dst_addr, len)))
00455         return(s);
00456   return(OK);
00457 }
00458 

Generated on Fri Apr 14 22:57:32 2006 for minix by  doxygen 1.4.6