Android 12 init 子进程回收与服务重启分析

Posted pecuyu

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Android 12 init 子进程回收与服务重启分析相关的知识,希望对你有一定的参考价值。

文章托管在 Gitee 上(Android Notes),并同步发布于 CSDN。
本文基于 Android 12 源码分析。

在 init 运行过程中,不可避免地会出现子进程或服务退出,需要做一些针对性处理:

  • 对于已终止的子进程需要将其回收掉,防止产生僵尸进程
  • 对于非 oneshot 服务,需要重新将其拉起,使其在异常退出后能够恢复运行。

处理子进程退出

在init中通过监听信号 SIGCHLD,来获取子进程终止事件,然后做一些针对性动作。

InstallSignalFdHandler

初始化信号处理器,注册子进程终止的监听

/// @system/core/init/init.cpp
// Sets up signal handling for init: blocks SIGCHLD (plus SIGTERM when init is not
// reboot-capable), exposes the blocked signals through a signalfd, and registers
// HandleSignalFd with |epoll| so they are handled from the epoll loop.
static void InstallSignalFdHandler(Epoll* epoll) {
    // Applying SA_NOCLDSTOP to a defaulted SIGCHLD handler prevents the signalfd from receiving
    // SIGCHLD when a child process stops or continues (b/77867680#comment9).
    const struct sigaction act { .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP };
    sigaction(SIGCHLD, &act, nullptr);  // With this flag, child stop/continue events are ignored.

    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGCHLD);

    if (!IsRebootCapable()) {  // No CAP_SYS_BOOT capability, so init cannot reboot.
        // If init does not have the CAP_SYS_BOOT capability, it is running in a container.
        // In that case, receiving SIGTERM will cause the system to shut down.
        sigaddset(&mask, SIGTERM);  // Also deliver SIGTERM through the signalfd.
    }

    // Block these signals; a signalfd only reports signals that are blocked.
    if (sigprocmask(SIG_BLOCK, &mask, nullptr) == -1) {
        PLOG(FATAL) << "failed to block signals";
    }

    // Register a handler to unblock signals in the child processes.
    // UnblockSignals runs in the child's context before fork() returns there, so children do
    // not keep these signals blocked.
    const int result = pthread_atfork(nullptr, nullptr, &UnblockSignals);
    if (result != 0) {
        LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
    }

    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);  // fd used to read the blocked signals.
    if (signal_fd == -1) {
        PLOG(FATAL) << "failed to create signalfd";
    }

    // Have epoll watch the fd for newly arrived signals.
    if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd); !result.ok()) {
        LOG(FATAL) << result.error();
    }
}

UnblockSignals

该函数通过 pthread_atfork 注册,在 fork 之后于子进程上下文中执行,因此子进程默认不阻塞这些信号。

/// @system/core/init/init.cpp
static void UnblockSignals() 
    const struct sigaction act  .sa_handler = SIG_DFL ;
    sigaction(SIGCHLD, &act, nullptr);

    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGCHLD);
    sigaddset(&mask, SIGTERM);

    if (sigprocmask(SIG_UNBLOCK, &mask, nullptr) == -1) 
        PLOG(FATAL) << "failed to unblock signals for PID " << getpid();
    

当epoll监听到signal_fd有事件到来,即产生了相关信号,则会回调HandleSignalFd来处理

HandleSignalFd

/// system/core/init/init.cpp
static void HandleSignalFd() 
    signalfd_siginfo siginfo;
    // 从fd读取信号信息
    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
    if (bytes_read != sizeof(siginfo)) 
        PLOG(ERROR) << "Failed to read siginfo from signal_fd";
        return;
    

    switch (siginfo.ssi_signo) 
        case SIGCHLD: // 子进程终止事件
            ReapAnyOutstandingChildren();
            break;
        case SIGTERM: // 信号15,kill命令默认发送的信号
            HandleSigtermSignal(siginfo);
            break;
        default:
            PLOG(ERROR) << "signal_fd: received unexpected signal " << siginfo.ssi_signo;
            break;
    

处理 SIGCHLD 会调用ReapAnyOutstandingChildren,它实现了所有终止子进程的回收

ReapAnyOutstandingChildren

/// @system/core/init/sigchld_handler.cpp
void ReapAnyOutstandingChildren() 
    while (ReapOneProcess() != 0)  // 循环处理所有已终止的进程(调用exit或被信号杀死)
    

ReapOneProcess

这个函数的作用如下:

  • 调用 waitid(带 WNOWAIT 标志)获取一个已终止的子进程,此时并不回收它;真正的回收由函数返回时执行的 waitpid 完成
  • 打印进程死亡原因,被信号kill或者调用exit退出
  • 针对 service 调用其 Reap 函数,清理状态、处理重启及 onrestart 命令
/// @system/core/init/sigchld_handler.cpp
static pid_t ReapOneProcess() 
    siginfo_t siginfo = ;
    // This returns a zombie pid or informs us that there are no zombies left to be reaped.
    // It does NOT reap the pid; that is done below.
    if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) 
        PLOG(ERROR) << "waitid failed";
        return 0;
    

    auto pid = siginfo.si_pid;
    if (pid == 0) return 0;

    // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
    // whenever the function returns from this point forward.
    // We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we
    // want the pid to remain valid throughout that (and potentially future) usages.
    auto reaper = make_scope_guard([pid]  TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); );

    std::string name;
    std::string wait_string;
    Service* service = nullptr;

    if (SubcontextChildReap(pid))  // 处理Subcontext进程退出,非正在关机中会重启该进程
        name = "Subcontext";
     else 
      // 判断该进程是否是某个服务,比如surfaceflinger
        service = ServiceList::GetInstance().FindService(pid, &Service::pid);

        if (service)  // 服务存在
            name = StringPrintf("Service '%s' (pid %d)", service->name().c_str(), pid);
            if (service->flags() & SVC_EXEC)  // 通过'exec' or 'exec_start' 启动的可执行程序进程
                auto exec_duration = boot_clock::now() - service->time_started();
                auto exec_duration_ms =
                    std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration).count();
                wait_string = StringPrintf(" waiting took %f seconds", exec_duration_ms / 1000.0f);
             else if (service->flags() & SVC_ONESHOT)  // 一次性的服务
                auto exec_duration = boot_clock::now() - service->time_started();
                auto exec_duration_ms =
                        std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration)
                                .count();
                wait_string = StringPrintf(" oneshot service took %f seconds in background",
                                           exec_duration_ms / 1000.0f);
            
         else 
            name = StringPrintf("Untracked pid %d", pid); // 非服务进程,未追踪的进程退出
        
    

    if (siginfo.si_code == CLD_EXITED)  // 进程 exit
        LOG(INFO) << name << " exited with status " << siginfo.si_status << wait_string;
     else  // 进程被 kill
        LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
    

    if (!service) return pid;

    service->Reap(siginfo); // 调用Reap,做清理工作,并重启非oneshot的服务

    if (service->flags() & SVC_TEMPORARY)  // 通过'exec' 启动的服务
        ServiceList::GetInstance().RemoveService(*service);
    

    return pid;

Service::Reap

  • kill进程组所有进程
  • 清理所有socket资源相关文件
  • 回调reap_callbacks_,比如之前设置的启动失败回调
  • critical 服务在时间窗口内(fatal_crash_window_,文中为 4 分钟)或开机完成之前退出超过 4 次,则重启到 bootloader
  • 标记服务SVC_RESTARTING,在HandleProcessActions中重启服务
  • 执行onrestart命令
  • 通知服务状态改变
/// @system/core/init/service.cpp
  void Service::Reap(const siginfo_t& siginfo) 
      if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) // 不是一次性的或者需要重启的
          KillProcessGroup(SIGKILL, false); // 服务死亡,杀死其进程组所有进程, 第二个参数表示是否report_oneshot
       else 
          // Legacy behavior from ~2007 until Android R: this else branch did not exist and we did not
          // kill the process group in this case.
          if (SelinuxGetVendorAndroidVersion() >= __ANDROID_API_R__)  // 杀死oneshot服务的进程组
              // The new behavior in Android R is to kill these process groups in all cases.  The
              // 'true' parameter instructions KillProcessGroup() to report a warning message where it
              // detects a difference in behavior has occurred.
              KillProcessGroup(SIGKILL, true);
          
      

      // Remove any socket resources we may have created.
      for (const auto& socket : sockets_)  // 清理该服务创建的socket 路径文件
          auto path = ANDROID_SOCKET_DIR "/" + socket.name;
          unlink(path.c_str());
      

      for (const auto& f : reap_callbacks_)  // 执行通过 AddReapCallback 添加的reap操作的回调
          f(siginfo);
      

      if ((siginfo.si_code != CLD_EXITED || siginfo.si_status != 0) && on_failure_reboot_target_) 
          LOG(ERROR) << "Service with 'reboot_on_failure' option failed, shutting down system.";
          trigger_shutdown(*on_failure_reboot_target_);// 带有reboot_on_failure选项的服务,非正常退出则会触发关机
      

      if (flags_ & SVC_EXEC) UnSetExec();  // 重置 is_exec_service_running_ flag

      if (flags_ & SVC_TEMPORARY) return; // 临时oneshot服务,返回

      pid_ = 0;
      flags_ &= (~SVC_RUNNING);
      start_order_ = 0;

      // Oneshot processes go into the disabled state on exit,
      // except when manually restarted.
      // 标记为 SVC_RESTART 的,是需要重启服务的。在StopOrReset函数先kill进程,然后标记为SVC_RESTART,到回收后则进行重启
      if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET)) 
          flags_ |= SVC_DISABLED; // oneshot服务置disabled状态
      

      // Disabled and reset processes do not get restarted automatically.
      if (flags_ & (SVC_DISABLED | SVC_RESET))   // disabled 和 reset 状态服务不重启
          NotifyStateChange("stopped");
          return;
      

  #if INIT_FULL_SOURCES
      static bool is_apex_updatable = android::sysprop::ApexProperties::updatable().value_or(false);
  #else
      static bool is_apex_updatable = false;
  #endif
      const bool is_process_updatable = !use_bootstrap_ns_ && is_apex_updatable;

      // If we crash > 4 times in 'fatal_crash_window_' minutes or before boot_completed,
      // reboot into bootloader or set crashing property
      boot_clock::time_point now = boot_clock::now();
      // critica或可更新(如apex) 并且 服务未标记要重启
      if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART)) 
          bool boot_completed = GetBoolProperty("sys.boot_completed", false);
          if (now < time_crashed_ + fatal_crash_window_ || !boot_completed)  // 在窗口时间内 或 开机流程未完成
              if (++crash_count_ > 4) 
                  auto exit_reason = boot_completed ?
                      "in " + std::to_string(fatal_crash_window_.count()) + " minutes" :
                      "before boot completed";
                  if (flags_ & SVC_CRITICAL)  // critical 服务在窗口时间(4分钟内)或开机完成前 crash超过4次,则会重启到 bootloader
                      if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false)) 
                          // Aborts into 'atal_reboot_target_'.
                          SetFatalRebootTarget(fatal_reboot_target_);
                          LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
                                     << exit_reason;
                      
                   else  // 非 critical 服务只有一个打印,然后记录到属性
                      LOG(ERROR) << "process with updatable components '" << name_
                                 << "' exited 4 times " << exit_reason;
                      // Notifies update_verifier and apexd
                      SetProperty("sys.init.updatable_crashing_process_name", name_);
                      SetProperty("sys.init.updatable_crashing", "1");
                  
              
           else  // 重新记录时间和次数
              time_crashed_ = now;
              crash_count_ = 1;
          
      

      flags_ &= (~SVC_RESTART);
      flags_ |= SVC_RESTARTING; // 注意此处标记,是服务重启的关键

      // Execute all onrestart commands for this service.
      onrestart_.ExecuteAllCommands(); // 执行所有 onrestart 命令, 在rc里面配置的

      NotifyStateChange("restarting");
      return;

Service::KillProcessGroup

void Service::KillProcessGroup(int signal, bool report_oneshot) 
    // If we've already seen a successful result from killProcessGroup*(), then we have removed
    // the cgroup already and calling these functions a second time will simply result in an error.
    // This is true regardless of which signal was sent.
    // These functions handle their own logging, so no additional logging is needed.
    if (!process_cgroup_empty_) 
        LOG(INFO) << "Sending signal " << signal << " to service '" << name_ << "' (pid " << pid_
                  << ") process group...";
        int max_processes = 0;
        int r;
        if (signal == SIGTERM) 
            r = killProcessGroupOnce(proc_attr_.uid, pid_, signal, &max_processes);
         else 
            r = killProcessGroup(proc_attr_.uid, pid_, signal, &max_processes);
        

        if (report_oneshot && max_processes > 0) 
            LOG(WARNING)
                    << "Killed " << max_processes
                    << " additional processes from a oneshot process group for service '" << name_
                    << "'. This is new behavior, previously child processes would not be killed in "
                       "this case.";
        

        if (r == 0) process_cgroup_empty_ = true;
    

    if (oom_score_adjust_ != DEFAULT_OOM_SCORE_ADJUST) 
        LmkdUnregister(name_, pid_); // 从lmkd移除进程信息
    

上面两个killProcessGroup实现如下:

/// @system/core/libprocessgroup/processgroup.cpp
int killProcessGroup(uid_t uid, int initialPid, int signal, int* max_processes) 
  // 内部调用DoKillProcessGroupOnce去kill进程组
    return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/, max_processes);


int killProcessGroupOnce(uid_t uid, int initialPid, int signal, int* max_processes) 
    return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/, max_processes);

关于cgroup配置可参见 cgroups.json

/// @system/core/libprocessgroup/profiles/cgroups.json

  "Cgroups": [
    
      "Controller": "blkio",
      "Path": "/dev/blkio",
      "Mode": "0755",
      "UID": "system",
      "GID": "system"
    ,
    
      "Controller": "cpu",
      "Path": "/dev/cpuctl",
      "Mode": "0755",
      "UID": "system",
      "GID": "system"
    ,
    
      "Controller": "cpuset",
      "Path": "/dev/cpuset",
      "Mode": "0755",
      "UID": "system",
      "GID": "system"
    ,
    
      "Controller": "memory",
      "Path": "/dev/memcg",
      "Mode": "0700",
      Process用法与进程详解

Linux信号详解:signal与sigaction函数

Linux信号详解:signal与sigaction函数1

Linux信号详解:signal与sigaction函数1

孤儿进程僵尸进程及其回收

孤儿进程僵尸进程及其回收