Android 12 init 子进程回收与服务重启分析
Posted pecuyu
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Android 12 init 子进程回收与服务重启分析相关的知识,希望对你有一定的参考价值。
文章托管在gitee上 Android Notes , 同步csdn
本文基于android12 分析
在init运行过程中,不可避免的会出现子进程或服务退出,需要做一些针对性处理:
- 对于已终止的子进程需要将其回收掉,防止产生僵尸进程
- 对于非oneshot服务,需要重新将其拉起,保证服务在异常退出后仍能继续运行。
处理子进程退出
在init中通过监听信号 SIGCHLD,来获取子进程终止事件,然后做一些针对性动作。
InstallSignalFdHandler
初始化信号处理器,注册子进程终止的监听
/// @system/core/init/init.cpp
// Sets up signal handling for init: blocks SIGCHLD (and SIGTERM when init cannot reboot),
// creates a signalfd for the blocked signals and registers it with |epoll| so that
// HandleSignalFd runs whenever one of them is pending.
// Any failure is logged at FATAL severity.
static void InstallSignalFdHandler(Epoll* epoll) {
    // Applying SA_NOCLDSTOP to a defaulted SIGCHLD handler prevents the signalfd from receiving
    // SIGCHLD when a child process stops or continues (b/77867680#comment9).
    const struct sigaction act { .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP };
    sigaction(SIGCHLD, &act, nullptr);

    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGCHLD);

    if (!IsRebootCapable()) {
        // If init does not have the CAP_SYS_BOOT capability, it is running in a container.
        // In that case, receiving SIGTERM will cause the system to shut down.
        sigaddset(&mask, SIGTERM);
    }

    // The signals must be blocked so that they are delivered through the signalfd below
    // instead of through the default disposition.
    if (sigprocmask(SIG_BLOCK, &mask, nullptr) == -1) {
        PLOG(FATAL) << "failed to block signals";
    }

    // Register a handler to unblock signals in the child processes.
    // The third pthread_atfork() argument is the handler that runs in the child after fork(),
    // so children do not inherit init's blocked signal mask.
    const int result = pthread_atfork(nullptr, nullptr, &UnblockSignals);
    if (result != 0) {
        LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
    }

    // Create the fd from which the blocked signals are read.
    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
    if (signal_fd == -1) {
        PLOG(FATAL) << "failed to create signalfd";
    }

    // Have epoll watch the signalfd; HandleSignalFd is the callback for new signals.
    if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd); !result.ok()) {
        LOG(FATAL) << result.error();
    }
}
UnblockSignals
在子进程执行该函数,即子进程默认是不阻塞这些信号的。
/// @system/core/init/init.cpp
static void UnblockSignals()
const struct sigaction act .sa_handler = SIG_DFL ;
sigaction(SIGCHLD, &act, nullptr);
sigset_t mask;
sigemptyset(&mask);
sigaddset(&mask, SIGCHLD);
sigaddset(&mask, SIGTERM);
if (sigprocmask(SIG_UNBLOCK, &mask, nullptr) == -1)
PLOG(FATAL) << "failed to unblock signals for PID " << getpid();
当epoll监听到signal_fd有事件到来,即产生了相关信号,则会回调HandleSignalFd来处理
HandleSignalFd
/// @system/core/init/init.cpp
static void HandleSignalFd()
signalfd_siginfo siginfo;
// 从fd读取信号信息
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
if (bytes_read != sizeof(siginfo))
PLOG(ERROR) << "Failed to read siginfo from signal_fd";
return;
switch (siginfo.ssi_signo)
case SIGCHLD: // 子进程终止事件
ReapAnyOutstandingChildren();
break;
case SIGTERM: // 信号15,kill命令默认发送的信号
HandleSigtermSignal(siginfo);
break;
default:
PLOG(ERROR) << "signal_fd: received unexpected signal " << siginfo.ssi_signo;
break;
处理 SIGCHLD 会调用ReapAnyOutstandingChildren,它实现了所有终止子进程的回收
ReapAnyOutstandingChildren
/// @system/core/init/sigchld_handler.cpp
void ReapAnyOutstandingChildren()
while (ReapOneProcess() != 0) // 循环处理所有已终止的进程(调用exit或被信号杀死)
ReapOneProcess
这个函数的作用如下:
- 调用waitid(带WNOWAIT标志)获取一个已终止的子进程,此时并不回收;函数返回时再由scope guard调用waitpid真正回收
- 打印进程死亡原因,被信号kill或者调用exit退出
- 针对 service 调用其 Reap 函数,清理状态、处理重启及 onrestart 命令
/// @system/core/init/sigchld_handler.cpp
static pid_t ReapOneProcess()
siginfo_t siginfo = ;
// This returns a zombie pid or informs us that there are no zombies left to be reaped.
// It does NOT reap the pid; that is done below.
if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0)
PLOG(ERROR) << "waitid failed";
return 0;
auto pid = siginfo.si_pid;
if (pid == 0) return 0;
// At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
// whenever the function returns from this point forward.
// We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we
// want the pid to remain valid throughout that (and potentially future) usages.
auto reaper = make_scope_guard([pid] TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); );
std::string name;
std::string wait_string;
Service* service = nullptr;
if (SubcontextChildReap(pid)) // 处理Subcontext进程退出,非正在关机中会重启该进程
name = "Subcontext";
else
// 判断该进程是否是某个服务,比如surfaceflinger
service = ServiceList::GetInstance().FindService(pid, &Service::pid);
if (service) // 服务存在
name = StringPrintf("Service '%s' (pid %d)", service->name().c_str(), pid);
if (service->flags() & SVC_EXEC) // 通过'exec' or 'exec_start' 启动的可执行程序进程
auto exec_duration = boot_clock::now() - service->time_started();
auto exec_duration_ms =
std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration).count();
wait_string = StringPrintf(" waiting took %f seconds", exec_duration_ms / 1000.0f);
else if (service->flags() & SVC_ONESHOT) // 一次性的服务
auto exec_duration = boot_clock::now() - service->time_started();
auto exec_duration_ms =
std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration)
.count();
wait_string = StringPrintf(" oneshot service took %f seconds in background",
exec_duration_ms / 1000.0f);
else
name = StringPrintf("Untracked pid %d", pid); // 非服务进程,未追踪的进程退出
if (siginfo.si_code == CLD_EXITED) // 进程 exit
LOG(INFO) << name << " exited with status " << siginfo.si_status << wait_string;
else // 进程被 kill
LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
if (!service) return pid;
service->Reap(siginfo); // 调用Reap,做清理工作,并重启非oneshot的服务
if (service->flags() & SVC_TEMPORARY) // 通过'exec' 启动的服务
ServiceList::GetInstance().RemoveService(*service);
return pid;
Service::Reap
- kill进程组所有进程
- 清理所有socket资源相关文件
- 回调reap_callbacks_,比如之前设置的启动失败回调
- critical服务在窗口时间(4分钟)内或开机完成前退出超过4次,则重启到bootloader
- 标记服务SVC_RESTARTING,在HandleProcessActions中重启服务
- 执行onrestart命令
- 通知服务状态改变
/// @system/core/init/service.cpp
void Service::Reap(const siginfo_t& siginfo)
if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) // 不是一次性的或者需要重启的
KillProcessGroup(SIGKILL, false); // 服务死亡,杀死其进程组所有进程, 第二个参数表示是否report_oneshot
else
// Legacy behavior from ~2007 until Android R: this else branch did not exist and we did not
// kill the process group in this case.
if (SelinuxGetVendorAndroidVersion() >= __ANDROID_API_R__) // 杀死oneshot服务的进程组
// The new behavior in Android R is to kill these process groups in all cases. The
// 'true' parameter instructions KillProcessGroup() to report a warning message where it
// detects a difference in behavior has occurred.
KillProcessGroup(SIGKILL, true);
// Remove any socket resources we may have created.
for (const auto& socket : sockets_) // 清理该服务创建的socket 路径文件
auto path = ANDROID_SOCKET_DIR "/" + socket.name;
unlink(path.c_str());
for (const auto& f : reap_callbacks_) // 执行通过 AddReapCallback 添加的reap操作的回调
f(siginfo);
if ((siginfo.si_code != CLD_EXITED || siginfo.si_status != 0) && on_failure_reboot_target_)
LOG(ERROR) << "Service with 'reboot_on_failure' option failed, shutting down system.";
trigger_shutdown(*on_failure_reboot_target_);// 带有reboot_on_failure选项的服务,非正常退出则会触发关机
if (flags_ & SVC_EXEC) UnSetExec(); // 重置 is_exec_service_running_ flag
if (flags_ & SVC_TEMPORARY) return; // 临时oneshot服务,返回
pid_ = 0;
flags_ &= (~SVC_RUNNING);
start_order_ = 0;
// Oneshot processes go into the disabled state on exit,
// except when manually restarted.
// 标记为 SVC_RESTART 的,是需要重启服务的。在StopOrReset函数先kill进程,然后标记为SVC_RESTART,到回收后则进行重启
if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET))
flags_ |= SVC_DISABLED; // oneshot服务置disabled状态
// Disabled and reset processes do not get restarted automatically.
if (flags_ & (SVC_DISABLED | SVC_RESET)) // disabled 和 reset 状态服务不重启
NotifyStateChange("stopped");
return;
#if INIT_FULL_SOURCES
static bool is_apex_updatable = android::sysprop::ApexProperties::updatable().value_or(false);
#else
static bool is_apex_updatable = false;
#endif
const bool is_process_updatable = !use_bootstrap_ns_ && is_apex_updatable;
// If we crash > 4 times in 'fatal_crash_window_' minutes or before boot_completed,
// reboot into bootloader or set crashing property
boot_clock::time_point now = boot_clock::now();
// critica或可更新(如apex) 并且 服务未标记要重启
if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART))
bool boot_completed = GetBoolProperty("sys.boot_completed", false);
if (now < time_crashed_ + fatal_crash_window_ || !boot_completed) // 在窗口时间内 或 开机流程未完成
if (++crash_count_ > 4)
auto exit_reason = boot_completed ?
"in " + std::to_string(fatal_crash_window_.count()) + " minutes" :
"before boot completed";
if (flags_ & SVC_CRITICAL) // critical 服务在窗口时间(4分钟内)或开机完成前 crash超过4次,则会重启到 bootloader
if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false))
// Aborts into 'atal_reboot_target_'.
SetFatalRebootTarget(fatal_reboot_target_);
LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
<< exit_reason;
else // 非 critical 服务只有一个打印,然后记录到属性
LOG(ERROR) << "process with updatable components '" << name_
<< "' exited 4 times " << exit_reason;
// Notifies update_verifier and apexd
SetProperty("sys.init.updatable_crashing_process_name", name_);
SetProperty("sys.init.updatable_crashing", "1");
else // 重新记录时间和次数
time_crashed_ = now;
crash_count_ = 1;
flags_ &= (~SVC_RESTART);
flags_ |= SVC_RESTARTING; // 注意此处标记,是服务重启的关键
// Execute all onrestart commands for this service.
onrestart_.ExecuteAllCommands(); // 执行所有 onrestart 命令, 在rc里面配置的
NotifyStateChange("restarting");
return;
Service::KillProcessGroup
void Service::KillProcessGroup(int signal, bool report_oneshot)
// If we've already seen a successful result from killProcessGroup*(), then we have removed
// the cgroup already and calling these functions a second time will simply result in an error.
// This is true regardless of which signal was sent.
// These functions handle their own logging, so no additional logging is needed.
if (!process_cgroup_empty_)
LOG(INFO) << "Sending signal " << signal << " to service '" << name_ << "' (pid " << pid_
<< ") process group...";
int max_processes = 0;
int r;
if (signal == SIGTERM)
r = killProcessGroupOnce(proc_attr_.uid, pid_, signal, &max_processes);
else
r = killProcessGroup(proc_attr_.uid, pid_, signal, &max_processes);
if (report_oneshot && max_processes > 0)
LOG(WARNING)
<< "Killed " << max_processes
<< " additional processes from a oneshot process group for service '" << name_
<< "'. This is new behavior, previously child processes would not be killed in "
"this case.";
if (r == 0) process_cgroup_empty_ = true;
if (oom_score_adjust_ != DEFAULT_OOM_SCORE_ADJUST)
LmkdUnregister(name_, pid_); // 从lmkd移除进程信息
上面两个killProcessGroup实现如下:
/// @system/core/libprocessgroup/processgroup.cpp
int killProcessGroup(uid_t uid, int initialPid, int signal, int* max_processes)
// 内部调用DoKillProcessGroupOnce去kill进程组
return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/, max_processes);
int killProcessGroupOnce(uid_t uid, int initialPid, int signal, int* max_processes)
return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/, max_processes);
关于cgroup配置可参见 cgroups.json
/// @system/core/libprocessgroup/profiles/cgroups.json
"Cgroups": [
"Controller": "blkio",
"Path": "/dev/blkio",
"Mode": "0755",
"UID": "system",
"GID": "system"
,
"Controller": "cpu",
"Path": "/dev/cpuctl",
"Mode": "0755",
"UID": "system",
"GID": "system"
,
"Controller": "cpuset",
"Path": "/dev/cpuset",
"Mode": "0755",
"UID": "system",
"GID": "system"
,
"Controller": "memory",
"Path": "/dev/memcg",
"Mode": "0700",
Process用法与进程详解