远程节点保持进程活着
Posted
技术标签:
【中文标题】远程节点保持进程活着【英文标题】:Remote nodes keeping processes alive 【发布时间】:2011-08-08 05:58:00 【问题描述】:Quis custodiet ipsos custodes? -- (Decimus Iunius Iuvenalis)
我有以下设置:
在一个节点('one@erlang.enzo')上,一个服务器进程正在运行,它有一个运行在另一个节点('two@erlang.enzo')上的看门狗。当服务器启动时,它将在远程节点上启动它的看门狗。当服务器异常退出时,看门狗会再次启动服务器。当看门狗退出时,服务器再次启动它。
服务器在网络启动后作为运行级别的一部分启动。
服务器还监视远程节点,并在它(即节点)上线后立即启动看门狗。现在服务器和看门狗之间的连接丢失可能有两个原因:首先,网络可能会出现故障;其次,节点可能会崩溃或被杀死。
我的代码似乎可以工作,但我有点怀疑正在发生以下情况:
当看门狗节点关闭(或杀死或崩溃)并重新启动时,服务器会正确地重新启动其看门狗。 但是当网络出现故障并且看门狗节点继续运行时,服务器会在重新建立连接时启动一个新的看门狗,并留下一个僵尸看门狗。我的问题是:
(A) 我要创建僵尸吗? (B) 在网络丢失的情况下,服务器如何检查看门狗是否还活着(反之亦然)? (C) 如果B可以,如何重新连接旧服务器和旧看门狗? (D) 尊敬的读者,您在我的设置中发现了哪些其他主要(和次要)缺陷?编辑:die
和 kill_dog
消息用于伪造不正常的退出,不会超出调试范围。
代码如下:
-module (watchdog).
-compile (export_all).
%% @doc Entry point of the watchdog process.
%% Prints the name of the node it is running on, turns on exit trapping so
%% that exit signals are delivered as 'EXIT' messages, and then enters the
%% receive loop.
init() ->
    Here = node(),
    io:format("Watchdog: Starting @ ~p.~n", [Here]),
    process_flag(trap_exit, true),
    loop().
%% @doc Watchdog receive loop.
%%
%% Handled messages:
%%   die                  - debugging aid; forces an abnormal exit (badarith).
%%   {'EXIT', _, normal}  - a linked process exited normally; log it.
%%   {'EXIT', _, _}       - a linked process exited abnormally; start the
%%                          server again on 'one@erlang.enzo'.
%%   anything else        - ignored, keep looping.
%%
%% Note that both 'EXIT' clauses return without recursing, so the watchdog
%% process ends after handling either of them.
loop() ->
    receive
        die ->
            1 / 0;
        {'EXIT', _, normal} ->
            io:format("Watchdog: Server shut down.~n");
        {'EXIT', _, _} ->
            io:format("Watchdog: Restarting server.~n"),
            spawn('one@erlang.enzo', server, start, []);
        _ ->
            loop()
    end.
-module (server).
-compile (export_all).
%% @doc Starts the server: spawns a process running init/0 and registers it
%% under the local name `server'.
start() ->
    io:format("Server: Starting up.~n"),
    Pid = spawn(fun init/0),
    register(server, Pid).
%% @doc Asks the process registered as `server' to shut down by sending it
%% the `stop' message.
stop() ->
    ServerPid = whereis(server),
    ServerPid ! stop.
%% @doc Body of the server process.
%% Enables exit trapping, starts monitoring the watchdog node
%% 'two@erlang.enzo', and enters the main loop with status `down' and no
%% watchdog yet.
init() ->
    process_flag(trap_exit, true),
    monitor_node('two@erlang.enzo', true),
    InitialStatus = down,
    NoWatchdog = none,
    loop(InitialStatus, NoWatchdog).
%% @doc Server main loop.
%%
%% Status   - `up' or `down': whether the watchdog node is considered reachable.
%% Watchdog - pid of the current watchdog, or `none'.
%%
%% Each iteration computes a `{NewStatus, NewWatchdog}' pair from one received
%% message (or from the 2 second timeout) and recurses with it, unless the new
%% status is `stop', in which case the loop returns `ok'.
loop(Status, Watchdog) ->
    {NewStatus, NewWatchdog} =
        receive
            %% Debugging aid: force an abnormal exit of the server.
            die ->
                1 / 0;
            stop ->
                {stop, none};
            %% Debugging aid: make the watchdog exit abnormally.
            kill_dog ->
                Watchdog ! die,
                {Status, Watchdog};
            {nodedown, 'two@erlang.enzo'} ->
                io:format("Server: Watchdog node has gone down.~n"),
                {down, Watchdog};
            %% Connection to the watchdog's node was lost; do not respawn
            %% here, the timeout branch handles the node coming back.
            {'EXIT', Watchdog, noconnection} ->
                {Status, Watchdog};
            {'EXIT', Watchdog, Reason} ->
                io:format("Server: Watchdog has died of ~p.~n", [Reason]),
                {Status, spawn_link('two@erlang.enzo', watchdog, init, [])};
            _ ->
                {Status, Watchdog}
        after 2000 ->
            %% Nothing arrived for 2 seconds: while the node is down, probe
            %% for it; otherwise carry on unchanged.
            case Status of
                down -> checkNode();
                up -> {up, Watchdog}
            end
        end,
    case NewStatus of
        stop -> ok;
        _ -> loop(NewStatus, NewWatchdog)
    end.
%% @doc Probes for the watchdog node and, if it is connected, starts a
%% watchdog on it.
%%
%% Returns `{down, none}' while 'two@erlang.enzo' is not among the connected
%% nodes, otherwise `{up, Watchdog}' where Watchdog is the pid of a freshly
%% spawned, linked watchdog process on that node.
checkNode() ->
    net_adm:world(),
    case lists:member('two@erlang.enzo', nodes()) of
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none};
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            %% Ask for a new nodedown notification for this node.
            monitor_node('two@erlang.enzo', true),
            Watchdog = spawn_link('two@erlang.enzo', watchdog, init, []),
            {up, Watchdog}
    end.
【问题讨论】:
【参考方案1】:使用global
模块注册看门狗应该可以避免你的担心:
watchdog.erl:
-module (watchdog).
-compile (export_all).
%% @doc Entry point of the watchdog process.
%% Prints the node it runs on, enables exit trapping, registers itself
%% globally under the name `watchdog', and enters the receive loop.
init() ->
    Here = node(),
    io:format("Watchdog: Starting @ ~p.~n", [Here]),
    process_flag(trap_exit, true),
    Self = self(),
    global:register_name(watchdog, Self),
    loop().
%% @doc Watchdog receive loop.
%%
%% Handled messages:
%%   die                  - debugging aid; forces an abnormal exit (badarith).
%%   {'EXIT', _, normal}  - a linked process exited normally; log it.
%%   {'EXIT', _, _}       - a linked process exited abnormally; start the
%%                          server again on 'one@erlang.enzo'.
%%   anything else        - ignored, keep looping.
%%
%% Both 'EXIT' clauses return without recursing, so the watchdog process ends
%% after handling either of them.
loop() ->
    receive
        die ->
            1 / 0;
        {'EXIT', _, normal} ->
            io:format("Watchdog: Server shut down.~n");
        {'EXIT', _, _} ->
            io:format("Watchdog: Restarting server.~n"),
            spawn('one@erlang.enzo', server, start, []);
        _ ->
            loop()
    end.
server.erl:
%% @doc Probes for the watchdog node and makes sure a watchdog is running
%% on it.
%%
%% Returns `{down, none}' while 'two@erlang.enzo' is not among the connected
%% nodes. Otherwise returns `{up, Watchdog}', where Watchdog is either the
%% process globally registered as `watchdog' (re-linked to the caller) or,
%% if there is none, a freshly spawned and linked watchdog on that node.
checkNode() ->
    net_adm:world(),
    case lists:member('two@erlang.enzo', nodes()) of
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none};
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            global:sync(), %% not sure if this is necessary
            Watchdog =
                case global:whereis_name(watchdog) of
                    undefined ->
                        io:format("Watchdog process is dead"),
                        spawn_link('two@erlang.enzo', watchdog, init, []);
                    Pid ->
                        io:format("Watchdog process is still alive"),
                        %% A lost connection removes the link (both sides get
                        %% an exit signal with reason noconnection), so link
                        %% again to keep receiving 'EXIT' for this watchdog.
                        link(Pid),
                        Pid
                end,
            {up, Watchdog}
    end.
【讨论】:
非常感谢。当看门狗还活着时,我不需要调用link/1
吗?或者进程在各自收到{'EXIT', Pid, noconnection}
后仍然保持链接?
我不确定,实际上(到目前为止没有使用分布式 Erlang)。以上是关于远程节点保持进程活着的主要内容,如果未能解决你的问题,请参考以下文章
在远程机器上启动应用程序并保持运行,powershell 脚本不应等待进程完成