ceph mon选举流程
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ceph mon选举流程相关的知识,希望对你有一定的参考价值。
本章主要介绍mon选举的过程。
选举过程
整个选举过程,在Elector类中实现。此类之中实现了一个election_epoch:
当这个election_epoch为偶数的时候,表示处于稳定状态,为奇数的时候,表示还在选举过程中,mon leader 还未定。
以下是自己环境的mon情况,当前 "election_epoch": 120,说明目前处于稳定状态,没有在选举。
[root@ceph1 ~]# ceph quorum_status | json_reformat
{
"election_epoch": 120,
"quorum": [
0,
1,
2
],
"quorum_names": [
"ceph2",
"ceph1",
"ceph3"
],
"quorum_leader_name": "ceph2",
"monmap": {
"epoch": 1,
"fsid": "5cd6b9b7-2ebd-47b1-a1d2-7362bb077fa3",
"modified": "2020-08-04 19:55:21.613792",
"created": "2020-08-04 19:55:21.613792",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": [
]
},
"mons": [
{
"rank": 0,
"name": "ceph2",
"addr": "192.168.111.14:6789/0",
"public_addr": "192.168.111.14:6789/0"
},
{
"rank": 1,
"name": "ceph1",
"addr": "192.168.111.20:6789/0",
"public_addr": "192.168.111.20:6789/0"
},
{
"rank": 2,
"name": "ceph3",
"addr": "192.168.111.34:6789/0",
"public_addr": "192.168.111.34:6789/0"
}
]
}
}
[root@ceph1 ~]#
选举过程关键变量
a.acked_me
/**
* Set containing all those that acked our proposal to become the Leader.
*
* If we are acked by everyone in the MonMap, we will declare
* victory. Also note each peer‘s feature set.
*/
//发起选举本端,经过选举,其他所有mon 返回ack消息,承认选自己为leader
map<int, elector_info_t> acked_me;
b.leader_acked
/**
* Indicates who we have acked
*/
本端已经acked,承认别人是leader
int leader_acked;
/*
默认值-1,代表从未承认别人是leader。
如果defer 了mon.1,那么leader_acked=1.
Elector::defer(int who)函数中进行赋值
*/
c.electing_me
/**
* Indicates if we are the ones being elected.
*
* We always attempt to be the one being elected if we are the ones starting
* the election. If we are not the ones that started it, we will only attempt
* to be elected if we think we might have a chance (i.e., the other guy‘s
* rank is lower than ours).
*/
是否选取本端作为leader,选举发起端一发起,该值就置位true
bool electing_me;
d.epoch
/**
* Latest epoch we‘ve seen.
*
* @remarks if its value is odd, we‘re electing; if it‘s even, then we‘re
* stable.
*/
//选举的版本,奇数为选举状态,偶数为稳定状态
epoch_t epoch;
以下开始介绍选举过程:
选举整体流程图
阶段1:mon端发起选举
注意,这里leader端和peon端都可以发起选举。
void Monitor::start_election()
{
dout(10) << "start_election" << dendl;
wait_for_paxos_write();
_reset();
state = STATE_ELECTING;//设置状态
logger->inc(l_mon_num_elections);
logger->inc(l_mon_election_call);
clog->info() << "mon." << name << " calling monitor election";
elector.call_election();
}
void call_election() {
start();
}
void Elector::start()
{
dout(5) << "start -- can i be leader?" << dendl;
acked_me.clear();
init();
// start by trying to elect me /*从稳定态进入选举态,需要将版本号从偶数往上抬,抬成奇数*/
if (epoch % 2 == 0) {
bump_epoch(epoch+1); // odd == election cycle
} else {
// do a trivial db write just to ensure it is writeable.
auto t(std::make_shared<MonitorDBStore::Transaction>());
t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
int r = mon->store->apply_transaction(t);
assert(r >= 0);
}
start_stamp = ceph_clock_now();
electing_me = true;
acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL;
acked_me[mon->rank].mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&acked_me[mon->rank].metadata);
leader_acked = -1;
// bcast to everyone else /*向每一个成员广播消息,提议开始重新选举*/
for (unsigned i=0; i<mon->monmap->size(); ++i) {
if ((int)i == mon->rank) continue;
MMonElection *m =
new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);// OP_PROPOSE
m->mon_features = ceph::features::mon::get_supported();
mon->messenger->send_message(m, mon->monmap->get_inst(i)); //发送选举信号
}
/*
*注意,如果所有的人都承认自己leader地位,那么可以宣布获胜。但是有些情况下,无法等到所有的回应。
*比如某个ceph-mon进程已经不在了,是不可能得到其承认的。为了防止出现这种情况下,在通知其他节点
*选自己的start函数设置了定时器“
*/
reset_timer();
}
当选举超时后,会进入expire函数。
void Elector::expire()
{
dout(5) << "election timer expired" << dendl;
// did i win? 从这里可以看到只要有超过一半的monitor 回复,仍然认为当前节点获胜为leader节点,否则就调用mon->bootstrap() 重新开始选举
if (electing_me &&
acked_me.size() > (unsigned)(mon->monmap->size() / 2)) {
// i win
victory();
} else {
// whoever i deferred to didn‘t declare victory quickly enough.
if (mon->has_ever_joined)
start();
else
mon->bootstrap();
}
}
阶段2:选举消息调度
消息发送后,会根据不同的消息类型,经过以下流程进行处理。注意这里一次选举过程,可能要经过N次以下处理流程。
bool ms_dispatch(Message *m) ->Monitor::_ms_dispatch(Message *m)-> Monitor::dispatch_op -> Elector::dispatch(MonOpRequestRef op) ->handle_propose| handle_ack|handle_victory|handle_nak
void Elector::dispatch(MonOpRequestRef op)
{
//选举的地址如果是非monmap中的地址,那么就退出
if (!mon->monmap->contains(em->get_source_addr())) {
dout(1) << "discarding election message: " << em->get_source_addr()
<< " not in my monmap " << *mon->monmap << dendl;
return;
}
...
switch (em->op) {
case MMonElection::OP_PROPOSE://发起选举提议消息
handle_propose(op);
return;
}
if (em->epoch < epoch) {
dout(5) << "old epoch, dropping" << dendl;
break;
}
switch (em->op) {
case MMonElection::OP_ACK://返回ack消息到发起端
handle_ack(op);
return;
case MMonElection::OP_VICTORY://选举成功,leader端发起消息
handle_victory(op);//peon端进行处理
return;
case MMonElection::OP_NAK:
handle_nak(op);
return;
}
阶段3:非发起端处理
//处理发起propose的mon端发过来的消息。
void Elector::handle_propose(MonOpRequestRef op)
{
op->mark_event("elector:handle_propose");
MMonElection *m = static_cast<MMonElection*>(op->get_req());
dout(5) << "handle_propose from " << m->get_source() << dendl;
int from = m->get_source().num();
assert(m->epoch % 2 == 1); // election
uint64_t required_features = mon->get_required_features();
mon_feature_t required_mon_features = mon->get_required_mon_features();
dout(10) << __func__ << " required features " << required_features
<< " " << required_mon_features
<< ", peer features " << m->get_connection()->get_features()
<< " " << m->mon_features
<< dendl;
if ((required_features ^ m->get_connection()->get_features()) &
required_features) {
dout(5) << " ignoring propose from mon" << from
<< " without required features" << dendl;
nak_old_peer(op);
return;
} else if (!m->mon_features.contains_all(required_mon_features)) {
// all the features in ‘required_mon_features‘ not in ‘m->mon_features‘
mon_feature_t missing = required_mon_features.diff(m->mon_features);
dout(5) << " ignoring propose from mon." << from
<< " without required mon_features " << missing
<< dendl;
nak_old_peer(op);
} else if (m->epoch > epoch) {
bump_epoch(m->epoch);//正常选举流程
} else if (m->epoch < epoch) {//说明是较老的propose
// got an "old" propose,
if (epoch % 2 == 0 && // in a non-election cycle
mon->quorum.count(from) == 0) { // from someone outside the quorum
// a mon just started up, call a new election so they can rejoin!
dout(5) << " got propose from old epoch, quorum is " << mon->quorum
<< ", " << m->get_source() << " must have just started" << dendl;
// we may be active; make sure we reset things in the monitor appropriately.
mon->start_election();
} else {
dout(5) << " ignoring old propose" << dendl;
return;
}
}
if (mon->rank < from) {//如果从未收到过更强者(rank更小者)发来的选举请求,调用start_election,给所有成员发消息,让他们选自己为mon leader
// i would win over them.
if (leader_acked >= 0) { // we already acked someone 自己曾经认过怂,消息来源的mon还不如自己,直接不理
assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
/*注意,electing_me记录了自己是否发出过选我为leader的请求
*如果先后收到两个弱小者发来的选举请求,处理第一个的时候,本节点已经发出了选自己当leader的请求,
*当第二个弱者消息到来的时候,没必要再发送选自己当leader的请求*/
if (!electing_me) {
mon->start_election();
}
}
} else {//赞同其他人做leader
// they would win over me
if (leader_acked < 0 || // haven‘t acked anyone yet, or /*从未承认过别人,从未认过怂*/
leader_acked > from || // they would win over who you did ack, or /*虽然承认过别人,认过怂,无奈这次来的更强大,所以还是得认怂,承认它*/
leader_acked == from) { // this is the guy we‘re already deferring to
defer(from); /*defer函数的作用是认可对方可以当leader*/
} else {//leader_acked < from
// ignore them! /*曾经认可过更强者,不可能向不够强的mon发送认可,不理*/
dout(5) << "no, we already acked " << leader_acked << dendl;
}
}
}
void Elector::bump_epoch(epoch_t e)
{
dout(10) << "bump_epoch " << epoch << " to " << e << dendl;
assert(epoch <= e);
epoch = e;//epoch 奇数
auto t(std::make_shared<MonitorDBStore::Transaction>());
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
mon->join_election();
// clear up some state
electing_me = false;//重置选举状态
acked_me.clear();//重置选举状态
}
上述handle_propose函数是判断选举条件的核心。已经注释了关键的信息。
大家可能会发现这么个现象,mon发起选举,有时所有mon中都会有选举自己的信息(见下),有时候部分日志中有,部分日志中没有。这个其实和发起选举的序号大小有关。
mon.ceph2 calling monitor election
- 本端mon序号 < mon发起端,才有可能选举自己;
- 本端mon序号 > mon发起端,不可能选举自己;
if (mon->rank < from) {
// i would win over them.
if (leader_acked >= 0) {//说明认怂过,此时也不可能选举自己
assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
if (!electing_me) {
mon->start_election();//本端发起选举
}
}
}
阶段3:非发起端向发起端认怂
此阶段,关键处理函数为defer(),该函数可能被调用多次。如果本端mon defer过序号更小的mon,当再来次小的mon,就不再调用defer() 了。
if (mon->rank < from) {
...
} else {//赞同其他人做leader
// they would win over me
if (leader_acked < 0 || // haven‘t acked anyone yet, or /*从未承认过别人,从未认过怂*/
leader_acked > from || // they would win over who you did ack, or /*虽然承认过别人,认过怂,无奈这次来的更强大,所以还是得认怂,承认它*/
leader_acked == from) { // this is the guy we‘re already deferring to
defer(from); /*defer函数的作用是认可对方可以当leader*/
} else {//leader_acked < from
// ignore them! /*曾经认可过更强者,不可能向不够强的mon发送认可,不理*/
dout(5) << "no, we already acked " << leader_acked << dendl;
}
void Elector::defer(int who)
{
dout(5) << "defer to " << who << dendl;
if (electing_me) { /*注意,认怂就要清零,即使自己曾经要求别人选过自己*/
// drop out
acked_me.clear();
electing_me = false;
}
// ack them
leader_acked = who;
ack_stamp = ceph_clock_now();
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);// OP_ACK /*发送OP_ACK承认对方可以当leader*/
m->mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&m->metadata);
// This field is unused completely in luminous, but jewel uses it to
// determine whether we are a dumpling mon due to some crufty old
// code. It only needs to see this buffer non-empty, so put
// something useless there.
m->sharing_bl = mon->get_local_commands_bl(mon->get_required_mon_features());
mon->messenger->send_message(m, mon->monmap->get_inst(who));//发送消息
// set a timer
reset_timer(1.0); // give the leader some extra time to declare victory
}
阶段4:发起端处理非发起端的ack消息
基于阶段3,如果非发起端认怂后,会发送OP_ACK消息,那么发起端要进行对应消息的处理。主要处理函数为 Elector::handle_ack(),一般发生在leader端(临时优先级更高的mon端)。
void Elector::handle_ack(MonOpRequestRef op)
{
op->mark_event("elector:handle_ack");
MMonElection *m = static_cast<MMonElection*>(op->get_req());
dout(5) << "handle_ack from " << m->get_source() << dendl;
int from = m->get_source().num();
assert(m->epoch % 2 == 1); // election
assert(m->epoch == epoch);
if (electing_me) {
// thanks
acked_me[from].cluster_features = m->get_connection()->get_features();
acked_me[from].mon_features = m->mon_features;
acked_me[from].metadata = m->metadata;
dout(5) << " so far i have {";
for (map<int, elector_info_t>::const_iterator p = acked_me.begin();
p != acked_me.end();
++p) {
if (p != acked_me.begin())
*_dout << ",";
*_dout << " mon." << p->first << ":"
<< " features " << p->second.cluster_features
<< " " << p->second.mon_features;
}
*_dout << " }" << dendl;
// is that _everyone_? /*如果所有成员都承认了自己的leader地位,那么宣布获胜,调用victory*/
/*
*注意,如果所有的人都承认自己leader地位,那么可以宣布获胜。但是有些情况下,无法等到所有的回应。
*比如某个ceph-mon进程已经不在了,是不可能得到其承认的。为了防止出现这种情况下,在通知其他节点
*选自己的start函数设置了定时器“
*/
if (acked_me.size() == mon->monmap->size()) {//如果所有的mon端都进行了ack,说明都defer了自己,自己成为了leader
// if yes, shortcut to election finish
victory();
}
} else {
// ignore, i‘m deferring already.
assert(leader_acked >= 0);
}
}
当发起端接收到所有mon的ack信息后,开始调用victory()函数,说明本端成为了leader。继续分析
void Elector::victory()
{
leader_acked = -1;//已经选举成功了,那么把相关标记信息复位
electing_me = false;//同上
cancel_timer();
assert(epoch % 2 == 1); // election
bump_epoch(epoch+1); // is over! 这里epoch变为偶数,稳定状态。
// tell everyone!
for (set<int>::iterator p = quorum.begin();
p != quorum.end();
++p) {
if (*p == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,/*注意这里会发送OP_VICTORY 消息来帮自己设置为leader节点*/
mon->monmap);
m->quorum = quorum;
m->quorum_features = cluster_features;
m->mon_features = mon_features;
m->sharing_bl = mon->get_local_commands_bl(mon_features);
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
}
// tell monitor
mon->win_election(epoch, quorum,
cluster_features, mon_features, metadata);
}
void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
const mon_feature_t& mon_features,
const map<int,Metadata>& metadata)
{
assert(is_electing());
state = STATE_LEADER;// 修改选举状态
clog->info() << "mon." << name << " is new leader, mons " << get_quorum_names()
<< " in quorum (ranks " << quorum << ")";
}
paxos->leader_init();//函数里paoxs状态变为, state=STATE_RECOVERING,开始进行数据恢复操作
monmon()->election_finished();
_finish_svc_election();
health_monitor->start(epoch);
阶段5:选举失败端(peon)处理OP_VICTORY消息
//OP_VICTORY消息对应的处理函数
void Elector::handle_victory(MonOpRequestRef op)
{
op->mark_event("elector:handle_victory");
MMonElection *m = static_cast<MMonElection*>(op->get_req());
dout(5) << "handle_victory from " << m->get_source()
<< " quorum_features " << m->quorum_features
<< " " << m->mon_features
<< dendl;
int from = m->get_source().num();
assert(from < mon->rank);// 已经选举成功了,远端的rank自然是小的
assert(m->epoch % 2 == 0); //victory 函数中已经对epoch+1 了
leader_acked = -1;
bump_epoch(m->epoch);//设置本端的epoch以及重置选举状态。此时非leader端mon也进入稳定状态
// they win 输掉选举的话,把自己设置为peon节点
mon->lose_election(epoch, m->quorum, from,
m->quorum_features, m->mon_features);
// cancel my timer
cancel_timer();
// stash leader‘s commands
assert(m->sharing_bl.length());
vector<MonCommand> new_cmds;
bufferlist::iterator bi = m->sharing_bl.begin();
MonCommand::decode_vector(new_cmds, bi);
mon->set_leader_commands(new_cmds);
}
void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
uint64_t features,
const mon_feature_t& mon_features)
{
dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader
<< " quorum is " << quorum << " features are " << quorum_con_features
<< " mon_features are " << quorum_mon_features
<< dendl;
state = STATE_PEON;//设置peon状态
paxos->peon_init();
_finish_svc_election();
health_monitor->start(epoch);
}
void Paxos::peon_init()
{
cancel_events();
new_value.clear();
state = STATE_RECOVERING;// peon 端的paxos也进入STATE_RECOVERING状态
}
以上是关于ceph mon选举流程的主要内容,如果未能解决你的问题,请参考以下文章