pg是ceph中比较抽象的一个概念,且起到了一个承上启下的作用,客户端的对象映射到pg,而pg映射到具体的osd,pg的peering和恢复是ceph中比较复杂的一部分了。因为pg的创建过程会经历pg peering和恢复的所有过程,因此我们可以从创建pg的流程来分析pg的peering和恢复。
pg的创建请求类型是MSG_OSD_PG_CREATE,到达osd后,osd中的_dispatch会继续调用dispatch_op来处理这个请求,最终会到达handle_pg_create函数
const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req()); //get_req()获取Message *request;
require_same_or_newer_map(op, m->epoch, false)//检查osd保存的osdmap是否小于m->epoch,如果小于则需要重新获取
if (epoch > osdmap->get_epoch())
wait_for_new_map(op);
if (waiting_for_osdmap.empty())
osdmap_subscribe(osdmap->get_epoch() + 1, false);
waiting_for_osdmap.push_back(op);
op->mark_delayed("wait for new map");
op->mark_started();
mark_flag_point(flag_started, "started");
for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin(); p != m->mkpg.end(); ++p) //mkpg由OSDMonitor.cc中插入,类型为map<pg_t,pg_create_t>
epoch_t created = p->second.created; //monitor创建pg时所在的版本
pg_t on = p->first; //要创建的pg
osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
int role = osdmap->calc_pg_role(whoami, acting, acting.size()); //第一个为主osd,其他为副本
if (acting_primary != whoami) //本osd不是act primary就跳过 ????????????
continue;
build_initial_pg_history(pgid, created, ci->second, &history, &pi);//建立从pg create到现在epoch的interval
enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared<PGPeeringEvent>(osdmap->get_epoch(), osdmap->get_epoch(), NullEvt(), true, new PGCreateInfo(pgid, osdmap->get_epoch(), history, pi, true))));
handle_pg_create实现如下:
(1)利用mark_started标记该op时间点到达“started”。
(2)对于每个要创建的pg,调用pg_to_up_acting_osds函数来计算当前版本osdmap中属于该pg的acting和up集合,其中up集合保存了由crush算法得到的osd集合,acting集合保存了当up集合的主osd暂时不能充当主osd时,临时选择的另一个osd集合。
(3)对于每个要创建的pg,调用build_initial_pg_history来初始化pg create时到现在这段期间的past_interval。
(4)调用enqueue_peering_evt,将pg相关的信息enqueue到op_shardedwq,op_shardedwq是一个osd中的线程池,负责处理osd中的事务。
pg_to_up_acting_osds函数如下:
_pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); //raw_pg_to_pg默认为True
const pg_pool_t *pool = get_pg_pool(pg.pool()); //从OSDMap中获取pool的数据结构
_get_temp_osds(*pool, pg, &_acting, &_acting_primary); //获取pg对应的temp pg和temp pg集合中的主osd
pg = pool.raw_pg_to_pg(pg);
pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
/*
struct pg_t {
uint64_t m_pool;
uint32_t m_seed;
ps()函数返回return m_seed;
ceph_stable_mod:map a raw pg (with full precision ps) into an actual pg, for storage
*/
const auto p = pg_temp->find(pg); //ceph::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild),pg_temp保存了pg对应的临时osd
/*
假设一个PG通过crush算法映射到的三个osd是[0,1,2],此时,如果osd0出现故障,
导致crush算法重新分配该PG的三个osd是[3,1,2],此时,osd3为该PG的主osd,但是osd3为新加入的osd,
并不能负担该PG上的读写操作。所以PG此时向monitor申请一个该pg的临时的能负责io的osd列表,比如这个临时列表为[1,3,2]
*/
temp_pg->clear();
if (p != pg_temp->end())
for (unsigned i=0; i<p->second.size(); i++)
temp_pg->push_back(p->second[i]);
const auto &pp = primary_temp->find(pg); //ceph::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
if (pp != primary_temp->end())
*temp_primary = pp->second;
if (_acting.empty() || up || up_primary)
_pg_to_raw_osds(*pool, pg, &raw, &pps); //通过Crush算法,得到该PG从属的一组osds,raw保存了通过crush算出的osd集合
ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
unsigned size = pool.get_size(); //副本大小
int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
_remove_nonexistent_osds(pool, *osds);
*ppps = pps;
_apply_upmap(*pool, pg, &raw);
_raw_to_up_osds(*pool, raw, &_up); //获得raw中所有处于up状态的osds列表
if (pool.can_shift_osds()) //如果是replicated pool返回True,EC pool返回false
up->clear();
up->reserve(raw.size());
for (unsigned i=0; i<raw.size(); i++)
if (!exists(raw[i]) || is_down(raw[i]))
/*
exists()
return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
bool is_down(int osd)
return !is_up(osd); //is_up(osd)为 exists(osd) && (osd_state[osd] & CEPH_OSD_UP);而 !exists(osd) || get_weight(osd) == CEPH_OSD_OUT 是is_out()的判断条件
*/
continue;
up->push_back(raw[i]); //将存在且up的osd加入到up中
_up_primary = _pick_primary(_up); //选择up集合中第一个为主osd
_apply_primary_affinity(pps, *pool, &_up, &_up_primary);//重新选择主osd,并将主osd放在_up[0]的位置
...
...
...
if (_acting.empty()) //如果pg temp为空,则acting集合保存up中的osd集合
_acting = _up;
if (_acting_primary == -1)
_acting_primary = _up_primary;
if (up)
up->swap(_up);
if (up_primary)
*up_primary = _up_primary;
if (acting)
acting->swap(_acting);
if (acting_primary)
*acting_primary = _acting_primary;
(1)调用_get_temp_osds来获取该pg的临时osd,并放入acting集合,acting_primary保存了acting集合的主osd。
(2)调用_pg_to_raw_osds函数来获取通过crush计算得到属于pg的osd集合,然后通过_raw_to_up_osds函数将集合中处于up状态的osd加入到up集合中,up_primary保存了up集合中的主osd。
(3)如果pg没有临时osd,则up和acting集合是一样的。
build_initial_pg_history函数的实现如下
h->epoch_created = created;
h->epoch_pool_created = created;
h->same_interval_since = created;
h->same_up_since = created;
h->same_primary_since = created;
h->last_scrub_stamp = created_stamp;
h->last_deep_scrub_stamp = created_stamp;
h->last_clean_scrub_stamp = created_stamp;
OSDMapRef lastmap = service.get_map(created);
lastmap->pg_to_up_acting_osds(pgid.pgid, &up, &up_primary, &acting, &acting_primary); //获取pg create时对应版本osdmap的up act集合
for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e)
OSDMapRef osdmap = service.get_map(e);
osdmap->pg_to_up_acting_osds(pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);//获取这个版本osdmap的up acting set
bool new_interval = PastIntervals::check_new_interval(acting_primary, new_acting_primary, acting, new_acting, up_primary, new_up_primary, up, new_up, h->same_interval_since, h->last_epoch_clean, osdmap, lastmap, pgid.pgid, &min_size_predicate, pi, &debug);
/*
if (is_new_interval( old_acting_primary, new_acting_primary, old_acting, new_acting, old_up_primary, new_up_primary, old_up, new_up, osdmap, lastmap, pgid))
// 判断new和last中的acting up , pool的size, min_size,pgnum等是否相等,如果相等则是同一个interval
pg_interval_t i;
i.first = same_interval_since; //这个interval开始的epoch
i.last = osdmap->get_epoch() - 1; //这个interval结束的epoch
assert(i.first <= i.last);
i.acting = old_acting;
i.up = old_up;
i.primary = old_acting_primary;
i.up_primary = old_up_primary;
for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();++p)
if (*p != CRUSH_ITEM_NONE)
++num_acting;
const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); //将old_acting转换为pg_shard_t的set
if (num_acting && i.primary != -1 && num_acting >= old_pg_pool.min_size && (*could_have_gone_active)(old_acting_shards))
if (lastmap->get_up_thru(i.primary) >= i.first && lastmap->get_up_from(i.primary) <= i.first)
i.maybe_went_rw = true;
} else if (last_epoch_clean >= i.first && last_epoch_clean <= i.last) { //在这个past interval中完成了recovery
i.maybe_went_rw = true;
else
i.maybe_went_rw = false;
i.maybe_went_rw = true; //判断maybe_went_rw是不是true
past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i); //加入到pi中 pi要仔细看一看, 最终变为会将该interval参与的act插入到all_participants, interval插入到intervals
if (first == 0)
first = interval.first; //该段interval的开始
last = interval.last; //该段interval的结束
for (unsigned i = 0; i < interval.acting.size(); ++i)
acting.insert(pg_shard_t(interval.acting[i],ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
all_participants.insert(acting.begin(), acting.end()); //set<pg_shard_t> all_participants;
if (!interval.maybe_went_rw)
return;
intervals.push_back(compact_interval_t{interval.first, interval.last, acting});
*/
if (new_interval)
h->same_interval_since = e;
if (up != new_up)
h->same_up_since = e;
if (acting_primary != new_acting_primary)
h->same_primary_since = e;
up = new_up;
acting = new_acting;
up_primary = new_up_primary;
acting_primary = new_acting_primary;
build_initial_pg_history函数负责构造从pg create到当前时期的past_interval信息
(1)获取期间每个版本osdmap中pg所对应的up acting集合,如果和上一版本的不一样,则说明这个epoch是一个新的interval的开始epoch。
(2)如果是一个新的interval,则将上一个interval的信息加入到past_intervals中,具体是将acting集合加入到all_participants和intervals中。
pg创建相关操作加入到线程池后,最终会调用handle_pg_create_info来处理具体的pg创建,如下
if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon))
return nullptr;
PG::RecoveryCtx rctx = create_context();
PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); //创建一个op,这个op就是创建pg
PG::_init(*rctx.transaction, pgid, pp); //创建三个op,
PGRef pg = _make_pg(startmap, pgid);
pg->ch = store->create_new_collection(pg->coll);
pg->lock(true); //pg加锁
//initialize a newly instantiated pg
//Initialize PG state, as when a PG is initially created, or when it is first instantiated on the current node.
//即init是将前面计算的一系列东西赋值到PG中的属性中
pg->init(role, up, up_primary, acting, acting_primary, info->history, info->past_intervals, false, rctx.transaction);
pg->handle_initialize(&rctx);
pg->handle_activate_map(&rctx);
dispatch_context(rctx, pg.get(), osdmap, nullptr);
(1)如果该osd的pg个数大于最大pg个数就返回。
(2)调用PG::_create和PG::_init创建OP_MKCOLL、OP_COLL_HINT、OP_TOUCH、OP_OMAP_SETKEYS四个op。
(3)调用handle_initialize来抛出Initialize事件,此时PG的状态机处于Initial状态,收到Initialize事件后转移到Reset状态。
(4)调用handle_activate_map来抛出ActMap事件,这个函数的调用栈比较重要,如下:
ActMap evt;
//Reset状态接受到ActMap事件后
recovery_state.handle_event(evt, rctx); //过程同上,最终会调用PG::RecoveryState::Reset::react(const ActMap&)
PG *pg = context< RecoveryMachine >().pg; //如果不是primary osd 则应该发送notify给primary osd,即should_send_notify()返回True
context< RecoveryMachine >().send_notify(pg->get_primary(), pg_notify_t(pg->get_primary().shard, pg->pg_whoami.shard, pg->get_osdmap()->get_epoch(), pg->get_osdmap()->get_epoch(), pg->info), pg->past_intervals);
(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi)); //指向的是RecoveryCtx结构体中的notify_list,类型为 map<int, vector<pair<pg_notify_t, PastIntervals> > >
pg->update_heartbeat_peers();
pg->take_waiters();
return transit< Started >(); //进入Started状态, 运行Started的构造函数,同时Start是Started的子状态,再进入Start的构造函数
context< RecoveryMachine >().log_enter(state_name);
if(pg->is_primary())
post_event(MakePrimary()); //Start接受MakePrimary事件进入Primary状态, Primary的默认子状态为Peering, Peering的子状态为GetInfo
else
post_event(MakeStray()); //Start状态接收到MakeStray事件后进入Stray状态, 进入Stray构造函数
context< RecoveryMachine >().log_enter(state_name); //"Started/Stray"
(1)如果该osd不是主osd,则应该发送notify给主osd。
(2)进入Started状态,因为Start是Started的默认子状态,因此会进入Start状态。
(3)如果是主osd,就抛出MakePrimary事件,否则就抛出MakeStray事件。
处于Start状态的状态机接受MakePrimary事件进入Primary状态, Primary的默认子状态为Peering, Peering的子状态为GetInfo,GetInfo实现如下:
context< RecoveryMachine >().log_enter(state_name); //"Started/Primary/Peering/GetInfo"
pg->check_past_interval_bounds(); //?????????????????????????????
PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
prior_set = pg->build_prior(); // 将up和act集合中的osd加入到probe
PastIntervals::PriorSet::PriorSet
for (unsigned i = 0; i < acting.size(); i++) //将当前的acting up set中的OSD加入到probe列表中
probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
for (unsigned i = 0; i < up.size(); i++)
probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
past_intervals->get_all_participants(ec_pool);
return all_participants; //过去所有interval的osd都会加入到all_participants里, 同时在build_initial_history中,还会将acting集合中的osd加入到all_participants
for (auto &&i: all_probe) //只有现在为up状态的osd才会加入到probe集合里
switch (f(0, i.osd, nullptr))
case UP:
probe.insert(i);
case DOWN,DNE,LOST:
down.insert(i.osd); //DNE状态是获取不到pinfo时标记的,LOST状态是根据lost_at判断的
past_intervals.iterate_mayberw_back_to //判断不小于last_epoch_started的interval是否可以用来修复
for (auto i = intervals.rbegin(); i != intervals.rend(); ++i)
if (i->last < les)
break;
f(i->first, i->acting);
/*
具体判断准则:
(1)如果这个interval的osd依然up,则加入up_now集合
(2)如果是LOST,依然加入到up_now集合
(3)如果是DOWN的,则 candidate_blocked_by[so.osd] = lost_at;且any_down_now = true;
然后调用if (!(*pcontdec)(up_now) && any_down_now)判断存活的osd是否足够用,如果不够用且any_down_now == true,
则pg_down = true,并执行blocked_by.insert(candidate_blocked_by.begin(), candidate_blocked_by.end());(还没明白blocked_by做什么用)。如果pg_down
被设置为true,则即使peer_info_requested为空,也依然不能抛出GotInfo事件,只有在peer_info_requested.empty() && !prior_set.pg_down时
才会抛出GotInfo事件
*/
set_probe_targets(prior.probe);
probe_targets.clear();
for (set<pg_shard_t>::iterator i = probe_set.begin();i != probe_set.end(); ++i)
probe_targets.insert(i->osd);
get_infos(); //将要query的信息插入到query_map中
pg->blocked_by.clear();
for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin(); it != prior_set.probe.end();
context< RecoveryMachine >().send_query(peer, pg_query_t(pg_query_t::INFO, it->shard, pg->pg_whoami.shard, pg->info.history, pg->get_osdmap()->get_epoch()));
(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = query;
peer_info_requested.insert(peer);
pg->blocked_by.insert(peer.osd); //这几行貌似是发送query消息到pg中其他osd,但没看懂是怎么发的,可能是在其他地方发的
if(prior_set.pg_down) //up的osd不够
post_event(IsDown());
else if (peer_info_requested.empty()) //此时很大可能不为空
post_event(GotInfo());
//对于pg创建,以上两个都不满足,因此程序到这里就返回。
(1)调用build_prior函数去构建probe,probe包含了要获取日志信息的osd集合,其包括当前up、acting中的osd,还包括past_interval中的acting集合中且在当前依然up的osd。
(2)如果过去某个interval的acting集合中,当前状态为up或lost的osd数量不足以恢复(默认最小值为1),并且该interval的acting集合中有osd当前状态为down,则设置pg_down为true,并将down的osd加入到blocked_by集合。
(3)调用get_infos向probe中的osd发送pg_query_t::INFO消息(这里并没有发送,只是将消息插入到query_map)。
(4)如果pg_down为true,即过去某个interval中acting集合中存活的osd数量不够,并且acting集合中有down的osd,就抛出IsDown事件,进入Down状态。
(5)如果peer_info_requested为空(peer_info_requested中保存了要获取日志信息的osd集合),说明不需要向其他osd获取日志信息,或者其他osd日志信息都已经获取到了,就抛出GotInfo事件。但对于pg创建,这个条件一般不满足,因此函数返回,程序的调用栈返回到handle_pg_create_info函数中。
紧接着运行dispatch_context函数
dispatch_context(rctx, pg.get(), osdmap, nullptr);
do_notifies(*ctx.notify_list, curmap); //在Reset::react(const ActMap&)处理函数中添加
MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(), it->second);
con->send_message(m);
do_queries(*ctx.query_map, curmap); //get_infos 插入
MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
con->send_message(m);
do_infos(*ctx.info_map, curmap); //在pg创建过程中还没用到info_map
MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
m->pg_list = p->second;
con->send_message(m);
该函数负责将消息发送给对应的OSD,其中notify_list是从osd发送给主osd的通知消息,query_map是主osd发送给其他osd的获取日志信息消息。
紧接着程序调用栈返回ShardedOpWQ::_process函数,在此函数中handle_pg_create_info返回后会调用qi.run(osd, sdata, pg, tp_handle),其调用栈如下:
qi.run(osd, sdata, pg, tp_handle); //qi为OpQueueItem
qitem->run(osd, sdata, pg, handle); //PGPeeringItem::run
osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
advance_pg(curmap->get_epoch(), pg, handle, &rctx);
rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
dispatch_context_transaction(rctx, pg, &handle);
store->queue_transaction(pg->ch, std::move(*ctx.transaction), TrackedOpRef(), handle)
ObjectStore::Transaction::collect_contexts(tls, &on_applied, &on_commit, &on_applied_sync);
TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr);
txc->oncommits.swap(on_commit);
for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p)
txc->bytes += (*p).get_num_bytes();
_txc_add_transaction(txc, &(*p));
Transaction::iterator i = t->begin(); //return iterator(this); 创建一个iterator
return iterator(this);
vector<OnodeRef> ovec(i.objects.size())
for (int pos = 0; i.have_op(); ++pos) //have_op判断ops是否大于0,如果仍然大于0,就有op存在, op在PG::_create创建, 依次有 OP_MKCOLL,OP_COLL_HINT,OP_TOUCH,OP_OMAP_SETKEYS
switch (op->op) {
case Transaction::OP_MKCOLL:
const coll_t &cid = i.get_cid(op->cid);
r = _create_collection(txc, cid, op->split_bits, &c);
auto p = new_coll_map.find(cid);
assert(p != new_coll_map.end());
*c = p->second;
(*c)->cnode.bits = bits;
coll_map[cid] = *c;
new_coll_map.erase(p);
txc->t->set(PREFIX_COLL, stringify(cid), bl);//rocksdb
case Transaction::OP_COLL_HINT:
...
OnodeRef &o = ovec[op->oid];
if (op->op == Transaction::OP_TOUCH)
create = true;
switch (op->op)
case Transaction::OP_TOUCH:
r = _touch(txc, c, o);
_assign_nid(txc, o);
uint64_t nid = ++nid_last;
o->onode.nid = nid;
txc->last_nid = nid;
o->exists = true;
txc->write_onode(o);
onodes.insert(o);
case Transaction::OP_SETATTRS:
i.decode_attrset(aset)
r = _setattrs(txc, c, o, aset);
_txc_calc_cost(txc);
// one "io" for the kv commit
auto ios = 1 + txc->ioc.get_num_ios();
auto cost = throttle_cost_per_io.load();
txc->cost = ios * cost + txc->bytes
_txc_write_nodes(txc, txc->t);
for (auto o : txc->onodes)
_record_onode(o, t);
txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
o->flushing_count++;
_txc_state_proc(txc);
// we're immediately readable (unlike FileStore)
for (auto c : on_applied_sync) {
c->complete(0);
}
for (auto c : on_applied) {
finishers[osr->shard]->queue(c);
}
need_up_thru = pg->get_need_up_thru();
same_interval_since = pg->get_same_interval_since();
其就是将pg创建对应的几个操作下发到磁盘,分别为OP_MKCOLL,OP_COLL_HINT,OP_TOUCH,OP_OMAP_SETKEYS