For more Ceph articles, see the Zhihu Ceph column: 聊聊ceph
Suppose CRUSH maps a PG to the three OSDs [0,1,2]. If osd.0 fails, CRUSH recomputes the mapping and assigns the PG to [3,1,2], which makes osd.3 the PG's new primary. But osd.3 has only just been added to the set and holds none of the PG's data, so it cannot yet serve reads and writes for the PG. The PG therefore asks the monitor for a temporary OSD list that can handle IO, for example [1,3,2]; with this pg_temp in effect, osd.1 acts as the temporary primary while osd.3 is backfilled in the background.
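Before diving into the code, here is a toy sketch of the idea (this is not Ceph's actual selection code; choose_temp_acting and has_data are made-up names): given the new up set from CRUSH and knowledge of which OSDs already hold the PG's data, promote a data-complete OSD to the front so the primary can serve IO.

#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

// Promote the first data-complete OSD to the primary slot; the data-less
// newcomer stays in the set and is backfilled in the background.
std::vector<int> choose_temp_acting(const std::vector<int>& up,
                                    const std::set<int>& has_data) {
  std::vector<int> tmp = up;
  auto it = std::find_if(tmp.begin(), tmp.end(),
                         [&](int osd) { return has_data.count(osd) > 0; });
  if (it != tmp.end())
    std::iter_swap(tmp.begin(), it);  // move a data-complete OSD to the front
  return tmp;
}

int main() {
  // CRUSH remapped the PG from [0,1,2] to [3,1,2]; osd.3 is new and empty.
  std::vector<int> up = {3, 1, 2};
  std::set<int> has_data = {1, 2};
  assert((choose_temp_acting(up, has_data) == std::vector<int>{1, 3, 2}));
}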
class OSDMap {
  // Where pg_temp lives; updated once the monitor's proposal is committed.
  ceph::shared_ptr< map<pg_t,vector<int32_t> > > pg_temp; // key: pgid; value: the PG's temporary OSD list
};
class OSDService {
  // When an OSD decides a PG needs temporary OSDs, it records the wanted list
  // here, then uses this member to build the request it sends to the monitor.
  map<pg_t, vector<int> > pg_temp_wanted;
};
The OSD decides whether the PG needs temporary OSDs; if it does, the wanted OSD list for the PG is placed in the pg_temp_wanted member of OSDService:
PG::RecoveryState::GetLog::GetLog() {
  // Pick the OSD holding the authoritative log (stored in auth_log_shard)
  // and compute the acting_backfill and backfill_targets OSD lists.
  if (!pg->choose_acting(auth_log_shard,
                         &context< Peering >().history_les_bound)) {
  }
}
PG::choose_acting(pg_shard_t &auth_log_shard_id, bool *history_les_bound) {
  // Find the OSD with the authoritative log; it is saved in auth_log_shard.
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
    find_best_info(all_info, history_les_bound);
  // Compute the want, acting_backfill and want_backfill lists.
  calc_replicated_acting();
  // Check whether a pg_temp must be requested. (acting and up are refreshed by
  // _pg_to_up_acting_osds, called from generate_past_intervals.)
  if (want != acting) {
    want_acting = want;
    if (want_acting == up) {
      // want equals up: queue an empty list so the monitor clears any pg_temp.
      osd->queue_want_pg_temp(info.pgid.pgid, empty);
    } else {
      // want differs from both acting and up: request want as the pg_temp.
      osd->queue_want_pg_temp(info.pgid.pgid, want);
    }
  }
}
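The three-way comparison above is easy to misread, so here is the same decision as a standalone sketch (plain std::vector<int> instead of Ceph's shard types; decide_pg_temp and PgTempAction are invented names):

#include <vector>

enum class PgTempAction { None, Clear, Request };

// Same rule as choose_acting above: if the computed want set already matches
// acting there is nothing to do; if it matches up, queue an empty want so the
// monitor removes any stale pg_temp; otherwise request want as the pg_temp.
PgTempAction decide_pg_temp(const std::vector<int>& want,
                            const std::vector<int>& acting,
                            const std::vector<int>& up) {
  if (want == acting) return PgTempAction::None;
  if (want == up)     return PgTempAction::Clear;    // queue an empty vector
  return PgTempAction::Request;                      // queue want itself
}

Queueing an empty vector is how an OSD asks the monitor to remove a pg_temp entry that is no longer needed; the erase-on-empty behavior shows up again in apply_incremental below.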
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want) {
  pg_temp_wanted[pgid] = want; // record want in the OSDService member map<pg_t, vector<int> > pg_temp_wanted
}
void OSD::process_peering_events() {
  // Flush any queued pg_temp requests to the monitor.
  service.send_pg_temp();
}
void OSDService::send_pg_temp() {
  // Build a message from pg_temp_wanted and send it to the monitor.
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted; // copy pg_temp_wanted into the message
  monc->send_mon_message(m);
  // Update pg_temp_pending from pg_temp_wanted, then clear pg_temp_wanted.
  _sent_pg_temp();
}
OSDMonitor::prepare_update(MonOpRequestRef op) {
  // (inside the switch on the message type)
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
}
OSDMonitor::prepare_pgtemp(MonOpRequestRef op) {
  // Walk every entry in m->pg_temp.
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p)
    // Stage the requested pg_temp OSDs in new_pg_temp for the next proposal.
    pending_inc.new_pg_temp[p->first] = p->second;
  // Register a callback that runs once the proposal is committed.
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
}
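The staging pattern here, accumulate changes in a pending incremental and reply only after the proposal commits, can be sketched standalone (MiniMon, stage_pg_temp and commit are invented names; the real monitor runs a Paxos round between the two steps):

#include <cstdint>
#include <functional>
#include <map>
#include <utility>
#include <vector>

struct Incremental {                              // staged map changes
  std::map<uint64_t, std::vector<int32_t>> new_pg_temp;
};

struct MiniMon {
  Incremental pending_inc;
  std::vector<std::function<void()>> on_commit;   // stand-in for wait_for_finished_proposal

  void stage_pg_temp(uint64_t pgid, std::vector<int32_t> osds,
                     std::function<void()> reply) {
    pending_inc.new_pg_temp[pgid] = std::move(osds);
    on_commit.push_back(std::move(reply));        // reply only after commit
  }

  void commit() {                                 // the proposal was accepted
    for (auto& cb : on_commit) cb();
    on_commit.clear();
    pending_inc = {};                             // next epoch starts clean
  }
};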
The monitor then runs the proposal. Once it is committed, OSDMap::apply_incremental installs the PG's temporary OSDs into the pg_temp member of the OSDMap class:
OSDMap::apply_incremental(const Incremental &inc) {
  // If this round of the proposal created or changed any pg_temp entries:
  for (map<pg_t, vector<int> >::const_iterator p = inc.new_pg_temp.begin(); p != inc.new_pg_temp.end(); ++p) {
    if (p->second.empty())
      // An empty list cancels the PG's temporary mapping.
      pg_temp->erase(p->first);
    else
      // The actual assignment: the pg_temp request for this PG has succeeded.
      // (Recall the OSDMap member: ceph::shared_ptr< map<pg_t,vector<int32_t> > > pg_temp;)
      (*pg_temp)[p->first] = p->second;
  }
}
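The erase-on-empty rule is the one detail worth memorizing; a self-contained restatement (apply_new_pg_temp is an invented name, plain std types):

#include <cstdint>
#include <map>
#include <vector>

// An empty OSD vector in new_pg_temp removes the PG's temp mapping;
// a non-empty one installs or replaces it.
void apply_new_pg_temp(std::map<uint64_t, std::vector<int32_t>>& pg_temp,
                       const std::map<uint64_t, std::vector<int32_t>>& new_pg_temp) {
  for (const auto& [pgid, osds] : new_pg_temp) {
    if (osds.empty())
      pg_temp.erase(pgid);   // empty list cancels the temp mapping
    else
      pg_temp[pgid] = osds;  // install/replace the temp mapping
  }
}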
Objecter::_calc_target() {
  // Use the osdmap member to compute this PG's up and acting sets.
  osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
                               &acting, &acting_primary);
}
void OSDMap::_pg_to_up_acting_osds() {
  // Run CRUSH to get the raw set of OSDs this PG maps to.
  _pg_to_raw_osds(*pool, pg, &raw, &_up_primary, &pps);
  // Filter raw down to the OSDs that are up, producing the up set.
  _raw_to_up_osds(*pool, raw, &_up, &_up_primary);
  // Consult this OSDMap's pg_temp: if the PG has temporary OSDs, put them in _acting.
  _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
  // If _acting is empty the PG has no temporary OSDs, so acting falls back to up.
  if (_acting.empty()) {
    _acting = _up;
  }
  // Hand _acting back to the caller as the acting set.
  if (acting)
    acting->swap(_acting);
}
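Reduced to its essence, the function implements a single override rule; a toy standalone version (compute_acting is an invented name):

#include <cstdint>
#include <map>
#include <vector>

// acting = pg_temp[pgid] if present and non-empty, otherwise up.
std::vector<int32_t> compute_acting(
    uint64_t pgid,
    const std::vector<int32_t>& up,
    const std::map<uint64_t, std::vector<int32_t>>& pg_temp) {
  auto it = pg_temp.find(pgid);
  if (it != pg_temp.end() && !it->second.empty())
    return it->second;  // temp mapping wins while backfill is in progress
  return up;            // no temp mapping: acting == up
}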
As _pg_to_up_acting_osds shows, when a PG has no temporary OSDs the up set is used as the acting set; when it does, the temporary OSD list becomes the acting set.
Besides _calc_target above, _pg_to_up_acting_osds is called from several other places, such as OSD::advance_pg(), always to compute a PG's up and acting sets from the osdmap.