另辟蹊径Ceph源码分析之3:解析ceph pg_temp(ceph 临时pg)

狄飞鹏
2023-12-01

更多ceph相关文章详见知乎ceph专栏聊聊ceph


什么是pg_temp

假设一个PG通过crush算法映射到的三个osd是[0,1,2]。此时,如果osd0出现故障,crush算法会为该PG重新分配三个osd:[3,1,2],其中osd3成为该PG的主osd。但是osd3是新加入的osd,还没有该PG的完整数据,并不能负担该PG上的读写操作。所以PG此时会向monitor申请一个该pg的临时的、能负责io的osd列表,比如这个临时列表为[1,3,2]。

与pg_temp相关的数据结构

// Abridged view of OSDMap: only the pg_temp member discussed here is shown.
class OSDMap {
   // Storage for pg_temp; updated once the monitor has completed its proposal.
   ceph::shared_ptr< map<pg_t,vector<int32_t> > > pg_temp; // map key is the pgid; the mapped vector is the set of temporary OSDs for that PG
}

// Abridged view of OSDService: only the pg_temp_wanted member discussed here is shown.
class OSDService {
    map<pg_t, vector<int> > pg_temp_wanted;// If the OSD decides a PG needs temporary OSDs, the wanted temporary OSD list is stored here; this member is then used to send the pg_temp request to the monitor.
}

osdmap类成员pg_temp的构建过程

osd判断PG是否需要临时osd

osd判断该PG是否需要临时osd;若需要,则把该PG期望的临时osd列表(want)存入OSDService类的pg_temp_wanted成员中。

// Abridged GetLog constructor: shows only the call into PG::choose_acting.
PG::RecoveryState::GetLog::GetLog(){

    // Select the OSD that holds the authoritative log and compute the two OSD
    // lists acting_backfill and backfill_targets. The authoritative OSD is
    // stored in auth_log_shard.
    if (!pg->choose_acting(auth_log_shard,
      &context< Peering >().history_les_bound)) {
          // body left empty in this excerpt
     }
}

// Abridged PG::choose_acting: decides whether this PG has to ask the monitor
// for a pg_temp mapping and, if so, queues the request on the OSD service.
//
// auth_log_shard_id  - receives the shard holding the authoritative log
//                      (assignment omitted from this excerpt).
// history_les_bound  - forwarded unchanged to find_best_info().
PG::choose_acting(pg_shard_t &auth_log_shard_id, bool *history_les_bound) {

    // Find the OSD that holds the authoritative log; keep it in auth_log_shard.
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
       find_best_info(all_info, history_les_bound);

    // Compute the want, acting_backfill and want_backfill lists.
    calc_replicated_acting();

    // Check whether a pg_temp has to be requested. (acting and up are refreshed
    // in generate_past_intervals, which calls _pg_to_up_acting_osds.)
    if (want != acting) {
        want_acting = want;

        if (want_acting == up) {
            // want already equals the up set: queue `empty` instead of a list.
            // OSDMap::apply_incremental erases the pg_temp entry when the
            // queued vector is empty. (`empty` is declared outside this excerpt.)
            osd->queue_want_pg_temp(info.pgid.pgid, empty);
        } else {
            // want differs from both the acting set and the up set:
            // a pg_temp must be requested for this PG.
            osd->queue_want_pg_temp(info.pgid.pgid, want);
        }
    }
}

// Record the temporary OSD list wanted for `pgid`.
//
// pgid - the placement group the request is for.
// want - the wanted temporary OSD list; it is copied into the map.
//
// The entry is stored in the OSDService member
// map<pg_t, vector<int> > pg_temp_wanted, which send_pg_temp() later copies
// into the message sent to the monitor.
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want) {
    pg_temp_wanted[pgid] = want;
}

根据OSDService->pg_temp_wanted,向monitor发送该pg申请临时osd的消息

// Abridged OSD::process_peering_events: shows only the step that asks the
// OSD service to send the queued pg_temp requests.
void OSD::process_peering_events(){
    
   service.send_pg_temp(); 
}

// Send the queued pg_temp requests (pg_temp_wanted) to the monitor.
void OSDService::send_pg_temp(){
    // Build a message from pg_temp_wanted and send it to the mon.
    MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
    m->pg_temp = pg_temp_wanted; // copy pg_temp_wanted into m->pg_temp
    monc->send_mon_message(m);
    
    // Update pg_temp_pending from pg_temp_wanted, then clear pg_temp_wanted.
    _sent_pg_temp(); 
}

monitor收到消息后,做决议,决议成功后更新OSDMap类的pg_temp成员

// Abridged OSDMonitor::prepare_update: only the MSG_OSD_PGTEMP case is shown
// (the enclosing switch is omitted from this excerpt).
OSDMonitor::prepare_update(MonOpRequestRef op) {
    // A pg_temp request is handed to prepare_pgtemp().
    case MSG_OSD_PGTEMP:
      return prepare_pgtemp(op);
}


// Abridged OSDMonitor::prepare_pgtemp: stages the requested pg_temp entries in
// the pending incremental and registers a completion callback.
// NOTE(review): `m` is not declared in this excerpt; presumably it is the
// MOSDPGTemp message carried by `op` -- confirm against the full source.
OSDMonitor::prepare_pgtemp(MonOpRequestRef op) {
    // Iterate over m->pg_temp.
    for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p)  
  
        // Stage the OSDs of the pg_temp to be proposed in new_pg_temp.
        // (The brace-less for loop controls only this assignment.)
        pending_inc.new_pg_temp[p->first] = p->second;
  
  // Register the callback to run once the proposal has finished.
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
}

// After the monitor's proposal completes, OSDMap::apply_incremental is where
// the temporary OSDs of each PG are finally materialized; they are kept in the
// OSDMap member pg_temp.
OSDMap::apply_incremental(const Incremental &inc) {
    // Apply every pg_temp entry carried by this round's incremental.
    for (map<pg_t, vector<int> >::const_iterator p = inc.new_pg_temp.begin(); p != inc.new_pg_temp.end(); ++p) {
        // An empty vector removes the PG's pg_temp entry.
        if (p->second.empty())
            pg_temp->erase(p->first);  
        else
            // The actual pg_temp assignment, i.e. the PG's pg_temp request has succeeded.
            // OSDMap has the member: ceph::shared_ptr< map<pg_t,vector<int32_t> > > pg_temp;
            (*pg_temp)[p->first] = p->second;
  }
}

pg_temp的运用

// Abridged Objecter::_calc_target: shows only the up/acting lookup.
Objecter::_calc_target() {
    // Using the osdmap member, obtain the PG's up set and acting set
    // together with their primaries.
    osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
			       &acting, &acting_primary);
}

// Abridged OSDMap::_pg_to_up_acting_osds: computes a PG's up set and acting
// set; the acting set is the pg_temp list when one exists, otherwise the up set.
void OSDMap::_pg_to_up_acting_osds(){
    // Use the CRUSH algorithm to obtain the group of OSDs this PG maps to.
    _pg_to_raw_osds(*pool, pg, &raw, &_up_primary, &pps); 
    // From raw, obtain the list of OSDs that make up the up set.
    _raw_to_up_osds(*pool, raw, &_up, &_up_primary);

    // Consult this OSDMap's pg_temp: if the PG has temporary OSDs, they are put into _acting.
    _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
    
    // An empty _acting means the PG has no temporary OSDs, so the up set
    // obtained above is assigned to _acting.
    if (_acting.empty()) {
        _acting = _up;
    }
    
    // Hand _acting to the caller's acting output, i.e. the acting set
    // (skipped when the acting pointer is null).
    if (acting)
        acting->swap(_acting);
}

总结

  • 通过上面的_pg_to_up_acting_osds函数可知:当pg没有临时osd时,把up赋值给acting;当pg有临时osd时,则把临时osd列表赋值给acting。

  • _pg_to_up_acting_osds除了在上面的_calc_target中被调用,还会在OSD::advance_pg()等多处被调用,都是为了通过osdmap计算pg的up与acting集合。


更多ceph相关文章详见知乎ceph专栏聊聊ceph


 类似资料: