ceph pg peering和恢复 (1)

郑博厚
2023-12-01

pg是ceph中比较抽象的一个概念,且起到了一个承上启下的作用,客户端的对象映射到pg,而pg映射到具体的osd,pg的peering和恢复是ceph中比较复杂的一部分了。因为pg的创建过程会经历pg peering和恢复的所有过程,因此我们可以从创建pg的流程来分析pg的peering和恢复。

pg的创建请求类型是MSG_OSD_PG_CREATE,到达osd后,osd中的_dispatch会继续调用dispatch_op来处理这个请求,最终会到达handle_pg_create函数

const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req()); //get_req()获取Message *request;
require_same_or_newer_map(op, m->epoch, false)//检查osd保存的osdmap是否小于m->epoch,如果小于则需要重新获取
if (epoch > osdmap->get_epoch())
	wait_for_new_map(op);
		if (waiting_for_osdmap.empty()) 
			osdmap_subscribe(osdmap->get_epoch() + 1, false);  
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("wait for new map");
op->mark_started();
    mark_flag_point(flag_started, "started");
for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin(); != m->mkpg.end()    //由osdmoniror.cc中插入  map<pg_t,pg_create_t> mkpg
    epoch_t created = p->second.created;    //monitor创建pg时所在的版本
    pg_t on = p->first;  //要创建的pg 
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
	
    int role = osdmap->calc_pg_role(whoami, acting, acting.size()); //第一个为主osd,其他为副本	
    if (acting_primary != whoami)  //本osd不是act primary就跳过  ????????????
        continue;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);//建立从pg create到现在epoch的interval

    enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared<PGPeeringEvent>(osdmap->get_epoch(), osdmap->get_epoch(), NullEvt(), true, new PGCreateInfo(pgid, osdmap->get_epoch(), history, pi, true))));				

handle_pg_create实现如下:
(1)利用mark_started标记该op时间点到达“started”。
(2)对于每个要创建的pg,调用pg_to_up_acting_osds函数来计算当前版本osdmap中属于该pg的acting和up集合,其中up集合保存了由crush算法得到的osd集合,acting集合保存的是当up集合的主osd暂时不能充当主osd时,临时选出的另一组osd集合。
(3)对于每个要创建的pg,调用build_initial_pg_history来初始化pg create时到现在这段期间的past_interval。
(4)调用enqueue_peering_evt,将pg相关的信息enqueue到op_shardedwq,op_shardedwq是一个osd中的线程池,负责处理osd中的事务。

pg_to_up_acting_osds函数如下:

_pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);  //raw_pg_to_pg默认为True
    const pg_pool_t *pool = get_pg_pool(pg.pool()); //从OSDMap中获取pool的数据结构
    _get_temp_osds(*pool, pg, &_acting, &_acting_primary);  //获取pg对应的temp pg和temp pg集合中的主osd
        pg = pool.raw_pg_to_pg(pg);
            pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
            /*
                struct pg_t {
                uint64_t m_pool;
                uint32_t m_seed;
                ps()函数返回return m_seed;
                
                ceph_stable_mod:map a raw pg (with full precision ps) into an actual pg, for storage
            */
        const auto p = pg_temp->find(pg);  //ceph::shared_ptr<PGTempMap> pg_temp;  // temp pg mapping (e.g. while we rebuild),pg_temp保存了pg对应的临时osd
        /*
        假设一个PG通过crush算法映射到的三个osd是[0,1,2],此时,如果osd0出现故障,
        导致crush算法重新分配该PG的三个osd是[3,1,2],此时,osd3为该PG的主osd,但是osd3为新加入的osd,
        并不能负担该PG上的读写操作。所以PG此时向monitor申请一个该pg的临时的能负责io的osd列表,比如这个临时列表为[1,3,2]
        */                                
        temp_pg->clear();
        if (p != pg_temp->end())
            for (unsigned i=0; i<p->second.size(); i++)
                temp_pg->push_back(p->second[i]);
        
        const auto &pp = primary_temp->find(pg); //ceph::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp;  // temp primary mapping (e.g. while we rebuild)
        if (pp != primary_temp->end())
            *temp_primary = pp->second;
    
    if (_acting.empty() || up || up_primary)
        _pg_to_raw_osds(*pool, pg, &raw, &pps);通过Crush算法,得到该PG从属的一组osds raw保存了通过crush算出的osd集合
            ps_t pps = pool.raw_pg_to_pps(pg);  // placement ps
            unsigned size = pool.get_size(); //副本大小
            int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
            crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
            _remove_nonexistent_osds(pool, *osds);
            *ppps = pps;
        _apply_upmap(*pool, pg, &raw);
        _raw_to_up_osds(*pool, raw, &_up); //获得raw中所有处于up状态的osds列表
            if (pool.can_shift_osds()) //如果时replicated pool返回True,Ec pool返回false
                up->clear();
                up->reserve(raw.size());
                for (unsigned i=0; i<raw.size(); i++)
                    if (!exists(raw[i]) || is_down(raw[i]))
                    /*
                    exists()
                        return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
                    bool is_down(int osd)
                        return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
                    */
                        continue;
                    up->push_back(raw[i]); //将存在且up的osd加入到up中
        _up_primary = _pick_primary(_up); //选择up集合中第一个为主osd
        _apply_primary_affinity(pps, *pool, &_up, &_up_primary);//重新选择主osd,并将主osd放在_up[0]的位置
            ...
            ...
            ...
        if (_acting.empty())  //如果pg temp为空,则acting集合保存up中的osd集合
            _acting = _up;
            if (_acting_primary == -1)
                _acting_primary = _up_primary;
        if (up)
            up->swap(_up);
        if (up_primary)
            *up_primary = _up_primary;
        if (acting)
            acting->swap(_acting);
        if (acting_primary)
            *acting_primary = _acting_primary;

(1)调用_get_temp_osds来获取该pg的临时osd,并放入acting集合,acting_primary保存了acting集合的主osd。
(2)调用_pg_to_raw_osds函数来获取通过crush计算得到属于pg的osd集合,然后通过_raw_to_up_osds函数将集合中处于up状态的osd加入到up集合中,up_primary保存了up集合中的主osd。
(3)如果pg没有临时osd,则up和acting集合是一样的。

build_initial_pg_history函数的实现如下

h->epoch_created = created;  
h->epoch_pool_created = created;
h->same_interval_since = created;
h->same_up_since = created;
h->same_primary_since = created;
h->last_scrub_stamp = created_stamp;
h->last_deep_scrub_stamp = created_stamp;
h->last_clean_scrub_stamp = created_stamp;	
OSDMapRef lastmap = service.get_map(created);
lastmap->pg_to_up_acting_osds(pgid.pgid, &up, &up_primary, &acting, &acting_primary);	 //获取pg create时对应版本osdmap的up act集合
for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e)
    OSDMapRef osdmap = service.get_map(e);
    osdmap->pg_to_up_acting_osds(pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);//获取这个版本osdmap的up acting set
    bool new_interval = PastIntervals::check_new_interval(acting_primary, new_acting_primary, acting, new_acting, up_primary, new_up_primary, up, new_up, h->same_interval_since, h->last_epoch_clean, osdmap, lastmap, pgid.pgid, &min_size_predicate, pi, &debug);						
    /*
        if (is_new_interval( old_acting_primary, new_acting_primary, old_acting, new_acting, old_up_primary, new_up_primary, old_up, new_up, osdmap, lastmap, pgid))			
        //  判断new和last中的acting up , pool的size, min_size,pgnum等是否相等,如果相等则是同一个interval
            pg_interval_t i;
            i.first = same_interval_since; //这个interval开始的epoch
            i.last = osdmap->get_epoch() - 1; //这个interval结束的epoch
            assert(i.first <= i.last);
            i.acting = old_acting;
            i.up = old_up;
            i.primary = old_acting_primary;
            i.up_primary = old_up_primary;	
            for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();++p)
                if (*p != CRUSH_ITEM_NONE)
                    ++num_acting;  
            const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
            old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);  //将old_acting转换为Pg_shart_t的set		
            
            if (num_acting && i.primary != -1 && num_acting >= old_pg_pool.min_size && (*could_have_gone_active)(old_acting_shards))
                if (lastmap->get_up_thru(i.primary) >= i.first && lastmap->get_up_from(i.primary) <= i.first)  
                    i.maybe_went_rw = true;
                } else if (last_epoch_clean >= i.first && last_epoch_clean <= i.last) { //在这个past interval中完成了recovery
                    i.maybe_went_rw = true;
                else
                    i.maybe_went_rw = false;
            i.maybe_went_rw = true; //判断maybe_went_rw是不是true
            past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);  //加入到pi中  pi要仔细看一看, 最终变为会将该interval参与的act插入到all_participants, interval插入到intervals
                if (first == 0)
                    first = interval.first; //该段interval的开始
                last = interval.last; //该段interval的结束
                for (unsigned i = 0; i < interval.acting.size(); ++i)
                    acting.insert(pg_shard_t(interval.acting[i],ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
                all_participants.insert(acting.begin(), acting.end());  //set<pg_shard_t> all_participants;
                if (!interval.maybe_went_rw)    
                    return;
                intervals.push_back(compact_interval_t{interval.first, interval.last, acting});
    */
    if (new_interval)
        h->same_interval_since = e;
        if (up != new_up)
            h->same_up_since = e;
        if (acting_primary != new_acting_primary)
            h->same_primary_since = e;
        up = new_up;
        acting = new_acting;
        up_primary = new_up_primary;
        acting_primary = new_acting_primary;  

build_initial_pg_history函数负责构造从pg create到当前时期的past_interval信息
(1)获取期间每个版本osdmap中pg所对应的up acting集合,如果和上一版本的不一样,则说明这个epoch是一个新的interval的开始epoch。
(2)如果是一个新的interval,则将上一个interval的信息加入到past_intervals中,具体是将acting集合加入到all_participants和intervals中。

pg创建相关操作加入到线程池后,最终会调用handle_pg_create_info来处理具体的pg创建,如下

if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon))
    return nullptr;
PG::RecoveryCtx rctx = create_context();
PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));  //创建一个op,这个op就是创建pg
PG::_init(*rctx.transaction, pgid, pp); //创建三个op,
PGRef pg = _make_pg(startmap, pgid);
pg->ch = store->create_new_collection(pg->coll);
pg->lock(true);  //pg加锁
//initialize a newly instantiated pg
//Initialize PG state, as when a PG is initially created, or when it is first instantiated on the current node.
//即init是将前面计算的一系列东西赋值到PG中的属性中
pg->init(role, up, up_primary, acting, acting_primary, info->history, info->past_intervals, false, rctx.transaction);
pg->handle_initialize(&rctx);
pg->handle_activate_map(&rctx);
dispatch_context(rctx, pg.get(), osdmap, nullptr);

(1)如果该osd的pg个数大于最大pg个数就返回。
(2)调用PG::_create和PG::_init创建OP_MKCOLL、OP_COLL_HINT、OP_TOUCH、OP_OMAP_SETKEYS四个op。
(3)调用handle_initialize来抛出Initialize事件,此时PG的状态机处于Initial状态,收到Initialize事件后转移到Reset状态。
(4)调用handle_activate_map来抛出ActMap事件,这个函数的调用栈比较重要,如下:

ActMap evt;
//Reset状态接受到ActMap事件后
recovery_state.handle_event(evt, rctx); //过程同上,最终会调用PG::RecoveryState::Reset::react(const ActMap&)  
    PG *pg = context< RecoveryMachine >().pg; //如果不是primary osd 则应该发送notify给primary osd,即should_send_notify()返回True
    context< RecoveryMachine >().send_notify(pg->get_primary(), pg_notify_t(pg->get_primary().shard, pg->pg_whoami.shard, pg->get_osdmap()->get_epoch(), pg->get_osdmap()->get_epoch(), pg->info), pg->past_intervals);	
        (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));   //指向的是RecoveryCtx结构体中的notify_list,类型为 map<int, vector<pair<pg_notify_t, PastIntervals> > >
    pg->update_heartbeat_peers();
    pg->take_waiters();
    return transit< Started >(); //进入Started状态, 运行Started的构造函数,同时Start是Started的子状态,再进入Start的构造函数
        context< RecoveryMachine >().log_enter(state_name);
        if(pg->is_primary())
            post_event(MakePrimary()); //Start接受MakePrimary事件进入Primary状态, Primary的默认子状态为Peering, Peering的子状态为GetInfo
        else
            post_event(MakeStray());  //start状态接受到MakeStray事件后进入Stray状态, 进入Stray构造函数
                context< RecoveryMachine >().log_enter(state_name); //"Started/Stray"

(1)如果该osd不是主osd,则应该发送notify给主osd。
(2)进入Started状态,因为Start是Started的默认子状态,因此会进入Start状态。
(3)如果是主osd,就抛出MakePrimary事件,否则就抛出MakeStray事件。

处于Start状态的状态机接受MakePrimary事件进入Primary状态, Primary的默认子状态为Peering, Peering的子状态为GetInfo,GetInfo实现如下:

context< RecoveryMachine >().log_enter(state_name); //"Started/Primary/Peering/GetInfo"
pg->check_past_interval_bounds();  //?????????????????????????????
PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
prior_set = pg->build_prior(); // 将up和act集合中的osd加入到probe
    PastIntervals::PriorSet::PriorSet  
        for (unsigned i = 0; i < acting.size(); i++)  //将当前的acting up set中的OSD加入到probe列表中
            probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
        for (unsigned i = 0; i < up.size(); i++)
            probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
        set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
            past_intervals->get_all_participants(ec_pool);
                return all_participants;  //过去所有interval的osd都会加入到all_participants里,  同时在build_initial_history中,还会将acting集合中的osd加入到all_participants
        for (auto &&i: all_probe)  //只有现在为up状态的osd才会加入到probe集合里
            switch (f(0, i.osd, nullptr))
                case UP:
                    probe.insert(i);
                case DOWN,DNE,LOST:
                    down.insert(i.osd);  //DNE状态是获取不到pinfo时标记的,LOST状态时判断lost_at
        past_intervals.iterate_mayberw_back_to	 //判断不小于last_epoch_started的interval是否可以用来修复
            for (auto i = intervals.rbegin(); i != intervals.rend(); ++i)
                if (i->last < les)
                    break;
                f(i->first, i->acting);  
                /*
                    具体判断准则:
                    (1)如果这个interval的osd依然up,则加入up_now集合
                    (2)如果是LOST,依然加入到up_now集合
                    (3)如果是DOWN的,则 candidate_blocked_by[so.osd] = lost_at;且any_down_now = true;
                    然后调用if (!(*pcontdec)(up_now) && any_down_now)判断存活的osd是否足够用,如果不足够有且any_down_now == true;
                    则pg_down = true;且blocked_by.insert( candidate_blocked_by.begin(),  还么明白这个做什么用. 如果pg_down 
                    被设置为True,(blocked_by.insert(candidate_blocked_by.begin(),candidate_blocked_by.end()); ),则即使peer_info_requested为空,则依然不能抛出GotInfo事件,而是只有在peer_info_requested.empty() && !prior_set.pg_down
                    才会抛出GotInfo事件
                */
            set_probe_targets(prior.probe);
                probe_targets.clear();
                for (set<pg_shard_t>::iterator i = probe_set.begin();i != probe_set.end(); ++i)
                    probe_targets.insert(i->osd);
        
get_infos();  //将要query的信息插入到query_map中
    pg->blocked_by.clear();
    for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin(); it != prior_set.probe.end();
        context< RecoveryMachine >().send_query(peer, pg_query_t(pg_query_t::INFO, it->shard, pg->pg_whoami.shard, pg->info.history, pg->get_osdmap()->get_epoch()));
                (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = query;  
    peer_info_requested.insert(peer);
    pg->blocked_by.insert(peer.osd);  //这几行貌似是发送query消息到pg中其他osd,但没看懂是怎么发的,可能是在其他地方发的	
if(prior_set.pg_down)  //up的osd不够
    post_event(IsDown());
else if (peer_info_requested.empty()) //此时很大可能不为空
    post_event(GotInfo());
//对于pg创建,以上两个都不满足,因此程序到这里就返回。

(1)调用build_prior函数去构建probe,probe包含了要获取日志信息的osd集合,其包括当前up、acting中的osd,还包括past_interval中的acting集合中且在当前依然up的osd。
(2)如果过去某个interval的acting集合中,当前状态为up或lost的osd数量不足以恢复(默认最小值为1),并且该interval的acting集合中有osd当前处于down状态,则设置pg_down为true,并将down的osd加入到blocked_by集合。
(3)调用get_infos向probe中的osd发送pg_query_t::INFO消息(这里并没有发送,只是将消息插入到query_map)。
(4)如果pg_down为true,即过去某个interval中acting集合中存活的osd数量不够,并且acting集合中有down的osd,就抛出IsDown事件,进入Down状态。
(5)如果peer_info_requested为空(peer_info_requested中保存了要获取日志信息的osd集合),说明不需要向其他osd获取日志信息,或者其他osd日志信息都已经获取到了,就抛出GotInfo事件。但对于pg创建,这个条件一般不满足,因此函数返回,程序的调用栈返回到handle_pg_create_info函数中。

紧接着运行dispatch_context函数

dispatch_context(rctx, pg.get(), osdmap, nullptr);
    do_notifies(*ctx.notify_list, curmap);   //在Reset::react(const ActMap&)处理函数中添加
        MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(), it->second);
        con->send_message(m);
    do_queries(*ctx.query_map, curmap);  //get_infos 插入
        MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
        con->send_message(m);
    do_infos(*ctx.info_map, curmap);	  //在pg创建过程中还没用到info_map
        MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
        m->pg_list = p->second;
        con->send_message(m);

该函数负责将消息发送给对应的OSD,其中notify_list是从osd发送给主osd的通知消息,query_map是主osd发送给其他osd的获取日志信息消息。

紧接着程序调用栈返回ShardedOpWQ::_process函数,在此函数中handle_pg_create_info返回后会调用qi.run(osd, sdata, pg, tp_handle),其调用栈如下:

qi.run(osd, sdata, pg, tp_handle); //qi为OpQueueItem
    qitem->run(osd, sdata, pg, handle);  //PGPeeringItem::run
        osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
            advance_pg(curmap->get_epoch(), pg, handle, &rctx);
                rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
            dispatch_context_transaction(rctx, pg, &handle);
                store->queue_transaction(pg->ch, std::move(*ctx.transaction), TrackedOpRef(), handle)
                    ObjectStore::Transaction::collect_contexts(tls, &on_applied, &on_commit, &on_applied_sync);
                    TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr);
                    txc->oncommits.swap(on_commit);
                    for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p)
                        txc->bytes += (*p).get_num_bytes();
                        _txc_add_transaction(txc, &(*p));
                            Transaction::iterator i = t->begin(); //return iterator(this);  创建一个iterator
                                return iterator(this);
                            vector<OnodeRef> ovec(i.objects.size())
                            for (int pos = 0; i.have_op(); ++pos)  //have_op判断ops是否大于0,如果仍然大于0,就有op存在,  op在PG::_create创建, 依次有  OP_MKCOLL,OP_COLL_HINT,OP_TOUCH OP_OMAP_SETKEYS
                                switch (op->op) {
                                    case Transaction::OP_MKCOLL:
                                        const coll_t &cid = i.get_cid(op->cid);
                                        r = _create_collection(txc, cid, op->split_bits, &c);
                                            auto p = new_coll_map.find(cid);
                                            assert(p != new_coll_map.end());
                                            *c = p->second;
                                            (*c)->cnode.bits = bits;
                                            coll_map[cid] = *c;
                                            new_coll_map.erase(p);
                                            txc->t->set(PREFIX_COLL, stringify(cid), bl);//roscksdb
                                    case Transaction::OP_COLL_HINT:
                                        ...
                                OnodeRef &o = ovec[op->oid];
                                if (op->op == Transaction::OP_TOUCH)
                                    create = true;
                                switch (op->op)
                                    case Transaction::OP_TOUCH:		
                                        r = _touch(txc, c, o);
                                            _assign_nid(txc, o);
                                                uint64_t nid = ++nid_last;
                                                o->onode.nid = nid;
                                                txc->last_nid = nid;
                                                o->exists = true;
                                            txc->write_onode(o);
                                                onodes.insert(o);
                                case Transaction::OP_SETATTRS: 
                                    i.decode_attrset(aset)
                                    r = _setattrs(txc, c, o, aset);
                        
                    _txc_calc_cost(txc);	
                        // one "io" for the kv commit
                        auto ios = 1 + txc->ioc.get_num_ios(); 
                        auto cost = throttle_cost_per_io.load(); 
                        txc->cost = ios * cost + txc->bytes
                    _txc_write_nodes(txc, txc->t);	
                        for (auto o : txc->onodes)
                            _record_onode(o, t);
                                txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
                            o->flushing_count++;
                    _txc_state_proc(txc); 
                    // we're immediately readable (unlike FileStore)
                    for (auto c : on_applied_sync) {
                      c->complete(0);
                    }
                    for (auto c : on_applied) {
                      finishers[osr->shard]->queue(c);
                    }
            need_up_thru = pg->get_need_up_thru();
            same_interval_since = pg->get_same_interval_since();	

其就是将pg创建对应的几个操作下发到磁盘,分别为OP_MKCOLL、OP_COLL_HINT、OP_TOUCH、OP_OMAP_SETKEYS。

 类似资料: