stats.h
res_stats.cpp
ResolverStats.h
DnsStats.cpp
DnsStats.h
DNS Stats主要由两部分组成:前者(stats.h、res_stats.cpp、ResolverStats.h)对DNS服务器的解析结果进行统计,计算成功率,从而判定DNS服务器是否可用;后者(DnsStats.cpp、DnsStats.h)对DNS服务器进行综合评分,并据此对DNS服务器进行排序。
DnsResolver仓是一个c/c++混合的代码仓,文件命名规则:
小写+下划线命名(如 res_stats.cpp)对应的是c风格代码
大驼峰命名对应的是c++
android_net_res_stats_get_usable_servers:判断已配置的各个服务器是否可用(usable),在发送DNS请求前调用
// Decides, for each of the nscount configured servers, whether it should be used.
//
// usable_servers[i] receives the verdict of res_stats_usable_server() for stats[i].
// When that leaves no server marked usable, every entry is switched to true instead.
// TODO: Explore other possibilities, such as enabling only the best N servers, etc.
//
// Returns the number of entries of usable_servers[] that ended up true.
int android_net_res_stats_get_usable_servers(const res_params* params, res_stats stats[],
                                             int nscount, bool usable_servers[]) {
    unsigned usable_count = 0;
    for (int i = 0; i < nscount; ++i) {
        usable_servers[i] = res_stats_usable_server(params, &stats[i]);
        if (usable_servers[i]) {
            usable_count++;
        }
    }

    if (usable_count != 0) {
        return usable_count;
    }

    // Nothing passed the check: fall back to treating all servers as usable.
    for (int i = 0; i < nscount; ++i) {
        usable_servers[i] = true;
    }
    return nscount;
}
// Returns true if the server is considered usable, i.e. if the success rate is not lower than
// the threshold for the stored samples. If not enough samples are stored, the server is
// considered usable.
//
// params: supplies min_samples, success_threshold (compared against a 0-100 percentage) and
//         sample_validity (compared against the age of the newest sample).
// stats:  the statistics of one server; its samples are cleared when the success rate is below
//         the threshold but the samples are older than sample_validity.
static bool res_stats_usable_server(const res_params* params, res_stats* stats) {
    // Start from -1 so that any counter the aggregate call leaves untouched fails the
    // ">= 0" sanity check below.
    int successes = -1;
    int errors = -1;
    int timeouts = -1;
    int internal_errors = -1;
    int rtt_avg = -1;
    time_t last_sample_time = 0;
    android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
                                    &rtt_avg, &last_sample_time);
    if (successes < 0 || errors < 0 || timeouts < 0) {
        return true;
    }

    const int total = successes + errors + timeouts + internal_errors;
    LOG(INFO) << __func__ << ": NS stats: S " << successes << " + E " << errors << " + T "
              << timeouts << " + I " << internal_errors << " = " << total
              << ", rtt = " << rtt_avg << ", min_samples = " << unsigned(params->min_samples);

    // Not enough samples yet. The explicit "total <= 0" test keeps the division below safe
    // when min_samples is 0 and nothing has been recorded for this server.
    if (total <= 0 || total < params->min_samples) {
        return true;
    }

    const int success_rate = successes * 100 / total;
    LOG(INFO) << __func__ << ": success rate " << success_rate;
    if (success_rate >= params->success_threshold) {
        return true;
    }

    const time_t now = time(nullptr);
    if (now - last_sample_time > params->sample_validity) {
        // Note: It might be worth considering to expire old servers after their expiry
        // date has been reached, however the code for returning the ring buffer to its
        // previous non-circular state would induce additional complexity.
        LOG(INFO) << __func__ << ": samples stale, retrying server";
        _res_stats_clear_samples(stats);
        return true;
    }

    LOG(INFO) << __func__ << ": too many resolution errors, ignoring server";
    return false;
}
而resolv_cache_add_resolver_stats_sample接口是在收到DNS响应后或者timeout后,保存sample。
每个服务器对应params.max_samples的最大样本量,一般配置为64。
单个样本记录:
rcode: DNS响应码,用于计算成功率
rtt: 本次查询的响应时间,用于计算平均响应时间
at: 样本的记录时间,用于判断样本是否过期(与 sample_validity 比较)
成功率阈值(success_threshold)一般配置为25%:成功率低于该阈值,即判定服务器失效。
注意:如果所有服务器都失效,那么会设置所有服务器全部生效。
以上统计信息dumpsys dnsresolver中可见
getSortedServers,基于某一个协议对DNS server进行降序排序,分数越高,排序越靠前
// Returns the servers recorded for |protocol|, ordered from the highest score() to the lowest.
// An empty vector is returned for PROTO_DOT and for protocols that have no entry in mStats.
std::vector<IPSockAddr> DnsStats::getSortedServers(Protocol protocol) const {
    // Sorting is not offered for DoT: the handshake overhead is expensive and the connection
    // hangs around for a while, so it is still open whether it is worth doing for DoT servers.
    if (protocol == PROTO_DOT) return {};

    const auto protocolEntry = mStats.find(protocol);
    if (protocolEntry == mStats.end()) return {};

    // The multimap keeps its keys in decreasing order, so inserting sorts by score.
    std::multimap<double, IPSockAddr, std::greater<double>> serversByScore;
    for (const auto& [address, records] : protocolEntry->second) {
        serversByScore.emplace(records.score(), address);
    }

    std::vector<IPSockAddr> orderedServers;
    orderedServers.reserve(serversByScore.size());
    for (const auto& [score, address] : serversByScore) {
        orderedServers.push_back(address);  // IPSockAddr is trivially-copyable.
    }
    return orderedServers;
}
具体的打分机制,是由以下函数实现的:
double StatsRecords::score() const {
const int avgRtt = mStatsData.averageLatencyMs();
// Set the lower bound to -1 in case of "avgRtt + mPenalty < mSkippedCount"
// 1) when the server doesn't have any stats yet.
// 2) when the sorting has been disabled while it was enabled before.
int quality = std::clamp(avgRtt + mPenalty - mSkippedCount, -1, kMaxQuality);
// Normalization.
return static_cast<double>(kMaxQuality - quality) * 100 / kMaxQuality;
}
// Adjusts mPenalty according to the rcode of |record|.
// NS_R_NO_ERROR, NS_R_NXDOMAIN and NS_R_NOTAUTH reset the penalty to 0. Any other rcode
// (NS_R_TIMEOUT and NS_R_INTERNAL_ERROR included) sets it to 100 when it was 0, and otherwise
// doubles it, never exceeding kMaxQuality.
void StatsRecords::updatePenalty(const Record& record) {
    const bool clearsPenalty = record.rcode == NS_R_NO_ERROR ||
                               record.rcode == NS_R_NXDOMAIN ||
                               record.rcode == NS_R_NOTAUTH;
    if (clearsPenalty) {
        mPenalty = 0;
        return;
    }

    // Doubling makes the evaluated quality drop more quickly under continuous failures.
    mPenalty = (mPenalty == 0) ? 100 : std::min(mPenalty * 2, kMaxQuality);
}
// Raises mSkippedCount by one, never letting it exceed kMaxQuality.
void StatsRecords::incrementSkippedCount() {
    const int bumped = mSkippedCount + 1;
    mSkippedCount = (kMaxQuality < bumped) ? kMaxQuality : bumped;
}
// Returns latencyUs converted to milliseconds and divided by total, or 0 when total is 0.
int StatsData::averageLatencyMs() const {
    if (total == 0) {
        return 0;
    }
    return duration_cast<milliseconds>(latencyUs).count() / total;
}
score是由mPenalty、mSkippedCount、avgRtt、kMaxQuality四个参数确定的
// The maximum of the quantified result. As the sorting is on the basis of server latency, limit
// the maximal value of the quantity to 10000 in correspondence with the maximal cleartext
// query timeout 10000 milliseconds. This helps normalize the value of the quality to a score.
// It is also the upper bound applied to mPenalty and mSkippedCount when they are updated.
static constexpr int kMaxQuality = 10000;
// A quality factor used to prevent starvation. It is subtracted from the quality in score(),
// so a larger value yields a higher score; incrementSkippedCount() raises it by one per call.
int mSkippedCount = 0;
假设DNS解析成功
mPenalty = 0
avgRtt=100(ms)
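quality = avgRtt + mPenalty - mSkippedCount = 100 + 0 - 1 = 99(此处按 mSkippedCount = 1 计算;若 mSkippedCount = 0,则 quality = 100,score = 99.00)
score = (kMaxQuality - quality) * 100 / kMaxQuality = (10000 - 99) * 100 / 10000 = 99.01分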
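一旦发生超时或者错误,该服务器就会被惩罚:首次失败时 mPenalty 置为100,之后每连续失败一次就翻倍(上限为 kMaxQuality)。例如连续出现三次timeout
mPenalty:0 → 100 → std::min(100 * 2, kMaxQuality) = 200 → std::min(200 * 2, kMaxQuality) = 400
同样是rtt均值100(mSkippedCount = 1),quality = 100 + 400 - 1 = 499,score = (10000 - 499) * 100 / 10000 = 95.01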
由于DNS常用协议是UDP,所以基于UDP数据进行排序,默认该特性是不开启的,需要修改源码,设置默认值为1
// Copies the per-network resolver configuration of statp->netid into |statp|.
// Does nothing when |statp| is null or when no NetConfig is found for the netid.
// The lookup and all copies happen while cache_mutex is held.
void resolv_populate_res_for_net(ResState* statp) {
    if (!statp) return;

    LOG(INFO) << __func__ << ": netid=" << statp->netid;

    std::lock_guard guard(cache_mutex);
    NetConfig* netConfig = find_netconfig_locked(statp->netid);
    if (!netConfig) return;

    // The "sort_nameservers" experiment flag (default 0) selects between the score-sorted
    // UDP server list and the configured server list.
    const bool sortingEnabled = Experiments::getInstance()->getFlag("sort_nameservers", 0);
    statp->sort_nameservers = sortingEnabled;
    if (sortingEnabled) {
        statp->nsaddrs = netConfig->dnsStats.getSortedServers(PROTO_UDP);
    } else {
        statp->nsaddrs = netConfig->nameserverSockAddrs;
    }

    statp->search_domains = netConfig->search_domains;
    statp->tc_mode = netConfig->tc_mode;
    statp->enforce_dns_uid = netConfig->enforceDnsUid;
}
注意点:
该排序是不考虑三层协议的,IPv4、IPv6的服务器同时参与排序。
排序完成后,在发送DNS请求时,按照排序后顺序发送。
具体的stats添加是调用:resolv_stats_add,也是在收到DNS响应或者timeout后添加
最多保存128条记录,区分协议、服务器地址。目前看有部分是冗余的,例如tcp、mDNS等。可考虑优化
如果不使用DNS服务器排序,可以注释掉调用入口 resolv_stats_add,以减少RAM占用