在我们的系统中,有几种软中断(在硬件中断的下半部执行):
/*
 * Softirq vector numbers.
 *
 * The values are spelled out explicitly because external tools rely on
 * the numbering; NR_SOFTIRQS is the number of vectors, not a vector.
 */
enum
{
	HI_SOFTIRQ       = 0,
	TIMER_SOFTIRQ    = 1,
	NET_TX_SOFTIRQ   = 2,
	NET_RX_SOFTIRQ   = 3,
	BLOCK_SOFTIRQ    = 4,
	IRQ_POLL_SOFTIRQ = 5,
	TASKLET_SOFTIRQ  = 6,
	SCHED_SOFTIRQ    = 7,
	/* Unused, but kept because tools rely on the numbering. */
	HRTIMER_SOFTIRQ  = 8,
	/* Preferably RCU should always be the last softirq. */
	RCU_SOFTIRQ      = 9,

	NR_SOFTIRQS      = 10
};
其中 NET_TX_SOFTIRQ 和 NET_RX_SOFTIRQ 就是我们网络收发包的软中断。
系统会为每一个cpu创建一个内核任务,见kernel/softirq.c
/*
 * Per-CPU hotplug thread descriptor for the softirq daemon threads.
 * The thread function is run_ksoftirqd, ksoftirqd_should_run is the
 * "should this thread run" predicate, and threads are named with the
 * pattern "ksoftirqd/%u" (the %u is presumably the CPU number).
 * It is handed to smpboot_register_percpu_thread() in spawn_ksoftirqd().
 */
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
/*
 * Boot-time setup of the softirq daemon threads.
 * Sets up the CPUHP_SOFTIRQ_DEAD hotplug state ("softirq:dead") with NULL
 * as the first callback and takeover_tasklets as the second, then registers
 * softirq_threads as a per-CPU thread set. A non-zero result from the
 * registration triggers BUG_ON. Always returns 0.
 */
static __init int spawn_ksoftirqd(void)
{
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
/* Run spawn_ksoftirqd() as an early initcall. */
early_initcall(spawn_ksoftirqd);
/*
 * Thread function of the softirq daemon thread (softirq_threads.thread_fn).
 * With local interrupts disabled it checks whether any softirq is pending
 * and, if so, runs __do_softirq() and re-enables interrupts.
 * @cpu: CPU number passed by the caller; not used in the lines shown.
 * NOTE(review): "...." marks code elided from this excerpt. As shown, the
 * path where nothing is pending never calls local_irq_enable(); presumably
 * the full function does - confirm against kernel/softirq.c.
 */
static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable();
if (local_softirq_pending()) {
__do_softirq();
local_irq_enable();
....
}
}
/*
 * Abbreviated view of the softirq dispatch loop. "...." marks elided code;
 * the declarations of h and softirq_bit are among the omitted parts.
 * Takes a snapshot of the pending-softirq bitmask and calls the registered
 * action for every set bit, lowest bit first.
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
__u32 pending;
....
pending = local_softirq_pending();// softirq state of the current CPU (which softirqs need handling)
....
while ((softirq_bit = ffs(pending))) { // handle the pending bits one at a time
unsigned int vec_nr;
int prev_count;
/* ffs() is 1-based: move h forward to the entry of the lowest set bit */
h += softirq_bit - 1;
/* offset of that entry inside softirq_vec, i.e. the softirq number */
vec_nr = h - softirq_vec;
prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
h->action(h); // run the softirq callback registered through open_softirq()
trace_softirq_exit(vec_nr);
/* the handler changed preempt_count: report it and restore the old value */
if (unlikely(prev_count != preempt_count())) {
pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
vec_nr, softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count_set(prev_count);
}
/* step past the bit just handled, in both the vector pointer and the mask */
h++;
pending >>= softirq_bit;
}
}
当调用__napi_schedule的时候,会触发一个 NET_RX_SOFTIRQ 类型的软件中断,触发内核线程执行。
注意:在调用__napi_schedule之前,需要先调用 napi_schedule_prep(),确认 napi 没有处于 NAPI_STATE_DISABLE 状态,并设置 napi 的 NAPI_STATE_SCHED 状态。驱动在一次 poll 中处理的包数小于 budget(即当前已没有更多包需要处理)时,需要调用 napi_complete(napi) 来清除 NAPI_STATE_SCHED 状态。
/*
 * napi_schedule_prep - check whether a NAPI instance may be scheduled
 * @n: NAPI instance
 *
 * Returns true only when no disable is pending for @n and this call is the
 * one that set NAPI_STATE_SCHED in n->state (the bit was previously clear).
 * Returns false otherwise, in which case the caller must not schedule @n.
 * The bit set here is cleared again by __napi_complete().
 */
static inline bool napi_schedule_prep(struct napi_struct *n)
{
	return !napi_disable_pending(n) &&
		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}
/*
 * Queue a NAPI instance for polling and raise the RX softirq.
 * @sd:   softnet_data whose poll_list receives the NAPI instance
 * @napi: NAPI instance to queue (linked through napi->poll_list)
 * NOTE(review): uses the __raise_softirq_irqoff() variant, so the caller
 * presumably has local interrupts disabled - confirm.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);// append to the tail of the poll list
__raise_softirq_irqoff(NET_RX_SOFTIRQ);// signal the NET_RX_SOFTIRQ softirq
}
/*
 * Mark softirq @nr as pending.
 * @nr: softirq number, used as the bit position in the pending mask
 * Emits a trace event and ORs bit @nr into the pending mask; nothing in
 * this function runs the handler itself.
 */
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr); // tell the kernel that this softirq (e.g. NET_RX_SOFTIRQ) was raised, i.e. set its pending bit
}
/*
 * Abbreviated view ("...." marks elided code).
 * @n: NAPI instance that has finished polling
 * The step shown clears NAPI_STATE_SCHED in n->state, the bit that
 * napi_schedule_prep() sets before the instance is scheduled.
 */
void __napi_complete(struct napi_struct *n)
{
....
clear_bit(NAPI_STATE_SCHED, &n->state);
}
然后内核线程调用 run_ksoftirqd -> __do_softirq
/*
 * Run the softirq handlers that are pending on the current CPU.
 *
 * Takes a snapshot of the pending mask, clears it, and with interrupts
 * enabled calls the action of every softirq whose bit was set, lowest bit
 * first. Afterwards, with interrupts disabled again, it re-reads the mask:
 * if new softirqs were raised meanwhile it loops back to "restart" as long
 * as the MAX_SOFTIRQ_TIME deadline has not passed, no reschedule is needed
 * and the MAX_SOFTIRQ_RESTART counter is not exhausted; otherwise it wakes
 * the softirq daemon via wakeup_softirqd() and leaves the rest to it.
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
/* deadline for the restart loop below */
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
/* saved so the PF_MEMALLOC bit can be restored on exit */
unsigned long old_flags = current->flags;
/* restart counter, pre-decremented before each jump back to "restart" */
int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
bool in_hardirq;
__u32 pending;
int softirq_bit;
/*
 * Mask out PF_MEMALLOC, as the current task's context is borrowed for
 * the softirq. A softirq handler, such as network RX, might set
 * PF_MEMALLOC again if the socket is related to swap.
 */
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();// snapshot of the pending-softirq state
account_irq_enter_time(current);
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
local_irq_enable();
/* start scanning from the first softirq vector */
h = softirq_vec;
while ((softirq_bit = ffs(pending))) {
unsigned int vec_nr;
int prev_count;
/* ffs() is 1-based: move h forward to the entry of the lowest set bit */
h += softirq_bit - 1;
/* offset of that entry inside softirq_vec, i.e. the softirq number */
vec_nr = h - softirq_vec;
prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
h->action(h); // this is the softirq callback registered through open_softirq()
trace_softirq_exit(vec_nr);
/* the handler changed preempt_count: report it and restore the old value */
if (unlikely(prev_count != preempt_count())) {
pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
vec_nr, softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count_set(prev_count);
}
/* step past the bit just handled, in both the vector pointer and the mask */
h++;
pending >>= softirq_bit;
}
rcu_bh_qs();
local_irq_disable();
/* pick up softirqs that were raised while the handlers above were running */
pending = local_softirq_pending();
if (pending) {
/* loop again only within the time limit, with no reschedule due and restarts left */
if (time_before(jiffies, end) && !need_resched() &&
--max_restart)
goto restart;
/* otherwise hand the remaining work to the softirq daemon thread */
wakeup_softirqd();
}
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
/* restore the PF_MEMALLOC bit from the flags saved on entry */
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
在 net/core/dev.c 文件中,有一个函数 net_dev_init(),里面为每一个CPU初始化了skb的队列,还有我们的软中断: NET_TX_SOFTIRQ和NET_RX_SOFTIRQ,对应的处理函数为net_tx_action和net_rx_action.
/*
 * Excerpt of the networking core initializer ("...." marks elided code;
 * the remainder of the function, including its return, is not shown).
 * For every possible CPU it initializes the two skb queues and the NAPI
 * poll list of that CPU's softnet_data, then registers net_tx_action and
 * net_rx_action as the handlers of NET_TX_SOFTIRQ and NET_RX_SOFTIRQ.
 */
static int __init net_dev_init(void)
{
	....
	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		/* each per-CPU softnet_data carries two skb queues */
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
	}
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);	/* register the TX softirq callback */
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);	/* register the RX softirq callback */
}
函数原型:
/*
 * netif_napi_add - set up a NAPI instance for a network device
 * @dev:    network device the NAPI instance belongs to
 * @napi:   NAPI instance to initialize
 * @poll:   driver poll callback, later invoked as napi->poll(napi, weight)
 * @weight: value stored in napi->weight and passed to @poll
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight);
功能介绍:
1、初始化napi结构成员, 并将自己挂在napi的链表里面
napi->poll, napi->weight, napi->state, napi->timer, INIT_LIST_HEAD(&napi->poll_list)
2、高版本的kernel还创建了一个内核任务,调用napi_kthread_create函数创建了一个napi任务,任务函数为:napi_threaded_poll
/*
 * Threaded NAPI requested for this device: try to create the kthread.
 * A non-zero return from napi_kthread_create() (presumably failure)
 * clears dev->threaded.
 */
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = 0;
/*
 * Abbreviated view: start the kernel thread serving NAPI instance @n.
 * The thread runs napi_threaded_poll(n), is named with the pattern
 * "napi/%s-%d" from n->dev->name and n->napi_id, and the value returned
 * by kthread_run() is stored in n->thread.
 * NOTE(review): no return statement is shown although the function is
 * declared int; the return path is presumably elided from this excerpt.
 */
static int napi_kthread_create(struct napi_struct *n)
{
n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name,n->napi_id);// the thread function that ends up calling the poll function
}
该函数一般由具体的网卡驱动调用,并将poll函数传入。
当网卡产生中断以后,在中断函数里面处理的事情很少,包含:关闭中断和__napi_schedule.
__napi_schedule里面有两种处理方式:
1、如果创建了 napi任务,则唤醒任务,wake_up_process(thread),最后调用任务处理函数napi_threaded_poll进行包处理
/*
 * Abbreviated view of the NAPI kthread body.
 * @data: the struct napi_struct this thread serves
 * Polls that NAPI instance through __napi_poll() in an endless loop.
 * NOTE(review): repoll is not declared in the lines shown; its declaration
 * is presumably elided from this excerpt.
 */
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
for(;;) {
__napi_poll(napi, &repoll);
}
}
static int __naou_poll(struct napi_struct *napi, bool *repoll)
{
if (test_bit(NAPI_STATE_SCHES, &N->STATE)) {
work = n->poll(n, weight);//这就是网卡驱动注册的poll函数
}
}
2、否则,触发一个NET_RX_SOFTIRQ软中断,__raise_softirq_irqoff(NET_RX_SOFTIRQ), 然后调用软中断处理函数 net_rx_action.
static void net_rx_action(struct softirq_action *h)
{
LIST_HEAD(list);
for(;;) {
if (list_empty(&list)) {
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budge -= napi_poll(n, &repoll)
}
}
napi_poll-> __napi_poll//与上面一致。
static int napi_kthread_create(struct napi_struct *n)
{
int err = 0;
n->thread = kthread_run(napi_thread_poll, n, "napi/%s-%d", n->dev->name, n->napi_id);
return err;
}
/*
 * Body of the NAPI kthread ("....." / "...." mark elided code).
 * @data: the struct napi_struct this thread serves
 * While napi_thread_wait() returns 0, it enters an inner loop that polls
 * the NAPI instance through __napi_poll().
 * NOTE(review): repoll is not declared and "have" is not used in the lines
 * shown; both presumably belong to the elided code.
 */
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
void *have;
/* outer loop: keep going for as long as napi_thread_wait() returns 0 */
while (!napi_thread_wait(napi)) {
for(;;) {
.....
__napi_poll(napi, &repoll);
....
}
}
}
/*
 * Run the driver's poll callback for one NAPI instance.
 * @n:      NAPI instance to poll
 * @repoll: not used in the lines shown
 *
 * The callback n->poll(n, n->weight) is invoked only while NAPI_STATE_SCHED
 * is set in n->state. Returns the value returned by the callback, or 0 when
 * the callback was skipped; net_rx_action() subtracts this value from its
 * budget.
 */
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
	int weight = n->weight;
	int work = 0;	/* stays 0 when the instance is not scheduled */

	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
	}
	return work;
}
napi_threaded_poll->__napi_poll->poll钩子函数
软中断处理函数net_rx_action中,轮询poll_list链表,然后调用 napi_poll函数,
在napi_poll函数里面,首先从链表中删掉本napi节点,然后调用__napi_poll函数,
在__napi_poll函数里面调用我们设备注册的poll钩子。net_rx_action->napi_poll->__napi_poll->poll钩子