当前位置: 首页 > 工具软件 > orchid > 使用案例 >

Orchid select 剖析

梁鸣
2023-12-01




/*
select 系统调用的功能是同时监视多个文件描述符:当其中有描述符可读、可写、
发生异常,或者等待超时时,调用返回就绪描述符的个数,并在传入的 fd_set 中只保留就绪的描述符。

int select(int nfds, fd_set *readfds, fd_set *writefds,
	fd_set *exceptfds, struct timeval *timeout);
*/
Select缺点:
1 每次调用select,都需要把fd集合从用户态拷贝到内核态;
2 每次调用select,都需要在内核中遍历传递进来的所有fd,这两点在fd很多的时候开销很大;
3 select支持的文件描述符数量太小(1024)

Poll的实现与select相似,只是描述fd集合的方式不同,poll使用pollfd结构而select使用fd_set结构。
1.typedefstruct fd_set {  
2. u_int fd_count;  
3. socket fd_array[FD_SETSIZE];  
4.} fd_set;  

/*
 * isready - poll a single descriptor for readability without blocking.
 *
 * fd: descriptor to test; must satisfy 0 <= fd < FD_SETSIZE.
 *
 * Returns 1 if select() reports fd readable, 0 if it does not, and -1 if
 * fd is out of range or select() itself fails.
 */
int isready(int fd)
{
    fd_set fds;
    struct timeval tv;
    int rc;

    /*
     * FD_SET/FD_ISSET index a fixed-size bitmap of FD_SETSIZE bits; an
     * out-of-range descriptor would touch memory outside of it.
     */
    if (fd < 0 || fd >= FD_SETSIZE)
        return -1;

    FD_ZERO(&fds);
    FD_SET(fd, &fds);

    /* A zero timeout makes select() return immediately. */
    tv.tv_sec = 0;
    tv.tv_usec = 0;

    rc = select(fd + 1, &fds, NULL, NULL, &tv);
    if (rc < 0)
        return -1;

    return FD_ISSET(fd, &fds) ? 1 : 0;
}



/*
 * The six descriptor bitmaps handled during one select call: three request
 * bitmaps (in/out/ex) and three result bitmaps (res_in/res_out/res_ex).
 * sys_select() points them at consecutive regions of a single allocation.
 */
typedef struct {
	unsigned long *in;	/* requested: input set */
	unsigned long *out;	/* requested: output set */
	unsigned long *ex;	/* requested: exception set */
	unsigned long *res_in;	/* result: input set */
	unsigned long *res_out;	/* result: output set */
	unsigned long *res_ex;	/* result: exception set */
} fd_set_bits;




/*
 * Signature of the callback stored in a poll table (see the qproc member
 * of struct poll_table_struct). It receives a file, a wait queue head and
 * the poll table itself.
 */
typedef void (*poll_queue_proc)(struct file *filp,
				wait_queue_head_t *wait_address,
				struct poll_table_struct *pt);

/*
 * Poll table: the object do_select() hands to a file's poll method.
 * It carries a single callback of type poll_queue_proc.
 */
struct poll_table_struct {
	poll_queue_proc qproc;	/* callback stored in the table */
};

typedef struct poll_table_struct poll_table;

/* One wait-queue registration made on behalf of a select call. */
struct poll_table_entry {
	/* the struct file that select is monitoring */
	struct file *filp;
	/* node that gets linked into the wait queue */
	wait_queue_t wait;
	/* head of the wait queue belonging to the file operation */
	wait_queue_head_t *wait_address;
};

/*
 * Storage for poll_table_entry items. Pages form a singly linked list;
 * each node is one page in size and holds several poll_table_entry items
 * in its trailing array.
 */
struct poll_table_page {
	struct poll_table_page *next;		/* next page in the list */
	struct poll_table_entry *entry;		/* NOTE(review): presumably the next free slot in entries -- confirm */
	/*
	 * C99 flexible array member instead of the GNU zero-length array
	 * "entries[0]"; sizeof(struct poll_table_page) is unchanged.
	 */
	struct poll_table_entry entries[];
};

/*
 * Central bookkeeping structure of a select call: it holds the important
 * state that lives for the duration of the call.
 */
struct poll_wqueues {
	/*
	 * Holds the callback (usually the one responsible for key steps such
	 * as putting the process onto wait queues).
	 */
	poll_table pt;
	/* records every wait-queue node created during the select call */
	struct poll_table_page *table;
	/* do_select() returns this value when it is non-zero after a scan */
	int error;
};

//select的调用path如下:sys_select->do_select



/*
 * sys_select - entry point of the select() system call.
 *
 * @n:    number of descriptors to examine. A negative value fails with
 *        -EINVAL; a value above current->files->max_fdset is clamped to it.
 * @inp:  user-space set watched for input, overwritten with the result set.
 * @outp: user-space set watched for output, overwritten with the result set.
 * @exp:  user-space set watched for exceptions, overwritten with the result set.
 * @tvp:  optional user-space timeout. When NULL the timeout stays at
 *        MAX_SCHEDULE_TIMEOUT. Unless the STICKY_TIMEOUTS personality bit is
 *        set, the remaining time is written back through this pointer.
 *
 * Returns the value produced by do_select() when it is >= 0 and the result
 * sets were copied out, otherwise a negative error code: -EINVAL (negative
 * n, sec or usec), -ENOMEM (bitmap allocation failed), -ERESTARTNOHAND
 * (do_select() returned 0 while a signal is pending), -EFAULT (copying a
 * result set failed), or the error reported by verify_area(), __get_user(),
 * get_fd_set() or do_select().
 */
asmlinkage long
sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
	fd_set_bits fds;
	char *bits;
	long timeout;
	int ret, size, max_fdset;

	/* Default when no timeval is supplied. */
	timeout = MAX_SCHEDULE_TIMEOUT;
	if (tvp) {
		time_t sec, usec;

		/* Validate the user pointer, then fetch both fields. */
		if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
			|| (ret = __get_user(sec, &tvp->tv_sec))
			|| (ret = __get_user(usec, &tvp->tv_usec)))
			goto out_nofds;

		ret = -EINVAL;
		/* NOTE(review): usec is only checked for < 0, there is no upper bound here -- confirm ROUND_UP cannot overflow. */
		if (sec < 0 || usec < 0)
			goto out_nofds;

		/*
		 * Convert to ticks: microseconds rounded up to whole ticks plus
		 * sec * HZ. When sec is not below MAX_SELECT_SECONDS the timeout
		 * is left at MAX_SCHEDULE_TIMEOUT.
		 */
		if ((unsigned long)sec < MAX_SELECT_SECONDS) {
			timeout = ROUND_UP(usec, 1000000 / HZ);
			timeout += sec * (unsigned long)HZ;
		}
	}

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;

	/*
	* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	* since we used fdset we need to allocate memory in units of
	* long-words.
	*/
	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/* Carve the allocation into six regions of `size` bytes each. */
	fds.in = (unsigned long *)bits;
	fds.out = (unsigned long *)(bits + size);
	fds.ex = (unsigned long *)(bits + 2 * size);
	fds.res_in = (unsigned long *)(bits + 3 * size);
	fds.res_out = (unsigned long *)(bits + 4 * size);
	fds.res_ex = (unsigned long *)(bits + 5 * size);

	/* Load the three request sets; the first failure aborts with its error. */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
		(ret = get_fd_set(n, outp, fds.out)) ||
		(ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/* Result sets start out empty; do_select() only stores non-zero words. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);

	/*
	 * Write the remaining time back to user space. This runs before ret is
	 * checked, so it also happens when do_select() failed. The return
	 * values of put_user() are not checked.
	 */
	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
		time_t sec = 0, usec = 0;
		if (timeout) {
			/* Ticks back to seconds + microseconds. */
			sec = timeout / HZ;
			usec = timeout % HZ;
			usec *= (1000000 / HZ);
		}
		put_user(sec, &tvp->tv_sec);
		put_user(usec, &tvp->tv_usec);
	}

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: report -ERESTARTNOHAND if a signal is pending, otherwise 0. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	/* Copy the three result sets out; any failure turns the result into -EFAULT. */
	if (set_fd_set(n, inp, fds.res_in) ||
		set_fd_set(n, outp, fds.res_out) ||
		set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	/* Reached only after the bitmaps were allocated. */
	select_bits_free(bits, size);
out_nofds:
	return ret;
}

/*
 * do_select - scan the requested descriptors until something is ready.
 *
 * @n:       number of descriptors to examine; replaced internally by the
 *           value returned from max_select_fd().
 * @fds:     request bitmaps (in/out/ex) and result bitmaps
 *           (res_in/res_out/res_ex). A result word is only stored when it
 *           is non-zero, so the caller must clear the result bitmaps first.
 * @timeout: in: ticks to wait (0 means a single scan without sleeping);
 *           out: whatever remained when the function returned.
 *
 * Returns the number of ready (descriptor, event set) pairs counted, 0 if
 * nothing became ready before the timeout expired or a signal arrived, or
 * a negative value taken from max_select_fd() or table.error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	/* The source text had "&current" mangled into "¤t"; restored here. */
	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	/* Sets the callback in the poll_table to __pollwait. */
	poll_initwait(&table);

	wait = &table.pt;
	/* With a zero timeout no poll table is passed to the poll methods. */
	if (!__timeout)
		wait = NULL;
	retval = 0;			/* count of ready descriptors, starts at 0 */
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		/* Mark the current process TASK_INTERRUPTIBLE before scanning. */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* Walk the bitmaps one long word at a time; i is the descriptor number. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* No descriptor requested in this word: skip all __NFDBITS of them. */
				i += __NFDBITS;
				continue;
			}

			/* Visit every bit of the current long word. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll)
						/*
						 * Call the poll method of the driver behind
						 * every monitored descriptor. Once something
						 * is already ready (retval != 0), NULL is
						 * passed instead of the poll table.
						 */
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* Each event set the descriptor is ready for counts separately. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			/* Store only non-zero result words. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* After the first full pass the poll table is no longer handed out. */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}
		/* Sleep; the return value becomes the remaining timeout. */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);

	poll_freewait(&table);

	/*
	* Up-to-date the caller timeout.
	*/
	*timeout = __timeout;
	return retval;
}


 类似资料: