/*
select 系统调用的功能是对多个文件描述符进行监视,当有文件描述符的文件读写
操作完成,发生异常或者超时,该调用会返回这些文件描述符。
int select(int nfds, fd_set *readfds, fd_set *writefds,
fd_set *exceptfds, struct timeval *timeout);
*/
Select缺点:
1 每次调用select,都需要把fd集合从用户态拷贝到内核态;
2 每次调用select,都需要在内核中遍历传递进来的所有fd,这两点在fd很多的时候开销很大;
3 select支持的文件描述符数量太小(1024)
Poll的实现与select相似,只是描述fd集合的方式不同,poll使用pollfd结构而select使用fd_set结构。
typedef struct fd_set {
    u_int fd_count;
    SOCKET fd_array[FD_SETSIZE];
} fd_set;
/*
 * isready - poll a single descriptor for readability without blocking.
 *
 * @fd: descriptor to test; must lie in [0, FD_SETSIZE).
 *
 * Calls select() with a zero timeout on a read set containing only @fd.
 *
 * Returns 1 if select() reports @fd in the read set, 0 if it does not,
 * and -1 if @fd is out of range or select() itself fails.
 */
int isready(int fd)
{
	fd_set fds;
	struct timeval tv;
	int rc;

	/*
	 * FD_SET()/FD_ISSET() index a fixed-size bitmap; using them with a
	 * negative descriptor or one >= FD_SETSIZE is undefined behaviour,
	 * so reject such values up front.
	 */
	if (fd < 0 || fd >= FD_SETSIZE)
		return -1;

	FD_ZERO(&fds);
	FD_SET(fd, &fds);

	/* A zeroed timeout makes select() return immediately. */
	tv.tv_sec = 0;
	tv.tv_usec = 0;

	rc = select(fd + 1, &fds, NULL, NULL, &tv);
	if (rc < 0)	/* error */
		return -1;

	return FD_ISSET(fd, &fds) ? 1 : 0;
}
/*
 * The six bitmap pointers handled by one select() call: three bitmaps
 * describing what the caller asked for (in/out/ex) and three matching
 * result bitmaps (res_in/res_out/res_ex).
 */
typedef struct {
	unsigned long *in;	/* requested: read set */
	unsigned long *out;	/* requested: write set */
	unsigned long *ex;	/* requested: exception set */
	unsigned long *res_in;	/* result: read set */
	unsigned long *res_out;	/* result: write set */
	unsigned long *res_ex;	/* result: exception set */
} fd_set_bits;
/*
 * Callback type stored in poll_table.qproc.  It receives the file being
 * polled, a wait queue head, and the poll table the callback belongs to.
 */
typedef void (*poll_queue_proc)(struct file *,
				wait_queue_head_t *,
				struct poll_table_struct *);
/*
 * poll_table - handle passed to a file's ->poll() method.
 * It carries a single callback of type poll_queue_proc.
 */
typedef struct poll_table_struct {
poll_queue_proc qproc; /* callback invoked with (file, wait queue head, this table) */
} poll_table;
/* One wait-queue registration made on behalf of a select() call. */
struct poll_table_entry {
	/* the struct file that select is monitoring */
	struct file *filp;
	/* this entry's node in the wait queue */
	wait_queue_t wait;
	/* head of the wait queue belonging to the file operation */
	wait_queue_head_t *wait_address;
};
/*
 * Storage for poll_table_entry items.  Pages are kept on a singly linked
 * list; each node is one page in size and holds several poll_table_entry
 * items in its trailing array.
 */
struct poll_table_page {
	/* next page in the singly linked list */
	struct poll_table_page *next;
	/* NOTE(review): presumably a cursor to the next unused slot in entries[] - confirm against the allocation code */
	struct poll_table_entry *entry;
	/*
	 * Trailing variable-length storage.  Declared as a C99 flexible
	 * array member rather than the GNU zero-length array "[0]";
	 * sizeof(struct poll_table_page) is unchanged.
	 */
	struct poll_table_entry entries[];
};
/*
 * poll_wqueues - the central bookkeeping structure; it holds the
 * important state of one select() operation.
 */
struct poll_wqueues {
	/*
	 * Holds the callback (which usually takes care of key steps such as
	 * putting the process on a wait queue).
	 */
	poll_table pt;
	/* records every wait-queue node created during the select() */
	struct poll_table_page *table;
	/* error code; do_select() returns it when it is non-zero */
	int error;
};
//select的调用path如下:sys_select->do_select
/*
 * sys_select - entry point of the select() system call.
 *
 * @n:    number of descriptors to examine; clamped to the current
 *        process's max_fdset.
 * @inp:  user-space read set (in/out).
 * @outp: user-space write set (in/out).
 * @exp:  user-space exception set (in/out).
 * @tvp:  optional user-space timeout; when NULL the wait uses
 *        MAX_SCHEDULE_TIMEOUT.
 *
 * Copies the timeout and the three descriptor sets in from user space,
 * runs do_select() on kernel copies, then copies the result sets (and,
 * unless the STICKY_TIMEOUTS personality bit is set, the remaining time)
 * back out.
 *
 * Returns the value produced by do_select() on success, or a negative
 * error: the code from verify_area()/__get_user()/get_fd_set() when a
 * user copy fails, -EINVAL for a negative @n or negative timeval field,
 * -ENOMEM when the bitmap allocation fails, -ERESTARTNOHAND when nothing
 * was ready and a signal is pending, -EFAULT when writing results fails.
 */
asmlinkage long
sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
	fd_set_bits fds;
	char *bitmap_mem;
	long timeout = MAX_SCHEDULE_TIMEOUT;
	int ret, set_bytes, fd_limit;

	if (tvp) {
		time_t in_sec, in_usec;

		/* Fetch the user timeout, stopping at the first failing step. */
		ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp));
		if (ret)
			goto out_nofds;
		ret = __get_user(in_sec, &tvp->tv_sec);
		if (ret)
			goto out_nofds;
		ret = __get_user(in_usec, &tvp->tv_usec);
		if (ret)
			goto out_nofds;

		if (in_sec < 0 || in_usec < 0) {
			ret = -EINVAL;
			goto out_nofds;
		}

		/*
		 * Only convert to ticks when the seconds value is below
		 * MAX_SELECT_SECONDS; otherwise keep MAX_SCHEDULE_TIMEOUT.
		 */
		if ((unsigned long)in_sec < MAX_SELECT_SECONDS) {
			timeout = ROUND_UP(in_usec, 1000000 / HZ);
			timeout += in_sec * (unsigned long)HZ;
		}
	}

	if (n < 0) {
		ret = -EINVAL;
		goto out_nofds;
	}

	/* max_fdset can increase, so grab it once to avoid race */
	fd_limit = current->files->max_fdset;
	if (n > fd_limit)
		n = fd_limit;

	/*
	 * Six bitmaps are needed (in/out/ex, each for incoming and outgoing
	 * data).  They are carved out of a single allocation, each bitmap
	 * occupying FDS_BYTES(n) bytes.
	 */
	set_bytes = FDS_BYTES(n);
	bitmap_mem = select_bits_alloc(set_bytes);
	if (!bitmap_mem) {
		ret = -ENOMEM;
		goto out_nofds;
	}

	fds.in      = (unsigned long *)bitmap_mem;
	fds.out     = (unsigned long *)(bitmap_mem + set_bytes);
	fds.ex      = (unsigned long *)(bitmap_mem + 2 * set_bytes);
	fds.res_in  = (unsigned long *)(bitmap_mem + 3 * set_bytes);
	fds.res_out = (unsigned long *)(bitmap_mem + 4 * set_bytes);
	fds.res_ex  = (unsigned long *)(bitmap_mem + 5 * set_bytes);

	/* Copy the requested sets in; bail out on the first failure. */
	ret = get_fd_set(n, inp, fds.in);
	if (ret)
		goto out;
	ret = get_fd_set(n, outp, fds.out);
	if (ret)
		goto out;
	ret = get_fd_set(n, exp, fds.ex);
	if (ret)
		goto out;

	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);

	/* Report the remaining time back unless STICKY_TIMEOUTS is set. */
	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
		time_t left_sec = 0, left_usec = 0;

		if (timeout) {
			left_sec = timeout / HZ;
			left_usec = timeout % HZ;
			left_usec *= (1000000 / HZ);
		}
		put_user(left_sec, &tvp->tv_sec);
		put_user(left_usec, &tvp->tv_usec);
	}

	if (ret < 0)
		goto out;

	/* Nothing ready and a signal is pending: hand back the restart code. */
	if (ret == 0 && signal_pending(current)) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	select_bits_free(bitmap_mem, set_bytes);
out_nofds:
	return ret;
}
/*
 * do_select - core loop of select(): scan the requested descriptors and
 * sleep until at least one is ready, the timeout runs out, or a signal
 * is pending.
 *
 * @n:       upper bound on the descriptors to scan; replaced by the value
 *           returned from max_select_fd().
 * @fds:     requested bitmaps (in/out/ex) and result bitmaps (res_*).
 * @timeout: in/out; the value handed to schedule_timeout().  On return it
 *           holds whatever schedule_timeout() last reported as remaining.
 *
 * Returns the number of ready (descriptor, set) pairs - a descriptor
 * ready in two sets is counted twice - or a negative value taken from
 * max_select_fd() or from table.error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	/* Per the original author's note: sets the poll_table callback to __pollwait. */
	poll_initwait(&table);
	wait = &table.pt;
	/* With a zero timeout we never sleep, so pass NULL instead of the poll table. */
	if (!__timeout)
		wait = NULL;
	retval = 0;	/* retval holds the number of ready descriptors, initially 0 */

	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		/* switch the current process state to TASK_INTERRUPTIBLE */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* walk the descriptors one bitmap word at a time */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* no descriptor requested in this word: skip the whole word */
				i += __NFDBITS;
				continue;
			}

			/* walk every bit inside this word */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll)
						/*
						 * Call the poll method behind each monitored
						 * descriptor.  Once something is already ready
						 * (retval != 0) NULL is passed instead of the
						 * poll table.
						 */
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			/* only write result words that actually have bits set */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}

		/* After the first full scan, later scans pass NULL to ->poll(). */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);

	poll_freewait(&table);

	/*
	 * Up-to-date the caller timeout.
	 */
	*timeout = __timeout;
	return retval;
}