sys_socket的实现:
/*
 * sys_socket() - implementation of the socket() system call.
 *
 * As noted earlier in the article, this SYSCALL_DEFINE3() invocation
 * ultimately expands to sys_socket(int family, int type, int protocol).
 *
 * @family:   protocol family, handed unchanged to sock_create().
 * @type:     socket type in the low SOCK_TYPE_MASK bits, optionally OR-ed
 *            with the SOCK_CLOEXEC and/or SOCK_NONBLOCK flag bits.
 * @protocol: protocol number, handed unchanged to sock_create().
 *
 * Returns the result of sock_map_fd() on success (judging by its name, the
 * new file descriptor), -EINVAL when @type carries flag bits other than
 * SOCK_CLOEXEC / SOCK_NONBLOCK, or the negative error reported by
 * sock_create() or sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/*
	 * The BUILD_BUG_ON() lines below are compile-time assertions: the
	 * build fails when the condition IS true. The trick used to detect
	 * the error before run time is worth studying:
	 *
	 *   #define BUILD_BUG_ON(condition) \
	 *	do { \
	 *		((void)sizeof(char[1 - 2*!!(condition)])); \
	 *		if (condition) __build_bug_on_failed = 1; \
	 *	} while(0)
	 *
	 * In ((void)sizeof(char[1 - 2*!!(condition)])), a true condition
	 * yields char[-1], an array of negative size, which the compiler
	 * rejects; a false condition yields the harmless char[1].
	 *
	 * There are many variants of this idiom, for example
	 *   typedef int xxx[(condition) ? 1 : -1];
	 * Some argue that the sizeof form is more efficient because it needs
	 * no conditional jump, but for constant conditions like these the
	 * compiler folds everything at compile time, so there should be no
	 * practical difference.
	 *
	 * NOTE(review): the original annotation claimed that the (void) cast
	 * keeps the compiler from optimizing the empty statement away. More
	 * likely it only marks the sizeof result as deliberately unused so
	 * that no unused-value warning is raised - confirm.
	 */

	/* Check the SOCK_* constants for consistency. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * #define SOCK_TYPE_MASK 0xf
	 * Clear the low 4 bits so that only the flag bits remain.
	 */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	/* Keep only the low 4 bits: the socket type proper. */
	type &= SOCK_TYPE_MASK;

	/*
	 * Where SOCK_NONBLOCK and O_NONBLOCK have different values, replace
	 * the former with the latter, because sock_map_fd() below is given
	 * the O_* flavoured bits.
	 */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Create the socket. */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	/* Create a new file and associate the socket with it. */
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	/* Mapping failed: drop the socket that was just created. */
	sock_release(sock);
	return retval;
}
sock_create最终调用的是__sock_create:
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
/*
#define PF_INETAF_INET 这2个宏的值是一样的,PF的意思是protocol family,AF的意思是address family,所以逻辑上比较合理的用法是socket用PF_INET, 设置地址值用AF_INET,例如:
socket(PF_INET, SOCK_STREAM, 0);
addr.sin_family = AF_INET;
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
/*
security_socket_create调用的是
security_ops->socket_create(family, type, protocol, kern);
其中security_ops在初始化时,被初始化为default_security_ops,这是一个struct security_operations,里面的成员比较多,其中有socket_create函数指针,用来检查创建socket的安全,default_security_ops的初始化在函数security_fixup_ops中。其中socket_create的代码:
set_to_cap_if_null(ops, socket_create);
#define set_to_cap_if_null(ops, function)
do {
if (!ops->function) {
ops->function = cap_##function;//这里提一下,do while在这里的作用是为了避免宏在展开时导致语意问题
pr_debug("Had to override the " #function
" security operation with the default.\n");
}
} while (0)
所以最终调用的是:
static int cap_socket_create(int family, int type, int protocol, int kern)
{
return 0;
}
什么都没做,呵呵
*/
err = security_socket_create(family, type, protocol, kern);//检查安全性
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
/*
为新建的socket分配内存,看下:
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
/*
在安装socket文件系统时,会初始化该文件系统的超级块,此时会初始化超级块的操作指针s_op为sockfs_ops结构;因此此时分配inode会调用sock_alloc_inode函数来完成, 实际上分配了一个socket_alloc结构体,该结构体包含socket和inode
*/
inode = new_inode_pseudo(sock_mnt->mnt_sb); //申请一个inode内存节点,内存部分,不展开看了
if (!inode)
return NULL;
/*
这个宏值得看下
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket; //得到跟inode绑定的socket结构
}
这个宏container_of得到包含member成员的容器结构指针
#define container_of(ptr, type, member) ({
const typeof(((type *)0)->member) * __mptr = (ptr);
//关于typeof,这是gcc的C语言扩展保留字,用于声明变量类型.const typeof( ((type *)0->member ) *__mptr = (ptr);意思是声明一个与member同一个类型的指针常量 *__mptr,并初始化为ptr. (这里不明白为什么要声明这样一个临时变量,直接用ptr不就好了吗?难道为了保护ptr以及ptr指向的内容)
(type *)((char *)__mptr - offsetof(type, member)); }) //#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) 这个宏很有名,大家应该都知道的
#endif
*/
sock = SOCKET_I(inode);
/*
跟kmemcheck_bitfield_begin kmemcheck_bitfield_end 配套使用,检查begin和end的地址,以后去中间的数据进行初始化
*/
kmemcheck_annotate_bitfield(sock, type);
//设置inode的值
inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
/*
增加变量sockets_in_use的计量,+1,原子操作
*/
this_cpu_add(sockets_in_use, 1);
return sock;
}
*/
sock = sock_alloc(); //分配了一个一个socket_alloc, 里面包含一个inode
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE;/* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
/*
基础知识:
RCU(Read-Copy Update),顾名思义就是读-拷贝修改,它是基于其原理命名的。对于被RCU保护的共享数据结构,读者不需要获得任何锁就可以访问它,但写者在访问它时首先拷贝一个副本,然后对副本进行修改,最后使用一个回调(callback)机制在适当的时机把指向原来数据的指针重新指向新的被修改的数据。详见:http://www.ibm.com/developerworks/cn/linux/l-rcu/
*/
#ifdef CONFIG_MODULES
/*
如果协议模块没有加载,那么加载协议模块
*/
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); //读取family,rcu的操作可以看这篇文章http://blog.csdn.net/jianchaolv/article/details/7527647
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
/*
协议族的初始化在sock_register里面
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
*/
/*经常使用的ipv4, 注册在inet_init函数里面,位于文件Af_inet.c中,这里的最终调用函数为inet_create,这部分在后续的文章再详述*/
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern); //创建结束的安全检查,在创建之前也有检查的
if (err)
goto out_sock_release;
*res = sock; //设置创建好的sock
return 0;
/*
各种错误情况
*/
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
小结:
本文主要详述了 sys_socket 和 sock_create(最终调用 __sock_create)这两个函数的实现;inet_init 和 sock_map_fd 尚未展开,留待后续文章分析。