Linux Kernelのパケット送信を追う(ソケット作成編)

実績解除のためにsendto(2)のシステムコール発行からe1000ドライバの送信処理までを追いかけてみた．取り急ぎソケット作成までをまとめてみる．
なお，Linux Kernelは4.14.0-rc8を対象としている．

初めに断っておくとアホほど長い上に備忘録なのでほとんど解説はしていない．もし間違えていたらごめんなさい．

IPv4の処理は数多くの先人が残しているので，今回はIPv6を追いかけてみる．
ユーザーランドのコードはこんな感じ．

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/*
 * Send the 5-byte payload "HELLO" as one UDP datagram over IPv6 to
 * [2001:db8::dead:beef]:12345.
 *
 * Returns 0 when the datagram was handed to the kernel, 1 when socket(),
 * inet_pton() or sendto() failed (the reason is printed on stderr).
 */
int main(void)
{
  int sock;
  int rc = 1;
  /*
   * Zero the whole address first: sin6_flowinfo and sin6_scope_id are read
   * by the kernel as well, so they must not be left as stack garbage.
   */
  struct sockaddr_in6 addr = {0};

  sock = socket(AF_INET6, SOCK_DGRAM, 0);
  if (sock < 0) {
    perror("socket");
    return 1;
  }

  addr.sin6_family = AF_INET6;
  addr.sin6_port = htons(12345);
  /* inet_pton() returns 1 only when the text was a valid IPv6 address. */
  if (inet_pton(AF_INET6, "2001:db8::dead:beef", &addr.sin6_addr) != 1) {
    fprintf(stderr, "inet_pton: invalid IPv6 address\n");
    goto out;
  }

  if (sendto(sock, "HELLO", 5, 0, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
    perror("sendto");
    goto out;
  }

  rc = 0;

out:
  /* The descriptor is released on every path once socket() succeeded. */
  close(sock);

  return rc;
}

まずはsocket(2)から見ていく．システムコールの定義はSYSCALL_DEFINExマクロで行っている．SYSCALL_DEFINE.+socketでgrepするとnet/socket.cにあることが分かった．

/*
 * socket(2): split the flag bits out of @type, create the socket object with
 * sock_create(), then attach it to a file descriptor with sock_map_fd().
 * Returns whatever sock_map_fd() returned on success, or a negative errno.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * Every bit outside SOCK_TYPE_MASK is a flag; only SOCK_CLOEXEC and
	 * SOCK_NONBLOCK are accepted, anything else is -EINVAL.
	 */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	/* No descriptor could be attached: drop the socket created above. */
	sock_release(sock);
	return retval;
}

重要そうなのはsock_create()とsock_map_fd()だろう．sock_create()はラッパー関数であり，内部で__sock_create()を呼んでいる．

/*
 * Thin wrapper around __sock_create(): uses the network namespace of the
 * calling task (current->nsproxy->net_ns) and passes kern = 0.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	// ...

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	// ...
}

security_socket_create()はsecurity/security.cにある関数であり，include/linux/security.hよりCONFIG_SECURITY_NETWORKが有効な場合，call_int_hook()マクロを呼び出している．この時，引数のFUNCはsocket_create，IRCは0が入る．

/*
 * With CONFIG_SECURITY_NETWORK the real hook dispatcher is declared here;
 * without it the call collapses to an inline stub that always returns 0.
 */
#ifdef CONFIG_SECURITY_NETWORK
int security_socket_create(int family, int type, int protocol, int kern);
#else
static inline int security_socket_create(int family, int type,
					 int protocol, int kern)
{
	return 0;
}
#endif

/*
 * Run the hooks registered on security_hook_heads.socket_create.
 * The result is 0 (the IRC argument) when no hook returns non-zero.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
	return call_int_hook(socket_create, 0, family, type, protocol, kern);
}

/*
 * call_int_hook - call every hook on the list security_hook_heads.FUNC.
 * @FUNC: hook name; selects both the list head and the callback member.
 * @IRC:  initial result, returned unchanged when the list is empty.
 *
 * Hooks are called in list order with the remaining arguments. The walk
 * stops at the first hook that returns non-zero, and that value becomes
 * the value of the statement expression.
 */
#define call_int_hook(FUNC, IRC, ...) ({			\
	int RC = IRC;						\
	do {							\
		struct security_hook_list *P;			\
								\
		list_for_each_entry(P, &security_hook_heads.FUNC, list) { \
			RC = P->hook.FUNC(__VA_ARGS__);		\
			if (RC != 0)				\
				break;				\
		}						\
	} while (0);						\
	RC;							\
})

Linux Kernelのリストの操作については他の方が書かれたページがまとまっているのでこちらを参照してほしい．
d.hatena.ne.jp

call_int_hook()はsecurity_hook_heads.socket_createというリストから順にP->hook.socket_create()を呼び出し，戻り値が0以外の場合にそれをマクロ自体の戻り値として返している．security_hook_headsという変数はsecurity.cの上の方で定義している．(__lsm_ro_after_initはよく分からないので飛ばす)
security_hook_headsリストへのフック関数の登録はLSM_HOOK_INIT()で行っている．"LSM_HOOK_INIT.+socket_create"でgrepをかけるとsecurity/selinux/hooks.cでselinux_socket_create()という関数をLSM_HOOK_INIT()を通してsecurity_hook_heads.socket_createに登録していることが分かった．

/*
 * SELinux socket_create hook: decide whether the current task may create a
 * socket of the given family/type/protocol. Returns 0 when allowed, otherwise
 * the non-zero value from socket_sockcreate_sid() or avc_has_perm().
 */
static int selinux_socket_create(int family, int type,
				 int protocol, int kern)
{
	const struct task_security_struct *tsec = current_security();
	u32 newsid;
	u16 secclass;
	int rc;

	/* Sockets created on behalf of the kernel (kern != 0) are not checked. */
	if (kern)
		return 0;

	/* Map the triple to a security class and obtain the SID for the new socket. */
	secclass = socket_type_to_security_class(family, type, protocol);
	rc = socket_sockcreate_sid(tsec, secclass, &newsid);
	if (rc)
		return rc;

	/* Check SOCKET__CREATE from the task's SID against the new socket's SID. */
	return avc_has_perm(tsec->sid, newsid, secclass, SOCKET__CREATE, NULL);
}

socket_type_to_security_class()はプロトコルファミリ，ソケットのタイプ，プロトコルに応じてセキュリティクラスという値を返している．

static inline u16 socket_type_to_security_class(int family, int type, int protocol)
{
	int extsockclass = selinux_policycap_extsockclass;

	switch (family) {
	// ...
	case PF_INET:
	case PF_INET6:
		switch (type) {
		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (default_protocol_stream(protocol))
				return SECCLASS_TCP_SOCKET;
			else if (extsockclass && protocol == IPPROTO_SCTP)
				return SECCLASS_SCTP_SOCKET;
			else
				return SECCLASS_RAWIP_SOCKET;
		case SOCK_DGRAM:
			if (default_protocol_dgram(protocol))
				return SECCLASS_UDP_SOCKET;
			else if (extsockclass && (protocol == IPPROTO_ICMP ||
						  protocol == IPPROTO_ICMPV6))
				return SECCLASS_ICMP_SOCKET;
			else
				return SECCLASS_RAWIP_SOCKET;
		case SOCK_DCCP:
			return SECCLASS_DCCP_SOCKET;
		default:
			return SECCLASS_RAWIP_SOCKET;
		}
		break;
	// ...
	}
	// ...
	return SECCLASS_SOCKET;
}

/*
 * Non-zero when @protocol is IPPROTO_IP or IPPROTO_UDP, i.e. the cases that
 * socket_type_to_security_class() classifies as a UDP socket for SOCK_DGRAM.
 */
static inline int default_protocol_dgram(int protocol)
{
	return (protocol == IPPROTO_IP || protocol == IPPROTO_UDP);
}

family = PF_INET6(AF_INET6と同じ)，type = SOCK_DGRAM，protocol = 0(=IPPROTO_IP)なので，default_protocol_dgram()が真になり，selinux_socket_create()のsecclassにはSECCLASS_UDP_SOCKETが格納される．

(SELinuxがよく分かっていないので間違っているかもしれないが)，ここでは現在のSIDとやろうとしている処理を元にAVC(?)を確認し，処理権限を持つなら0が返り，持っていない場合は-EACCESが返ってくる．権限を持っていない場合はsecurity_socket_create()が0以外を返すため，結果として__sock_create()はソケットを作成せずにそのエラーを返して失敗することになる．

再び__sock_create()に戻る．

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	// ...

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();
	if (!sock) {
		net_warn_ratelimited("socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

	// ...
}

security_socket_create()によってソケットの作成権限があることが分かったので，ようやくsock_alloc()でソケットを作成する．

/**
 *	sock_alloc	-	allocate a socket
 *
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
 *	NULL is returned.
 */

struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/* Allocate a fresh inode on the superblock of sock_mnt. */
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;

	/* Obtain the struct socket that belongs to this inode. */
	sock = SOCKET_I(inode);

	kmemcheck_annotate_bitfield(sock, type);
	/* Socket file type, rwx for user/group/other, owned by the caller's fsuid/fsgid. */
	inode->i_ino = get_next_ino();
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_op = &sockfs_inode_ops;

	/* Account for the new socket in the per-CPU sockets_in_use counter. */
	this_cpu_add(sockets_in_use, 1);
	return sock;
}

かなりファイルシステムに寄った内容が多い．read/write(2)でパケットを送受信するために必要な処理らしいが，正直なところよく分かっていないので読み飛ばす．

※後で調べたら他の方がとても詳しくまとめていた．
github.com

socketのアロケートが終わったら，次はプロトコルファミリごとに初期化を行う．各プロトコルファミリはnet_families[]というnet_proto_family型の配列に格納されている．

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	// ...

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	// ...

out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);

	// ...

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

net_familiesはRCUで保護している．rcu_read_lock()とrcu_read_unlock()はそれぞれread_lock(&hoge)とread_unlock(&hoge)だと思っていい．また，

pf = rcu_dereference(net_families[family]);

は

pf = net_families[family]

と読み替えて問題ない．詳しく知りたい場合はDocumentation/RCU/whatisRCU.txt．

net_familiesへの登録はsock_register()を通して行われる．

/**
 *	sock_register - add a socket protocol handler
 *	@ops: description of protocol
 *
 *	This function is called by a protocol handler that wants to
 *	advertise its address family, and have it linked into the
 *	socket interface. The value ops->family corresponds to the
 *	socket system call protocol family.
 */
/*
 * Returns 0 on success, -ENOBUFS when ops->family >= NPROTO, and -EEXIST when
 * a handler is already registered in net_families[] for that family.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
		return -ENOBUFS;
	}

	/* Writers serialise on net_family_lock; the slot is published with rcu_assign_pointer(). */
	spin_lock(&net_family_lock);
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	/* NOTE(review): this message is printed even when err is -EEXIST. */
	pr_info("NET: Registered protocol family %d\n", ops->family);
	return err;
}

今回はAF_INET6なのでnet/ipv6からsock_registerをgrepするとnet/ipv6/af_inet6.cでプロトコルファミリを登録していることが分かった．

static int __init inet6_init(void)
{
	struct list_head *r;
	int err = 0;

	// ...

	/* Register the family here so that the init calls below will
	 * be able to create sockets. (?? is this dangerous ??)
	 */
	err = sock_register(&inet6_family_ops);

	// ...
}

このため，__sock_create()のpf->create(net, sock, protocol, kern)は，af_inet6.cのinet6_family_opsよりinet6_create()が呼ばれることになる．

static int inet6_create(struct net *net, struct socket *sock, int protocol,
			int kern)
{
	struct inet_protosw *answer;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {
		// ...
	}
	// ...
}

まず，sock->typeには__sock_create()でsockをアロケートした後にシステムコールの引数であるtypeを格納している．つまり今回だとSOCK_DGRAM(=2)が格納されている．
inetsw6[]はlist_head型の配列であり，inet6_register_protosw()を通して配列にプロトコルを登録する．

/*
 * Add @p to the list inetsw6[p->type].
 *
 * Returns 0 on success, -EINVAL when p->type >= SOCK_MAX, and -EPERM when a
 * permanent entry with the same protocol number is already on the list.
 */
int inet6_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	struct list_head *last_perm;
	int protocol = p->protocol;
	int ret;

	spin_lock_bh(&inetsw6_lock);

	ret = -EINVAL;
	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	answer = NULL;
	ret = -EPERM;
	last_perm = &inetsw6[p->type];
	/*
	 * Walk the list for this socket type. answer stays non-NULL after the
	 * loop only when we broke out on a permanent entry with the same
	 * protocol; last_perm tracks the last permanent entry seen.
	 */
	list_for_each(lh, &inetsw6[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
	ret = 0;
out:
	spin_unlock_bh(&inetsw6_lock);
	return ret;

out_permanent:
	/* ret is still -EPERM here. */
	pr_err("Attempt to override permanent protocol %d\n", protocol);
	goto out;

out_illegal:
	/* ret is still -EINVAL here. */
	pr_err("Ignoring attempt to register invalid socket type %d\n",
	       p->type);
	goto out;
}

Linux Kernelのリスト構造は構造体の型に依存しないようになっており，リストへの追加は奇妙な操作に見えるかもしれない．詳しくは他の方が書かれたページを参照してほしい．
kernhack.hatenablog.com

配列のtype番目のリストにinet_protoswな要素を追加している．データ構造から分かるように，各タイプごとに複数のプロトコルが登録できるようになっている．そんな必要あるのかと思って調べてみたところ，UDP-LiteというプロトコルもSOCK_DGRAMを用いていた(UDP-Liteを初めて知った)．include/net/protocol.hのコメントより，INET_PROTOSW_PERMANENTというのは取り除くことが出来ないプロトコルとのことだった．

登録は各L4プロトコルごとに行っている．今回はUDPなのでnet/ipv6/udp.cを見てみる．

/*
 * inetsw6[] entry for UDP over IPv6: SOCK_DGRAM sockets with IPPROTO_UDP use
 * udpv6_prot and inet6_dgram_ops. The entry is flagged permanent.
 */
static struct inet_protosw udpv6_protosw = {
	.type =      SOCK_DGRAM,
	.protocol =  IPPROTO_UDP,
	.prot =      &udpv6_prot,
	.ops =       &inet6_dgram_ops,
	.flags =     INET_PROTOSW_PERMANENT,
};

/*
 * Register UDPv6 in two steps: inet6_add_protocol() for IPPROTO_UDP, then
 * inet6_register_protosw() for the socket-type table. If the second step
 * fails, the first is undone. Returns 0 or the error of the failing step.
 */
int __init udpv6_init(void)
{
	int ret;

	ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
	if (ret)
		goto out;

	ret = inet6_register_protosw(&udpv6_protosw);
	if (ret)
		goto out_udpv6_protocol;
out:
	return ret;

out_udpv6_protocol:
	/* Roll back the inet6_add_protocol() done above. */
	inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
	goto out;
}

inet6_create()に戻る．

static int inet6_create(struct net *net, struct socket *sock, int protocol,
			int kern)
{
	struct inet_protosw *answer;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (err) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-10-proto-132-type-1
			 * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
						PF_INET6, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-10-proto-132
			 * (net-pf-PF_INET6-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
						PF_INET6, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	// ...

out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

今回のユーザランドプログラムではprotocolに0を指定しており，IPPROTO_IP(=0)と一致する．answerにはUDPに対応するinet_protoswなオブジェクトが格納されているため，ここでprotocolはIPPROTO_UDPとなる．
UDP-Liteのサンプルコードを見てみると，protocolにIPPROTO_UDPLITEを指定している．このようにtypeは重複していてもprotocolを指定することでちゃんと動作するようになっている(inetsw6[type]の登録順で動作は変わるので，UDPが必ず先に登録されるようになっているんだと思う)．

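L4プロトコルを取得した後，RAWソケットである場合の条件式がある．ここではカーネル内部からの作成(kern != 0)でない場合に，ns_capable(net->user_ns, CAP_NET_RAW)によって，そのネットワーク名前空間に対応するユーザ名前空間でCAP_NET_RAWケーパビリティを持っているかを確認し，持っていなければ-EPERMで失敗させている．その後にsock->opsにinet_protoswのopsを格納している．UDPの場合はinet6_dgram_opsが格納される．後はanswer_protとanswer_flagsにメモしてクリティカルセクションを抜ける．これでソケットが使用するL4プロトコル(inet_protosw)を特定する作業は終了．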

続きを追っていこう．

static int inet6_create(struct net *net, struct socket *sock, int protocol,
			int kern)
{
	struct inet_sock *inet;
	struct ipv6_pinfo *np;
	struct sock *sk;
	struct inet_protosw *answer;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	// ...

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	// ...

out:
	return err;
}

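ここで注意したいのはstruct socketとstruct sockの違いである．前者はユーザランドから見えるソケット(sock_alloc()で見たようにinodeと結びつき，最終的にファイルディスクリプタに対応付けられる側)を表す構造体，後者はプロトコルスタック内部で用いるソケットの構造体である．どちらもカーネル内のオブジェクトであり，ユーザランドのプログラムが直接触るわけではない．struct socketの場合はsock，struct sockの場合はskという変数名である場合が多い(気がする)．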
sk_alloc()という関数でstruct sockをアロケートしている．sk_alloc()はnet/core/sock.cで定義されている．

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	// ...
}

まずはsk_prot_alloc()を見てみる．

/*
 * Allocate the memory backing a struct sock for @prot: from prot->slab when
 * it is set, otherwise kmalloc(prot->obj_size). Then run security_sk_alloc()
 * and take a reference on prot->owner.
 *
 * Returns the new object, or NULL on any failure; on failure everything
 * obtained here has been released again.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/*
		 * __GFP_ZERO is masked off for the slab allocation; when the
		 * caller asked for it, sk_prot_clear_nulls() is called instead.
		 */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	/* Undo security_sk_alloc(), then fall through to free the object. */
	security_sk_free(sk);
out_free:
	/* Free with the allocator that produced sk. */
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

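struct proto(prot)がslabというメンバを持っていたらスラブアロケータからメモリを確保，なければkmalloc()で確保している．この時，確保するオブジェクトのサイズはsizeof(struct sock)ではなく，prot->obj_size = sizeof(struct udp6_sock)であることに注意されたい．udpv6_protの静的な初期化子にはslabが書かれていないが，slabはinet6_init()から呼ばれるproto_register(&udpv6_prot, 1)の中で実行時に作成されるため，実際にはkmem_cache_alloc()で確保していることになる(inet6_create()にWARN_ON(!answer_prot->slab)があるのもこのためである)．kmemcheck_annotate_bitfield()はkmemcheckが未初期化メモリとしてビットフィールドを検知するのを防ぐために使用するマクロであり，引数で与えているflagsはマクロ内でトークン連結(flags##_begin / flags##_end)に使われる名前であって変数ではない．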

security_sk_alloc()はsecurity/security.cで定義されており，security_socket_create()と同様の処理を行ってhook関数を呼び出している．security_socket_create()と異なる点はhook関数にSELinuxのselinux_sk_alloc_security()だけでなく，SMACKのsmack_sk_alloc_security()も登録される場合があることである．今回はSMACKについては追わないが以下が詳しい．

www.ipa.go.jp
Smack for simplified access control [LWN.net]

selinux_sk_alloc_security()を見ていく．selinux_sk_alloc_security()はsecurity/selinux/hooks.cで定義されている．

/*
 * SELinux sk_alloc hook: allocate a zeroed sk_security_struct, fill in its
 * initial values and store it in sk->sk_security.
 * Returns 0, or -ENOMEM when the allocation fails.
 */
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
{
	struct sk_security_struct *sksec;

	sksec = kzalloc(sizeof(*sksec), priority);
	if (!sksec)
		return -ENOMEM;

	/* Both SIDs start out unlabeled and the class starts as the generic socket class. */
	sksec->peer_sid = SECINITSID_UNLABELED;
	sksec->sid = SECINITSID_UNLABELED;
	sksec->sclass = SECCLASS_SOCKET;
	selinux_netlbl_sk_security_reset(sksec);
	sk->sk_security = sksec;

	return 0;
}

SELinux用の構造体であるsk_security_struct型のオブジェクトを作成し，初期化した後にsk->sk_securityに格納するだけである．なお，sk->sk_securityはvoid型のポインタである．

sk_prot_alloc()に戻る．try_module_get()はkernel/module.cで定義されている．中で呼んでいるtrace_module_get()が何かよく分かっていないが，それより前の条件式内でatomic_inc_not_zero()としてモジュールの参照カウンタをインクリメントしていることが重要なのだと思う．Linux Kernelは参照カウンタによるGC(といってもGC専用のスレッドがいるわけではなく，参照が0になったときにfreeするだけ)を行っていて，大抵はxxx_get()で参照カウンタのインクリメント，xxx_put()で参照カウンタをデクリメントしている(skb_put()のような例外もある)．

/*
 * Try to take a reference on @module.
 *
 * Returns true when @module is NULL (nothing to do) or when the module is
 * live and atomic_inc_not_zero() on its refcnt succeeded; false otherwise.
 */
bool try_module_get(struct module *module)
{
	bool ret = true;

	if (module) {
		preempt_disable();
		/* Note: here, we can fail to get a reference */
		if (likely(module_is_live(module) &&
			   atomic_inc_not_zero(&module->refcnt) != 0))
			trace_module_get(module, _RET_IP_);
		else
			ret = false;

		preempt_enable();
	}
	return ret;
}

preempt_disable()とpreempt_enable()は名前の通りプリエンプションの無効化/有効化をしている．

https://www.kernel.org/doc/Documentation/preempt-locking.txt
Embedded UNIX Vol1 Column

よく分かっていないので何とも言えないが，CPUごとに存在する変数を参照するような場合にプリエンプションを無効化しているようなので，moduleはCPUごとに存在しているのだろうか．ちなみにこの時のmoduleにはudpv6_protのownerであるTHIS_MODULEが格納されている．正直何も分からん．

THIS_MODULEマクロ - Linuxの備忘録とか・・・(目次へ）

sk_tx_queue_clear()は以下の簡単な処理を行う関数である．この後，sk_prot_alloc()は初期化が終わったskを返す．

/*
 * Reset sk->sk_tx_queue_mapping to -1.
 * NOTE(review): -1 presumably means "no tx queue recorded" - confirm.
 */
static inline void sk_tx_queue_clear(struct sock *sk)
{
	sk->sk_tx_queue_mapping = -1;
}

skのアロケートに成功したのでsk_alloc()に戻ろう．以降は構造体の初期化が主な処理となっている．

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	/* Always request zeroed memory for the new object. */
	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		/*
		 * Only non-kernel sockets (kern == 0) take a reference on the
		 * network namespace; the pointer itself is stored either way.
		 */
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		/* cgroup-related setup for the new socket. */
		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	/* NULL when sk_prot_alloc() failed. */
	return sk;
}

一部気になる部分はあるが割愛する．inet6_create()に戻ってsock_init_data()に入る．先述したとおり，struct socket *sockはユーザランド，struct sock *skはカーネルで用いる構造体である．やっていることはほとんど構造体(sk)の初期化である．

/*
 * Fill @sk with default values and link it to @sock.
 *
 * @sock may be NULL: in that case no socket fields are copied, sk_wq is NULL
 * and sk_uid is uid 0 of the namespace's user_ns. The reference count is set
 * last, after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	/* sk -> sock direction of the link. */
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		/* Copy type and wait queue from the socket; sock -> sk direction of the link. */
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	/* Kernel and non-kernel sockets use separate lockdep classes, indexed by family. */
	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}

これにより，sockとskは

sk->sk_socket = sock;
sock->sk = sk;

という関係になる．その他は必要になったときに見ていく(疲れてきた)．それでは再びinet6_create()を見ていく．

static int inet6_create(struct net *net, struct socket *sock, int protocol,
			int kern)
{
	struct inet_sock *inet;
	struct ipv6_pinfo *np;
	struct sock *sk;
	struct inet_protosw *answer;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	// ...

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	sk->sk_destruct		= inet_sock_destruct;
	sk->sk_family		= PF_INET6;
	sk->sk_protocol		= protocol;

	sk->sk_backlog_rcv	= answer->prot->backlog_rcv;

	inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
	np->hop_limit	= -1;
	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
	np->mc_loop	= 1;
	np->pmtudisc	= IPV6_PMTUDISC_WANT;
	np->autoflowlabel = ip6_default_np_autolabel(net);
	np->repflow	= net->ipv6.sysctl.flowlabel_reflect;
	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;

	/* Init the ipv4 part of the socket since we can have sockets
	 * using v6 API for ipv4.
	 */
	inet->uc_ttl	= -1;

	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;
	/*
	 * Increment only the relevant sk_prot->socks debug field, this changes
	 * the previous behaviour of incrementing both the equivalent to
	 * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
	 *
	 * This allows better debug granularity as we'll know exactly how many
	 * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
	 * transport protocol socks. -acme
	 */
	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	if (!kern) {
		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

inet_sk()はただのキャストである．ぎょっとするかもしれないが，sk_prot_alloc()でアロケートしたサイズはsizeof(struct sock)よりも大きいsizeof(struct udp6_sock)であることを思い出してほしい．struct inet_sockの最初のメンバはstruct sockであり，inet_sockとして操作してもskの領域を破壊することはない．

/*
 * View @sk as a struct inet_sock. This is a plain pointer cast: no check is
 * made here that the object really is an inet_sock.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}

inet6_sk_generic()もほとんど同様のことを行っている．sk_alloc()より，UDPの場合はsk_protにはudpv6_protが格納されており，udpv6_protのobj_sizeメンバにはsizeof(struct udp6_sock)が格納されている．struct udp6_sockの一番最後の部分にipv6_pinfo用のメモリが確保されており，そこまでoffsetを進めている．

/*
 * Locate the struct ipv6_pinfo of @sk. It is taken to be the last
 * sizeof(struct ipv6_pinfo) bytes of the protocol object, whose total size
 * is sk->sk_prot->obj_size.
 */
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
	/* Byte offset from the start of sk to the trailing ipv6_pinfo. */
	const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}

/* UDP socket object for IPv6: the generic UDP socket followed by the IPv6 state. */
struct udp6_sock {
	struct udp_sock	  udp;
	/* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */
	struct ipv6_pinfo inet6;
};

後は現在のnetwork namespaceに基づいてsysctlの値をskに設定したり，RAWソケットの場合指定されたプロトコル番号をinet->inet_numに設定したり，デフォルトの挙動を設定するなどの初期化を行っている．今回はRAWソケットではないためif (inet->inet_num)の処理は行われない．また，if (sk->sk_prot->init)については，4.14のudpv6_protには.init = udp_init_sockが設定されているはずなので(要確認)，ここでudp_init_sock()が呼ばれると思われる．
kernは0であるためBPF_CGROUP_RUN_PROG_INET_SOCK()が実行されるが，正直良く分からないので飛ばす．

長くなったが__sock_create()のpf->create()はこれで終わり．後の処理を見ていく．

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	// ...

	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

try_module_get()は先に出てきたとおり参照カウンタのインクリメントを行っている．その後module_put()でプロトコルファミリのownerメンバの参照カウンタをデクリメントしている．こちらもtrace_module_put()を呼んでいるが何かがよく分からないので飛ばす．

/*
 * Drop one reference on @module; does nothing when @module is NULL.
 * A negative result from atomic_dec_if_positive() triggers a warning.
 */
void module_put(struct module *module)
{
	int ret;

	if (module) {
		preempt_disable();
		ret = atomic_dec_if_positive(&module->refcnt);
		WARN_ON(ret < 0);	/* Failed to put refcount */
		trace_module_put(module, _RET_IP_);
		preempt_enable();
	}
}

__sock_create()に戻り，security_socket_post_create()を見ていこう．おなじみのsecurity/security.cで定義されている．

/*
 * Run the hooks registered on security_hook_heads.socket_post_create.
 * The result is 0 (the IRC argument) when no hook returns non-zero.
 */
int security_socket_post_create(struct socket *sock, int family,
				int type, int protocol, int kern)
{
	return call_int_hook(socket_post_create, 0, sock, family, type,
						protocol, kern);
}

こちらもSELinuxのhook関数であるselinux_socket_post_create()とSMACKのhook関数であるsmack_socket_post_create()が呼び出される場合がある．security/selinux/hooks.cより，selinux_socket_post_create()を見ていく．

/*
 * SELinux socket_post_create hook: record the security class and SID of the
 * new socket in its inode security struct and, when sock->sk is set, in the
 * sk security struct too, then call the NetLabel post-create step.
 * Returns 0 on success or a non-zero error.
 */
static int selinux_socket_post_create(struct socket *sock, int family,
				      int type, int protocol, int kern)
{
	const struct task_security_struct *tsec = current_security();
	struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
	struct sk_security_struct *sksec;
	u16 sclass = socket_type_to_security_class(family, type, protocol);
	u32 sid = SECINITSID_KERNEL;
	int err = 0;

	/* Kernel sockets keep SECINITSID_KERNEL; otherwise the SID comes from socket_sockcreate_sid(). */
	if (!kern) {
		err = socket_sockcreate_sid(tsec, sclass, &sid);
		if (err)
			return err;
	}

	isec->sclass = sclass;
	isec->sid = sid;
	isec->initialized = LABEL_INITIALIZED;

	if (sock->sk) {
		/* Replace the initial class and SID stored in sk->sk_security. */
		sksec = sock->sk->sk_security;
		sksec->sclass = sclass;
		sksec->sid = sid;
		err = selinux_netlbl_socket_post_create(sock->sk, family);
	}

	return err;
}

selinux_socket_create()とは異なり，inode_security_structというinode絡みと思われる構造体も初期化している．SOCK_INODE()はsocketを含んでいる構造体(struct socket_alloc)からvfs_inodeメンバへのポインタを取得するインライン関数であり，内部でcontainer_of()という~~黒魔術~~マクロを使っている．container_of()マクロはリストの操作(list_entry())などにも用いられている汎用マクロなので，一回読んでみるといいかもしれない．container_of()はinclude/linux/kernel.hに，offsetof()はinclude/linux/stddef.hにある．

/*
 * Return the inode paired with @socket: step from the embedded socket member
 * back to its enclosing struct socket_alloc and take the address of the
 * vfs_inode member.
 */
static inline struct inode *SOCK_INODE(struct socket *socket)
{
	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:	the pointer to the member.
 * @type:	the type of the container struct this is embedded in.
 * @member:	the name of the member within the struct.
 *
 */
/*
 * The result is computed by subtracting offsetof(type, member) from ptr.
 * The build fails when *ptr is neither of the member's type nor void.
 */
#define container_of(ptr, type, member) ({				\
	void *__mptr = (void *)(ptr);					\
	BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) &&	\
			 !__same_type(*(ptr), void),			\
			 "pointer type mismatch in container_of()");	\
	((type *)(__mptr - offsetof(type, member))); })

/*
 * offsetof(TYPE, MEMBER): byte offset of MEMBER inside TYPE.
 * Uses __compiler_offsetof when it is defined; otherwise takes the address
 * of MEMBER in a TYPE object placed at address 0.
 */
#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE, MEMBER)	__compiler_offsetof(TYPE, MEMBER)
#else
#define offsetof(TYPE, MEMBER)	((size_t)&((TYPE *)0)->MEMBER)
#endif

inode_security_novalidate()は単純にinodeのi_securityメンバを返している．

/* Return the i_security pointer of @inode as-is; nothing is checked here. */
static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
{
	return inode->i_security;
}

socket_type_to_security_class()はselinux_socket_create()の時とほぼ同じであり，sclassはSECCLASS_UDP_SOCKETとなる．
まず，kern=0なので，現在のセキュリティとsclassを元にsidが検証される．その後，isecの初期化が終わったらsock->sk->sk_securityにsidとsclassが格納され，selinux_netlbl_socket_post_create()が呼び出される．sock->sk->sk_securityはSELinuxであればselinux_sk_alloc_security()で見たように，sk_security_struct型のオブジェクトが格納されている．

/**
 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
 * @sock: the socket to label
 * @family: protocol family
 *
 * Description:
 * Attempt to label a socket using the NetLabel mechanism using the given
 * SID.  Returns zero values on success, negative values on failure.
 *
 */
int selinux_netlbl_socket_post_create(struct sock *sk, u16 family)
{
	int rc;
	struct sk_security_struct *sksec = sk->sk_security;
	struct netlbl_lsm_secattr *secattr;

	/* Only PF_INET and PF_INET6 sockets are labelled; other families succeed untouched. */
	if (family != PF_INET && family != PF_INET6)
		return 0;

	secattr = selinux_netlbl_sock_genattr(sk);
	if (secattr == NULL)
		return -ENOMEM;
	rc = netlbl_sock_setattr(sk, family, secattr);
	switch (rc) {
	case 0:
		sksec->nlbl_state = NLBL_LABELED;
		break;
	case -EDESTADDRREQ:
		/* Not reported as an error: record NLBL_REQSKB and return 0. */
		sksec->nlbl_state = NLBL_REQSKB;
		rc = 0;
		break;
	}

	/* Any other rc is passed back to the caller unchanged. */
	return rc;
}

コメントから推察するとNetLabelという技術を使っているらしい．SELinux自体もよく分かっていないので今後サーベイしていこう．__sock_create()はこれで終わりなので，sock_create()の処理は以上で終了となる．
socket(2)に戻ろう．次は作成したsocketをsock_map_fd()という関数に渡している．またファイル関係なので適当に飛ばしながら見ていこう．

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	// ...

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

/*
 * Bind @sock to a new file descriptor: reserve a descriptor number, create
 * the struct file with sock_alloc_file(), then install the file under that
 * number. Returns the descriptor, or a negative errno.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	newfile = sock_alloc_file(sock, flags, NULL);
	if (likely(!IS_ERR(newfile))) {
		fd_install(fd, newfile);
		return fd;
	}

	/* File creation failed: give the reserved number back and return the encoded error. */
	put_unused_fd(fd);
	return PTR_ERR(newfile);
}

get_unused_fd_flags()はcurrent->filesに基づいてファイルディスクリプタをアロケートしている．今までcurrentについて見ないふりをしていたが，これはシステムコールを発行したユーザランドプロセスが格納されているらしい．つまり，ファイルディスクリプタがプロセスごとにユニークになるのはここで分岐しているためであると予想される．

stackoverflow.com

/*
 * Allocate a descriptor number in current->files, searching from 0 up to the
 * RLIMIT_NOFILE limit, via __alloc_fd().
 */
int get_unused_fd_flags(unsigned flags)
{
	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}

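その後，sock_alloc_file()でsocketに対応するstruct fileを作成する．下に貼ったコメントは「ファイルディスクリプタを返す」と書いているが，シグネチャを見ると分かるように，sock_alloc_file()が返すのはファイルディスクリプタ(int)ではなくstruct fileへのポインタである．fd(番号)の確保はget_unused_fd_flags()が別個に行っており，両者は役割が異なる．何も分からんのでコメントだけ貼っておく．「In any case returned fd MAY BE not valid!」というのは，コメントにある通り，fd空間を共有している別スレッドがこの関数から戻る前にそのfdをcloseしてしまう可能性がある，という意味のようだ．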

/*
 *	Obtains the first available file descriptor and sets it up for use.
 *
 *	These functions create file structures and maps them to fd space
 *	of the current process. On success it returns file descriptor
 *	and file struct implicitly stored in sock->file.
 *	Note that another thread may close file descriptor before we return
 *	from this function. We use the fact that now we do not refer
 *	to socket after mapping. If one day we will need it, this
 *	function will increment ref. count on file by 1.
 *
 *	In any case returned fd MAY BE not valid!
 *	This race condition is unavoidable
 *	with shared fd spaces, we cannot solve it inside kernel,
 *	but we take care of internal coherence yet.
 */

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	// ...
}

sock_map_fd()に戻る．fdとnewfileをfd_installという関数に渡している．コメントを見るにファイルポインタをfdの配列に格納するらしいので，ここでfdとnewfileが関連付けられるものだと推測される．

/* Install @file under descriptor @fd in the calling task's table (current->files). */
void fd_install(unsigned int fd, struct file *file)
{
	__fd_install(current->files, fd, file);
}

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
		struct file *file)
{
	// ...
}

以上でsocket(2)の処理が全て終了する．分からないところを端折ったり，最後の方は疲れて適当になったりしていたが，次回はsendto(2)を見ていく予定である．

情弱ログ

参考にならないので当てにしないでください

Linux Kernelのパケット送信を追う(ソケット作成編)