rtnetlink的多播组在文件include/uapi/linux/rtnetlink.h中定义,目前共有32个枚举值,除去第一个RTNLGRP_NONE,即31个组。
/* RTnetlink multicast groups */
/*
 * Enumerators are numbered consecutively starting at RTNLGRP_NONE (0). Every
 * name is also #defined to itself, which makes it visible to the preprocessor.
 */
enum rtnetlink_groups {
RTNLGRP_NONE,
#define RTNLGRP_NONE RTNLGRP_NONE
RTNLGRP_LINK,
#define RTNLGRP_LINK RTNLGRP_LINK
...
RTNLGRP_IPV4_MROUTE_R,
#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
RTNLGRP_IPV6_MROUTE_R,
#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
/* Sentinel: one more than the last real group listed above. */
__RTNLGRP_MAX
};
/* Value of the last real group in the enum. */
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
再看一下文件include/uapi/linux/netlink.h中定义的sockaddr_nl结构,其成员nl_groups为一个32位的变量。
/*
 * Netlink socket address. nl_groups is a 32-bit bitmask, so a single address
 * has room for at most 32 multicast group bits.
 */
struct sockaddr_nl {
__kernel_sa_family_t nl_family; /* AF_NETLINK */
unsigned short nl_pad; /* zero */
__u32 nl_pid; /* port ID */
__u32 nl_groups; /* multicast groups mask */
};
这将导致在应用层进程group绑定的时候,group不能超过32个。参见以下的iproute2-5.9.0代码中的monitor功能代码,函数nl_mgrp会对group大于31的情况进行判断,因为当前最大的组RTNLGRP_IPV6_MROUTE_R值就是31。
/*
 * Excerpt of the "ip monitor" entry point: builds a subscription mask from
 * individual group numbers and passes it to rtnl_open().
 */
int do_ipmonitor(int argc, char **argv)
{
int lnexthop = 0, nh_set = 1;
char *file = NULL;
/* Bitmask of groups to subscribe to; each nl_mgrp() result is OR-ed in. */
unsigned int groups = 0;
groups |= nl_mgrp(RTNLGRP_LINK);
...
groups |= nl_mgrp(RTNLGRP_MPLS_NETCONF);
/* Open the handle with the accumulated mask; exit with status 1 on failure. */
if (rtnl_open(&rth, groups) < 0)
exit(1);
这里固定写了31,而没有使用宏定义RTNLGRP_MAX。并且,提示信息显示,如果要监听超过31的组,需要使用setsockopt接口。
/*
 * Convert a multicast group number into its bit in a 32-bit group mask.
 *
 * Returns 0 for group 0, otherwise the mask with bit (group - 1) set.
 * Groups above 31 are not converted: a message is written to stderr and the
 * process exits.
 *
 * NOTE(review): the limit is the literal 31, and "%d" is used to print a
 * __u32 argument; "%u" would match the type. Confirm before changing.
 */
static inline __u32 nl_mgrp(__u32 group)
{
if (group > 31 ) {
fprintf(stderr, "Use setsockopt for this group %d\n", group);
exit(-1);
}
/* Group numbers are 1-based, so group N occupies bit N - 1. */
return group ? (1 << (group - 1)) : 0;
如下iproute2使用rtnl_open子函数,bind函数使用的是sockaddr_nl结构的成员nl_groups,其与subscriptions都是32位的长度。
/*
 * Excerpt: open a netlink socket for the given protocol and bind it using
 * "subscriptions" as the multicast group mask.
 *
 * Returns -1 after printing a diagnostic if socket() or bind() fails.
 */
int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions, int protocol)
{
/* Raw AF_NETLINK socket, opened with SOCK_CLOEXEC. */
rth->fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
if (rth->fd < 0) {
perror("Cannot open netlink socket");
return -1;
}
...
/* Start from a zeroed local address, then fill in family and group mask. */
memset(&rth->local, 0, sizeof(rth->local));
rth->local.nl_family = AF_NETLINK;
rth->local.nl_groups = subscriptions;
/* The group mask reaches the kernel through the bound address. */
if (bind(rth->fd, (struct sockaddr *)&rth->local, sizeof(rth->local)) < 0) {
perror("Cannot bind netlink socket");
return -1;
}
内核netlink设置组
首先看一下内核5.0中套接口结构netlink_sock的定义,与组相关的有三个变量,其中,subscriptions表示监听组的数量;groups保存监听组的位图bitmap;变量ngroups表示的是目前groups数组中最大可保存的组数量。
/* Excerpt of the per-socket netlink state; only group-related members are shown. */
struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */
struct sock sk;
...
/* Count maintained through netlink_update_subscriptions(). */
u32 subscriptions;
/* Number of groups the "groups" bitmap is currently sized for. */
u32 ngroups;
/* Bitmap of joined groups; bit (group - 1) marks membership in "group". */
unsigned long *groups;
netlink函数netlink_bind如下,如果地址结构sockaddr_nl中的成员nl_groups有值,内核将动态分配其空间,参见以下函数netlink_realloc_groups。由此可见内核支持监听超过32个的groups。
/*
 * Excerpt of the bind() handler for netlink sockets, showing how the group
 * mask carried in the supplied address is handled.
 */
static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
/* Requested mask: the 32-bit nl_groups widened to unsigned long. */
unsigned long groups = nladdr->nl_groups;
/* Only superuser is allowed to listen multicasts */
if (groups) {
if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
return -EPERM;
/* Grow the per-socket group bitmap if the protocol has more groups. */
err = netlink_realloc_groups(sk);
如果当前套接口的组数量(nlk->ngroups)少于long类型的位数,将套接口地址groups中超过netlink_sock结构中ngroups(最大组位数)的部分清零。感觉这里使用BITS_PER_LONG不合适,因为groups的值取自nladdr->nl_groups,后者只有32位。
/*
 * Discard requested bits at or above ngroups. The guard keeps the shift
 * count below BITS_PER_LONG.
 */
if (nlk->ngroups < BITS_PER_LONG)
groups &= (1UL << nlk->ngroups) - 1;
套接口nlk的成员变量subscriptions表示监听的组数量,函数netlink_update_subscriptions更新其值,注意这里的运算: hweight32(groups) - hweight32(nlk->groups[0])的结果表示监听组的数量的变化,变化值加到原来的subscriptions上,就是新的监听组数量。因为nlk->groups数组的其它元素,比如nlk->groups[1]中也可能有监听组,所以使用差值计算。
最后,将套接口nlk->groups[0]即第一个元素的低32位清零,或上新的监听组位图groups。
从这里可以看出,并没有修改nlk->groups数组除首个元素的其它元素,由于nlk->groups元素类型为long,对于64位系统,也没有修改首个元素的高32位。
/* Nothing to change: no bits requested and none of the low 32 bits are set. */
if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
goto unlock;
/* NOTE(review): presumably trades the shared table lock for exclusive access; confirm. */
netlink_unlock_table();
netlink_table_grab();
/*
 * Adjust the count by the difference between the bits in the new mask and
 * the bits in the current first word (hweight32 presumably counts set bits
 * in a 32-bit value; confirm).
 */
netlink_update_subscriptions(sk, nlk->subscriptions +
hweight32(groups) -
hweight32(nlk->groups[0]));
/* Replace the low 32 bits of the first word; any higher bits are preserved. */
nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
以下为netlink_realloc_groups函数,如果协议(例如NETLINK_ROUTE)指定的组数量(groups),大于套接口nlk目前的组数量(ngroups),需要对nlk的数组进行扩充,完成之后,将新扩充出来的空间进行清零操作。
/*
 * Excerpt: grow the socket's group bitmap to the number of groups registered
 * for its protocol.
 *
 * Sets err to -ENOENT if the protocol is not registered and to -ENOMEM if
 * the reallocation fails. Leaves the bitmap untouched when it already covers
 * at least that many groups.
 */
static int netlink_realloc_groups(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
unsigned int groups;
unsigned long *new_groups;
...
/* Group count registered for this socket's protocol. */
groups = nl_table[sk->sk_protocol].groups;
if (!nl_table[sk->sk_protocol].registered) {
err = -ENOENT;
goto out_unlock;
}
/* Already large enough: nothing to do. */
if (nlk->ngroups >= groups)
goto out_unlock;
/* NOTE(review): NLGRPSZ(n) is presumably the bitmap size in bytes for n groups; confirm. */
new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
if (new_groups == NULL) {
err = -ENOMEM;
goto out_unlock;
}
/* Zero only the tail added by the reallocation; existing bits are kept. */
memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
/* Record the new bitmap and its capacity. */
nlk->groups = new_groups;
nlk->ngroups = groups;
对于rtnetlink,其在注册时,groups数量设置为最大值RTNLGRP_MAX,对应于目前内核的31。
/*
 * Excerpt: per-namespace init that creates the NETLINK_ROUTE kernel socket,
 * registering it with RTNLGRP_MAX groups.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
struct sock *sk;
struct netlink_kernel_cfg cfg = {
/* Group count registered for this protocol. */
.groups = RTNLGRP_MAX,
.input = rtnetlink_rcv,
.cb_mutex = &rtnl_mutex,
/* The same flag is checked via netlink_allowed() before group changes. */
.flags = NL_CFG_F_NONROOT_RECV,
.bind = rtnetlink_bind,
};
sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
之前提到的套接口结构成员subscriptions变量,由函数netlink_update_subscriptions进行更新,同时,更新套接口的绑定链表。
/*
 * Store a new subscription count for the socket and keep its membership in
 * the protocol's mc_list in step: the socket is removed when the count drops
 * to zero and added when it rises from zero.
 */
static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
struct netlink_sock *nlk = nlk_sk(sk);
/* Non-zero -> zero: drop the socket from the bind list. */
if (nlk->subscriptions && !subscriptions)
__sk_del_bind_node(sk);
/* Zero -> non-zero: add the socket to this protocol's mc_list. */
else if (!nlk->subscriptions && subscriptions)
sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
nlk->subscriptions = subscriptions;
netlink获取组信息
如下函数netlink_getname所示, nladdr->nl_groups的取值为groups[0]中的值,虽然groups[0]为long类型,但是nl_groups为32位,所以对于64位系统,只取得了低32位的值。
/*
 * Excerpt: fill a sockaddr_nl with either the peer's or the socket's own
 * port ID and group mask, depending on "peer".
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
nladdr->nl_family = AF_NETLINK;
nladdr->nl_pad = 0;
if (peer) {
/* Peer address: destination port and a mask derived from dst_group. */
nladdr->nl_pid = nlk->dst_portid;
nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
} else {
nladdr->nl_pid = nlk->portid;
netlink_lock_table();
/*
 * Only the first bitmap word is reported, and nl_groups is 32 bits wide,
 * so bits beyond the low 32 are not visible through this interface.
 */
nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
setsockopt接口
除了以上netlink接口设置组,还可通过setsockopt进行组设置,后者不受32位的限制。如下netlink_setsockopt,设置选项为NETLINK_ADD_MEMBERSHIP。
/*
 * Excerpt of the netlink setsockopt handler, showing the
 * NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP path, which takes a single
 * group number rather than a 32-bit mask.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
switch (optname) {
case NETLINK_ADD_MEMBERSHIP:
case NETLINK_DROP_MEMBERSHIP: {
if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
return -EPERM;
/* Grow the group bitmap to the protocol's group count if needed. */
err = netlink_realloc_groups(sk);
if (err) return err;
/* Group numbers are 1-based: reject 0 and anything beyond ngroups. */
if (!val || val - 1 >= nlk->ngroups)
return -EINVAL;
/* On join only, call the socket's netlink_bind hook if one is set. */
if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
err = nlk->netlink_bind(sock_net(sk), val);
if (err) return err;
}
netlink_table_grab();
/* Third argument is non-zero for a join and zero for a leave. */
netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP);
如下netlink_update_socket_mc函数,用于设置subscriptions数量,以及设置nlk->groups数组位。
/*
 * Set or clear one group's bit in the socket's group bitmap and bring the
 * subscription count in line with the change.
 *
 * group:  1-based group number; bit (group - 1) of the bitmap is used.
 * is_new: non-zero to join the group, zero to leave it.
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
unsigned int group, int is_new)
{
int old, new = !!is_new, subscriptions;
/* Current membership, used so the count is adjusted by the net change only. */
old = test_bit(group - 1, nlk->groups);
subscriptions = nlk->subscriptions - old + new;
if (new)
__set_bit(group - 1, nlk->groups);
else
__clear_bit(group - 1, nlk->groups);
netlink_update_subscriptions(&nlk->sk, subscriptions);
netlink_update_listeners(&nlk->sk);
在内核5.9.9版本中,看到有新的组定义:RTNLGRP_NEXTHOP和RTNLGRP_BRVLAN,
/* RTnetlink multicast groups */
/* Excerpt: tail of the enum, showing the entries that follow RTNLGRP_IPV6_MROUTE_R. */
enum rtnetlink_groups {
...
RTNLGRP_IPV6_MROUTE_R,
#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
/* Listed after RTNLGRP_IPV6_MROUTE_R, so their values are higher than it. */
RTNLGRP_NEXTHOP,
#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
RTNLGRP_BRVLAN,
#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
相应的iproute2-5.9.0中,实现了函数rtnl_add_nl_group来设置新增加的组,这两个新增组使用netlink的bind接口不能下发。
/*
 * Excerpt of the "ip monitor" entry point: after opening the handle with the
 * group mask, the nexthop group is joined separately.
 */
int do_ipmonitor(int argc, char **argv)
{
...
if (rtnl_open(&rth, groups) < 0)
exit(1);
/* RTNLGRP_NEXTHOP is joined through rtnl_add_nl_group() when lnexthop is set. */
if (lnexthop && rtnl_add_nl_group(&rth, RTNLGRP_NEXTHOP) < 0) {
fprintf(stderr, "Failed to add nexthop group to list\n");
exit(1);
}
/*
 * Join a single netlink multicast group on an already-open handle.
 *
 * rth:   handle whose socket descriptor (rth->fd) receives the option.
 * group: group number, passed by address as the option value.
 *
 * Returns the result of setsockopt() unchanged.
 */
int rtnl_add_nl_group(struct rtnl_handle *rth, unsigned int group)
{
	int rc;

	rc = setsockopt(rth->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
			&group, sizeof(group));

	return rc;
}
内核版本 5.0