rtnetlink组数量与设置

在文件include/uapi/linux/rtnetlink.h中定义了rtnetlink的组播组,目前枚举中共有32个值,除去第一个RTNLGRP_NONE,即31个组。

/* RTnetlink multicast groups */
enum rtnetlink_groups {
    RTNLGRP_NONE,
#define RTNLGRP_NONE        RTNLGRP_NONE
    RTNLGRP_LINK,
#define RTNLGRP_LINK        RTNLGRP_LINK
...
    RTNLGRP_IPV4_MROUTE_R,
#define RTNLGRP_IPV4_MROUTE_R   RTNLGRP_IPV4_MROUTE_R
    RTNLGRP_IPV6_MROUTE_R,
#define RTNLGRP_IPV6_MROUTE_R   RTNLGRP_IPV6_MROUTE_R
    __RTNLGRP_MAX
};
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)

再看一下文件include/uapi/linux/netlink.h中定义的sockaddr_nl结构,其成员nl_groups为一个32位的变量。

/*
 * Netlink socket address. nl_groups is a 32-bit field (__u32) holding the
 * multicast groups as a mask.
 */
struct sockaddr_nl {
    __kernel_sa_family_t    nl_family;  /* AF_NETLINK   */
    unsigned short  nl_pad;     /* zero     */
    __u32       nl_pid;     /* port ID  */
        __u32       nl_groups;  /* multicast groups mask */
};

这将导致在应用层进程group绑定的时候,group不能超过32个。参见以下的iproute2-5.9.0代码中的monitor功能代码,函数nl_mgrp会对group大于31的情况进行判断,因为当前最大的组RTNLGRP_IPV6_MROUTE_R值就是31。

int do_ipmonitor(int argc, char **argv)
{   
    int lnexthop = 0, nh_set = 1;
    char *file = NULL;
    unsigned int groups = 0;
    
    groups |= nl_mgrp(RTNLGRP_LINK);
    ...
    groups |= nl_mgrp(RTNLGRP_MPLS_NETCONF);

    if (rtnl_open(&rth, groups) < 0)
        exit(1);

如下所示,函数nl_mgrp中固定写了31,而没有使用宏定义RTNLGRP_MAX。并且,提示信息显示,如果要监听超过31的组,需要使用setsockopt接口。

static inline __u32 nl_mgrp(__u32 group)
{   
    if (group > 31 ) {
        fprintf(stderr, "Use setsockopt for this group %d\n", group);
        exit(-1);
    }   
    return group ? (1 << (group - 1)) : 0;

如下为iproute2中rtnl_open调用的子函数rtnl_open_byproto,bind函数使用的是sockaddr_nl结构的成员nl_groups,其与参数subscriptions都是32位的长度。

int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions, int protocol)
{
    rth->fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
    if (rth->fd < 0) {
        perror("Cannot open netlink socket");
        return -1;
    }
    ...
    memset(&rth->local, 0, sizeof(rth->local));
    rth->local.nl_family = AF_NETLINK;
    rth->local.nl_groups = subscriptions;

    if (bind(rth->fd, (struct sockaddr *)&rth->local, sizeof(rth->local)) < 0) {
        perror("Cannot bind netlink socket");
        return -1;
    }

内核netlink设置组

首先看一下内核5.0中套接口结构netlink_sock的定义,与组相关的有三个变量,其中,subscriptions表示监听组的数量;groups保存监听组的位图bitmap;变量ngroups表示的是目前groups数组中最大可保存的组数量。

struct netlink_sock {
    /* struct sock has to be the first member of netlink_sock */
    struct sock     sk;
    ...
    u32         subscriptions;
    u32         ngroups;
    unsigned long       *groups;

netlink函数netlink_bind如下,如果地址结构sockaddr_nl中的成员nl_groups有值,内核将动态分配其空间,参见以下函数netlink_realloc_groups。由此可见内核支持监听超过32个的groups。

static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
    struct sock *sk = sock->sk;
    struct net *net = sock_net(sk);
    struct netlink_sock *nlk = nlk_sk(sk);
    struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
    unsigned long groups = nladdr->nl_groups;

    /* Only superuser is allowed to listen multicasts */
    if (groups) {
        if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
            return -EPERM;
        err = netlink_realloc_groups(sk);

如果当前套接口的组数量(nlk->ngroups)少于long类型的位数,将套接口地址groups中超过netlink_sock结构中最大组位数(ngroups)的部分清零。感觉这里使用BITS_PER_LONG不合适,因为groups的值取自nladdr->nl_groups,后者只有32位。

    if (nlk->ngroups < BITS_PER_LONG)
        groups &= (1UL << nlk->ngroups) - 1;

套接口nlk的成员变量subscriptions表示监听的组数量,函数netlink_update_subscriptions更新其值,注意这里的运算: hweight32(groups) - hweight32(nlk->groups[0])的结果表示监听组的数量的变化,变化值加到原来的subscriptions上,就是新的监听组数量。因为nlk->groups数组的其它元素,比如nlk->groups[1]中也可能有监听组,所以使用差值计算。

最后,将套接口nlk->groups[0]即第一个元素的低32位清零,或上新的监听组位图groups。

从这里可以看出,并没有修改nlk->groups数组除首个元素的其它元素,由于nlk->groups元素类型为long,对于64位系统,也没有修改首个元素的高32位。

    if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
        goto unlock;
    netlink_unlock_table();

    netlink_table_grab();
    netlink_update_subscriptions(sk, nlk->subscriptions +
                     hweight32(groups) -
                     hweight32(nlk->groups[0]));
    nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;

以下为netlink_realloc_groups函数,如果协议(例如NETLINK_ROUTE)指定的组数量(groups),大于套接口nlk目前的组数量(ngroups),需要对nlk的数组进行扩充,完成之后,将新扩充出来的空间进行清零操作。

static int netlink_realloc_groups(struct sock *sk)
{
    struct netlink_sock *nlk = nlk_sk(sk);
    unsigned int groups;
    unsigned long *new_groups;
    ...
    groups = nl_table[sk->sk_protocol].groups;
    if (!nl_table[sk->sk_protocol].registered) {
        err = -ENOENT;
        goto out_unlock;
    }

    if (nlk->ngroups >= groups)
        goto out_unlock;

    new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
    if (new_groups == NULL) {
        err = -ENOMEM;
        goto out_unlock;
    }
    memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
           NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));

    nlk->groups = new_groups;
    nlk->ngroups = groups;

对于rtnetlink,其在注册时,groups数量设置为最大值RTNLGRP_MAX,对应于目前内核的31。

static int __net_init rtnetlink_net_init(struct net *net)
{
    struct sock *sk;
    struct netlink_kernel_cfg cfg = {
        .groups     = RTNLGRP_MAX,
        .input      = rtnetlink_rcv,
        .cb_mutex   = &rtnl_mutex,
        .flags      = NL_CFG_F_NONROOT_RECV,
        .bind       = rtnetlink_bind,
    };
   
    sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);

之前提到的套接口结构成员subscriptions变量,由函数netlink_update_subscriptions进行更新,同时,更新套接口的绑定链表。

static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
    struct netlink_sock *nlk = nlk_sk(sk);

    if (nlk->subscriptions && !subscriptions)
        __sk_del_bind_node(sk);
    else if (!nlk->subscriptions && subscriptions)
        sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
    nlk->subscriptions = subscriptions;

netlink获取组信息

如下函数netlink_getname所示, nladdr->nl_groups的取值为groups[0]中的值,虽然groups[0]为long类型,但是nl_groups为32位,所以对于64位系统,只取得了低32位的值。

static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer)
{
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);
    DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

    nladdr->nl_family = AF_NETLINK;
    nladdr->nl_pad = 0;

    if (peer) {
        nladdr->nl_pid = nlk->dst_portid;
        nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
    } else {
        nladdr->nl_pid = nlk->portid;
        netlink_lock_table();
        nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;

setsockopt接口

除了以上通过bind接口设置组,还可通过setsockopt进行组设置,后者不受32位的限制。如下netlink_setsockopt,设置选项为NETLINK_ADD_MEMBERSHIP。

static int netlink_setsockopt(struct socket *sock, int level, int optname,
                  char __user *optval, unsigned int optlen)
{
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);

    switch (optname) {
    case NETLINK_ADD_MEMBERSHIP:
    case NETLINK_DROP_MEMBERSHIP: {
        if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
            return -EPERM;
        err = netlink_realloc_groups(sk);
        if (err) return err;
		
        if (!val || val - 1 >= nlk->ngroups)
            return -EINVAL;
        if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
            err = nlk->netlink_bind(sock_net(sk), val);
            if (err) return err;
        }
        netlink_table_grab();
        netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP);

如下netlink_update_socket_mc函数,用于设置subscriptions数量,以及设置nlk->groups数组位。

static void netlink_update_socket_mc(struct netlink_sock *nlk,
                     unsigned int group, int is_new)
{
    int old, new = !!is_new, subscriptions;

    old = test_bit(group - 1, nlk->groups);
    subscriptions = nlk->subscriptions - old + new;
    if (new)
        __set_bit(group - 1, nlk->groups);
    else
        __clear_bit(group - 1, nlk->groups);
    netlink_update_subscriptions(&nlk->sk, subscriptions);
    netlink_update_listeners(&nlk->sk);

在内核5.9.9版本中,看到有新的组定义:RTNLGRP_NEXTHOP和RTNLGRP_BRVLAN。

/* RTnetlink multicast groups */
enum rtnetlink_groups {
    ...

    RTNLGRP_IPV6_MROUTE_R,
#define RTNLGRP_IPV6_MROUTE_R   RTNLGRP_IPV6_MROUTE_R
    RTNLGRP_NEXTHOP,
#define RTNLGRP_NEXTHOP     RTNLGRP_NEXTHOP
    RTNLGRP_BRVLAN,
#define RTNLGRP_BRVLAN      RTNLGRP_BRVLAN

相应的iproute2-5.9.0中,实现了函数rtnl_add_nl_group来设置新增加的组,这两个新增组使用netlink的bind接口不能下发。

int do_ipmonitor(int argc, char **argv)
{
    ...
    if (rtnl_open(&rth, groups) < 0)
        exit(1);

    if (lnexthop && rtnl_add_nl_group(&rth, RTNLGRP_NEXTHOP) < 0) {
        fprintf(stderr, "Failed to add nexthop group to list\n");
        exit(1);
    }
	
/*
 * Subscribe the handle's netlink socket to one multicast group via
 * setsockopt(SOL_NETLINK, NETLINK_ADD_MEMBERSHIP).
 *
 * rth:   handle whose socket descriptor (rth->fd) is modified.
 * group: group number, passed as-is (a number, not a bit in a mask).
 *
 * Returns whatever setsockopt() returns.
 */
int rtnl_add_nl_group(struct rtnl_handle *rth, unsigned int group)
{
    return setsockopt(rth->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
              &group, sizeof(group));
}

内核版本 5.0

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页