邻居表项的回收控制

内核中存在3个阈值控制邻居表项的回收:

  • gc_thresh1 表示最小可保留的表项数量,如果表项数量小于此值GC(Garbage collector)不进行回收操作,默认为128;
  • gc_thresh2 当表项数量超过此值时,GC将会回收超过5秒未更新的表项,默认为512;
  • gc_thresh3 最大可允许的非永久表项数量。如果系统拥有庞大的接口数量,或者直连了大量的设备,应增大此值。默认值为1024。

另外,gc_interval按arp(7)手册的说明是GC尝试运行的时间间隔,默认值为30秒钟;不过它在此版本内核中的实际作用笔者尚未确认(下文可见,周期回收work首次调度时使用的延时是reachable_time)。

对于IPv4,可通过以下PROC文件查看和修改以上4个值:

$ cat /proc/sys/net/ipv4/neigh/default/gc_interval 
30
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh1 
128
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh2
512
$ cat /proc/sys/net/ipv4/neigh/default/gc_thresh3
1024

在arp的全局邻居表变量arp_tbl中,初始化了这四个值。

struct neigh_table arp_tbl = {
    .family     = AF_INET,
    .key_len    = 4,
    .protocol   = cpu_to_be16(ETH_P_IP),
    .hash       = arp_hash,
    .key_eq     = arp_key_eq,
    .constructor    = arp_constructor,
    .proxy_redo = parp_redo,
    .id     = "arp_cache",
    .parms      = {
        ...
    },
    .gc_interval    = 30 * HZ,
    .gc_thresh1 = 128,
    .gc_thresh2 = 512,
    .gc_thresh3 = 1024,

gc_thresh1阈值

在邻居表初始化函数neigh_table_init中,初始化一个延迟work定期进行回收处理,处理函数为neigh_periodic_work。

void neigh_table_init(int index, struct neigh_table *tbl)
{
    ...
    INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
    queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
            tbl->parms.reachable_time);
			
	tbl->last_flush = now;

首先,在处理函数neigh_periodic_work中,如果邻居表中的表项数量entries小于gc_thresh1阈值,不进行回收处理,结束执行。

static void neigh_periodic_work(struct work_struct *work)
{
    struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
    struct neighbour *n;
    struct neighbour __rcu **np;
    struct neigh_hash_table *nht;

    NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);

    write_lock_bh(&tbl->lock);
    nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));

    ...

    if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
        goto out;

否则,遍历邻居表的hash桶,即每个桶中的表项链表,如果表项的状态标志位设置了NUD_PERMANENT或者NUD_IN_TIMER位,或者此表项是由外部模块添加的,不执行回收操作。

对于状态位为NUD_PERMANENT的表项,可以是接口自身的IP地址与MAC的表项,或者用户通过ip neigh命令所添加。对于状态位NUD_IN_TIMER,表明此表项还在表项自身的定时器处理控制中,暂不需回收处理。对于表项标志位NTF_EXT_LEARNED,表明此表项为外部的VXLAN或者Switchdev等模块所添加,由这些模块自行删除。

    for (i = 0 ; i < (1 << nht->hash_shift); i++) {
        np = &nht->hash_buckets[i];

        while ((n = rcu_dereference_protected(*np,
                lockdep_is_held(&tbl->lock))) != NULL) {
            unsigned int state;

            write_lock(&n->lock);

            state = n->nud_state;
            if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
                (n->flags & NTF_EXT_LEARNED)) {
                write_unlock(&n->lock);
                goto next_elt;
            }

当以上条件不满足时,先行更新一下表项的使用时间,如果表项的引用计数为1,并且状态位等于NUD_FAILED(注意,这里表明仅此一个状态位),或者表项已经超过gc_staletime定义的时长没有使用过了,进行回收处理。

            if (time_before(n->used, n->confirmed))
                n->used = n->confirmed;

            if (refcount_read(&n->refcnt) == 1 &&
                (state == NUD_FAILED ||
                 time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
                *np = n->next;
                neigh_mark_dead(n);
                write_unlock(&n->lock);
                neigh_cleanup_and_release(n);
                continue;
            }
            write_unlock(&n->lock);

next_elt:
            np = &n->next;
        }
        /* It's fine to release lock here, even if hash table grows while we are preempted.
         */
        write_unlock_bh(&tbl->lock);
        cond_resched();
        write_lock_bh(&tbl->lock);
        nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock));
    }

gc_thresh2/gc_thresh3阈值

如下邻居表强制回收函数neigh_forced_gc,强制清理邻居表的gc_list链表中的表项,最大清理数量为gc_entries中超出gc_thresh2定义的数量的部分,被回收的表项需要满足两个条件:1)引用计数为1;2)状态等于NUD_FAILED或者已超过5秒没有更新了。last_flush记录此次强制回收的时间戳。

static int neigh_forced_gc(struct neigh_table *tbl)
{
    int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
    unsigned long tref = jiffies - 5 * HZ;
    struct neighbour *n, *tmp;

    NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);

    write_lock_bh(&tbl->lock);

    list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
        if (refcount_read(&n->refcnt) == 1) {
            bool remove = false;

            write_lock(&n->lock);
            if ((n->nud_state == NUD_FAILED) ||
                time_after(tref, n->updated))
                remove = true;
            write_unlock(&n->lock);

            if (remove && neigh_remove_one(n, tbl))
                shrunk++;
            if (shrunk >= max_clean)
                break;
        }
    }

    tbl->last_flush = jiffies;

以上强制回收函数在neigh_alloc中调用,当gc_entries数量大于等于gc_thresh3阈值时,进行强制回收;或者gc_entries大于等于gc_thresh2,并且距离上一次强制回收超过了5秒钟的时长,也进行强制回收。

如果强制回收函数neigh_forced_gc未能成功回收任何表项,并且gc_entries大于等于gc_thresh3阈值,打印警告信息。

static struct neighbour *neigh_alloc(struct neigh_table *tbl,
                     struct net_device *dev, bool exempt_from_gc)
{
    struct neighbour *n = NULL;
    unsigned long now = jiffies;

    if (exempt_from_gc)
        goto do_alloc;

    entries = atomic_inc_return(&tbl->gc_entries) - 1;
    if (entries >= tbl->gc_thresh3 ||
        (entries >= tbl->gc_thresh2 &&
         time_after(now, tbl->last_flush + 5 * HZ))) {
        if (!neigh_forced_gc(tbl) &&
            entries >= tbl->gc_thresh3) {
            net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id);
            NEIGH_CACHE_STAT_INC(tbl, table_fulls);
            goto out_entries;
        }
    }

如下可见,此函数中将初始化邻居表项自身的gc_list,并且在函数开头处增加了gc_entries计数。

do_alloc:
    n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
    if (!n)
        goto out_entries;
    ...
    n->dead       = 1;
    INIT_LIST_HEAD(&n->gc_list);

    atomic_inc(&tbl->entries);
out:
    return n;

out_entries:
    if (!exempt_from_gc)
        atomic_dec(&tbl->gc_entries);

以下函数neigh_add处理应用层面的表项添加,如果用户设置了状态位NUD_PERMANENT,或者标志位NTF_EXT_LEARNED,在邻居表项分配函数中将跳过以上描述的回收检查。

static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
             struct netlink_ext_ack *extack)
{
    ...
    neigh = neigh_lookup(tbl, dst, dev);
    if (neigh == NULL) {
        bool exempt_from_gc;

        if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
            err = -ENOENT;
            goto out;
        }

        exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
                 ndm->ndm_flags & NTF_EXT_LEARNED;
        neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);

在内核自身使用的表项创建函数中,如果exempt_from_gc为false,将表项链接到邻居表的gc_list中。在内核函数__neigh_create中,将exempt_from_gc固定为false,所以,内核创建的表项初始时都在gc_list链表上。

static struct neighbour *___neigh_create(struct neigh_table *tbl,
                     const void *pkey,
                     struct net_device *dev,
                     bool exempt_from_gc, bool want_ref)
{
    struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);

    n->dead = 0;
    if (!exempt_from_gc)
        list_add_tail(&n->gc_list, &n->tbl->gc_list);

gc_list更新

在邻居表项更新函数__neigh_update中,如果表项的状态位NUD_PERMANENT发生变化,或者外部属性发生变化,就需要更新邻居表的gc_list。

static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
              u8 new, u32 flags, u32 nlmsg_pid,
              struct netlink_ext_ack *extack)
{
    ...
    if (((new ^ old) & NUD_PERMANENT) || ext_learn_change)
        neigh_update_gc_list(neigh);
	 
    if (notify)
        neigh_update_notify(neigh, nlmsg_pid);

如下neigh_update_gc_list函数,如果表项设置了状态位NUD_PERMANENT,或者设置了外部标志位NTF_EXT_LEARNED,将其由gc_list中移除,表明不能进行回收。否则,将其添加到gc_list链表的末尾,由于回收操作由链表头部开始,更新过的表项最后进行回收处理。

static void neigh_update_gc_list(struct neighbour *n)
{
    bool on_gc_list, exempt_from_gc;

    write_lock_bh(&n->tbl->lock);
    write_lock(&n->lock);

    /* remove from the gc list if new state is permanent or if neighbor
     * is externally learned; otherwise entry should be on the gc list
     */
    exempt_from_gc = n->nud_state & NUD_PERMANENT ||
             n->flags & NTF_EXT_LEARNED;
    on_gc_list = !list_empty(&n->gc_list);

    if (exempt_from_gc && on_gc_list) {
        list_del_init(&n->gc_list);
        atomic_dec(&n->tbl->gc_entries);
    } else if (!exempt_from_gc && !on_gc_list) {
        /* add entries to the tail; cleaning removes from the front */
        list_add_tail(&n->gc_list, &n->tbl->gc_list);
        atomic_inc(&n->tbl->gc_entries);
    }

函数neigh_mark_dead负责将表项由链表gc_list中移除。当执行清理操作时,将使用到此函数;另外,在应用层使用ip命令删除指定表项时,也使用到此函数。

/*
 * Mark a neighbour entry as dead and detach it from the GC list.
 *
 * Sets n->dead to 1. If the entry's gc_list node is currently linked
 * (list_empty() is false), the node is unlinked and re-initialised, and
 * the owning table's gc_entries counter is decremented to match.
 * Entries that are not on a gc_list only get the dead flag set.
 *
 * NOTE(review): no locking is taken here; callers presumably hold the
 * table lock and the entry lock -- confirm against the call sites.
 */
static void neigh_mark_dead(struct neighbour *n)
{
    n->dead = 1;
    if (!list_empty(&n->gc_list)) {
        /* unlink and decrement together so gc_entries follows the removal */
        list_del_init(&n->gc_list);
        atomic_dec(&n->tbl->gc_entries);
    }
}

内核版本 5.0

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页