IP 层收发报文简要剖析3--ip输入报文分片重组
Posted codestack
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了IP 层收发报文简要剖析3--ip输入报文分片重组相关的知识,希望对你有一定的参考价值。
在ip_local_deliver中,如果检测到收到的是分片包,则需要先对报文进行重组。一个数据报的所有分片都重新组合完成之后才能提交给上层协议;每一个正在重组的数据报文用一个ipq结构实例来表示
struct ipq { struct inet_frag_queue q; u32 user;//分片来源 __be32 saddr;//原地址 __be32 daddr;//目的地址 __be16 id;//ip报文序列号 u8 protocol;//上层协议号 //这四个字段来自ip首部是为了确定来自哪个ip数据报文 u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; int vif; /* L3 master device index */ unsigned int rid;//已收到的分片计数器 struct inet_peer *peer;//记录发送方信息 //通过rid peer 可以防止Dos攻击 };
网络空间分段管理结构
struct inet_frags { struct inet_frag_bucket hash[INETFRAGS_HASHSZ];//哈希队列 struct work_struct frags_work;//工作队列 unsigned int next_bucket; unsigned long last_rebuild_jiffies; bool rebuild; /* The first call to hashfn is responsible to initialize * rnd. This is best done with net_get_random_once. * * rnd_seqlock is used to let hash insertion detect * when it needs to re-lookup the hash chain to use. */ u32 rnd;//随机数 seqlock_t rnd_seqlock;// int qsize;//队列长度 unsigned int (*hashfn)(const struct inet_frag_queue *); bool (*match)(const struct inet_frag_queue *q, const void *arg);//分段队列匹配函数 void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(unsigned long data);//队列过期处理函数 struct kmem_cache *frags_cachep; const char *frags_cache_name; };
/* Per-network-namespace fragment accounting and tunables. */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 *  mem.count must not share cacheline with other writers
	 */
	struct percpu_counter   mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int			timeout;	/* queue lifetime; added to jiffies
						 * when the queue timer is armed
						 */
	int			high_thresh;	/* upper memory limit: above it no
						 * new queue is allocated
						 */
	int			low_thresh;	/* lower memory limit: above it the
						 * eviction worker is scheduled
						 */
	int			max_dist;	/* when non-zero, a peer entry is
						 * looked up for every new queue
						 */
};
/** * struct inet_frag_queue - fragment queue * * @lock: spinlock protecting the queue * @timer: queue expiration timer * @list: hash bucket list * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) */ struct inet_frag_queue {//inet分段队列头 spinlock_t lock;smp环境下 需要 struct timer_list timer;队列定时器,组装非常耗时,不能无休止的等待分片的到达 struct hlist_node list;哈希节点,链入inet分段管理结构的哈希队列 atomic_t refcnt;计数器 struct sk_buff *fragments;分段数据包队列 struct sk_buff *fragments_tail; ktime_t stamp;时间戳 int len;数据包结束位置offset+len int meat;与原数据长度的差距,如果和原数据包长度一样代表接收完成 __u8 flags; u16 max_size; struct netns_frags *net;指向网络空寂分段管理结构 struct hlist_node list_evictor; };
1.1、 IP分组的初始化
void __init ipfrag_init(void) { ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops);//向内核注册ipv4分段管理函数 ip4_frags.hashfn = ip4_hashfn;//设置计算hash的函数 //设置初始化ip 分段队列的构造函数 ip4_frags.constructor = ip4_frag_init; //析构函数 ip4_frags.destructor = ip4_frag_free; //队列机构长度 ip4_frags.qsize = sizeof(struct ipq); //对比ip分段队列hook ip4_frags.match = ip4_frag_match; //设置分段队列过期处理函数 ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); } int inet_frags_init(struct inet_frags *f) { int i; //初始化工作队列 INIT_WORK(&f->frags_work, inet_frag_worker); for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i];//初始化hash 队列头 spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } seqlock_init(&f->rnd_seqlock); f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) return -ENOMEM; return 0; } EXPORT_SYMBOL(inet_frags_init);
int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); /* 分片重组 */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } /* 经过LOCAL_IN钩子点 */ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); }
1.2、 ip分片报文重组的处理
/*
 * Process an incoming IP datagram fragment.
 *
 * Looks up (or creates) the reassembly queue for the fragment and queues
 * the fragment on it.  Returns whatever ip_frag_queue() returns when a queue
 * is available; otherwise the skb is freed and -ENOMEM is returned.
 */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
	int vif = l3mdev_master_ifindex_rcu(dev);
	struct ipq *qp;

	/* Count the reassembly request. */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header */
	qp = ip_find(net, ip_hdr(skb), user, vif);
	if (qp) {
		/* A queue is available. */
		int ret;

		spin_lock(&qp->q.lock);

		/* Add the fragment to the datagram being reassembled. */
		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	/* No queue could be found or created: count the failure and drop. */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
EXPORT_SYMBOL(ip_defrag);
1.2.2 ip_find 根据ip首部以及user标志 在ipq散列表中查找对应的ipq。
/* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. enum ip_defrag_users { IP_DEFRAG_LOCAL_DELIVER, IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP_DEFRAG_CONNTRACK_OUT, __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD, IP_DEFRAG_AF_PACKET, IP_DEFRAG_MACVLAN, }; */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; unsigned int hash; /* 记录ip头和输入信息 */ arg.iph = iph; arg.user = user; arg.vif = vif; /* 通过id,源地址,目的地址,协议计算hash */ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); /* 根据hash值查找或创建队列 */ q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; } return container_of(q, struct ipq, q); } struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; int depth = 0; /* 分片内存已经超过了低限 */ if (frag_mem_limit(nf) > nf->low_thresh) /* 进行节点回收 */ inet_frag_schedule_worker(f); //工作队列回调函数为inet_frag_worker hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; /* 找到hash桶 */ spin_lock(&hb->chain_lock); hlist_for_each_entry(q, &hb->chain, list) { /* 遍历链表 */ if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); /* 增加引用计数 */ spin_unlock(&hb->chain_lock); return q; } depth++;/* 记录查找深度 */ } spin_unlock(&hb->chain_lock); /* 未找到 */ /* 桶节点的链表深度不超过限定 */ if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key);/* 创建节点返回 */ if (inet_frag_may_rebuild(f)) { /* 如果已经超过了重建间隔时间,则重建 */ if (!f->rebuild) f->rebuild = true; inet_frag_schedule_worker(f); } return 
ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find);
如果查找不到则会创建一个ipq 并将其插入链表中
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; q = inet_frag_alloc(nf, f, arg);//分配队列头结构空间 if (!q) return NULL; return inet_frag_intern(nf, q, f, arg); } static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; if (frag_mem_limit(nf) > nf->high_thresh) {//内存超过警戒线 回收内存 inet_frag_schedule_worker(f); return NULL; } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; q->net = nf;//记录下网络空间的分段管理结构指针 f->constructor(q, arg);//之前初始化时,构造函数来初始化-ip4_frag_init add_frag_mem_limit(nf, f->qsize);//sum 网络空间的分段内存 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);//定时器initand run spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); return q; } static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q);//获取分段队列指针 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); const struct ip4_create_arg *arg = a;//ipv4的分段信息指针 qp->protocol = arg->iph->protocol;//IP层头部协议 qp->id = arg->iph->id;//ip层id qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; //记录对方信息 qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could have been created on other cpu before * we acquired hash bucket lock. 
*/ hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); qp_in->flags |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; } } #endif qp = qp_in; if (!mod_timer(&qp->timer, jiffies + nf->timeout)) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt);//链入inet分段管理结构的hash队列 hlist_add_head(&qp->list, &hb->chain); spin_unlock(&hb->chain_lock); return qp; }
1.2.3 将分片数据包加入重组队列
/* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; u8 ecn; if (qp->q.flags & INET_FRAG_COMPLETE) //分段队列接收完成 则释放此分片返回 goto err; /*数据包没有分段标志or 分段队列间隔过大 //重现调整分段队列是否出错 如果不是本地生成的分片,则调用ip_frag_too_far 检测 是否存在 dos攻击,存在攻击则调用邋ip_frag_reinit释放 所用分片 */ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { ipq_kill(qp);//将ipq从散列表中移除停止定时器 计数器减一 // 调用ipq_unlink 设置ipq为complete状态,只有complete状态才能释放 goto err; } ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ ihl = ip_hdrlen(skb); /* 获取ip首部中的数据标志位 片的偏移 首部长度 */ /* Determine the position of this fragment. */ end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /**/ /* Is this the final fragment? 如果是最后一个片则先对分片进行检测 */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. 结束位置小于前一个位置,ipq已经有 last_in 标志且分片末尾不等于原始数据长度 */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; /*通过校验并设置为last_in标志,存储完整的数据长度*/ } else { if (end&7) {//按8字节对其 end &= ~7; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } if (end > qp->q.len) { /* 结束地址大于前一个分段数据地址 Some bits beyond end -> corruption. 
如果设置了最后一个分段数据标志 表示最后一个包,则错误*/ if (qp->q.flags & INET_FRAG_LAST_IN) goto err; qp->q.len = end;//记录当前分段数据块的结束位置 } } if (end == offset)//等于起始位置 即分片区数据长度为0 goto err; err = -ENOMEM;//去掉ip首部 if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto err; //skb 数据长度为end-offset ip 有效载荷长度 err = pskb_trim_rcsum(skb, end - offset); if (err) goto err; /* Find out which fragments are in front and at the back of us * in the chain of fragments so far. We must know where to put * this fragment, right? */ prev = qp->q.fragments_tail; if (!prev || FRAG_CB(prev)->offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { if (FRAG_CB(next)->offset >= offset) break; /* bingo! */ prev = next; }/*确定分片在链表中的位置,分片到达的时间顺序不同 ipq 上的分片按照分片偏移值大小排序 */ found: /* We found where to put this one. Check for overlap with * preceding fragment, and, if needed, align things so that * any overlaps are eliminated. 检验和和上一个分片数据是否有重叠 */ if (prev) { int i = (FRAG_CB(prev)->offset + prev->len) - offset; if (i > 0) {//有重叠 调用pskb_pull 消除重叠 offset += i; err = -EINVAL; if (end <= offset) goto err; err = -ENOMEM; if (!pskb_pull(skb, i)) goto err; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } } err = -ENOMEM; /*如果和后面一个分片的数据有重叠, 部分重叠还是完全重叠; 重叠部分数据超过下一个分片的数据长度,咋释放 下发一个分片并在检查与后面第二个分片的数据是否 有重叠,如果没有超过下一个则调整下一个分片。 如此反复直到对所有分片都检测完。 调整片的偏移以及分片总长度 */ while (next && FRAG_CB(next)->offset < end) { int i = end - FRAG_CB(next)->offset; /* overlap is ‘i‘ bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment * and leave the loop. The next ones cannot overlap. */ if (!pskb_pull(next, i)) goto err; FRAG_CB(next)->offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; break; } else { struct sk_buff *free_it = next; /* Old fragment is completely overridden with * new one drop it. 
*/ next = next->next; if (prev) prev->next = next; else qp->q.fragments = next; qp->q.meat -= free_it->len; sub_frag_mem_limit(qp->q.net, free_it->truesize); kfree_skb(free_it); } } FRAG_CB(skb)->offset = offset;//当前片的偏移 /* Insert this fragment in the chain of fragments. 当前的片插入到ipq队列中相应的位置*/ skb->next = next; if (!next) qp->q.fragments_tail = skb; if (prev) prev->next = skb; else qp->q.fragments = skb; dev = skb->dev; if (dev) { qp->iif = dev->ifindex; skb->dev = NULL; } qp->q.stamp = skb->tstamp;//更新时间搓 qp->q.meat += skb->len;//sum ipq已收到分片的总长度 qp->ecn |= ecn; //分片组装模块的所占内存的总长度 add_frag_mem_limit(qp->q.net, skb->truesize); if (offset == 0)//为第一个片 设置标志 qp->q.flags |= INET_FRAG_FIRST_IN; fragsize = skb->len + ihl; if (fragsize > qp->q.max_size) qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && fragsize > qp->max_df_size) qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) {//所有报文都到齐则重组 unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, prev, dev); skb->_skb_refdst = orefdst; return err; } skb_dst_drop(skb); return -EINPROGRESS; err: kfree_skb(skb); return err; }
ip_frag_reasm 重组报文;
* Build a new IP datagram from all its fragments. */ /* *用于组装已到齐的所有分片,当原始 * 数据包的所有分片都已到齐时,会调用此函 * 数组装分片。 */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; struct sk_buff *fp, *head = qp->q.fragments; int len; int ihlen; int err; u8 ecn; /* * 要开始组装了,因此调用ipq_kill()将此ipq结点从 * ipq散列表删除,并删除定时器。 */ ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ if (prev) { head = prev->next; fp = skb_clone(head, GFP_ATOMIC); if (!fp) goto out_nomem; fp->next = head->next; if (!fp->next) qp->q.fragments_tail = fp; prev->next = fp; skb_morph(head, qp->q.fragments); head->next = qp->q.fragments->next; consume_skb(qp->q.fragments); qp->q.fragments = head; } WARN_ON(!head); WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. 计算原始报文的长度 超过64 KB*/ ihlen = ip_hdrlen(head); len = ihlen + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; /* Head of list must not be cloned. * 在组装分片时,所有的分片都会组装到第一个分片 * 上,因此第一个分片是不能克隆的,如果是克隆的, * 则需为分片组装重新分配一个SKB。 */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ /* * 分片队列的第一个SKB不能既带有数据,又带有分片,即其 * frag_list上不能有分片skb,如果有则重新分配一个SKB。最终的 * 效果是,head自身不包括数据,其frag_list上链接着所有分片的 * SKB。这也是SKB的一种表现形式,不一定是一个连续的数据块, * 但最终会调用skb_linearize()将这些数据都复制到一个连续的数据 * 块中。 */ if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); } /* * 把所有分片组装起来即将分片链接到第一个 * SKB的frag_list上,同时还需要遍历所有分片, * 重新计算IP数据包长度以及校验和等。 */ skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } /* * 重置首部长度、片偏移、标志位和总长度。 */ sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. 
*/ if (qp->max_df_size == qp->q.max_size) { IPCB(head)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); /* * 既然各分片都已处理完,释放ipq的分片队列。 */ qp->q.fragments = NULL; qp->q.fragments_tail = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; }
1.4.4 ipq散列表重建
static void inet_frag_secret_rebuild(struct inet_frags *f) { int i; write_seqlock_bh(&f->rnd_seqlock);//顺序锁 if (!inet_frag_may_rebuild(f)) goto out; /* 获取新的用于计算hash的随机值 */ get_random_bytes(&f->rnd, sizeof(u32)); for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; struct hlist_node *n; hb = &f->hash[i]; /* 取的桶节点 */ spin_lock(&hb->chain_lock); hlist_for_each_entry_safe(q, n, &hb->chain, list) { unsigned int hval = inet_frag_hashfn(f, q); if (hval != i) {/* 节点不属于当前桶 */ struct inet_frag_bucket *hb_dest; hlist_del(&q->list); /* 从当前桶中删除该节点 */ /* Relink to new hash chain. */ hb_dest = &f->hash[hval]; /* 找到目标桶 */ /* This is the only place where we take * another chain_lock while already holding * one. As this will not run concurrently, * we cannot deadlock on hb_dest lock below, if its * already locked it will be released soon since * other caller cannot be waiting for hb lock * that we‘ve taken above. */ spin_lock_nested(&hb_dest->chain_lock, SINGLE_DEPTH_NESTING);/* 节点加入目标桶的链表中 */ hlist_add_head(&q->list, &hb_dest->chain); spin_unlock(&hb_dest->chain_lock); } } spin_unlock(&hb->chain_lock); } /* 设置重建标记和重建时间 */ f->rebuild = false; f->last_rebuild_jiffies = jiffies; out: write_sequnlock_bh(&f->rnd_seqlock); }
1.4.5 超时IP分片的清除
会定时清除在规定时间内没有完成重组的ipq及其所有的分片
/* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ static void ip_expire(unsigned long arg) { struct ipq *qp; struct net *net; qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); //ipq 已经是complete状态不处理 直接释放ipq以及其所有的分片 if (qp->q.flags & INET_FRAG_COMPLETE) goto out; ipq_kill(qp);//将其从散列表移除 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);//数据统计 if (!inet_frag_evicting(&qp->q)) {//在回收队列中 struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) goto out; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); if (err) goto out_rcu_unlock; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (frag_expire_skip_icmp(qp->user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. 发送ICMP 报文*/ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); out_rcu_unlock: rcu_read_unlock(); } out: spin_unlock(&qp->q.lock); ipq_put(qp); }
1.4.6 节点回收工作队列
为了控制ip组装所占用的内存,设置了两个阈值low_thresh 、high_thresh 当前ipq散列表所占用的内存存储在 mem变量中,这些全局变量存在如下结构中(netns_frags)
/* Per-network-namespace fragment memory accounting and sysctl tunables. */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 *  mem.count must not share cacheline with other writers
	 */
	struct percpu_counter   mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int			timeout;	/* added to jiffies when a queue's
						 * timer is armed
						 */
	int			high_thresh;	/* above this no new queue is
						 * allocated
						 */
	int			low_thresh;	/* above this the eviction worker
						 * is scheduled
						 */
	int			max_dist;	/* when non-zero, a peer entry is
						 * looked up for each new queue
						 */
};
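当mem大于high_thresh时,需要对散列表进行清理,直到mem值降低到low_thresh。从上面的代码可以看到:inet_frag_find在mem超过low_thresh时就会调度回收工作队列,而inet_frag_alloc在mem超过high_thresh时会拒绝创建新的ipq。这两个值可以通过proc修改(/proc/sys/net/ipv4/ipfrag_high_thresh 和 ipfrag_low_thresh)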
static unsigned int inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) { struct inet_frag_queue *fq; struct hlist_node *n; unsigned int evicted = 0; HLIST_HEAD(expired); spin_lock(&hb->chain_lock); /* 遍历桶下的链表 */ hlist_for_each_entry_safe(fq, n, &hb->chain, list) { if (!inet_fragq_should_evict(fq))/* 未超过限定,无需回收 */ continue; if (!del_timer(&fq->timer)) /* 定时器无法删除 */ continue; /* 能够回收的节点加入到临时hash */ hlist_add_head(&fq->list_evictor, &expired); ++evicted; } spin_unlock(&hb->chain_lock); /* 依次调用回收函数进行回收 */ hlist_for_each_entry_safe(fq, n, &expired, list_evictor) f->frag_expire((unsigned long) fq); return evicted; } static void inet_frag_worker(struct work_struct *work) { /* 本次回收的桶节点数 */ unsigned int budget = INETFRAGS_EVICT_BUCKETS; unsigned int i, evicted = 0; struct inet_frags *f; f = container_of(work, struct inet_frags, frags_work); BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); local_bh_disable(); /* 从上次回收完的下一个节点开始,进行回收 */ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); /* 回收并统计回收数量 */ i = (i + 1) & (INETFRAGS_HASHSZ - 1); /* 回收节点数超过最大值,停止 */ if (evicted > INETFRAGS_EVICT_MAX) break; } f->next_bucket = i; /* 记录下次需要开始回收的桶节点 */ local_bh_enable(); /* 如果需要重建,则重建 */ if (f->rebuild && inet_frag_may_rebuild(f)) inet_frag_secret_rebuild(f); }
以上是关于IP 层收发报文简要剖析3--ip输入报文分片重组的主要内容,如果未能解决你的问题,请参考以下文章
IP 层收发报文简要剖析2--ip报文的输入ip_local_deliver