这里主要说明与NAT/Masq转发模式相关的ICMP报文处理,但也会提及由于出错引发的、由IPVS系统主动发送的ICMP报文。
ICMP由内到外处理流程入口
入口函数ip_vs_out实质上挂载在netfilter的3个hook点上,分别为:NF_INET_FORWARD、NF_INET_LOCAL_IN和NF_INET_LOCAL_OUT。第一个hook点作用于转发的报文;NF_INET_LOCAL_IN作用于发往本机的报文,NF_INET_LOCAL_OUT作用于本机发出的报文。此函数用于处理IPVS由内到外发出的报文,包括ICMP报文。如果协议号为IPPROTO_ICMP或IPPROTO_ICMPV6,则分别使用函数ip_vs_out_icmp、ip_vs_out_icmp_v6进行处理。
static unsigned int ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
struct sock *sk;
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
if (related)
return verdict;
}
否则(即报文不是与IPVS连接相关联的ICMP报文),以下代码处理其它非关联的ICMP报文和其它的IP报文。需要注意的是,对于找不到IPVS连接的报文,IPVS系统会先尝试将其视为由真实服务器所主动发送的报文(目前仅支持UDP协议,用于SIP协议PE引擎,此情况下真实服务器可主动发起RTP数据报文)。如果仍然找不到或无法创建连接,IPVS系统将视配置发送ICMP错误报文。
pd = ip_vs_proto_data_get(ipvs, iph.protocol);
pp = pd->pp;
cp = pp->conn_out_get(ipvs, af, skb, &iph);
if (likely(cp)) {
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
goto ignore_cp;
return handle_response(af, skb, pd, cp, &iph, hooknum);
}
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
/* Currently only for UDP:
* connection oriented protocols typically use ephemeral ports for outgoing connections, so
* related incoming responses would not match any VS
*/
if (pp->protocol == IPPROTO_UDP) {
cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
}
}
如下所示,PROC文件/proc/sys/net/ipv4/vs/nat_icmp_send可用于控制在此种错误情况下,是否发送ICMP报文。IPVS默认的nat_icmp_send值为0,不发送ICMP。ICMP报文由函数icmp_send或者icmpv6_send发送,类型为ICMP_DEST_UNREACH,代码为ICMP_PORT_UNREACH。
if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) {
__be16 _ports[2], *pptr;
pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) {
/*
* Notify the real server: there is no existing entry if it is not RST packet or not TCP packet.
*/
if ((iph.protocol != IPPROTO_TCP && iph.protocol != IPPROTO_SCTP)
|| ((iph.protocol == IPPROTO_TCP && !is_tcp_reset(skb, iph.len))
|| (iph.protocol == IPPROTO_SCTP && !is_sctp_abort(skb, iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (!skb->dev) skb->dev = ipvs->net->loopback_dev;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
#endif
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
return NF_DROP;
IPVS由内到外的ICMP处理
函数ip_vs_out_icmp目前仅处理三种类型的ICMP报文:ICMP_DEST_UNREACH、ICMP_SOURCE_QUENCH和ICMP_TIME_EXCEEDED。如果不是这三种类型,则将其标记为不相关联的ICMP报文(*related = 0),返回NF_ACCEPT结束处理。
static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum)
{
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_iphdr ciph;
*related = 1;
iph = ip_hdr(skb);
offset = ihl = iph->ihl * 4;
ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy things are checked first to speed up processing.... however
* this means that some packets will manage to get a long way down this stack and then be rejected, but that's life.
*/
if ((ic->type != ICMP_DEST_UNREACH) &&
(ic->type != ICMP_SOURCE_QUENCH) &&
(ic->type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
接下来,找到ICMP报文中内层的IP报文,根据其中的IP头部信息查找IPVS连接。如果找到,表明此ICMP报文是由之前客户端的请求报文所触发、由真实服务器回复的ICMP报文,就由函数handle_response_icmp处理。
/* Now find the contained IP header */
offset += sizeof(_icmph);
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
pp = ip_vs_proto_get(cih->protocol);
/* Is the embedded protocol header present? */
if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag))
return NF_ACCEPT;
ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
/* The embedded headers contain source and dest in reverse order */
cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph);
if (!cp)
return NF_ACCEPT;
snet.ip = iph->saddr;
return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, pp, ciph.len, ihl, hooknum);
NAT的ICMP响应处理
如下处理函数handle_response_icmp,首先需要注意的一点是其仅工作于NAT/Masq转发模式。对于四层协议IPPROTO_TCP、IPPROTO_UDP或者IPPROTO_SCTP,由于NAT将修改其头部开头的源端口和目的端口号,因此将这两个端口字段(共2 * sizeof(__u16)字节)也一并设置为可写状态。
static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl, unsigned int hooknum)
{
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
goto ignore_cp;
if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol)
offset += 2 * sizeof(__u16);
if (!skb_make_writable(skb, offset))
goto out;
接下来由函数ip_vs_nat_icmp处理报文中NAT相关字段的修改。既然IP报文被修改,随后需要进行重路由,参见函数ip_vs_route_me_harder。
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
ip_vs_nat_icmp_v6(skb, pp, cp, 1);
else
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
函数ip_vs_nat_icmp负责对ICMP报文进行NAT处理。由于当前的处理报文是由内部到外部,inout参数为1。修改报文的IP头部的源地址,和ICMP内部IP报文的目的IP地址(因为内部IP表示原方向报文),同时更新IP头部校验和。
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout)
{
struct iphdr *iph = ip_hdr(skb);
unsigned int icmp_offset = iph->ihl*4;
struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + icmp_offset);
struct iphdr *ciph = (struct iphdr *)(icmph + 1);
if (inout) {
iph->saddr = cp->vaddr.ip;
ip_send_check(iph);
ciph->daddr = cp->vaddr.ip;
ip_send_check(ciph);
} else {
iph->daddr = cp->daddr.ip;
ip_send_check(iph);
ciph->saddr = cp->daddr.ip;
ip_send_check(ciph);
}
随后,对于4层协议IPPROTO_TCP、IPPROTO_UDP和IPPROTO_SCTP,如果报文为由内到外,修改ICMP内部4层头中的目的端口号为cp->vport(还原为发送时客户端请求的端口号);反之(由外到内),则将其源端口号修改为cp->dport。
/* the TCP/UDP/SCTP port */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || IPPROTO_SCTP == ciph->protocol) {
__be16 *ports = (void *)ciph + ciph->ihl*4;
if (inout)
ports[1] = cp->vport;
else
ports[0] = cp->dport;
}
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
skb->ip_summed = CHECKSUM_UNNECESSARY;
内核版本 4.15