1、TSO(TCP Segmentation Offload,TCP分段卸载)是针对tcp而言的,是指协议栈可以将tcp 分段的操作offload到硬件的能力,本身需要硬件的支持。当网卡具有TSO能力时,上层协议栈可以直接下发一个超过MTU数据包,而把数据包拆分的动作交给硬件去做,节省cpu资源。除了TSO,内核还有一个GSO,GSO不区分协议类型,GSO默认是开启的,GSO是在软件上实现的一种延迟分段的技术,相比TSO,GSO最终还是需要协议栈自己完成分段的处理。
即使网卡没有TSO能力,传输层依然可以封装一个超过MTU的数据包,等数据包发送给驱动之前,检查网卡是否有TSO能力,如果没有,再调用ip层和传输层的分段处理函数完成数据包的分段处理,通过这样,内核将数据包的分段延迟到了dev链路层,提升数据包处理效率。当支持GSO/TSO时,skb的数据存放格式如下所示,在skb->end后,存在一个skb_shared_info区域,skb的非线性区数据就存放在这里,GSO/TSO分段的处理就是要把skb数据(包括线性区、非线性区)按gso_size的大小进行分割处理;本文以虚拟网卡为例,介绍TSO的整体流程。

2、驱动初始化过程
virtio驱动加载时,会根据qemu/vhost前后端feature协商的结果判断虚拟网卡是否有TSO能力,如果有,则在dev->hw_features或上NETIF_F_TSO标志,然后赋给dev->features。
/*
 * virtnet_probe (excerpt): during virtio-net driver probe, the feature
 * bits negotiated with the host (qemu/vhost backend) are examined.  If
 * the host can handle TSO for IPv4, NETIF_F_TSO is advertised in
 * dev->hw_features; since the "gso" module parameter defaults to true,
 * the TSO/UFO bits are then copied into dev->features so the protocol
 * stack actually uses them.
 */
static int virtnet_probe(struct virtio_device *vdev)
{
	/* Individual feature bits: what can host handle? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
		dev->hw_features |= NETIF_F_TSO;

	/* the "gso" module parameter defaults to true */
	if (gso)
		dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
}
3、注册虚拟网卡设备时,设置GSO能力。
virtnet_probe —> register_netdev —> register_netdevice
/*
 * register_netdevice (excerpt): when the net device is registered, the
 * software-emulated features (which include software GSO) are always
 * turned on, both in hw_features and in dev->features.
 */
int register_netdevice(struct net_device *dev)
{
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	/* dev->features is the feature set consulted by the protocol stack */
	dev->features |= NETIF_F_SOFT_FEATURES;
}
4、在发送端发起connect连接或三次握手建立完成(tcp_v4_syn_recv_sock),会开启GSO。
/*
 * tcp_v4_connect (excerpt): once the route has been committed to the
 * socket, the socket's GSO type is marked as TCPv4 and sk_setup_caps()
 * is called to derive the usable offload capabilities from the route's
 * output device.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}

	/* OK, now commit destination to socket. */
	/* record that segments from this socket are GSO type TCPv4 */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
}
tcp_v4_connect将sock的gso_type设置为tcpv4类型,然后调用sk_setup_caps,根据net_gso_ok返回值,判断是否支持GSO能力,正常这里是返回True。
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
	/*
	 * Check whether "features" covers the requested GSO type.
	 * Two main call sites:
	 * 1. TCP, at connect time or when the 3-way handshake completes:
	 *    there "features" also carries the software GSO bits (and GSO
	 *    is enabled by default), so this returns true.
	 * 2. The dev layer, just before handing the skb to the driver:
	 *    there "features" is dev->features; if the NIC lacks TSO the
	 *    TSO bit is absent and this returns false, triggering
	 *    software segmentation.
	 */
	netdev_features_t feature = gso_type & SKB_GSO1_MASK;

	feature <<= NETIF_F_GSO_SHIFT;

	if (gso_type & SKB_GSO2_MASK) {
		netdev_features_t f = gso_type & SKB_GSO2_MASK;

		f <<= NETIF_F_GSO2_SHIFT;
		feature |= f;
	}

	/* check flags correspondence */
	BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_IPIP != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));

	/* GSO2 flags, see netdev_features.h */
	BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO2_SHIFT));

	return (features & feature) == feature;
}
协议层校验支持gso后,会同时开启分散、聚合及csum校验能力。
/*
 * sk_setup_caps(): derive the socket's route capabilities from the
 * output device of @dst.  When GSO is usable, scatter/gather and
 * hardware checksum are enabled as well, because segmentation offload
 * requires both gathering page frags and recomputing per-segment
 * checksums.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		/* extra header room is needed (e.g. tunnels): disable GSO */
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			/*
			 * Enable scatter/gather and checksum offload on the
			 * skb path: hardware doing TSO must also be able to
			 * gather frags and recompute each segment's checksum.
			 */
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}
5、应用程序调用send发送数据包,send系统调用最终调用tcp_sendmsg,在tcp_sendmsg里判断是否支持GSO,支持的话将用户数据信息封装到skb的线性区或非线性区,封装完后的skb数据包就是一个大包了,然后调用tcp_push_one发送给IP层,当然发送之前还会调用check函数,根据csum的类型计算tcp层的csum,支持GSO、TSO情况下,tcp层只会计算伪头部的csum。
/*
 * tcp_sendmsg(): copy user data into skbs on the socket write queue and
 * push them to the IP layer.  With GSO enabled, size_goal is a multiple
 * of the MSS, so a single skb may hold far more than one MTU of data;
 * the split is deferred to the device layer (software GSO) or to the
 * NIC (TSO).  Linear space is filled first, then payload goes into the
 * shared-info page frags.
 */
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	/* get the MSS; with GSO, size_goal is a multiple of the
	 * negotiated MSS */
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	/* does the route support scatter/gather? */
	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {
			/* Skip bytes copied in SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			/* last skb on the write queue */
			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				/* copy = room left in this skb */
				copy = max - skb->len;
			}

			/* skb->len >= max: this skb is full, allocate
			 * a new one */
			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  /* linear head size */
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				/* csum caps were set in sk_setup_caps():
				 * CHECKSUM_PARTIAL means the stack only
				 * computes the IP header / pseudo-header
				 * checksum; the payload checksum is left
				 * to the hardware. */
				if (sk->sk_route_caps & NETIF_F_CSUM_MASK)
					skb->ip_summed = CHECKSUM_PARTIAL;

				/* queue the fresh skb on sk->sk_write_queue */
				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;

				/* All packets are restored as if they have
				 * already been sent. skb_mstamp isn't set to
				 * avoid wrong rtt estimation.
				 */
				if (tp->repair)
					TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
			}

			/* Try to append data to the end of skb. */
			/* never copy more than the user message length */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			/* is there still room in the linear area? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				/* copy user data into the linear area and
				 * update skb->len */
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				/* linear area exhausted: payload goes into
				 * the shared-info page frags */
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				/* make sure the current page frag has space
				 * (at least 32 bytes); otherwise allocate a
				 * fresh page into pfrag->page */
				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				/* if pfrag->page is still the last frag's
				 * page, we can merge into it; otherwise a new
				 * page was allocated above and must be added
				 * to skb_shinfo as a new frag (merge=false) */
				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				/* cap the copy at the page's remaining room */
				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				/* copy user data into pfrag->page, updating
				 * skb->len and skb->data_len (data_len counts
				 * only non-linear bytes) */
				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				if (merge) {
					/* merged: just grow the last frag */
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					/* new page: install it as frag i and
					 * bump nr_frags */
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					/* take a reference on the page */
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now,
					 TCP_NAGLE_PUSH, size_goal);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
out_nopush:
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
6、在tcp_write_xmit流程里,通过tcp_init_tso_segs设置gso_size及分段个数gso_segs,其中gso_size为mss值,这两个参数用于告诉硬件做tso拆分时,需要拆分成的数据包个数及长度。
/*
 * tcp_set_skb_tso_segs(): fill in the shared-info GSO parameters that
 * tell the hardware (or the software GSO path) how to split the skb:
 * gso_size is the per-segment payload length (the MSS) and gso_segs is
 * the number of segments the skb will yield.
 */
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Make sure we own this skb before messing gso_size/gso_segs */
	WARN_ON_ONCE(skb_cloned(skb));

	if (skb->len <= mss_now || !sk_can_gso(sk) ||
	    skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		shinfo->gso_segs = 1;
		shinfo->gso_size = 0;
		shinfo->gso_type = 0;
	} else {
		/* gso_segs = total length divided by the MSS, rounded up */
		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		/* gso_size = MSS: hardware cuts the payload into pieces of
		 * this size when performing TSO */
		shinfo->gso_size = mss_now;
		shinfo->gso_type = sk->sk_gso_type;
	}
}
7、dev层发送给驱动之前,进一步校验网卡是否具有TSO能力,如果没有,则回调tcp的分段函数完成skb的分段处理,如果支持,则直接发送给驱动;
/*
 * validate_xmit_skb(): last check before handing an skb to the driver.
 * If the device cannot segment this skb itself (no TSO), perform
 * software GSO here via skb_gso_segment(); otherwise only linearize
 * and/or finish checksumming as needed.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	/* "features" derives from dev->features: if the NIC has no TSO
	 * capability the TSO bit is absent, netif_needs_gso() returns
	 * true and the skb is segmented in software below */
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (skb_csum_hwoffload_help(skb, features))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}
8、如果需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum;
/*
 * inet_gso_segment(): IP-layer GSO callback.  Delegates the actual
 * split to the L4 offload handler (tcp_gso_segment for TCP), then
 * rebuilds and re-checksums the IP header of every resulting segment.
 */
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	bool udpfrag = false, fixedid = false, gso_partial, encap;
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	const struct net_offload *ops;
	unsigned int offset = 0;
	struct iphdr *iph;
	int proto, tot_len;
	int nhoff;
	int ihl;
	int id;

	/* record the IP header offset relative to skb->head */
	skb_reset_network_header(skb);
	/* offset of the IP header from the MAC header, i.e. the MAC
	 * header length */
	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
		goto out;

	iph = ip_hdr(skb);
	ihl = iph->ihl * 4;
	if (ihl < sizeof(*iph))
		goto out;

	id = ntohs(iph->id);
	proto = iph->protocol;

	/* Warning: after this point, iph might be no longer valid */
	if (unlikely(!pskb_may_pull(skb, ihl)))
		goto out;
	/* strip the IP header */
	__skb_pull(skb, ihl);

	encap = SKB_GSO_CB(skb)->encap_level > 0;
	if (encap)
		features &= skb->dev->hw_enc_features;
	SKB_GSO_CB(skb)->encap_level += ihl;

	/* record the transport header offset relative to skb->head */
	skb_reset_transport_header(skb);

	segs = ERR_PTR(-EPROTONOSUPPORT);

	if (!skb->encapsulation || encap) {
		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);

		/* fixed ID is invalid if DF bit is not set */
		if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
			goto out;
	}

	/* hand off to the L4 offload handler (TCP: tcp_gso_segment) */
	ops = rcu_dereference(inet_offloads[proto]);
	if (likely(ops && ops->callbacks.gso_segment))
		segs = ops->callbacks.gso_segment(skb, features);

	if (IS_ERR_OR_NULL(segs))
		goto out;

	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

	skb = segs;
	do {
		/* rebuild the IP header of each resulting segment */
		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
		if (udpfrag) {
			iph->frag_off = htons(offset >> 3);
			if (skb->next != NULL)
				iph->frag_off |= htons(IP_MF);
			offset += skb->len - nhoff - ihl;
			tot_len = skb->len - nhoff;
		} else if (skb_is_gso(skb)) {
			if (!fixedid) {
				iph->id = htons(id);
				id += skb_shinfo(skb)->gso_segs;
			}

			if (gso_partial)
				tot_len = skb_shinfo(skb)->gso_size +
					  SKB_GSO_CB(skb)->data_offset +
					  skb->head - (unsigned char *)iph;
			else
				tot_len = skb->len - nhoff;
		} else {
			if (!fixedid)
				iph->id = htons(id++);
			tot_len = skb->len - nhoff;
		}
		iph->tot_len = htons(tot_len);
		/* recompute the IP header checksum of each segment */
		ip_send_check(iph);
		if (encap)
			skb_reset_inner_headers(skb);
		skb->network_header = (u8 *)iph - skb->head;
	} while ((skb = skb->next));

out:
	return segs;
}
9、进入tcp层分段处理函数后,会调用tcp_gso_segment完成skb分段,分段完成后,重新为每个分段skb做tcp层的checksum,以及为每个分段skb重新分配seq序列号等;
/*
 * tcp_gso_segment(): TCP-layer GSO handler.  Calls skb_segment() to do
 * the actual split, then fixes up each resulting segment: sequence
 * number, FIN/PSH/CWR flags, TCP checksum, and skb ownership (so TCP
 * Small Queues backpressure keeps working).
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int sum_truesize = 0;
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;
	struct sk_buff *gso_skb = skb;
	__sum16 newcheck;
	bool ooo_okay, copy_destructor;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	copy_destructor = gso_skb->destructor == tcp_wfree;
	ooo_okay = gso_skb->ooo_okay;
	/* All segments but the first should have ooo_okay cleared */
	skb->ooo_okay = 0;

	/* the function that actually performs the split */
	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Only first segment might have ooo_okay set */
	segs->ooo_okay = ooo_okay;

	/* GSO partial and frag_list segmentation only requires splitting
	 * the frame into an MSS multiple and possibly a remainder, both
	 * cases return a GSO skb. So update the mss now.
	 */
	if (skb_is_gso(segs))
		mss *= skb_shinfo(segs)->gso_segs;

	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
					       (__force u32)delta));

	while (skb->next) {
		th->fin = th->psh = 0;
		th->check = newcheck;

		/* compute the TCP checksum of each segment */
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			gso_reset_checksum(skb, ~th->check);
		else
			th->check = gso_make_checksum(skb, ~th->check);

		/* every segment except possibly the last carries mss bytes,
		 * so the next segment's sequence number advances by mss */
		seq += mss;
		if (copy_destructor) {
			skb->destructor = gso_skb->destructor;
			skb->sk = gso_skb->sk;
			sum_truesize += skb->truesize;
		}
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	}

	/* Following permits TCP Small Queues to work well with GSO :
	 * The callback to TCP stack will be called at the time last frag
	 * is freed at TX completion, and not right now when gso_skb
	 * is freed by GSO engine
	 */
	if (copy_destructor) {
		swap(gso_skb->sk, skb->sk);
		swap(gso_skb->destructor, skb->destructor);
		sum_truesize += skb->truesize;
		atomic_add(sum_truesize - gso_skb->truesize,
			   &skb->sk->sk_wmem_alloc);
	}

	delta = htonl(oldlen + (skb_tail_pointer(skb) -
				skb_transport_header(skb)) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		gso_reset_checksum(skb, ~th->check);
	else
		th->check = gso_make_checksum(skb, ~th->check);
out:
	return segs;
}
可以看到tcp_gso_segment里真正去做skb分段处理的是在skb_segment,skb_segment里将tso的skb按mss长度进行分段处理,对线性区域的数据,直接拷贝到分段skb的线性区域;对于非线性区域数据则不做拷贝,而是把分段skb的frags指针指向原skb对应的frag页(同时增加page引用计数),以此避免数据搬移;
/*
 * skb_segment - perform the actual GSO split of a large skb.
 * @head_skb: buffer to segment; payload may live in the linear area,
 *            in page frags, and/or on the frag_list.
 * @features: features of the output path (SG, checksum, GSO_PARTIAL...)
 *
 * Cuts @head_skb into a chained list of segments carrying at most
 * gso_size payload bytes each.  Linear data is copied into the new
 * segments; page-frag data is shared by pointing each segment's frags[]
 * at the original pages (taking an extra page reference) rather than
 * copying.  Returns the segment list, or ERR_PTR() on failure.
 *
 * NOTE(review): the original article's listing was garbled around the
 * frag-walk loop (the "while (pos < offset + len) { if (i >= nfrags)"
 * header was missing); it has been restored from the upstream kernel
 * source, net/core/skbuff.c.
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
			    netdev_features_t features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	/* frag_list holds IP fragment skbs, set up in ip_do_fragment() */
	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
	/* frags holds the scatter/gather (non-linear) payload pages */
	skb_frag_t *frag = skb_shinfo(head_skb)->frags;
	unsigned int mss = skb_shinfo(head_skb)->gso_size;
	/* doffset = MAC + IP header bytes in front of the payload */
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	struct sk_buff *frag_skb = head_skb;
	unsigned int offset = doffset;
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
	unsigned int partial_segs = 0;
	unsigned int headroom;
	unsigned int len = head_skb->len;
	__be16 proto;
	bool csum, sg;
	int nfrags = skb_shinfo(head_skb)->nr_frags;
	int err = -ENOMEM;
	int i = 0;
	int pos;
	int dummy;

	/* expose the MAC/IP headers in front of head_skb's payload */
	__skb_push(head_skb, doffset);
	proto = skb_network_protocol(head_skb, &dummy);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);

	sg = !!(features & NETIF_F_SG);
	csum = !!can_checksum_protocol(features, proto);

	if (sg && csum && (mss != GSO_BY_FRAGS)) {
		if (!(features & NETIF_F_GSO_PARTIAL)) {
			struct sk_buff *iter;

			if (!list_skb ||
			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
				goto normal;

			/* Split the buffer at the frag_list pointer.
			 * This is based on the assumption that all
			 * buffers in the chain excluding the last
			 * containing the same amount of data.
			 */
			skb_walk_frags(head_skb, iter) {
				if (skb_headlen(iter))
					goto normal;

				len -= iter->len;
			}
		}

		/* GSO partial only requires that we trim off any excess that
		 * doesn't fit into an MSS sized block, so take care of that
		 * now.
		 */
		partial_segs = len / mss;
		if (partial_segs > 1)
			mss *= partial_segs;
		else
			partial_segs = 0;
	}

normal:
	headroom = skb_headroom(head_skb);
	/* linear-area length: skb->len - skb->data_len */
	pos = skb_headlen(head_skb);

	do {
		struct sk_buff *nskb;
		skb_frag_t *nskb_frag;
		int hsize;
		int size;

		if (unlikely(mss == GSO_BY_FRAGS)) {
			len = list_skb->len;
		} else {
			/* len = payload of the next segment, capped at mss;
			 * offset accumulates the bytes already segmented */
			len = head_skb->len - offset;
			if (len > mss)
				len = mss;
		}

		/* hsize = linear bytes still to consume.  Once the linear
		 * area is used up this goes negative; clamp to 0 so later
		 * segments allocate no linear payload space and take their
		 * data from the page frags instead. */
		hsize = skb_headlen(head_skb) - offset;
		if (hsize < 0)
			hsize = 0;
		if (hsize > len || !sg)
			hsize = len;

		/* build this segment out of an IP-fragment skb on the
		 * frag_list (clone instead of copy) */
		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			BUG_ON(skb_headlen(list_skb) > len);

			i = 0;
			nfrags = skb_shinfo(list_skb)->nr_frags;
			frag = skb_shinfo(list_skb)->frags;
			frag_skb = list_skb;
			pos += skb_headlen(list_skb);

			while (pos < offset + len) {
				BUG_ON(i >= nfrags);

				size = skb_frag_size(frag);
				if (pos + size > offset + len)
					break;

				i++;
				pos += size;
				frag++;
			}

			nskb = skb_clone(list_skb, GFP_ATOMIC);
			list_skb = list_skb->next;

			if (unlikely(!nskb))
				goto err;

			if (unlikely(pskb_trim(nskb, len))) {
				kfree_skb(nskb);
				goto err;
			}

			hsize = skb_end_offset(nskb);
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
		} else {
			/* fresh skb for linear and/or page-frag payload */
			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC,
					   skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		/* first segment becomes the list head; the rest are
		 * chained via ->next */
		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		/* inherit the header fields of the original skb */
		__copy_skb_header(nskb, head_skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		/* set the MAC header length */
		skb_reset_mac_len(nskb);

		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);

		if (nskb->len == len + doffset)
			goto perform_csum_check;

		if (!sg) {
			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_copy_and_csum_bits(head_skb, offset,
						       skb_put(nskb, len),
						       len, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
			continue;
		}

		nskb_frag = skb_shinfo(nskb)->frags;

		/* copy the remaining linear data into nskb (nothing to do
		 * when hsize == 0) */
		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);

		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
					     SKBTX_SHARED_FRAG;

		/* pos starts at the linear length; offset + len is how much
		 * of head_skb must be consumed once this segment is done.
		 * Walk the frags until this segment has its len bytes. */
		while (pos < offset + len) {
			if (i >= nfrags) {
				BUG_ON(skb_headlen(list_skb));

				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;

				BUG_ON(!nfrags);

				list_skb = list_skb->next;
			}

			if (unlikely(skb_shinfo(nskb)->nr_frags >=
				     MAX_SKB_FRAGS)) {
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
				goto err;
			}

			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
				goto err;

			/* non-linear data is not copied: the segment's frag
			 * simply references the original page (an extra page
			 * reference is taken) */
			*nskb_frag = *frag;
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			if (pos < offset) {
				nskb_frag->page_offset += offset - pos;
				skb_frag_size_sub(nskb_frag, offset - pos);
			}

			skb_shinfo(nskb)->nr_frags++;

			/* one frag consumed: advance pos */
			if (pos + size <= offset + len) {
				i++;
				frag++;
				pos += size;
			} else {
				skb_frag_size_sub(nskb_frag,
						  pos + size - (offset + len));
				goto skip_fraglist;
			}

			nskb_frag++;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;

perform_csum_check:
		if (!csum) {
			if (skb_has_shared_frag(nskb)) {
				err = __skb_linearize(nskb);
				if (err)
					goto err;
			}
			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_checksum(nskb, doffset,
					     nskb->len - doffset, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
		}
	} while ((offset += len) < head_skb->len); /* loop until all of head_skb is consumed */

	/* Some callers want to get the end of the list.
	 * Put it in segs->prev to avoid walking the list.
	 * (see validate_xmit_skb_list() for example)
	 */
	segs->prev = tail;

	if (partial_segs) {
		struct sk_buff *iter;
		int type = skb_shinfo(head_skb)->gso_type;
		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

		/* Update type to add partial and then remove dodgy if set */
		type |= (features & NETIF_F_GSO_PARTIAL) /
			NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
		type &= ~SKB_GSO_DODGY;

		/* Update GSO info and prepare to start updating headers on
		 * our way back down the stack of protocols.
		 */
		for (iter = segs; iter; iter = iter->next) {
			skb_shinfo(iter)->gso_size = gso_size;
			skb_shinfo(iter)->gso_segs = partial_segs;
			skb_shinfo(iter)->gso_type = type;
			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) +
							doffset;
		}

		if (tail->len - doffset <= gso_size)
			skb_shinfo(tail)->gso_size = 0;
		else if (tail != segs)
			skb_shinfo(tail)->gso_segs =
				DIV_ROUND_UP(tail->len - doffset, gso_size);
	}

	/* Following permits correct backpressure, for protocols
	 * using skb_set_owner_w().
	 * Idea is to tranfert ownership from head_skb to last segment.
	 */
	if (head_skb->destructor == sock_wfree) {
		swap(tail->truesize, head_skb->truesize);
		swap(tail->destructor, head_skb->destructor);
		swap(tail->sk, head_skb->sk);
	}
	return segs;

err:
	kfree_skb_list(segs);
	return ERR_PTR(err);
}
10、分段处理完成后,返回分段的skb链表,然后将分段好的skb链表进一步发送(dev(qdisc) —>驱动 —–>网卡)。
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/216984.html原文链接:https://javaforall.net
