In the previous article, https://cloud.tencent.com/developer/article/1180256?s=original-sharing, we covered some of the initialization done during dpvs startup and how those initialization steps are strung together in the data plane threads. In this article I will focus on the main logic of the data plane thread: what exactly the netif_loop thread does.
As mentioned before, the job functions registered into netif_loop are invoked in the following order: lcore_job_recv_fwd -> lcore_job_xmit -> lcore_job_timer_manage -> slave_lcore_loop_func -> ipv4_frag_job -> neigh_process_ring.
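To make that dispatch pattern concrete, here is a minimal, self-contained sketch of the idea. It is an illustration only, not the literal dpvs code (dpvs keeps its jobs on per-type lists inside netif.c and drives them from netif_loop); the names here are placeholders.
#include <stddef.h>
#include <stdint.h>

typedef void (*lcore_job_func_t)(void *arg);

struct lcore_job_sketch {
    const char       *name;        /* e.g. "recv_fwd", "xmit", "timer_manage" */
    lcore_job_func_t  func;
    void             *arg;
    uint32_t          skip_loops;  /* 0: run every iteration; N: run only every N iterations */
};

/* one iteration of the per-lcore loop: run every "loop" job each round,
 * and run the "slow" jobs (e.g. ipv4_frag_job, neigh_process_ring) only
 * when their round comes up */
static void run_jobs_once(struct lcore_job_sketch *jobs, size_t njobs, uint64_t round)
{
    for (size_t i = 0; i < njobs; i++) {
        if (jobs[i].skip_loops && (round % jobs[i].skip_loops) != 0)
            continue;              /* slow job, not due this round */
        jobs[i].func(jobs[i].arg); /* e.g. lcore_job_recv_fwd, lcore_job_xmit */
    }
}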
Let's start from lcore_job_recv_fwd; this article will mainly dig into that function.
static void lcore_job_recv_fwd(void *arg)
{
int i, j;
portid_t pid;
lcoreid_t cid;
struct netif_queue_conf *qconf;
cid = rte_lcore_id();
assert(LCORE_ID_ANY != cid);
for (i = 0; i < lcore_conf[lcore2index[cid]].nports; i++) {
pid = lcore_conf[lcore2index[cid]].pqs[i].id;
assert(pid < rte_eth_dev_count());
for (j = 0; j < lcore_conf[lcore2index[cid]].pqs[i].nrxq; j++) {
qconf = &lcore_conf[lcore2index[cid]].pqs[i].rxqs[j];
// fetch and process ARP packets queued on this lcore's arp_ring
lcore_process_arp_ring(qconf,cid);
qconf->len = netif_rx_burst(pid, qconf);
lcore_stats_burst(&lcore_stats[cid], qconf->len);
lcore_process_packets(qconf, qconf->mbufs, cid, qconf->len, 1);
kni_send2kern_loop(pid, qconf);
}
}
}
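Inside the inner loop, netif_rx_burst is the step that actually pulls packets off the NIC queue. Stripped down, it is essentially a thin wrapper around DPDK's burst receive API, along the lines of the simplified sketch below (not the full dpvs function; it assumes the dpvs types and the NETIF_MAX_PKT_BURST macro from netif.h).
/* simplified: receive up to one burst of mbufs from the given port/queue
 * into qconf->mbufs; the caller stores the return value in qconf->len */
static inline uint16_t netif_rx_burst_sketch(portid_t pid, struct netif_queue_conf *qconf)
{
    return rte_eth_rx_burst(pid, qconf->id, qconf->mbufs, NETIF_MAX_PKT_BURST);
}
The count returned here is what lcore_stats_burst and lcore_process_packets later receive as qconf->len.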
Looking at lcore_job_recv_fwd, it is worth first understanding the lcore_conf variable, which is defined as follows:
/* worker configuration array */
static struct netif_lcore_conf lcore_conf[NETIF_MAX_LCORES + 1];
/*
* lcore conf
* Multiple ports (i.e. NICs) may be processed by one lcore.
*/
struct netif_lcore_conf
{
lcoreid_t id;
/* nic number of this lcore to process */
int nports;
/* port list of this lcore to process */
struct netif_port_conf pqs[NETIF_MAX_PORTS];
} __rte_cache_aligned;
/*
* RX/TX port conf for lcore.
* Multiple queues of one port may also be processed by one lcore.
*/
struct netif_port_conf
{
portid_t id;
/* rx/tx queues for this lcore to process*/
int nrxq;
int ntxq;
/* rx/tx queue list for this lcore to process */
struct netif_queue_conf rxqs[NETIF_MAX_QUEUES];
struct netif_queue_conf txqs[NETIF_MAX_QUEUES];
} __rte_cache_aligned;
The definition of netif_lcore_conf looks a bit odd at first: "port" usually reads as a port number, but in dpvs a port generally means a NIC. This structure is populated by parsing the dpvs.conf configuration file. One lcore may handle multiple RX queues on multiple NICs, which is why lcore_job_recv_fwd reads the NIC queues in a double loop. Once packets have been pulled in, it calls lcore_process_packets (shown after the configuration snippet below) to process them.
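For context, this lcore/NIC/queue mapping normally comes from the worker_defs section of dpvs.conf. An illustrative fragment, modeled on conf/dpvs.conf.sample (option names may vary slightly between dpvs versions), looks like this:
worker_defs {
    <init> worker cpu1 {
        type    slave
        cpu_id  1
        port    dpdk0 {
            rx_queue_ids     0
            tx_queue_ids     0
        }
    }
}
Each worker block binds one lcore to the RX/TX queues of one or more ports; that binding is exactly what gets parsed into lcore_conf.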
static void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
lcoreid_t cid, uint16_t count, bool pretetch)
{
.........................
/* prefetch packets: pull packet data from memory into cache ahead of time to speed up processing */
if (pretetch) {
for (t = 0; t < qconf->len && t < NETIF_PKT_PREFETCH_OFFSET; t++)
rte_prefetch0(rte_pktmbuf_mtod(qconf->mbufs[t], void *));
}
/* L2 filter */
for (i = 0; i < count; i++) {
struct rte_mbuf *mbuf = mbufs[i];
struct netif_port *dev = netif_port_get(mbuf->port);
if (unlikely(!dev)) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
if (dev->type == PORT_TYPE_BOND_SLAVE) {
dev = dev->bond->slave.master;
mbuf->port = dev->id;
}
if (pretetch && (t < qconf->len)) {
rte_prefetch0(rte_pktmbuf_mtod(qconf->mbufs[t], void *));
t++;
}
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
/* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
mbuf->packet_type = eth_type_parse(eth_hdr, dev);
/*
* In NETIF_PORT_FLAG_FORWARD2KNI mode.
* All packets received are deep copied and sent to KNI
* for the purpose of capturing forwarding packets.Since the
* rte_mbuf will be modified in the following procedure,
* we should use mbuf_copy instead of rte_pktmbuf_clone.
*/
if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) {
if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
pktmbuf_pool[dev->socket]))))
kni_ingress(mbuf_copied, dev, qconf);
else
RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
__func__);
}
/*
* do not drop pkt to other hosts (ETH_PKT_OTHERHOST)
* since virtual devices may have different MAC with
* underlying device.
*/
/*
* handle VLAN
* if HW offload vlan strip, it's still need vlan module
* to act as VLAN filter.
*/
if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
mbuf->ol_flags & PKT_RX_VLAN_STRIPPED) {
if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
dev = netif_port_get(mbuf->port);
if (unlikely(!dev)) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
}
/* handler should free mbuf */
netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
(dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true:false,
cid, pkts_from_ring);
lcore_stats[cid].ibytes += mbuf->pkt_len;
lcore_stats[cid].ipackets++;
}
}
This function is the layer-2 filter, i.e. link-layer filtering. Once the link-layer handling is done, it calls netif_deliver_mbuf, and from there we move up to the IP layer.
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
uint16_t eth_type,
struct netif_port *dev,
struct netif_queue_conf *qconf,
bool forward2kni,
lcoreid_t cid,
bool pkts_from_ring)
{
struct pkt_type *pt;
int err;
uint16_t data_off;
assert(mbuf->port <= NETIF_MAX_PORTS);
assert(dev != NULL);
pt = pkt_type_get(eth_type, dev);
...............................
/* upper-layer protocol handling starts here (e.g. ipv4_rcv); currently only IP and ARP are handled, and only these two pkt_types are registered */
err = pt->func(mbuf, dev);
return EDPVS_OK;
}
The key call in this function is pt->func(mbuf, dev). In dpvs only two pkt_type entries are registered at this layer: ip4_pkt_type and arp_pkt_type. ip4_pkt_type is registered in ipv4_init; as mentioned in the previous article, ipv4_init also calls ipv4_frag_init to register a NETIF_LCORE_JOB_SLOW job. arp_pkt_type is registered in arp_init, which, as also mentioned in the previous article, registers a NETIF_LCORE_JOB_SLOW job of its own. (A sketch of the registration call is given after the two struct definitions below.)
static struct pkt_type ip4_pkt_type = {
//.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4),
.func = ipv4_rcv,
.port = NULL,
};
static struct pkt_type arp_pkt_type = {
//.type = rte_cpu_to_be_16(ETHER_TYPE_ARP),
.func = neigh_resolve_input,
.port = NULL,
};
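From my reading of the dpvs source, the registration itself is a small step in ipv4_init / arp_init: the .type field (commented out in the static initializers above, presumably because rte_cpu_to_be_16 is not usable there) is filled in at init time and the pkt_type is handed to the netif layer via netif_register_pkt. The sketch below shows the approximate shape for IPv4; treat the exact calls and error handling as an approximation rather than verbatim code.
int ipv4_init(void)
{
    int err;

    /* ... other initialization (routes, fragmentation, ...) elided ... */

    ip4_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
    err = netif_register_pkt(&ip4_pkt_type);
    if (err != EDPVS_OK)
        return err;

    return EDPVS_OK;
}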
So for IPv4 packets, pt->func actually ends up calling ipv4_rcv. Let's look at what ipv4_rcv does next.
static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port)
{
/*主要是一些错误检查之类的工作*/
......................
return INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);
csum_error:
IP4_INC_STATS(csumerrors);
inhdr_error:
IP4_INC_STATS(inhdrerrors);
drop:
rte_pktmbuf_free(mbuf);
return EDPVS_INVPKT;
}
After some error checking, ipv4_rcv calls INET_HOOK. Let's see what this hook function does in turn.
int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf,
struct netif_port *in, struct netif_port *out,
int (*okfn)(struct rte_mbuf *mbuf))
{
struct list_head *hook_list;
struct inet_hook_ops *ops;
struct inet_hook_state state;
int verdict = INET_ACCEPT;
state.hook = hook;
hook_list = &inet_hooks[hook];
......................
ops = list_entry(hook_list, struct inet_hook_ops, list);
if (!list_empty(hook_list)) {
verdict = INET_ACCEPT;
list_for_each_entry_continue(ops, hook_list, list) {
repeat:
verdict = ops->hook(ops->priv, mbuf, &state); /* dp_vs_in and dp_vs_pre_routing are invoked here one after the other */
if (verdict != INET_ACCEPT) {
if (verdict == INET_REPEAT)
goto repeat;
break;
}
}
}
..............
}
The inet_hooks list used here is registered in dp_vs_init. For the INET_HOOK_PRE_ROUTING hook number there are two entries (a sketch of the priority-ordered registration idea follows the array):
static struct inet_hook_ops dp_vs_ops[] = {
{
.hook = dp_vs_in,
.hooknum = INET_HOOK_PRE_ROUTING,
.priority = 100,
},
{
.hook = dp_vs_pre_routing,
.hooknum = INET_HOOK_PRE_ROUTING,
.priority = 99,
},
};
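dp_vs_init hands this array to the inet hook registration helper, which keeps each hook point's ops list sorted by priority so that INET_HOOK can simply walk it front to back. The following self-contained sketch illustrates that ordering idea with placeholder types; it is not the actual dpvs registration API, and it assumes (netfilter-style) that a smaller priority value runs earlier.
#include <stddef.h>

struct hook_op_sketch {
    int (*hook)(void *priv);   /* the real dpvs hook also takes the mbuf and hook state */
    int priority;              /* assumed: smaller value runs earlier */
};

/* insert op into ops[0..*n), keeping the array in ascending priority order */
static void register_hook_sketch(struct hook_op_sketch *ops, size_t *n,
                                 struct hook_op_sketch op)
{
    size_t i = *n;
    while (i > 0 && ops[i - 1].priority > op.priority) {
        ops[i] = ops[i - 1];   /* shift ops with a larger priority value one slot right */
        i--;
    }
    ops[i] = op;
    (*n)++;
}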
At this point we know that INET_HOOK invokes dp_vs_in and dp_vs_pre_routing one after another, in the order determined by their priority values. Let's trace into dp_vs_in and take a look.
static int dp_vs_in(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state)
{
..........................................................
/* packet belongs to existing connection ? */
conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false);
/* e.g. for TCP, the conn_sched call below actually resolves to tcp_conn_sched */
if (unlikely(!conn)) {
/* try schedule RS and create new connection */
if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
/* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
return verdict;
}
/* only SNAT triggers connection by inside-outside traffic. */
if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
dir = DPVS_CONN_DIR_OUTBOUND;
else
dir = DPVS_CONN_DIR_INBOUND;
}
...................
/* xmit_inbound forwards the packet to the RS; xmit_outbound sends the reply back out */
/* holding the conn, need a "put" later. */
if (dir == DPVS_CONN_DIR_INBOUND)
return xmit_inbound(mbuf, prot, conn);
else
return xmit_outbound(mbuf, prot, conn);
}
I have also stripped a lot out of dp_vs_in. Checks aside, its main logic is to decide whether the IP packet belongs to an existing connection. If it does not, prot->conn_sched creates a new connection; if it does, the packet is forwarded directly: xmit_inbound forwards it to the RS, while xmit_outbound sends the reply back out. Let's now dig into prot->conn_sched.
The dpvs source registers three struct dp_vs_proto instances: TCP, UDP and ICMP. All three are registered in dp_vs_proto_init (main -> dpvs_init -> dp_vs_proto_init). Let's look at dp_vs_proto_tcp:
struct dp_vs_proto dp_vs_proto_tcp = {
.name = "TCP",
.proto = IPPROTO_TCP,
.init = tcp_init,
.exit = tcp_exit,
.conn_sched = tcp_conn_sched,
.conn_lookup = tcp_conn_lookup,
.conn_expire = tcp_conn_expire,
.fnat_in_handler = tcp_fnat_in_handler,
.fnat_out_handler = tcp_fnat_out_handler,
.snat_in_handler = tcp_snat_in_handler,
.snat_out_handler = tcp_snat_out_handler,
.state_trans = tcp_state_trans,
};
For dp_vs_proto_tcp, conn_sched is actually tcp_conn_sched, and that is where the connection really gets established. Inside it, dp_vs_schedule is called, and dp_vs_schedule is where the so-called full-nat vs. snat choice is made (a rough placeholder sketch of that choice follows). With that, the walk through layer-3 forwarding is more or less complete; if I get the chance later, I will cover some of the other modules.
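As a rough, self-contained illustration of that choice (placeholder types and helpers only, not the real dp_vs_schedule signature):
/* Once a real server (dest) has been picked by the scheduler, its
 * forwarding mode decides how the new connection starts out: only SNAT
 * is triggered by inside-to-outside traffic, so it begins in the
 * outbound direction, while FNAT/NAT/DR/TUNNEL begin inbound toward the
 * RS (this mirrors the dp_vs_in snippet earlier). */
enum fwd_mode_sketch { FWD_FNAT, FWD_SNAT, FWD_DR, FWD_NAT, FWD_TUNNEL };
enum conn_dir_sketch { DIR_INBOUND, DIR_OUTBOUND };

struct dest_sketch {
    enum fwd_mode_sketch fwdmode;  /* forwarding mode configured on this dest */
};

static enum conn_dir_sketch initial_direction(const struct dest_sketch *dest)
{
    return dest->fwdmode == FWD_SNAT ? DIR_OUTBOUND : DIR_INBOUND;
}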
Across these two articles we have walked through the main logic from dpvs startup to layer-3 forwarding. I wrote them because I will soon be working on the GBN project, and before joining it I needed to study the dpvs source code, so this is a record of that learning process. Personally I find the dpvs source quite interesting to read: to squeeze out performance it relies heavily on instruction-level hints, for example the prefetch calls and the likely/unlikely macros that show up repeatedly in the code quoted above. Prefetching and branch-prediction hints appear quite a lot in dpvs.
Original statement: this article is published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission. In case of infringement, please contact cloudcommunity@tencent.com for removal.