Overview
A socket filter, whose BPF program type is BPF_PROG_TYPE_SOCKET_FILTER, implements, as the name suggests, a filter attached to a socket. This article analyzes how BPF_PROG_TYPE_SOCKET_FILTER programs are implemented, all the way down to the kernel hook functions. The kernel ships sample code at samples/bpf/sock_example.c, samples/bpf/sockex1_kern.c and so on.
The section name of a socket filter program is usually declared as SEC("socketxxx"). The code analysis below is based on kernel version 5.15.99.
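For orientation, a complete socket filter written in C can be as small as the sketch below (the file layout and function name are made up here; the structure mirrors samples/bpf/sockex1_kern.c). The program's return value is the number of bytes of the packet to keep: returning skb->len accepts the packet untouched, returning 0 drops it.
/* Minimal socket filter sketch (hypothetical), built with clang -target bpf. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
SEC("socket")
int socket_filter_accept_all(struct __sk_buff *skb)
{
	return skb->len;	/* keep the whole packet */
}
char _license[] SEC("license") = "GPL";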
Loading the prog
We study this through samples/bpf/sock_example.c. First, the leading comment and file header:
/* eBPF example program:
* - creates arraymap in kernel with key 4 bytes and value 8 bytes
*
* - loads eBPF program:
* r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
* *(u32*)(fp - 4) = r0;
* // assuming packet is IPv4, lookup ip->proto in a map
* value = bpf_map_lookup_elem(map_fd, fp - 4);
* if (value)
* (*(u64*)value) += 1;
*
* - attaches this program to loopback interface "lo" raw socket
*
* - every second user space reads map[tcp], map[udp], map[icmp] to see
* how many packets of given protocol were seen on "lo"
*/
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <linux/bpf.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <stddef.h>
#include <bpf/bpf.h>
#include "bpf_insn.h"
#include "sock_example.h"
The map is created with the libbpf helper bpf_create_map:
int sock = -1, map_fd, prog_fd, i, key;
long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
256, 0);
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
goto cleanup;
}
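bpf_create_map is not itself a system call; under the hood it fills a union bpf_attr and issues the bpf(2) syscall with the BPF_MAP_CREATE command. A rough equivalent of what it does for the map above (a sketch, not the libbpf source):
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
/* Sketch: what bpf_create_map() boils down to for this example. */
static int create_array_map(void)
{
	union bpf_attr attr;
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(int);        /* 4-byte key   */
	attr.value_size  = sizeof(long long);  /* 8-byte value */
	attr.max_entries = 256;
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}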
The BPF prog itself, defined directly as an array of bytecode instructions:
struct bpf_insn prog[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
BPF_LD_MAP_FD(BPF_REG_1, map_fd),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
BPF_EXIT_INSN(),
};
size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
Here `bpf_insn` is the low-level bytecode representation of a BPF program, an abstracted assembly format. Every higher-level toolchain lowers its output into this form, and the kernel finally translates it into native machine code (via the JIT) or interprets it.
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* dest register */
__u8 src_reg:4; /* source register */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};
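The BPF_* macros used in the instruction array come from samples/bpf/bpf_insn.h (mirroring tools/include/linux/filter.h) and simply fill in these five fields. For example, BPF_MOV64_REG expands roughly to:
#define BPF_MOV64_REG(DST, SRC)				\
	((struct bpf_insn) {				\
		.code  = BPF_ALU64 | BPF_MOV | BPF_X,	\
		.dst_reg = DST,				\
		.src_reg = SRC,				\
		.off   = 0,				\
		.imm   = 0 })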
The instruction array above implements:
r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
*(u32*)(fp - 4) = r0;
// assuming packet is IPv4, lookup ip->proto in a map
value = bpf_map_lookup_elem(map_fd, fp - 4);
if (value)
(*(u64*)value) += 1;
The prog is loaded with the libbpf helper bpf_load_program, passing BPF_PROG_TYPE_SOCKET_FILTER to mark it as a socket filter program:
prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt,
"GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
if (prog_fd < 0) {
printf("failed to load prog '%s'\n", strerror(errno));
goto cleanup;
}
open_raw_sock creates a raw socket, and setsockopt with the SO_ATTACH_BPF option attaches the BPF prog to it:
sock = open_raw_sock("lo");
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
sizeof(prog_fd)) < 0) {
printf("setsockopt %s\n", strerror(errno));
goto cleanup;
}
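The rest of the sample then reads the per-protocol counters back out of the map once a second; roughly (an abridged sketch of that loop, using the libbpf userspace wrapper bpf_map_lookup_elem):
	for (i = 0; i < 10; i++) {
		key = IPPROTO_TCP;
		assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
		key = IPPROTO_UDP;
		assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
		key = IPPROTO_ICMP;
		assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
		printf("TCP %lld UDP %lld ICMP %lld packets\n",
		       tcp_cnt, udp_cnt, icmp_cnt);
		sleep(1);
	}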
Tracing the hook point
Next, the focus is on how setsockopt wires the program into the socket filter hook point.
Searching the setsockopt source for the SO_ATTACH_BPF handling, the logic is found around net/core/sock.c:1169 in 5.15.99:
case SO_ATTACH_BPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_attach_bpf(ufd, sk);
}
break;
Jump to `sk_attach_bpf` (`net/core/filter.c:1571`):
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog = __get_bpf(ufd, sk);
int err;
if (IS_ERR(prog))
return PTR_ERR(prog);
err = __sk_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
return err;
}
return 0;
}
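For completeness, __get_bpf in the same file is roughly the following: it refuses sockets whose filter has been locked and otherwise resolves the fd to a bpf_prog of the expected type.
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}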
Setting aside the BPF-side bookkeeping on the prog object, follow __sk_attach_prog:
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
//Allocate the sk_filter object that will hold the prog
// struct sk_filter {
// refcount_t refcnt;
// struct rcu_head rcu;
// struct bpf_prog *prog;
// };
struct sk_filter *fp, *old_fp;
fp = kmalloc(sizeof(*fp), GFP_KERNEL);
if (!fp)
return -ENOMEM;
fp->prog = prog;
// Charge the filter's size against the socket's memory accounting; free fp on failure
if (!__sk_filter_charge(sk, fp)) {
kfree(fp);
return -ENOMEM;
}
refcount_set(&fp->refcnt, 1);
// Fetch the previously attached filter, if any
old_fp = rcu_dereference_protected(sk->sk_filter,
lockdep_sock_is_held(sk));
// Point sk->sk_filter at the newly allocated fp
rcu_assign_pointer(sk->sk_filter, fp);
// If an old filter was displaced, release its accounting and memory
if (old_fp)
sk_filter_uncharge(sk, old_fp);
return 0;
}
After the steps annotated above, the prog object is reachable as sk->sk_filter->prog.
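The "charge" step accounts the filter's size against the socket's option-memory budget; __sk_filter_charge is roughly (paraphrased from the 5.x source):
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}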
sk_filter
Now search for sk_filter to see where it is called. It shows up in many places; the definition lives in **include/linux/filter.h**:
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
return sk_filter_trim_cap(sk, skb, 1);
}
sk_filter is the packaged entry point for running sk->sk_filter; other code either dereferences sk->sk_filter itself or calls sk_filter_trim_cap directly to run the SOCKET_FILTER program.
sk_filter_trim_cap
Follow **sk_filter_trim_cap** in net/core/filter.c:
/**
* sk_filter_trim_cap - run a packet through a socket filter
* @sk: sock associated with &sk_buff
* @skb: buffer to filter
* @cap: limit on how short the eBPF program may trim the packet
*
* Run the eBPF program and then cut skb->data to correct size returned by
* the program. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
* wrapper to bpf_prog_run. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
int err;
struct sk_filter *filter;
/*
* If the skb was allocated from pfmemalloc reserves, only
* allow SOCK_MEMALLOC sockets to use it as this socket is
* helping free memory
*/
// Check whether the skb was allocated from pfmemalloc reserves
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
return -ENOMEM;
}
err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
if (err)
return err;
//LSM framework hook point
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
struct sock *save_sk = skb->sk;
unsigned int pkt_len;
skb->sk = sk;
pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
skb->sk = save_sk;
err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);
What PF_MEMALLOC means: the current process holds a lot of memory it can free, so granting it a little emergency memory lets it return even more memory to the system. Subsystems other than memory management should not use this flag unless the allocation is guaranteed to free more memory than it takes; if every subsystem abused it, the memory-management reserves would be exhausted.
The function first checks whether the skb came from pfmemalloc reserves; if so, only sockets flagged SOCK_MEMALLOC may use it, otherwise it returns -ENOMEM and bumps the **LINUX_MIB_PFMEMALLOCDROP** counter. This keeps non-critical sockets from consuming scarce reserve memory under memory pressure.
Next, **BPF_CGROUP_RUN_PROG_INET_INGRESS()** runs the eBPF programs attached to the cgroup ingress hook; a non-zero result is returned immediately. This is how cgroup-level network isolation and policing are implemented.
If the CGROUP_INET_INGRESS attach point of CGROUP_BPF is enabled, __cgroup_bpf_run_filter_skb runs the cgroup filter programs; otherwise the macro evaluates to 0 and execution continues. The cgroup details are deferred to a separate discussion.
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
int __ret = 0; \
if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(sk, skb, \
CGROUP_INET_INGRESS); \
\
__ret; \
})
Then **security_sock_rcv_skb** is called, the reserved LSM hook that checks whether the socket is allowed to receive this skb. After that, the RCU read lock is taken to guard against concurrent replacement of the filter, and the sk_filter pointer is read from sk.
With sk->sk_filter in hand, skb->sk is temporarily set to the current socket, **bpf_prog_run_save_cb** runs the BPF program, and the saved skb->sk is then restored.
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
struct sock *save_sk = skb->sk;
unsigned int pkt_len;
skb->sk = sk;
pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
skb->sk = save_sk;
err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
If the returned length is non-zero, pskb_trim cuts the skb's data down to the larger of cap and the returned length (propagating any trim error); if the returned length is 0, err is set to -EPERM and the packet is dropped.
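In other words, the filter's return value is a length budget, not just accept/drop. A hypothetical filter illustrating this (not part of the sample; headers and license section as in the earlier sketch):
SEC("socket")
int keep_first_64(struct __sk_buff *skb)
{
	/* Becomes pkt_len in sk_filter_trim_cap(): the skb is trimmed to
	 * max(cap, 64) bytes; returning 0 would instead drop it (-EPERM). */
	return 64;
}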
bpf_prog_run_save_cb
The execution of the BPF prog itself is worth a quick look; the finer details are left to the BPF core source analysis.
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
struct sk_buff *skb)
{
u32 res;
migrate_disable();
res = __bpf_prog_run_save_cb(prog, skb);
migrate_enable();
return res;
}
/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
const void *ctx)
{
const struct sk_buff *skb = ctx;
u8 *cb_data = bpf_skb_cb(skb);
u8 cb_saved[BPF_SKB_CB_LEN];
u32 res;
if (unlikely(prog->cb_access)) {
memcpy(cb_saved, cb_data, sizeof(cb_saved));
memset(cb_data, 0, sizeof(cb_saved));
}
res = bpf_prog_run(prog, skb);
if (unlikely(prog->cb_access))
memcpy(cb_data, cb_saved, sizeof(cb_saved));
return res;
}
static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
{
/* eBPF programs may read/write skb->cb[] area to transfer meta
* data between tail calls. Since this also needs to work with
* tc, that scratch memory is mapped to qdisc_skb_cb's data area.
*
* In some socket filter cases, the cb unfortunately needs to be
* saved/restored so that protocol specific skb->cb[] data won't
* be lost. In any case, due to unpriviledged eBPF programs
* attached to sockets, we need to clear the bpf_skb_cb() area
* to not leak previous contents to user space.
*/
BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
sizeof_field(struct qdisc_skb_cb, data));
return qdisc_skb_cb(skb)->data;
}
static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
{
return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
}
static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
const void *ctx,
bpf_dispatcher_fn dfunc)
{
u32 ret;
cant_migrate();
if (static_branch_unlikely(&bpf_stats_enabled_key)) {
struct bpf_prog_stats *stats;
u64 start = sched_clock();
unsigned long flags;
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
u64_stats_add(&stats->nsecs, sched_clock() - start);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
} else {
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
}
return ret;
}
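Here dfunc is bpf_dispatcher_nop_func, which simply tail-calls into prog->bpf_func, i.e. the JITed image (or the interpreter entry when the JIT is off); its definition in include/linux/bpf.h is roughly:
static __always_inline unsigned int bpf_dispatcher_nop_func(
	const void *ctx,
	const struct bpf_insn *insnsi,
	unsigned int (*bpf_func)(const void *,
				 const struct bpf_insn *))
{
	return bpf_func(ctx, insnsi);
}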
Call chain analysis
Searching for sk->sk_filter, **sk_filter** and **sk_filter_trim_cap** shows where socket filter programs are actually run.
SOCKET_RAW
Look up the references and walk the call chain backwards. Searching for callers of sk_filter, we land on **sock_queue_rcv_skb** (net/core/sock.c); many of these call sites carry explanatory comments, for example:
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
err = sk_filter(sk, skb);
if (err)
return err;
return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
The sk_filter call here is the hook function we are after.
sock_queue_rcv_skb has a great many callers, most of them adaptations for various protocols, e.g. J1939, which turns out to be an automotive CAN-bus protocol. Here we focus on net/ieee802154/socket.c.
There, both dgram_rcv_skb (for datagram sockets) and raw_rcv_skb call sock_queue_rcv_skb:
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return NET_RX_DROP;
if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
}
return NET_RX_SUCCESS;
}
The IPv4 **raw_rcv_skb** logic is much the same; in both stacks it is reached from a raw_rcv function:
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
ipv4_pktinfo_prepare(sk, skb);
if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
}
return NET_RX_SUCCESS;
}
Following into raw_rcv in net/ipv4/raw.c:
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
// Security (XFRM/IPsec) policy check
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return NET_RX_DROP;
}
//Reset the netfilter conntrack state attached to the skb
nf_reset_ct(skb);
skb_push(skb, skb->data - skb_network_header(skb));
raw_rcv_skb(sk, skb);
return 0;
}
Moving up to raw_v4_input, which handles delivery of incoming packets to raw sockets on the RX path. SOCK_RAW allows multiple sockets to receive the same packet, so the skb is cloned for each matching socket:
/* IP input processing comes here for RAW socket delivery.
* Caller owns SKB, so we must make clones.
*
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
......
// Look up a matching raw socket (by protocol, addresses and device)
net = dev_net(skb->dev);
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr, dif, sdif);
while (sk) {
delivered = 1;
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
skb->dev->ifindex, sdif)) {
// Clone so each socket receives its own private copy of the packet
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
if (clone)
raw_rcv(sk, clone);
}
sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
dif, sdif);
}
out:
read_unlock(&raw_v4_hashinfo.lock);
return delivered;
}
Next up is raw_local_deliver:
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
int hash;
struct sock *raw_sk;
// Hash by protocol and take the socket list head from raw_v4_hashinfo
hash = protocol & (RAW_HTABLE_SIZE - 1);
raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
/* If there maybe a raw socket we must check - if not we
* don't care less
*/
if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
raw_sk = NULL;
return raw_sk != NULL;
}
Tracing further up: ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->raw_local_deliver. This is the entry path where the network layer hands packets up to the transport layer; ip_local_deliver is responsible for delivering packets from the IP layer to the upper protocols. Since SOCK_RAW bypasses the transport layer, the filter check sits here; for the details see the networking article:
[Linux kernel source analysis] The networking subsystem
SOCKET_STREAM
In net/ipv4/tcp_ipv4.c, tcp_filter calls sk_filter_trim_cap, passing cap = th->doff * 4 (the TCP header length in bytes), so the filter can never trim away the TCP header itself:
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = (struct tcphdr *)skb->data;
return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);
Move up to tcp_v4_rcv (the AF_INET TCP receive handler). **tcp_filter** is called both in the **TCP_NEW_SYN_RECV** handling and in the main body of the function. tcp_v4_rcv handles the TCP_NEW_SYN_RECV case: if the connection check succeeds, a new control block is created to handle the connection, and that new control block starts out in the **TCP_SYN_RECV** state.
/*
* From tcp_input.c
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct sk_buff *skb_to_free;
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
......
.........
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
// Find the socket that should receive this TCP segment
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
// TCP_NEW_SYN_RECV handling
if (sk->sk_state == TCP_NEW_SYN_RECV) {
.........
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
}
.........
if (tcp_filter(sk, skb)) {
drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto discard_and_relse;
}
......
if (!sock_owned_by_user(sk)) {
skb_to_free = sk->sk_rx_skb_cache;
sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
skb_to_free = NULL;
}
.........
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo, skb,
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
goto process;
}
}
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}
SOCKET_DGRAM
Searching for the sk_filter family turns up udp_queue_rcv_one_skb, which is invoked from inside udp_queue_rcv_skb:
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff *next, *segs;
int ret;
if (likely(!udp_unexpected_gso(sk, skb)))
return udp_queue_rcv_one_skb(sk, skb);
......
}
Tracing the callers back up to **udp_rcv** shows that the check lives inside the UDP receive handler. Now walk forward from udp_rcv:
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
Follow __udp4_lib_rcv:
/*
* All we need to do is get the socket, and then do a checksum.
*/
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
struct sock *sk;
struct udphdr *uh;
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
......
sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
if (refcounted)
sock_put(sk);
return ret;
}
if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return __udp4_lib_mcast_deliver(net, skb, uh,
saddr, daddr, udptable, proto);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk)
return udp_unicast_rcv_skb(sk, skb, uh);
......
}
Both **udp_unicast_rcv_skb** and **__udp4_lib_mcast_deliver** call **udp_queue_rcv_skb**, which in turn calls **udp_queue_rcv_one_skb**. Finally, udp_queue_rcv_one_skb calls **sk_filter_trim_cap**, with cap = sizeof(struct udphdr) so the UDP header can never be trimmed away:
/* returns:
* -1: error
* 0: success
* >0: "udp encap" protocol resubmission
*
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
nf_reset_ct(skb);
......
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
goto drop;
......
}
Call chain: udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
Other protocols
A few other kernel functions also call the sk_filter family, but they belong to generic sock handling (__sk_receive_skb) used by protocols such as DCCP, PPPoE and L2TP; they are not analyzed here.
Socket teardown logic
The point to watch is __sk_destruct (net/core/sock.c): it checks whether an sk_filter is still attached and, if so, calls sk_filter_uncharge to release the allocated memory.
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
* grace period. This is the case for UDP sockets and TCP listeners.
*/
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
struct sk_filter *filter;
if (sk->sk_destruct)
sk->sk_destruct(sk);
filter = rcu_dereference_check(sk->sk_filter,
refcount_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
......
#ifdef CONFIG_BPF_SYSCALL
bpf_sk_storage_free(sk);
#endif
......
sk_prot_free(sk->sk_prot_creator, sk);
}
Moving up to sk_destruct:
void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
reuseport_detach_sock(sk);
use_call_rcu = true;
}
if (use_call_rcu)
call_rcu(&sk->sk_rcu, __sk_destruct);
else
__sk_destruct(&sk->sk_rcu);
}
Then **__sk_free**:
static void __sk_free(struct sock *sk)
{
if (likely(sk->sk_net_refcnt))
sock_inuse_add(sock_net(sk), -1);
if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
sock_diag_broadcast_destroy(sk);
else
sk_destruct(sk);
}
And up to sk_free:
void sk_free(struct sock *sk)
{
/*
* We subtract one from sk_wmem_alloc and can know if
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
if (refcount_dec_and_test(&sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
sk_free is the kernel's function for destroying a socket object; socket objects are allocated with sk_alloc. Below is an example from tipc_sk_create (net/tipc/socket.c): the socket object is created via sk_alloc, and if a later step fails, sk_free releases it.
/* Allocate socket's protocol area */
// sk_alloc - All socket objects are allocated here
sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
if (sk == NULL)
return -ENOMEM;
tsk = tipc_sk(sk);
tsk->max_pkt = MAX_PKT_DEFAULT;
tsk->maxnagle = 0;
tsk->nagle_start = NAGLE_START_INIT;
INIT_LIST_HEAD(&tsk->publications);
INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;
/* Finish initializing socket data structures */
sock->ops = ops;
sock_init_data(sock, sk);
tipc_set_sk_state(sk, TIPC_OPEN);
if (tipc_sk_insert(tsk)) {
sk_free(sk);
pr_warn("Socket create failed; port number exhausted\n");
return -EINVAL;
}
Teardown call chain: sk_free->__sk_free->sk_destruct->__sk_destruct
SOCKET_PACKET
run_filter in net/packet/af_packet.c fetches sk->sk_filter->prog and runs the BPF program:
static unsigned int run_filter(struct sk_buff *skb,
const struct sock *sk,
unsigned int res)
{
struct sk_filter *filter;
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
res = bpf_prog_run_clear_cb(filter->prog, skb);
rcu_read_unlock();
return res;
}
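Note that, unlike the sk_filter_trim_cap path, packet sockets use bpf_prog_run_clear_cb, which clears (rather than saves and restores) the skb->cb scratch area before running the program; its definition in include/linux/filter.h is roughly:
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
					struct sk_buff *skb)
{
	u8 *cb_data = bpf_skb_cb(skb);
	u32 res;

	if (unlikely(prog->cb_access))
		memset(cb_data, 0, BPF_SKB_CB_LEN);

	res = bpf_prog_run_pin_on_cpu(prog, skb);
	return res;
}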
It is called from packet_rcv:
/*
* This function makes lazy skb cloning in hope that most of packets
* are discarded by BPF.
*
* Note tricky part: we DO mangle shared skb! skb->data, skb->len
* and skb->cb are mangled. It works because (and until) packets
* falling here are owned by current CPU. Output packets are cloned
* by dev_queue_xmit_nit(), input packets are processed by net_bh
* sequentially, so that if we return skb to original state on exit,
* we will not harm anyone.
*/
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_ll *sll;
struct packet_sock *po;
u8 *skb_head = skb->data;
int skb_len = skb->len;
unsigned int snaplen, res;
bool is_drop_n_account = false;
if (skb->pkt_type == PACKET_LOOPBACK)
goto drop;
sk = pt->af_packet_priv;
po = pkt_sk(sk);
if (!net_eq(dev_net(dev), sock_net(sk)))
goto drop;
skb->dev = dev;
......
res = run_filter(skb, sk, snaplen);
......
}
Now look at packet_create, the create function of the PF_PACKET family: it allocates the packet_sock and installs the packet_rcv pointer as the protocol handler. When userspace creates a socket, __sock_create() looks up the family's ops registered in the net_families table via sock_register(); for PF_PACKET the registered .create callback is this packet_create (inet_create and the inetsw array play the analogous role for PF_INET).
/*
* Create a packet of type SOCK_PACKET.
*/
static int packet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
......
po->prot_hook.func = packet_rcv;
if (sock->type == SOCK_PACKET)
po->prot_hook.func = packet_rcv_spkt;
po->prot_hook.af_packet_priv = sk;
po->prot_hook.af_packet_net = sock_net(sk);
if (proto) {
po->prot_hook.type = proto;
__register_prot_hook(sk);
}
......
}
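The registration itself happens in packet_init() in the same file; roughly (abridged):
static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
static int __init packet_init(void)
{
	......
	rc = sock_register(&packet_family_ops);
	......
}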
The packet_sock structure:
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
struct packet_fanout *fanout;
union tpacket_stats_u stats;
struct packet_ring_buffer rx_ring;
struct packet_ring_buffer tx_ring;
int copy_thresh;
spinlock_t bind_lock;
struct mutex pg_vec_lock;
unsigned int running; /* bind_lock must be held */
unsigned int auxdata:1, /* writer must hold sock lock */
origdev:1,
has_vnet_hdr:1,
tp_loss:1,
tp_tx_has_off:1;
int pressure;
int ifindex; /* bound device */
__be16 num;
struct packet_rollover *rollover;
struct packet_mclist *mclist;
atomic_t mapped;
enum tpacket_versions tp_version;
unsigned int tp_hdrlen;
unsigned int tp_reserve;
unsigned int tp_tstamp;
struct completion skb_completion;
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp;
atomic_t tp_drops ____cacheline_aligned_in_smp;
};
Summary
A BPF program of type BPF_PROG_TYPE_SOCKET_FILTER is attached with setsockopt, and the kernel hook function is **sk_filter_trim_cap** in net/core/filter.c.
Call chain summary
- SOCKET_RAW:ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->raw_local_deliver->raw_v4_input->raw_rcv->raw_rcv_skb->sock_queue_rcv_skb->sk_filter
- SOCKET_STREAM:tcp_v4_rcv->tcp_filter->sk_filter_trim_cap
- SOCKET_DGRAM:udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
- Socket teardown:sk_free->__sk_free->sk_destruct->__sk_destruct
- SOCKET_PACKET:packet_create (registers the handler)->packet_rcv->run_filter