背景:这个是在centos7.6的环境上复现的,但该问题其实在很多内核版本上都有,如何做好对linux一些缓存的监控和控制,一直是云计算方向的热点,但这些热点属于细分场景,很难合入到linux主基线,随着ebpf的逐渐稳定,对通用linux内核的编程,观测,可能会有新的收获。下面列一下我们是怎么排查并解决这个问题的。
一、故障现象
+ 92.00% 3.96% [kernel] [k] __d_lookup - 48.95% 48.95% [kernel] [k] _raw_spin_lock 20.95% 0x70692f74656e2f73 __fopen_internal __GI___libc_open system_call sys_open do_sys_open do_filp_open path_openat link_path_walk + lookup_fast - 45.71% 44.58% [kernel] [k] proc_sys_compare - 5.48% 0x70692f74656e2f73 __fopen_internal __GI___libc_open system_call sys_open do_sys_open do_filp_open path_openat + 1.13% proc_sys_compare
几乎都消耗在内核态 __d_lookup的调用中,然后strace看到的消耗为:
open("/proc/sys/net/ipv4/neigh/kube-ipvs0/retrans_time_ms", O_RDONLY) = 8 <0.000024>------v4的比较快 open("/proc/sys/net/ipv6/neigh/ens7f0_58/retrans_time_ms", O_RDONLY) = 8 <0.>-------v6很慢
进一步手工操作,发现进入ipv6的路径很慢:
time cd /proc/sys/net
time cd /proc/sys/net/ipv6
time cd /proc/sys/net/ipv4
二、故障现象分析
__d_lookup--->if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { ..... hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). */ if (parent->d_flags & DCACHE_OP_COMPARE) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) goto next;//caq:返回1则是不相同 } else { if (dentry->d_name.len != len) goto next; if (dentry_cmp(dentry, str, len)) goto next; } .... next: spin_unlock(&dentry->d_lock);//caq:再次进入链表循环 } ..... }
/*
 * hlist_count - count the entries on one dentry hash chain.
 * @parent: parent dentry of the lookup; passed to d_hash() with the hash
 *          to select the bucket.
 * @name:   name being looked up; name->hash selects the bucket and
 *          name->name is printed when the chain is long.
 *
 * Walks the selected bucket under rcu_read_lock() and counts every node on
 * it. When the count exceeds COUNT_THRES, the bucket address, the count, the
 * name and the hash are logged with printk().
 *
 * Returns the number of entries found on the chain.
 */
static inline long hlist_count(const struct dentry *parent, const struct qstr *name)
{
	long count = 0;
	unsigned int hash = name->hash;
	struct hlist_bl_head *b = d_hash(parent, hash);
	struct hlist_bl_node *node;
	struct dentry *dentry;

	rcu_read_lock();
	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
		count++;
	}
	rcu_read_unlock();

	if (count > COUNT_THRES) {
		/*
		 * %s must be given the character pointer stored in the qstr;
		 * passing the qstr itself would print its hash/len bytes.
		 */
		printk("hlist_bl_head=%p,count=%ld,name=%s,hash=%u\n",
		       b, count, name->name, name->hash);
	}

	return count;
}
kprobe的结果如下:
[.] hlist_bl_head=ffffb0d7029ae3b0 count = ,name=ipv6/neigh/ens7f1_46/base_reachable_time_ms,hash= [.] hlist_bl_head=ffffb0d7029ae3b0 count = ,name=ipv6/neigh/ens7f0_51/retrans_time_ms,hash= [.] hlist_bl_head=ffffb0d7029ae3b0 count = ,name=ipv6/conf/ens7f0_51/forwarding,hash= [.] hlist_bl_head=ffffb0d7029ae3b0 count = ,name=ipv6/neigh/ens7f0_51/base_reachable_time_ms,hash=
static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; hash = hash + (hash >> D_HASHBITS); return dentry_hashtable + (hash & D_HASHMASK); } 高版本的内核是: static inline struct hlist_bl_head *d_hash(unsigned int hash) { return dentry_hashtable + (hash >> d_hash_shift); }
commit 8387ff2577eb9ed245df9a39947f66976c6bcd02 Author: Linus Torvalds
Date: Fri Jun 10 07:51:30 2016 -0700 vfs: make the string hashes salt the hash We always mixed in the parent pointer into the dentry name hash, but we did it late at lookup time. It turns out that we can simplify that lookup-time action by salting the hash with the parent pointer early instead of late.
问题分析到这里,有两个疑问如下:
crash> list dentry.d_hash -H 0xffff8a29269dc608 -s dentry.d_sb ffff89edf533d080 d_sb = 0xffff89db7fd3c800 ffff8a276fd1e3c0 d_sb = 0xffff89db7fd3c800 ffff8a2925bdaa80 d_sb = 0xffff89db7fd3c800 ffff89edf5382a80 d_sb = 0xffff89db7fd3c800 .....
crash> list super_block.s_list -H super_blocks -s super_block.s_id,s_nr_dentry_unused >/home/caq/super_block.txt # grep ffff89db7fd3c800 super_block.txt -A 2 ffff89db7fd3c800 s_id = "proc\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
... ffff89edf5375b00 d_inode = 0xffff8a291f11cfb0 ffff89edf06cb740 d_inode = 0xffff89edec668d10 ffff8a29218fa780 d_inode = 0xffff89edf0f75240 ffff89edf0f955c0 d_inode = 0xffff89edef9c7b40 ffff8a2769e70780 d_inode = 0xffff8a291c1c9750 ffff8a d_inode = 0xffff89edf332e1a0 ffff89edf5324b40 d_inode = 0xffff89edf ...
start_kernel-->proc_root_init()//caq:注册proc fs 由于proc是linux系统默认挂载的,所以查找 kern_mount_data 函数 pid_ns_prepare_proc-->kern_mount_data(&proc_fs_type, ns);//caq:挂载proc fs proc_sys_init-->proc_mkdir("sys", NULL);//caq:proc目录下创建sys目录 net_sysctl_init-->register_sysctl("net", empty);//caq:在/proc/sys下创建net 对于init_net: ipv6_sysctl_register-->register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); 对于其他net_namespace,一般是系统调用触发创建 ipv6_sysctl_net_init-->register_net_sysctl(net, "net/ipv6", ipv6_table);//创建ipv6
:Fri Mar 5 11:18:24 2021,runc:[1:CHILD],tid=.path=net/ipv6 0xffffffffb9ac66f0 : __register_sysctl_table+0x0/0x620 [kernel] 0xffffffffb9f4f7d2 : register_net_sysctl+0x12/0x20 [kernel] 0xffffffffb9f324c3 : ipv6_sysctl_net_init+0xc3/0x150 [kernel] 0xffffffffb9e2fe14 : ops_init+0x44/0x150 [kernel] 0xffffffffb9e2ffc3 : setup_net+0xa3/0x160 [kernel] 0xffffffffb9e30765 : copy_net_ns+0xb5/0x180 [kernel] 0xffffffffb98c8089 : create_new_namespaces+0xf9/0x180 [kernel] 0xffffffffb98c82ca : unshare_nsproxy_namespaces+0x5a/0xc0 [kernel] 0xffffffffb9897d83 : sys_unshare+0x173/0x2e0 [kernel] 0xffffffffb9f76ddb : system_call_fastpath+0x22/0x27 [kernel]
struct ctl_table_header *register_net_sysctl(struct net *net, const char *path, struct ctl_table *table) { return __register_sysctl_table(&net->sysctls, path, table); } struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, struct ctl_table *table) { ..... for (entry = table; entry->procname; entry++) nr_entries++;//caq:先计算该table下有多少个项 header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); .... node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table); .... /* Find the directory for the ctl_table */ for (name = path; name; name = nextname) { ....//caq:遍历查找到对应的路径 } spin_lock(&sysctl_lock); if (insert_header(dir, header))//caq:插入到管理结构中去 goto fail_put_dir_locked; .... }
static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { .... return !head || !sysctl_is_seen(head); } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set;//获取对应的set int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } //不是同一个 ctl_table_set 则不可见 static int is_seen(struct ctl_table_set *set) { return &current->nsproxy->net_ns->sysctls == set; }
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first;//caq:每次后面添加的时候,是加在链表头 if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); }
然后看 hash表里面的dentry,d_parent很多都指向 0xffff8a0a7739fd40 这个dentry。 crash> dentry.d_subdirs 0xffff8a0a7739fd40 ----查看这个父dentry有多少child d_subdirs = { next = 0xffff8a07a3c6f710, prev = 0xffff8a0a7739fe90 } crash> list 0xffff8a07a3c6f710 |wc -l ----------居然有159万个child
然后查看集群其他机器,也发现类似现象,截取的打印如下:
count=,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4 hlist_bl_head=ffffbd9d5a7a6cc0,count= count=,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 hlist_bl_head=ffffbd9d429a7498,count=
crash> dentry.d_parent,d_name.name,d_lockref.count,d_inode,d_subdirs ffff9bf500 d_parent = 0xffff9b d_name.name = 0xffff9bf538 "ipv6"-----这个是一个ipv6的dentry d_lockref.count = 1 d_inode = 0xffff9bba4a5e14c0 d_subdirs = { next = 0xffff9bf950, prev = 0xffff9bf950 } d_child偏移0x90,则0xffff9bf950减去0x90为 0xffff9bf8c0 crash> dentry 0xffff9bf8c0 struct dentry { ...... d_parent = 0xffff9bf500, d_name = { { { hash = , len = 4 }, hash_len = }, name = 0xffff9bf8f8 "conf"------名称为conf }, d_inode = 0xffff9bba4a5e61a0, d_iname = "conf\000bles_names\000\060\000.2\000\000pvs.(*Han", d_lockref = { ...... count = 1----------------引用计数为1,说明还有人引用 ...... }, ...... d_subdirs = { next = 0xffff9bfb90, prev = 0xffff9bfb90 }, ...... } 既然引用计数为1,则继续往下挖: crash> dentry.d_parent,d_lockref.count,d_name.name,d_subdirs 0xffff9bfb00 d_parent = 0xffff9bf8c0 d_lockref.count = 1 d_name.name = 0xffff9bfb38 "all" d_subdirs = { next = 0xffff9bef90, prev = 0xffff9bef90 } 再往下: crash> dentry.d_parent,d_lockref.count,d_name.name,d_subdirs,d_flags,d_inode -x 0xffff9bef00 d_parent = 0xffff9bfb00 d_lockref.count = 0x0-----------------------------挖到引用计数为0为止 d_name.name = 0xffff9bef38 "disable_ipv6" d_subdirs = { next = 0xffff9befa0, --------为空 prev = 0xffff9befa0 } d_flags = 0x40800ce-------------下面重点分析这个 d_inode = 0xffff9bba4a5e4fb0
#define DCACHE_FILE_TYPE 0x04000000 /* Other file type */ #define DCACHE_LRU_LIST 0x80000--------这个表示在lru上面 #define DCACHE_REFERENCED 0x0040 /* Recently used, don't discard. */ #define DCACHE_RCUACCESS 0x0080 /* Entry has ever been RCU-visible */ #define DCACHE_OP_COMPARE 0x0002 #define DCACHE_OP_REVALIDATE 0x0004 #define DCACHE_OP_DELETE 0x0008 以上各标志按位或:0x04000000 | 0x80000 | 0x0040 | 0x0080 | 0x0002 | 0x0004 | 0x0008 = 0x40800ce,与上面该 dentry 的 d_flags 一致。
static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { spin_lock(&dcache_lru_lock); dentry->d_flags |= DCACHE_LRU_LIST;//有这个标志说明在lru上 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++;//caq:放在s_dentry_lru是空闲的 dentry_stat.nr_unused++; spin_unlock(&dcache_lru_lock); } }
spin_lock(orig_sb_lock); list_for_each_entry(sb, orig_super_blocks, s_list) { if (memcmp(&(sb->s_id[0]),"proc",strlen("proc"))||\ memcmp(sb->s_type->name,"proc",strlen("proc"))||\ hlist_unhashed(&sb->s_instances)||\ (sb->s_nr_dentry_unused < NR_DENTRY_UNUSED_LEN) ) continue; sb->s_count++; spin_unlock(orig_sb_lock); printk("find proc sb=%p\n",sb); shrinker = &sb->s_shrink; count = shrinker_one(shrinker,&shrink,1000,1000); printk("shrinker_one count =%lu,sb=%p\n",count,sb); spin_lock(orig_sb_lock);//caq:再次持锁 if (sb_proc) __put_super(sb_proc); sb_proc = sb; } if(sb_proc){ __put_super(sb_proc); spin_unlock(orig_sb_lock); } else{ spin_unlock(orig_sb_lock); printk("can't find the special sb\n"); }
[.] hlist_bl_head=ffffbd9d5a7a6cc0,count=34686 [.] count=34686,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] IPVS: Creating netns size=2048 id= [.] hlist_bl_head=ffffbd9d429a7498,count=34686 [.] count=34686,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4 [.] hlist_bl_head=ffffbd9d5a7a6cc0,count=34687 [.] count=34687,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] hlist_bl_head=ffffbd9d429a7498,count=34687 [.] count=34687,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4 [.] find proc sb=ffff9b647fdd4000-----------------------开始释放 [.] shrinker_one count =,sb=ffff9b647fdd4000------释放结束
单独释放后:
[.] hlist_bl_head=ffffbd9d466aed58,count=101 [.] count=101,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] IPVS: Creating netns size=2048 id= [.] hlist_bl_head=ffffbd9d466aed58,count=102 [.] count=102,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] hlist_bl_head=ffffbd9d4e8af728,count=101 [.] count=101,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4 [.] IPVS: Creating netns size=2048 id= [.043860] hlist_bl_head=ffffbd9d466aed58,count=103 [.043863] count=103,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] hlist_bl_head=ffffbd9d4e8af728,count=102 [.] count=102,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4 [.] IPVS: Creating netns size=2048 id= [.] hlist_bl_head=ffffbd9d466aed58,count=104 [.] count=104,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4 [.] hlist_bl_head=ffffbd9d4e8af728,count=103
上面可以看出两个细节:
1、释放前,hlist也是在增长的,释放后,hlist还是在增长。
2、释放后,net的dentry变了,所以hashlist的位置变化了。
pid=16564,task=exe,par_pid=,task=dockerd,count=1958,d_name=net,d_len=3,name=ipv6/conf/all/disable_ipv6,hash=,len=4,hlist_bl_head=ffffbd9d429a7498 hlist_bl_head=ffffbd9d5a7a6cc0,count=1960 pid=16635,task=runc:[2:INIT],par_pid=16587,task=runc,count=1960,d_name=net,d_len=3,name=core/somaxconn,hash=,len=4,hlist_bl_head=ffffbd9d5a7a6cc0 hlist_bl_head=ffffbd9d429a7498,count=1959
三、故障复现
IPVS: Creating netns size=2048 id=
则有必要关注一下 dentry的缓存情况。
四、故障规避或解决
可能的解决方案是:
1、通过rcu的方式,读取 dentry_hashtable 的各个冲突链,大于一定程度,抛出告警。
2、通过一个proc参数,设置缓存的dentry的个数。
3、全局可以关注 /proc/sys/fs/dentry-state
5、注意与 negative-dentry-limit 的区别。
五、作者简介
Anqing
在oppo混合云负责linux内核及容器,虚拟机等虚拟化方面的工作
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/178650.html原文链接:https://javaforall.net
