Problem environment:
x86-64 architecture, kernel version: CentOS 7 3.10.0-693.el7.x86_64
The bt stack trace shows that CPU 2 is stuck in smp_call_function_many, which made the watchdog time out and trigger a panic and reboot:
crash> bt
PID: 12263 TASK: ffff8803bbebdee0 CPU: 2 COMMAND: "kworker/u8:0"
bt: page excluded: kernel virtual address: ffffffffffffffff type: "cpu_online_map"
#0 [ffff88043fd03cf8] machine_kexec at ffffffff8105c4cb
#1 [ffff88043fd03d58] __crash_kexec at ffffffff81104a32
#2 [ffff88043fd03e28] panic at ffffffff8169dc5f
#3 [ffff88043fd03ea8] watchdog_timer_fn at ffffffff8112f651
#4 [ffff88043fd03ee0] __hrtimer_run_queues at ffffffff810b4ae4
#5 [ffff88043fd03f38] hrtimer_interrupt at ffffffff810b507f
#6 [ffff88043fd03f80] local_apic_timer_interrupt at ffffffff81053895
#7 [ffff88043fd03f98] smp_apic_timer_interrupt at ffffffff816b76bd
#8 [ffff88043fd03fb0] apic_timer_interrupt at ffffffff816b5c1d
--- <IRQ stack> ---
#9 [ffff88000399b738] apic_timer_interrupt at ffffffff816b5c1d
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2 RSP: ffff88000399b7e8 RFLAGS: 00000202
RAX: 0000000000000003 RBX: 00000000000000fc RCX: ffff88043fd9adb8
RDX: 0000000000000003 RSI: 0000000000000004 RDI: 0000000000000000
RBP: ffff88000399b820 R8: ffff88017a1ee000 R9: ffffffff813227d9
R10: ffff88043fd19c80 R11: ffffea00000c2100 R12: 0000000000000292
R13: ffff88000399b798 R14: ffffea0010b1f842 R15: 0000000000000000
ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
#11 [ffff88000399b878] flush_tlb_page at ffffffff8106e864
Combining the disassembly of smp_call_function_many with its source code, we can infer that CPU 2 is spinning in the while loop inside csd_lock_wait:
crash> dis smp_call_function_many
...
0xffffffff810f99a0 <smp_call_function_many+512>: pause
0xffffffff810f99a2 <smp_call_function_many+514>: testb $0x1,0x20(%rcx)
0xffffffff810f99a6 <smp_call_function_many+518>: jne 0xffffffff810f99a0 <smp_call_function_many+512>
...
void smp_call_function_many(const struct cpumask *mask,
			    smp_call_func_t func, void *info, bool wait)
{
	...
	if (wait) {
		/* wait, in order, for each CPU that received this CPU's IPI to clear its csd flag */
		for_each_cpu(cpu, cfd->cpumask) {
			struct call_single_data *csd;

			csd = per_cpu_ptr(cfd->csd, cpu);
			/*
			 * Wait until CSD_FLAG_LOCK is cleared in the per-cpu csd's flags,
			 * which means the target CPU has handled the IPI callback sent by this CPU.
			 */
			csd_lock_wait(csd);
		}
	}
	....
}
enum {
	CSD_FLAG_LOCK = 0x01,
	CSD_FLAG_WAIT = 0x02,
};

static void csd_lock_wait(struct call_single_data *csd)
{
	while (csd->flags & CSD_FLAG_LOCK)
		cpu_relax();
}
After the CPU that received the IPI finishes running the callback, it clears the csd flags:
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
	....
	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
		csd->func(csd->info);	/* run the IPI callback */
		csd_unlock(csd);	/* clear the csd flags */
	}
	...
}

static void csd_unlock(struct call_single_data *csd)
{
	...
	csd->flags &= ~CSD_FLAG_LOCK;
}
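For completeness, the sender-side counterpart csd_lock (called by smp_call_function_many before queuing the csd, as shown further below) looks roughly like this in kernels of this generation (paraphrased from kernel/smp.c; details may differ slightly in the RHEL source). It waits for any previous user of the csd to finish and then sets CSD_FLAG_LOCK, which is exactly the bit csd_lock_wait later spins on:

static void csd_lock(struct call_single_data *csd)
{
	csd_lock_wait(csd);
	csd->flags |= CSD_FLAG_LOCK;

	/*
	 * Prevent the CPU from reordering the flags update with the
	 * subsequent writes to the other csd fields (func, info, llist).
	 */
	smp_mb();
}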
To confirm this inference, we need to find the address held by the struct call_single_data pointer csd:
void smp_call_function_many(const struct cpumask *mask,
			    smp_call_func_t func, void *info, bool wait)
{
	...
	cfd = &__get_cpu_var(cfd_data);	/* get the current CPU's cfd_data */
	...
	if (wait) {
		for_each_cpu(cpu, cfd->cpumask) {
			struct call_single_data *csd;

			csd = per_cpu_ptr(cfd->csd, cpu);	/* get the target CPU's csd from cfd */
			csd_lock_wait(csd);
		}
	}
}
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
The affected machine has four CPUs:
crash> p nr_cpu_ids
nr_cpu_ids = $2 = 4
crash>
crash> px cfd_data
PER-CPU DATA TYPE:
struct call_function_data cfd_data;
PER-CPU ADDRESSES:
[0]: ffff88043fc17840
[1]: ffff88043fc97840
[2]: ffff88043fd17840
[3]: ffff88043fd97840
crash>
The CPU blocked in smp_call_function_many is CPU 2:
crash> px cfd_data:2
per_cpu(cfd_data, 2) = $3 = {
csd = 0x1adb8,
cpumask = 0xffff88017a1ee000
}
crash>
In per_cpu(cfd_data, 2), cpumask points to a mask whose value is 10 (binary 1010), i.e. bit 3 and bit 1 are set, corresponding to CPU 3 and CPU 1.
crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>
crash> eval -b 10
hexadecimal: a
decimal: 10
octal: 12
binary: 0000000000000000000000000000000000000000000000000000000000001010
bits set: 3 1
crash>
The bits set in the cpumask of per_cpu(cfd_data, 2) indicate which CPUs this CPU sent IPIs to; in this vmcore, CPU 2 sent IPIs to CPU 1 and CPU 3:
void smp_call_function_many(const struct cpumask *mask,
			    smp_call_func_t func, void *info, bool wait)
{
	struct call_function_data *cfd;
	...
	cfd = &__get_cpu_var(cfd_data);			/* get this CPU's cfd_data */
	cpumask_and(cfd->cpumask, mask, cpu_online_mask);	/* cfd->cpumask = mask & cpu_online_mask */
	cpumask_clear_cpu(this_cpu, cfd->cpumask);	/* clear our own bit: no IPI to ourselves */
	...
	for_each_cpu(cpu, cfd->cpumask) {		/* initialize the per-cpu csd for each target CPU */
		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);

		csd_lock(csd);
		csd->func = func;			/* callback run by the CPU that receives the IPI */
		csd->info = info;
		/*
		 * The sender queues the csd onto the target CPU's per-cpu
		 * call_single_queue by adding csd->llist to that list.
		 */
		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
	}

	/* Send a message to all CPUs in the map */
	arch_send_call_function_ipi_mask(cfd->cpumask);	/* send IPIs to the CPUs whose bits are set in cpumask */

	if (wait) {
		/* wait, in order, for each CPU that received this CPU's IPI to clear its csd flag */
		for_each_cpu(cpu, cfd->cpumask) {
			struct call_single_data *csd;

			csd = per_cpu_ptr(cfd->csd, cpu);
			csd_lock_wait(csd);
		}
	}
}
Looking at the implementation of per_cpu_ptr: the csd address is this CPU's cfd->csd value plus __per_cpu_offset[cpu], where cpu is the number of the CPU receiving the IPI:
#ifndef __per_cpu_offset
extern unsigned long __per_cpu_offset[NR_CPUS];
/* fetch a CPU's per-cpu base address from __per_cpu_offset; x is the CPU number (CPU 1 and CPU 3 in this case) */
#define per_cpu_offset(x) (__per_cpu_offset[x])
#endif

#ifndef SHIFT_PERCPU_PTR
/* Weird cast keeps both GCC and sparse happy. */
#define SHIFT_PERCPU_PTR(__p, __offset) ({ \
	__verify_pcpu_ptr((__p)); \
	RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
})
#endif

#ifdef CONFIG_SMP
#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
#else
#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); VERIFY_PERCPU_PTR((ptr)); })
#endif
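Expanding these macros, per_cpu_ptr(cfd->csd, cpu) effectively boils down to the pointer arithmetic below (a simplified sketch of what RELOC_HIDE amounts to, not the literal macro expansion):

/* simplified: per-cpu pointer = the "offset-style" pointer stored in cfd->csd
 * plus the target CPU's per-cpu base address */
struct call_single_data *csd =
	(struct call_single_data *)((unsigned long)cfd->csd + __per_cpu_offset[cpu]);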
void smp_call_function_many(const struct cpumask *mask,
			    smp_call_func_t func, void *info, bool wait)
{
	if (wait) {
		for_each_cpu(cpu, cfd->cpumask) {
			struct call_single_data *csd;

			/* in this case cfd->csd is CPU 2's value (0x1adb8) and cpu is 1 and 3 */
			csd = per_cpu_ptr(cfd->csd, cpu);
			csd_lock_wait(csd);
		}
	}
}
On this four-core machine, only the first four entries of __per_cpu_offset are meaningful:
crash> px __per_cpu_offset
__per_cpu_offset = $7 =
{0xffff88043fc00000, 0xffff88043fc80000, 0xffff88043fd00000, 0xffff88043fd80000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000, 0xffffffff81b2c000,...,...
}
Alternatively, kmem -o shows the per-cpu base addresses directly:
crash> kmem -o
PER-CPU OFFSET VALUES:
CPU 0: ffff88043fc00000
CPU 1: ffff88043fc80000
CPU 2: ffff88043fd00000
CPU 3: ffff88043fd80000
crash>
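As a quick cross-check (plain arithmetic, not crash output): the PER-CPU ADDRESSES printed for cfd_data earlier are simply these per-cpu base addresses plus the static per-cpu offset of cfd_data (0x17840 on this kernel), e.g.
0xffff88043fd00000 + 0x17840 = 0xffff88043fd17840 (CPU 2)
0xffff88043fd80000 + 0x17840 = 0xffff88043fd97840 (CPU 3)
The csd addresses below are computed with exactly the same per_cpu arithmetic, using cfd->csd = 0x1adb8 as the offset.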
From the cpumask of per_cpu(cfd_data, 2) we know CPU 2 sent IPIs only to CPU 1 and CPU 3, and CPU 2's cfd->csd value is 0x1adb8:
crash> p cfd_data:2
per_cpu(cfd_data, 2) = $8 = {
csd = 0x1adb8,
cpumask = 0xffff88017a1ee000
}
crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash> eval -b 10
hexadecimal: a
decimal: 10
octal: 12
binary: 0000000000000000000000000000000000000000000000000000000000001010
bits set: 3 1
crash>
The per-cpu base addresses of CPU 1 and CPU 3 are ffff88043fc80000 and ffff88043fd80000 respectively, so the csd address for CPU 1 is 0xffff88043fc9adb8:
crash> px (0xffff88043fc80000+0x1adb8)
$11 = 0xffff88043fc9adb8
crash>
Its call_single_data.flags is 0, which means CPU 1 has already responded to and handled the IPI:
crash> struct call_single_data 0xffff88043fc9adb8
struct call_single_data {
{
llist = {
next = 0x0
},
__UNIQUE_ID_rh_kabi_hide4 = {
list = {
next = 0x0,
prev = 0x0
}
},
{<No data fields>}
},
func = 0xffffffff8106e4a0 <flush_tlb_func>,
info = 0xffff88000399b830,
flags = 0
}
crash>
The csd address for CPU 3 is 0xffff88043fd9adb8:
crash> px (0xffff88043fd80000+0x1adb8)
$12 = 0xffff88043fd9adb8
crash>
Its call_single_data.flags is 1, which means CPU 3 has not yet responded to and handled the IPI:
crash> struct call_single_data 0xffff88043fd9adb8
struct call_single_data {
{
llist = {
next = 0xffff88043fd979c0
},
__UNIQUE_ID_rh_kabi_hide4 = {
list = {
next = 0xffff88043fd979c0,
prev = 0x0
}
},
{<No data fields>}
},
func = 0xffffffff8106e4a0 <flush_tlb_func>,
info = 0xffff88000399b830,
flags = 1
}
crash>
There is actually a more convenient way to find out which csd's flags CPU 2 was waiting to see cleared at the time of the crash.
The stack trace shows that RIP at the time of the crash was smp_call_function_many+514:
crash> bt
PID: 12263 TASK: ffff8803bbebdee0 CPU: 2 COMMAND: "kworker/u8:0"
...
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2 RSP: ffff88000399b7e8 RFLAGS: 00000202
RAX: 0000000000000003 RBX: 00000000000000fc RCX: ffff88043fd9adb8
The corresponding disassembly is:
crash> dis smp_call_function_many
...
...
0xffffffff810f99a0 <smp_call_function_many+512>: pause
// call_single_data.flags is 1, so ANDing the two testb operands gives 1 and ZF (bit 6 of RFLAGS) is 0;
// the jne below is therefore taken, and the CPU keeps bouncing between the jne and the pause.
0xffffffff810f99a2 <smp_call_function_many+514>: testb $0x1,0x20(%rcx)
0xffffffff810f99a6 <smp_call_function_many+518>: jne 0xffffffff810f99a0 <smp_call_function_many+512>
....
...
crash> eval -b 0x202
hexadecimal: 202
decimal: 514
octal: 1002
binary: 0000000000000000000000000000000000000000000000000000001000000010
bits set: 9 1   // bit 6 (ZF) is 0
crash>
The instructions from smp_call_function_many+512 through +518 correspond to the C source of csd_lock_wait:
static void csd_lock_wait(struct call_single_data *csd)
{
	while (csd->flags & CSD_FLAG_LOCK)
		cpu_relax();
}
The flags field sits at offset 0x20 within struct call_single_data:
crash> struct call_single_data.flags -xo
struct call_single_data {
[0x20] u16 flags;
}
crash>
The instruction testb $0x1,0x20(%rcx) is exactly while (csd->flags & CSD_FLAG_LOCK), so the value in RCX, ffff88043fd9adb8, is the address of the struct call_single_data being waited on. That is CPU 3's csd address, so CPU 2 is waiting for CPU 3 to respond to the IPI.
Let's first look at what a CPU does when it receives an IPI:
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
	...
	/* walk the csd entries on this CPU's call_single_queue, i.e. handle every IPI request sent to this CPU */
	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
		csd->func(csd->info);	/* IPI callback; in this case flush_tlb_func */
		csd_unlock(csd);	/* clear the csd flags once the callback is done */
	}
	...
}
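flush_smp_call_function_queue is reached from the handler of the function-call IPI vector; in upstream kernels of this generation the entry point is roughly the following (a sketch for orientation; the RHEL 7 source may differ in detail):

void generic_smp_call_function_single_interrupt(void)
{
	/* drain this CPU's call_single_queue, running each queued callback */
	flush_smp_call_function_queue(true);
}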
From the earlier analysis we know CPU 2 sent IPI requests to CPU 1 and CPU 3, and the cpu_tlbstate.state values of both CPU 1 and CPU 3 are 2:
crash> p cpu_tlbstate
PER-CPU DATA TYPE:
struct tlb_state cpu_tlbstate;
PER-CPU ADDRESSES:
[0]: ffff88043fc16500
[1]: ffff88043fc96500
[2]: ffff88043fd16500
[3]: ffff88043fd96500
crash> struct tlb_state.state ffff88043fc96500
state = 2
crash> struct tlb_state.state ffff88043fd96500
state = 2
crash>
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
static void flush_tlb_func(void *info)
{
	struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
		return;

	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
		if (f->flush_end == TLB_FLUSH_ALL)
			local_flush_tlb();
		else if (!f->flush_end)
			__flush_tlb_single(f->flush_start);
		else {
			unsigned long addr;
			addr = f->flush_start;
			while (addr < f->flush_end) {
				__flush_tlb_single(addr);
				addr += PAGE_SIZE;
			}
		}
	} else
		leave_mm(smp_processor_id());	/* cpu_tlbstate.state is 2, so this branch is taken */
}
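For reference, the info pointer stored in the csd (0xffff88000399b830, which lives on CPU 2's stack) points at a flush_tlb_info, which in this kernel generation looks roughly like this (from arch/x86/mm/tlb.c of the same era; the field names match the f->flush_mm / f->flush_start / f->flush_end accesses above):

struct flush_tlb_info {
	struct mm_struct *flush_mm;
	unsigned long flush_start;
	unsigned long flush_end;
};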
When cpu_tlbstate.state is not TLBSTATE_OK, flush_tlb_func calls leave_mm(smp_processor_id()). Since CPU 2 sent the IPIs to CPU 1 and CPU 3, smp_processor_id() here is 1 and 3, and the corresponding tlb_state.active_mm values are 0xffff88042ce99900 and 0xffff88042cfb0640:
crash> struct tlb_state.active_mm ffff88043fc96500
active_mm = 0xffff88042ce99900
crash> struct tlb_state.active_mm ffff88043fd96500
active_mm = 0xffff88042cfb0640
crash>
void leave_mm(int cpu)
{
	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);

	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();

	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
		load_cr3(swapper_pg_dir);
	}
}
So after handling the callback, a CPU that received the IPI clears its own bit in cpu_vm_mask_var of cpu_tlbstate.active_mm. At crash time, bit 3 of cpu_vm_mask_var in CPU 3's cpu_tlbstate.active_mm is still set:
crash> struct mm_struct.cpu_vm_mask_var 0xffff88042cfb0640
cpu_vm_mask_var = 0xffff88042cfb0988
crash> rd 0xffff88042cfb0988
ffff88042cfb0988: 0000000000000008 ........
crash>
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
return mm->cpu_vm_mask_var;
}
#define cpumask_bits(maskp) ((maskp)->bits)
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @dstp: the cpumask pointer
*/
static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
Now let's go back to the call stack of CPU 2, the CPU that triggered the panic:
crash> bt
PID: 12263 TASK: ffff8803bbebdee0 CPU: 2 COMMAND: "kworker/u8:0"
....
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2 RSP: ffff88000399b7e8 RFLAGS: 00000202
RAX: 0000000000000003 RBX: 00000000000000fc RCX: ffff88043fd9adb8
RDX: 0000000000000003 RSI: 0000000000000004 RDI: 0000000000000000
RBP: ffff88000399b820 R8: ffff88017a1ee000 R9: ffffffff813227d9
R10: ffff88043fd19c80 R11: ffffea00000c2100 R12: 0000000000000292
R13: ffff88000399b798 R14: ffffea0010b1f842 R15: 0000000000000000
ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
#11 [ffff88000399b878] flush_tlb_page at ffffffff8106e864
#12 [ffff88000399b898] ptep_clear_flush at ffffffff811c2524
#13 [ffff88000399b8d0] page_mkclean at ffffffff811bbf2e
....
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
	struct mm_struct *mm = vma->vm_mm;
	...
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
	...
}

static inline void flush_tlb_others(const struct cpumask *cpumask,
				    struct mm_struct *mm,
				    unsigned long start,
				    unsigned long end)
{
	PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     struct mm_struct *mm, unsigned long start,
			     unsigned long end)
{
	...
	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
Let's find the value of smp_call_function_many's first argument, the cpumask pointer:
crash> whatis smp_call_function_many
void smp_call_function_many(const struct cpumask *, smp_call_func_t, void *, bool);
crash>
crash> dis -r ffffffff8106e668
...
0xffffffff8106e654 <native_flush_tlb_others+164>: mov $0x1,%ecx
0xffffffff8106e659 <native_flush_tlb_others+169>: mov $0xffffffff8106e4a0,%rsi
0xffffffff8106e660 <native_flush_tlb_others+176>: mov %r14,%rdi   // r14 and rdi both hold smp_call_function_many's first argument
0xffffffff8106e663 <native_flush_tlb_others+179>: callq 0xffffffff810f97a0 <smp_call_function_many>
0xffffffff8106e668 <native_flush_tlb_others+184>: jmp 0xffffffff8106e62a <native_flush_tlb_others+122>
crash> dis smp_call_function_many
0xffffffff810f97a0 <smp_call_function_many>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff810f97a5 <smp_call_function_many+5>: push %rbp
0xffffffff810f97a6 <smp_call_function_many+6>: mov %rsp,%rbp
0xffffffff810f97a9 <smp_call_function_many+9>: push %r15
0xffffffff810f97ab <smp_call_function_many+11>: push %r14   // save r14, which at this point still holds smp_call_function_many's first argument
0xffffffff810f97ad <smp_call_function_many+13>: mov %rdx,%r14
0xffffffff810f97b0 <smp_call_function_many+16>: push %r13
0xffffffff810f97b2 <smp_call_function_many+18>: mov %rsi,%r13
0xffffffff810f97b5 <smp_call_function_many+21>: push %r12
0xffffffff810f97b7 <smp_call_function_many+23>: push %rbx
crash> bt -f
....
[exception RIP: smp_call_function_many+514]
RIP: ffffffff810f99a2 RSP: ffff88000399b7e8 RFLAGS: 00000202
RAX: 0000000000000003 RBX: 00000000000000fc RCX: ffff88043fd9adb8
RDX: 0000000000000003 RSI: 0000000000000004 RDI: 0000000000000000
RBP: ffff88000399b820 R8: ffff88017a1ee000 R9: ffffffff813227d9
R10: ffff88043fd19c80 R11: ffffea00000c2100 R12: 0000000000000292
R13: ffff88000399b798 R14: ffffea0010b1f842 R15: 0000000000000000
ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018
ffff88000399b740: ffffea0010b1f842 ffff88000399b798
ffff88000399b750: 0000000000000292 ffff88000399b820
ffff88000399b760: 00000000000000fc ffffea00000c2100
ffff88000399b770: ffff88043fd19c80 ffffffff813227d9
ffff88000399b780: ffff88017a1ee000 0000000000000003
ffff88000399b790: ffff88043fd9adb8 0000000000000003
ffff88000399b7a0: 0000000000000004 0000000000000000
ffff88000399b7b0: ffffffffffffff10 ffffffff810f99a2
ffff88000399b7c0: 0000000000000010 0000000000000202
ffff88000399b7d0: ffff88000399b7e8 0000000000000018
ffff88000399b7e0: ffffffff810f9977 000000012c7e1370
ffff88000399b7f0: ffff88042cfb0988 ffff88042cfb0640
ffff88000399b800: 00007f4474b04000 0000000000000000
ffff88000399b810: ffff88042cfb0988 ffff880114338820
ffff88000399b820: ffff88000399b870 ffffffff8106e668
#10 [ffff88000399b828] native_flush_tlb_others at ffffffff8106e668
....
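Putting the prologue and the raw stack dump together, the relevant slots of CPU 2's frame are the following (the annotations are mine, derived from the push order shown above and from RBP = ffff88000399b820 in the register dump):

ffff88000399b828: ffffffff8106e668   return address into native_flush_tlb_others
ffff88000399b820: ffff88000399b870   saved %rbp of the caller           <- RBP at the exception
ffff88000399b818: ffff880114338820   saved %r15 (caller's value)
ffff88000399b810: ffff88042cfb0988   saved %r14 = argument 1 (the cpumask pointer)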
So the saved %r14 slot at %rbp - 0x10 = ffff88000399b810 gives us smp_call_function_many's first argument: 0xffff88042cfb0988.
crash> struct cpumask 0xffff88042cfb0988
struct cpumask {
bits = {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>
The cpumask address passed as the first argument to smp_call_function_many in CPU 2's call stack is in fact the very same address as the cpu_vm_mask_var in which CPU 3 should have cleared its bit after handling the IPI request:
crash> struct mm_struct.cpu_vm_mask_var 0xffff88042cfb0640
cpu_vm_mask_var = 0xffff88042cfb0988
crash>
So in practice we only need to find the cpumask that CPU 2 passed as the first argument to smp_call_function_many to know which CPU it is waiting on.
The value at that first-argument cpumask address tells us which CPU(s) have not yet responded to, or are still processing, the IPI callback, i.e. which CPU the watchdog-triggering CPU is waiting for. The cpumask in cfd_data:x is derived from that first argument, but in this scenario a CPU that handles the IPI request clears its own bit in the caller's cpumask (the first argument) once it is done. So cfd_data:x's cpumask records which CPUs this CPU sent IPIs to before the crash, while the value at the first-argument cpumask address records which CPU(s) this CPU is still waiting on to respond to or finish the IPI request. For this problem:
CPU 2 sent IPI requests to CPU 1 and CPU 3:
crash> p cfd_data:2
per_cpu(cfd_data, 2) = $15 = {
csd = 0x1adb8,
cpumask = 0xffff88017a1ee000
}
crash> struct cpumask 0xffff88017a1ee000
struct cpumask {
bits = {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>
CPU 2 is waiting for CPU 3 to finish the IPI request; CPU 1 has already finished the request CPU 2 sent it, so bit 1 has been cleared:
crash> struct cpumask 0xffff88042cfb0988
struct cpumask {
bits = {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
crash>
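To make the bit bookkeeping concrete, here is a minimal user-space sketch (ordinary C, not kernel code, and ignoring any other bits the real mm cpumask might carry) of how the two values above relate:

#include <stdio.h>

int main(void)
{
	unsigned long mask = (1UL << 1) | (1UL << 3);	/* CPUs 1 and 3 -> binary 1010 = 10 */

	mask &= ~(1UL << 1);	/* CPU 1 handled the IPI and cleared its bit in leave_mm() */
	printf("%lu\n", mask);	/* prints 8: only CPU 3's bit (binary 1000) remains */

	mask &= ~(1UL << 3);	/* this is what CPU 3 never got around to doing */
	printf("%lu\n", mask);	/* would print 0 once CPU 3 responded */
	return 0;
}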
Now let's see how many unhandled IPI requests have piled up on CPU 3:
crash> p call_single_queue:3
per_cpu(call_single_queue, 3) = $19 = {
first = 0xffff88043fd9adb8
}
crash> list 0xffff88043fd9adb8
ffff88043fd9adb8   // the IPI request sent by CPU 2
ffff88043fd979c0
crash>
Why isn't CPU 3 handling its IPI requests?
At this point CPU 3 is running the idle swapper task:
crash> bt -c 3
PID: 0 TASK: ffff88017a203f40 CPU: 3 COMMAND: "swapper/3"
bt: page excluded: kernel virtual address: ffffffffffffffff type: "cpu_online_map"
#0 [ffff88043fd85e48] crash_nmi_callback at ffffffff8104fd61
#1 [ffff88043fd85e58] nmi_handle at ffffffff816ad427
#2 [ffff88043fd85eb0] do_nmi at ffffffff816ad65d
#3 [ffff88043fd85ef0] end_repeat_nmi at ffffffff816ac8d3
[exception RIP: native_safe_halt+6]
RIP: ffffffff816ab4a6 RSP: ffff88017a23bea8 RFLAGS: 00000286
RAX: 00000000ffffffed RBX: ffffffff81b1c820 RCX: 0100000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000046
RBP: ffff88017a23bea8 R8: 0000000000000000 R9: 0000000000000000
R10: 0000000000000000 R11: 0004ef1ee033ca80 R12: 0000000000000003
R13: ffff88017a238000 R14: ffff88017a238000 R15: ffff88017a238000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#4 [ffff88017a23bea8] native_safe_halt at ffffffff816ab4a6
#5 [ffff88017a23beb0] default_idle at ffffffff816ab33e
#6 [ffff88017a23bed0] arch_cpu_idle at ffffffff81035006
#7 [ffff88017a23bee0] cpu_startup_entry at ffffffff810e7bca
#8 [ffff88017a23bf28] start_secondary at ffffffff81051af6
crash>
Yet CPU 3's run queue does have tasks waiting for CPU time:
crash> runq
...
CPU 3 RUNQUEUE: ffff88043fd96cc0
CURRENT: PID: 0 TASK: ffff88017a203f40 COMMAND: "swapper/3"
RT PRIO_ARRAY: ffff88043fd96e50
[no tasks queued]
CFS RB_ROOT: ffff88043fd96d68
[120] PID: 30118 TASK: ffff880012ab8000 COMMAND: "barad_agent"
[120] PID: 30121 TASK: ffff8800368dbf40 COMMAND: "java"
crash>
CPU 3 is not deadlocked, yet it neither responds to the IPI requests nor schedules the runnable tasks in its run queue. Since this machine is a KVM guest, we suspected that a problem on the host was preventing the guest's vCPU 3 from getting physical CPU time. Checking the host's logs confirmed this: a disk failure had put the guest's qemu process into the D (uninterruptible sleep) state.
Original statement: This article is published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.
In case of infringement, please contact cloudcommunity@tencent.com for removal.