Linux syscall过程分析（万字长文）

秃头哥编程

发布于 2019-08-23 14:31:01

14.9K10

代码可运行

文章被收录于专栏：秃头哥编程秃头哥编程

运行总次数：0

代码可运行

文章有点长，慢慢看。

一、背景

为了安全，Linux 中分为用户态和内核态两种运行状态。对于普通进程，平时都是运行在用户态下，仅拥有基本的运行能力。当进行一些敏感操作，比如说要打开文件(open)然后进行写入(write)、分配内存(malloc)时，就会切换到内核态。内核态进行相应的检查，如果通过了，则按照进程的要求执行相应的操作，分配相应的资源。这种机制被称为系统调用，用户态进程发起调用，切换到内核态，内核态完成，返回用户态继续执行，是用户态唯一主动切换到内核态的合法手段(exception 和 interrupt 是被动切换)。

关于系统调用的详细定义可以通过 man syscalls 查看，它列出了目前 Linux Kernel 提供的系统调用 ABI 。我们熟悉的调用比如 open， read ，close 之类的都属于系统调用，但它们都经过了 C 库 (glibc)的封装。实际上，只要符合 ABI 规范，我们可以自己用汇编代码来进行调用。

历史上，x86 的系统调用实现经历了 int / iret 到 sysenter / sysexit 再到 syscall / sysret 的演变。

以下的分析基于 Linux kernel 4.9.76 ，glibc 为 2.25.90。

二、int / iret

很久很久以前，我们通过 int 0x80 进行系统调用(open)：

mov 0x05 ,eax       /* 设置系统调用号 */
int 0x80

在 arch/x86/kernel/traps.c 的 trap_init 中，定义了各种 set_intr_gate / set_intr_gate_ist / set_system_intr_gate 。其中 set_system_intr_gate 用于在中断描述符表(IDT)上设置系统调用门：

#ifdef CONFIG_X86_32
   set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
   set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

根据 arch/x86/include/asm/irq_vectors.h， IA32_SYSCALL_VECTOR 值为 0x80。

于是在调用 int 0x80 后，硬件根据向量号在 IDT 中找到对应的表项，即中断描述符，进行特权级检查，发现 DPL = CPL = 3 ，允许调用。然后硬件将切换到内核栈 (tss.ss0 : tss.esp0)。接着根据中断描述符的 segment selector 在 GDT / LDT 中找到对应的段描述符，从段描述符拿到段的基址，加载到 cs 。将 offset 加载到 eip。最后硬件将 ss / sp / eflags / cs / ip / error code 依次压到内核栈。

于是从 entry_INT80_32 开始执行，其定义在 arch/x86/entry/entry_32.S ：

ENTRY(entry_INT80_32)
    ASM_CLAC
    pushl   %eax            /* pt_regs->orig_ax */
    SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */

    /*
     * User mode is traced as though IRQs are on, and the interrupt gate
     * turned them off.
     */
    TRACE_IRQS_OFF

    movl    %esp, %eax
    call    do_int80_syscall_32
...

它将存在 eax 中的系统调用号压入栈中，然后调用 SAVE_ALL 将其他寄存器的值压入栈中进行保存：

/*
 * SAVE_ALL - push the segment and general-purpose registers, then reload
 * the data segment registers.
 *
 * pt_regs_ax: operand pushed in the ax slot (defaults to %eax); the
 *             syscall entry paths shown in this article pass $-ENOSYS.
 */
.macro SAVE_ALL pt_regs_ax=%eax
    cld
    PUSH_GS
    pushl   %fs
    pushl   %es
    pushl   %ds
    pushl   \pt_regs_ax
    pushl   %ebp
    pushl   %edi
    pushl   %esi
    pushl   %edx
    pushl   %ecx
    pushl   %ebx
    /* %ds and %es are loaded with __USER_DS, %fs with __KERNEL_PERCPU. */
    movl    $(__USER_DS), %edx
    movl    %edx, %ds
    movl    %edx, %es
    movl    $(__KERNEL_PERCPU), %edx
    movl    %edx, %fs
    SET_KERNEL_GS %edx
.endm

保存完毕后，通过 TRACE_IRQS_OFF 告知跟踪机制中断处于关闭状态(中断本身在经过中断门进入时已被关闭)，再将当前栈指针保存到 eax ，调用 do_int80_syscall_32 => do_syscall_32_irqs_on ，该函数在 arch/x86/entry/common.c 中定义：

/*
 * Dispatch the 32-bit system call described by @regs.
 *
 * The syscall number is taken from regs->orig_ax (and replaced by the
 * return value of syscall_trace_enter() when any _TIF_WORK_SYSCALL_ENTRY
 * flag is set).  If the number is below IA32_NR_syscalls, the matching
 * ia32_sys_call_table entry is called with bx, cx, dx, si, di and bp as
 * its six arguments and the result is stored in regs->ax.
 * syscall_return_slowpath() is called in every case.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
    struct thread_info *ti = current_thread_info();
    unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
    current->thread.status |= TS_COMPAT;
#endif

    if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
        /*
         * Subtlety here: if ptrace pokes something larger than
         * 2^32-1 into orig_ax, this truncates it.  This may or
         * may not be necessary, but it matches the old asm
         * behavior.
         */
        nr = syscall_trace_enter(regs);
    }

    /* An out-of-range number skips the call and leaves regs->ax untouched. */
    if (likely(nr < IA32_NR_syscalls)) {
        /*
         * It's possible that a 32-bit syscall implementation
         * takes a 64-bit parameter but nonetheless assumes that
         * the high bits are zero.  Make sure we zero-extend all
         * of the args.
         */
        regs->ax = ia32_sys_call_table[nr](
            (unsigned int)regs->bx, (unsigned int)regs->cx,
            (unsigned int)regs->dx, (unsigned int)regs->si,
            (unsigned int)regs->di, (unsigned int)regs->bp);
    }

    syscall_return_slowpath(regs);
}

这个函数的参数 regs(struct pt_regs 定义见 arch/x86/include/asm/ptrace.h )就是先前在 entry_INT80_32 依次被压入栈的寄存器值。这里先取出系统调用号，从系统调用表(ia32_sys_call_table) 中取出对应的处理函数，然后通过先前寄存器中的参数调用之。

系统调用表 ia32_sys_call_table 在 arch/x86/entry/syscall_32.c 中定义，但内容有点奇怪，看上去表的内容是 include 进来的：

/* System call table for i386. */

#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
#include <asm/syscall.h>

/* First pass: every __SYSCALL_I386() line in the header becomes an extern prototype. */
#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386

/* Second pass: the same lines become "[nr] = sym," designated initializers. */
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,

extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);

/*
 * The 32-bit system call table, indexed by syscall number.  Every slot is
 * first set to sys_ni_syscall; the entries pulled in by the #include below
 * then replace the slots they name.
 */
__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
    /*
     * Smells like a compiler bug -- it doesn't work
     * when the & below is removed.
     */
    [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};

然而我们到源码的 arch/x86/include/asm 目录下却找不到 syscalls_32.h ，但在编译 kernel 后的 arch/x86/include/generated/asm 里面发现了它：

__SYSCALL_I386(0, sys_restart_syscall, )
__SYSCALL_I386(1, sys_exit, )
#ifdef CONFIG_X86_32
__SYSCALL_I386(2, sys_fork, )
#else
__SYSCALL_I386(2, sys_fork, )
#endif
__SYSCALL_I386(3, sys_read, )
__SYSCALL_I386(4, sys_write, )
#ifdef CONFIG_X86_32
__SYSCALL_I386(5, sys_open, )
#else
__SYSCALL_I386(5, compat_sys_open, )
...

这说明 syscalls_32.h 是在编译过程中动态生成的，请看脚本 arch/x86/entry/syscalls/syscalltbl.sh，它读取了同目录下的 syscall_32.tbl ，为每一有效行都生成了 __SYSCALL_${abi}($nr, $real_entry, $qualifier) 结构。然后在宏 __SYSCALL_I386 的作用下形成了这样的定义：

__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
   [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,

   [0] = sys_restart_syscall,
   [1] = sys_exit,
   [2] = sys_fork,
   [3] = sys_read,
   [4] = sys_write,
   [5] = sys_open,
   ...
};

根据 GCC 文档，这种按下标初始化的写法即 ISO C99 中定义的指定初始化器(designated initializer)，而 [0 ... N] 这样的范围写法则是 GNU 扩展；个人称之为数组的乱序初始化。

因为我们的调用号是 0x05 ，所以这里调用了 sys_open ，它在 fs/open.c 中定义：

/*
 * open(2) entry point: ORs O_LARGEFILE into @flags when
 * force_o_largefile() is true, then hands off to do_sys_open() with
 * AT_FDCWD as the directory fd.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

宏 SYSCALL_DEFINE3 及相关定义如下：

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)                \
        SYSCALL_METADATA(sname, x, __VA_ARGS__)       \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...)                                 \
        asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
                __attribute__((alias(__stringify(SyS##name))));         \
                                                                        \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
                                                                        \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
                                                                        \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
        {                                                               \
                long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
                __MAP(x,__SC_TEST,__VA_ARGS__);                         \
                __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
                return ret;                                             \
        }                                                               \
                                                                        \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

SYSCALL_METADATA 保存了调用的基本信息，供调试程序跟踪使用( kernel 需开启 CONFIG_FTRACE_SYSCALLS )。

而 __SYSCALL_DEFINEx 用于拼接函数，函数名被拼接为 sys##_##open，参数也通过 __SC_DECL 拼接，最终得到展开后的定义：

/*
 * Simplified view of what SYSCALL_DEFINE3(open, ...) expands to: the
 * sys_open entry that the system call table refers to.
 */
asmlinkage long sys_open(const char __user * filename, int flags, umode_t mode)
{
    /* Add O_LARGEFILE to the flags when force_o_largefile() says so. */
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

sys_open 是对 do_sys_open 的封装：

/*
 * Open @filename (a user-space pointer) relative to @dfd.
 *
 * Returns the new file descriptor on success; otherwise returns the value
 * produced by the step that failed (build_open_flags(), getname(),
 * get_unused_fd_flags() or do_filp_open()).
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
    struct open_flags op;
    /* fd doubles as the status of build_open_flags(): non-zero is returned as-is. */
    int fd = build_open_flags(flags, mode, &op);
    struct filename *tmp;

    if (fd)
        return fd;

    tmp = getname(filename);
    if (IS_ERR(tmp))
        return PTR_ERR(tmp);

    fd = get_unused_fd_flags(flags);
    if (fd >= 0) {
        struct file *f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
            /* Open failed: give the reserved descriptor back and report the error. */
            put_unused_fd(fd);
            fd = PTR_ERR(f);
        } else {
            fsnotify_open(f);
            /* Bind the descriptor number to the opened file. */
            fd_install(fd, f);
        }
    }
    /* tmp is released on both the success and the failure path. */
    putname(tmp);
    return fd;
}

getname 将处于用户态的文件名拷到内核态，然后通过 get_unused_fd_flags 获取一个没用过的文件描述符，然后 do_filp_open 创建 struct file ， fd_install 将 fd 和 struct file 绑定(task_struct->files->fdt[fd] = file)，然后返回 fd。

fd一直返回到 do_syscall_32_irqs_on ，被设置到 regs->ax (eax) 中。接着返回 entry_INT80_32 继续执行，最后执行 INTERRUPT_RETURN 。INTERRUPT_RETURN 在 arch/x86/include/asm/irqflags.h 中定义为 iret ，负责恢复先前压栈的寄存器，返回用户态。系统调用执行完毕。

在目前主流的系统调用库(glibc) 中，int 0x80 只有在硬件不支持快速系统调用(sysenter / syscall)的时候才会调用，但目前的硬件都支持快速系统调用，所以为了能够看看 int 0x80 的效果，我们手撸汇编：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Demo: call open(2) and read(2) directly through the legacy "int $0x80"
 * gate instead of going through glibc.
 *
 * Prints the value returned by the last system call (the byte count on
 * success, a negative errno value on failure) followed by the data read.
 * Returns EXIT_SUCCESS when the read succeeded, EXIT_FAILURE otherwise.
 */
int main(void){
    enum { READ_LEN = 80 };   /* must match the $80 loaded into edx below */
    const char * filename = "/tmp/test";
    /*
     * One extra zeroed byte keeps the buffer NUL-terminated for printf("%s")
     * even when read(2) fills all READ_LEN bytes.
     */
    char * buffer = calloc(READ_LEN + 1, 1);
    int count;

    if (buffer == NULL) {
        perror("calloc");
        return EXIT_FAILURE;
    }

    /*
     * i386 ABI: eax = syscall number, ebx/ecx/edx = first three arguments,
     * result comes back in eax.  If open fails, its negative errno is passed
     * on as the fd, so read fails as well and count ends up negative.
     */
    __asm__ __volatile__("movl $0x5, %%eax\n\t"     /* __NR_open */
                         "movl %1, %%ebx\n\t"       /* filename */
                         "movl $0, %%ecx\n\t"       /* flags = O_RDONLY */
                         "movl $0664, %%edx\n\t"    /* mode */
                         "int $0x80\n\t"
                         "movl %%eax, %%ebx\n\t"    /* fd returned by open */
                         "movl $0x3, %%eax\n\t"     /* __NR_read */
                         "movl %2, %%ecx\n\t"       /* buffer */
                         "movl $80, %%edx\n\t"      /* READ_LEN */
                         "int $0x80\n\t"
                         "movl %%eax, %0\n\t"
                         :"=m"(count)
                         :"g"(filename), "g"(buffer)
                         /*
                          * "memory": the kernel stores into *buffer behind the
                          * compiler's back; "cc": the sequence may change flags.
                          */
                         :"%eax", "%ebx", "%ecx", "%edx", "memory", "cc");
    printf("%d\n", count);
    printf("%s\n", buffer);
    free(buffer);
    return count < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}

这段代码首先通过 int 0x80 调用系统调用 open 得到 fd (由 eax 返回)，再作为 read 的参数传入，从而读出了文件中的内容。但比较奇怪的是如果 buffer 存储在栈中 (buffer[80])，则调用 read 失败。只有将 buffer 作为全局变量或存储在堆中，才能调用成功。希望有知道的大大指点一下。

三、sysenter / sysexit

接下来介绍的是 32位下 Intel 提出的快速系统调用 sysenter/sysexit，它和同期AMD的 syscall/sysret 机制类似。

之所以提出新指令，是因为通过软中断来实现系统调用实在太慢了。于是 Intel x86 CPU 自 Pentium II（Family 6, Model 3, Stepping 3）之后，开始支持新的系统调用指令 sysenter/sysexit。前者用于从低特权级切换到 ring 0，后者用于从ring 0 切换到低特权级。没有特权级别检查(CPL, DPL)，也没有压栈的操作，快最重要！

在 Intel SDM 中阐述了sysenter指令。首先 CPU 有一堆特殊的寄存器，名为 Model-Specific Register(MSR)，这些寄存器在操作系统运行过程中起着重要作用。对于这些寄存器，需要采用专门的指令 RDMSR 和 WRMSR 进行读写。

sysenter 用到了以下 MSR (定义在 arch/x86/include/asm/msr-index.h)：

IA32_SYSENTER_CS(174H)：存放内核态处理代码的段选择符
IA32_SYSENTER_ESP(175H)：存放内核态栈顶偏移量
IA32_SYSENTER_EIP(176H)：存放内核态处理代码偏移量

当执行 sysenter 时，执行以下操作：

清除 FLAGS 的 VM 标志，确保在保护模式下运行
清除 FLAGS 的 IF 标志，屏蔽中断
加载 IA32_SYSENTER_ESP 的值到 esp
加载 IA32_SYSENTER_EIP 的值到 eip
加载 IA32_SYSENTER_CS 的值到 cs
将 IA32_SYSENTER_CS + 8 的值加载到 ss 。因为在GDT中， ss 就跟在 cs 后面
开始执行(cs:eip)指向的代码

这些 MSR 在 arch/x86/kernel/cpu/common.c 的 enable_sep_cpu 中初始化：

/*
 * Program the three SYSENTER MSRs (CS, ESP, EIP).
 * Does nothing when the boot CPU lacks X86_FEATURE_SEP.
 */
void enable_sep_cpu(void)
{
    struct tss_struct *tss;
    int cpu;

    if (!boot_cpu_has(X86_FEATURE_SEP))
        return;

    cpu = get_cpu();
    tss = &per_cpu(cpu_tss, cpu);

    /*
     * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
     * see the big comment in struct x86_hw_tss's definition.
     */

    tss->x86_tss.ss1 = __KERNEL_CS;
    wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);

    /* Stack pointer loaded by SYSENTER: the end of this TSS's SYSENTER_stack. */
    wrmsr(MSR_IA32_SYSENTER_ESP,
          (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
          0);

    /* Instruction pointer loaded by SYSENTER: the entry_SYSENTER_32 stub. */
    wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);

    put_cpu();
}

这里将 __KERNEL_CS 设置到 MSR_IA32_SYSENTER_CS 中，将 tss.SYSENTER_stack 地址设置到 MSR_IA32_SYSENTER_ESP 中，最后将内核入口点 entry_SYSENTER_32 的地址设置到 MSR_IA32_SYSENTER_EIP 中。

当用户程序进行系统调用时，实际上在用户态中最终会调用到 VDSO 中映射的 __kernel_vsyscall ，其定义位于 arch/x86/entry/vdso/vdso32/system_call.S：

/*
 * __kernel_vsyscall - vDSO system call entry stub.
 *
 * Saves ecx, edx and ebp on the user stack, enters the kernel with
 * SYSENTER or SYSCALL when the corresponding CPU feature is present
 * (patched in through ALTERNATIVE), and otherwise falls through to
 * int $0x80.  After the kernel returns to int80_landing_pad the three
 * registers are popped again and the stub returns to its caller.
 */
__kernel_vsyscall:
    CFI_STARTPROC
    pushl   %ecx
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET      ecx, 0
    pushl   %edx
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET      edx, 0
    pushl   %ebp
    CFI_ADJUST_CFA_OFFSET   4
    CFI_REL_OFFSET      ebp, 0

    /* Both sequences stash a register in ebp before entering the kernel. */
    #define SYSENTER_SEQUENCE   "movl %esp, %ebp; sysenter"
    #define SYSCALL_SEQUENCE    "movl %ecx, %ebp; syscall"

#ifdef CONFIG_X86_64
    /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
    ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
                      SYSCALL_SEQUENCE,  X86_FEATURE_SYSCALL32
#else
    ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif

    /* Enter using int $0x80 */
    int $0x80
GLOBAL(int80_landing_pad)

    /*
     * Restore EDX and ECX in case they were clobbered.  EBP is not
     * clobbered (the kernel restores it), but it's cleaner and
     * probably faster to pop it than to adjust ESP using addl.
     */
    popl    %ebp
    CFI_RESTORE     ebp
    CFI_ADJUST_CFA_OFFSET   -4
    popl    %edx
    CFI_RESTORE     edx
    CFI_ADJUST_CFA_OFFSET   -4
    popl    %ecx
    CFI_RESTORE     ecx
    CFI_ADJUST_CFA_OFFSET   -4
    ret
    CFI_ENDPROC

    .size __kernel_vsyscall,.-__kernel_vsyscall
    .previous

__kernel_vsyscall 首先将 ecx、edx、ebp 的当前值压栈保存：这三个寄存器里放着系统调用的参数，而 sysenter 路径需要用 ebp 暂存用户态栈指针，返回时 ecx 和 edx 也可能被改写。保存之后执行 sysenter。

ALTERNATIVE_2 宏实际上是在做选择，如果支持 X86_FEATURE_SYSENTER32(Intel CPU) ，则执行 SYSENTER_SEQUENCE ，如果支持 X86_FEATURE_SYSCALL32(AMD CPU)，则执行 SYSCALL_SEQUENCE 。如果都不支持，那么啥都不干(???)。如果啥都没干，那么接着往下执行，即执行 int $0x80，退化到传统(legacy)方式进行系统调用。

注意 sysenter 指令会覆盖掉 esp ，因此 SYSENTER_SEQUENCE 中会将当前 esp 保存到 ebp 中。sysenter 同样会覆盖 eip ，但由于返回地址是固定的(__kernel_vsyscall 函数结尾)，因此无需保存。

前文提到过，执行了 sysenter 指令之后直接切换到内核态，同时寄存器也都设置好了：eip 被设置为 IA32_SYSENTER_EIP 即 entry_SYSENTER_32 的地址，其定义在arch/x86/entry/entry_32.S中：

ENTRY(entry_SYSENTER_32)
    movl    TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
    pushl   $__USER_DS      /* pt_regs->ss */
    pushl   %ebp            /* pt_regs->sp (stashed in bp) */
    pushfl              /* pt_regs->flags (except IF = 0) */
    orl $X86_EFLAGS_IF, (%esp)  /* Fix IF */
    pushl   $__USER_CS      /* pt_regs->cs */
    pushl   $0          /* pt_regs->ip = 0 (placeholder) */
    pushl   %eax            /* pt_regs->orig_ax */
    SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */

    testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
    jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:

    /*
     * User mode is traced as though IRQs are on, and SYSENTER
     * turned them off.
     */
    TRACE_IRQS_OFF

    movl    %esp, %eax
    call    do_fast_syscall_32
...

/* arch/x86/kernel/asm-offsets_32.c */
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
       offsetofend(struct cpu_entry_area, entry_stack_page.stack));

前文提到过，sysenter 会将 IA32_SYSENTER_ESP 加载到 esp 中，但 IA32_SYSENTER_ESP 保存的是 SYSENTER_stack 的地址，需要通过 TSS_sysenter_sp0 进行修正，指向进程的内核栈。

然后开始按照 pt_regs 的结构将相关寄存器中的值压入栈中，包括在 sysenter 前保存到 ebp 的用户态栈顶指针。由于 eip 无需保存，于是压入 0 用于占位。

最后调用 do_fast_syscall_32 ，该函数在 arch/x86/entry/common.c 中定义：

/*
 * Handle a 32-bit system call that entered through the vDSO fast path.
 *
 * Sets regs->ip to the vDSO's int80_landing_pad, reloads regs->bp from the
 * user stack at regs->sp (where the vDSO stub pushed it), and then
 * dispatches through do_syscall_32_irqs_on().  If the user stack cannot be
 * read, regs->ax is set to -EFAULT and the call is not dispatched.
 */
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
    /*
     * Called using the internal vDSO SYSENTER/SYSCALL32 calling
     * convention.  Adjust regs so it looks like we entered using int80.
     */

    unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
        vdso_image_32.sym_int80_landing_pad;

    /*
     * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
     * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
     * Fix it up.
     */
    regs->ip = landing_pad;

    enter_from_user_mode();

    local_irq_enable();

    /* Fetch EBP from where the vDSO stashed it. */
    if (
#ifdef CONFIG_X86_64
        /*
         * Micro-optimization: the pointer we're following is explicitly
         * 32 bits, so it can't be out of range.
         */
        __get_user(*(u32 *)&regs->bp,
                (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
        get_user(*(u32 *)&regs->bp,
             (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
        ) {

        /* User code screwed up. */
        local_irq_disable();
        regs->ax = -EFAULT;
        prepare_exit_to_usermode(regs);
        return 0;   /* Keep it simple: use IRET. */
    }

    /* Now this is just like a normal syscall. */
    do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
    /*
     * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
     * SYSRETL is available on all 64-bit CPUs, so we don't need to
     * bother with SYSEXIT.
     *
     * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     * because the ECX fixup above will ensure that this is essentially
     * never the case.
     */
    return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
        regs->ip == landing_pad &&
        (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
    /*
     * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
     *
     * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     * because the ECX fixup above will ensure that this is essentially
     * never the case.
     *
     * We don't allow syscalls at all from VM86 mode, but we still
     * need to check VM, because we might be returning from sys_vm86.
     */
    return static_cpu_has(X86_FEATURE_SEP) &&
        regs->cs == __USER_CS && regs->ss == __USER_DS &&
        regs->ip == landing_pad &&
        (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

由于没有保存 eip，我们需要计算系统调用完毕后返回到用户态的地址：current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad (即 __kernel_vsyscall 中紧跟在 int $0x80 之后的 int80_landing_pad 处)，并用它覆盖掉先前压栈的 0 。

接下来就和 int 0x80 的流程一样，通过 do_syscall_32_irqs_on 从系统调用表中找到相应的处理函数进行调用。完成后，如果都符合 sysexit 的要求，返回 1，否则返回 0 。

...
    call    do_fast_syscall_32
    /* XEN PV guests always use IRET path */
    ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
            "jmp .Lsyscall_32_done", X86_FEATURE_XENPV

/* Opportunistic SYSEXIT */
    TRACE_IRQS_ON           /* User mode traces as IRQs on. */
    movl    PT_EIP(%esp), %edx  /* pt_regs->ip */
    movl    PT_OLDESP(%esp), %ecx   /* pt_regs->sp */
1:  mov PT_FS(%esp), %fs
    PTGS_TO_GS
    popl    %ebx            /* pt_regs->bx */
    addl    $2*4, %esp      /* skip pt_regs->cx and pt_regs->dx */
    popl    %esi            /* pt_regs->si */
    popl    %edi            /* pt_regs->di */
    popl    %ebp            /* pt_regs->bp */
    popl    %eax            /* pt_regs->ax */

    /*
     * Restore all flags except IF. (We restore IF separately because
     * STI gives a one-instruction window in which we won't be interrupted,
     * whereas POPF does not.)
     */
    addl    $PT_EFLAGS-PT_DS, %esp  /* point esp at pt_regs->flags */
    btr $X86_EFLAGS_IF_BIT, (%esp)
    popfl

    /*
     * Return back to the vDSO, which will pop ecx and edx.
     * Don't bother with DS and ES (they already contain __USER_DS).
     */
    sti
    sysexit

根据 testl %eax, %eax; jz .Lsyscall_32_done ，如果 do_fast_syscall_32 的返回值(eax)为 0 ，表示不能使用快速返回，于是跳转到 .Lsyscall_32_done ，通过 iret 返回。否则继续执行下面代码，将内核栈中保存的值恢复到相应寄存器中，然后通过 sysexit 返回。

注意这里将原有的 eip 设置到 edx、 esp 设置到 ecx ，这是因为根据 Intel SDM，sysexit 会用 edx 来设置 eip，用 ecx 来设置 esp ，从而指向先前用户空间的代码偏移和栈偏移。并加载 SYSENTER_CS_MSR+16 到 cs，加载 SYSENTER_CS_MSR+24 到 ss 。如此一来就回到了用户态的 __kernel_vsyscall 尾端。

四、实验

我们通过 gdb 一个 C 程序来检验一下：

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/*
 * Test program for the gdb session below: opens /tmp/test read-only,
 * reads up to sizeof(buffer) bytes from it and closes the descriptor.
 * None of the return values are checked.
 */
int main(int argc, char *argv[]){
    /* buffer first holds the path and is then reused as the read destination. */
    char buffer[80] = "/tmp/test";
    int fd = open(buffer, O_RDONLY);
    int size = read(fd, buffer, sizeof(buffer));
    close(fd);
}



$ gcc -m32 -g -static -o read read.c
$ file read
read: ELF 32-bit LSB executable, Intel 80386, version 1 (GNU/Linux), statically linked, for GNU/Linux 2.6.32, BuildID[sha1]=8a7f3d69d3e4c9582551934b0617ad78e492e48c, not stripped



[txt]
(gdb) disas
   0x0804888a <+14>:    push   %ecx
   0x0804888b <+15>:    sub    $0x70,%esp
   0x0804888e <+18>:    mov    %ecx,%eax
   0x08048890 <+20>:    mov    0x4(%eax),%eax
   0x08048893 <+23>:    mov    %eax,-0x6c(%ebp)
   0x08048896 <+26>:    mov    %gs:0x14,%eax
   0x0804889c <+32>:    mov    %eax,-0xc(%ebp)
   0x0804889f <+35>:    xor    %eax,%eax
   0x080488a1 <+37>:    movl   $0x706d742f,-0x5c(%ebp)
   0x080488a8 <+44>:    movl   $0x7365742f,-0x58(%ebp)
   0x080488af <+51>:    movl   $0x74,-0x54(%ebp)
   0x080488b6 <+58>:    lea    -0x50(%ebp),%edx
   0x080488b9 <+61>:    mov    $0x0,%eax
   0x080488be <+66>:    mov    $0x11,%ecx
   0x080488c3 <+71>:    mov    %edx,%edi
   0x080488c5 <+73>:    rep stos %eax,%es:(%edi)
   0x080488c7 <+75>:    sub    $0x8,%esp
   0x080488ca <+78>:    push   $0x0
   0x080488cc <+80>:    lea    -0x5c(%ebp),%eax
   0x080488cf <+83>:    push   %eax
   0x080488d0 <+84>:    call   0x806cf30 <open>
   0x080488d5 <+89>:    add    $0x10,%esp
   0x080488d8 <+92>:    mov    %eax,-0x64(%ebp)
   0x080488db <+95>:    sub    $0x4,%esp
   0x080488de <+98>:    push   $0x50
   0x080488e0 <+100>:   lea    -0x5c(%ebp),%eax
   0x080488e3 <+103>:   push   %eax
   0x080488e4 <+104>:   pushl  -0x64(%ebp)
   0x080488e7 <+107>:   call   0x806cfa0 <read>
   0x080488ec <+112>:   add    $0x10,%esp
   0x080488ef <+115>:   mov    %eax,-0x60(%ebp)
=> 0x080488f2 <+118>:   sub    $0xc,%esp
   0x080488f5 <+121>:   pushl  -0x64(%ebp)
   0x080488f8 <+124>:   call   0x806d150 <close>
   0x080488fd <+129>:   add    $0x10,%esp
   0x08048900 <+132>:   mov    $0x0,%eax
   0x08048905 <+137>:   mov    -0xc(%ebp),%edx
   0x08048908 <+140>:   xor    %gs:0x14,%edx
   0x0804890f <+147>:   je     0x8048916 <main+154>
   0x08048911 <+149>:   call   0x806ef90 <__stack_chk_fail>
   0x08048916 <+154>:   lea    -0x8(%ebp),%esp
   0x08048919 <+157>:   pop    %ecx
   0x0804891a <+158>:   pop    %edi
   0x0804891b <+159>:   pop    %ebp
   0x0804891c <+160>:   lea    -0x4(%ecx),%esp
   0x0804891f <+163>:   ret
End of assembler dump.

首先是 open ：将参数 O_RDONLY (根据 #define O_RDONLY 0，值为 0x0 ) 和 buffer 的地址(eax) 依次压栈，然后调用 glibc 的 open 函数，disas 之：

(gdb) disas 0x806cf30
Dump of assembler code for function open:
   0x0806cf30 <+0>:     cmpl   $0x0,%gs:0xc
   0x0806cf38 <+8>:     jne    0x806cf5f 
   0x0806cf3a <+0>:     push   %ebx
   0x0806cf3b <+1>:     mov    0x10(%esp),%edx
   0x0806cf3f <+5>:     mov    0xc(%esp),%ecx
   0x0806cf43 <+9>:     mov    0x8(%esp),%ebx
   0x0806cf47 <+13>:    mov    $0x5,%eax
   0x0806cf4c <+18>:    call   *0x80ea9f0
   0x0806cf52 <+24>:    pop    %ebx
   0x0806cf53 <+25>:    cmp    $0xfffff001,%eax
   0x0806cf58 <+30>:    jae    0x8070590 <__syscall_error>
   0x0806cf5e <+36>:    ret
   0x0806cf5f <+47>:    call   0x806ea80 <__libc_enable_asynccancel>
   0x0806cf64 <+52>:    push   %eax
   0x0806cf65 <+53>:    push   %ebx
   0x0806cf66 <+54>:    mov    0x14(%esp),%edx
   0x0806cf6a <+58>:    mov    0x10(%esp),%ecx
   0x0806cf6e <+62>:    mov    0xc(%esp),%ebx
   0x0806cf72 <+66>:    mov    $0x5,%eax
   0x0806cf77 <+71>:    call   *0x80ea9f0
   0x0806cf7d <+77>:    pop    %ebx
   0x0806cf7e <+78>:    xchg   %eax,(%esp)
   0x0806cf81 <+81>:    call   0x806eaf0 <__libc_disable_asynccancel>
   0x0806cf86 <+86>:    pop    %eax
   0x0806cf87 <+87>:    cmp    $0xfffff001,%eax
   0x0806cf8c <+92>:    jae    0x8070590 <__syscall_error>
   0x0806cf92 <+98>:    ret
End of assembler dump.

将压入栈中的参数保存到寄存器中，然后调用了 0x80ea9f0，用 x 查看该地址的值：

(gdb) x 0x80ea9f0
0x80ea9f0 <_dl_sysinfo>:        0xf7ffcc80

disas 之，发现来到了 __kernel_vsyscall ，并执行了sysenter指令：

(gdb) disas 0xf7ffcc80
Dump of assembler code for function __kernel_vsyscall:
   0xf7ffcc80 <+0>:     push   %ecx
   0xf7ffcc81 <+1>:     push   %edx
   0xf7ffcc82 <+2>:     push   %ebp
   0xf7ffcc83 <+3>:     mov    %esp,%ebp
   0xf7ffcc85 <+5>:     sysenter
   0xf7ffcc87 <+7>:     int    $0x80
   0xf7ffcc89 <+9>:     pop    %ebp
   0xf7ffcc8a <+10>:    pop    %edx
   0xf7ffcc8b <+11>:    pop    %ecx
   0xf7ffcc8c <+12>:    ret
End of assembler dump.

read 同理，只是有三个参数，需要 push 三次而已。

五、syscall / sysret

前文提到过，在32位下 Intel 和 AMD 对快速系统调用指令的定义有分歧，一个使用 sysenter ，另一个使用 syscall 。但到了64位下，为啥都统一成 syscall 了呢？

关于这个我在网上也没有找到权威的答案，只是一些道听途说：为什么IA-64指令集架构失败了？

在 64 位架构的开发上，Intel 和 AMD 选择了不同的道路：Intel搞出了一套全新的架构，名为安腾(IA-64)，这套架构性能完爆x86，这样用户为了更好的性能需要进行硬件换代，岂不是喜滋滋？然而这种做法在商业上取得了失败。因为 IA-64 架构虽然提高了性能，却不能向后兼容，即原来能在 x86 下跑的程序到新架构下就跑不了了，用户非常 angry 。AMD 就比较厚道，老老实实地做出了兼容 x86 的 x86_64 ，能够运行 32 位下的程序。于是农企日常翻身，逼得 Intel 反过来兼容 x86_64 架构，于是只能支持 AMD 标准中定义的 syscall 了。

这次我们直接从gdb出发，同样是之前的代码，只是这次编译成 64 位：

(gdb) disas
Dump of assembler code for function main:
   0x00000000004009ae <+0>:     push   %rbp
   0x00000000004009af <+1>:     mov    %rsp,%rbp
   0x00000000004009b2 <+4>:     add    $0xffffffffffffff80,%rsp
   0x00000000004009b6 <+8>:     mov    %edi,-0x74(%rbp)
   0x00000000004009b9 <+11>:    mov    %rsi,-0x80(%rbp)
   0x00000000004009bd <+15>:    mov    %fs:0x28,%rax
   0x00000000004009c6 <+24>:    mov    %rax,-0x8(%rbp)
   0x00000000004009ca <+28>:    xor    %eax,%eax
   0x00000000004009cc <+30>:    movabs $0x7365742f706d742f,%rax
   0x00000000004009d6 <+40>:    mov    %rax,-0x60(%rbp)
   0x00000000004009da <+44>:    movq   $0x74,-0x58(%rbp)
   0x00000000004009e2 <+52>:    lea    -0x50(%rbp),%rdx
   0x00000000004009e6 <+56>:    mov    $0x0,%eax
   0x00000000004009eb <+61>:    mov    $0x8,%ecx
   0x00000000004009f0 <+66>:    mov    %rdx,%rdi
   0x00000000004009f3 <+69>:    rep stos %rax,%es:(%rdi)
   0x00000000004009f6 <+72>:    lea    -0x60(%rbp),%rax
   0x00000000004009fa <+76>:    mov    $0x0,%esi
   0x00000000004009ff <+81>:    mov    %rax,%rdi
   0x0000000000400a02 <+84>:    mov    $0x0,%eax
   0x0000000000400a07 <+89>:    callq  0x43e650 <open64>
   0x0000000000400a0c <+94>:    mov    %eax,-0x68(%rbp)
   0x0000000000400a0f <+97>:    lea    -0x60(%rbp),%rcx
   0x0000000000400a13 <+101>:   mov    -0x68(%rbp),%eax
   0x0000000000400a16 <+104>:   mov    $0x50,%edx
   0x0000000000400a1b <+109>:   mov    %rcx,%rsi
   0x0000000000400a1e <+112>:   mov    %eax,%edi
   0x0000000000400a20 <+114>:   callq  0x43e6b0 <read>
   0x0000000000400a25 <+119>:   mov    %eax,-0x64(%rbp)
=> 0x0000000000400a28 <+122>:   mov    -0x68(%rbp),%eax
   0x0000000000400a2b <+125>:   mov    %eax,%edi
   0x0000000000400a2d <+127>:   callq  0x43e900 <close>
   0x0000000000400a32 <+132>:   mov    $0x0,%eax
   0x0000000000400a37 <+137>:   mov    -0x8(%rbp),%rdx
   0x0000000000400a3b <+141>:   xor    %fs:0x28,%rdx
   0x0000000000400a44 <+150>:   je     0x400a4b <main+157>
   0x0000000000400a46 <+152>:   callq  0x442010 <__stack_chk_fail>
   0x0000000000400a4b <+157>:   leaveq
   0x0000000000400a4c <+158>:   retq
End of assembler dump.


(gdb) disas 0x43e650
Dump of assembler code for function open64:
   0x000000000043e650 <+0>:     cmpl   $0x0,0x28db65(%rip)        # 0x6cc1bc <__libc_multiple_threads>
   0x000000000043e657 <+7>:     jne    0x43e66d <open64+29>
   0x000000000043e659 <+0>:     mov    $0x2,%eax
   0x000000000043e65e <+5>:     syscall
   0x000000000043e660 <+7>:     cmp    $0xfffffffffffff001,%rax
   0x000000000043e666 <+13>:    jae    0x4436b0 <__syscall_error>
   0x000000000043e66c <+19>:    retq
   0x000000000043e66d <+29>:    sub    $0x8,%rsp
   0x000000000043e671 <+33>:    callq  0x441b70 <__libc_enable_asynccancel>
   0x000000000043e676 <+38>:    mov    %rax,(%rsp)
   0x000000000043e67a <+42>:    mov    $0x2,%eax
   0x000000000043e67f <+47>:    syscall
   0x000000000043e681 <+49>:    mov    (%rsp),%rdi
   0x000000000043e685 <+53>:    mov    %rax,%rdx
   0x000000000043e688 <+56>:    callq  0x441bd0 <__libc_disable_asynccancel>
   0x000000000043e68d <+61>:    mov    %rdx,%rax
   0x000000000043e690 <+64>:    add    $0x8,%rsp
   0x000000000043e694 <+68>:    cmp    $0xfffffffffffff001,%rax
   0x000000000043e69a <+74>:    jae    0x4436b0 <__syscall_error>
   0x000000000043e6a0 <+80>:    retq
End of assembler dump.

open64 定义在 glibc 的 sysdeps/posix/open64.c中：

#include <fcntl.h>
#include <stdarg.h>
#include <sysdep-cancel.h>

/* Open FILE with access OFLAG.  If O_CREAT or O_TMPFILE is in OFLAG,
   a third argument is the file protection.

   This variant always ORs O_LARGEFILE into OFLAG before calling
   __libc_open.  */
int
__libc_open64 (const char *file, int oflag, ...)
{
    int mode = 0;

    /* The optional third argument is only fetched when OFLAG requires it.  */
    if (__OPEN_NEEDS_MODE (oflag))
    {
        va_list arg;
        va_start (arg, oflag);
        mode = va_arg (arg, int);
        va_end (arg);
    }

    /* Single-threaded case: call straight through.  */
    if (SINGLE_THREAD_P)
        return __libc_open (file, oflag | O_LARGEFILE, mode);

    /* Otherwise bracket the call with LIBC_CANCEL_ASYNC / LIBC_CANCEL_RESET.  */
    int oldtype = LIBC_CANCEL_ASYNC ();

    int result = __libc_open (file, oflag | O_LARGEFILE, mode);

    LIBC_CANCEL_RESET (oldtype);

    return result;
}
/* Export the implementation under the names __open64 and open64.  */
weak_alias (__libc_open64, __open64)
libc_hidden_weak (__open64)
weak_alias (__libc_open64, open64)

再看 __libc_open ，定义在 unix/sysv/linux/generic/open.c ：

#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <sysdep-cancel.h>

/* Open FILE with access OFLAG.  If O_CREAT or O_TMPFILE is in OFLAG,
   a third argument is the file protection.

   Implemented as an openat system call with AT_FDCWD as the directory.  */
int
__libc_open (const char *file, int oflag, ...)
{
    int mode = 0;

    /* The optional third argument is only fetched when OFLAG requires it.  */
    if (__OPEN_NEEDS_MODE (oflag))
    {
        va_list arg;
        va_start (arg, oflag);
        mode = va_arg (arg, int);
        va_end (arg);
    }
    return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}

我们将宏展开：

SYSCALL_CANCEL(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CALL(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_DISP(__SYSCALL, openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CONCAT(__SYSCALL, 4)(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CONCAT_X(__SYSCALL, 4)(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL4(openat, AT_FDCWD, file, oflag, mode)
=> INLINE_SYSCALL (openat, 4, AT_FDCWD, file, oflag, mode)
=> INTERNAL_SYSCALL (openat, _, 4, AT_FDCWD, file, oflag, mode)
=> INTERNAL_SYSCALL_NCS (__NR_openat, _, 4, AT_FDCWD, file, oflag, mode)

最终到达 INTERNAL_SYSCALL_NCS ：

/* Issue system call number NAME with NR arguments ARGS through the
   `syscall' instruction.  The number is passed in the same register that
   receives the result ("=a" output, "0" input); the remaining operands
   come from the LOAD_ARGS_<nr>, LOAD_REGS_<nr> and ASM_ARGS_<nr> helpers.
   The statement expression evaluates to the result as a long int.
   ERR is not used in this expansion.  */
# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
  ({                                          \
    unsigned long int resultvar;                          \
    LOAD_ARGS_##nr (args)                             \
    LOAD_REGS_##nr                                \
    asm volatile (                                \
    "syscall\n\t"                                 \
    : "=a" (resultvar)                                \
    : "0" (name) ASM_ARGS_##nr : "memory", REGISTERS_CLOBBERED_BY_SYSCALL);   \
    (long int) resultvar; })

LOAD_ARGS_##nr 负责把参数 args 展开，然后由 LOAD_REGS_##nr 设置到相应的寄存器中，因为 syscall 通过寄存器传参。最终调用 syscall 。

根据 Intel SDM，syscall 会将下一条指令的地址(即返回用的 rip)存到 rcx ，将 rflags 存到 r11 ，然后将 IA32_LSTAR 加载到 rip 。同时将 IA32_STAR[47:32] 加载到cs，IA32_STAR[47:32] + 8 加载到 ss (在 GDT 中，ss 就跟在 cs 后面)。

MSR IA32_LSTAR (MSR_LSTAR) 和 IA32_STAR (MSR_STAR) 在 arch/x86/kernel/cpu/common.c 的 syscall_init 中初始化：

/*
 * Program the MSRs consulted by the SYSCALL instruction (STAR, LSTAR,
 * CSTAR, SYSCALL_MASK) together with the three SYSENTER MSRs.
 */
void syscall_init(void)
{
    /* Upper half of STAR: __KERNEL_CS in the low 16 bits, __USER32_CS above it. */
    wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
    /* LSTAR holds the address of the 64-bit entry point. */
    wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
    wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
    /*
     * This only works on Intel CPUs.
     * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
     * This does not cause SYSENTER to jump to the wrong location, because
     * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
     */
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
    /* No 32-bit emulation: CSTAR points at ignore_sysret and SYSENTER gets an invalid CS. */
    wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

    /* Flags to clear on syscall */
    wrmsrl(MSR_SYSCALL_MASK,
           X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
           X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}

可以看到 MSR_STAR 的第 32-47 位设置为 kernel mode 的 cs，48-63位设置为 user mode 的 cs。而 IA32_LSTAR 被设置为函数 entry_SYSCALL_64 的起始地址。

于是 syscall 时，跳转到 entry_SYSCALL_64 开始执行，其定义在 arch/x86/entry/entry_64.S：

/*
 * entry_SYSCALL_64 - kernel entry point for the 64-bit `syscall` instruction.
 *
 * Register state on entry:
 *   rax  system call number
 *   rcx  return address
 *   r11  saved rflags
 *   rdi, rsi, rdx, r10, r8, r9  arguments 0-5
 *
 * The code switches from the user stack to the kernel stack, builds a
 * struct pt_regs on it, and then either dispatches through sys_call_table
 * on the fast path or calls do_syscall_64 on the slow path.  The return to
 * user mode uses USERGS_SYSRET64 when the saved state allows it; otherwise
 * it jumps to restore_c_regs_and_iret.
 */
ENTRY(entry_SYSCALL_64)
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */
    SWAPGS_UNSAFE_STACK
    // KAISER: entering the kernel requires switching to the kernel page tables
    SWITCH_KERNEL_CR3_NO_STACK
    /*
     * A hypervisor implementation might want to use a label
     * after the swapgs, so that it can do the swapgs
     * for the guest and jump here on syscall.
     */
GLOBAL(entry_SYSCALL_64_after_swapgs)
    // Save the user stack pointer in the per-cpu variable rsp_scratch
    movq    %rsp, PER_CPU_VAR(rsp_scratch)
    // Load the kernel stack pointer from per-cpu cpu_current_top_of_stack
    movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

    TRACE_IRQS_OFF

    /* Construct struct pt_regs on stack */
    pushq   $__USER_DS          /* pt_regs->ss */
    pushq   PER_CPU_VAR(rsp_scratch)    /* pt_regs->sp */
    pushq   %r11                /* pt_regs->flags */
    pushq   $__USER_CS          /* pt_regs->cs */
    pushq   %rcx                /* pt_regs->ip */
    pushq   %rax                /* pt_regs->orig_ax */
    pushq   %rdi                /* pt_regs->di */
    pushq   %rsi                /* pt_regs->si */
    pushq   %rdx                /* pt_regs->dx */
    pushq   %rcx                /* pt_regs->cx */
    pushq   $-ENOSYS            /* pt_regs->ax */
    pushq   %r8             /* pt_regs->r8 */
    pushq   %r9             /* pt_regs->r9 */
    pushq   %r10                /* pt_regs->r10 */
    pushq   %r11                /* pt_regs->r11 */
    // Reserve stack slots for r12-r15, rbp and rbx
    sub $(6*8), %rsp            /* pt_regs->bp, bx, r12-15 not saved */

    /*
     * If we need to do entry work or if we guess we'll need to do
     * exit work, go straight to the slow path.
     */
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
    /*
     * Easy case: enable interrupts and issue the syscall.  If the syscall
     * needs pt_regs, we'll call a stub that disables interrupts again
     * and jumps to the slow path.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
    // Make sure the syscall number does not exceed the maximum; if it does,
    // jump forward to local label 1 and return
    cmpq    $__NR_syscall_max, %rax
#else
    andl    $__SYSCALL_MASK, %eax
    cmpl    $__NR_syscall_max, %eax
#endif
    ja  1f              /* return -ENOSYS (already in pt_regs->ax) */
    // The syscall convention carries the 4th argument in r10 because rcx
    // holds the return address; the C ABI expects that argument in rcx,
    // so copy r10 into rcx before calling the C handler
    movq    %r10, %rcx

    /*
     * This call instruction is handled specially in stub_ptregs_64.
     * It might end up jumping to the slow path.  If it jumps, RAX
     * and all argument registers are clobbered.
     */
    // Call the handler for this syscall number from the syscall table
    call    *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:
    // Store the handler return value into the pt_regs->ax slot on the stack
    // (a plain store, not a push)
    movq    %rax, RAX(%rsp)
1:

    /*
     * If we get here, then we know that pt_regs is clean for SYSRET64.
     * If we see that no exit work is required (which we are required
     * to check with IRQs off), then we can go straight to SYSRET64.
     */
    DISABLE_INTERRUPTS(CLBR_NONE)
    TRACE_IRQS_OFF
    movq    PER_CPU_VAR(current_task), %r11
    testl   $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
    jnz 1f

    LOCKDEP_SYS_EXIT
    TRACE_IRQS_ON       /* user mode is traced as IRQs on */
    movq    RIP(%rsp), %rcx
    movq    EFLAGS(%rsp), %r11
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

1:
    /*
     * The fast path looked good when we started, but something changed
     * along the way and we need to switch to the slow path.  Calling
     * raise(3) will trigger this, for example.  IRQs are off.
     */
    TRACE_IRQS_ON
    ENABLE_INTERRUPTS(CLBR_NONE)
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    syscall_return_slowpath /* returns with IRQs disabled */
    jmp return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
    /* IRQs are off. */
    SAVE_EXTRA_REGS
    movq    %rsp, %rdi
    call    do_syscall_64       /* returns with IRQs disabled */

return_from_SYSCALL_64:
    RESTORE_EXTRA_REGS
    TRACE_IRQS_IRETQ        /* we're about to change IF */

    /*
     * Try to use SYSRET instead of IRET if we're returning to
     * a completely clean 64-bit userspace context.
     */
    movq    RCX(%rsp), %rcx
    movq    RIP(%rsp), %r11
    cmpq    %rcx, %r11          /* RCX == RIP */
    jne opportunistic_sysret_failed

    /*
     * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
     * in kernel space.  This essentially lets the user take over
     * the kernel, since userspace controls RSP.
     *
     * If width of "canonical tail" ever becomes variable, this will need
     * to be updated to remain correct on both old and new CPUs.
     */
    .ifne __VIRTUAL_MASK_SHIFT - 47
    .error "virtual address width changed -- SYSRET checks need update"
    .endif

    /* Change top 16 bits to be the sign-extension of 47th bit */
    shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
    sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

    /* If this changed %rcx, it was not canonical */
    cmpq    %rcx, %r11
    jne opportunistic_sysret_failed

    cmpq    $__USER_CS, CS(%rsp)        /* CS must match SYSRET */
    jne opportunistic_sysret_failed

    movq    R11(%rsp), %r11
    cmpq    %r11, EFLAGS(%rsp)      /* R11 == RFLAGS */
    jne opportunistic_sysret_failed

    /*
     * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
     * restore RF properly. If the slowpath sets it for whatever reason, we
     * need to restore it correctly.
     *
     * SYSRET can restore TF, but unlike IRET, restoring TF results in a
     * trap from userspace immediately after SYSRET.  This would cause an
     * infinite loop whenever #DB happens with register state that satisfies
     * the opportunistic SYSRET conditions.  For example, single-stepping
     * this user code:
     *
     *           movq   $stuck_here, %rcx
     *           pushfq
     *           popq %r11
     *   stuck_here:
     *
     * would never get past 'stuck_here'.
     */
    testq   $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
    jnz opportunistic_sysret_failed

    /* nothing to check for RSP */

    cmpq    $__USER_DS, SS(%rsp)        /* SS must match SYSRET */
    jne opportunistic_sysret_failed

    /*
     * We win! This label is here just for ease of understanding
     * perf profiles. Nothing jumps here.
     */
syscall_return_via_sysret:
    /* rcx and r11 are already restored (see code above) */
    RESTORE_C_REGS_EXCEPT_RCX_R11
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    // KAISER: returning to user mode requires switching back to the user page tables
    SWITCH_USER_CR3
    /* Restore rsp to the user-mode stack pointer saved in pt_regs->sp */
    movq    RSP(%rsp), %rsp
    USERGS_SYSRET64

    // The fast SYSRET return is not possible; fall back to the iret path
opportunistic_sysret_failed:
    /*
     * This opens a window where we have a user CR3, but are
     * running in the kernel.  This makes using the CS
     * register useless for telling whether or not we need to
     * switch CR3 in NMIs.  Normal interrupts are OK because
     * they are off here.
     */
    SWITCH_USER_CR3
    SWAPGS
    jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)

注意 syscall 不会保存栈指针，因此 handler 首先将当前用户态栈偏移 rsp 存到 per-cpu 变量 rsp_scratch 中，然后将 per-cpu 变量 cpu_current_top_of_stack ，即内核态的栈偏移加载到 rsp。

随后将各寄存器中的值压入内核态的栈中，包括：

rax system call number
rcx return address
r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
rdi arg0
rsi arg1
rdx arg2
r10 arg3 (needs to be moved to rcx to conform to C ABI)
r8 arg4
r9 arg5

接着根据系统调用号从系统调用表(sys_call_table) 中找到相应的处理函数，如 sys_open ，进行调用。64位下系统调用定义在 arch/x86/entry/syscalls/syscall_64.tbl中，ABI 和 32 位不同。

如果一切顺利的话，最终通过 USERGS_SYSRET64 ，即 sysretq 返回。

六、总结

本文主要分析了Linux下的三种系统调用方式：int 0x80 ，sysenter 和 syscall 。

传统系统调用(int 0x80) 通过中断/异常实现，在执行 int 指令时，发生 trap。硬件找到在中断描述符表中的表项，在自动切换到内核栈 (tss.ss0 : tss.esp0) 后根据中断描述符的 segment selector 在 GDT / LDT 中找到对应的段描述符，从段描述符拿到段的基址，加载到 cs ，将 offset 加载到 eip。最后硬件将 ss / sp / eflags / cs / ip 依次压到内核栈（int 指令触发的是软中断，硬件不会压入 error code；entry_INT80_32 中压入的 orig_ax 占据了这个位置）。返回时，iret 将先前压栈的 ss / sp / eflags / cs / ip 弹出，恢复用户态调用时的寄存器上下文。

sysenter 和 syscall 是为了加速系统调用所引入的新指令，通过引入新的 MSR 来存放内核态的代码和栈的段号和偏移量，从而实现快速跳转：

在调用 sysenter 时将 SYSENTER_CS_MSR 加载到 cs，将 SYSENTER_CS_MSR + 8 加载到 ss，将 IA32_SYSENTER_EIP 加载到 eip ，将 IA32_SYSENTER_ESP 加载到 esp ，整套切换到内核态。返回时，sysexit 将 IA32_SYSENTER_CS + 16 加载到 cs ，将 IA32_SYSENTER_CS + 24 加载到 ss ，而 eip 和 esp 分别从 edx 和 ecx 中加载，因此返回前应该将压栈的用户态 eip(计算出来的) 和 esp(调用前用户态保存到 ebp 进行传递) 设置到这两个寄存器中。

在调用 syscall 时，会自动将 rip 保存到 rcx 、将 rflags 保存到 r11 ，然后将 IA32_LSTAR 加载到 rip 。同时将 IA32_STAR[47:32] 加载到 cs ，IA32_STAR[47:32] + 8 加载到 ss 。栈顶指针的切换会延迟到内核态系统调用入口点 entry_SYSCALL_64 后进行处理，将用户态栈偏移 rsp 存到 per-cpu 变量 rsp_scratch 中，然后将 per-cpu 变量 cpu_current_top_of_stack ，即内核态的栈偏移加载到 rsp。返回时，64 位的 sysret (sysretq) 将 IA32_STAR[63:48] + 16 加载到 cs （返回 32 位兼容模式时才是 IA32_STAR[63:48]），IA32_STAR[63:48] + 8 加载到 ss ，而 rip 从 rcx 中加载、rflags 从 r11 中加载，因此返回前应该将压栈的用户态 rip 设置到 rcx 中、将压栈的 rflags 设置到 r11 中。对于 rsp ，返回前根据先前压栈内容先设置为用户态 rsp。

文章中肯定有遗漏或理解错误的地方，欢迎留言指正，不胜感激。

参考：

https://0xax.gitbooks.io/linux-insides/content/SysCall/
https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
http://www.ibm.com/developerworks/cn/linux/kernel/l-k26ncpu/index.html
https://lwn.net/Articles/604287/
https://lwn.net/Articles/604515/

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2019-08-22，如有侵权请联系 cloudcommunity@tencent.com 删除

单片机