golang 观察Golang Syscall和Runtime函数活动

Posted 2021-05-24

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了golang 观察Golang Syscall和Runtime函数活动相关的知识，希望对你有一定的参考价值。

dtrace -qn '
  /*
   * Observes one target process (the pid passed with -p): counts its system
   * calls and its entries into functions matching runtime.*, and sums the
   * time spent between entry and return of libc nanosleep. Per-interval
   * rates are recorded every 10 seconds; a summary is printed at END.
   */

  /*
   * c       - runtime.* function entries seen since the last tick
   * sc      - system calls seen since the last tick
   * last_ts - timestamp of the previous tick (or of BEGIN before the first tick)
   */
  unsigned long c, sc, last_ts;
  BEGIN {
    /* Remember when tracing started; used for the totals printed at END. */
    start = timestamp;
    last_ts = start;

  }
  /* Count every system call entry made by the target process. */
  syscall:::entry /pid==$target/ {
    sc++;
  }

  /* Record, per thread, when nanosleep was entered. */
  pid$target:libc.so.1:nanosleep:entry {
    self->sleep = timestamp;
  }

  /* On return, add the elapsed time to the sleep total and clear the marker. */
  pid$target:libc.so.1:nanosleep:return /self->sleep/ {
    @slp = sum(timestamp - self->sleep);
    self->sleep = 0;
  }
  /* Count every entry into a function whose name matches runtime\.* */
  pid$target::runtime\.*:entry {c++}
  tick-10sec {
    this->now = timestamp;
    /* Whole seconds elapsed since the previous tick (integer division). */
    this->delta = (this->now - last_ts)/1000000000;
    last_ts = this->now;
    /* Accumulate totals and record the per-second rate of this interval. */
    @runtCt["runtime.*"] = sum(c);
    @runtq["runtime.*"] = quantize(c/this->delta);
    @syscallCt["syscall"] = sum(sc);
    /* A syscall rate of zero is recorded as 1. */
    @syscallq["syscall"] = quantize(sc/this->delta == 0 ? 1 : sc/this->delta);
    /* Reset the interval counters for the next tick. */
    c = 0; sc = 0;
  }
  END {
    this->now = timestamp;
    /*
     * Divide the totals by the elapsed run time: the sleep total by elapsed
     * milliseconds, the call counts by elapsed whole seconds.
     * NOTE(review): for a run shorter than one second the seconds divisor
     * is zero - confirm how normalize() behaves in that case.
     */
    normalize(@slp, (this->now-start)/1000000);
    normalize(@runtCt, (this->now-start)/1000000000);
    normalize(@syscallCt, (this->now-start)/1000000000);
    printa("Avg. Call Count: %s/sec %@d | ", @runtCt);
    printa("%s/sec %@d\n", @syscallCt);
    printf("Total Runtime(sec): %d | ", (this->now - start)/1000000000);
    printa("avg. sleep time/sec(uS): %@d\n", @slp);
  }' -p `pgrep metricsd`

golang调度学习-调度流程 Syscall

syscall函数

Syscall函数的定义如下：传入4个参数，返回3个值。

// syscall is shown here as a declaration only (no body): it takes fn plus
// three uintptr arguments and returns r1, r2 and an Errno.
func syscall(fn, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)

Syscall函数的作用是：传入系统调用的入口和参数，执行系统调用，并在完成后返回结果。主要流程是：系统调用前先执行entersyscall，设置g和p的状态；然后把参数写入寄存器并执行系统调用；执行完成后写入返回值，最后执行exitsyscall，再次设置g和p的状态。
entersyscall和exitsyscall会在下文详细讲解。

// func Syscall(trap int64, a1, a2, a3 uintptr) (r1, r2, err uintptr);
// Trap # in AX, args in DI SI DX R10 R8 R9, return in AX DX
// Note that this differs from "standard" ABI convention, which
// would pass 4th arg in CX, not R10.

// Four arguments on the frame (trap, a1, a2, a3) and three results
// (r1, r2, err), 8 bytes each: 56 bytes of arguments/results in total.
TEXT ·Syscall(SB),NOSPLIT,$0-56
    // Tell the runtime a system call is about to start.
    CALL    runtime·entersyscall(SB)
    // Load the three arguments and the trap number into registers.
    MOVQ    a1+8(FP), DI
    MOVQ    a2+16(FP), SI
    MOVQ    a3+24(FP), DX
    MOVQ    trap+0(FP), AX  // syscall entry
    SYSCALL
    // Unsigned compare: a result at or below 0xfffffffffffff001 is success.
    CMPQ    AX, $0xfffffffffffff001
    JLS ok
    // Failure: r1 = -1, r2 = 0, err = -AX.
    MOVQ    $-1, r1+32(FP)
    MOVQ    $0, r2+40(FP)
    NEGQ    AX
    MOVQ    AX, err+48(FP)
    // Tell the runtime the system call has finished.
    CALL    runtime·exitsyscall(SB)
    RET
ok:
    // Success: r1 = AX, r2 = DX, err = 0.
    MOVQ    AX, r1+32(FP)
    MOVQ    DX, r2+40(FP)
    MOVQ    $0, err+48(FP)
    CALL    runtime·exitsyscall(SB)
    RET 

// RawSyscall issues the system call directly: same argument and result
// layout as Syscall, but without the calls to runtime·entersyscall and
// runtime·exitsyscall.
// NOTE(review): this listing detects failure with the carry flag (JCC) and
// stores AX as errno without negating it, whereas the Syscall listing in
// this file uses CMPQ/JLS and NEGQ. The two listings appear to follow
// different error conventions — confirm they target the same platform.
TEXT    ·RawSyscall(SB),NOSPLIT,$0-56
    MOVQ    a1+8(FP), DI
    MOVQ    a2+16(FP), SI
    MOVQ    a3+24(FP), DX
    MOVQ    trap+0(FP), AX    // syscall entry
    SYSCALL
    // Carry clear means success.
    JCC    ok1
    MOVQ    $-1, r1+32(FP)    // r1
    MOVQ    $0, r2+40(FP)    // r2
    MOVQ    AX, err+48(FP)    // errno
    RET
ok1:
    MOVQ    AX, r1+32(FP)    // r1
    MOVQ    DX, r2+40(FP)    // r2
    MOVQ    $0, err+48(FP)    // errno
    RET

明显Syscall比RawSyscall多调用了两个方法：entersyscall和exitsyscall。增加这两个函数的调用，是为了让调度器有机会对即将进入系统调用的goroutine进行调整，方便调度。

entersyscall

// entersyscall is called when a system call is about to be made.
// It forwards the caller's pc and sp to reentersyscall, which moves the G
// into _Gsyscall and detaches the P from the M (the P is remembered in
// m.oldp so it can be found again after the system call returns).
func entersyscall() {
    reentersyscall(getcallerpc(), getcallersp())
}
// reentersyscall prepares the current goroutine for a system call, given the
// caller's pc and sp: it saves pc/sp, moves the G from _Grunning to
// _Gsyscall, detaches the P from the M (remembering it in m.oldp) and marks
// the P as _Psyscall.
//
// Syscall tracing:
// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
// If the syscall does not block, we do not emit any other events.
// If the syscall blocks (that is, the P is retaken), the retaker emits
// traceGoSysBlock; when the syscall returns we emit traceGoSysExit, and when
// the goroutine starts running (potentially instantly, if exitsyscallfast
// returns true) we emit traceGoStart.
// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
// we remember the current value of syscalltick in m
// (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick); whoever emits
// traceGoSysBlock increments p.syscalltick afterwards, and we wait for the
// increment before emitting traceGoSysExit.
// Note that the increment is done even if tracing is not enabled, because
// tracing can be enabled in the middle of a syscall. We don't want the wait
// to hang.
//
//go:nosplit
func reentersyscall(pc, sp uintptr) {
    _g_ := getg()

    // Disable preemption because during this function g is in Gsyscall
    // status, but can have inconsistent g->sched; do not let GC observe it.
    _g_.m.locks++

    // Entersyscall must not call any function that might split/grow the stack.
    // (See details in comment above.)
    // Catch calls that might, by replacing the stack guard with something
    // that will trip any stack check and leaving a flag to tell newstack to die.
    _g_.stackguard0 = stackPreempt
    _g_.throwsplit = true

    // Leave SP around for GC and traceback.
    save(pc, sp)
    _g_.syscallsp = sp
    _g_.syscallpc = pc
    // Move the G from _Grunning to _Gsyscall.
    casgstatus(_g_, _Grunning, _Gsyscall)
    // The saved SP must lie within the goroutine's stack bounds.
    if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
        systemstack(func() {
            print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
            throw("entersyscall")
        })
    }

    if trace.enabled {
        systemstack(traceGoSysCall)
        // systemstack itself clobbers g.sched.{pc,sp} and we might
        // need them later when the G is genuinely blocked in a
        // syscall
        save(pc, sp)
    }

    // If sched.sysmonwait is set, run entersyscall_sysmon on the system
    // stack, then re-save pc/sp.
    if atomic.Load(&sched.sysmonwait) != 0 {
        systemstack(entersyscall_sysmon)
        save(pc, sp)
    }

    if _g_.m.p.ptr().runSafePointFn != 0 {
        // runSafePointFn may stack split if run on this stack
        systemstack(runSafePointFn)
        save(pc, sp)
    }

    // Remember the P's syscalltick in the M (see the tracing notes above).
    _g_.m.syscalltick = _g_.m.p.ptr().syscalltick
    _g_.sysblocktraced = true
    // Key step: the M is entering a system call, so the P lets go of it.
    // pp.m and m.p are both cleared, but the P is remembered in m.oldp.
    pp := _g_.m.p.ptr()
    pp.m = 0
    _g_.m.oldp.set(pp)
    _g_.m.p = 0
    // The P's status becomes _Psyscall.
    atomic.Store(&pp.status, _Psyscall)
    if sched.gcwaiting != 0 {
        systemstack(entersyscall_gcwait)
        save(pc, sp)
    }
    _g_.m.locks--
}

该方法主要是为系统调用前做了准备工作：

修改g的状态为_Gsyscall
检查sysmon线程是否处于休眠等待状态，如果是则将其唤醒
p放弃m，但是m依旧持有p的指针，结束调用后优先选择p
修改p的状态为_Psyscall

做好这些准备工作便可以真正地执行系统调用了。当线程m长时间阻塞在系统调用中时，一直在运行的sysmon线程会检测到该p的状态，并将其从m上剥离，再驱动其他的m（新建或获取已有的）来调度执行该p上的任务。这主要是在retake方法中实现的；该方法还处理了goroutine的抢占调度，这里省略，后面介绍抢占调度时再说明。

exitsyscall

当系统调用（Syscall）返回时，会调用exitsyscall方法恢复调度：

// exitsyscall is called after a system call returns. It first tries to get a
// P again via exitsyscallfast(oldp); on success the G goes back to _Grunning
// and the function returns. Otherwise it hands the G to exitsyscall0 through
// mcall and finishes the bookkeeping once it has been scheduled again.
//
//go:nosplit
//go:nowritebarrierrec
//go:linkname exitsyscall
func exitsyscall() {
    _g_ := getg()

    _g_.m.locks++ // see comment in entersyscall
    if getcallersp() > _g_.syscallsp {
        throw("exitsyscall: syscall frame is no longer valid")
    }

    _g_.waitsince = 0
    // Take the P remembered in m.oldp and clear the field.
    oldp := _g_.m.oldp.ptr()
    _g_.m.oldp = 0
    // Fast path: try to re-acquire a P.
    if exitsyscallfast(oldp) {
        if trace.enabled {
            if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
                systemstack(traceGoStart)
            }
        }
        // There's a cpu for us, so we can run.
        _g_.m.p.ptr().syscalltick++
        // We need to cas the status and scan before resuming...
        casgstatus(_g_, _Gsyscall, _Grunning)

        // Garbage collector isn't running (since we are),
        // so okay to clear syscallsp.
        _g_.syscallsp = 0
        _g_.m.locks--
        if _g_.preempt {
            // restore the preemption request in case we've cleared it in newstack
            _g_.stackguard0 = stackPreempt
        } else {
            // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
            _g_.stackguard0 = _g_.stack.lo + _StackGuard
        }
        _g_.throwsplit = false

        if sched.disable.user && !schedEnabled(_g_) {
            // Scheduling of this goroutine is disabled.
            Gosched()
        }

        return
    }

    _g_.sysexitticks = 0
    if trace.enabled {
        // Wait till traceGoSysBlock event is emitted.
        // This ensures consistency of the trace (the goroutine is started after it is blocked).
        for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
            osyield()
        }
        // We can't trace syscall exit right now because we don't have a P.
        // Tracing code can invoke write barriers that cannot run without a P.
        // So instead we remember the syscall exit time and emit the event
        // in execute when we have a P.
        _g_.sysexitticks = cputicks()
    }

    _g_.m.locks--

    // No P could be acquired on the fast path: hand this g to exitsyscall0
    // via mcall.
    mcall(exitsyscall0)

    // Scheduler returned, so we're allowed to run now.
    // Delete the syscallsp information that we left for
    // the garbage collector during the system call.
    // Must wait until now because until gosched returns
    // we don't know for sure that the garbage collector
    // is not running.
    _g_.syscallsp = 0
    _g_.m.p.ptr().syscalltick++
    _g_.throwsplit = false
}

exitsyscallfast

exitsyscall会尝试重新绑定p，优先选择之前与m绑定的p（进入系统调用的时候，p只是单方面解除了和m的关系，通过m依旧可以找到这个p）：


// exitsyscallfast tries to acquire a P for the current M without blocking.
// It first tries oldp (the P passed in by the caller), then any idle P.
// It reports whether a P was acquired.
//
//go:nosplit
func exitsyscallfast(oldp *p) bool {
    _g_ := getg()

    // Freezetheworld sets stopwait but does not retake P's.
    // While the world is being frozen, do not take a P; report failure.
    if sched.stopwait == freezeStopWait {
        return false
    }

    // Try to re-acquire the last P.
    // This succeeds only if oldp is still in _Psyscall and the CAS to
    // _Pidle wins.
    if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
        // There's a cpu for us, so we can run.
        wirep(oldp)
        exitsyscallfast_reacquired()
        return true
    }
    // Otherwise try to take a P from the idle list.
    // Try to get any other idle P.
    if sched.pidle != 0 {
        var ok bool
        systemstack(func() {
            ok = exitsyscallfast_pidle()
            if ok && trace.enabled {
                if oldp != nil {
                    // Wait till traceGoSysBlock event is emitted.
                    // This ensures consistency of the trace (the goroutine is started after it is blocked).
                    for oldp.syscalltick == _g_.m.syscalltick {
                        osyield()
                    }
                }
                traceGoSysExit(0)
            }
        })
        if ok {
            return true
        }
    }
    return false
}

exitsyscall0

// exitsyscall0 is the slow path of exitsyscall, invoked through mcall with
// gp being the goroutine that left the system call. It marks gp runnable,
// detaches it from the M, and then either runs it on an idle P or queues it
// globally and stops the M.
func exitsyscall0(gp *g) {
    _g_ := getg()
    // Change gp's status from _Gsyscall to _Grunnable.
    casgstatus(gp, _Gsyscall, _Grunnable)
    dropg() // detach gp from this M
    lock(&sched.lock)
    var _p_ *p
    // Try to get an idle P.
    // NOTE(review): the check is made on _g_ (the g returned by getg), not
    // on gp — confirm this is the intended goroutine.
    if schedEnabled(_g_) {
        _p_ = pidleget()
    }
    if _p_ == nil {
        // No P available: put gp on the global run queue.
        globrunqput(gp)
    } else if atomic.Load(&sched.sysmonwait) != 0 {
        // A P was obtained and sysmonwait is set: clear it and wake
        // whoever waits on sysmonnote.
        atomic.Store(&sched.sysmonwait, 0)
        notewakeup(&sched.sysmonnote)
    }
    unlock(&sched.lock)
    // A P was obtained: acquire it and run gp.
    if _p_ != nil {
        acquirep(_p_)
        execute(gp, false) // Never returns.
    }
    // This M has a locked goroutine (m.lockedg != 0).
    if _g_.m.lockedg != 0 {
        // Wait until another thread schedules gp and so m again.
        stoplockedm()
        execute(gp, false) // Never returns.
    }
    // No P could be obtained: stop this M, then enter the scheduler.
    stopm()
    schedule() // Never returns.
}

总结

上述便是golang系统调用的整个流程，大致如下：

业务调用封装好的系统调用函数，编译器翻译到Syscall
执行entersyscall()方法，修改g，p的状态，p单方面解绑m，并检查唤醒sysmon线程，检测系统调用。
当sysmon线程检测到系统调用阻塞时间过长的时候，调用retake，重新调度该p，让p上可执行的得以执行，不浪费资源
系统调用返回，进入exitsyscall方法，优先获取之前的p，如果该p已经被占有，重新获取空闲的p，绑定，然后继续执行该g。当获取不到p的时候，调用exitsyscall0，解绑g，休眠，等待下次唤醒调度。

以上是关于golang 观察Golang Syscall和Runtime函数活动的主要内容，如果未能解决你的问题，请参考以下文章

golang调用DLL中的函数

如何在golang中从windows`syscall`加载图像资源？

golang windows内存文件映射

将字节数组转换为 Golang 中的 syscall.InotifyEvent 结构

golang读写文件

golang实现负载均衡算法