metrics: improve accuracy of CPU gauges (#26793)

This PR changes metrics collection to actually measure the time interval between collections, rather
than assuming 3 seconds. I did some ad hoc profiling, and on slower hardware (e.g., my Raspberry Pi 4)
I routinely saw intervals of 3.3 - 3.5 seconds, with some as high as 4.5 seconds. Because the CPU
time delta was divided by the assumed (shorter) interval, this will generally cause the CPU gauge
readings to be too high, and in some cases can cause impossibly large values for the CPU load
metrics (e.g., greater than 400 for a 4-core CPU).

---------

Co-authored-by: Felix Lange <fjl@twurst.com>
This commit is contained in:
turboboost55 2023-03-06 15:29:48 -08:00 committed by GitHub
parent 5bc2ef984f
commit 544e4a700b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 24 additions and 14 deletions

@ -17,8 +17,9 @@
package metrics package metrics
// CPUStats is the system and process CPU stats. // CPUStats is the system and process CPU stats.
// All values are in seconds.
type CPUStats struct { type CPUStats struct {
GlobalTime int64 // Time spent by the CPU working on all processes GlobalTime float64 // Time spent by the CPU working on all processes
GlobalWait int64 // Time spent by waiting on disk for all processes GlobalWait float64 // Time spent by waiting on disk for all processes
LocalTime int64 // Time spent by the CPU working on this process LocalTime float64 // Time spent by the CPU working on this process
} }

@ -38,7 +38,7 @@ func ReadCPUStats(stats *CPUStats) {
} }
// requesting all cpu times will always return an array with only one time stats entry // requesting all cpu times will always return an array with only one time stats entry
timeStat := timeStats[0] timeStat := timeStats[0]
stats.GlobalTime = int64((timeStat.User + timeStat.Nice + timeStat.System) * cpu.ClocksPerSec) stats.GlobalTime = timeStat.User + timeStat.Nice + timeStat.System
stats.GlobalWait = int64((timeStat.Iowait) * cpu.ClocksPerSec) stats.GlobalWait = timeStat.Iowait
stats.LocalTime = getProcessCPUTime() stats.LocalTime = getProcessCPUTime()
} }

@ -21,6 +21,6 @@ package metrics
// getProcessCPUTime returns 0 on Windows as there is no system call to resolve // getProcessCPUTime returns 0 on Windows as there is no system call to resolve
// the actual process' CPU time. // the actual process' CPU time.
func getProcessCPUTime() int64 { func getProcessCPUTime() float64 {
return 0 return 0
} }

@ -26,11 +26,11 @@ import (
) )
// getProcessCPUTime retrieves the process' CPU time since program startup. // getProcessCPUTime retrieves the process' CPU time since program startup.
func getProcessCPUTime() int64 { func getProcessCPUTime() float64 {
var usage syscall.Rusage var usage syscall.Rusage
if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil { if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil {
log.Warn("Failed to retrieve CPU time", "err", err) log.Warn("Failed to retrieve CPU time", "err", err)
return 0 return 0
} }
return int64(usage.Utime.Sec+usage.Stime.Sec)*100 + int64(usage.Utime.Usec+usage.Stime.Usec)/10000 //nolint:unconvert return float64(usage.Utime.Sec+usage.Stime.Sec) + float64(usage.Utime.Usec+usage.Stime.Usec)/1000000 //nolint:unconvert
} }

@ -127,8 +127,6 @@ func CollectProcessMetrics(refresh time.Duration) {
return return
} }
refreshFreq := int64(refresh / time.Second)
// Create the various data collectors // Create the various data collectors
var ( var (
cpustats = make([]CPUStats, 2) cpustats = make([]CPUStats, 2)
@ -163,14 +161,25 @@ func CollectProcessMetrics(refresh time.Duration) {
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry) diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
) )
var lastCollectTime time.Time
// Iterate loading the different stats and updating the meters. // Iterate loading the different stats and updating the meters.
now, prev := 0, 1 now, prev := 0, 1
for ; ; now, prev = prev, now { for ; ; now, prev = prev, now {
// CPU // Gather CPU times.
ReadCPUStats(&cpustats[now]) ReadCPUStats(&cpustats[now])
cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) collectTime := time.Now()
cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq) secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds()
cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq) lastCollectTime = collectTime
if secondsSinceLastCollect > 0 {
sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / secondsSinceLastCollect
sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / secondsSinceLastCollect
procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / secondsSinceLastCollect
// Convert to integer percentage.
cpuSysLoad.Update(int64(sysLoad * 100))
cpuSysWait.Update(int64(sysWait * 100))
cpuProcLoad.Update(int64(procLoad * 100))
}
// Threads // Threads
cpuThreads.Update(int64(threadCreateProfile.Count())) cpuThreads.Update(int64(threadCreateProfile.Count()))