go-pulse/metrics/metrics.go
turboboost55 7dc100714d
metrics: add cpu counters (#26796)
This PR adds counter metrics for the CPU system and the Geth process.
Currently the only metrics available for these items are gauges. Gauges are
fine when the consumer scrapes metrics data at the same interval as Geth
produces new values (every 3 seconds), but it is likely that most consumers
will not scrape that often. Intervals of 10, 15, or maybe even 30 seconds
are probably more common.

So the problem is, how does the consumer estimate what the CPU was doing in
between scrapes. With a counter, it's easy ... you just subtract two
successive values and divide by the time to get a nice, accurate average.
But with a gauge, you can't do that. A gauge reading is an instantaneous
picture of what was happening at that moment, but it gives you no idea
about what was going on between scrapes. Taking an average of values is
meaningless.
2023-03-23 14:13:50 +01:00

221 lines
8.2 KiB
Go

// Go port of Coda Hale's Metrics library
//
// <https://github.com/rcrowley/go-metrics>
//
// Coda Hale's original work: <https://github.com/codahale/metrics>
package metrics
import (
"os"
"runtime/metrics"
"runtime/pprof"
"strings"
"time"
"github.com/ethereum/go-ethereum/log"
)
// Enabled is checked by the constructor functions for all of the
// standard metrics. If it is true, the metric returned is a stub.
//
// This global kill-switch helps quantify the observer effect and makes
// for less cluttered pprof profiles.
var Enabled = false
// EnabledExpensive is a soft-flag meant for external packages to check if costly
// metrics gathering is allowed or not. The goal is to separate standard metrics
// for health monitoring and debug metrics that might impact runtime performance.
var EnabledExpensive = false
// enablerFlags is the CLI flag names to use to enable metrics collections.
var enablerFlags = []string{"metrics"}
// expensiveEnablerFlags is the CLI flag names to use to enable metrics collections.
var expensiveEnablerFlags = []string{"metrics.expensive"}
// Init enables or disables the metrics system. Since we need this to run before
// any other code gets to create meters and timers, we'll actually do an ugly hack
// and peek into the command line args for the metrics flag.
func init() {
for _, arg := range os.Args {
flag := strings.TrimLeft(arg, "-")
for _, enabler := range enablerFlags {
if !Enabled && flag == enabler {
log.Info("Enabling metrics collection")
Enabled = true
}
}
for _, enabler := range expensiveEnablerFlags {
if !EnabledExpensive && flag == enabler {
log.Info("Enabling expensive metrics collection")
EnabledExpensive = true
}
}
}
}
var threadCreateProfile = pprof.Lookup("threadcreate")
type runtimeStats struct {
GCPauses *metrics.Float64Histogram
GCAllocBytes uint64
GCFreedBytes uint64
MemTotal uint64
HeapObjects uint64
HeapFree uint64
HeapReleased uint64
HeapUnused uint64
Goroutines uint64
SchedLatency *metrics.Float64Histogram
}
var runtimeSamples = []metrics.Sample{
{Name: "/gc/pauses:seconds"}, // histogram
{Name: "/gc/heap/allocs:bytes"},
{Name: "/gc/heap/frees:bytes"},
{Name: "/memory/classes/total:bytes"},
{Name: "/memory/classes/heap/objects:bytes"},
{Name: "/memory/classes/heap/free:bytes"},
{Name: "/memory/classes/heap/released:bytes"},
{Name: "/memory/classes/heap/unused:bytes"},
{Name: "/sched/goroutines:goroutines"},
{Name: "/sched/latencies:seconds"}, // histogram
}
func readRuntimeStats(v *runtimeStats) {
metrics.Read(runtimeSamples)
for _, s := range runtimeSamples {
// Skip invalid/unknown metrics. This is needed because some metrics
// are unavailable in older Go versions, and attempting to read a 'bad'
// metric panics.
if s.Value.Kind() == metrics.KindBad {
continue
}
switch s.Name {
case "/gc/pauses:seconds":
v.GCPauses = s.Value.Float64Histogram()
case "/gc/heap/allocs:bytes":
v.GCAllocBytes = s.Value.Uint64()
case "/gc/heap/frees:bytes":
v.GCFreedBytes = s.Value.Uint64()
case "/memory/classes/total:bytes":
v.MemTotal = s.Value.Uint64()
case "/memory/classes/heap/objects:bytes":
v.HeapObjects = s.Value.Uint64()
case "/memory/classes/heap/free:bytes":
v.HeapFree = s.Value.Uint64()
case "/memory/classes/heap/released:bytes":
v.HeapReleased = s.Value.Uint64()
case "/memory/classes/heap/unused:bytes":
v.HeapUnused = s.Value.Uint64()
case "/sched/goroutines:goroutines":
v.Goroutines = s.Value.Uint64()
case "/sched/latencies:seconds":
v.SchedLatency = s.Value.Float64Histogram()
}
}
}
// CollectProcessMetrics periodically collects various metrics about the running process.
func CollectProcessMetrics(refresh time.Duration) {
// Short circuit if the metrics system is disabled
if !Enabled {
return
}
// Create the various data collectors
var (
cpustats = make([]CPUStats, 2)
diskstats = make([]DiskStats, 2)
rstats = make([]runtimeStats, 2)
)
// This scale factor is used for the runtime's time metrics. It's useful to convert to
// ns here because the runtime gives times in float seconds, but runtimeHistogram can
// only provide integers for the minimum and maximum values.
const secondsToNs = float64(time.Second)
// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuSysLoadTotal = GetOrRegisterCounterFloat64("system/cpu/sysload/total", DefaultRegistry)
cpuSysWaitTotal = GetOrRegisterCounterFloat64("system/cpu/syswait/total", DefaultRegistry)
cpuProcLoadTotal = GetOrRegisterCounterFloat64("system/cpu/procload/total", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
memPauses = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil)
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
memTotal = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
heapUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
heapObjects = GetOrRegisterGauge("system/memory/objects", DefaultRegistry)
diskReads = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry)
diskReadBytes = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry)
diskReadBytesCounter = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry)
diskWrites = GetOrRegisterMeter("system/disk/writecount", DefaultRegistry)
diskWriteBytes = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry)
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
)
var lastCollectTime time.Time
// Iterate loading the different stats and updating the meters.
now, prev := 0, 1
for ; ; now, prev = prev, now {
// Gather CPU times.
ReadCPUStats(&cpustats[now])
collectTime := time.Now()
secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds()
lastCollectTime = collectTime
if secondsSinceLastCollect > 0 {
sysLoad := cpustats[now].GlobalTime - cpustats[prev].GlobalTime
sysWait := cpustats[now].GlobalWait - cpustats[prev].GlobalWait
procLoad := cpustats[now].LocalTime - cpustats[prev].LocalTime
// Convert to integer percentage.
cpuSysLoad.Update(int64(sysLoad / secondsSinceLastCollect * 100))
cpuSysWait.Update(int64(sysWait / secondsSinceLastCollect * 100))
cpuProcLoad.Update(int64(procLoad / secondsSinceLastCollect * 100))
// increment counters (ms)
cpuSysLoadTotal.Inc(sysLoad)
cpuSysWaitTotal.Inc(sysWait)
cpuProcLoadTotal.Inc(procLoad)
}
// Threads
cpuThreads.Update(int64(threadCreateProfile.Count()))
// Go runtime metrics
readRuntimeStats(&rstats[now])
cpuGoroutines.Update(int64(rstats[now].Goroutines))
cpuSchedLatency.update(rstats[now].SchedLatency)
memPauses.update(rstats[now].GCPauses)
memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes))
memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes))
memTotal.Update(int64(rstats[now].MemTotal))
heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased))
heapObjects.Update(int64(rstats[now].HeapObjects))
// Disk
if ReadDiskStats(&diskstats[now]) == nil {
diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount)
diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount)
diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
}
time.Sleep(refresh)
}
}