Ver Fonte

add collection of .NET runtime metrics

Nikolay Sivko há 2 anos atrás
pai
commit
c41883ed8c
7 ficheiros alterados com 417 adições e 31 exclusões
  1. 14 29
      containers/container.go
  2. 318 0
      containers/dotnet.go
  3. 16 0
      containers/metrics.go
  4. 57 0
      containers/process.go
  5. 6 1
      go.mod
  6. 4 0
      go.sum
  7. 2 1
      proc/proc.go

+ 14 - 29
containers/container.go

@@ -1,7 +1,11 @@
 package containers
 
 import (
-	"github.com/cilium/ebpf/link"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
 	"github.com/coroot/coroot-node-agent/cgroup"
 	"github.com/coroot/coroot-node-agent/common"
 	"github.com/coroot/coroot-node-agent/ebpftracer"
@@ -17,10 +21,6 @@ import (
 	"github.com/vishvananda/netns"
 	"inet.af/netaddr"
 	"k8s.io/klog/v2"
-	"os"
-	"strings"
-	"sync"
-	"time"
 )
 
 var (
@@ -85,20 +85,6 @@ type ListenDetails struct {
 	NsIPs    []netaddr.IP
 }
 
-type Process struct {
-	Pid       uint32
-	StartedAt time.Time
-	NetNsId   string
-
-	uprobes               []link.Link
-	goTlsUprobesChecked   bool
-	openSslUprobesChecked bool
-}
-
-func (p *Process) isHostNs() bool {
-	return p.NetNsId == hostNetNsId
-}
-
 type PidFd struct {
 	Pid uint32
 	Fd  uint64
@@ -328,7 +314,7 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 
 	appTypes := map[string]struct{}{}
 	seenJvms := map[string]bool{}
-	for pid := range c.processes {
+	for pid, process := range c.processes {
 		cmdline := proc.GetCmdline(pid)
 		if len(cmdline) == 0 {
 			continue
@@ -337,7 +323,8 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 		if appType != "" {
 			appTypes[appType] = struct{}{}
 		}
-		if isJvm(cmdline) {
+		switch {
+		case isJvm(cmdline):
 			jvm, jMetrics := jvmMetrics(pid)
 			if len(jMetrics) > 0 && !seenJvms[jvm] {
 				seenJvms[jvm] = true
@@ -345,6 +332,8 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 					ch <- m
 				}
 			}
+		case process.dotNetMonitor != nil:
+			process.dotNetMonitor.Collect(ch)
 		}
 	}
 	for appType := range appTypes {
@@ -367,13 +356,11 @@ func (c *Container) onProcessStart(pid uint32) *Process {
 	if err != nil {
 		return nil
 	}
-	ns, err := proc.GetNetNs(pid)
-	if err != nil {
+	c.zombieAt = time.Time{}
+	p := NewProcess(pid, stats)
+	if p == nil {
 		return nil
 	}
-	defer ns.Close()
-	c.zombieAt = time.Time{}
-	p := &Process{Pid: pid, StartedAt: stats.BeginTime, NetNsId: ns.UniqueId()}
 	c.processes[pid] = p
 
 	if c.startedAt.IsZero() {
@@ -397,9 +384,7 @@ func (c *Container) onProcessExit(pid uint32, oomKill bool) {
 	c.lock.Lock()
 	defer c.lock.Unlock()
 	if p := c.processes[pid]; p != nil {
-		for _, u := range p.uprobes {
-			_ = u.Close()
-		}
+		p.Close()
 	}
 	delete(c.processes, pid)
 	if len(c.processes) == 0 {

+ 318 - 0
containers/dotnet.go

@@ -0,0 +1,318 @@
+package containers
+
+import (
+	"bytes"
+	"context"
+	"debug/elf"
+	"errors"
+	"fmt"
+	"math"
+	"net"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/coroot/coroot-node-agent/proc"
+	"github.com/jpillora/backoff"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/pyroscope-io/dotnetdiag"
+	"github.com/pyroscope-io/dotnetdiag/nettrace"
+	"github.com/pyroscope-io/dotnetdiag/nettrace/typecode"
+	"k8s.io/klog/v2"
+)
+
+const (
+	dotNetDiagnosticTimeout = 500 * time.Millisecond
+	dotNetEventInterval     = 5 * time.Second
+	provider                = "System.Runtime"
+)
+
+type dotNetMetric struct {
+	fields map[string]string
+	values map[string]float64
+}
+
+func (m *dotNetMetric) value() float64 {
+	switch m.fields["CounterType"] {
+	case "Sum":
+		return m.values["Increment"]
+	case "Mean":
+		return m.values["Mean"]
+	}
+	return math.NaN()
+}
+
+func (m *dotNetMetric) name() string {
+	return m.fields["Name"]
+}
+
+func (m *dotNetMetric) units() string {
+	return m.fields["DisplayUnits"]
+}
+
+type DotNetMonitor struct {
+	pid            uint32
+	cancel         context.CancelFunc
+	lastUpdate     time.Time
+	runtimeVersion string
+
+	info                          *prometheus.GaugeVec
+	memoryAllocatedBytes          prometheus.Counter
+	exceptionCount                prometheus.Gauge
+	heapSize                      *prometheus.GaugeVec
+	gcCount                       *prometheus.CounterVec
+	heapFragmentationPercent      prometheus.Gauge
+	monitorLockContentionCount    prometheus.Counter
+	threadPoolCompletedItemsCount prometheus.Counter
+	threadPoolQueueLength         prometheus.Gauge
+	threadPoolThreadsCount        prometheus.Gauge
+}
+
+func NewDotNetMonitor(ctx context.Context, pid uint32, appName string) *DotNetMonitor {
+	klog.Infof("starting DotNetMonitor: pid=%d, app=%s", pid, appName)
+	constLabels := prometheus.Labels{"application": appName}
+
+	m := &DotNetMonitor{
+		pid: pid,
+
+		info:                          newGaugeVec("container_dotnet_info", "Meta information about the Common Language Runtime (CLR)", constLabels, "runtime_version"),
+		memoryAllocatedBytes:          newCounter("container_dotnet_memory_allocated_bytes_total", "The number of bytes allocated", constLabels),
+		exceptionCount:                newGauge("container_dotnet_exceptions_total", "The number of exceptions that have occurred", constLabels),
+		heapSize:                      newGaugeVec("container_dotnet_memory_heap_size_bytes", "Total size of the heap generation in bytes", constLabels, "generation"),
+		gcCount:                       newCounterVec("container_dotnet_gc_count_total", "The number of times GC has occurred for the generation", constLabels, "generation"),
+		heapFragmentationPercent:      newGauge("container_dotnet_heap_fragmentation_percent", "The heap fragmentation", constLabels),
+		monitorLockContentionCount:    newCounter("container_dotnet_monitor_lock_contentions_total", "The number of times there was contention when trying to take the monitor's lock", constLabels),
+		threadPoolCompletedItemsCount: newCounter("container_dotnet_thread_pool_completed_items_total", "The number of work items that have been processed in the ThreadPool", constLabels),
+		threadPoolQueueLength:         newGauge("container_dotnet_thread_pool_queue_length", "The number of work items that are currently queued to be processed in the ThreadPool", constLabels),
+		threadPoolThreadsCount:        newGauge("container_dotnet_thread_pool_size", "The number of thread pool threads that currently exist in the ThreadPool", constLabels),
+	}
+	go m.run(ctx)
+	return m
+}
+
+func (m *DotNetMonitor) Collect(ch chan<- prometheus.Metric) {
+	if m.lastUpdate.Before(time.Now().Add(-2 * dotNetEventInterval)) {
+		return
+	}
+	m.info.Collect(ch)
+	m.memoryAllocatedBytes.Collect(ch)
+	m.exceptionCount.Collect(ch)
+	m.heapSize.Collect(ch)
+	m.gcCount.Collect(ch)
+	m.heapFragmentationPercent.Collect(ch)
+	m.monitorLockContentionCount.Collect(ch)
+	m.threadPoolCompletedItemsCount.Collect(ch)
+	m.threadPoolQueueLength.Collect(ch)
+	m.threadPoolThreadsCount.Collect(ch)
+}
+
+func (m *DotNetMonitor) processMetric(name, units string, v float64) {
+	m.lastUpdate = time.Now()
+	if math.IsNaN(v) {
+		return
+	}
+	switch units {
+	case "MB":
+		v *= 1000 * 1000
+	}
+	switch name {
+	case "alloc-rate":
+		m.memoryAllocatedBytes.Add(v)
+	case "exception-count":
+		m.exceptionCount.Set(v)
+	case "gen-0-gc-count":
+		m.gcCount.WithLabelValues("Gen0").Add(v)
+	case "gen-1-gc-count":
+		m.gcCount.WithLabelValues("Gen1").Add(v)
+	case "gen-2-gc-count":
+		m.gcCount.WithLabelValues("Gen2").Add(v)
+	case "gen-0-size":
+		m.heapSize.WithLabelValues("Gen0").Set(v)
+	case "gen-1-size":
+		m.heapSize.WithLabelValues("Gen1").Set(v)
+	case "gen-2-size":
+		m.heapSize.WithLabelValues("Gen2").Set(v)
+	case "loh-size":
+		m.heapSize.WithLabelValues("LOH").Set(v)
+	case "poh-size":
+		m.heapSize.WithLabelValues("POH").Set(v)
+	case "gc-fragmentation":
+		m.heapFragmentationPercent.Set(v)
+	case "monitor-lock-contention-count":
+		m.monitorLockContentionCount.Add(v)
+	case "threadpool-completed-items-count":
+		m.threadPoolCompletedItemsCount.Add(v)
+	case "threadpool-queue-length":
+		m.threadPoolQueueLength.Set(v)
+	case "threadpool-thread-count":
+		m.threadPoolThreadsCount.Set(v)
+	}
+}
+
+func (m *DotNetMonitor) run(ctx context.Context) {
+	b := backoff.Backoff{Factor: 2, Min: time.Second, Max: time.Minute}
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		default:
+			err := m.connect(ctx)
+			if err == nil {
+				return
+			}
+			d := b.Duration()
+			klog.Warningf(
+				"failed to establish connection with the .NET diagnostic socket pid=%d, next attempt in %s: %s",
+				m.pid, d.String(), err,
+			)
+			time.Sleep(d)
+		}
+	}
+}
+
+func (m *DotNetMonitor) connect(ctx context.Context) error {
+	files, _ := filepath.Glob(proc.Path(m.pid, fmt.Sprintf("root/tmp/dotnet-diagnostic-%d-*-socket", proc.GetNsPid(m.pid))))
+
+	if len(files) != 1 {
+		return fmt.Errorf("no socket found")
+	}
+	klog.Infoln(".NET diagnostic socket:", files[0])
+	c := dotnetdiag.NewClient(files[0], dotnetdiag.WithDialer(func(addr string) (net.Conn, error) {
+		return net.DialTimeout("unix", addr, dotNetDiagnosticTimeout)
+	}))
+
+	if pi, err := c.ProcessInfo(); err == nil {
+		m.info.WithLabelValues(pi.ClrProductVersion).Set(1)
+	} else {
+		m.info.WithLabelValues("unknown").Set(1)
+	}
+
+	ctc := dotnetdiag.CollectTracingConfig{
+		CircularBufferSizeMB: 10,
+		Providers: []dotnetdiag.ProviderConfig{
+			{
+				Keywords:     math.MaxInt64,
+				LogLevel:     5,
+				ProviderName: provider,
+				FilterData:   "EventCounterIntervalSec=" + strconv.Itoa(int(dotNetEventInterval.Seconds())),
+			},
+		},
+	}
+
+	sess, err := c.CollectTracing(ctc)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		_ = sess.Close()
+	}()
+
+	stream := nettrace.NewStream(sess)
+	if _, err = stream.Open(); err != nil {
+		return err
+	}
+
+	metadata := map[int32]*nettrace.Metadata{}
+
+	stream.EventHandler = func(e *nettrace.Blob) error {
+		md, ok := metadata[e.Header.MetadataID]
+		if !ok {
+			return fmt.Errorf("metadata not found")
+		}
+		parser := nettrace.Parser{Buffer: e.Payload}
+
+		if md.Header.ProviderName != provider {
+			return nil
+		}
+		dnm := &dotNetMetric{
+			fields: map[string]string{},
+			values: map[string]float64{},
+		}
+		if err := parseFields(md.Payload.Fields, parser, dnm); err != nil {
+			return err
+		}
+
+		m.processMetric(dnm.name(), dnm.units(), dnm.value())
+		return nil
+	}
+	stream.MetadataHandler = func(md *nettrace.Metadata) error {
+		metadata[md.Header.MetaDataID] = md
+		return nil
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		default:
+			if err = stream.Next(); err != nil {
+				return err
+			}
+		}
+	}
+}
+
+func parseFields(fields []nettrace.MetadataField, parser nettrace.Parser, metric *dotNetMetric) error {
+	for _, field := range fields {
+		switch field.TypeCode {
+		case typecode.Object:
+			if err := parseFields(field.Payload.Fields, parser, metric); err != nil {
+				return err
+			}
+		case typecode.String:
+			v := parser.UTF16NTS()
+			if err := parser.Err(); err != nil {
+				return err
+			}
+			metric.fields[field.Name] = v
+		case typecode.Double:
+			var v float64
+			parser.Read(&v)
+			if err := parser.Err(); err != nil {
+				return err
+			}
+			metric.values[field.Name] = v
+		case typecode.Single:
+			var v float32
+			parser.Read(&v)
+			if err := parser.Err(); err != nil {
+				return err
+			}
+			metric.values[field.Name] = float64(v)
+		case typecode.Int32:
+			var v int32
+			parser.Read(&v)
+			if err := parser.Err(); err != nil {
+				return err
+			}
+			metric.values[field.Name] = float64(v)
+		default:
+			return fmt.Errorf("unsupported field type: %d", field.TypeCode)
+		}
+	}
+	return nil
+}
+
+func dotNetApp(pid uint32) (string, error) {
+	file, err := elf.Open(proc.Path(pid, "exe"))
+	if err != nil {
+		return "", err
+	}
+	defer func() { _ = file.Close() }()
+	res, _ := file.DynString(elf.DT_RPATH)
+	if len(res) == 0 {
+		res, _ = file.DynString(elf.DT_RUNPATH)
+	}
+	if len(res) == 1 && res[0] == "$ORIGIN/netcoredeps" {
+		cmdline := proc.GetCmdline(pid)
+		if cmdline == nil {
+			return "", errors.New("failed to read proc cmdline")
+		}
+		firstArg := bytes.Split(cmdline, []byte{0})[0]
+		parts := strings.Split(string(firstArg), "/")
+		app := parts[len(parts)-1]
+		return app, nil
+	}
+	return "", nil
+}

+ 16 - 0
containers/metrics.go

@@ -116,3 +116,19 @@ var (
 func metric(name, help string, labels ...string) *prometheus.Desc {
 	return prometheus.NewDesc(name, help, labels, nil)
 }
+
+func newCounter(name, help string, constLabels prometheus.Labels) prometheus.Counter {
+	return prometheus.NewCounter(prometheus.CounterOpts{Name: name, Help: help, ConstLabels: constLabels})
+}
+
+func newCounterVec(name, help string, constLabels prometheus.Labels, labelNames ...string) *prometheus.CounterVec {
+	return prometheus.NewCounterVec(prometheus.CounterOpts{Name: name, Help: help, ConstLabels: constLabels}, labelNames)
+}
+
+func newGauge(name, help string, constLabels prometheus.Labels) prometheus.Gauge {
+	return prometheus.NewGauge(prometheus.GaugeOpts{Name: name, Help: help, ConstLabels: constLabels})
+}
+
+func newGaugeVec(name, help string, constLabels prometheus.Labels, labelNames ...string) *prometheus.GaugeVec {
+	return prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: name, Help: help, ConstLabels: constLabels}, labelNames)
+}

+ 57 - 0
containers/process.go

@@ -0,0 +1,57 @@
+package containers
+
+import (
+	"context"
+	"time"
+
+	"github.com/cilium/ebpf/link"
+	"github.com/coroot/coroot-node-agent/proc"
+	"github.com/mdlayher/taskstats"
+)
+
+type Process struct {
+	Pid       uint32
+	StartedAt time.Time
+	NetNsId   string
+
+	ctx        context.Context
+	cancelFunc context.CancelFunc
+
+	dotNetMonitor *DotNetMonitor
+
+	uprobes               []link.Link
+	goTlsUprobesChecked   bool
+	openSslUprobesChecked bool
+}
+
+func NewProcess(pid uint32, stats *taskstats.Stats) *Process {
+	ns, err := proc.GetNetNs(pid)
+	if err != nil {
+		return nil
+	}
+	defer ns.Close()
+	p := &Process{Pid: pid, StartedAt: stats.BeginTime, NetNsId: ns.UniqueId()}
+	p.ctx, p.cancelFunc = context.WithCancel(context.Background())
+	p.instrument()
+	return p
+}
+
+func (p *Process) isHostNs() bool {
+	return p.NetNsId == hostNetNsId
+}
+
+func (p *Process) instrument() {
+	if dotNetAppName, err := dotNetApp(p.Pid); err == nil {
+		if dotNetAppName != "" {
+			p.dotNetMonitor = NewDotNetMonitor(p.ctx, p.Pid, dotNetAppName)
+		}
+		return
+	}
+}
+
+func (p *Process) Close() {
+	p.cancelFunc()
+	for _, u := range p.uprobes {
+		_ = u.Close()
+	}
+}

+ 6 - 1
go.mod

@@ -15,10 +15,12 @@ require (
 	github.com/florianl/go-conntrack v0.3.0
 	github.com/go-kit/log v0.2.1
 	github.com/grafana/pyroscope/ebpf v0.3.2
+	github.com/jpillora/backoff v1.0.0
 	github.com/mdlayher/taskstats v0.0.0-20230712191918-387b3d561d14
 	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
 	github.com/prometheus/client_golang v1.17.0
 	github.com/prometheus/prometheus v0.47.2
+	github.com/pyroscope-io/dotnetdiag v1.2.1
 	github.com/stretchr/testify v1.8.4
 	github.com/vishvananda/netlink v1.2.1-beta.2.0.20220608195807-1a118fe229fc
 	github.com/vishvananda/netns v0.0.4
@@ -151,4 +153,7 @@ require (
 	k8s.io/client-go v0.28.2 // indirect
 )
 
-replace github.com/optiopay/kafka => github.com/cilium/kafka v0.0.0-20180809090225-01ce283b732b
+replace (
+	github.com/optiopay/kafka => github.com/cilium/kafka v0.0.0-20180809090225-01ce283b732b
+	github.com/pyroscope-io/dotnetdiag => github.com/coroot/dotnetdiag v1.2.2
+)

+ 4 - 0
go.sum

@@ -125,6 +125,8 @@ github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSV
 github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
+github.com/coroot/dotnetdiag v1.2.2 h1:PVP/By8o+xhPjfVolJYcjHLbFQInM7pkaD6/otPLc8Q=
+github.com/coroot/dotnetdiag v1.2.2/go.mod h1:veXCMlFzm1yNl7wwJb/ZLxO4WbzhDBoy1VG1XtkH2ls=
 github.com/coroot/logparser v1.0.15 h1:y6OEJV2dP2A8Nw7yc53skhH+vkAraRLot9Vjc6gHVNo=
 github.com/coroot/logparser v1.0.15/go.mod h1:GHsVO1xE8pR5mmu9Eiop9IXHwN/QzNRx1s0fuzVxq7I=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
@@ -361,6 +363,8 @@ github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFF
 github.com/josharian/native v0.0.0-20200817173448-b6b71def0850/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
 github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
 github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
+github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
+github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
 github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw=
 github.com/jsimonetti/rtnetlink v0.0.0-20200117123717-f846d4f6c1f4/go.mod h1:WGuG/smIU4J/54PblvSbh+xvCZmpJnFgr3ds6Z55XMQ=
 github.com/jsimonetti/rtnetlink v0.0.0-20201009170750-9c6f07d100c1/go.mod h1:hqoO/u39cqLeBLebZ8fWdE96O7FxrAsRYhnVOdgHxok=

+ 2 - 1
proc/proc.go

@@ -2,11 +2,12 @@ package proc
 
 import (
 	"bytes"
-	"github.com/coroot/coroot-node-agent/cgroup"
 	"os"
 	"path"
 	"strconv"
 	"strings"
+
+	"github.com/coroot/coroot-node-agent/cgroup"
 )
 
 var root = "/proc"