Просмотр исходного кода

add `systemd_triggered_by` label to `container_info` metric to detect periodic jobs triggered by systemd timers

Nikolay Sivko 1 год назад
Родитель
Коммит
d0e515dece
4 изменённых файла: 73 добавлено и 12 удалено
  1. 12 11
      containers/container.go
  2. 1 1
      containers/metrics.go
  3. 4 0
      containers/registry.go
  4. 56 0
      containers/systemd.go

+ 12 - 11
containers/container.go

@@ -35,15 +35,16 @@ type ContainerNetwork struct {
 }
 
 type ContainerMetadata struct {
-	name        string
-	labels      map[string]string
-	volumes     map[string]string
-	logPath     string
-	image       string
-	logDecoder  logparser.Decoder
-	hostListens map[string][]netaddr.IPPort
-	networks    map[string]ContainerNetwork
-	env         map[string]string
+	name               string
+	labels             map[string]string
+	volumes            map[string]string
+	logPath            string
+	image              string
+	logDecoder         logparser.Decoder
+	hostListens        map[string][]netaddr.IPPort
+	networks           map[string]ContainerNetwork
+	env                map[string]string
+	systemdTriggeredBy string
 }
 
 type Delays struct {
@@ -225,8 +226,8 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 	c.lock.RLock()
 	defer c.lock.RUnlock()
 
-	if c.metadata.image != "" {
-		ch <- gauge(metrics.ContainerInfo, 1, c.metadata.image)
+	if c.metadata.image != "" || c.metadata.systemdTriggeredBy != "" {
+		ch <- gauge(metrics.ContainerInfo, 1, c.metadata.image, c.metadata.systemdTriggeredBy)
 	}
 
 	ch <- counter(metrics.Restarts, float64(c.restarts))

+ 1 - 1
containers/metrics.go

@@ -47,7 +47,7 @@ var metrics = struct {
 	JvmSafepointSyncTime *prometheus.Desc
 	Ip2Fqdn              *prometheus.Desc
 }{
-	ContainerInfo: metric("container_info", "Meta information about the container", "image"),
+	ContainerInfo: metric("container_info", "Meta information about the container", "image", "systemd_triggered_by"),
 
 	Restarts: metric("container_restarts_total", "Number of times the container was restarted"),
 

+ 4 - 0
containers/registry.go

@@ -410,6 +410,10 @@ func calcId(cg *cgroup.Cgroup, md *ContainerMetadata) ContainerID {
 
 func getContainerMetadata(cg *cgroup.Cgroup) (*ContainerMetadata, error) {
 	switch cg.ContainerType {
+	case cgroup.ContainerTypeSystemdService:
+		md := &ContainerMetadata{}
+		md.systemdTriggeredBy = SystemdTriggeredBy(cg.ContainerId)
+		return md, nil
 	case cgroup.ContainerTypeDocker, cgroup.ContainerTypeContainerd, cgroup.ContainerTypeSandbox, cgroup.ContainerTypeCrio:
 	default:
 		return &ContainerMetadata{}, nil

+ 56 - 0
containers/systemd.go

@@ -0,0 +1,56 @@
+package containers
+
+import (
+	"context"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/coroot/coroot-node-agent/proc"
+
+	"github.com/coreos/go-systemd/v22/dbus"
+	gdbus "github.com/godbus/dbus/v5"
+
+	"k8s.io/klog/v2"
+)
+
+// Shared state for querying systemd over D-Bus.
+//
+// conn is assigned once by init; it is checked for nil by SystemdTriggeredBy,
+// which returns an empty string in that case. dbusTimeout is the deadline
+// applied to the context of every unit property lookup.
+var (
+	conn        *dbus.Conn
+	dbusTimeout = time.Second
+)
+
+// init opens the package-level connection to systemd's private bus socket
+// (/run/systemd/private, resolved through proc.HostPath) and authenticates
+// with the EXTERNAL mechanism using the current process uid.
+//
+// A failure is only logged as a warning: conn is then left unset and
+// SystemdTriggeredBy returns an empty string for every lookup.
+func init() {
+	var err error
+	conn, err = dbus.NewConnection(func() (*gdbus.Conn, error) {
+		c, err := gdbus.Dial("unix:path=" + proc.HostPath("/run/systemd/private"))
+		if err != nil {
+			return nil, err
+		}
+		methods := []gdbus.Auth{gdbus.AuthExternal(strconv.Itoa(os.Getuid()))}
+		if err = c.Auth(methods); err != nil {
+			// Close the socket that was just dialed. The package-level conn
+			// must not be touched here: NewConnection has not returned yet,
+			// so conn is still nil and calling Close on it would panic while
+			// leaking c. The close error is irrelevant next to the auth error.
+			_ = c.Close()
+			return nil, err
+		}
+		return c, nil
+	})
+	if err != nil {
+		klog.Warningln("failed to connect to systemd bus:", err)
+	}
+}
+
+// SystemdTriggeredBy returns the first entry of the "TriggeredBy" property
+// of the systemd unit identified by id, or an empty string when nothing can
+// be determined.
+//
+// Only the part of id after the last "/" is used as the unit name. The
+// lookup is bounded by dbusTimeout. Any failure (no bus connection, lookup
+// error, unexpected property type, empty list) yields "".
+func SystemdTriggeredBy(id string) string {
+	if conn == nil {
+		return ""
+	}
+
+	// Keep only the last path segment; an id without "/" is used as is.
+	unit := id
+	if i := strings.LastIndex(id, "/"); i >= 0 {
+		unit = id[i+1:]
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), dbusTimeout)
+	defer cancel()
+
+	// Best effort: a lookup error is treated the same as "no trigger".
+	prop, _ := conn.GetUnitPropertyContext(ctx, unit, "TriggeredBy")
+	if prop == nil {
+		return ""
+	}
+	triggers, ok := prop.Value.Value().([]string)
+	if !ok || len(triggers) == 0 {
+		return ""
+	}
+	return triggers[0]
+}