collector.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. package node
  2. import (
  3. "github.com/coroot/coroot-node-agent/common"
  4. "github.com/coroot/coroot-node-agent/flags"
  5. "github.com/coroot/coroot-node-agent/node/metadata"
  6. "github.com/prometheus/client_golang/prometheus"
  7. "k8s.io/klog/v2"
  8. )
  9. var (
  10. procRoot = "/proc"
  11. infoDesc = prometheus.NewDesc(
  12. "node_info",
  13. "Meta information about the node",
  14. []string{"hostname", "kernel_version"}, nil,
  15. )
  16. cloudInfoDesc = prometheus.NewDesc(
  17. "node_cloud_info",
  18. "Meta information about the cloud instance",
  19. []string{"provider", "account_id", "instance_id", "instance_type", "instance_life_cycle", "region", "availability_zone", "availability_zone_id", "local_ipv4", "public_ipv4"}, nil,
  20. )
  21. cpuUsageDesc = prometheus.NewDesc(
  22. "node_resources_cpu_usage_seconds_total",
  23. "The amount of CPU time spent in each mode",
  24. []string{"mode"}, nil,
  25. )
  26. cpuLogicalCoresDesc = prometheus.NewDesc(
  27. "node_resources_cpu_logical_cores",
  28. "The number of logical CPU cores",
  29. nil, nil,
  30. )
  31. memTotalDesc = prometheus.NewDesc(
  32. "node_resources_memory_total_bytes",
  33. "The total amount of physical memory",
  34. nil, nil,
  35. )
  36. memFreeDesc = prometheus.NewDesc(
  37. "node_resources_memory_free_bytes",
  38. "The amount of unassigned memory",
  39. nil, nil,
  40. )
  41. memAvailableDesc = prometheus.NewDesc(
  42. "node_resources_memory_available_bytes",
  43. "The total amount of available memory",
  44. nil, nil,
  45. )
  46. memCacheDesc = prometheus.NewDesc(
  47. "node_resources_memory_cached_bytes",
  48. "The amount of memory used as page cache",
  49. nil, nil,
  50. )
  51. diskReadsDesc = prometheus.NewDesc(
  52. "node_resources_disk_reads_total",
  53. "The total number of reads completed successfully",
  54. []string{"device"}, nil,
  55. )
  56. diskWritesDesc = prometheus.NewDesc(
  57. "node_resources_disk_writes_total",
  58. "The total number of writes completed successfully",
  59. []string{"device"}, nil,
  60. )
  61. diskReadBytesDesc = prometheus.NewDesc(
  62. "node_resources_disk_read_bytes_total",
  63. "The total number of bytes read from the disk",
  64. []string{"device"}, nil,
  65. )
  66. diskWrittenBytesDesc = prometheus.NewDesc(
  67. "node_resources_disk_written_bytes_total",
  68. "The total number of bytes written to the disk",
  69. []string{"device"}, nil,
  70. )
  71. diskReadTimeDesc = prometheus.NewDesc(
  72. "node_resources_disk_read_time_seconds_total",
  73. "The total number of seconds spent reading",
  74. []string{"device"}, nil,
  75. )
  76. diskWriteTimeDesc = prometheus.NewDesc(
  77. "node_resources_disk_write_time_seconds_total",
  78. "The total number of seconds spent writing",
  79. []string{"device"}, nil,
  80. )
  81. diskIoTimeDesc = prometheus.NewDesc(
  82. "node_resources_disk_io_time_seconds_total",
  83. "The total number of seconds the disk spent doing I/O",
  84. []string{"device"}, nil,
  85. )
  86. netRxBytesDesc = prometheus.NewDesc(
  87. "node_net_received_bytes_total",
  88. "The total number of bytes received",
  89. []string{"interface"}, nil,
  90. )
  91. netTxBytesDesc = prometheus.NewDesc(
  92. "node_net_transmitted_bytes_total",
  93. "The total number of bytes transmitted",
  94. []string{"interface"}, nil,
  95. )
  96. netRxPacketsDesc = prometheus.NewDesc(
  97. "node_net_received_packets_total",
  98. "The total number of packets received",
  99. []string{"interface"}, nil,
  100. )
  101. netTxPacketsDesc = prometheus.NewDesc(
  102. "node_net_transmitted_packets_total",
  103. "The total number of packets transmitted",
  104. []string{"interface"}, nil,
  105. )
  106. netIfaceUpDesc = prometheus.NewDesc(
  107. "node_net_interface_up",
  108. "Status of the interface (0:down, 1:up)",
  109. []string{"interface"}, nil,
  110. )
  111. ipDesc = prometheus.NewDesc(
  112. "node_net_interface_ip",
  113. "IP address assigned to the interface",
  114. []string{"interface", "ip"}, nil,
  115. )
  116. )
  // MemoryStat is a snapshot of node memory counters, all expressed in bytes
  // (per the field names). Collect exports fields of these names as the
  // node_resources_memory_{total,free,available,cached}_bytes gauges.
  // NOTE(review): presumably this is the type returned by memoryInfo — confirm.
  type MemoryStat struct {
  	TotalBytes, FreeBytes, AvailableBytes, CachedBytes float64
  }
  123. type CpuStat struct {
  124. TotalUsage CpuUsage
  125. LogicalCores int
  126. }
  // CpuUsage is the CPU time spent in each mode. Collect exports fields of
  // these names under node_resources_cpu_usage_seconds_total with the "mode"
  // label set to user, nice, system, idle, iowait, irq, softirq and steal.
  type CpuUsage struct {
  	User, Nice, System, Idle     float64
  	IoWait, Irq, SoftIrq, Steal float64
  }
  137. type Collector struct {
  138. hostname string
  139. kernelVersion string
  140. instanceMetadata *metadata.CloudMetadata
  141. }
  142. func NewCollector(hostname, kernelVersion string) *Collector {
  143. md := metadata.GetInstanceMetadata()
  144. klog.Infof("instance metadata: %+v", md)
  145. return &Collector{
  146. hostname: hostname,
  147. kernelVersion: kernelVersion,
  148. instanceMetadata: md,
  149. }
  150. }
  151. func (c *Collector) Collect(ch chan<- prometheus.Metric) {
  152. ch <- gauge(infoDesc, 1, c.hostname, c.kernelVersion)
  153. cpu, err := cpuStat(procRoot)
  154. if err != nil {
  155. if !common.IsNotExist(err) {
  156. klog.Errorln(err)
  157. }
  158. } else {
  159. ch <- counter(cpuUsageDesc, cpu.TotalUsage.User, "user")
  160. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Nice, "nice")
  161. ch <- counter(cpuUsageDesc, cpu.TotalUsage.System, "system")
  162. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Idle, "idle")
  163. ch <- counter(cpuUsageDesc, cpu.TotalUsage.IoWait, "iowait")
  164. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Irq, "irq")
  165. ch <- counter(cpuUsageDesc, cpu.TotalUsage.SoftIrq, "softirq")
  166. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Steal, "steal")
  167. ch <- gauge(cpuLogicalCoresDesc, float64(cpu.LogicalCores))
  168. }
  169. mem, err := memoryInfo(procRoot)
  170. if err != nil {
  171. if !common.IsNotExist(err) {
  172. klog.Errorln(err)
  173. }
  174. } else {
  175. ch <- gauge(memTotalDesc, mem.TotalBytes)
  176. ch <- gauge(memFreeDesc, mem.FreeBytes)
  177. ch <- gauge(memAvailableDesc, mem.AvailableBytes)
  178. ch <- gauge(memCacheDesc, mem.CachedBytes)
  179. }
  180. disks, err := GetDisks()
  181. if err != nil {
  182. klog.Errorln("failed to get disk stats:", err)
  183. } else {
  184. for _, d := range disks.BlockDevices() {
  185. ch <- counter(diskReadsDesc, d.ReadOps, d.Name)
  186. ch <- counter(diskWritesDesc, d.WriteOps, d.Name)
  187. ch <- counter(diskReadBytesDesc, d.BytesRead, d.Name)
  188. ch <- counter(diskWrittenBytesDesc, d.BytesWritten, d.Name)
  189. ch <- counter(diskReadTimeDesc, d.ReadTimeSeconds, d.Name)
  190. ch <- counter(diskWriteTimeDesc, d.WriteTimeSeconds, d.Name)
  191. ch <- counter(diskIoTimeDesc, d.IoTimeSeconds, d.Name)
  192. }
  193. }
  194. netdev, err := netDevices()
  195. if err != nil {
  196. klog.Errorln(err)
  197. } else {
  198. for _, dev := range netdev {
  199. ch <- counter(netRxBytesDesc, dev.RxBytes, dev.Name)
  200. ch <- counter(netTxBytesDesc, dev.TxBytes, dev.Name)
  201. ch <- counter(netRxPacketsDesc, dev.RxPackets, dev.Name)
  202. ch <- counter(netTxPacketsDesc, dev.TxPackets, dev.Name)
  203. ch <- gauge(netIfaceUpDesc, dev.Up, dev.Name)
  204. for _, ip := range dev.Addresses {
  205. ch <- gauge(ipDesc, 1, dev.Name, ip)
  206. }
  207. }
  208. }
  209. if c.instanceMetadata != nil {
  210. im := c.instanceMetadata
  211. ch <- gauge(cloudInfoDesc, 1,
  212. string(im.Provider), im.AccountId, im.InstanceId, im.InstanceType, im.LifeCycle,
  213. im.Region, im.AvailabilityZone, im.AvailabilityZoneId, im.LocalIPv4, im.PublicIPv4,
  214. )
  215. } else if flags.Provider != nil || flags.Region != nil || flags.AvailabilityZone != nil {
  216. ch <- gauge(cloudInfoDesc, 1,
  217. flags.GetString(flags.Provider), "", "", "", "",
  218. flags.GetString(flags.Region), flags.GetString(flags.AvailabilityZone), "", "", "",
  219. )
  220. }
  221. }
  222. func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
  223. ch <- infoDesc
  224. ch <- cloudInfoDesc
  225. ch <- cpuUsageDesc
  226. ch <- cpuLogicalCoresDesc
  227. ch <- memTotalDesc
  228. ch <- memFreeDesc
  229. ch <- memAvailableDesc
  230. ch <- memCacheDesc
  231. ch <- diskReadsDesc
  232. ch <- diskWritesDesc
  233. ch <- diskReadBytesDesc
  234. ch <- diskWrittenBytesDesc
  235. ch <- diskReadTimeDesc
  236. ch <- diskWriteTimeDesc
  237. ch <- diskIoTimeDesc
  238. ch <- netRxBytesDesc
  239. ch <- netTxBytesDesc
  240. ch <- netRxPacketsDesc
  241. ch <- netTxPacketsDesc
  242. ch <- netIfaceUpDesc
  243. ch <- ipDesc
  244. }
  245. func counter(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric {
  246. return prometheus.MustNewConstMetric(desc, prometheus.CounterValue, value, labelValues...)
  247. }
  248. func gauge(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric {
  249. return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...)
  250. }