collector.go 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. package node
  2. import (
  3. "github.com/coroot/coroot-node-agent/common"
  4. "github.com/coroot/coroot-node-agent/flags"
  5. "github.com/coroot/coroot-node-agent/node/metadata"
  6. "github.com/prometheus/client_golang/prometheus"
  7. klog "github.com/sirupsen/logrus"
  8. )
  9. var (
  10. procRoot = "/proc"
  11. infoDesc = prometheus.NewDesc(
  12. "node_info",
  13. "Meta information about the node",
  14. []string{"hostname", "kernel_version"}, nil,
  15. )
  16. cloudInfoDesc = prometheus.NewDesc(
  17. "node_cloud_info",
  18. "Meta information about the cloud instance",
  19. []string{"provider", "account_id", "instance_id", "instance_type", "instance_life_cycle", "region", "availability_zone", "availability_zone_id", "local_ipv4", "public_ipv4"}, nil,
  20. )
  21. uptimeDesc = prometheus.NewDesc(
  22. "node_uptime_seconds",
  23. "Uptime of the node in seconds",
  24. []string{}, nil,
  25. )
  26. cpuUsageDesc = prometheus.NewDesc(
  27. "node_resources_cpu_usage_seconds_total",
  28. "The amount of CPU time spent in each mode",
  29. []string{"mode"}, nil,
  30. )
  31. cpuLogicalCoresDesc = prometheus.NewDesc(
  32. "node_resources_cpu_logical_cores",
  33. "The number of logical CPU cores",
  34. nil, nil,
  35. )
  36. memTotalDesc = prometheus.NewDesc(
  37. "node_resources_memory_total_bytes",
  38. "The total amount of physical memory",
  39. nil, nil,
  40. )
  41. memFreeDesc = prometheus.NewDesc(
  42. "node_resources_memory_free_bytes",
  43. "The amount of unassigned memory",
  44. nil, nil,
  45. )
  46. memAvailableDesc = prometheus.NewDesc(
  47. "node_resources_memory_available_bytes",
  48. "The total amount of available memory",
  49. nil, nil,
  50. )
  51. memCacheDesc = prometheus.NewDesc(
  52. "node_resources_memory_cached_bytes",
  53. "The amount of memory used as page cache",
  54. nil, nil,
  55. )
  56. diskReadsDesc = prometheus.NewDesc(
  57. "node_resources_disk_reads_total",
  58. "The total number of reads completed successfully",
  59. []string{"device"}, nil,
  60. )
  61. diskWritesDesc = prometheus.NewDesc(
  62. "node_resources_disk_writes_total",
  63. "The total number of writes completed successfully",
  64. []string{"device"}, nil,
  65. )
  66. diskReadBytesDesc = prometheus.NewDesc(
  67. "node_resources_disk_read_bytes_total",
  68. "The total number of bytes read from the disk",
  69. []string{"device"}, nil,
  70. )
  71. diskWrittenBytesDesc = prometheus.NewDesc(
  72. "node_resources_disk_written_bytes_total",
  73. "The total number of bytes written to the disk",
  74. []string{"device"}, nil,
  75. )
  76. diskReadTimeDesc = prometheus.NewDesc(
  77. "node_resources_disk_read_time_seconds_total",
  78. "The total number of seconds spent reading",
  79. []string{"device"}, nil,
  80. )
  81. diskWriteTimeDesc = prometheus.NewDesc(
  82. "node_resources_disk_write_time_seconds_total",
  83. "The total number of seconds spent writing",
  84. []string{"device"}, nil,
  85. )
  86. diskIoTimeDesc = prometheus.NewDesc(
  87. "node_resources_disk_io_time_seconds_total",
  88. "The total number of seconds the disk spent doing I/O",
  89. []string{"device"}, nil,
  90. )
  91. netRxBytesDesc = prometheus.NewDesc(
  92. "node_net_received_bytes_total",
  93. "The total number of bytes received",
  94. []string{"interface"}, nil,
  95. )
  96. netTxBytesDesc = prometheus.NewDesc(
  97. "node_net_transmitted_bytes_total",
  98. "The total number of bytes transmitted",
  99. []string{"interface"}, nil,
  100. )
  101. netRxPacketsDesc = prometheus.NewDesc(
  102. "node_net_received_packets_total",
  103. "The total number of packets received",
  104. []string{"interface"}, nil,
  105. )
  106. netTxPacketsDesc = prometheus.NewDesc(
  107. "node_net_transmitted_packets_total",
  108. "The total number of packets transmitted",
  109. []string{"interface"}, nil,
  110. )
  111. netIfaceUpDesc = prometheus.NewDesc(
  112. "node_net_interface_up",
  113. "Status of the interface (0:down, 1:up)",
  114. []string{"interface"}, nil,
  115. )
  116. ipDesc = prometheus.NewDesc(
  117. "node_net_interface_ip",
  118. "IP address assigned to the interface",
  119. []string{"interface", "ip"}, nil,
  120. )
  121. )
  122. type MemoryStat struct {
  123. TotalBytes float64
  124. FreeBytes float64
  125. AvailableBytes float64
  126. CachedBytes float64
  127. }
  128. type CpuStat struct {
  129. TotalUsage CpuUsage
  130. LogicalCores int
  131. }
  132. type CpuUsage struct {
  133. User float64
  134. Nice float64
  135. System float64
  136. Idle float64
  137. IoWait float64
  138. Irq float64
  139. SoftIrq float64
  140. Steal float64
  141. }
  142. type Collector struct {
  143. hostname string
  144. kernelVersion string
  145. instanceMetadata *metadata.CloudMetadata
  146. }
  147. func NewCollector(hostname, kernelVersion string) *Collector {
  148. md := metadata.GetInstanceMetadata()
  149. klog.Infof("instance metadata: %+v", md)
  150. return &Collector{
  151. hostname: hostname,
  152. kernelVersion: kernelVersion,
  153. instanceMetadata: md,
  154. }
  155. }
  156. func (c *Collector) Collect(ch chan<- prometheus.Metric) {
  157. ch <- gauge(infoDesc, 1, c.hostname, c.kernelVersion)
  158. v, err := uptime(procRoot)
  159. if err != nil {
  160. klog.Errorln(err)
  161. } else {
  162. ch <- gauge(uptimeDesc, v)
  163. }
  164. cpu, err := cpuStat(procRoot)
  165. if err != nil {
  166. if !common.IsNotExist(err) {
  167. klog.Errorln(err)
  168. }
  169. } else {
  170. ch <- counter(cpuUsageDesc, cpu.TotalUsage.User, "user")
  171. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Nice, "nice")
  172. ch <- counter(cpuUsageDesc, cpu.TotalUsage.System, "system")
  173. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Idle, "idle")
  174. ch <- counter(cpuUsageDesc, cpu.TotalUsage.IoWait, "iowait")
  175. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Irq, "irq")
  176. ch <- counter(cpuUsageDesc, cpu.TotalUsage.SoftIrq, "softirq")
  177. ch <- counter(cpuUsageDesc, cpu.TotalUsage.Steal, "steal")
  178. ch <- gauge(cpuLogicalCoresDesc, float64(cpu.LogicalCores))
  179. }
  180. mem, err := memoryInfo(procRoot)
  181. if err != nil {
  182. if !common.IsNotExist(err) {
  183. klog.Errorln(err)
  184. }
  185. } else {
  186. ch <- gauge(memTotalDesc, mem.TotalBytes)
  187. ch <- gauge(memFreeDesc, mem.FreeBytes)
  188. ch <- gauge(memAvailableDesc, mem.AvailableBytes)
  189. ch <- gauge(memCacheDesc, mem.CachedBytes)
  190. }
  191. disks, err := GetDisks()
  192. if err != nil {
  193. klog.Errorln("failed to get disk stats:", err)
  194. } else {
  195. for _, d := range disks.BlockDevices() {
  196. ch <- counter(diskReadsDesc, d.ReadOps, d.Name)
  197. ch <- counter(diskWritesDesc, d.WriteOps, d.Name)
  198. ch <- counter(diskReadBytesDesc, d.BytesRead, d.Name)
  199. ch <- counter(diskWrittenBytesDesc, d.BytesWritten, d.Name)
  200. ch <- counter(diskReadTimeDesc, d.ReadTimeSeconds, d.Name)
  201. ch <- counter(diskWriteTimeDesc, d.WriteTimeSeconds, d.Name)
  202. ch <- counter(diskIoTimeDesc, d.IoTimeSeconds, d.Name)
  203. }
  204. }
  205. netdev, err := NetDevices()
  206. if err != nil {
  207. klog.Errorln(err)
  208. } else {
  209. for _, dev := range netdev {
  210. ch <- counter(netRxBytesDesc, dev.RxBytes, dev.Name)
  211. ch <- counter(netTxBytesDesc, dev.TxBytes, dev.Name)
  212. ch <- counter(netRxPacketsDesc, dev.RxPackets, dev.Name)
  213. ch <- counter(netTxPacketsDesc, dev.TxPackets, dev.Name)
  214. ch <- gauge(netIfaceUpDesc, dev.Up, dev.Name)
  215. for _, p := range dev.IPPrefixes {
  216. ch <- gauge(ipDesc, 1, dev.Name, p.IP().String())
  217. }
  218. }
  219. }
  220. im := metadata.CloudMetadata{}
  221. if c.instanceMetadata != nil {
  222. im = *c.instanceMetadata
  223. }
  224. if f := flags.GetString(flags.Provider); f != "" {
  225. im.Provider = metadata.CloudProvider(f)
  226. }
  227. if f := flags.GetString(flags.Region); f != "" {
  228. im.Region = f
  229. }
  230. if f := flags.GetString(flags.AvailabilityZone); f != "" {
  231. im.AvailabilityZone = f
  232. }
  233. if f := flags.GetString(flags.InstanceType); f != "" {
  234. im.InstanceType = f
  235. }
  236. if f := flags.GetString(flags.InstanceLifeCycle); f != "" {
  237. im.LifeCycle = f
  238. }
  239. ch <- gauge(cloudInfoDesc, 1,
  240. string(im.Provider), im.AccountId, im.InstanceId, im.InstanceType, im.LifeCycle,
  241. im.Region, im.AvailabilityZone, im.AvailabilityZoneId, im.LocalIPv4, im.PublicIPv4,
  242. )
  243. }
  244. func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
  245. ch <- infoDesc
  246. ch <- cloudInfoDesc
  247. ch <- uptimeDesc
  248. ch <- cpuUsageDesc
  249. ch <- cpuLogicalCoresDesc
  250. ch <- memTotalDesc
  251. ch <- memFreeDesc
  252. ch <- memAvailableDesc
  253. ch <- memCacheDesc
  254. ch <- diskReadsDesc
  255. ch <- diskWritesDesc
  256. ch <- diskReadBytesDesc
  257. ch <- diskWrittenBytesDesc
  258. ch <- diskReadTimeDesc
  259. ch <- diskWriteTimeDesc
  260. ch <- diskIoTimeDesc
  261. ch <- netRxBytesDesc
  262. ch <- netTxBytesDesc
  263. ch <- netRxPacketsDesc
  264. ch <- netTxPacketsDesc
  265. ch <- netIfaceUpDesc
  266. ch <- ipDesc
  267. }
  268. func counter(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric {
  269. return prometheus.MustNewConstMetric(desc, prometheus.CounterValue, value, labelValues...)
  270. }
  271. func gauge(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric {
  272. return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...)
  273. }