container_apm.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. package containers
  2. import (
  3. "bufio"
  4. "bytes"
  5. "debug/elf"
  6. "fmt"
  7. "os"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "github.com/coroot/coroot-node-agent/ebpftracer"
  13. "github.com/coroot/coroot-node-agent/ebpftracer/l7"
  14. "github.com/coroot/coroot-node-agent/ebpftracer/tracer"
  15. "github.com/coroot/coroot-node-agent/proc"
  16. "github.com/coroot/coroot-node-agent/tracing"
  17. "github.com/coroot/coroot-node-agent/utils"
  18. . "github.com/coroot/coroot-node-agent/utils/modelse"
  19. "github.com/pkg/errors"
  20. klog "github.com/sirupsen/logrus"
  21. "inet.af/netaddr"
  22. )
  // TRACE_STATUS is the flag value that onL7RequestApm compares against
  // RequestData.TraceStart and RequestData.TraceEnd to recognise the start and
  // end markers of a trace.
  const (
  	TRACE_STATUS = 1
  )
  24. func (c *Container) getTrace(traceId uint64) (*tracing.Trace, bool) {
  25. trace, ok := c.traceMap[traceId]
  26. return trace, ok
  27. }
  28. func (c *Container) createTraceMap(traceId uint64, trace *tracing.Trace) {
  29. c.traceMap[traceId] = trace
  30. }
  31. // 查询或创建trace信息
  32. func (c *Container) getOrInitTrace(traceId uint64) (*tracing.Trace, error) {
  33. trace, ok := c.getTrace(traceId)
  34. if !ok {
  35. //new trace
  36. trace = tracing.NewTraceFromEvent(string(c.id))
  37. //create TraceMap
  38. c.createTraceMap(traceId, trace)
  39. //create ParentSpan
  40. trace.CreateRootSpan(traceId)
  41. }
  42. return trace, nil
  43. }
  44. func (c *Container) InitTrace(traceId uint64, r *l7.RequestData) error {
  45. method, path, hostIp, port := l7.ParseHttpHost(r.Payload)
  46. ip, err := netaddr.ParseIP(hostIp)
  47. if err != nil {
  48. fmt.Println("host ip error")
  49. hostIp = "127.0.0.1"
  50. }
  51. addr := netaddr.IPPortFrom(ip, port)
  52. trace := tracing.NewTrace(string(c.id), addr)
  53. if trace == nil {
  54. return fmt.Errorf("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is null")
  55. }
  56. c.traceMap[traceId] = trace
  57. trace.TraceStart(method, path, r.Status, r.Duration)
  58. return nil
  59. }
  60. // 在任意阶段,r.TraceId 不等于0 则创建 traceMap && createParentSpan
  61. // 更新 createTraceSpan 机制,更新触发traceEnd机制,当事件个数满足时,任意event均可触发end
  62. func (c *Container) SendEvent(t *tracing.Trace, traceID uint64) {
  63. if t.AllEventReady(traceID) {
  64. t.SendEvent()
  65. klog.Infof("SendEvent %d", traceID)
  66. //fmt.Println(t.GetSpan())
  67. //fmt.Println("===============")
  68. delete(c.traceMap, traceID)
  69. }
  70. }
  71. func (c *Container) valuableTrace(traceID uint64) bool {
  72. return traceID != 0
  73. }
  74. func (c *Container) onL7RequestApm(pid uint32, fd uint64, timestamp uint64, r *l7.RequestData) map[netaddr.IP]string {
  75. c.lock.Lock()
  76. defer c.lock.Unlock()
  77. if r.Protocol == l7.ProtocolDNS {
  78. ip2fqdn, _type, fqdn := c.onDNSRequest(r)
  79. if c.l7Attach && c.valuableTrace(r.TraceId) {
  80. apmTrace, err := c.getOrInitTrace(r.TraceId)
  81. if err == nil {
  82. apmTrace.DNSTraceQueryEvent(r, _type, fqdn)
  83. c.SendEvent(apmTrace, r.TraceId)
  84. }
  85. }
  86. return ip2fqdn
  87. }
  88. //if !c.valuableTrace(r.TraceId) {
  89. // return nil
  90. //}
  91. if r.Protocol == l7.ProtocolTrace && c.l7Attach && c.valuableTrace(r.TraceId) {
  92. if r.TraceStart == TRACE_STATUS {
  93. klog.Infof("====ProtocolTrace start==== %d %d", pid, r.TraceId)
  94. trace, err := c.getOrInitTrace(r.TraceId)
  95. if err == nil {
  96. method, path, hostIp, port := l7.ParseHttpHost(r.Payload)
  97. ip, _ := netaddr.ParseIP(hostIp)
  98. //codeType := c.GetCodeTypeFromCache(pid)
  99. trace.TraceStartEvent(method, path, r.Status, netaddr.IPPortFrom(ip, port), pid, c.GetAppInfo())
  100. c.SendEvent(trace, r.TraceId)
  101. }
  102. return nil
  103. }
  104. if r.TraceEnd == TRACE_STATUS {
  105. klog.Infof("====ProtocolTrace end==== %d %d", pid, r.TraceId)
  106. trace, err := c.getOrInitTrace(r.TraceId)
  107. if err == nil {
  108. trace.TraceEndEvent(r)
  109. c.SendEvent(trace, r.TraceId)
  110. }
  111. return nil
  112. }
  113. }
  114. if r.Protocol == l7.ProtocolHTTP {
  115. if c.l7Attach && c.valuableTrace(r.TraceId) {
  116. method, path, hostIp, port := l7.ParseHttpHost(r.Payload)
  117. apmTrace, err := c.getOrInitTrace(r.TraceId)
  118. //fmt.Println("ProtocolHTTP-----", r.TraceId, err)
  119. if err == nil {
  120. apmTrace.HttpTraceRequestEvent(method, path, hostIp, port, r)
  121. c.SendEvent(apmTrace, r.TraceId)
  122. }
  123. }
  124. //return nil
  125. }
  126. conn := c.connectionsByPidFd[PidFd{Pid: pid, Fd: fd}]
  127. //fmt.Println("l7.connectionsByPidFd", conn, pid, fd)
  128. if conn == nil {
  129. return nil
  130. }
  131. if timestamp != 0 && conn.Timestamp != timestamp {
  132. return nil
  133. }
  134. stats := c.l7Stats.get(r.Protocol, conn.Dest, conn.ActualDest)
  135. //trace := tracing.NewTrace(string(c.id), conn.ActualDest)
  136. switch r.Protocol {
  137. case l7.ProtocolHTTP:
  138. stats.observe(r.Status.Http(), "", r.Duration)
  139. case l7.ProtocolHTTP2:
  140. if conn.http2Parser == nil {
  141. conn.http2Parser = l7.NewHttp2Parser()
  142. }
  143. requests := conn.http2Parser.Parse(r.Method, r.Payload, uint64(r.Duration))
  144. for _, req := range requests {
  145. stats.observe(req.Status.Http(), "", req.Duration)
  146. //trace.Http2Request(req.Method, req.Path, req.Scheme, req.Status, req.Duration)
  147. }
  148. case l7.ProtocolPostgres:
  149. if r.Method != l7.MethodStatementClose {
  150. stats.observe(r.Status.String(), "", r.Duration)
  151. }
  152. //if conn.postgresParser == nil {
  153. // conn.postgresParser = l7.NewPostgresParser()
  154. //}
  155. //query := conn.postgresParser.Parse(r.Payload)
  156. //trace.PostgresQuery(query, r.Status.Error(), r.Duration)
  157. case l7.ProtocolMysql:
  158. if r.Method != l7.MethodStatementClose {
  159. stats.observe(r.Status.String(), "", r.Duration)
  160. }
  161. if c.l7Attach && c.valuableTrace(r.TraceId) {
  162. if conn.mysqlParser == nil {
  163. conn.mysqlParser = l7.NewMysqlParser()
  164. }
  165. query := conn.mysqlParser.Parse(r.Payload, r.StatementId)
  166. //trace.MysqlQuery(query, r.Status.Error(), r.Duration)
  167. //apmTrace, ok := c.getTrace(r.TraceId)
  168. apmTrace, err := c.getOrInitTrace(r.TraceId)
  169. fmt.Println(err)
  170. //fmt.Println("mysql r.TraceId:", r.TraceId)
  171. //fmt.Println("ok:", ok)
  172. //fmt.Println("traceMap:", len(c.traceMap))
  173. if err == nil {
  174. //apmTrace.MysqlTraceQuery(query, r.Status.Error(), r.Duration, conn.ActualDest)
  175. apmTrace.MysqlTraceQueryEvent(query, r, conn.ActualDest)
  176. c.SendEvent(apmTrace, r.TraceId)
  177. }
  178. }
  179. case l7.ProtocolDM:
  180. fmt.Println("---- onL7RequestApm ProtocolDM start ---->")
  181. fmt.Println("-------dm r.Status :", r.Status)
  182. //统计dm的query次数
  183. stats.observe(r.Status.String(), "", r.Duration)
  184. //是否发送数据
  185. if c.l7Attach && c.valuableTrace(r.TraceId) {
  186. if conn.dmParser == nil {
  187. conn.dmParser = l7.NewDmParser()
  188. }
  189. query := conn.dmParser.Parse(r.Payload, r.StatementId)
  190. apmTrace, err := c.getOrInitTrace(r.TraceId)
  191. fmt.Println("-------dm r.TraceId:", r.TraceId)
  192. if err == nil {
  193. apmTrace.DmTraceQueryEvent(query, r, conn.ActualDest)
  194. c.SendEvent(apmTrace, r.TraceId)
  195. }
  196. }
  197. fmt.Println("---- onL7RequestApm ProtocolDM end <----")
  198. case l7.ProtocolMemcached:
  199. stats.observe(r.Status.String(), "", r.Duration)
  200. if c.l7Attach && c.valuableTrace(r.TraceId) {
  201. }
  202. //cmd, items := l7.ParseMemcached(r.Payload)
  203. //trace.MemcachedQuery(cmd, items, r.Status.Error(), r.Duration)
  204. case l7.ProtocolRedis:
  205. stats.observe(r.Status.String(), "", r.Duration)
  206. if c.l7Attach && c.valuableTrace(r.TraceId) {
  207. cmd, args := l7.ParseRedis(r.Payload)
  208. //fmt.Println("cmd", cmd)
  209. //fmt.Println("args", args)
  210. //apmTrace, ok := c.getTrace(r.TraceId)
  211. apmTrace, err := c.getOrInitTrace(r.TraceId)
  212. if err == nil {
  213. //apmTrace.RedisTraceQuery(cmd, args, r.Status.Error(), r.Duration)
  214. apmTrace.RedisTraceQueryEvent(cmd, args, r, conn.ActualDest)
  215. c.SendEvent(apmTrace, r.TraceId)
  216. }
  217. }
  218. //trace.RedisQuery(cmd, args, r.Status.Error(), r.Duration)
  219. case l7.ProtocolMongo:
  220. stats.observe(r.Status.String(), "", r.Duration)
  221. if c.l7Attach && c.valuableTrace(r.TraceId) {
  222. }
  223. //query := l7.ParseMongo(r.Payload)
  224. //trace.MongoQuery(query, r.Status.Error(), r.Duration)
  225. case l7.ProtocolKafka, l7.ProtocolCassandra:
  226. stats.observe(r.Status.String(), "", r.Duration)
  227. if c.l7Attach && c.valuableTrace(r.TraceId) {
  228. }
  229. case l7.ProtocolRabbitmq, l7.ProtocolNats:
  230. stats.observe(r.Status.String(), r.Method.String(), 0)
  231. if c.l7Attach && c.valuableTrace(r.TraceId) {
  232. }
  233. case l7.ProtocolDubbo2:
  234. stats.observe(r.Status.String(), "", r.Duration)
  235. if c.l7Attach && c.valuableTrace(r.TraceId) {
  236. }
  237. }
  238. return nil
  239. }
  240. func (c *Container) buildIDs(pid uint32) bool {
  241. c.lock.Lock()
  242. defer c.lock.Unlock()
  243. p := c.processes[pid]
  244. if p != nil {
  245. p.cmdline = string(proc.GetRealCmdline(pid))
  246. }
  247. for address, val := range c.getListens() {
  248. if val == 1 {
  249. ip := address.IP()
  250. if ip.Is4() && !ip.IsLoopback() {
  251. // 获取端口号
  252. port := address.Port()
  253. //c.instanceID.IntVal, c.instanceID.HashtVal, _ =
  254. c.AppInfo.Sn = ip.String()
  255. c.AppInfo.Sport = int(port)
  256. strInstanceID := utils.BuildInt64ID(fmt.Sprintf("%s:%d", ip.String(), port))
  257. c.AppInfo.InstanceIdHash.IntVal, _ = strInstanceID.ToInt64()
  258. c.AppInfo.InstanceIdHash.HashtVal = strInstanceID.ToHashByte()
  259. //c.AppInfo.InstanceId = c.instanceID.IntVal
  260. strAgentID := utils.BuildInt64ID(fmt.Sprintf("%s:%s", strInstanceID, string(proc.GetExe(pid))))
  261. c.AppInfo.AgentId, _ = strAgentID.ToInt64()
  262. c.AppInfo.CodeType = c.GetCodeTypeFromCache(pid)
  263. return true
  264. }
  265. }
  266. }
  267. return false
  268. }
  269. func (c *Container) StackProcess(event ebpftracer.StackEvent, tracer *ebpftracer.Tracer) {
  270. c.lock.Lock()
  271. defer c.lock.Unlock()
  272. // get the associated uprobe
  273. uprobe, err := c.GetUprobe(event, tracer)
  274. if err != nil {
  275. fmt.Println("GetUprobeGetUprobe errer: %v", err)
  276. // log.Errorf("failed to get uprobe for event %+v: %+v", event, err)
  277. return
  278. }
  279. if event.TraceId <= 0 {
  280. fmt.Println("StackProcess TraceId id 0")
  281. // log.Errorf("failed to get uprobe for event %+v: %+v", event, err)
  282. return
  283. }
  284. // fmt.Printf("StackProcess 函数入口开始处理 fun:TraceId:%lld, Funcname:%s, time: %lld\n", event.TraceId, uprobe.Funcname, event.TimeNsEnd-event.TimeNsStart)
  285. stackFun := ebpftracer.StackFunEvent{}
  286. stackFun.Uprobe = &uprobe
  287. stackFun.StackEvent = event
  288. apmTrace, ok := c.getTrace(event.TraceId)
  289. if ok {
  290. apmTrace.FunAdd(stackFun)
  291. }
  292. }
  // byteExtractString converts a fixed-size, NUL-padded byte array into a string.
  // The result ends just before the first zero byte; when the array contains no
  // zero byte, all 100 bytes are used.
  func byteExtractString(nameString [100]byte) string {
  	raw := nameString[:]
  	if end := bytes.IndexByte(raw, 0); end >= 0 {
  		return string(raw[:end])
  	}
  	// No terminator found: the whole array is the string.
  	return string(raw)
  }
  300. func (c *Container) StackProcess2(event ebpftracer.StackEvent, tracer *ebpftracer.Tracer) {
  301. c.lock.Lock()
  302. defer c.lock.Unlock()
  303. // get the associated uprobe
  304. switch event.Location {
  305. case 0: // ret
  306. Funcname := ""
  307. if event.Type != uint64(CodeTypeJava) {
  308. uprobe, err := c.GetUprobe(event, tracer)
  309. if err != nil {
  310. fmt.Println("GetUprobeGetUprobe errer: %v", err)
  311. // log.Errorf("failed to get uprobe for event %+v: %+v", event, err)
  312. return
  313. }
  314. Funcname = uprobe.Funcname
  315. } else {
  316. ClassName := byteExtractString(event.ClassName)
  317. MethedName := byteExtractString(event.MethedName)
  318. Funcname = ClassName + "." + MethedName
  319. }
  320. if event.TraceId <= 0 {
  321. fmt.Println("StackProcess TraceId id 0")
  322. // log.Errorf("failed to get uprobe for event %+v: %+v", event, err)
  323. return
  324. }
  325. //fmt.Printf("StackProcess 函数入口开始处理 fun:TraceId:%lld, Funcname:%s, time: %lld\n", event.TraceId, uprobe.Funcname, event.TimeNsEnd-event.TimeNsStart)
  326. apmTrace, err := c.getOrInitTrace(event.TraceId)
  327. if err == nil {
  328. //fmt.Println("append FuncTraceQuery fun:", event.TraceId, uprobe.Funcname, event.Pid)
  329. duration := event.TimeNsEnd - event.TimeNsStart
  330. apmTrace.FuncTraceQuery(Funcname, time.Duration(duration), event.TimeNsStart, event.TimeNsEnd)
  331. c.SendEvent(apmTrace, event.TraceId)
  332. }
  333. }
  334. }
  335. // ResolveAddress returns the symbol(s) and offset of the given address.
  336. func (c *Container) ResolveAddress(addr uint64, symbols []elf.Symbol) (syms []elf.Symbol, offset uint, err error) {
  337. if addr == 0 {
  338. // err = errors.Wrapf(SymbolNotFoundError, "0")
  339. return
  340. }
  341. // symbols, _, err := e.Symbols()
  342. if err != nil {
  343. return
  344. }
  345. idx := sort.Search(len(symbols), func(i int) bool { return symbols[i].Value > addr })
  346. if idx == 0 {
  347. // err = errors.Wrap(SymbolNotFoundError, fmt.Sprintf("%x", addr))
  348. return
  349. }
  350. // why diff symbol may contains the same addr?
  351. sym := symbols[idx-1]
  352. for i := idx - 1; i >= 0 && symbols[i].Value == sym.Value; i-- {
  353. syms = append(syms, symbols[i])
  354. }
  355. for i := idx; i < len(symbols) && symbols[i].Value == sym.Value; i++ {
  356. syms = append(syms, symbols[i])
  357. }
  358. return syms, uint(addr - sym.Value), nil
  359. }
  // MemoryMap holds the address range of one mapping read from /proc/<pid>/maps.
  type MemoryMap struct {
  	// Start is the first address of the range, parsed from the hexadecimal
  	// value before the '-'.
  	Start uint64
  	// End is the address parsed from the hexadecimal value after the '-'.
  	End uint64
  }
  363. // ReadFirstLineOfMapsFile reads the first line of /proc/<pid>/maps file and return the memory map as a MemoryMap struct
  364. func ReadFirstLineOfMapsFile(pid string) (*MemoryMap, error) {
  365. file, err := os.Open(fmt.Sprintf("/proc/%s/maps", pid))
  366. if err != nil {
  367. return nil, err
  368. }
  369. defer file.Close()
  370. scanner := bufio.NewScanner(file)
  371. if scanner.Scan() {
  372. fields := strings.Fields(scanner.Text())
  373. addresses := strings.Split(fields[0], "-")
  374. if len(addresses) != 2 {
  375. return nil, errors.New("unexpected format in /proc/<pid>/maps")
  376. }
  377. start, err := strconv.ParseUint(addresses[0], 16, 64)
  378. if err != nil {
  379. return nil, err
  380. }
  381. end, err := strconv.ParseUint(addresses[1], 16, 64)
  382. if err != nil {
  383. return nil, err
  384. }
  385. return &MemoryMap{
  386. Start: start,
  387. End: end,
  388. }, nil
  389. }
  390. if err := scanner.Err(); err != nil {
  391. return nil, err
  392. }
  393. return nil, errors.New("empty /proc/<pid>/maps")
  394. }
  395. func (c *Container) GetUprobe(event ebpftracer.StackEvent, tracer *ebpftracer.Tracer) (uprobe tracer.Uprobe, err error) {
  396. //fmt.Println("GetUprobe entory:")
  397. memoryMap, _ := ReadFirstLineOfMapsFile(strconv.Itoa(int(event.Pid)))
  398. Address := event.Ip - memoryMap.Start
  399. // fmt.Printf("memoryMap.Start: %x, event.Ip: %x, Address: %x\n", memoryMap.Start, event.Ip, Address)
  400. for _, fun := range c.UprobesMap {
  401. funAddress := fun.Address + fun.AbsOffset
  402. // fmt.Printf("GetUprobeGetUprobeGetUprobe:fun.Address %x, fun.AbsOffset: %x\n", fun.Address, fun.AbsOffset)
  403. if funAddress == Address {
  404. // fmt.Printf("---GetUprobeGetUprobeGetUprobe: %x, event.Ip: %x ---- %s--%x\n", memoryMap.Start, event.Ip, fun.Funcname, fun.Address)
  405. return fun, nil
  406. }
  407. }
  408. syms, _, err := c.ResolveAddress(event.Ip, tracer.Symbols)
  409. if err != nil {
  410. return
  411. }
  412. for _, sym := range syms {
  413. //fmt.Println("GetUprobeGetUprobeGetUprobe: %s+%d", sym.Name, offset)
  414. uprobe, ok := tracer.UprobesMap[fmt.Sprintf("%s-%s", sym.Name, sym.Value)]
  415. if ok {
  416. return uprobe, nil
  417. }
  418. }
  419. err = errors.New("uprobe not found")
  420. return
  421. }
  422. func (c *Container) GetAppInfo() AppInfo {
  423. return c.AppInfo
  424. }
  425. // 可注入前置
  426. func (c *Container) checkEventReady() bool {
  427. c.lock.Lock()
  428. defer c.lock.Unlock()
  429. return c.l7EventReady
  430. }
  431. func (c *Container) eventReady() {
  432. c.lock.Lock()
  433. defer c.lock.Unlock()
  434. c.l7EventReady = true
  435. }
  436. // uprobe前置
  437. func (c *Container) checkL7AttachReady() bool {
  438. c.lock.Lock()
  439. defer c.lock.Unlock()
  440. return c.l7Attach
  441. }
  442. func (c *Container) l7AttachSuccess() {
  443. c.lock.Lock()
  444. defer c.lock.Unlock()
  445. c.l7Attach = true
  446. }
  447. func (c *Container) verifyAttachConditions(r *Registry, pid uint32) bool {
  448. p := c.processes[pid]
  449. if p != nil && c.checkEventReady() {
  450. codeType := c.GetCodeTypeFromCache(pid)
  451. if codeType.IsUnknownCode() {
  452. klog.WithField("pid", pid).Debug("[verify] unknown language.")
  453. return false
  454. }
  455. cmdline := p.GetCmdline()
  456. if len(cmdline) == 0 {
  457. return false
  458. }
  459. whiteListByCode := r.getWhiteListByCodeType(codeType)
  460. //klog.WithField("pid", pid).WithField("codeType", codeType.String()).
  461. // Infof("[verify] white list %v", utils.ToString(whiteListByCode))
  462. // 当前语言的白名单规则
  463. for _, setting := range whiteListByCode {
  464. ruleVal := setting.Filters
  465. if ruleVal == "" {
  466. continue
  467. }
  468. // 判断规则
  469. if strings.Contains(cmdline, ruleVal) {
  470. c.WhiteSettingInfo = setting
  471. klog.WithField("pid", pid).
  472. WithField("codeType", codeType.String()).
  473. WithField("ruleVal", ruleVal).
  474. WithField("cmdline", cmdline).
  475. WithField("white list", utils.ToString(whiteListByCode)).
  476. Infoln("[verify] check successful.")
  477. return true
  478. }
  479. }
  480. }
  481. return false
  482. }
  483. func (c *Container) detachUprobes(pid uint32) {
  484. c.lock.Lock()
  485. defer c.lock.Unlock()
  486. // close uprobe
  487. if p := c.processes[pid]; p != nil {
  488. if len(p.uprobes) > 0 {
  489. klog.Infof("detachUprobes", pid)
  490. // 关闭应用层uprobes
  491. p.DynamicClose()
  492. // 关闭7层监控
  493. c.l7Attach = false
  494. // 清空应用注册信息
  495. c.AppInfo = AppInfo{}
  496. }
  497. }
  498. }
  499. func (c *Container) getRootfs() string {
  500. if c.metadata != nil && c.metadata.rootfs != "" {
  501. return c.metadata.rootfs
  502. }
  503. return ""
  504. }