极不推荐生产环境使用下述告警推送方式!!!整套监控系统由前任建立,无文档情况下接手,本文仅为工作记录。

一、环境梳理

1、首先,数据源来自Prometheus。监听端口7746,启动参数/monitor/prometheus/prometheus --web.listen-address=:7746 --config.file=/monitor/prometheus/prometheus.yml --web.enable-remote-write-receiver --web.enable-admin-api

[root@monitor alarmweb]# netstat -tunlp |grep 7746
tcp6       0      0 :::7746                 :::*                    LISTEN      4359/prometheus
[root@monitor alarmweb]# ps aux |grep 4359
root      4359  2.0  2.0 3640152 329784 ?      Sl    2025 2647:18 /monitor/prometheus/prometheus --web.listen-address=:7746 --config.file=/monitor/prometheus/prometheus.yml --web.enable-remote-write-receiver --web.enable-admin-api

可以看到Prometheus的工作目录在/monitor/prometheus

2、其次,夜莺本体监听端口80,启动参数/monitor/n9e/n9e -configs /monitor/n9e/etc

[root@monitor alarmweb]# netstat -tunlp |grep 80
tcp        0      0 0.0.0.0:18000           0.0.0.0:*               LISTEN      5649/python
tcp        0      0 0.0.0.0:18001           0.0.0.0:*               LISTEN      24294/python
tcp6       0      0 :::80                   :::*                    LISTEN      2672/n9e
[root@monitor alarmweb]# ps aux |grep 2672
root      2672  1.4  0.6 812840 104068 ?       Sl    2025 6025:14 /monitor/n9e/n9e -configs /monitor/n9e/etc
root     27479  0.0  0.0 112840  2308 pts/1    S+   14:16   0:00 grep --color=auto 2672

可以看到夜莺的工作目录在/monitor/n9e

3、从夜莺管理平台上看到,所有的告警事件都会抛给一个回调地址:http://monitor.baiyyy.com:18000/webhook/event

image-20260205141855943

查看一下是什么进程在监听18000端口。

[root@monitor alarmweb]# netstat -tunlp |grep 18000
tcp        0      0 0.0.0.0:18000           0.0.0.0:*               LISTEN      5649/python
[root@monitor alarmweb]# ps aux |grep 5649
root      5649  0.0  0.8 2307712 136588 ?      S     2025  56:43 python /zscript/alarmweb/webmain.py
root     28636  0.0  0.0 112840  2356 pts/1    S+   14:21   0:00 grep --color=auto 5649

打开/zscript/alarmweb/webmain.py,内容如下:

import requests
import json
import sys
from flask import Flask,request
import time

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)


# 企业微信机器人发送纯文本
def send_message(wkey,alm):
    """Send a plain-text message through a WeCom (WeChat Work) group robot.

    Args:
        wkey: Robot webhook key; it is appended to the webhook URL as the
            ``key`` query parameter. Treat it as a secret: anyone holding
            it can post through the robot.
        alm: Text content of the message.

    Returns:
        The HTTP status code of the webhook response, as a string.

    Raises:
        requests.RequestException: On connection failure, or when the
            webhook does not answer within the timeout.
    """
    wx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key='+wkey+'&type'
    data = {"msgtype": "text", "text": {"content": alm}}
    # requests has no default timeout; without one a stalled endpoint would
    # block this request handler indefinitely.
    r = requests.post(url=wx_url, data=json.dumps(data), timeout=10)
    return str(r.status_code)

# @app.route("/")
# def hello_world():
#     return "<p>基础架构告警</p>"


@app.route("/webhook/event",methods=['POST'])
def event():
    """Receive an alert-event callback and forward it to a WeCom robot.

    The robot key is taken from the ``note`` field of the first entry in
    ``notify_groups_obj``. The raw payload is appended to
    ``/zscript/alarmweb/jsondata`` and a one-line summary to
    ``/zscript/alarmweb/alarmlog`` before the message is sent.

    Returns:
        The HTTP status code reported by the WeCom webhook (as the response
        body), or a 400 response when the body is not a JSON object or no
        robot key can be found in the first notify group.
    """
    # force=True parses the body regardless of Content-Type (as the previous
    # json.loads(request.data) did); silent=True yields None on bad JSON.
    json_data = request.get_json(force=True, silent=True)
    if not isinstance(json_data, dict):
        return "Invalid JSON data", 400

    # An absent/empty group list or an empty note means there is no robot
    # to deliver to; reject instead of crashing or posting with a bogus key.
    notify_groups = json_data.get("notify_groups_obj") or []
    group_wkey = str(notify_groups[0].get("note") or "") if notify_groups else ""
    if not group_wkey:
        return "No robot key in notify group note", 400

    eid = str(json_data.get("id"))
    recovered = str(json_data.get("is_recovered"))
    severity = str(json_data.get("severity"))  # alert severity level
    rule_name = json_data.get("rule_name") or ""  # alert rule name
    # The payload field is "trigger_time". The earlier "trigger_tim" lookup
    # always returned None, which makes time.localtime() use the current
    # time instead of the event's trigger time.
    trigger_tim = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(json_data.get("trigger_time")))
    trigger_value = json_data.get("trigger_value")  # value that triggered the alert
    tags = str(json_data.get("tags"))  # alert details

    # Newline-terminate each dump so consecutive events do not run together.
    with open("/zscript/alarmweb/jsondata", 'a') as f:
        f.write(json.dumps(json_data, indent=4) + '\n')
    # Separate the event id from the recovered flag so the line is parseable.
    with open("/zscript/alarmweb/alarmlog", 'a') as f:
        f.write(rule_name+' '+trigger_tim+' '+eid+' '+recovered+'\n')

    if recovered == 'True':
        tt = '------恢复通知------'
    else:
        tt = '------告警通知------'

    message='''{} {} \n告警级别: {} \n告警规则: {} \n告警时间: {} \n告警触发值: {} \n告警详情: {}
    '''.format(tt,eid,severity,rule_name,trigger_tim,trigger_value,tags)
    rcode = send_message(wkey=group_wkey, alm=message)
    return rcode



if __name__ == "__main__":
    # Script entry point: listen on every interface, TCP port 18000.
    listen_opts = {"host": "0.0.0.0", "port": 18000}
    app.run(**listen_opts)

至此,夜莺的报警逻辑大概清晰,即夜莺产生告警事件之后,会推送给配置好的回调地址。 Flask 拉起的 Webhook 服务用来接收夜莺的告警回调,然后通过企业微信机器人发送告警/恢复通知。

4、那么是如何做到消息分组发送的呢?

查看/zscript/alarmweb/webmain.py,发现以下代码片段

def send_message(wkey,alm):
    wx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key='+wkey+'&type'
    data = {"msgtype": "text", "text": {"content": alm}}

这是构建企业微信机器人发送文本格式的,其中企业微信机器人的webhook地址是由wkey变量构建的,继续查看代码

rcode= send_message(wkey=group_wkey,alm=message)
def event():
    json_data = json.loads(request.data)
    eid = str(json_data.get("id"))
    group_wkey=str(json_data.get("notify_groups_obj")[0].get("note"))
    recovered =str(json_data.get("is_recovered"))

所以,wkey的值是由group_wkey引入的,group_wkey的值来自json_data.get("notify_groups_obj")[0].get("note"),所以wkey的值也是从夜莺产生告警事件中推送过来的。

查看一下官方文档,看看夜莺推送过来的数据结构:

{
          "id":  16,
          "cate":  "prometheus",
          "cluster":  "xxx",
          "datasource_id":  1,
          "group_id":  1,
          "group_name":  "Default Busi Group",
          "hash":  "0188b06deaa5eb24832548d599090f2b",
          "rule_id":  4,
          "rule_name":  "测试回调地址",
          "rule_note":  "",
          "rule_prod":  "metric",
          "rule_algo":  "",
          "severity":  2,
          "prom_for_duration":  0,
          "prom_ql":  "system_load_norm_5 \u003e 0",
          "rule_config":  {
                    "queries":  [
                              {
                                        "keys":  {
                                                  "labelKey":  "",
                                                  "valueKey":  ""
                                        },
                                        "prom_ql":  "system_load_norm_5 \u003e 0",
                                        "severity":  2
                              }
                    ]
          },
          "prom_eval_interval":  15,
          "callbacks":  [
                    "http://10.211.55.3:4321"
          ],
          "runbook_url":  "",
          "notify_recovered":  1,
          "notify_channels":  [
                    "email"
          ],
          "notify_groups":  [
                    "2"
          ],
          "notify_groups_obj":  [
                    {
                              "id":  2,
                              "name":  "测试邮件告警的团队",
                              "note":  "",
                              "create_at":  1708921626,
                              "create_by":  "root",
                              "update_at":  1708948109,
                              "update_by":  "root"
                    }
          ],
          "target_ident":  "ulric-flashcat.local",
          "target_note":  "",
          "trigger_time":  1708999492,
          "trigger_value":  "0.7229",
          "trigger_values":  "",
          "tags":  [
                    "__name__=system_load_norm_5",
                    "ident=ulric-flashcat.local",
                    "rulename=测试回调地址"
          ],
          "tags_map":  {
                    "__name__":  "system_load_norm_5",
                    "ident":  "ulric-flashcat.local",
                    "rulename":  "测试回调地址"
          },
          "annotations":  {

          },
          "is_recovered":  false,
          "notify_users_obj":  [
                    {
                              "id":  3,
                              "username":  "n9e-wecom-robot",
                              "nickname":  "夜莺V7群机器人",
                              "phone":  "",
                              "email":  "",
                              "portrait":  "",
                              "roles":  [
                                        "Guest"
                              ],
                              "contacts":  {
                                        "wecom_robot_token":  "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=x"
                              },
                              "maintainer":  0,
                              "create_at":  1708945529,
                              "create_by":  "root",
                              "update_at":  1708945529,
                              "update_by":  "root",
                              "admin":  false
                    },
                    {
                              "id":  4,
                              "username":  "n9e-ding-robot",
                              "nickname":  "钉钉机器人",
                              "phone":  "",
                              "email":  "",
                              "portrait":  "",
                              "roles":  [
                                        "Guest"
                              ],
                              "contacts":  {
                                        "dingtalk_robot_token":  "https://oapi.dingtalk.com/robot/send?access_token=x"
                              },
                              "maintainer":  0,
                              "create_at":  1708948099,
                              "create_by":  "root",
                              "update_at":  1708948099,
                              "update_by":  "root",
                              "admin":  false
                    },
                    {
                              "id":  1,
                              "username":  "root",
                              "nickname":  "超管",
                              "phone":  "",
                              "email":  "",
                              "portrait":  "",
                              "roles":  [
                                        "Admin"
                              ],
                              "contacts":  {

                              },
                              "maintainer":  0,
                              "create_at":  1708920315,
                              "create_by":  "system",
                              "update_at":  1708920315,
                              "update_by":  "system",
                              "admin":  true
                    },
                    {
                              "id":  2,
                              "username":  "qinxiaohui",
                              "nickname":  "秦晓辉",
                              "phone":  "",
                              "email":  "qinxiaohui@flashcat.cloud",
                              "portrait":  "",
                              "roles":  [
                                        "Standard"
                              ],
                              "contacts":  {

                              },
                              "maintainer":  0,
                              "create_at":  1708921503,
                              "create_by":  "root",
                              "update_at":  1708921503,
                              "update_by":  "root",
                              "admin":  false
                    }
          ],
          "last_eval_time":  1708999492,
          "last_sent_time":  1708999492,
          "notify_cur_number":  1,
          "first_trigger_time":  1708999492,
          "extra_config":  null,
          "status":  0,
          "claimant":  "",
          "sub_rule_id":  0,
          "extra_info":  null
}

其中包含了以下字段

"notify_groups_obj":  [
                    {
                              "id":  2,
                              "name":  "测试邮件告警的团队",
                              "note":  "",
                              "create_at":  1708921626,
                              "create_by":  "root",
                              "update_at":  1708948109,
                              "update_by":  "root"
                    }
          ],

正是我们需要的json_data.get("notify_groups_obj")[0].get("note"),不过官方提供的回调 JSON示例中,note为空,这个note代表什么呢?

我们打开夜莺的“人员组织–团队列表”,选择一个已有的团队查看一下。

image-20260205150437585

可以看到,团队中有name信息:服务器告警,有更新人信息:root,还有更新时间信息:2026-02-04 17:18:17,正好对应JSON示例中的name、update_by、update_at。截图中还有一个备注,很有可能就是JSON示例中的note字段了。

这边只是结合已知信息进行的猜测,如需落实实际情况,可以获取一下夜莺回调过来的详细数据对比一下就可以了。

至此,消息分组的实现逻辑也已然清晰:在夜莺告警规则中选择指定的告警接收组,在告警接收组的备注信息中填写对应机器人的token,Flask 拉起的 Webhook 服务将接收到的token组装成机器人的webhook地址,然后发送告警/恢复通知。

二、创建云之家告警渠道

云之家机器人与企业微信机器人一样,提供一个webhook地址用来发送消息。所以找了个AI仿写一个Python脚本。

import requests
import json
import sys
from flask import Flask, request
import time

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# 云之家机器人发送消息
def send_yunzhijia_message(webhook_url, alarm_msg):
    """Post a text message to a Yunzhijia group-robot webhook.

    Args:
        webhook_url: Full webhook URL of the robot, token included.
        alarm_msg: Text sent as the ``content`` field of the JSON body.

    Returns:
        ``"200"`` when the HTTP status is 200 and the response JSON has
        ``success`` equal to true; the response's ``errorCode`` (as a
        string) when the robot reports a failure; the HTTP status code (as
        a string) for non-200 responses; ``"500"`` if any exception occurs.
    """
    payload = {"content": alarm_msg}

    try:
        resp = requests.post(url=webhook_url, json=payload, timeout=10)

        # Transport-level failure: report the HTTP status as-is.
        if resp.status_code != 200:
            print(f"HTTP请求失败,状态码: {resp.status_code}")
            print(f"响应内容: {resp.text}")
            return str(resp.status_code)

        body = resp.json()
        if body.get('success') == True:
            return "200"

        # HTTP succeeded but the robot rejected the message.
        print(f"云之家机器人返回错误: {body}")
        return str(body.get('errorCode', '未知错误'))

    except Exception as exc:
        # Best-effort sender: log and report a generic failure code.
        print(f"发送云之家消息异常: {exc}")
        return "500"

@app.route("/webhook/event", methods=['POST'])
def event():
    """Receive an alert-event callback and forward it to a Yunzhijia robot.

    The robot token is taken from the ``note`` field of the first entry in
    ``notify_groups_obj``. The raw payload is appended to
    ``/zscript/alarmweb/jsondata`` and a one-line summary to
    ``/zscript/alarmweb/alarmlog`` before the message is sent. Non-recovery
    events with severity 2 or lower get ``@ALL`` appended.

    Returns:
        The result code from ``send_yunzhijia_message`` as the response
        body; a 400 response for a non-JSON-object body, a missing notify
        group, or an empty robot token; ``"500"`` if anything else fails.
    """
    try:
        # force=True parses the body regardless of Content-Type; silent=True
        # yields None on malformed JSON instead of raising.
        json_data = request.get_json(force=True, silent=True)
        if not isinstance(json_data, dict):
            return "Invalid JSON data", 400

        eid = str(json_data.get("id", ""))
        notify_groups = json_data.get("notify_groups_obj") or []

        if not notify_groups:
            return "No notify groups found", 400

        # A null/empty note would otherwise be sent as the token "None"/"".
        group_wkey = str(notify_groups[0].get("note") or "")
        if not group_wkey:
            return "No robot token in notify group note", 400

        recovered = str(json_data.get("is_recovered", ""))
        severity = str(json_data.get("severity", ""))  # alert severity level
        rule_name = json_data.get("rule_name", "")  # alert rule name

        # Parse the severity once. A missing or non-numeric value falls back
        # to level 3 (general alert) instead of raising later and causing
        # the whole notification to be dropped.
        try:
            severity_level = int(severity)
        except ValueError:
            severity_level = 3

        # Trigger time; also accepts the misspelled "trigger_tim" key.
        trigger_time_value = json_data.get("trigger_time", json_data.get("trigger_tim", 0))
        trigger_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(trigger_time_value))

        trigger_value = json_data.get("trigger_value", "")  # value that triggered the alert
        tags = str(json_data.get("tags", ""))  # alert details

        # Logging; each JSON dump is newline-terminated so that consecutive
        # events do not run together in the file.
        with open("/zscript/alarmweb/jsondata", 'a', encoding='utf-8') as f:
            f.write(json.dumps(json_data, indent=4, ensure_ascii=False) + "\n")
        with open("/zscript/alarmweb/alarmlog", 'a', encoding='utf-8') as f:
            f.write(f"{rule_name} {trigger_time} {eid} {recovered}\n")

        # Pick the title: recovery notice, or an alert title by severity.
        if recovered == 'True':
            title = '🚀 恢复通知'
        elif severity_level == 1:
            title = '🔥 紧急告警'
        elif severity_level == 2:
            title = '⚠️  重要告警'
        else:
            title = '📢 一般告警'

        # Message body; the robot only needs the "content" field.
        message = f'''{title}
事件ID: {eid}
告警级别: {severity}
告警规则: {rule_name}
告警时间: {trigger_time}
触发数值: {trigger_value}
告警标签: {tags}
'''
        # Urgent and important alerts (not recoveries) mention everyone.
        if recovered != 'True' and severity_level <= 2:
            message += "\n@ALL"

        # Send the Yunzhijia message.
        yunzhijia_webhook_url = f"https://www.yunzhijia.com/gateway/robot/webhook/send?yzjtype=0&yzjtoken={group_wkey}"
        rcode = send_yunzhijia_message(webhook_url=yunzhijia_webhook_url, alarm_msg=message)
        return rcode

    except Exception as e:
        # Top-level boundary: log the failure with its traceback and report
        # a generic error code to the caller.
        print(f"处理事件异常: {e}")
        import traceback
        traceback.print_exc()
        return "500"


@app.route("/health", methods=['GET'])
def health_check():
    """Liveness endpoint: always answers with a fixed status string."""
    status_text = "云之家机器人服务运行正常"
    return status_text

if __name__ == "__main__":
    # Script entry point: listen on every interface, TCP port 18001.
    listen_opts = {"host": "0.0.0.0", "port": 18001}
    app.run(**listen_opts)

查看效果:
image-20260205150242911

Done!

Logo

DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐