AWS CloudWatch告警推送至外部AlertManager
AWS 端配置
想要实现 AWS 的 CloudWatch 产生的告警推送至业务程序,或者直接邮件、短信方式通知到运维成员,通常使用 AWS 的 SNS 服务,但是该套服务组合有一个缺点,当告警异常恢复后,CloudWatch 并不会发送一条恢复的通知,不符合业务上的需求。
在查阅无数文档后,终于发现 AWS 有一个服务可以解决这个问题:使用 EventBridge 去分发事件(参见 AWS 官方文档《告警事件和 EventBridge - Amazon CloudWatch》)。
CloudWatch 告警会默认推送至 EventBridge,但由于 EventBridge 默认都没有配置规则,所以推送到 EventBridge 后没有执行其他动作。因此可以在 EventBridge 创建 2 个规则,分别对应告警发生和告警恢复。
进入 EventBridge 页面,在左边导航栏选择 Rules,创建一个新 rule:

Events 可以选择 All Events,Event pattern 选择 use schema,schema registry 默认只有一个 aws.events, schema 选择 aws.cloudwatch@CloudWatchAlarmStateChange。
在下面的 method 里,找到 state 和 previousState:

分别设置 state.value 和 previousState.value(例如:告警规则将 state.value 设为 ALARM;恢复规则将 state.value 设为 OK、previousState.value 设为 ALARM),然后点击 Generate event pattern in JSON,再进入下一步配置通知目标。

这里一般会提前配置好 SNS 的 subscription,选择要推送的渠道,下一步然后创建即可。
我选择的 SNS subscription 会通过 https 回调到内部的 web 应用去处理告警信息,下一步要做的是解析 EventBridge 的消息体。
解析 EventBridge payload
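payload 格式大概如下所示(注意:此示例是 CloudWatch 告警直接推送到 SNS 时的消息体,对应下文的 AwsAlarmMsg;经 EventBridge 转发的事件则是由 version、id、detail-type、detail 等字段组成的结构,对应下文的 AwsEvent):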
{
"AlarmName": "test-low-available-memory",
"AlarmDescription": "aws rds memory utilization >= 80%",
"AWSAccountId": "xxxxxx",
"AlarmConfigurationUpdatedTimestamp": "2025-08-12T07:49:42.878+0000",
"NewStateValue": "ALARM",
"NewStateReason": "Threshold Crossed: 3 out of the last 3 datapoints [7.066517504E8 (12/08/25 07:45:00), 7.03741952E8 (12/08/25 07:40:00), 7.072145408E8 (12/08/25 07:35:00)] were less than or equal to the threshold (8.0E8) (minimum 3 datapoints for OK -> ALARM transition).",
"StateChangeTime": "2025-08-12T07:50:34.567+0000",
"Region": "EU (Ireland)",
"AlarmArn": "arn:aws:cloudwatch:eu-west-1:xxxx:alarm:test-low-available-memory",
"OldStateValue": "INSUFFICIENT_DATA",
"OKActions": [],
"AlarmActions": ["arn:aws:sns:eu-west-1:xxx:prometheus_alert"],
"InsufficientDataActions": [],
"Trigger": {
"MetricName": "FreeableMemory",
"Namespace": "AWS/RDS",
"StatisticType": "Statistic",
"Statistic": "AVERAGE",
"Unit": null,
"Dimensions": [{
"value": "eu2-2",
"name": "DBInstanceIdentifier"
}],
"Period": 300,
"EvaluationPeriods": 3,
"DatapointsToAlarm": 3,
"ComparisonOperator": "LessThanOrEqualToThreshold",
"Threshold": 8.0E8,
"TreatMissingData": "missing",
"EvaluateLowSampleCountPercentile": ""
}
}
可以选择对自己业务有帮助的字段,再拼接为推送至 AlertManager 的消息体。
// AwsAlarmArgs is the JSON document AWSAlert decodes from an SNS
// "SubscriptionConfirmation" request body. Fields tagged omitempty are left
// out of the encoded JSON when they are empty.
type AwsAlarmArgs struct {
	Type      string `json:"Type"`
	MessageId string `json:"MessageId"`
	Token     string `json:"Token,omitempty"`
	TopicArn  string `json:"TopicArn"`
	Subject   string `json:"Subject,omitempty"`
	Message   string `json:"Message"`

	// SubscribeURL is the URL AWSAlert fetches to confirm a subscription.
	SubscribeURL string `json:"SubscribeURL,omitempty"`

	Timestamp        string `json:"Timestamp"`
	SignatureVersion string `json:"SignatureVersion"`
	Signature        string `json:"Signature"`
	SigningCertURL   string `json:"SigningCertURL"`
	UnsubscribeURL   string `json:"UnsubscribeURL,omitempty"`
}
// AwsAlarmMsg holds the subset of a CloudWatch alarm notification payload
// (AlarmName, NewStateValue, Trigger, ...) that is useful for building an
// alert. JSON keys use the same capitalised names as the payload.
type AwsAlarmMsg struct {
	AlarmName        string `json:"AlarmName"`
	AlarmDescription string `json:"AlarmDescription"`
	AWSAccountId     string `json:"AWSAccountId"`
	NewStateValue    string `json:"NewStateValue"`
	NewStateReason   string `json:"NewStateReason"`
	StateChangeTime  string `json:"StateChangeTime"`
	Region           string `json:"Region"`
	// The tag key was previously misspelled "josn", which left this field
	// without any json tag.
	OldStateValue string          `json:"OldStateValue"`
	Trigger       AwsAlarmTrigger `json:"Trigger"`
}
// AwsAlarmTrigger is the "Trigger" object of a CloudWatch alarm notification:
// the metric, statistic and threshold settings of the alarm.
type AwsAlarmTrigger struct {
	// Metric identity.
	MetricName    string               `json:"MetricName"`
	Namespace     string               `json:"Namespace"`
	StatisticType string               `json:"StatisticType"`
	Statistic     string               `json:"Statistic"`
	Unit          string               `json:"Unit"`
	Dimensions    []AwsAlarmDimensions `json:"Dimensions"`

	// Evaluation settings.
	Period                           int     `json:"Period"`
	EvaluationPeriods                int     `json:"EvaluationPeriods"`
	ComparisonOperator               string  `json:"ComparisonOperator"`
	Threshold                        float32 `json:"Threshold"`
	TreatMissingData                 string  `json:"TreatMissingData"`
	EvaluateLowSampleCountPercentile string  `json:"EvaluateLowSampleCountPercentile"`
}
// AwsEvent is the event document AWSAlert decodes for an SNS "Notification"
// request. Detail carries the alarm name, state and configuration.
type AwsEvent struct {
	Version    string `json:"version"`
	Id         string `json:"id"` // logged by AWSAlert as the trace id
	DetailType string `json:"detail-type"`
	Source     string `json:"source"`
	Account    string `json:"account"`
	Time       string `json:"time"`
	Region     string `json:"region"`

	Resources []string       `json:"resources"`
	Detail    AwsEventDetail `json:"detail"`
}
// AwsEventDetail is the "detail" object of an AwsEvent: the alarm name, its
// current and previous state, and the alarm configuration.
type AwsEventDetail struct {
	AlarmName     string                `json:"alarmName"`
	State         AwsEventState         `json:"state"`
	PreviousState AwsEventPreviousState `json:"previousState"`
	Configuration AwsEventConfiguration `json:"configuration"`
}
// AwsEventState is the current alarm state inside an AwsEventDetail.
// AWSAlert maps Value "ALARM" to a firing alert and "OK" to a resolved one,
// and passes Timestamp to CreateAlert as the alert time.
type AwsEventState struct {
	Value      string `json:"value"`
	Reason     string `json:"reason"`
	ReasonData string `json:"reasonData"`
	Timestamp  string `json:"timestamp"`
}
// AwsEventPreviousState is the alarm state before the change reported by an
// AwsEventDetail. It has the same fields as AwsEventState.
type AwsEventPreviousState struct {
	Value      string `json:"value"`
	Reason     string `json:"reason"`
	ReasonData string `json:"reasonData"`
	Timestamp  string `json:"timestamp"`
}
// AwsEventConfiguration is the "configuration" object of an AwsEventDetail.
// AWSAlert uses Description as the alert description and reads the namespace
// and dimension from the first entry of Metrics.
type AwsEventConfiguration struct {
	Metrics     []AwsEventMetrics `json:"metrics"`
	Description string            `json:"description"`
}
// AwsEventMetrics is one entry of AwsEventConfiguration.Metrics.
type AwsEventMetrics struct {
	Id         string             `json:"id"`
	MetricStat AwsEventMetricStat `json:"metricStat"`
	ReturnData bool               `json:"returnData"`
}
// AwsEventMetricStat is the "metricStat" object of an AwsEventMetrics entry:
// the metric identity plus the period and statistic.
type AwsEventMetricStat struct {
	// Metric identifies the metric by namespace, name and a
	// dimension-name -> dimension-value map.
	Metric struct {
		Namespace  string            `json:"namespace"`
		Name       string            `json:"name"`
		Dimensions map[string]string `json:"dimensions"`
	} `json:"metric"`

	Period int    `json:"period"`
	Stat   string `json:"stat"`
}
// AwsAlarmDimensions is one name/value pair in AwsAlarmTrigger.Dimensions.
type AwsAlarmDimensions struct {
	Name  string `json:"name"`
	Value string `json:"value"`
}
// AwsAlarm bundles a token, a decoded AwsAlarmArgs message and a region.
// NOTE(review): nothing in the visible code uses this type; confirm whether
// it is still needed.
type AwsAlarm struct {
	Token  string
	Args   AwsAlarmArgs
	Region string
}
处理 aws subscription 和 EventBridge 的 payload
// AWSAlert receives the HTTP(S) callback of an Amazon SNS subscription and
// forwards CloudWatch alarm state changes to Alertmanager via CreateAlert.
//
// The x-amz-sns-message-type request header selects the behaviour:
//   - "SubscriptionConfirmation": the SubscribeURL found in the body is
//     fetched to confirm the subscription.
//   - "Notification": the event is decoded and turned into a "firing"
//     (state ALARM) or "resolved" (state OK) alert; other states are ignored.
//   - anything else is logged and nothing is written to the response.
//
// After a known message type has been handled, the JSON body
// {"errcode": 0, "errmsg": "ok"} is written with status 200. A non-nil error
// is returned when the body cannot be read or decoded, or when the
// confirmation request cannot be sent.
//
// NOTE(review): the caller is not authenticated and SubscribeURL is fetched
// exactly as received; consider verifying the SNS message signature and the
// URL host before trusting the request.
// NOTE(review): c.Request() is invoked as a method; confirm that the context
// type in scope exposes the request that way.
func (p *Pusher) AWSAlert(c *gin.Context) (err error) {
	body, err := ioutil.ReadAll(c.Request().Body)
	if err != nil {
		log.Logger.Errorf("read aws request body err:%v", err)
		return err
	}

	// SNS announces the kind of message in this header.
	msgType := c.Request().Header.Get("x-amz-sns-message-type")
	switch msgType {
	case "SubscriptionConfirmation":
		awsArgs := AwsAlarmArgs{}
		if err = json.Unmarshal(body, &awsArgs); err != nil {
			log.Logger.Errorf("json unmarshal err:%v", err)
			return err
		}
		if err = confirmAwsSubscription(awsArgs.SubscribeURL); err != nil {
			return err
		}
	case "Notification":
		var awsEvent AwsEvent
		awsEvent, err = decodeAwsEvent(body)
		if err != nil {
			log.Logger.Errorf("json unmarshal err: body: %v, err: %v", string(body), err)
			return err
		}
		log.Logger.Infof("get aws alarm: traceId: %v, alarm description: %v", awsEvent.Id, awsEvent.Detail.Configuration.Description)

		state := ""
		switch awsEvent.Detail.State.Value {
		case "ALARM":
			state = "firing"
		case "OK":
			state = "resolved"
		}
		if state == "" {
			// CreateAlert only accepts "firing" and "resolved", so any other
			// alarm state is acknowledged without being forwarded.
			log.Logger.Infof("ignore aws alarm state: traceId: %v, state: %v", awsEvent.Id, awsEvent.Detail.State.Value)
			break
		}

		namespace, dimension := awsEventTarget(&awsEvent)
		CreateAlert(awsEvent.Detail.State.Timestamp, awsEvent.Detail.AlarmName, awsEvent.Detail.Configuration.Description, namespace, dimension, state, "error", option.Opt.AmOpt.AwsDs, "aws")
	default:
		// Unknown message type.
		log.Logger.Errorf("unknown msgType:%s", msgType)
		return nil
	}

	c.JSON(http.StatusOK, map[string]interface{}{
		"errcode": 0,
		"errmsg":  "ok",
	})
	return nil
}

// confirmAwsSubscription fetches subscribeURL to confirm an SNS subscription.
// It returns an error only when the request itself fails; a non-200 answer is
// logged but not treated as an error.
func confirmAwsSubscription(subscribeURL string) error {
	resp, err := http.Get(subscribeURL)
	if err != nil {
		log.Logger.Errorf("Get url:%s err:%v", subscribeURL, err)
		return err
	}
	// Close the body so the underlying connection is released.
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		log.Logger.Infof("subscribe aws topic successful, statusCode:%v, status:%s", resp.StatusCode, resp.Status)
	} else {
		log.Logger.Errorf("subscribe aws topic failed, statusCode:%v, status:%s", resp.StatusCode, resp.Status)
	}
	return nil
}

// decodeAwsEvent decodes an SNS notification body into an AwsEvent. When the
// body is an SNS envelope (Type "Notification" with a non-empty Message), the
// event is decoded from the Message string; otherwise the body itself is
// treated as the event, as delivered with raw message delivery.
func decodeAwsEvent(body []byte) (AwsEvent, error) {
	var envelope AwsAlarmArgs
	if err := json.Unmarshal(body, &envelope); err == nil && envelope.Type == "Notification" && envelope.Message != "" {
		body = []byte(envelope.Message)
	}

	var event AwsEvent
	if err := json.Unmarshal(body, &event); err != nil {
		return AwsEvent{}, err
	}
	return event, nil
}

// awsEventTarget returns the namespace of the first metric of the event and
// the value of that metric's dimension with the smallest name. Both are empty
// when the event has no metrics.
func awsEventTarget(event *AwsEvent) (namespace, dimension string) {
	metrics := event.Detail.Configuration.Metrics
	// Normally there is exactly one metric; guard against an empty slice.
	if len(metrics) == 0 {
		return "", ""
	}
	metric := metrics[0].MetricStat.Metric
	namespace = metric.Namespace

	// Map iteration order is random. Pick by smallest key so the firing and
	// the resolved alert of one alarm always carry the same instance value.
	found := false
	minKey := ""
	for k, v := range metric.Dimensions {
		if !found || k < minKey {
			minKey, dimension, found = k, v, true
		}
	}
	return namespace, dimension
}
推送至外部 AlertManager
// AlertManagerArgs is one alert entry of the JSON array that CreateAlert
// posts to Alertmanager.
type AlertManagerArgs struct {
	// StartsAt is the alert start time; omitted from the JSON when empty.
	StartsAt string `json:"startsAt,omitempty"`
	// EndsAt is the alert end time; omitted from the JSON when empty.
	EndsAt string `json:"endsAt,omitempty"`
	// Annotations carries the alert annotations.
	Annotations Annoattions `json:"annotations"`
	// Status is the alert status.
	Status string `json:"status"`
	// Labels carries the alert labels.
	Labels Labels `json:"labels"`
}
// Annoattions holds the annotations of an alert: a short summary and a
// longer description.
type Annoattions struct {
	Summary     string `json:"summary"`
	Description string `json:"description"`
}
// Labels holds the labels CreateAlert attaches to an alert.
type Labels struct {
	Severity   string `json:"severity"`
	Ds         string `json:"ds"`
	AlertGroup string `json:"alertgroup"`
	NameSpace  string `json:"namespace"`
	Instance   string `json:"instance"`
	AlertName  string `json:"alertname"`
}
func CreateAlert(startTimeOrigin string, summary, description, namespace, instance, status, severity, ds, alertGroup string) {
amArgs := AlertManagerArgs{
Status: status,
Annotations: Annoattions{
Summary: summary,
Description: description,
},
Labels: Labels{
Severity: severity,
Ds: ds,
AlertGroup: alertGroup,
NameSpace: namespace,
Instance: instance,
AlertName: summary,
},
}
parseTime, err := time.Parse("2006-01-02T15:04:05.000-0700", startTimeOrigin)
if err != nil {
log.Logger.Errorf("parse time failed: time: %v, err: %v", startTimeOrigin, err)
return
}
if status == "firing" {
amArgs.StartsAt = parseTime.Format("2006-01-02T15:04:05Z")
// 延长endsAt,未加endsAt的告警默认2min后就关闭
amArgs.EndsAt = parseTime.Add(time.Duration(option.Opt.AmOpt.EndTimeout) * time.Hour).Format("2006-01-02T15:04:05Z")
} else if status == "resolved" {
amArgs.EndsAt = parseTime.Format("2006-01-02T15:04:05Z")
} else {
log.Logger.Errorf("status invalid: status: %v", status)
return
}
reqBody := []AlertManagerArgs{amArgs}
reqByte, err := json.Marshal(reqBody)
if err != nil {
log.Logger.Errorf("json marshal failed: err: %v", err)
return
}
req, err := http.NewRequest(http.MethodPost, option.Opt.AmOpt.AmHost, bytes.NewBuffer(reqByte))
if err != nil {
log.Logger.Errorf("new request failed: err: %v", err)
return
}
req.SetBasicAuth(option.Opt.AmOpt.Username, option.Opt.AmOpt.Password)
req.Header.Set("Content-Type", "application/json")
_, err = http.DefaultClient.Do(req)
if err != nil {
log.Logger.Errorf("http do failed: err: %v", err)
return
}
}
代码仅供参考,需要根据实际业务再作修改
参考文档
告警事件和 EventBridge - Amazon CloudWatch