AWS CloudWatch告警推送至外部AlertManager


AWS 端配置

想要实现 AWS 的 CloudWatch 产生的告警推送至业务程序,或者直接邮件、短信方式通知到运维成员,通常使用 AWS 的 SNS 服务,但是该套服务组合有一个缺点,当告警异常恢复后,CloudWatch 并不会发送一条恢复的通知,不符合业务上的需求。

在查阅无数文档后,终于发现 AWS 有一个服务可以解决这个问题:使用 EventBridge 去分发事件,详见官方文档《告警事件和 EventBridge - Amazon CloudWatch》。

CloudWatch 告警会默认推送至 EventBridge,但由于 EventBridge 默认都没有配置规则,所以推送到 EventBridge 后没有执行其他动作。因此可以在 EventBridge 创建 2 个规则,分别对应告警发生和告警恢复。

进入 EventBridge 页面,在左边导航栏选择 Rules,创建一个新 rule:

Events 可以选择 All Events,Event pattern 选择 use schema,schema registry 默认只有一个 aws.events, schema 选择 aws.cloudwatch@CloudWatchAlarmStateChange。

在下面的 method 里,找到 state 和 previousState:

分别设置 state.value 和 previousState.value(例如告警发生的规则将 state.value 设为 ALARM,告警恢复的规则将 state.value 设为 OK),然后点击 Generate event pattern in JSON,再进入下一步配置通知目标。

这里一般会提前配置好 SNS 的 subscription,选择要推送的渠道,点击下一步后创建即可。

我选择的 SNS subscription 会通过 https 回调到内部的 web 应用去处理告警信息,下一步要做的是解析 EventBridge 的消息体。

解析 EventBridge payload

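payload 格式大概如下所示(注意:该示例是 CloudWatch 告警直接经 SNS 推送时的消息格式,对应下文的 AwsAlarmMsg;经 EventBridge 转发的事件则是由 version、id、detail-type、detail 等字段组成的结构,对应下文的 AwsEvent):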

{
        "AlarmName": "test-low-available-memory",
        "AlarmDescription": "aws rds memory utilization >= 80%",
        "AWSAccountId": "xxxxxx",
        "AlarmConfigurationUpdatedTimestamp": "2025-08-12T07:49:42.878+0000",
        "NewStateValue": "ALARM",
        "NewStateReason": "Threshold Crossed: 3 out of the last 3 datapoints [7.066517504E8 (12/08/25 07:45:00), 7.03741952E8 (12/08/25 07:40:00), 7.072145408E8 (12/08/25 07:35:00)] were less than or equal to the threshold (8.0E8) (minimum 3 datapoints for OK -> ALARM transition).",
        "StateChangeTime": "2025-08-12T07:50:34.567+0000",
        "Region": "EU (Ireland)",
        "AlarmArn": "arn:aws:cloudwatch:eu-west-1:xxxx:alarm:test-low-available-memory",
        "OldStateValue": "INSUFFICIENT_DATA",
        "OKActions": [],
        "AlarmActions": ["arn:aws:sns:eu-west-1:xxx:prometheus_alert"],
        "InsufficientDataActions": [],
        "Trigger": {
                "MetricName": "FreeableMemory",
                "Namespace": "AWS/RDS",
                "StatisticType": "Statistic",
                "Statistic": "AVERAGE",
                "Unit": null,
                "Dimensions": [{
                        "value": "eu2-2",
                        "name": "DBInstanceIdentifier"
                }],
                "Period": 300,
                "EvaluationPeriods": 3,
                "DatapointsToAlarm": 3,
                "ComparisonOperator": "LessThanOrEqualToThreshold",
                "Threshold": 8.0E8,
                "TreatMissingData": "missing",
                "EvaluateLowSampleCountPercentile": ""
        }
}

可以选择对自己业务有帮助的字段,再拼接为推送至 AlertManager 的消息体。


// AwsAlarmArgs mirrors the JSON envelope that Amazon SNS posts to an HTTP/HTTPS
// endpoint. AWSAlert decodes subscription-confirmation requests into it and
// reads SubscribeURL from the result.
type AwsAlarmArgs struct {
	Type             string `json:"Type"`
	MessageId        string `json:"MessageId"`
	Token            string `json:"Token,omitempty"`
	TopicArn         string `json:"TopicArn"`
	Subject          string `json:"Subject,omitempty"`
	Message          string `json:"Message"`
	// SubscribeURL is the URL AWSAlert fetches to confirm a subscription.
	SubscribeURL     string `json:"SubscribeURL,omitempty"`
	Timestamp        string `json:"Timestamp"`
	SignatureVersion string `json:"SignatureVersion"`
	Signature        string `json:"Signature"`
	SigningCertURL   string `json:"SigningCertURL"`
	UnsubscribeURL   string `json:"UnsubscribeURL,omitempty"`
}

// AwsAlarmMsg maps the CloudWatch alarm message whose sample payload is shown
// earlier in this document (AlarmName, NewStateValue, Trigger, ...). Only the
// fields of interest are mapped; unmapped keys are ignored when decoding.
type AwsAlarmMsg struct {
	AlarmName        string          `json:"AlarmName"`
	AlarmDescription string          `json:"AlarmDescription"`
	AWSAccountId     string          `json:"AWSAccountId"`
	NewStateValue    string          `json:"NewStateValue"`
	NewStateReason   string          `json:"NewStateReason"`
	StateChangeTime  string          `json:"StateChangeTime"`
	Region           string          `json:"Region"`
	// OldStateValue previously carried a misspelled `josn` tag key, which left
	// the field without an explicit JSON name.
	OldStateValue    string          `json:"OldStateValue"`
	Trigger          AwsAlarmTrigger `json:"Trigger"`
}

// AwsAlarmTrigger maps the "Trigger" object of the CloudWatch alarm message
// (see AwsAlarmMsg): the metric, its dimensions and the threshold settings.
//
// NOTE(review): the sample payload also carries "DatapointsToAlarm", which is
// not mapped here, and a null "Unit", which leaves Unit as the empty string.
type AwsAlarmTrigger struct {
	MetricName                       string               `json:"MetricName"`
	Namespace                        string               `json:"Namespace"`
	StatisticType                    string               `json:"StatisticType"`
	Statistic                        string               `json:"Statistic"`
	Unit                             string               `json:"Unit"`
	Dimensions                       []AwsAlarmDimensions `json:"Dimensions"`
	Period                           int                  `json:"Period"`
	EvaluationPeriods                int                  `json:"EvaluationPeriods"`
	ComparisonOperator               string               `json:"ComparisonOperator"`
	// NOTE(review): float32 keeps only about 7 significant decimal digits, so
	// large thresholds (e.g. byte counts) may be rounded; consider float64.
	Threshold                        float32              `json:"Threshold"`
	TreatMissingData                 string               `json:"TreatMissingData"`
	EvaluateLowSampleCountPercentile string               `json:"EvaluateLowSampleCountPercentile"`
}

// AwsEvent maps the top-level event structure (version, id, detail-type,
// source, ...) that AWSAlert decodes from a "Notification" request body.
// AWSAlert logs Id as the trace id and reads the alarm data from Detail.
type AwsEvent struct {
	Version    string         `json:"version"`
	Id         string         `json:"id"`
	DetailType string         `json:"detail-type"`
	Source     string         `json:"source"`
	Account    string         `json:"account"`
	Time       string         `json:"time"`
	Region     string         `json:"region"`
	Resources  []string       `json:"resources"`
	Detail     AwsEventDetail `json:"detail"`
}

// AwsEventDetail maps the "detail" object of an AwsEvent: the alarm name, its
// current and previous state, and the alarm configuration.
type AwsEventDetail struct {
	AlarmName     string                `json:"alarmName"`
	State         AwsEventState         `json:"state"`
	PreviousState AwsEventPreviousState `json:"previousState"`
	Configuration AwsEventConfiguration `json:"configuration"`
}

// AwsEventState maps the "state" object of an alarm event. AWSAlert maps
// Value "ALARM" to "firing" and "OK" to "resolved", and passes Timestamp to
// CreateAlert as the alert time.
type AwsEventState struct {
	Value      string `json:"value"`
	Reason     string `json:"reason"`
	ReasonData string `json:"reasonData"`
	Timestamp  string `json:"timestamp"`
}

// AwsEventPreviousState maps the "previousState" object of an alarm event. It
// has the same fields as AwsEventState; the handler code in this file does not
// read it.
type AwsEventPreviousState struct {
	Value      string `json:"value"`
	Reason     string `json:"reason"`
	ReasonData string `json:"reasonData"`
	Timestamp  string `json:"timestamp"`
}

// AwsEventConfiguration maps the "configuration" object of an alarm event.
// AWSAlert uses Description as the alert description and takes the namespace
// and dimension from the first entry of Metrics.
type AwsEventConfiguration struct {
	Metrics     []AwsEventMetrics `json:"metrics"`
	Description string            `json:"description"`
}

// AwsEventMetrics maps one entry of the "metrics" array in the alarm
// configuration: an id, the metric statistic definition and a returnData flag.
type AwsEventMetrics struct {
	Id         string             `json:"id"`
	MetricStat AwsEventMetricStat `json:"metricStat"`
	ReturnData bool               `json:"returnData"`
}

// AwsEventMetricStat maps the "metricStat" object of a metrics entry: the
// metric identity (namespace, name, dimensions) plus its period and statistic.
type AwsEventMetricStat struct {
	// Metric identifies the metric; Dimensions is decoded as a name-to-value map.
	Metric struct {
		Namespace  string            `json:"namespace"`
		Name       string            `json:"name"`
		Dimensions map[string]string `json:"dimensions"`
	} `json:"metric"`
	Period int    `json:"period"`
	Stat   string `json:"stat"`
}

// AwsAlarmDimensions maps one name/value entry of AwsAlarmTrigger.Dimensions.
type AwsAlarmDimensions struct {
	Name  string `json:"name"`
	Value string `json:"value"`
}

// AwsAlarm bundles a token, an SNS envelope and a region.
// NOTE(review): nothing in the code visible here constructs or reads this
// type; confirm whether it is still needed.
type AwsAlarm struct {
	Token  string
	Args   AwsAlarmArgs
	Region string
}

处理 aws subscription 和 EventBridge 的 payload

// AWSAlert receives an Amazon SNS HTTP(S) callback and forwards CloudWatch
// alarm state changes to Alertmanager through CreateAlert.
//
// The request is dispatched on the x-amz-sns-message-type header:
//   - "SubscriptionConfirmation": the body is decoded as an SNS envelope and
//     its SubscribeURL is fetched to confirm the subscription.
//   - "Notification": the alarm event is decoded and mapped to a "firing"
//     (state ALARM) or "resolved" (state OK) alert.
//   - any other value is logged and ignored without writing a response.
//
// On the first two paths a JSON body {"errcode":0,"errmsg":"ok"} is written
// with HTTP 200. Body-read, JSON-decode and confirmation-request errors are
// logged and returned to the caller.
//
// NOTE(review): the SNS message signature is not verified and SubscribeURL is
// fetched exactly as received, so anyone who can reach this endpoint can make
// the server issue a GET to an arbitrary URL. Validate the signature and the
// URL host before exposing this handler publicly.
func (p *Pusher) AWSAlert(c *gin.Context) (err error) {
	// gin exposes the request as a field (c.Request), not as a method.
	body, err := ioutil.ReadAll(c.Request.Body)
	if err != nil {
		log.Logger.Errorf("read aws request body err:%v", err)
		return err
	}

	msgType := c.Request.Header.Get("x-amz-sns-message-type")
	switch msgType {
	case "SubscriptionConfirmation":
		if err = confirmSNSSubscription(body); err != nil {
			return err
		}
	case "Notification":
		awsEvent, decodeErr := decodeEventBridgeEvent(body)
		if decodeErr != nil {
			log.Logger.Errorf("json unmarshal err: body: %v, err: %v", string(body), decodeErr)
			return decodeErr
		}

		// Any state other than ALARM/OK leaves state empty; CreateAlert
		// rejects it and logs the invalid status.
		state := ""
		switch awsEvent.Detail.State.Value {
		case "ALARM":
			state = "firing"
		case "OK":
			state = "resolved"
		}
		log.Logger.Infof("get aws alarm: traceId: %v, alarm description: %v", awsEvent.Id, awsEvent.Detail.Configuration.Description)

		dimension := ""
		namespace := ""
		// Metrics normally holds one element; guard the index to avoid a panic.
		if metrics := awsEvent.Detail.Configuration.Metrics; len(metrics) > 0 {
			namespace = metrics[0].MetricStat.Metric.Namespace
			// Map iteration order is random, so take the value of the
			// smallest key to keep the instance label stable when a metric
			// has more than one dimension.
			chosenKey, chosen := "", false
			for k, v := range metrics[0].MetricStat.Metric.Dimensions {
				if !chosen || k < chosenKey {
					chosenKey, dimension, chosen = k, v, true
				}
			}
		}
		CreateAlert(awsEvent.Detail.State.Timestamp, awsEvent.Detail.AlarmName, awsEvent.Detail.Configuration.Description, namespace, dimension, state, "error", option.Opt.AmOpt.AwsDs, "aws")
	default:
		// Unknown message type.
		log.Logger.Errorf("unknown msgType:%s", msgType)
		return nil
	}

	c.JSON(http.StatusOK, map[string]interface{}{
		"errcode": 0,
		"errmsg":  "ok",
	})
	return nil
}

// confirmSNSSubscription decodes an SNS subscription-confirmation body and
// fetches its SubscribeURL. A non-200 reply is logged but not treated as an
// error, matching the handler's previous behavior.
func confirmSNSSubscription(body []byte) error {
	awsArgs := AwsAlarmArgs{}
	if err := json.Unmarshal(body, &awsArgs); err != nil {
		log.Logger.Errorf("json unmarshal err:%v", err)
		return err
	}

	subscribeURL := awsArgs.SubscribeURL
	// Bound the outbound call so a slow peer cannot pin the handler forever.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(subscribeURL)
	if err != nil {
		log.Logger.Errorf("Get url:%s err:%v", subscribeURL, err)
		return err
	}
	// Always release the response body so the connection is not leaked.
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		log.Logger.Infof("subscribe aws topic successful, statusCode:%v, status:%s", resp.StatusCode, resp.Status)
	} else {
		log.Logger.Errorf("subscribe aws topic failed, statusCode:%v, status:%s", resp.StatusCode, resp.Status)
	}
	return nil
}

// decodeEventBridgeEvent extracts the alarm event from an SNS notification
// body. When the body is an SNS envelope (Type "Notification" with a non-empty
// Message), the event is decoded from Message; otherwise the body itself is
// decoded, which covers subscriptions with raw message delivery enabled.
func decodeEventBridgeEvent(body []byte) (AwsEvent, error) {
	payload := body
	envelope := AwsAlarmArgs{}
	// A failed envelope decode is not fatal: fall back to decoding the body
	// directly, which reports the error if the body is not valid JSON at all.
	if err := json.Unmarshal(body, &envelope); err == nil && envelope.Type == "Notification" && envelope.Message != "" {
		payload = []byte(envelope.Message)
	}

	awsEvent := AwsEvent{}
	if err := json.Unmarshal(payload, &awsEvent); err != nil {
		return AwsEvent{}, err
	}
	return awsEvent, nil
}

推送至外部 AlertManager


// AlertManagerArgs is one alert entry of the JSON array that CreateAlert
// posts to Alertmanager.
type AlertManagerArgs struct {
	StartsAt    string      `json:"startsAt,omitempty"` // alert start time
	EndsAt      string      `json:"endsAt,omitempty"`   // alert end time
	Annotations Annoattions `json:"annotations"`        // alert annotations
	Status      string      `json:"status"`             // alert status
	Labels      Labels      `json:"labels"`             // alert labels
}

// Annoattions holds the annotations of an alert (summary and description).
// NOTE(review): the type name looks like a misspelling of "Annotations"; it is
// left as is because other code refers to it by this name.
type Annoattions struct {
	Summary     string `json:"summary"`
	Description string `json:"description"`
}

// Labels holds the labels attached to an alert. CreateAlert fills every field
// from its arguments and uses the summary as AlertName.
type Labels struct {
	Severity   string `json:"severity"`
	Ds         string `json:"ds"`
	AlertGroup string `json:"alertgroup"`
	NameSpace  string `json:"namespace"`
	Instance   string `json:"instance"`
	AlertName  string `json:"alertname"`
}

// CreateAlert builds a single alert and POSTs it, as a one-element JSON array,
// to the Alertmanager endpoint configured in option.Opt.AmOpt.AmHost using
// HTTP basic auth.
//
// startTimeOrigin is the state-change time in the layout
// "2006-01-02T15:04:05.000-0700". summary is used both as the summary
// annotation and as the alertname label. status must be "firing" or
// "resolved":
//   - "firing": StartsAt is the given time and EndsAt is that time plus
//     option.Opt.AmOpt.EndTimeout hours.
//   - "resolved": only EndsAt is set, to the given time.
//
// Times are converted to UTC before being written with the "Z" suffix.
// Failures (bad time, bad status, marshal/request errors, non-2xx replies) are
// logged and not returned.
func CreateAlert(startTimeOrigin string, summary, description, namespace, instance, status, severity, ds, alertGroup string) {
	// The trailing "Z" in this layout is a literal, so the time must be in UTC
	// before formatting or the written instant would be wrong.
	const amTimeLayout = "2006-01-02T15:04:05Z"

	amArgs := AlertManagerArgs{
		Status: status,
		Annotations: Annoattions{
			Summary:     summary,
			Description: description,
		},
		Labels: Labels{
			Severity:   severity,
			Ds:         ds,
			AlertGroup: alertGroup,
			NameSpace:  namespace,
			Instance:   instance,
			AlertName:  summary,
		},
	}

	parseTime, err := time.Parse("2006-01-02T15:04:05.000-0700", startTimeOrigin)
	if err != nil {
		log.Logger.Errorf("parse time failed: time: %v, err: %v", startTimeOrigin, err)
		return
	}
	eventTime := parseTime.UTC()

	switch status {
	case "firing":
		amArgs.StartsAt = eventTime.Format(amTimeLayout)
		// Set EndsAt explicitly; per the original author's note, an alert
		// without endsAt is closed after about 2 minutes by default.
		amArgs.EndsAt = eventTime.Add(time.Duration(option.Opt.AmOpt.EndTimeout) * time.Hour).Format(amTimeLayout)
	case "resolved":
		amArgs.EndsAt = eventTime.Format(amTimeLayout)
	default:
		log.Logger.Errorf("status invalid: status: %v", status)
		return
	}

	reqByte, err := json.Marshal([]AlertManagerArgs{amArgs})
	if err != nil {
		log.Logger.Errorf("json marshal failed: err: %v", err)
		return
	}
	req, err := http.NewRequest(http.MethodPost, option.Opt.AmOpt.AmHost, bytes.NewReader(reqByte))
	if err != nil {
		log.Logger.Errorf("new request failed: err: %v", err)
		return
	}
	req.SetBasicAuth(option.Opt.AmOpt.Username, option.Opt.AmOpt.Password)
	req.Header.Set("Content-Type", "application/json")

	// Bound the call so an unresponsive Alertmanager cannot block the caller.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		log.Logger.Errorf("http do failed: err: %v", err)
		return
	}
	// Always release the response body so the connection is not leaked.
	defer resp.Body.Close()

	// A transport-level success can still be an HTTP-level rejection.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		log.Logger.Errorf("push alert failed: statusCode: %v, status: %s", resp.StatusCode, resp.Status)
	}
}

代码仅供参考,需要根据实际业务再作修改

参考文档

告警事件和 EventBridge - Amazon CloudWatch

repost.aws

zhuanlan.zhihu.com

接入CloudWatch告警

HTTP/HTTPS 通知 JSON 格式 - Amazon Simple Notification Service

cloud.tencent.com