# Create the Alertmanager config directory together with its template
# subdirectory (-p creates the missing parent as well).
mkdir -p /etc/alertmanager/template
# Open the main Alertmanager configuration file for editing.
vim /etc/alertmanager/alertmanager.yml
# Alertmanager configuration: SMTP settings, routing tree and receivers.
global:
  resolve_timeout: 5m
  smtp_from: 'i@valarx.com'  # Sender address
  # SMTP server as host:port (for smtp.qq.com the port is 465 or 587)
  smtp_smarthost: 'smtp.office365.com:587'
  smtp_auth_username: 'i@valarx.com'  # SMTP user name
  smtp_auth_password: 'xxxx'  # SMTP authorization code / password
  smtp_require_tls: true
  smtp_hello: 'xxxx'

# Notification template files loaded at startup.
templates:
  - '/etc/alertmanager/template/*.tmpl'

route:
  group_by: ['alertname']  # Group alerts by alert name
  # Wait before the first notification of a new group, so alerts of the
  # same group firing within 5 seconds are sent together.
  group_wait: 5s
  # Wait before notifying about new alerts added to an already-notified group.
  group_interval: 5m
  # Re-send the notification if the alert is still firing after this long.
  repeat_interval: 5m
  # Default receiver for alerts that match no child route.
  receiver: 'email'
  # Child routes: alerts whose `severity` label matches `email` go to the
  # email receiver.
  routes:
    - receiver: email
      match_re:
        severity: email

receivers:
  - name: 'email'
    email_configs:
      - to: 'xxx@qq.com'  # Separate multiple recipients with ','
        send_resolved: true
        html: '{{ template "email.html" . }}'  # Render with the custom template
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'xxxxxxxxxxxxx'  # WeChat Work corp ID
        api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'  # WeChat Work API base URL
        to_party: '2'  # ID of the party (group) to notify
        agent_id: '1000002'  # agent_id of the created application
        api_secret: 'xxxxxxxxxxxxxx'  # Secret generated for the application
        send_resolved: true
# Open the e-mail notification template for editing; its path matches the
# /etc/alertmanager/template/*.tmpl glob.
vim /etc/alertmanager/template/email.tmpl
{{ define "email.html" }}
{{/* Emits one HTML section per alert; each alert is bound to $alert. */}}
{{ range $i, $alert := .Alerts }}
========监控报警==========<br>
告警状态:{{ $alert.Status }}<br>
告警级别:{{ $alert.Labels.severity }}<br>
告警类型:{{ $alert.Labels.alertname }}<br>
告警应用:{{ $alert.Annotations.summary }}<br>
告警主机:{{ $alert.Labels.instance }}<br>
告警详情:{{ $alert.Annotations.description }}<br>
{{/* Reads the "value" annotation; it is empty unless the alert rule sets one. */}}
触发阀值:{{ $alert.Annotations.value }}<br>
{{/* Convert to the process-local time zone before formatting, so the zone-less timestamp is not shown in UTC. */}}
告警时间:{{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}<br>
========end=============<br>
{{ end }}
{{ end }}
# Start Alertmanager in the background with an always-restart policy,
# publishing port 9093 and mounting the host's config directory and
# /etc/localtime into the container.
docker run --detach \
  --name alertmanager \
  --restart always \
  --publish 9093:9093 \
  --volume /etc/alertmanager:/etc/alertmanager \
  --volume /etc/localtime:/etc/localtime \
  prom/alertmanager:latest
global:
  scrape_interval: 5s  # Scrape targets every 5 seconds. Default is every 1 minute.
  evaluation_interval: 5s  # Evaluate rules every 5 seconds. The default is every 1 minute.
  # scrape_timeout is not set here. NOTE(review): the 10s global default is
  # longer than the 5s scrape_interval above - confirm the effective timeout.

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '10.0.4.10:9093'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.rules"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
# Open the alert rules file for editing; its name matches the
# /etc/prometheus/rules/*.rules glob.
vim /etc/prometheus/rules/alerts.rules
这边是我自己创建的两个告警规则:
主机CPU利用率>85%
主机MEM利用率>70%
---
# Host-level alerting rules: CPU and memory utilisation per instance.
groups:
  - name: hostStatsAlert
    rules:
      # Fires when the per-instance average non-idle CPU share stays above
      # 85% for 1 minute. rate() is used instead of irate() so that brief
      # changes between samples do not reset the `for` clause.
      - alert: hostCpuUsageAlert
        expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100 > 85
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
          # Measured value as its own annotation, for notification templates.
          value: '{{ printf "%.2f" $value }}'
      # Fires when used memory (1 - MemAvailable / MemTotal) stays above 70%
      # for 1 minute.
      - alert: hostMemUsageAlert
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 70
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 70% (current value: {{ $value }})"
          # Measured value as its own annotation, for notification templates.
          value: '{{ printf "%.2f" $value }}'
# Restart the Prometheus container.
docker restart prometheus
# For how to install Prometheus with Docker, see: https://www.valarx.com/linux/docker-prometheus.html
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。