/usr/lib/systemd/system/prometheus.service

[Unit]
Description=Prometheus
 
[Service]
ExecStart=/usr/local/prometheus/prometheus \
--config.file=/usr/local/prometheus/prometheus.yml \
--storage.tsdb.path=/usr/local/prometheus/data \
--storage.tsdb.retention=30d \
--web.enable-lifecycle

Restart=on-failure
KillMode=process
RestartSec=10
 
[Install]
WantedBy=multi-user.target

/usr/lib/systemd/system/alertmanager.service

[Unit]
Description=alertmanager

[Service]
ExecStart=/usr/local/alertmanager/alertmanager \
--config.file=/usr/local/alertmanager/alertmanager.yml \
--storage.path=/usr/local/alertmanager/data

Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target

/usr/lib/systemd/system/node_exporter.service

[Unit]
Description=node_exporter
After=network.target

[Service]
ExecStart=/usr/local/node_exporter/node_exporter

Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target

/usr/lib/systemd/system/dingtalk.service

[Unit]
Description=dingtalk
After=network.target

[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
--config.file=/usr/local/prometheus-webhook-dingtalk/config.yml \
--web.listen-address=0.0.0.0:8060 \
--web.enable-ui \
--web.enable-lifecycle

Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
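
With the unit files in place, reload systemd and enable the services. A minimal sketch; the unit names match the files above:

systemctl daemon-reload
systemctl enable --now prometheus alertmanager node_exporter dingtalk
systemctl status prometheus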


prometheus.yml

# Global configuration
global:
  scrape_interval:     15s # how often targets are scraped
  evaluation_interval: 15s # how often recording/alerting rules are evaluated; an alert must also satisfy its "for" duration before it fires and is sent to Alertmanager

# Alerting configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 192.168.0.19:9093

# Alerting rule files
rule_files:
   - "first_rules.yml"
   - "second_rules.yml"

# Scrape configuration (static targets)
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
      labels:
        region: local # attach an extra label to this target

  - job_name: 'node_exporter'
    static_configs:
    - targets: ['192.168.0.19:9100']
    
# File-based service discovery
  - job_name: 'nodes'
    file_sd_configs:
    - files:
      - targets/nodes.yml
      refresh_interval: 60m
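
Before reloading, the configuration can be validated with promtool, which ships in the same release tarball as the prometheus binary (the path below assumes the install location used above):

/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml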

targets/nodes.yml

- labels:
    region: local
    type: virtual
  targets:
  - 10.32.0.12:9100
  - 10.32.0.13:9100
- labels:
    region: dev
    type: physical
  targets:
  - 10.32.0.14:9100
  - 10.32.0.15:9100

Reload the configuration:

curl -X POST http://localhost:9090/-/reload
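
The reload endpoint is only available because --web.enable-lifecycle is set in the unit file above. To confirm that the targets from the file were picked up, query the HTTP API (grep here is just a quick filter):

curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[a-z]*"'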


alertmanager.yml

global:
  resolve_timeout: 5m
  smtp_smarthost: "smtp.163.com:465"
  smtp_from: "chuxiangyi_com@163.com"
  smtp_auth_username: "chuxiangyi_com@163.com"
  smtp_auth_password: "123456"
  smtp_require_tls: false

route:
  group_by: ['alertname','instance'] # alerts are batched by these labels and delivered together (one notification can carry several alerts); grouping mainly matters in combination with group_wait
  group_wait: 10s # wait this long so that alerts arriving within the window are merged into a single notification, used together with group_by
  group_interval: 10s # wait before notifying about new alerts added to a group that has already been notified
  repeat_interval: 1h # wait before re-sending a notification for alerts that are still unresolved
  receiver: 'email' # default receiver, referenced by name
  routes: # child routes that dispatch matching alerts to different receivers
  - match: # label matchers; multiple labels are combined with logical AND
      env: prod # this label can be added in either the Prometheus or the Alertmanager configuration
    group_by: ['severity'] # group these alerts together for sending
    receiver: 'prd'
  - match_re: # match labels with regular expressions
      cluster: redis|mysql
    receiver: 'dev'

receivers: # how alerts are delivered
- name: "email" # receiver name
  email_configs: # email notifications; SMTP must be configured in the global section
  - to: "250994559@qq.com"
- name: 'dev'
  webhook_configs: # deliver notifications via webhook
  - url: 'http://127.0.0.1:5001/'
- name: 'prd'
  webhook_configs: # deliver notifications via webhook
  - url: 'http://127.0.0.1:5001/'

inhibit_rules: # alert inhibition
  - source_match: # an alert matching these labels acts as the inhibiting source
      alertname: node_exporter_26
      severity: critical
    target_match: # alerts matching these labels are suppressed, provided every label listed under equal has the same value on both alerts
      severity: critical
    equal:
    - cluster
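
The Alertmanager configuration can be checked with amtool, which is shipped in the Alertmanager release tarball (path assumed to match the install above):

/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml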

Reload the configuration:

curl -X POST http://localhost:9093/-/reload
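
To verify routing and the receivers end to end, a test alert can be pushed directly to Alertmanager's v2 API; the labels below are made up for illustration:

curl -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"critical","cluster":"gitlab"}}]'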


Grafana:

Commonly used dashboards:

https://github.com/starsliao/Prometheus

The configuration file is at:

/etc/grafana/grafana.ini
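
A couple of settings that are commonly adjusted in grafana.ini; the values below are only examples, not taken from this setup:

[server]
http_port = 3000

[security]
admin_user = admin

Restart Grafana after editing: systemctl restart grafana-server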

Grafana dashboard templates for Node Exporter:

English version:

https://grafana.com/grafana/dashboards/11074

Chinese version:

https://grafana.com/grafana/dashboards/8919


Alerting rules:

Save the rules in a file and reference that file as a list entry under the rule_files field of the Prometheus configuration.

rule_files:
   - "first_rules.yml"
   - "second_rules.yml"

Rules:

groups:
- name: node_exporter
  rules:
  - alert: node_exporter
    expr: up{job="node_exporter"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: node_exporter down
      description: node_exporter down
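
The rule file can be validated with promtool before reloading Prometheus (the path assumes the rule file sits next to prometheus.yml, since rule_files uses relative paths):

/usr/local/prometheus/promtool check rules /usr/local/prometheus/first_rules.yml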

Alert inhibition:

inhibit_rules:
  - source_match:
      alertname: node_exporter_26 # alertname is the value defined by rules.alert in the alerting rule
      severity: critical
    target_match:
      severity: critical
    equal:
    - cluster

Two example alerts:

Alert A:

groups:
- name: node_exporter_26 down
  rules:
  - alert: node_exporter_26
    expr: up{job="node_exporter-26"} == 0
    for: 1m
    labels:
      severity: critical
      cluster: gitlab
    annotations:
      summary: node_exporter_26 down
      description: node_exporter_26 down

Alert B:

groups:
- name: haproxy down
  rules:
  - alert: haproxy
    expr: up{job="haproxy_exporter"} == 0
    for: 1m
    labels:
      severity: critical
      cluster: gitlab
    annotations:
      summary: haproxy down

When an alert matching the labels under source_match is firing, any other alert that matches the labels under target_match and has the same value for every label listed under equal is inhibited. In the example above, once alert A fires, alert B is suppressed (both carry cluster: gitlab).


Common alerting rules: monitor and alert on instance availability, CPU, memory, and disk usage.

groups:
- name: example
  rules:

  - alert: InstanceDown
    expr: up == 0
    for: 5m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} {{ $labels.region }} has been down for more than 5 minutes."

  - alert: MemUsageHigh
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "{{ $labels.instance }} {{ $labels.region }}"

  - alert: CPUUsageHigh
    expr: (1 - (sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)) / (sum(increase(node_cpu_seconds_total[1m])) by (instance))) * 100 > 60
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "{{ $labels.instance }} {{ $labels.region }}"

  - alert: DiskUsageHigh
    expr: (1 - (node_filesystem_free_bytes{device=~"/dev.*"} / node_filesystem_size_bytes{device=~"/dev.*"})) * 100 > 80
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High Disk usage on {{ $labels.instance }}"
      description: "{{ $labels.instance }} {{ $labels.region }}"
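
Each expr can be tried in the Prometheus expression browser first, or queried over the HTTP API, e.g. the memory expression (localhost is just an example address):

curl -s http://localhost:9090/api/v1/query \
  --data-urlencode 'query=(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80'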


A few time-related parameters:

prometheus.yml

global:
  scrape_interval: 15s # how often targets are scraped
  evaluation_interval: 15s # how often recording/alerting rules are evaluated; an alert must also satisfy its "for" duration before it is sent to Alertmanager
  scrape_timeout: 10s # per-scrape timeout, 10s by default

alertmanager.yml

# route: how alerts are dispatched
route:
  group_by: ['alertname'] # which labels are used to group alerts
  group_wait: 10s # how long to wait so that alerts of the same group are batched into the first notification
  group_interval: 10s # how long to wait before notifying about new alerts added to an already-notified group
  repeat_interval: 1m # how long before a still-unresolved alert is re-notified; the default is 4h
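
A concrete timeline with the values above and a rule that uses for: 5m: the expression must keep returning a result across evaluations (every evaluation_interval) for 5 minutes before the alert fires and is sent to Alertmanager; Alertmanager then waits group_wait (10s) so that alerts of the same group land in one notification; alerts that later join an already-notified group are announced after group_interval (10s); and as long as the alert stays unresolved, the notification is repeated every repeat_interval (1m here).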