二进制部署Prometheus alertmanager
exporter下载地址:
prometheus.io/download/
prometheus.service:
# Install the Prometheus systemd unit.
# 'EOF' is quoted so the heredoc is taken literally: the trailing backslashes
# reach the unit file as systemd line-continuations instead of being eaten
# by shell line-joining in an unquoted heredoc.
cat > /usr/lib/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
After=network.target

[Service]
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --storage.tsdb.retention.time=30d \
  --web.enable-lifecycle
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
alertmanager.service:
# Install the Alertmanager systemd unit.
# Quoted 'EOF' keeps the backslash continuations literal for systemd.
cat > /usr/lib/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=alertmanager
After=network.target

[Service]
ExecStart=/usr/local/alertmanager/alertmanager \
  --config.file=/usr/local/alertmanager/alertmanager.yml \
  --storage.path=/usr/local/alertmanager/data
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
prometheusalert.service:
# Install the PrometheusAlert systemd unit.
# Default web login: prometheusalert / prometheusalert
cat > /usr/lib/systemd/system/prometheusalert.service <<'EOF'
[Unit]
Description=PrometheusAlert
After=network.target

[Service]
WorkingDirectory=/usr/local/PrometheusAlert
ExecStart=/usr/local/PrometheusAlert/PrometheusAlert
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
node_exporter.service:
# Install the node_exporter systemd unit.
cat > /usr/lib/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=node_exporter
After=network.target

[Service]
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
mysqld_exporter.service:
# Install the mysqld_exporter systemd unit.
# Fixed: Description previously said "node_exporter" (copy-paste error).
# Quoted 'EOF' means a single backslash is enough for the continuation.
cat > /usr/lib/systemd/system/mysqld_exporter.service <<'EOF'
[Unit]
Description=mysqld_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter \
  --config.my-cnf=/usr/local/mysqld_exporter/.my.cnf
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
.my.cnf:
# Write the mysqld_exporter client credentials file.
# NOTE(review): credentials are stored in plain text — restrict permissions
# (chmod 600) and prefer a dedicated read-only monitoring user over root.
cat > .my.cnf <<'EOF'
[client]
host=192.168.0.123
user=root
password=123456
EOF
dingtalk.service:
# Install the prometheus-webhook-dingtalk systemd unit.
cat > /usr/lib/systemd/system/dingtalk.service <<'EOF'
[Unit]
Description=dingtalk
After=network.target

[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
  --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml \
  --web.listen-address=0.0.0.0:8060 \
  --web.enable-ui \
  --web.enable-lifecycle
Restart=on-failure
KillMode=process
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
blackbox_exporter.service:
# Install the blackbox_exporter systemd unit.
cat > /usr/lib/systemd/system/blackbox_exporter.service <<'EOF'
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter \
  --config.file=/usr/local/blackbox_exporter/blackbox.yml \
  --web.listen-address ":9115"
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
blackbox.yml:
# blackbox_exporter probe modules.
modules:
  http_2xx:
    prober: http
    timeout: 8s
    http:
      method: GET
      preferred_ip_protocol: "ip4"  # default is IPv6 unless set explicitly
      ip_protocol_fallback: false
  http_post_2xx:
    prober: http
    http:
      method: POST
      preferred_ip_protocol: "ip4"
      ip_protocol_fallback: false
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
监控elasticsearch:
github.com/prometheus-community/elasticsearch_exporter/releases
elasticsearch_exporter.service:
# Install the elasticsearch_exporter systemd unit.
cat > /etc/systemd/system/elasticsearch_exporter.service <<'EOF'
[Unit]
Description=Elasticsearch Exporter Service
Documentation=https://github.com/prometheus-community/elasticsearch_exporter
After=network.target elasticsearch.service

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter \
  --es.uri=http://localhost:9200 \
  --es.all \
  --es.indices \
  --es.shards
Restart=on-failure
RestartSec=30s
Environment="ES_EXPORTER_OPTS=--log.level=info"
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
EOF
elasticsearch dashboard:
grafana.com/grafana/dashboards/2322
监控mongodb:
https://github.com/percona/mongodb_exporter
创建监控用户:
use admin db.createUser({ user: "prometheus", pwd: "prometheus", roles: [ { role: "read", db: "admin" }, { role: "readAnyDatabase", db: "admin" }, { role: "clusterMonitor", db: "admin" } ] });
mongodb_exporter.service:
# Install the mongodb_exporter systemd unit.
# NOTE(review): StandardOutput=syslog is deprecated on newer systemd
# (use "journal") — kept as in the original notes; confirm target OS.
cat > /etc/systemd/system/mongodb_exporter.service <<'EOF'
[Unit]
Description=MongoDB Exporter
After=network.target
Documentation=https://github.com/percona/mongodb_exporter

[Service]
User=root
Group=root
#EnvironmentFile=/etc/default/mongodb_exporter
ExecStart=/usr/local/mongodb_exporter/mongodb_exporter \
  --mongodb.user=prometheus \
  --mongodb.password=prometheus \
  --mongodb.uri=mongodb://192.168.0.123:27018 \
  --web.listen-address=:9216 \
  --compatible-mode \
  --collector.diagnosticdata \
  --collector.replicasetstatus
Restart=on-failure
RestartSec=5s
TimeoutStopSec=10s
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=mongodb_exporter

[Install]
WantedBy=multi-user.target
EOF
监控postgresql:
下载地址:
github.com/prometheus-community/postgres_exporter
postgres_exporter.service:
# Install the postgres_exporter systemd unit.
# NOTE(review): DSN embeds plaintext credentials — consider moving it to an
# EnvironmentFile with restricted permissions.
cat > /etc/systemd/system/postgres_exporter.service <<'EOF'
[Unit]
Description=Prometheus PostgreSQL Exporter
After=network.target

[Service]
User=root
Group=root
Environment="DATA_SOURCE_NAME=postgresql://postgres:postgres@172.16.43.249:5432/postgres?sslmode=disable"
ExecStart=/usr/local/postgres_exporter/postgres_exporter \
  --web.listen-address=:9187 \
  --log.level=info
Restart=always
RestartSec=30

[Install]
WantedBy=multi-user.target
EOF
prometheus.yml
# Global configuration
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often rules are evaluated; an alert must hold for this period before firing to alertmanager

# Alerting configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.0.19:9093

# Rule files
rule_files:
  - "first_rules.yml"
  - "second_rules.yml"

# Scrape targets
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          region: local  # extra label attached to this target
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['192.168.0.19:9100']
  # File-based service discovery
  - job_name: 'nodes'
    file_sd_configs:
      - files:
          - targets/nodes.yml
        refresh_interval: 60m
  - job_name: 'elasticsearch'
    scrape_interval: 60s
    scrape_timeout: 25s
    metrics_path: /metrics
    static_configs:
      - targets:
          - '172.16.37.126:9114'
    relabel_configs:
      - source_labels: [__address__]
        target_label: cluster
        replacement: '172.16.37.126'
  - job_name: 'mysqld'
    file_sd_configs:
      - files:
          - targets/mysqld.yml
        refresh_interval: 1m
  - job_name: 'mongodb'
    scrape_interval: 60s
    scrape_timeout: 25s
    static_configs:
      - targets:
          - '172.16.43.249:9216'
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'mongodb-primary'
  - job_name: 'postgres'
    static_configs:
      - targets:
          - '172.16.43.249:9187'
targets/nodes.yml
# File-SD target list: one entry per label group.
- labels:
    region: local
    type: virtual
  targets:
    - 10.32.0.12:9100
    - 10.32.0.13:9100
- labels:
    region: dev
    type: physical
  targets:
    - 10.32.0.14:9100
    - 10.32.0.15:9100
重载配置:
curl -X POST http://localhost:9090/-/reload
alertmanager.yml
global:
  resolve_timeout: 5m
  # NOTE(review): port 465 is implicit TLS (SMTPS); verify your Alertmanager
  # version handles it with smtp_require_tls: false, or use port 25/587.
  smtp_smarthost: "smtp.163.com:465"
  smtp_from: "chuxiangyi_com@163.com"
  smtp_auth_username: "chuxiangyi_com@163.com"
  smtp_auth_password: "123456"
  smtp_require_tls: false

route:
  # Alerts sharing these label values are batched into one notification.
  # Grouping does not affect routing; it works together with group_wait.
  group_by: ['alertname', 'instance']
  group_wait: 10s       # wait to batch alerts of the same group into one message
  group_interval: 10s   # interval between notifications for the same group
  repeat_interval: 1h   # re-notify interval while an alert stays unresolved
  receiver: 'email'     # default receiver
  routes:  # child routes: fan out to different receivers
    - match:  # label match; multiple labels are ANDed
        env: prod  # label can be added in prometheus or alertmanager config
      group_by: ['severity']
      receiver: 'prd'
    - match_re:  # regex label match
        cluster: redis|mysql
      receiver: 'dev'

receivers:  # notification channels
  - name: "email"
    email_configs:  # email notifications; requires the global SMTP settings
      - to: "250994559@qq.com"
  - name: 'dev'
    webhook_configs:  # send alerts via webhook
      - url: 'http://127.0.0.1:5001/'
  - name: 'prd'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

inhibit_rules:  # alert inhibition
  - source_match:  # when an alert matching these labels is firing...
      alertname: node_exporter_26
      severity: critical
    target_match:  # ...suppress alerts matching these labels...
      severity: critical
    equal:  # ...provided both alerts share the same value for these labels
      - cluster
重载配置:
# Reload Alertmanager config (same style as the Prometheus reload above).
curl -X POST http://localhost:9093/-/reload
开源告警规则汇总:常用告警规则都有
github.com/samber/awesome-prometheus-alerts
飞书示例:
alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 1m
  repeat_interval: 1m
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # PrometheusAlert gateway forwarding to a Feishu bot webhook.
      - url: 'http://0.0.0.0:8080/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/c534758e-e5db-4dca-8d5e-xxxxxxx'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
grafana:
国内下载源:
mirrors.huaweicloud.com/grafana/
默认用户名和密码:
admin/admin
常用的dashboard图:
https://github.com/starsliao/Prometheus
配置文件在:
/etc/grafana/grafana.ini
Node Exporter的Grafana模版:
英文版本:
https://grafana.com/grafana/dashboards/11074
中文版本:
https://grafana.com/grafana/dashboards/8919
报警规则:
这个规则保存为一个文件,配置到prometheus的配置文件中的rule_files字段下,以列表的形式。
rule_files:
  - "first_rules.yml"
  - "second_rules.yml"
规则:
groups:
  - name: node_exporter
    rules:
      - alert: node_exporter
        expr: up{job="node_exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: node_exporter down
          description: node_exporter down
告警抑制:
inhibit_rules:
  - source_match:
      alertname: node_exporter_26  # alertname is the value defined by rules.alert in the rule file
      severity: critical
    target_match:  # suppressed alerts must match these labels AND share the equal labels' values
      severity: critical
    equal:
      - cluster
如下两个告警实例:
A告警:
groups:
  # Group name contains a space — quoted for safety.
  - name: "node_exporter_26 down"
    rules:
      - alert: node_exporter_26
        expr: up{job="node_exporter-26"} == 0
        for: 1m
        labels:
          severity: critical
          cluster: gitlab
        annotations:
          summary: node_exporter_26 down
          description: node_exporter_26 down
B告警:
groups:
  - name: "haproxy down"
    rules:
      - alert: haproxy
        expr: up{job="haproxy_exporter"} == 0
        for: 1m
        labels:
          severity: critical
          cluster: gitlab
        annotations:
          summary: haproxy down
当有告警时且匹配到source_match下定义的标签时开始记录,当再有告警出现时匹配到target_match中的标签,且这个告警中的标签和equal中的定义的标签名的值要相同时就会抑制告警。如上示例这种A告警出现后B告警就会被抑制。
常用报警规则: 对节点、CPU、内存、磁盘进行监控和报警。
groups:
  - name: example
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} {{ $labels.region }} has been down for more than 5 minutes."
      - alert: MemUsageHigh
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"
      - alert: CPUUsageHigh
        # 100 * (1 - idle-time share over the last minute), per instance
        expr: (1 - (sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)) / (sum(increase(node_cpu_seconds_total[1m])) by (instance))) * 100 > 60
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"
      - alert: DiskUsageHigh
        expr: (1 - (node_filesystem_free_bytes{device=~"/dev.*"} / node_filesystem_size_bytes{device=~"/dev.*"})) * 100 > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High Disk usage on {{ $labels.instance }}"
          description: "{{ $labels.instance }} {{ $labels.region }}"
几个时间参数:
prometheus.yml
global:
  scrape_interval: 15s      # scrape interval
  evaluation_interval: 15s  # rule evaluation interval; alert must hold this long before reaching alertmanager
  scrape_timeout: 10s       # scrape timeout (default 10s)
alertmanager.yml
# route: how alerts are dispatched
route:
  group_by: ['alertname']  # label used to group alerts
  group_wait: 10s          # initial wait to batch a new group
  group_interval: 10s      # interval between notifications of different batches
  repeat_interval: 1m      # re-notify interval for unresolved alerts (default 1h)
prometheus常用配置:
# my global config
global:
  scrape_interval: 30s      # Set the scrape interval. Default is every 1 minute.
  evaluation_interval: 30s  # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'nodes'
    file_sd_configs:
      - files:
          - target/nodes.yml
        refresh_interval: 10m

  # - job_name: 'openvpn-metrics'
  #   scrape_interval: 15s
  #   static_configs:
  #     - targets: ['192.168.199.9:9176']

  # - job_name: 'kubernetes-pods-eureka'
  #   metrics_path: /actuator/prometheus
  #   basic_auth:
  #     username: 'user'
  #     password: '4yDGnT06'
  #   eureka_sd_configs:
  #     - server: 'http://autotest-eureka.demo.com/eureka'
  #   relabel_configs:
  #     - source_labels:
  #         - __meta_eureka_app_name
  #       separator: ;
  #       regex: (.*)
  #       target_label: appname
  #       replacement: $1
  #       action: replace
  #     - action: labelmap
  #       regex: __meta_eureka_app_instance_(.+)
  #     - regex: metadata_user_(.+)
  #       action: labeldrop

  # ICMP reachability via blackbox_exporter
  - job_name: 'ping_status'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets:
          - 192.168.199.136
          - 192.168.199.148
          - 192.168.199.163
          - 192.168.199.186
          - 192.168.199.222
          - 192.168.199.250
        labels:
          group: 'icmp'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9115

  # TCP port checks via blackbox_exporter
  - job_name: 'port_status'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - 101.132.237.99:7500
        labels:
          group: 'tcp'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9115

  # HTTP checks via blackbox_exporter with file-based discovery
  - job_name: "http_status"
    scrape_interval: 1m
    metrics_path: /probe
    params:
      module: [http_2xx]
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - target/http.yml
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9115

  - job_name: 'vmware_vcenter'
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - 'vcenter.demo.com'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9272

  - job_name: 'mysqld'
    file_sd_configs:
      - files: ['/usr/local/prometheus/target/mysqld.yml']
        refresh_interval: 1m

  - job_name: 'redis'
    file_sd_configs:
      - files: ['/usr/local/prometheus/target/redis.yml']
        refresh_interval: 1m

  - job_name: 'windows'
    file_sd_configs:
      - files: ['/usr/local/prometheus/target/windows.yml']
        refresh_interval: 1m

  - job_name: 'portmapping'
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - 'portmapping.demo.com'

  - job_name: 'snmp'
    scrape_interval: 60s
    static_configs:
      - targets:
          - 192.168.101.1  # SNMP device.
          #- switch.local  # SNMP device.
          #- tcp://192.168.1.3:1161  # SNMP device using TCP transport and custom port.
    metrics_path: /snmp
    params:
      auth: [public_v2]
      module: [if_mib]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9116  # The SNMP exporter's real hostname:port.

  # Global exporter-level metrics
  - job_name: 'snmp_exporter'
    static_configs:
      - targets: ['localhost:9116']

  - job_name: 'elasticsearch_exporter'
    scrape_interval: 60s
    scrape_timeout: 25s
    metrics_path: /metrics
    static_configs:
      - targets:
          - '172.16.37.126:9114'
    relabel_configs:
      - source_labels: [__address__]
        target_label: cluster
        replacement: '172.16.37.126'