Prometheus | Grafana | Alertmanager | Pushgateway
docker-compose.yml
YAML
services:
  prometheus:
    image: "prom/prometheus:v3.0.1"
    labels: [app=prometheus]
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=365d
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/prometheus.rules:/etc/prometheus.rules
      - ./prometheus/data:/prometheus
    extra_hosts:
      - "some_domain_or_hostname:10.12.50.112"
    restart: unless-stopped

  alertmanager:
    image: "prom/alertmanager:v0.27.0"
    labels: [app=alertmanager]
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    restart: unless-stopped

  grafana:
    image: "grafana/grafana:11.4.0"
    labels: [app=grafana]
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=Password
    volumes:
      - ./grafana/data:/var/lib/grafana
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    restart: unless-stopped

  pushgateway:
    image: "prom/pushgateway:v1.10.0"
    labels: [app=pushgateway]
    ports:
      - "9091:9091"
    restart: unless-stopped

networks:
  default:
    external: true
    name: some-net
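The networks block above marks some-net as external, so the network has to exist before the stack is started. A minimal sketch (the default bridge driver is an assumption):
Bash
# Create the shared network that docker-compose.yml expects to already exist.
docker network create some-net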
- Create folders and files
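A sketch of the folder layout implied by the bind mounts in docker-compose.yml; the paths mirror the volumes entries above and should be adjusted if those are changed:
Bash
# Directories for configuration and persistent data, relative to docker-compose.yml.
mkdir -p prometheus/data alertmanager grafana/data grafana/datasources
# Empty config files, to be filled with the contents shown in the next steps.
touch prometheus/prometheus.yml prometheus/prometheus.rules alertmanager/alertmanager.yml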
- prometheus.yml
YAML
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - /etc/prometheus.rules

# Scrape configurations for Prometheus itself and the other components of the stack.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - localhost:9090

  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'pushgateway'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - pushgateway:9091

  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'alertmanager'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - alertmanager:9093

  - job_name: 'node-exporter'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - example1:9100

  # Application backend metrics endpoint on the example1 host.
  - job_name: 'backend'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - example1:8100

  - job_name: 'example1'
    static_configs:
      - targets:
          - example1:8102

  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - example1:9080

  # - job_name: mysql1
  #   static_configs:
  #     - targets: ['mysql1:9104']
  #       labels:
  #         alias: mysql1

  # - job_name: mysql22
  #   static_configs:
  #     - targets: ['mysql22:9104']
  #       labels:
  #         alias: mysql22

  # - job_name: 'cloudwatch-exporter'
  #   # metrics_path defaults to '/metrics'
  #   # scheme defaults to 'http'.
  #   static_configs:
  #     - targets: ['cloudwatch-exporter:9106']
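The pushgateway job above only scrapes whatever has been pushed to the gateway. As a quick end-to-end check, a sample can be pushed with curl (the metric and job names below are arbitrary examples):
Bash
# Push one sample to the Pushgateway; Prometheus picks it up on the next scrape of pushgateway:9091.
echo "example_batch_duration_seconds 12.7" | \
  curl --data-binary @- http://localhost:9091/metrics/job/example_batch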
- prometheus.rules
YAML
groups:
  ### Host Machine State Check (Node-Exporter)
  - name: Server Down
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: warning
          job: node-exporter
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

  ### Host Low Disk Space
  - name: Low Disk Space
    rules:
      - alert: Low Disk Space
        expr: ((node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes < 15 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 30m
        labels:
          severity: warning
          job: node-exporter
        annotations:
          summary: "High disk usage on [{{ $labels.instance }}]"
          description: "Free disk space on the root filesystem is down to {{ $value }}% for more than 30 minutes!"

  ### Host Low on RAM
  - name: Low on RAM
    rules:
      - alert: High RAM Usage
        expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 > 90) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High RAM usage on [{{ $labels.instance }}]"
          description: "Node RAM usage is {{ $value }}% for more than 15 minutes!"

  ### Host High CPU Load
  - name: High CPU Load
    rules:
      - alert: High CPU Load
        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on [{{ $labels.instance }}]"
          description: "Node CPU load is {{ $value }} for more than 10 minutes!"

  ### Docker Metrics and Alerts
  # - name: Docker Containers State
  #   rules:
  #     - alert: Docker Container Issue
  #       expr: absent(container_last_seen)
  #       # expr: time() - container_last_seen > 120
  #       for: 5m
  #       labels:
  #         severity: warning
  #       annotations:
  #         summary: "Container down on [{{ $labels.instance }}]!"
  #         description: "Container [{{ $labels.instance }}] has stopped for over {{ $value }} seconds."
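The rule file can be validated before (re)starting Prometheus; a sketch using the promtool binary bundled in the Prometheus image, against the path the file is mounted at in docker-compose.yml:
Bash
# Check the alerting rules inside the running prometheus container.
docker compose exec prometheus promtool check rules /etc/prometheus.rules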
- alertmanager.yml
Text Only
global:
  smtp_smarthost: 'email-smtp.eu-west-1.amazonaws.com:587'
  smtp_from: '[email protected]'
  smtp_hello: 'From Example Mail Server'
  smtp_auth_username: XXXXXXXXXXXXXXXXX
  smtp_auth_password: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  smtp_require_tls: true
  resolve_timeout: 1m

route:
  receiver: 'general-alerts-mail'
  group_by: ['job', 'alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 30m
  routes:
    - match:
      receiver: 'general-alerts-mail'
      group_interval: 1m
      continue: true
      repeat_interval: 2m
      routes:
        - match:
            severity: critical
          receiver: 'general-alerts-mail'
    - match:
      receiver: 'slack'
      group_interval: 1m
      repeat_interval: 2m
      routes:
        - match:
            severity: critical
          receiver: 'slack'

receivers:
  - name: 'general-alerts-mail'
    email_configs:
      - to: '[email protected]'
        send_resolved: true
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/T5XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
        channel: '#uptime'
        send_resolved: true
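docker-compose.yml also mounts ./grafana/datasources into Grafana's provisioning directory, so Grafana can come up with Prometheus pre-configured. A minimal sketch of such a file (the file name and field values are assumptions following Grafana's datasource provisioning format):
YAML
# ./grafana/datasources/datasources.yml (assumed file name)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090   # service name on the shared Docker network
    isDefault: true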
- Ownership
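The bind-mounted data directories need to be writable by the users the containers run as. A sketch assuming the official images' default UIDs (Grafana 472, Prometheus nobody/65534); verify against the image versions in use:
Bash
# Assumed default UIDs of the official grafana and prometheus images.
sudo chown -R 472:472 grafana/data
sudo chown -R 65534:65534 prometheus/data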
- Run
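With the network, folders, and config files in place, the stack can be started and checked; a sketch:
Bash
# Start everything in the background and confirm the containers are running.
docker compose up -d
docker compose ps
# Basic health checks against the published ports.
curl -s http://localhost:9090/-/healthy     # Prometheus
curl -s http://localhost:9093/-/healthy     # Alertmanager
curl -s http://localhost:3000/api/health    # Grafana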