Prometheus | Grafana | Alertmanager | Pushgateway
docker-compose.yml
YAML
services:
  prometheus:
    image: "prom/prometheus:v3.0.1"
    labels: [app=prometheus]
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=365d
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/prometheus.rules:/etc/prometheus.rules
      - ./prometheus/data:/prometheus
    extra_hosts:
      - "some_domain_or_hostname:10.12.50.112"
    restart: unless-stopped

  alertmanager:
    image: "prom/alertmanager:v0.27.0"
    labels: [app=alertmanager]
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    restart: unless-stopped

  grafana:
    image: "grafana/grafana:11.4.0"
    labels: [app=grafana]
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=Password
    volumes:
      - ./grafana/data:/var/lib/grafana
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    restart: unless-stopped

  pushgateway:
    image: "prom/pushgateway:v1.10.0"
    labels: [app=pushgateway]
    ports:
      - "9091:9091"
    restart: unless-stopped

networks:
  default:
    external: true
    name: some-net
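The networks block above marks some-net as external, so the network has to exist before the stack is started. A minimal sketch (the default bridge driver is an assumption):
Bash
# Create the shared network that docker-compose.yml expects to already exist.
docker network create some-net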
- Create folders and files
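A sketch of the folder layout implied by the bind mounts in docker-compose.yml; the paths mirror the volumes entries above and should be adjusted if those are changed:
Bash
# Directories for configuration and persistent data, relative to docker-compose.yml.
mkdir -p prometheus/data alertmanager grafana/data grafana/datasources
# Empty config files, to be filled with the contents shown in the next steps.
touch prometheus/prometheus.yml prometheus/prometheus.rules alertmanager/alertmanager.yml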
- prometheus.yml
YAML
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - /etc/prometheus.rules

# Scrape configurations for Prometheus itself and the other components of the stack.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - localhost:9090

  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'pushgateway'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - pushgateway:9091

  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'alertmanager'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - alertmanager:9093

  - job_name: 'node-exporter'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - example1:9100

  # Application backend metrics endpoint on the example1 host.
  - job_name: 'backend'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets:
          - example1:8100

  - job_name: 'example1'
    static_configs:
      - targets:
          - example1:8102

  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - example1:9080

  # - job_name: mysql1
  #   static_configs:
  #     - targets: ['mysql1:9104']
  #       labels:
  #         alias: mysql1

  # - job_name: mysql22
  #   static_configs:
  #     - targets: ['mysql22:9104']
  #       labels:
  #         alias: mysql22

  # - job_name: 'cloudwatch-exporter'
  #   # metrics_path defaults to '/metrics'
  #   # scheme defaults to 'http'.
  #   static_configs:
  #     - targets: ['cloudwatch-exporter:9106']
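The pushgateway job above only scrapes whatever has been pushed to the gateway. As a quick end-to-end check, a sample can be pushed with curl (the metric and job names below are arbitrary examples):
Bash
# Push one sample to the Pushgateway; Prometheus picks it up on the next scrape of pushgateway:9091.
echo "example_batch_duration_seconds 12.7" | \
  curl --data-binary @- http://localhost:9091/metrics/job/example_batch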
- prometheus.rules
YAML
groups:
  ### Host Machine State Check (Node-Exporter)
  - name: Server Down
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: warning
          job: node-exporter
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

  ### Host Low Disk Space
  - name: Low Disk Space
    rules:
      - alert: Low Disk Space
        expr: ((node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes < 15 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 30m
        labels:
          severity: warning
          job: node-exporter
        annotations:
          summary: "High disk usage on [{{ $labels.instance }}]"
          description: "Free disk space on the root filesystem is down to {{ $value }}% for more than 30 minutes!"

  ### Host Low on RAM
  - name: Low on RAM
    rules:
      - alert: High RAM Usage
        expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 > 90) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High RAM usage on [{{ $labels.instance }}]"
          description: "Node RAM usage is {{ $value }}% for more than 15 minutes!"

  ### Host High CPU Load
  - name: High CPU Load
    rules:
      - alert: High CPU Load
        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on [{{ $labels.instance }}]"
          description: "Node CPU load is {{ $value }} for more than 10 minutes!"

  ### Docker Metrics and Alerts
  # - name: Docker Containers State
  #   rules:
  #     - alert: Docker Container Issue
  #       expr: absent(container_last_seen)
  #       # expr: time() - container_last_seen > 120
  #       for: 5m
  #       labels:
  #         severity: warning
  #       annotations:
  #         summary: "Container down on [{{ $labels.instance }}]!"
  #         description: "Container [{{ $labels.instance }}] has stopped for over {{ $value }} seconds."
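The rule file can be validated before (re)starting Prometheus; a sketch using the promtool binary bundled in the Prometheus image, against the path the file is mounted at in docker-compose.yml:
Bash
# Check the alerting rules inside the running prometheus container.
docker compose exec prometheus promtool check rules /etc/prometheus.rules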
- alertmanager.yml
Text Only
global:
  smtp_smarthost: 'email-smtp.eu-west-1.amazonaws.com:587'
  smtp_from: '[email protected]'
  smtp_hello: 'From Example Mail Server'
  smtp_auth_username: XXXXXXXXXXXXXXXXX
  smtp_auth_password: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  smtp_require_tls: true
  resolve_timeout: 1m

route:
  receiver: 'general-alerts-mail'
  group_by: ['job', 'alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 30m
  routes:
    - match:
      receiver: 'general-alerts-mail'
      group_interval: 1m
      continue: true
      repeat_interval: 2m
      routes:
        - match:
            severity: critical
          receiver: 'general-alerts-mail'
    - match:
      receiver: 'slack'
      group_interval: 1m
      repeat_interval: 2m
      routes:
        - match:
            severity: critical
          receiver: 'slack'

receivers:
  - name: 'general-alerts-mail'
    email_configs:
      - to: '[email protected]'
        send_resolved: true
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/T5XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
        channel: '#uptime'
        send_resolved: true
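docker-compose.yml also mounts ./grafana/datasources into Grafana's provisioning directory, so Grafana can come up with Prometheus pre-configured. A minimal sketch of such a file (the file name and field values are assumptions following Grafana's datasource provisioning format):
YAML
# ./grafana/datasources/datasources.yml (assumed file name)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090   # service name on the shared Docker network
    isDefault: true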
- Ownership
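The bind-mounted data directories need to be writable by the users the containers run as. A sketch assuming the official images' default UIDs (Grafana 472, Prometheus nobody/65534); verify against the image versions in use:
Bash
# Assumed default UIDs of the official grafana and prometheus images.
sudo chown -R 472:472 grafana/data
sudo chown -R 65534:65534 prometheus/data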
- Run
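With the network, folders, and config files in place, the stack can be started and checked; a sketch:
Bash
# Start everything in the background and confirm the containers are running.
docker compose up -d
docker compose ps
# Basic health checks against the published ports.
curl -s http://localhost:9090/-/healthy     # Prometheus
curl -s http://localhost:9093/-/healthy     # Alertmanager
curl -s http://localhost:3000/api/health    # Grafana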