kube-prometheus-stack
Useful Links:
Artifact Hub - kube-prometheus-stack
Prometheus stack
Prerequisites
- Helm installed
- Add the prometheus-community Helm repo:
Bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
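To confirm the chart is now visible from the repo, an optional quick check:
Bash
helm search repo prometheus-community/kube-prometheus-stack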
Persistent volume on EFS (AWS)
- prometheus-vol.yaml
YAML
---
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: prometheus-vol
provisioner: efs.csi.aws.com
parameters:
  provisioningMode: efs-ap
  fileSystemId: fs-009e574887be94064
  directoryPerms: "700"
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-vol-pv
spec:
  capacity:
    storage: 250Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: prometheus-vol
  csi:
    driver: efs.csi.aws.com
    volumeHandle: fs-009e574887be94064:/monitoring/prometheus # Use just / if you do not want to mount the EFS volume and create this path manually (see the sketch below)
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  # The name must be exactly this so the Prometheus StatefulSet picks up the claim
  name: prometheus-kube-prometheus-stack-prometheus-db-prometheus-kube-prometheus-stack-prometheus-0
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: prometheus-vol
  resources:
    requests:
      storage: 250Gi
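If you keep the /monitoring/prometheus sub-path in volumeHandle, that directory has to exist on the EFS file system before Prometheus starts. A minimal sketch, assuming an EC2 instance with amazon-efs-utils installed and network access to the file system (the mount point /mnt/efs is just an example):
Bash
# Temporarily mount the EFS file system and create the sub-path used in volumeHandle
sudo mkdir -p /mnt/efs
sudo mount -t efs fs-009e574887be94064:/ /mnt/efs
sudo mkdir -p /mnt/efs/monitoring/prometheus
sudo chmod 700 /mnt/efs/monitoring/prometheus
sudo umount /mnt/efs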
- Apply the manifest (see the command below)
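Assuming the manifest above is saved as prometheus-vol.yaml (the file name is just an example):
Bash
kubectl create namespace monitoring   # skip if the namespace already exists
kubectl apply -f prometheus-vol.yaml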
Custom Values
YAML
# Choose namespace
namespaceOverride: "monitoring"
prometheus:
  enabled: true
  prometheusSpec:
    disableCompaction: false
    retention: 100d
    retentionSize: ""
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: "prometheus-vol"
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 250Gi
    # CUSTOM SCRAPE CONFIG
    additionalScrapeConfigs:
      - job_name: 'web'
        static_configs:
          - targets:
              - web.default.svc.cluster.local:8100
      - job_name: 'data-io'
        static_configs:
          - targets:
              - data-io.default.svc.cluster.local:8100
      - job_name: 'cloud-config'
        static_configs:
          - targets:
              - cloud-config.default.svc.cluster.local:8100
      - job_name: 'geofence'
        static_configs:
          - targets:
              - geofence.default.svc.cluster.local:8100
      - job_name: 'reporting'
        static_configs:
          - targets:
              - reporting.default.svc.cluster.local:8100
      - job_name: 'rule-engine'
        static_configs:
          - targets:
              - rule-engine.default.svc.cluster.local:8100
      - job_name: 'notification-system'
        static_configs:
          - targets:
              - notification-system.default.svc.cluster.local:8100
      - job_name: 'mqtt2amqp'
        static_configs:
          - targets:
              - mqtt2amqp.default.svc.cluster.local:8100
      - job_name: 'http2amqp'
        static_configs:
          - targets:
              - http2amqp.default.svc.cluster.local:8100
      - job_name: 'job-engine'
        static_configs:
          - targets:
              - job-engine.default.svc.cluster.local:8100
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: "rabbitmq"
        metrics_path: '/metrics'
        scrape_interval: 5s
        static_configs:
          - targets: ["rabbitmq.default.svc.cluster.local:15692"]
      - job_name: "mysql"
        metrics_path: '/metrics'
        scrape_interval: 5s
        static_configs:
          - targets: ["kube-prometheus-stack-prometheus-mysql-exporter:9104"]
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            # The mysqld_exporter host:port
            replacement: kube-prometheus-stack-prometheus-mysql-exporter:9104
      # CASSANDRA
      - job_name: 'cassandra'
        static_configs:
          - targets: ['192.168.27.223:9500']
grafana:
  enabled: true
  adminPassword: Password
alertmanager:
  enabled: true
  config:
    global:
      resolve_timeout: 2m
      smtp_require_tls: true
      smtp_smarthost: 'email-XXXXXXXXXXXXX.com:587'
      smtp_from: '[email protected]'
      smtp_hello: 'From XXXXX Server'
      smtp_auth_username: AXXXXXXXXXXXXBXTPF
      smtp_auth_password: BATAOXXXXXXXXXXXXXXXXXXXXXXXXXXXX/R0/MaG
    route:
      receiver: 'general-alerts-mail'
      group_by: ['job', 'alertname']
      group_wait: 30s
      group_interval: 2m
      repeat_interval: 5m
      routes:
        - match:
            job: node-exporter
          receiver: 'general-alerts-mail'
          group_interval: 1m
          repeat_interval: 5m
          routes:
            - match:
                severity: critical
              receiver: 'general-alerts-mail'
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname']
    # Who's going to receive emails.
    receivers:
      - name: 'general-alerts-mail'
        email_configs:
          - to: '[email protected]'
            send_resolved: true
defaultRules:
  create: true
  rules:
    alertmanager: false
    etcd: false
    configReloaders: false
    general: false
    k8sContainerCpuUsageSecondsTotal: true
    k8sContainerMemoryCache: true
    k8sContainerMemoryRss: true
    k8sContainerMemorySwap: true
    k8sContainerResource: true
    k8sContainerMemoryWorkingSetBytes: true
    k8sPodOwner: false
    kubeApiserverAvailability: false
    kubeApiserverBurnrate: false
    kubeApiserverHistogram: false
    kubeApiserverSlos: false
    kubeControllerManager: false
    kubelet: false
    kubeProxy: false
    kubePrometheusGeneral: false
    kubePrometheusNodeRecording: false
    kubernetesApps: false
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: false
    kubeSchedulerAlerting: false
    kubeSchedulerRecording: false
    kubeStateMetrics: false
    network: true
    node: true
    nodeExporterAlerting: false
    nodeExporterRecording: false
    prometheus: false
    prometheusOperator: false
    windows: false
  disabled:
    KubeAPIDown: true
    NodeRAIDDegraded: true
additionalPrometheusRulesMap:
  rule-name:
    groups:
      - name: Wolkabout
        rules:
          # NODE
          # Node memory is filling up (< 10% left)
          - alert: HostOutOfMemory
            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Host out of memory (instance {{ $labels.instance }})
              description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # DISK
          # Please add ignored mountpoints in node_exporter parameters like
          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
          # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
          - alert: HostOutOfDiskSpace
            expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Host out of disk space (instance {{ $labels.instance }})
              description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # Please add ignored mountpoints in node_exporter parameters like
          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
          # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
          - alert: HostDiskWillFillIn24Hours
            expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
              description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # CONTAINERS
          # A container has disappeared
          # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
          - alert: ContainerKilled
            expr: time() - container_last_seen > 60
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Container killed (instance {{ $labels.instance }})
              description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # Container CPU utilization is above 80%
          - alert: ContainerHighCpuUtilization
            expr: sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!=""}) by (container, pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=""}) by (container, pod) * 100 > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Container High CPU utilization (instance {{ $labels.instance }})
              description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # Container Memory usage is above 80%
          # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
          - alert: ContainerHighMemoryUsage
            expr: sum(container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", container!="", image!=""}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""}) by (container) * 100 > 80
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Container High Memory usage (instance {{ $labels.instance }})
              description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # Container is being throttled
          - alert: ContainerHighThrottleRate
            expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Container high throttle rate (instance {{ $labels.instance }})
              description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          # # Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
          # - alert: ContainerLowCpuUtilization
          #   expr: (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20
          #   for: 7d
          #   labels:
          #     severity: info
          #   annotations:
          #     summary: Container Low CPU utilization (instance {{ $labels.instance }})
          #     description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
More about creating custom alert rules here
- Install kube-prometheus-stack (see the command below)
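Assuming the custom values above are saved as values.yaml (the file name is just an example), the install could look like this. The release name must stay kube-prometheus-stack, otherwise the pre-created PVC name from prometheus-vol.yaml will not match the StatefulSet:
Bash
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  -f values.yaml
After the release is up, a quick way to check the custom scrape targets and log in to Grafana is to port-forward the services (names assume the release name above and chart defaults):
Bash
kubectl -n monitoring port-forward svc/kube-prometheus-stack-prometheus 9090:9090   # then open http://localhost:9090/targets
kubectl -n monitoring port-forward svc/kube-prometheus-stack-grafana 3000:80        # then open http://localhost:3000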