apiVersion: v1
data:
  crunchy-alert-rules-pg.yml: |
    ###
    #
    # Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
    #
    ###

    groups:
    - name: alert-rules
      rules:

      ########## EXPORTER RULES ##########
      - alert: PGExporterScrapeError
        expr: pg_exporter_last_scrape_error > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'


      ########## SYSTEM RULES ##########
      - alert: ExporterDown
        expr: avg_over_time(up[5m]) < 0.5
        for: 10s
        labels:
          service: system
          severity: critical
          severity_num: 300
        annotations:
          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
          summary: 'Prometheus Exporter Service Down'


      ########## POSTGRESQL RULES ##########
      - alert: PGIsUp
        expr: pg_up < 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'


      # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
      #
      # - alert: PGMinimumVersion
      #   expr: ccp_postgresql_version_current < 110005
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: '{{ $labels.job }} is not running at least version 11.5 of PostgreSQL'


      # Whether a system switches from primary to replica or vice versa must be configured per named job.
      # No way to tell what value a system is supposed to be without a rule expression for that specific system
      # 2 to 1 means it changed from primary to replica. 1 to 2 means it changed from replica to primary
      # Set this alert for each system that you want to monitor a recovery status change
      # Below is an example for a target job called "Replica" that watches for the value to change above 1, which means it's no longer a replica
      #
      # - alert: PGRecoveryStatusSwitch_Replica
      #   expr: ccp_is_in_recovery_status{job="Replica"} > 1
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: '{{ $labels.job }} has changed from replica to primary'
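      #
      # A sketch of the complementary rule for a hypothetical target job called "Primary", based on the
      # value semantics documented above (2 = primary, 1 = replica). It fires when the value drops below 2,
      # meaning the system is no longer a primary. Adjust the job label to match your own scrape config.
      #
      # - alert: PGRecoveryStatusSwitch_Primary
      #   expr: ccp_is_in_recovery_status{job="Primary"} < 2
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: '{{ $labels.job }} has changed from primary to replica'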


      # Absence alerts must be configured per named job, otherwise there's no way to know which job is down
      # Below is an example for a target job called "Prod"
      # - alert: PGConnectionAbsent_Prod
      #   expr: absent(ccp_connection_stats_max_connections{job="Prod"})
      #   for: 10s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'


      # Optional monitor for changes to pg_settings (postgresql.conf) system catalog.
      # A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum().
      # If metric returns 0, then NO settings have changed for pg_settings since last known valid state
      # If metric returns 1, then pg_settings have changed since last known valid state
      # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state.
      # - alert: PGSettingsChecksum
      #   expr: ccp_pg_settings_checksum > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Configuration settings on {{ $labels.job }} have changed from previously known valid state. To reset current config to a valid state after alert fires, run monitor.pg_settings_checksum_set_valid().'
      #     summary: 'PGSQL Instance settings checksum'
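      #
      # A sketch of the matching rule for pg_hba.conf changes, assuming the checksum is exposed under the
      # ccp_hba_settings_checksum metric name referenced above; verify the metric name and the corresponding
      # reset function in your exporter's query configuration before enabling it. The alert name below is
      # only illustrative.
      #
      # - alert: PGHbaSettingsChecksum
      #   expr: ccp_hba_settings_checksum > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'pg_hba.conf settings on {{ $labels.job }} have changed from previously known valid state.'
      #     summary: 'PGSQL Instance pg_hba settings checksum'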


      # Monitor for data block checksum failures. Only works in PG12+
      # - alert: PGDataChecksum
      #   expr: ccp_data_checksum_failure > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
      #     summary: 'PGSQL Data Checksum failure'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 300
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 900
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 43200
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 86400
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 1 day.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: DiskFillPredict
        expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
        for: 5m
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          summary: 'Disk predicted to be full in 24 hours'
          description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'

      - alert: PGClusterRoleChange
        expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 75% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: 'PGSQL Instance usage warning'

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 90% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: 'PGSQL Instance size critical'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 5.24288e+07
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 1.048576e+08
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationSlotsInactive
        expr: ccp_replication_slots_active == 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
          summary: 'PGSQL Instance inactive replication slot'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
          summary: 'PGSQL Instance transaction id wraparound imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGArchiveCommandStatus
        expr: ccp_archive_command_status_seconds_since_last_fail > 300
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
          summary: 'Seconds since the last recorded failure of the archive_command'

      - alert: PGSequenceExhaustion
        expr: ccp_sequence_exhaustion_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

      - alert: PGSettingsPendingRestart
        expr: ccp_settings_pending_restart_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

      ########## PGBACKREST RULES ##########
      #
      # Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
      # Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full
      # And differentials are considered incrementals since incrementals will be based off the last diff if one exists
      # This avoids false alerts, for example when you don't run diff/incr backups on the days that you run a full
      # Stanza should also be set if different intervals are expected for each stanza.
      # Otherwise the rule will be applied to all stanzas returned on the target system.
      #
      # Relevant metric names are:
      #   ccp_backrest_last_full_backup_time_since_completion_seconds
      #   ccp_backrest_last_incr_backup_time_since_completion_seconds
      #   ccp_backrest_last_diff_backup_time_since_completion_seconds
      #
      # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
      # Further adjustment may be needed depending on your backup runtimes/schedule.
      #
      # - alert: PGBackRestLastCompletedFull_main
      #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
      #
      # - alert: PGBackRestLastCompletedIncr_main
      #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
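      #
      # A sketch of the equivalent rule for differential backups, using the
      # ccp_backrest_last_diff_backup_time_since_completion_seconds metric listed above. The 129600 second
      # (36 hour) threshold assumes daily differentials plus the same 12 hour buffer as the examples above;
      # adjust the threshold and stanza to match your own schedule.
      #
      # - alert: PGBackRestLastCompletedDiff_main
      #   expr: ccp_backrest_last_diff_backup_time_since_completion_seconds{stanza="main"} > 129600
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Differential backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'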
      #
      #
      # Runtime monitoring is handled with a single metric:
      #
      #   ccp_backrest_last_info_backup_runtime_seconds
      #
      # Runtime monitoring should have the "backup_type" label set.
      # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
      # Stanza should also be set if runtimes per stanza have different expected times
      #
      # - alert: PGBackRestLastRuntimeFull_main
      #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
      #
      # - alert: PGBackRestLastRuntimeDiff_main
      #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
      ##
      #
      ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
      ## An absence alert must be configured explicitly for each target (job) whose backups are being monitored.
      ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
      ## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this
      ## check gives a clearer answer as to what is causing it and that something is wrong with the backups.
      #
      # - alert: PGBackrestAbsentFull_Prod
      #   expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
      #   for: 10s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'

kind: ConfigMap
metadata:
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
    vendor: crunchydata
  name: alertmanager-rules-config