apiVersion: v1
data:
  crunchy-alert-rules-pg.yml: |
    ###
    #
    # Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
    #
    ###

    groups:
    - name: alert-rules
      rules:

      ########## EXPORTER RULES ##########
      - alert: PGExporterScrapeError
        expr: pg_exporter_last_scrape_error > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

      ########## SYSTEM RULES ##########
      - alert: ExporterDown
        expr: avg_over_time(up[5m]) < 0.5
        for: 10s
        labels:
          service: system
          severity: critical
          severity_num: 300
        annotations:
          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
          summary: 'Prometheus Exporter Service Down'

      ########## POSTGRESQL RULES ##########
      - alert: PGIsUp
        expr: pg_up < 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

      # Example to check for the current version of PostgreSQL. The metric returns the version that the exporter is running against,
      # so you can set a rule to check for the minimum version you'd like all systems to be on.
      # The number returned is the 6-digit integer representation contained in the setting "server_version_num".
      #
      # - alert: PGMinimumVersion
      #   expr: ccp_postgresql_version_current < 110005
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: '{{ $labels.job }} is not running at least version 11.5 of PostgreSQL'

      # Monitoring for whether a system switches from primary to replica (or vice versa) must be configured per named job.
      # There is no way to tell what value a system is supposed to have without a rule expression for that specific system.
      # 2 to 1 means it changed from primary to replica. 1 to 2 means it changed from replica to primary.
      # Set this alert for each system whose recovery status changes you want to monitor.
      # Below is an example for a target job called "Replica" that watches for the value to rise above 1, which means it is no longer a replica.
      #
      # - alert: PGRecoveryStatusSwitch_Replica
      #   expr: ccp_is_in_recovery_status{job="Replica"} > 1
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: '{{ $labels.job }} has changed from replica to primary'

      # Absence alerts must be configured per named job, otherwise there is no way to know which job is down.
      # Below is an example for a target job called "Prod".
      #
      # - alert: PGConnectionAbsent_Prod
      #   expr: absent(ccp_connection_stats_max_connections{job="Prod"})
      #   for: 10s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'

      # Optional monitor for changes to the pg_settings (postgresql.conf) system catalog.
      # A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum().
      # If the metric returns 0, then NO settings have changed in pg_settings since the last known valid state.
      # If the metric returns 1, then pg_settings have changed since the last known valid state.
      # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state.
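      #
      # The pg_hba.conf variant referenced above can be monitored the same way as the pg_settings rule shown next.
      # The following commented rule is only a sketch: the metric name ccp_pg_hba_checksum and the reset function
      # monitor.pg_hba_checksum_set_valid() are assumptions that mirror the pg_settings naming, so verify both against
      # your exporter's metric output and the monitor schema before enabling it.
      #
      # - alert: PGHBAChecksum
      #   expr: ccp_pg_hba_checksum > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'pg_hba.conf settings on {{ $labels.job }} have changed from the previously known valid state. After reviewing the change, reset the valid state with the assumed function monitor.pg_hba_checksum_set_valid().'
      #     summary: 'PGSQL Instance pg_hba checksum'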
      #
      # - alert: PGSettingsChecksum
      #   expr: ccp_pg_settings_checksum > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Configuration settings on {{ $labels.job }} have changed from previously known valid state. To reset current config to a valid state after alert fires, run monitor.pg_settings_checksum_set_valid().'
      #     summary: 'PGSQL Instance settings checksum'

      # Monitor for data block checksum failures. Only works in PG12+.
      #
      # - alert: PGDataChecksum
      #   expr: ccp_data_checksum_failure > 0
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See the pg_stat_database system catalog for more information.'
      #     summary: 'PGSQL Data Checksum failure'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 300
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 900
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 43200
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 86400
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 1 day.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: DiskFillPredict
        expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
        for: 5m
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          summary: 'Disk predicted to be full in 24 hours'
          description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'

      - alert: PGClusterRoleChange
        expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: '{{ $labels.pg_cluster }} has had a switchover/failover event.
            Please check this cluster for more details'

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 75% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: 'PGSQL Instance usage warning'

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 90% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: 'PGSQL Instance size critical'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 5.24288e+07
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 1.048576e+08
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationSlotsInactive
        expr: ccp_replication_slots_active == 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
          summary: 'PGSQL Instance inactive replication slot'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
          summary: 'PGSQL Instance transaction id wraparound imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 110% of the autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 125% of the autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGArchiveCommandStatus
        expr: ccp_archive_command_status_seconds_since_last_fail > 300
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
          summary: 'Seconds since the last recorded failure of the archive_command'

      - alert: PGSequenceExhaustion
        expr: ccp_sequence_exhaustion_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run the following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

      - alert: PGSettingsPendingRestart
        expr: ccp_settings_pending_restart_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

      ########## PGBACKREST RULES ##########
      #
      # Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
      # Full backups are considered the equivalent of both differentials and incrementals, since both are based on the last full backup,
      # and differentials are considered the equivalent of incrementals, since incrementals will be based on the last diff if one exists.
      # This avoids false alerts, for example when you don't run diff/incr backups on the days that you run a full.
      # The stanza should also be set if different intervals are expected for each stanza.
      # Otherwise the rule will be applied to all stanzas returned by the target system.
      #
      # Relevant metric names are:
      #   ccp_backrest_last_full_backup_time_since_completion_seconds
      #   ccp_backrest_last_incr_backup_time_since_completion_seconds
      #   ccp_backrest_last_diff_backup_time_since_completion_seconds
      #
      # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
      # Further adjustment may be needed depending on your backup runtimes/schedule.
      #
      # - alert: PGBackRestLastCompletedFull_main
      #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
      #
      # - alert: PGBackRestLastCompletedIncr_main
      #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
      #
      # Runtime monitoring is handled with a single metric:
      #   ccp_backrest_last_info_backup_runtime_seconds
      #
      # Runtime monitoring should have the "backup_type" label set.
      # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr).
      # The stanza should also be set if each stanza has a different expected runtime.
      #
      # - alert: PGBackRestLastRuntimeFull_main
      #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
      #
      # - alert: PGBackRestLastRuntimeDiff_main
      #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
      #   for: 60s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
      ##
      ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
      ## An absence alert must be configured explicitly for each target (job) whose backups are being monitored.
      ## Checking for the absence of just the full backup type should be sufficient (no need for diff/incr).
      ## Note that while a failing backrest check command will likely also cause a scrape error alert, this additional
      ## check gives a clearer answer as to what is causing it and shows that something is wrong with the backups.
      #
      # - alert: PGBackrestAbsentFull_Prod
      #   expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
      #   for: 10s
      #   labels:
      #     service: postgresql
      #     severity: critical
      #     severity_num: 300
      #   annotations:
      #     description: 'Backup Full status missing for Prod. Check that the pgbackrest info command is working on the target system.'
kind: ConfigMap
metadata:
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
    vendor: crunchydata
  name: alertmanager-rules-config