clank-postgres/postgres-operator-examples-main/kustomize/monitoring/alertmanager-rules-config.yaml

apiVersion: v1
data:
  crunchy-alert-rules-pg.yml: |
    ###
    #
    # Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
    #
    ###

    groups:
    - name: alert-rules
      rules:

    ########## EXPORTER RULES ##########
      - alert: PGExporterScrapeError
        expr: pg_exporter_last_scrape_error > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'


    ########## SYSTEM RULES ##########
      - alert: ExporterDown
        expr: avg_over_time(up[5m]) < 0.5
        for: 10s
        labels:
          service: system
          severity: critical
          severity_num: 300
        annotations:
          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
          summary: 'Prometheus Exporter Service Down'


    ########## POSTGRESQL RULES ##########
      - alert: PGIsUp
        expr: pg_up < 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'


    # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
    #
    #  - alert: PGMinimumVersion
    #    expr:  ccp_postgresql_version_current < 110005
    #    for: 60s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      summary: '{{ $labels.job }} is not running at least version 11.5 of PostgreSQL'


    # Whether a system switches from primary to replica or vice versa must be configured per named job.
    # No way to tell what value a system is supposed to be without a rule expression for that specific system
    # 2 to 1 means it changed from primary to replica. 1 to 2 means it changed from replica to primary
    # Set this alert for each system that you want to monitor a recovery status change
    # Below is an example for a target job called "Replica" and watches for the value to change above 1 which means it's no longer a replica
    #
    #  - alert: PGRecoveryStatusSwitch_Replica
    #    expr: ccp_is_in_recovery_status{job="Replica"} > 1
    #    for: 60s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      summary: '{{ $labels.job }} has changed from replica to primary'


    # Absence alerts must be configured per named job, otherwise there's no way to know which job is down
    # Below is an example for a target job called "Prod"
    #  - alert: PGConnectionAbsent_Prod
    #    expr: absent(ccp_connection_stats_max_connections{job="Prod"})
    #    for: 10s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'


    # Optional monitor for changes to pg_settings (postgresql.conf) system catalog.
    # A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum().
    # If metric returns 0, then NO settings have changed for either pg_settings since last known valid state
    # If metric returns 1, then pg_settings have changed since last known valid state
    # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state.
    #  - alert: PGSettingsChecksum
    #    expr: ccp_pg_settings_checksum > 0
    #    for 60s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      description: 'Configuration settings on {{ $labels.job }} have changed from previously known valid state. To reset current config to a valid state after alert fires, run monitor.pg_settings_checksum_set_valid().'
    #      summary: 'PGSQL Instance settings checksum'


    # Monitor for data block checksum failures. Only works in PG12+
    #  - alert: PGDataChecksum
    #    expr: ccp_data_checksum_failure > 0
    #    for 60s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
    #      summary: 'PGSQL Data Checksum failure'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 300
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 900
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 43200
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 86400
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 1 day.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: DiskFillPredict
        expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
        for: 5m
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          summary: 'Disk predicted to be full in 24 hours'
          description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'

      - alert: PGClusterRoleChange
        expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 75% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: PGSQL Instance usage warning

      - alert: PGDiskSize
        expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.deployment }} over 90% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
          summary: 'PGSQL Instance size critical'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 5.24288e+07
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 1.048576e+08
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationSlotsInactive
        expr: ccp_replication_slots_active == 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
          summary: 'PGSQL Instance inactive replication slot'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
          summary: 'PGSQL Instance transaction id wraparound imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGArchiveCommandStatus
        expr: ccp_archive_command_status_seconds_since_last_fail > 300
        for: 60s
        labels:
            service: postgresql
            severity: critical
            severity_num: 300
        annotations:
            description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
            summary: 'Seconds since the last recorded failure of the archive_command'

      - alert: PGSequenceExhaustion
        expr: ccp_sequence_exhaustion_count > 0
        for: 60s
        labels:
            service: postgresql
            severity: critical
            severity_num: 300
        annotations:
            description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

      - alert: PGSettingsPendingRestart
        expr: ccp_settings_pending_restart_count > 0
        for: 60s
        labels:
            service: postgresql
            severity: critical
            severity_num: 300
        annotations:
            description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

    ########## PGBACKREST RULES ##########
    #
    # Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
    # Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full
    #   And differentials are considered incrementals since incrementals will be based off the last diff if one exists
    #   This avoid false alerts, for example when you don't run diff/incr backups on the days that you run a full
    # Stanza should also be set if different intervals are expected for each stanza.
    #   Otherwise rule will be applied to all stanzas returned on target system if not set.
    #
    # Relevant metric names are:
    #   ccp_backrest_last_full_backup_time_since_completion_seconds
    #   ccp_backrest_last_incr_backup_time_since_completion_seconds
    #   ccp_backrest_last_diff_backup_time_since_completion_seconds
    #
    # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
    #    Further adjustment may be needed depending on your backup runtimes/schedule.
    #
    #  - alert: PGBackRestLastCompletedFull_main
    #    expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
    #    for: 60s
    #    labels:
    #       service: postgresql
    #       severity: critical
    #       severity_num: 300
    #    annotations:
    #       summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
    #
    #  - alert: PGBackRestLastCompletedIncr_main
    #    expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
    #    for: 60s
    #    labels:
    #       service: postgresql
    #       severity: critical
    #       severity_num: 300
    #    annotations:
    #       summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
    #
    #
    # Runtime monitoring is handled with a single metric:
    #
    #   ccp_backrest_last_info_backup_runtime_seconds
    #
    # Runtime monitoring should have the "backup_type" label set.
    #   Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
    # Stanza should also be set if runtimes per stanza have different expected times
    #
    #  - alert: PGBackRestLastRuntimeFull_main
    #    expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
    #    for: 60s
    #    labels:
    #       service: postgresql
    #       severity: critical
    #       severity_num: 300
    #    annotations:
    #       summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
    #
    #  - alert: PGBackRestLastRuntimeDiff_main
    #    expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
    #    for: 60s
    #    labels:
    #       service: postgresql
    #       severity: critical
    #       severity_num: 300
    #    annotations:
    #       summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
    ##
    #
    ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
    ## An absence alert must be configured explicitly for each target (job) that backups are being monitored.
    ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
    ## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this
    ## check gives a clearer answer as to what is causing it and that something is wrong with the backups.
    #
    #  - alert: PGBackrestAbsentFull_Prod
    #    expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
    #    for: 10s
    #    labels:
    #      service: postgresql
    #      severity: critical
    #      severity_num: 300
    #    annotations:
    #      description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'

kind: ConfigMap
metadata:
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
    vendor: crunchydata
  name: alertmanager-rules-config