{"data":{"groups":[{"name":"vm-health","type":"prometheus","id":"9421801731252076345","file":"/etc/alerts/alerts.yml","interval":"1m0s","concurrency":1,"params":null,"alerting_rules":[{"id":"15148797443125215729","name":"TooManyRestarts","type":"prometheus","group_id":"9421801731252076345","expression":"changes(process_start_time_seconds{job=~\"victoriametrics|vmagent|vmalert\"}[15m]) \u003e 2","for":"0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:34:30.648608988Z","labels":{"severity":"critical"},"annotations":{"description":"Job {{ $labels.job }} has restarted more than twice in the last 15 minutes. It might be crashlooping.","summary":"{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"}},{"id":"18336758109921887309","name":"ServiceDown","type":"prometheus","group_id":"9421801731252076345","expression":"up{job=~\"victoriametrics|vmagent|vmalert\"} == 0","for":"2m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:34:30.649976064Z","labels":{"severity":"critical"},"annotations":{"description":"{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.","summary":"Service {{ $labels.job }} is down on {{ $labels.instance }}"}},{"id":"15828888860588158234","name":"ProcessNearFDLimits","type":"prometheus","group_id":"9421801731252076345","expression":"(process_max_fds - process_open_fds) \u003c 100","for":"5m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:34:30.651449272Z","labels":{"severity":"critical"},"annotations":{"description":"Exhausting OS file descriptors limit can cause severe degradation of the process. Consider to increase the limit as fast as possible.","summary":"Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"}},{"id":"11362580426449474781","name":"TooHighMemoryUsage","type":"prometheus","group_id":"9421801731252076345","expression":"(process_resident_memory_anon_bytes / vm_available_memory_bytes) \u003e 0.9","for":"5m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:34:30.653291792Z","labels":{"severity":"critical"},"annotations":{"description":"Too high memory usage may result into multiple issues such as OOMs or degraded performance. Consider to either increase available memory or decrease the load on the process.","summary":"It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"}},{"id":"17388543372823140628","name":"TooHighCPUUsage","type":"prometheus","group_id":"9421801731252076345","expression":"rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available \u003e 0.9","for":"5m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:34:30.654081312Z","labels":{"severity":"critical"},"annotations":{"description":"Too high CPU usage may be a sign of insufficient resources and make process unstable. 
Consider to either increase available CPU resources or decrease the load on the process.","summary":"More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"}}],"recording_rules":null},{"name":"vmagent","type":"prometheus","id":"10678267065366597081","file":"/etc/alerts/alerts.yml","interval":"30s","concurrency":2,"params":null,"alerting_rules":[{"id":"14941372497586835904","name":"PersistentQueueIsDroppingData","type":"prometheus","group_id":"10678267065366597081","expression":"sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) \u003e 0","for":"10m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.368392473Z","labels":{"severity":"critical"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=49\u0026var-instance={{ $labels.instance }}","description":"Vmagent dropped {{ $value | humanize1024 }} from persistent queue on instance {{ $labels.instance }} for the last 10m.","summary":"Instance {{ $labels.instance }} is dropping data from persistent queue"}},{"id":"14080245430458445615","name":"RejectedRemoteWriteDataBlocksAreDropped","type":"prometheus","group_id":"10678267065366597081","expression":"sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) \u003e 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.368485527Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=79\u0026var-instance={{ $labels.instance }}","summary":"Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects."}},{"id":"6093091293462635297","name":"TooManyScrapeErrors","type":"prometheus","group_id":"10678267065366597081","expression":"sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) \u003e 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.370287924Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=31\u0026var-instance={{ $labels.instance }}","summary":"Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"}},{"id":"9408732227524010864","name":"TooManyWriteErrors","type":"prometheus","group_id":"10678267065366597081","expression":"(sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)\n+\nsum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) \u003e 0\n","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.370407038Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=77\u0026var-instance={{ $labels.instance }}","summary":"Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."}},{"id":"14174193033363729612","name":"TooManyRemoteWriteErrors","type":"prometheus","group_id":"10678267065366597081","expression":"sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) \u003e 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.371536235Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=61\u0026var-instance={{ $labels.instance }}","description":"Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n Ensure that destination is up and 
reachable.","summary":"Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"}},{"id":"11092998810160254882","name":"RemoteWriteConnectionIsSaturated","type":"prometheus","group_id":"10678267065366597081","expression":"rate(vmagent_remotewrite_send_duration_seconds_total[5m]) \u003e 0.9","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.371580439Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=84\u0026var-instance={{ $labels.instance }}","description":"The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" is saturated by more than 90% and vmagent won't be able to keep up.\n This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.","summary":"Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"}},{"id":"3724483375939275623","name":"PersistentQueueForWritesIsSaturated","type":"prometheus","group_id":"10678267065366597081","expression":"rate(vm_persistentqueue_write_duration_seconds_total[5m]) \u003e 0.9","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.372771065Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=98\u0026var-instance={{ $labels.instance }}","description":"Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.","summary":"Persistent queue writes for instance {{ $labels.instance }} are saturated"}},{"id":"9593711483023016771","name":"PersistentQueueForReadsIsSaturated","type":"prometheus","group_id":"10678267065366597081","expression":"rate(vm_persistentqueue_read_duration_seconds_total[5m]) \u003e 0.9","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.372713814Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=99\u0026var-instance={{ $labels.instance }}","description":"Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.","summary":"Persistent queue reads for instance {{ $labels.instance }} are saturated"}},{"id":"9905990287815795810","name":"SeriesLimitHourReached","type":"prometheus","group_id":"10678267065366597081","expression":"(vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) \u003e 0.9","for":"0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.373811866Z","labels":{"severity":"critical"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=88\u0026var-instance={{ $labels.instance }}","description":"Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. 
Then samples for new time series will be dropped instead of sending them to remote storage systems.","summary":"Instance {{ $labels.instance }} reached 90% of the limit"}},{"id":"5813001511186364654","name":"SeriesLimitDayReached","type":"prometheus","group_id":"10678267065366597081","expression":"(vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) \u003e 0.9","for":"0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:17.373888929Z","labels":{"severity":"critical"},"annotations":{"dashboard":"http://localhost:3000/d/G7Z9GzMGz?viewPanel=90\u0026var-instance={{ $labels.instance }}","description":"Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems.","summary":"Instance {{ $labels.instance }} reached 90% of the limit"}}],"recording_rules":null},{"name":"vmsingle","type":"prometheus","id":"216423836913459264","file":"/etc/alerts/alerts.yml","interval":"30s","concurrency":2,"params":null,"alerting_rules":[{"id":"13267695137958963705","name":"DiskRunsOutOfSpaceIn3Days","type":"prometheus","group_id":"216423836913459264","expression":"vm_free_disk_space_bytes / ignoring(path)\n(\n   (\n    rate(vm_rows_added_to_storage_total[1d]) -\n    ignoring(type) rate(vm_deduplicated_samples_total{type=\"merge\"}[1d])\n   )\n  * scalar(\n    sum(vm_data_size_bytes{type!=\"indexdb\"}) /\n    sum(vm_rows{type!=\"indexdb\"})\n   )\n) \u003c 3 * 24 * 3600\n","for":"30m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.356287082Z","labels":{"severity":"critical"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=73\u0026var-instance={{ $labels.instance }}","description":"Taking into account current ingestion rate, free disk space will be enough only for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space if possible.","summary":"Instance {{ $labels.instance }} will run out of disk space soon"}},{"id":"18044055263708102221","name":"DiskRunsOutOfSpace","type":"prometheus","group_id":"216423836913459264","expression":"sum(vm_data_size_bytes) by(instance) /\n(\n sum(vm_free_disk_space_bytes) by(instance) +\n sum(vm_data_size_bytes) by(instance)\n) \u003e 0.8\n","for":"30m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.35506075Z","labels":{"severity":"critical"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=53\u0026var-instance={{ $labels.instance }}","description":"Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible.","summary":"Instance {{ $labels.instance }} will run out of disk space soon"}},{"id":"9216859099280381622","name":"RequestErrorsToAPI","type":"prometheus","group_id":"216423836913459264","expression":"increase(vm_http_request_errors_total[5m]) \u003e 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.357208157Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=35\u0026var-instance={{ $labels.instance }}","description":"Requests to path {{ $labels.path }} are receiving errors. 
"summary":"Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"}},
{"id":"12888099121128797919","name":"ConcurrentFlushesHitTheLimit","type":"prometheus","group_id":"216423836913459264","expression":"avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.357900188Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}","description":"The limit of concurrent flushes on instance {{ $labels.instance }} is equal to the number of CPUs.\n When VictoriaMetrics constantly hits this limit, the storage is overloaded and requires more CPU.","summary":"VictoriaMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"}},
{"id":"14911000905007095192","name":"TooManyLogs","type":"prometheus","group_id":"216423836913459264","expression":"sum(increase(vm_log_messages_total{level!=\"info\"}[5m])) by (job, instance) > 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.359546977Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}","description":"The logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for the last 15m.\n It is worth checking the logs for specific error messages.","summary":"Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"}},
{"id":"3652446059142022094","name":"RowsRejectedOnIngestion","type":"prometheus","group_id":"216423836913459264","expression":"sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.359230734Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=58&var-instance={{ $labels.instance }}","description":"VM is refusing to ingest rows on \"{{ $labels.instance }}\" for the following reason: \"{{ $labels.reason }}\"","summary":"Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"}},
{"id":"16597393320100697084","name":"TooHighChurnRate","type":"prometheus","group_id":"216423836913459264","expression":"(\n   sum(rate(vm_new_timeseries_created_total[5m])) by(instance)\n   /\n   sum(rate(vm_rows_inserted_total[5m])) by (instance)\n ) > 0.1\n","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.363365275Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}","description":"VM constantly creates new time series on \"{{ $labels.instance }}\".\n This effect is known as churn rate.\n A high churn rate is tightly connected with database performance and may result in unexpected OOMs or slow queries.","summary":"Churn rate is more than 10% on \"{{ $labels.instance }}\" for the last 15m"}},
{"id":"12103807088721046337","name":"TooHighChurnRate24h","type":"prometheus","group_id":"216423836913459264","expression":"sum(increase(vm_new_timeseries_created_total[24h])) by(instance)\n>\n(sum(vm_cache_entries{type=\"storage/hour_metric_ids\"}) by(instance) * 3)\n","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.363337554Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}","description":"The number of new time series created over the last 24h is more than 3x the current number of active series on \"{{ $labels.instance }}\".\n This effect is known as churn rate.\n A high churn rate is tightly connected with database performance and may result in unexpected OOMs or slow queries.","summary":"Too high number of new series on \"{{ $labels.instance }}\" created over last 24h"}},
{"id":"12076262473324980890","name":"TooHighSlowInsertsRate","type":"prometheus","group_id":"216423836913459264","expression":"(\n   sum(rate(vm_slow_row_inserts_total[5m])) by(instance)\n   /\n   sum(rate(vm_rows_inserted_total[5m])) by (instance)\n ) > 0.05\n","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.364261967Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}","description":"A high rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion for the current load. It is likely that more RAM is needed for optimal handling of the current number of active time series.","summary":"Percentage of slow inserts is more than 5% on \"{{ $labels.instance }}\" for the last 15m"}},
{"id":"12806002462829935187","name":"LabelsLimitExceededOnIngestion","type":"prometheus","group_id":"216423836913459264","expression":"sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0","for":"15m0s","last_error":"","last_samples":0,"last_exec":"2026-04-04T05:35:00.364633191Z","labels":{"severity":"warning"},"annotations":{"dashboard":"http://localhost:3000/d/wNf0q_kZk?viewPanel=74&var-instance={{ $labels.instance }}","description":"VictoriaMetrics limits the number of labels per metric via the `-maxLabelsPerTimeseries` command-line flag.\n This prevents ingestion of metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured correctly and that clients sending these metrics aren't misbehaving.","summary":"Metrics ingested on ({{ $labels.instance }}) are exceeding the labels limit"}}
],"recording_rules":null}
]},"status":"success"}