Use when working with VictoriaMetrics — a time-series database with MetricsQL queries, vmstorage health, vmselect performance, retention management, and cluster monitoring. Covers metric ingestion, cardinality analysis, query optimization, and multi-tenant operations. Use when querying metrics via MetricsQL, analyzing storage health, investigating cardinality, or managing VictoriaMetrics clusters.
Query and manage VictoriaMetrics time-series infrastructure using MetricsQL.
VictoriaMetrics uses Basic auth or Bearer token — injected by connection.
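Single-node endpoint: http://<host>:8428/
Cluster query endpoint (vmselect): http://<vmselect>:8481/select/<accountID>/prometheus/
Cluster ingest endpoint (vminsert): http://<vminsert>:8480/insert/<accountID>/prometheus/
The helper functions below read the base URL from VM_BASE_URL.
Use jq for JSON extraction from the Prometheus-compatible API.
#!/bin/bash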
#######################################
# Run an instant MetricsQL query through the Prometheus-compatible API.
# Globals:   VM_BASE_URL (read) - base URL of the query endpoint
# Arguments: $1 - MetricsQL expression
#            $2 - evaluation time; defaults to the current Unix timestamp
# Outputs:   whatever curl prints for the response on stdout;
#            diagnostics on stderr
# Returns:   1 if VM_BASE_URL is unset or empty, otherwise curl's exit status
#######################################
vm_query() {
  local metricsql="$1"
  local time="${2:-$(date +%s)}"

  if [[ -z "${VM_BASE_URL:-}" ]]; then
    printf 'vm_query: VM_BASE_URL is not set\n' >&2
    return 1
  fi

  # Strip one trailing slash so a base URL written as ".../prometheus/" does
  # not produce a "//api/v1/query" path.
  # -sS: no progress meter, but transfer errors are still reported on stderr.
  curl -sS "${VM_BASE_URL%/}/api/v1/query" \
    --data-urlencode "query=${metricsql}" \
    --data-urlencode "time=${time}"
}
#######################################
# Run a range MetricsQL query through the Prometheus-compatible API.
# Globals:   VM_BASE_URL (read) - base URL of the query endpoint
# Arguments: $1 - MetricsQL expression
#            $2 - range start; defaults to one hour before now (Unix seconds)
#            $3 - range end; defaults to now (Unix seconds)
#            $4 - step between points; defaults to 60
# Outputs:   whatever curl prints for the response on stdout;
#            diagnostics on stderr
# Returns:   1 if VM_BASE_URL is unset or empty, otherwise curl's exit status
#######################################
vm_query_range() {
  local metricsql="$1"
  local start="${2:-$(( $(date +%s) - 3600 ))}"
  local end="${3:-$(date +%s)}"
  local step="${4:-60}"

  if [[ -z "${VM_BASE_URL:-}" ]]; then
    printf 'vm_query_range: VM_BASE_URL is not set\n' >&2
    return 1
  fi

  # Strip one trailing slash so a base URL written as ".../prometheus/" does
  # not produce a "//api/v1/query_range" path.
  # -sS: no progress meter, but transfer errors are still reported on stderr.
  curl -sS "${VM_BASE_URL%/}/api/v1/query_range" \
    --data-urlencode "query=${metricsql}" \
    --data-urlencode "start=${start}" \
    --data-urlencode "end=${end}" \
    --data-urlencode "step=${step}"
}
#######################################
# Fetch an arbitrary API path relative to the configured base URL.
# Globals:   VM_BASE_URL (read) - base URL of the target endpoint
# Arguments: $1 - path to request, including its leading "/"
# Outputs:   whatever curl prints for the response on stdout;
#            diagnostics on stderr
# Returns:   1 if VM_BASE_URL is unset or empty, otherwise curl's exit status
#######################################
vm_api() {
  local endpoint="$1"

  if [[ -z "${VM_BASE_URL:-}" ]]; then
    printf 'vm_api: VM_BASE_URL is not set\n' >&2
    return 1
  fi

  # Strip one trailing slash from the base so base + endpoint never contains
  # a doubled "/" at the join.
  # -sS: no progress meter, but transfer errors are still reported on stderr.
  curl -sS "${VM_BASE_URL%/}${endpoint}"
}
# Fan-out / barrier pattern: start every independent request as a background
# job, then block until all of them have finished.
vm_query "up" &
for status_path in "/api/v1/status/tsdb" "/api/v1/status/active_queries"; do
  vm_api "${status_path}" &
done
wait
NEVER assume metric names or label keys. ALWAYS discover first.
#!/bin/bash
# Discovery pass: list metric names, label names and a TSDB summary so later
# queries use names that actually exist on this instance.

# jq program that condenses the TSDB status response into a short summary.
tsdb_summary_filter='{
totalSeries: .data.totalSeries,
totalLabelValuePairs: .data.totalLabelValuePairs,
seriesCountByMetricName: [.data.seriesCountByMetricName[:10][] | "\(.name): \(.value)"]
}'

printf '%s\n' "=== Available Metric Names (top 20) ==="
vm_api "/api/v1/label/__name__/values" |
  jq -r '.data[:20][]'

printf '\n%s\n' "=== Label Names ==="
vm_api "/api/v1/labels" |
  jq -r '.data[]' |
  head -20

printf '\n%s\n' "=== TSDB Status ==="
vm_api "/api/v1/status/tsdb" |
  jq "${tsdb_summary_filter}"
#!/bin/bash
# Storage health snapshot: TSDB totals, number of active queries, selected
# startup flags, and the metric names owning the most series.
echo "=== VictoriaMetrics Storage Health ==="

# The probes run concurrently, each into its own scratch file. Printing happens
# only after all of them finish, so every section appears under its own header
# instead of racing with the other background jobs.
scratch_dir="$(mktemp -d)" || exit 1

# One TSDB status fetch serves both the totals and the top-series table.
# topN=15 is requested explicitly because the table below shows 15 rows and
# the endpoint returns fewer entries by default.
vm_api "/api/v1/status/tsdb?topN=15" > "${scratch_dir}/tsdb.json" &
vm_api "/api/v1/status/active_queries" > "${scratch_dir}/active_queries.json" &
# /flags may be unavailable; its errors are deliberately discarded.
vm_api "/flags" > "${scratch_dir}/flags.txt" 2>/dev/null &
wait

echo "--- TSDB Stats ---"
jq '{
  totalSeries: .data.totalSeries,
  totalLabelValuePairs: .data.totalLabelValuePairs
}' "${scratch_dir}/tsdb.json"

echo "--- Active Queries ---"
jq -r '.data | length | "Active queries: \(.)"' "${scratch_dir}/active_queries.json"

echo "--- Build Info ---"
grep -E "retentionPeriod|storageDataPath" "${scratch_dir}/flags.txt" | head -5

echo ""
echo "=== Top Series by Metric Name ==="
jq -r '.data.seriesCountByMetricName[:15][] | "\(.name)\t\(.value) series"' \
  "${scratch_dir}/tsdb.json"

rm -rf -- "${scratch_dir}"
#!/bin/bash
# Cardinality report: metric names and label=value pairs owning the most
# series, plus the number of distinct values for the first ten label names.

# Fetch the TSDB status once and reuse it for both tables. topN=15 is requested
# explicitly because each table shows 15 rows and the endpoint returns fewer
# entries by default.
tsdb_json="$(vm_api "/api/v1/status/tsdb?topN=15")"

echo "=== High Cardinality Metrics ==="
jq -r '.data.seriesCountByMetricName | sort_by(-.value)[:15][] | "\(.name)\t\(.value) series"' \
  <<<"${tsdb_json}"

echo ""
echo "=== High Cardinality Labels ==="
jq -r '.data.seriesCountByLabelValuePair | sort_by(-.value)[:15][] | "\(.name)\t\(.value) series"' \
  <<<"${tsdb_json}"

echo ""
echo "=== Label Value Counts ==="
# Read label names line by line rather than word-splitting a command
# substitution, so unusual names are passed through intact.
vm_api "/api/v1/labels" \
  | jq -r '.data[]' \
  | head -10 \
  | while IFS= read -r label; do
      # </dev/null keeps the inner request from consuming the loop's input.
      count="$(vm_api "/api/v1/label/${label}/values" </dev/null | jq '.data | length')"
      echo "${label}: ${count} unique values"
    done \
  | sort -t: -k2 -rn
#!/bin/bash
# Host resource overview from node_exporter metrics: CPU, memory and root
# filesystem usage per instance, highest first, at most 15 rows per section.

echo "=== CPU Usage by Instance ==="
# Busy share is computed as 1 - idle share. Averaging the non-idle mode series
# instead divides the busy time across every mode and under-reports usage.
vm_query '(1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) * 100' \
  | jq -r '.data.result[] | "\(.metric.instance)\t\(.value[1] | tonumber | . * 10 | round / 10)%"' \
  | sort -t$'\t' -k2 -rn | head -15

echo ""
echo "=== Memory Usage ==="
vm_query '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100' \
  | jq -r '.data.result[] | "\(.metric.instance)\t\(.value[1] | tonumber | round)%"' \
  | sort -t$'\t' -k2 -rn | head -15

echo ""
echo "=== Disk Usage ==="
# Sorted like the sections above so the 15 rows kept are the fullest disks,
# not whichever series the API happened to return first.
vm_query '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100' \
  | jq -r '.data.result[] | "\(.metric.instance)\t\(.value[1] | tonumber | round)%"' \
  | sort -t$'\t' -k2 -rn | head -15
#!/bin/bash
# Ingestion and storage activity: rows inserted per second, on-disk data size
# in GB, and merge rate, each broken down by the "type" label.

printf '%s\n' "=== Ingestion Rate ==="
vm_query 'rate(vm_rows_inserted_total[5m])' |
  jq -r '.data.result[] | "\(.metric.type // "total")\t\(.value[1] | tonumber | round) rows/s"'

printf '\n%s\n' "=== Storage Size ==="
# 1073741824 = bytes per GiB; the result is rounded to two decimals.
vm_query 'vm_data_size_bytes' |
  jq -r '.data.result[] | "\(.metric.type // "total")\t\(.value[1] | tonumber / 1073741824 | . * 100 | round / 100)GB"'

printf '\n%s\n' "=== Merge Operations ==="
vm_query 'rate(vm_merges_total[5m])' |
  jq -r '.data.result[] | "\(.metric.type)\t\(.value[1] | tonumber | . * 100 | round / 100) merges/s"'
#!/bin/bash
# Cluster node status: the `up` value of every scrape target whose job name
# contains vmselect, vmstorage or vminsert, grouped by component.
echo "=== Cluster Node Status ==="

# Result lines carry only the instance and its up value, so they can be
# attributed to a component only by appearing under the right header. Each
# query therefore runs in the background into its own scratch file, and the
# sections are printed in a fixed order once all queries have finished.
scratch_dir="$(mktemp -d)" || exit 1

for component in vmselect vmstorage vminsert; do
  vm_query "up{job=~\".*${component}.*\"}" \
    | jq -r '.data.result[] | "\(.metric.instance)\tup:\(.value[1])"' \
    > "${scratch_dir}/${component}" &
done
wait

for component in vmselect vmstorage vminsert; do
  echo "--- ${component} ---"
  cat -- "${scratch_dir}/${component}"
done

rm -rf -- "${scratch_dir}"
Present results as a structured report:
Monitoring Victoria Metrics Report
══════════════════════════════════
Resources discovered: [count]
Resource Status Key Metric Issues
──────────────────────────────────────────────
[name] [ok/warn] [value] [findings]
Summary: [total] resources | [ok] healthy | [warn] warnings | [crit] critical
Action Items: [list of prioritized findings]
Target ≤50 lines of output. Use tables for multi-resource comparisons.
| Shortcut | Counter | Why |
|---|---|---|
| "I'll skip discovery and check known resources" | Always run Phase 1 discovery first | Resource names change, new resources appear — assumed names cause errors |
| "The user only asked for a quick check" | Follow the full discovery → analysis flow | Quick checks miss critical issues; structured analysis catches silent failures |
| "Default configuration is probably fine" | Audit configuration explicitly | Defaults often leave logging, security, and optimization features disabled |
| "Metrics aren't needed for this" | Always check relevant metrics when available | API/CLI responses show current state; metrics reveal trends and intermittent issues |
| "I don't have access to that" | Try the command and report the actual error | Assumed permission failures prevent useful investigation; actual errors are informative |
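- MetricsQL extensions such as range_median and rollup_rate — use them for better accuracy
- Cluster query paths are /select/{accountID}/prometheus/ — include account ID in URL
- Retention is set by the -retentionPeriod flag — not configurable via API at runtime
- Use -dedup.minScrapeInterval for HA Prometheus setups
- Watch vm_series_created_total — high cardinality degrades performance
- Use -search.maxQueryDuration to adjust the query time limit
- Use the step parameter to control the resolution of range queries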