Set up comprehensive observability for Vast.ai integrations with metrics, traces, and alerts. Use when implementing monitoring for Vast.ai operations, setting up dashboards, or configuring alerting for Vast.ai integration health. Trigger with phrases like "vastai monitoring", "vastai metrics", "vastai observability", "monitor vastai", "vastai alerts", "vastai tracing".
Monitor Vast.ai GPU instance health, utilization, and costs. Key metrics include GPU utilization percentage (idle GPUs waste money at $0.20-$3.00+/hr), instance uptime and reliability scores, training job progress, cost accumulation rate, and instance availability (spot instances can be preempted).
Prerequisite: the vastai CLI tool is installed and authenticated.
# Check GPU utilization across all running instances
vastai show instances --raw | jq '
  .[]
  # Elapsed hours, computed once: the division by 3600 converts seconds to hours
  # (assumes both timestamps are epoch seconds; confirm against the API output).
  | ((.cur_state_time - .start_time) / 3600) as $hours
  | {
      id, gpu_name, num_gpus,
      gpu_util_pct: .gpu_utilization,
      gpu_temp_c: .gpu_temp,
      cost_per_hr: .dph_total,
      hours_running: $hours,
      # Below 10% utilization the whole accrued cost is reported as waste, otherwise 0.
      wasted_if_idle: (if .gpu_utilization < 10 then .dph_total * $hours else 0 end)
    }'
// vastai-cost-monitor.ts
/**
 * Emits per-instance cost and GPU-utilization gauges for every instance
 * returned by `vastaiApi.showInstances()`, followed by one gauge holding the
 * summed hourly rate of all instances.
 *
 * Gauges emitted:
 *  - `vastai_instance_cost_usd`      dph_total * hours since start_time, labelled { id, gpu }
 *  - `vastai_gpu_utilization_pct`    inst.gpu_utilization, labelled { id, gpu }
 *  - `vastai_total_hourly_burn_usd`  sum of dph_total over all instances (no labels)
 *
 * @returns {Promise<void>} Resolves once all gauges have been emitted.
 */
async function monitorCosts() {
  const MS_PER_SECOND = 1000;
  const SECONDS_PER_HOUR = 3600;

  const instances = await vastaiApi.showInstances();

  // Read the clock once so every instance is costed against the same instant.
  // Date.now() is in milliseconds; start_time is compared against it in seconds
  // (assumes start_time is a Unix timestamp in seconds; confirm against the API).
  const nowSeconds = Date.now() / MS_PER_SECOND;

  let totalHourlyCost = 0;
  for (const inst of instances) {
    const hoursRunning = (nowSeconds - inst.start_time) / SECONDS_PER_HOUR;
    const totalCost = inst.dph_total * hoursRunning;
    totalHourlyCost += inst.dph_total;

    emitGauge('vastai_instance_cost_usd', totalCost, { id: inst.id, gpu: inst.gpu_name });
    emitGauge('vastai_gpu_utilization_pct', inst.gpu_utilization, { id: inst.id, gpu: inst.gpu_name });
  }

  emitGauge('vastai_total_hourly_burn_usd', totalHourlyCost);
}
# Report instances below 10% GPU utilization that have been up for more than an hour, costliest first
vastai show instances --raw | jq '
  map(
    # Keep only instances under 10% utilization with more than 3600 s (one hour) elapsed.
    select(.gpu_utilization < 10 and ((.cur_state_time - .start_time) > 3600))
    # Elapsed hours, computed once and reused for the cost estimate
    # (assumes both timestamps are epoch seconds; confirm against the API output).
    | ((.cur_state_time - .start_time) / 3600) as $hours
    | {id, gpu_name, util: .gpu_utilization, hours: $hours, wasted_usd: (.dph_total * $hours)}
  )
  # Negating the sort key orders the result descending, so the largest waste comes first.
  | sort_by(-.wasted_usd)'