"""Wattlytics: co-optimizing performance, energy, and total cost of ownership (TCO) in HPC clusters. An interactive web platform for decision support in the design and operation of GPU-accelerated computing systems. Activation: HPC optimization, energy-performance tradeoff, TCO analysis, cluster decision support.
The escalating computational demands and energy footprint of GPU-accelerated computing systems make it hard to take informed design and operational decisions. Wattlytics is an interactive, browser-based decision-support system that co-optimizes performance, energy consumption, and TCO for HPC clusters."""
import numpy as np
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
@dataclass
class HardwareConfig:
    """Description of a single cluster node as seen by the models.

    The field order doubles as the positional constructor signature, so it
    must not be rearranged.
    """

    # CPU side: core count and clock (GHz) feed the CPU peak-throughput estimate.
    cpu_cores: int
    cpu_freq_ghz: float

    # GPU side: number of accelerators and memory per GPU (GB).
    gpu_count: int
    gpu_memory_gb: float

    # Host memory capacity (GB); the performance model derives its
    # memory-bandwidth estimate from this value.
    memory_gb: float

    # Network bandwidth of the node (Gbit/s, going by the field name).
    interconnect_bw_gbps: float

    # Electrical draw of one node in watts, at idle and at full load.
    power_idle_w: float
    power_max_w: float

    # Purchase price of one node in US dollars.
    cost_per_node_usd: float
@dataclass
class ApplicationProfile:
    """Workload characteristics that drive the performance model.

    The field order doubles as the positional constructor signature, so it
    must not be rearranged.
    """

    # Human-readable label for the workload.
    name: str

    # Arithmetic intensity: floating-point operations per byte moved.
    compute_intensity: float

    # Working-set size in GB.
    memory_footprint_gb: float

    # One of 'point-to-point', 'collective', 'none'.
    communication_pattern: str

    # Parallel efficiency; the performance model raises it to log2(nodes),
    # i.e. it is the fraction of throughput kept per doubling of node count.
    scaling_efficiency: float
class PerformanceModel:
    """Analytic throughput, runtime, power and energy model.

    Combines one hardware description with one application profile.
    Units used throughout: throughput in TFLOP/s, time in seconds, power in
    watts (per node) and energy in kWh (whole cluster).
    """

    # Floating-point operations per cycle per core assumed for the CPU (AVX-512).
    _CPU_FLOPS_PER_CYCLE = 16
    # Approximate peak throughput of a single GPU, in TFLOP/s.
    _GPU_TFLOPS = 20
    # Approximate memory bandwidth (TB/s) credited per GB of node memory.
    _MEM_BW_TBPS_PER_GB = 0.1
    # 1 kWh = 3.6e6 J (watt-seconds).
    _JOULES_PER_KWH = 3.6e6

    def __init__(self, hardware: "HardwareConfig", app: "ApplicationProfile"):
        """Store the hardware description and the application profile."""
        self.hw = hardware
        self.app = app

    def compute_performance(self, num_nodes: int) -> float:
        """Return the sustained cluster throughput in TFLOP/s.

        Each node is limited by the smaller of its compute peak and its
        memory-bandwidth bound (bandwidth x arithmetic intensity). The
        per-node figure is scaled by ``num_nodes`` and, for more than one
        node, by ``scaling_efficiency ** log2(num_nodes)``.

        Raises:
            ValueError: if ``num_nodes`` is smaller than 1.
        """
        if num_nodes < 1:
            raise ValueError(f"num_nodes must be >= 1, got {num_nodes}")

        # cores * GHz * FLOPs/cycle is in GFLOP/s; convert to TFLOP/s so it
        # can be added to the GPU figure, which is already in TFLOP/s.
        cpu_tflops = (
            self.hw.cpu_cores * self.hw.cpu_freq_ghz * self._CPU_FLOPS_PER_CYCLE / 1e3
        )
        gpu_tflops = self.hw.gpu_count * self._GPU_TFLOPS
        node_compute_bound = cpu_tflops + gpu_tflops

        # TB/s * FLOPs/byte is already TFLOP/s, so no extra scale factor.
        mem_bw_tbps = self.hw.memory_gb * self._MEM_BW_TBPS_PER_GB
        node_memory_bound = mem_bw_tbps * self.app.compute_intensity

        # Both bounds are per node, so take the minimum before scaling out;
        # memory bandwidth grows with the node count just like compute does.
        perf = min(node_compute_bound, node_memory_bound) * num_nodes

        if num_nodes > 1:
            perf *= self.app.scaling_efficiency ** np.log2(num_nodes)
        return float(perf)

    def compute_time(self, problem_size: float, num_nodes: int) -> float:
        """Return the runtime in seconds for ``problem_size`` FLOPs.

        A configuration with no throughput never finishes, so ``inf`` is
        returned instead of dividing by zero.
        """
        perf = self.compute_performance(num_nodes)
        if perf <= 0:
            return float('inf')
        return problem_size / (perf * 1e12)  # TFLOP/s -> FLOP/s

    def compute_power(self, utilization: float = 0.8) -> float:
        """Return the power draw of one node in watts.

        Linear interpolation between idle and maximum power at the given
        ``utilization`` (0 gives idle power, 1 gives maximum power).
        """
        idle = self.hw.power_idle_w
        max_p = self.hw.power_max_w
        return idle + utilization * (max_p - idle)

    def compute_energy(self, problem_size: float, num_nodes: int) -> float:
        """Return the energy in kWh used by all nodes to run the problem once."""
        time = self.compute_time(problem_size, num_nodes)
        power = self.compute_power()
        # W * s = J; dividing by 3600 alone would give Wh, not kWh.
        return power * time * num_nodes / self._JOULES_PER_KWH
class TCOModel:
    """Total-cost-of-ownership model: capital plus lifetime operating cost."""

    # Yearly maintenance as a fraction of the capital cost.
    _MAINTENANCE_RATE = 0.1

    def __init__(self, hardware: "HardwareConfig",
                 electricity_cost_per_kwh: float = 0.12,
                 lifetime_years: float = 4):
        """Store the hardware description, the energy price and the lifetime.

        Args:
            hardware: node description; only ``cost_per_node_usd`` is read.
            electricity_cost_per_kwh: electricity price in USD per kWh.
            lifetime_years: number of years the cluster is operated.
        """
        self.hw = hardware
        self.electricity_cost = electricity_cost_per_kwh
        self.lifetime = lifetime_years

    def compute_capital_cost(self, num_nodes: int) -> float:
        """Return the purchase cost in USD of ``num_nodes`` nodes."""
        return self.hw.cost_per_node_usd * num_nodes

    def compute_operational_cost(self, annual_energy_kwh: float,
                                 num_nodes: int = 1) -> float:
        """Return the lifetime operating cost in USD.

        The cost is electricity over the lifetime plus maintenance, where
        maintenance is 10% of the capital cost per year. ``num_nodes``
        defaults to 1 so existing single-argument calls keep their result.
        """
        energy_cost = annual_energy_kwh * self.electricity_cost * self.lifetime
        # Maintenance tracks the capital cost, so it scales with the node count.
        maintenance = (
            self.compute_capital_cost(num_nodes) * self._MAINTENANCE_RATE * self.lifetime
        )
        return energy_cost + maintenance

    def compute_tco(self, num_nodes: int, annual_energy_kwh: float) -> float:
        """Return capital plus lifetime operating cost in USD."""
        capital = self.compute_capital_cost(num_nodes)
        operational = self.compute_operational_cost(annual_energy_kwh, num_nodes)
        return capital + operational
class WattlyticsOptimizer:
    """Brute-force search over hardware options and node counts.

    Every (hardware, node count) pair is evaluated for runtime, energy per
    run and total cost of ownership, filtered by the optional constraints
    ``max_nodes``, ``max_time``, ``max_energy`` and ``max_budget``, and
    ranked by an objective-dependent score (lower is better).
    """

    _DEFAULT_MAX_NODES = 100
    _HOURS_PER_YEAR = 8760
    # Objectives scored on a single raw metric; anything else is "balanced".
    _SINGLE_OBJECTIVES = ("performance", "energy", "cost")
    _METRICS = ('time', 'energy', 'tco')

    def __init__(self,
                 hardware_options: List["HardwareConfig"],
                 application: "ApplicationProfile",
                 problem_size: float,
                 constraints: Dict):
        """Store the search space, the workload and the constraint mapping."""
        self.hw_options = hardware_options
        self.app = application
        self.problem_size = problem_size
        self.constraints = constraints

    def optimize(self, objective: str = "balanced") -> List[Dict]:
        """Evaluate all feasible configurations and return them best-first.

        Args:
            objective: "performance", "energy", "cost", or anything else for
                the balanced weighted score.

        Returns:
            A list of dicts with keys ``hardware``, ``num_nodes``, ``time``
            (s), ``energy`` (per run), ``tco`` and ``score``, sorted by
            ascending score.
        """
        # Constraint lookups do not change during the sweep; read them once.
        max_time = self.constraints.get('max_time', float('inf'))
        max_energy = self.constraints.get('max_energy', float('inf'))
        max_budget = self.constraints.get('max_budget', float('inf'))

        results = []
        for hw in self.hw_options:
            # The models depend only on the hardware, not on the node count.
            perf_model = PerformanceModel(hw, self.app)
            tco_model = TCOModel(hw)
            node_power_kw = perf_model.compute_power() / 1000.0
            for num_nodes in self._node_counts():
                time = perf_model.compute_time(self.problem_size, num_nodes)
                energy = perf_model.compute_energy(self.problem_size, num_nodes)
                # Annual energy assumes the workload runs back-to-back all
                # year: average power (kW) times hours per year. Dividing a
                # per-run energy by a time in seconds and multiplying by
                # hours would mix units.
                if time > 0:
                    annual_energy_kwh = node_power_kw * num_nodes * self._HOURS_PER_YEAR
                else:
                    annual_energy_kwh = 0.0
                tco = tco_model.compute_tco(num_nodes, annual_energy_kwh)

                if time > max_time or energy > max_energy or tco > max_budget:
                    continue
                results.append({
                    'hardware': hw,
                    'num_nodes': num_nodes,
                    'time': time,
                    'energy': energy,
                    'tco': tco,
                })

        # Scores are assigned after the sweep because the balanced objective
        # is normalized against the best feasible value of each metric.
        self._assign_scores(results, objective)
        results.sort(key=lambda x: x['score'])
        return results

    def _node_counts(self) -> range:
        """Return the node counts to try: 1 up to and including ``max_nodes``."""
        max_nodes = self.constraints.get('max_nodes', self._DEFAULT_MAX_NODES)
        return range(1, max_nodes + 1)

    @staticmethod
    def _best_positive(values) -> float:
        """Return the smallest value if it is positive and finite, else 1.0."""
        best = min(values)
        # The chained comparison is False for NaN, zero, negatives and inf.
        return best if 0 < best < float('inf') else 1.0

    def _assign_scores(self, results: List[Dict], objective: str) -> None:
        """Set ``'score'`` on every result dict in place.

        Single-metric objectives use the raw metric. The balanced objective
        first divides each metric by its best feasible value, so seconds,
        kWh and dollars contribute on a comparable scale.
        """
        if objective in self._SINGLE_OBJECTIVES or not results:
            scale = {metric: 1.0 for metric in self._METRICS}
        else:
            scale = {
                metric: self._best_positive(r[metric] for r in results)
                for metric in self._METRICS
            }
        for r in results:
            r['score'] = self._compute_score(
                r['time'] / scale['time'],
                r['energy'] / scale['energy'],
                r['tco'] / scale['tco'],
                objective,
            )

    def _compute_score(self, time: float, energy: float, tco: float, objective: str) -> float:
        """Return the score (lower is better) for one configuration.

        For the balanced objective the caller is expected to pass metrics
        that are already normalized; the weights are 0.4 / 0.3 / 0.3. Any
        unrecognized objective is treated as balanced.
        """
        if objective == "performance":
            return time
        if objective == "energy":
            return energy
        if objective == "cost":
            return tco
        return time * 0.4 + energy * 0.3 + tco * 0.3

    @classmethod
    def _dominates(cls, a: Dict, b: Dict) -> bool:
        """Return True if ``a`` is no worse than ``b`` on every metric and better on one."""
        return (all(a[m] <= b[m] for m in cls._METRICS)
                and any(a[m] < b[m] for m in cls._METRICS))

    def get_pareto_frontier(self, results: List[Dict]) -> List[Dict]:
        """Return the results not dominated in (time, energy, tco).

        Input order is preserved; entries with identical metrics do not
        dominate each other and are all kept.
        """
        return [
            r for r in results
            if not any(self._dominates(other, r) for other in results)
        ]
# Example usage
def example_optimization():
    """Run a demo sweep over two node types and print up to five Pareto-optimal results."""
    # Larger node: 64 cores, 4 GPUs.
    big_node = HardwareConfig(
        cpu_cores=64,
        cpu_freq_ghz=2.5,
        gpu_count=4,
        gpu_memory_gb=80,
        memory_gb=512,
        interconnect_bw_gbps=200,
        power_idle_w=200,
        power_max_w=2000,
        cost_per_node_usd=50000,
    )
    # Smaller node: 32 cores, 2 GPUs.
    small_node = HardwareConfig(
        cpu_cores=32,
        cpu_freq_ghz=3.0,
        gpu_count=2,
        gpu_memory_gb=40,
        memory_gb=256,
        interconnect_bw_gbps=100,
        power_idle_w=150,
        power_max_w=1200,
        cost_per_node_usd=25000,
    )

    # Workload to be placed on the cluster.
    workload = ApplicationProfile(
        name="Deep Learning Training",
        compute_intensity=50.0,
        memory_footprint_gb=100,
        communication_pattern="collective",
        scaling_efficiency=0.85,
    )

    # Search up to 50 nodes, keeping only runs that finish within an hour.
    limits = {'max_nodes': 50, 'max_time': 3600}
    optimizer = WattlyticsOptimizer(
        [big_node, small_node], workload, problem_size=1e18, constraints=limits
    )
    ranked = optimizer.optimize(objective="balanced")
    frontier = optimizer.get_pareto_frontier(ranked)

    print("Pareto-optimal configurations:")
    for r in frontier[:5]:
        print(f" Nodes: {r['num_nodes']}, Time: {r['time']:.1f}s, "
              f"Energy: {r['energy']:.1f}kWh, TCO: ${r['tco']/1e6:.2f}M")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    example_optimization()