Wattlytics: HPC Cluster Optimization

Overview

The escalating computational demands and energy footprint of GPU-accelerated computing systems complicate informed design and operational decisions. Wattlytics is an interactive, browser-based decision-support system that co-optimizes performance, energy consumption, and total cost of ownership (TCO) for HPC clusters.

Core Concepts

Multi-Objective Optimization

Performance: Minimize time-to-solution
Energy: Minimize energy consumption (energy-to-solution)
TCO: Minimize total cost of ownership
Trade-offs: Balance competing objectives

Performance Modeling

Application Characteristics: Compute intensity, memory patterns
Hardware Characteristics: CPU, GPU, memory, interconnect

from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

import numpy as np

# Unit-conversion constants used by the energy and TCO arithmetic.
_JOULES_PER_KWH = 3.6e6  # 1 kWh = 1000 W * 3600 s
_SECONDS_PER_YEAR = 8760 * 3600  # 8760 hours per year, expressed in seconds

# Objectives that rank on one raw metric; anything else is scored as "balanced".
_SINGLE_METRIC_OBJECTIVES = ("performance", "energy", "cost")


@dataclass
class HardwareConfig:
    """Per-node hardware description consumed by the performance and TCO models."""

    cpu_cores: int
    cpu_freq_ghz: float
    gpu_count: int
    gpu_memory_gb: float
    memory_gb: float
    interconnect_bw_gbps: float
    power_idle_w: float
    power_max_w: float
    cost_per_node_usd: float


@dataclass
class ApplicationProfile:
    """Workload description consumed by the performance model."""

    name: str
    compute_intensity: float  # FLOPs per byte
    memory_footprint_gb: float
    communication_pattern: str  # 'point-to-point', 'collective', 'none'
    scaling_efficiency: float  # Parallel efficiency


class PerformanceModel:
    """Estimate throughput, runtime, power and energy for one hardware/app pair."""

    def __init__(self, hardware: HardwareConfig, app: ApplicationProfile):
        self.hw = hardware
        self.app = app

    def compute_performance(self, num_nodes: int) -> float:
        """Return the estimated sustained performance in TFLOP/s on ``num_nodes``.

        The result is the smaller of the compute bound and the memory bound,
        multiplied by ``scaling_efficiency ** log2(num_nodes)`` when more than
        one node is used.
        """
        # cores * GHz * 16 FLOPs/cycle (AVX-512) is GFLOP/s; convert to TFLOP/s
        # so it can be added to the GPU term, which is already in TFLOP/s.
        cpu_tflops = self.hw.cpu_cores * self.hw.cpu_freq_ghz * 16 / 1e3
        gpu_tflops = self.hw.gpu_count * 20  # Approximate GPU TFLOP/s
        compute_bound = (cpu_tflops + gpu_tflops) * num_nodes

        # Memory bandwidth limit (rough heuristic derived from memory capacity).
        # NOTE(review): this bound is per-node and is not multiplied by
        # num_nodes, and the 1e3 factor's unit intent is unclear -- confirm
        # against the intended roofline model before relying on it.
        mem_bw_tbps = self.hw.memory_gb * 0.1  # Approximate
        memory_bound = mem_bw_tbps * 1e3 * self.app.compute_intensity

        perf = min(compute_bound, memory_bound)

        # Each doubling of the node count costs one factor of scaling_efficiency.
        if num_nodes > 1:
            perf *= self.app.scaling_efficiency ** np.log2(num_nodes)
        return perf

    def compute_time(self, problem_size: float, num_nodes: int) -> float:
        """Return time-to-solution in seconds for ``problem_size`` FLOPs.

        Returns ``inf`` when the modelled performance is not positive, instead
        of raising ``ZeroDivisionError``.
        """
        perf = self.compute_performance(num_nodes)
        if perf <= 0:
            return float('inf')
        return problem_size / (perf * 1e12)  # seconds

    def compute_power(self, utilization: float = 0.8) -> float:
        """Return per-node power in watts, linear between idle and max power."""
        idle = self.hw.power_idle_w
        max_p = self.hw.power_max_w
        return idle + utilization * (max_p - idle)

    def compute_energy(self, problem_size: float, num_nodes: int) -> float:
        """Return the energy in kWh consumed by all nodes for one run."""
        time = self.compute_time(problem_size, num_nodes)
        power = self.compute_power()
        # W * s = J; divide by 3.6e6 J/kWh (dividing by 3600 alone gives Wh).
        return power * time * num_nodes / _JOULES_PER_KWH


class TCOModel:
    """Total cost of ownership: capital plus energy and maintenance costs."""

    def __init__(self, hardware: HardwareConfig,
                 electricity_cost_per_kwh: float = 0.12,
                 lifetime_years: float = 4):
        self.hw = hardware
        self.electricity_cost = electricity_cost_per_kwh
        self.lifetime = lifetime_years

    def compute_capital_cost(self, num_nodes: int) -> float:
        """Return the purchase cost in USD of ``num_nodes`` nodes."""
        return self.hw.cost_per_node_usd * num_nodes

    def compute_operational_cost(self, annual_energy_kwh: float,
                                 num_nodes: int = 1) -> float:
        """Return lifetime energy plus maintenance cost in USD.

        ``num_nodes`` defaults to 1 so existing single-argument callers get the
        same result as before; ``compute_tco`` passes the real node count so
        maintenance scales with the cluster size.
        """
        energy_cost = annual_energy_kwh * self.electricity_cost * self.lifetime
        # Maintenance: 10% of capital per year, over the system lifetime.
        maintenance = self.compute_capital_cost(num_nodes) * 0.1 * self.lifetime
        return energy_cost + maintenance

    def compute_tco(self, num_nodes: int, annual_energy_kwh: float) -> float:
        """Return capital plus operational cost in USD over the lifetime."""
        capital = self.compute_capital_cost(num_nodes)
        operational = self.compute_operational_cost(annual_energy_kwh, num_nodes)
        return capital + operational


class WattlyticsOptimizer:
    """Enumerate (hardware, node count) pairs and rank the feasible ones."""

    def __init__(self, hardware_options: List[HardwareConfig],
                 application: ApplicationProfile,
                 problem_size: float,
                 constraints: Dict):
        self.hw_options = hardware_options
        self.app = application
        self.problem_size = problem_size
        self.constraints = constraints

    def optimize(self, objective: str = "balanced") -> List[Dict]:
        """Return feasible configurations sorted by ascending score.

        Node counts from 1 to ``constraints['max_nodes']`` inclusive (default
        100) are evaluated for every hardware option. Candidates violating
        ``max_time``, ``max_energy`` or ``max_budget`` are dropped, as are
        candidates whose modelled runtime is not finite. Each result dict has
        the keys ``hardware``, ``num_nodes``, ``time``, ``energy``, ``tco``
        and ``score``.
        """
        unbounded = float('inf')
        max_nodes = self.constraints.get('max_nodes', 100)
        max_time = self.constraints.get('max_time', unbounded)
        max_energy = self.constraints.get('max_energy', unbounded)
        max_budget = self.constraints.get('max_budget', unbounded)

        results = []
        for hw in self.hw_options:
            # The models depend only on the hardware, not on the node count.
            perf_model = PerformanceModel(hw, self.app)
            tco_model = TCOModel(hw)
            for num_nodes in range(1, max_nodes + 1):
                time = perf_model.compute_time(self.problem_size, num_nodes)
                if not np.isfinite(time):
                    continue
                energy = perf_model.compute_energy(self.problem_size, num_nodes)
                # Energy per run times the number of runs that fit in one year
                # (time is in seconds, so the year must be in seconds too).
                annual_energy = (
                    energy * _SECONDS_PER_YEAR / time if time > 0 else 0
                )
                tco = tco_model.compute_tco(num_nodes, annual_energy)

                # Check constraints
                if time > max_time or energy > max_energy or tco > max_budget:
                    continue

                results.append({
                    'hardware': hw,
                    'num_nodes': num_nodes,
                    'time': time,
                    'energy': energy,
                    'tco': tco,
                    'score': 0.0,  # filled in by _assign_scores below
                })

        self._assign_scores(results, objective)
        results.sort(key=lambda x: x['score'])
        return results

    def _assign_scores(self, results: List[Dict], objective: str) -> None:
        """Fill in ``score`` for every result, normalizing for balanced mode."""
        if objective in _SINGLE_METRIC_OBJECTIVES or not results:
            scale_time = scale_energy = scale_tco = 1.0
        else:
            # Seconds, kWh and USD differ by orders of magnitude; divide each
            # metric by its maximum over the feasible set so the weights in
            # _compute_score are meaningful. Fall back to 1.0 for an all-zero
            # metric to avoid dividing by zero.
            scale_time = max(r['time'] for r in results) or 1.0
            scale_energy = max(r['energy'] for r in results) or 1.0
            scale_tco = max(r['tco'] for r in results) or 1.0

        for r in results:
            r['score'] = self._compute_score(
                r['time'] / scale_time,
                r['energy'] / scale_energy,
                r['tco'] / scale_tco,
                objective,
            )

    def _compute_score(self, time: float, energy: float, tco: float,
                       objective: str) -> float:
        """Return the scalar score (lower is better) for one candidate.

        For any objective other than "performance", "energy" or "cost" the
        score is a 0.4/0.3/0.3 weighted sum of the three arguments as given;
        ``optimize`` passes normalized values in that case.
        """
        if objective == "performance":
            return time
        elif objective == "energy":
            return energy
        elif objective == "cost":
            return tco
        else:  # balanced
            return time * 0.4 + energy * 0.3 + tco * 0.3

    def get_pareto_frontier(self, results: List[Dict]) -> List[Dict]:
        """Return the results not dominated on (time, energy, tco).

        A result is dominated when another result is no worse on all three
        metrics and strictly better on at least one. Input order is preserved.
        """
        metrics = ('time', 'energy', 'tco')

        def dominates(a: Dict, b: Dict) -> bool:
            return (all(a[m] <= b[m] for m in metrics)
                    and any(a[m] < b[m] for m in metrics))

        return [
            r for r in results
            if not any(dominates(other, r) for other in results)
        ]


# Example usage
def example_optimization():
    """Run the optimizer on two sample node types and print Pareto results."""
    # Hardware configurations
    configs = [
        HardwareConfig(
            cpu_cores=64, cpu_freq_ghz=2.5, gpu_count=4, gpu_memory_gb=80,
            memory_gb=512, interconnect_bw_gbps=200,
            power_idle_w=200, power_max_w=2000, cost_per_node_usd=50000
        ),
        HardwareConfig(
            cpu_cores=32, cpu_freq_ghz=3.0, gpu_count=2, gpu_memory_gb=40,
            memory_gb=256, interconnect_bw_gbps=100,
            power_idle_w=150, power_max_w=1200, cost_per_node_usd=25000
        )
    ]

    # Application profile
    app = ApplicationProfile(
        name="Deep Learning Training",
        compute_intensity=50.0,
        memory_footprint_gb=100,
        communication_pattern="collective",
        scaling_efficiency=0.85
    )

    # Optimize
    optimizer = WattlyticsOptimizer(
        configs, app,
        problem_size=1e18,
        constraints={'max_nodes': 50, 'max_time': 3600}
    )
    results = optimizer.optimize(objective="balanced")
    pareto = optimizer.get_pareto_frontier(results)

    print("Pareto-optimal configurations:")
    for r in pareto[:5]:
        print(f" Nodes: {r['num_nodes']}, Time: {r['time']:.1f}s, "
              f"Energy: {r['energy']:.1f}kWh, TCO: ${r['tco']/1e6:.2f}M")


if __name__ == "__main__":
    example_optimization()

Wattlytics HPC Optimization

Wattlytics: HPC Cluster Optimization

Overview

Core Concepts

Multi-Objective Optimization

Performance Modeling

Wattlytics HPC Optimization

Wattlytics: HPC Cluster Optimization

Overview

Core Concepts

Multi-Objective Optimization

Performance Modeling

Decision Support

Implementation

Key Insights

Applications

References

Clickhouse Io

Clickhouse Io

Claude Devfleet

Clickhouse Io

Ai First Engineering

Postgres Patterns