Structured logging with Pino/Winston, OpenTelemetry tracing, metrics collection, Grafana dashboards, and alerting rules.
Three pillars of observability: logs, traces, and metrics. Each answers different questions.
Pino is the fastest Node.js logger. Always emit JSON; never plain strings.
// logger.ts
import pino from 'pino'
export const logger = pino({
level: process.env.LOG_LEVEL ?? 'info',
formatters: {
level(label) {
return { level: label } // emit "level":"info" not numeric
}
},
base: {
service: process.env.SERVICE_NAME ?? 'api',
version: process.env.APP_VERSION ?? 'unknown',
env: process.env.NODE_ENV ?? 'development'
},
timestamp: pino.stdTimeFunctions.isoTime,
redact: {
paths: ['req.headers.authorization', 'body.password', '*.token'],
censor: '[REDACTED]'
}
})
// Usage examples
import { logger } from './logger'
// Child logger with request context
const reqLogger = logger.child({
requestId: crypto.randomUUID(),
userId: user.id,
path: req.path
})
reqLogger.info('Processing payment')
reqLogger.warn({ amount, currency }, 'Payment above threshold')
reqLogger.error({ err }, 'Payment failed')
# logging_config.py
import structlog
import logging
structlog.configure(
processors=[
structlog.contextvars.merge_contextvars,
structlog.processors.add_log_level,
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.StackInfoRenderer(),
structlog.processors.JSONRenderer(),
],
wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG),
context_class=dict,
logger_factory=structlog.PrintLoggerFactory(),
)
log = structlog.get_logger()
# Usage
log.info("request.received", path="/api/users", method="GET")
log.warning("rate_limit.approaching", user_id=user.id, count=95, limit=100)
log.error("payment.failed", exc_info=True, order_id=order.id, amount=99.99)
# Bind context for duration of request
structlog.contextvars.bind_contextvars(request_id=request_id, user_id=user_id)
log.info("order.created") # request_id and user_id included automatically
structlog.contextvars.clear_contextvars()
| Level | When to Use | Example |
|---|---|---|
trace | Detailed execution path (dev only) | Function entry/exit, loop iterations |
debug | Diagnostic info for debugging | SQL queries, cache hit/miss |
info | Normal operations | Request received, job started, user login |
warn | Unexpected but recoverable | Retry attempt, fallback used, slow query |
error | Errors requiring investigation | DB connection failed, 3rd party API error |
fatal | Process must exit | Config missing, port in use |
// Good log message guidelines
// ✅ Include who, what, why, and relevant IDs
logger.info({ userId, orderId, amount }, 'order.created')
// ❌ Vague message, no context
logger.info('Order done')
// ✅ Error includes the actual error object
logger.error({ err, orderId }, 'order.payment.failed')
// ❌ Error swallowed or only string
logger.error('Payment error: ' + err.message)
Trace a request across multiple services by propagating a unique ID.
// Express middleware: assign or forward correlation ID
import { randomUUID } from 'crypto'
import { AsyncLocalStorage } from 'async_hooks'
const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>()
export function correlationMiddleware(req: Request, res: Response, next: NextFunction) {
const requestId = (req.headers['x-request-id'] as string) ?? randomUUID()
res.setHeader('x-request-id', requestId)
requestContext.run({ requestId }, () => {
next()
})
}
// Get context anywhere in call stack (no prop drilling)
export function getRequestId(): string {
return requestContext.getStore()?.requestId ?? 'unknown'
}
// Logger auto-includes correlation ID
export function getLogger() {
return logger.child({ requestId: getRequestId() })
}
// tracing.ts - must be imported FIRST before other modules
import { NodeSDK } from '@opentelemetry/sdk-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'
import { ExpressInstrumentation } from '@opentelemetry/instrumentation-express'
import { PgInstrumentation } from '@opentelemetry/instrumentation-pg'
const sdk = new NodeSDK({
serviceName: process.env.SERVICE_NAME ?? 'api',
traceExporter: new OTLPTraceExporter({
url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? 'http://localhost:4318/v1/traces'
}),
instrumentations: [
new HttpInstrumentation(),
new ExpressInstrumentation(),
new PgInstrumentation()
]
})
sdk.start()
process.on('SIGTERM', () => sdk.shutdown())
// Manual spans for business logic
import { trace, SpanStatusCode, context } from '@opentelemetry/api'
const tracer = trace.getTracer('payment-service')
async function processPayment(orderId: string, amount: number) {
return tracer.startActiveSpan('payment.process', async (span) => {
span.setAttributes({
'order.id': orderId,
'payment.amount': amount,
'payment.currency': 'USD'
})
try {
const result = await chargeCard(amount)
span.setStatus({ code: SpanStatusCode.OK })
return result
} catch (error) {
span.recordException(error as Error)
span.setStatus({ code: SpanStatusCode.ERROR, message: (error as Error).message })
throw error
} finally {
span.end()
}
})
}
// metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client'
export const registry = new Registry()
// HTTP request counter
export const httpRequestTotal = new Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code'],
registers: [registry]
})
// Request duration histogram
export const httpRequestDuration = new Histogram({
name: 'http_request_duration_seconds',
help: 'HTTP request duration in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
registers: [registry]
})
// Active connections gauge
export const activeConnections = new Gauge({
name: 'active_connections',
help: 'Number of active WebSocket connections',
registers: [registry]
})
// Metrics middleware
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
const start = Date.now()
res.on('finish', () => {
const duration = (Date.now() - start) / 1000
const labels = {
method: req.method,
route: req.route?.path ?? req.path,
status_code: String(res.statusCode)
}
httpRequestTotal.inc(labels)
httpRequestDuration.observe(labels, duration)
})
next()
}
// Metrics endpoint (scrape target for Prometheus)
app.get('/metrics', async (req, res) => {
res.set('Content-Type', registry.contentType)
res.send(await registry.metrics())
})
// sentry.ts
import * as Sentry from '@sentry/node'
import { nodeProfilingIntegration } from '@sentry/profiling-node'
Sentry.init({
dsn: process.env.SENTRY_DSN,
environment: process.env.NODE_ENV,
release: process.env.APP_VERSION,
integrations: [nodeProfilingIntegration()],
tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
profilesSampleRate: 0.1,
beforeSend(event, hint) {
// Strip PII from errors
if (event.user) {
delete event.user.email
delete event.user.ip_address
}
return event
}
})
// Capture with context
try {
await processOrder(orderId)
} catch (error) {
Sentry.withScope((scope) => {
scope.setTag('order.id', orderId)
scope.setLevel('error')
Sentry.captureException(error)
})
throw error
}
// dashboard panel: Request Rate (PromQL)
{
"title": "Request Rate",
"type": "timeseries",
"targets": [{
"expr": "sum(rate(http_requests_total[5m])) by (route)",
"legendFormat": "{{route}}"
}]
}
# PromQL expressions for common panels
# Request rate (req/s over 5 min window)
sum(rate(http_requests_total[5m])) by (route, method)
# Error rate (%)
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
/ sum(rate(http_requests_total[5m])) * 100
# Latency percentiles
histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
# Apdex score (satisfied < 0.3s, tolerated < 1.2s)
(
sum(rate(http_request_duration_seconds_bucket{le="0.3"}[5m]))
+ sum(rate(http_request_duration_seconds_bucket{le="1.2"}[5m]))
) / 2 / sum(rate(http_request_duration_seconds_count[5m]))
# prometheus/alerts.yml