Observability patterns for Python backends. Use when adding logging, metrics, tracing, or debugging production issues.
Production issues are impossible to debug without observability. Logging, metrics, and tracing must be built in from the start. Silent failures, missing context in errors, and lack of metrics make incidents last longer.
Problem: Unstructured logs are hard to search and analyze.
# ❌ BAD: Unstructured logging — the fields are pre-rendered into an
# f-string, so a log backend cannot filter or aggregate by user_id etc.
import logging
logger = logging.getLogger(__name__)
logger.info(f"User {user_id} started assessment {assessment_id}")
logger.error(f"Failed to save answer: {error}")
# ✅ GOOD: Structured logging with structlog — a stable event name plus
# key=value fields that a log pipeline can index and query.
import structlog
logger = structlog.get_logger()
logger.info(
    "assessment_started",
    user_id=str(user_id),        # NOTE: example snippet — assumes these names are in scope
    assessment_id=str(assessment_id),
)
logger.error(
    "answer_save_failed",
    user_id=str(user_id),
    question_id=str(question_id),
    error=str(error),
    error_type=type(error).__name__,  # exception class name groups similar failures
)
# app/core/logging.py
import structlog
import logging
import sys
def setup_logging(json_logs: bool = True, log_level: str = "INFO") -> None:
    """Configure structlog for the whole process.

    Args:
        json_logs: Emit machine-readable JSON lines (production) when
            True; otherwise pretty, human-readable console output (dev).
        log_level: Standard logging level name, case-insensitive
            (e.g. "DEBUG", "INFO"). Unknown names fall back to INFO.
    """
    # Processors applied to every log call, regardless of output format.
    shared_processors = [
        structlog.contextvars.merge_contextvars,   # pull in request-bound context
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
    ]
    if json_logs:
        # JSON for production (machine-readable)
        processors = shared_processors + [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Pretty for development (human-readable). ConsoleRenderer formats
        # exceptions itself, so format_exc_info is deliberately omitted here.
        processors = shared_processors + [
            structlog.dev.ConsoleRenderer(),
        ]
    # Resolve the level name robustly. logging.getLevelName("BOGUS") returns
    # the *string* "Level BOGUS" (its name->number mapping is a documented
    # quirk), which would break make_filtering_bound_logger. Looking the name
    # up on the logging module and falling back to INFO is safe for any input.
    numeric_level = getattr(logging, log_level.upper(), logging.INFO)
    structlog.configure(
        processors=processors,
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
# Call once at application startup, before any logger is used:
# JSON output in production, pretty console output when DEBUG is on.
setup_logging(json_logs=not settings.DEBUG)
Problem: Logs from same request aren't correlated.
import structlog
from contextvars import ContextVar
from uuid import uuid4
from fastapi import Request
# Process-wide holder for the current request's correlation id; the default
# empty string means "no request in flight".
request_id_var: ContextVar[str] = ContextVar("request_id", default="")

# Middleware: stamp every request with a correlation id and bind it (plus
# path/method) to structlog's context so each log line in the request
# carries the same fields.
@app.middleware("http")
async def add_request_context(request: Request, call_next):
    """Attach a correlation id to all logs of a request and to its response.

    Reuses an incoming X-Request-ID header when present, so the id stays
    stable across service hops; otherwise generates a short random one.
    """
    # Prefer the caller-supplied id so logs correlate across services.
    request_id = request.headers.get("X-Request-ID") or str(uuid4())[:8]
    request_id_var.set(request_id)
    # Bind to all logs emitted while handling this request.
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        path=request.url.path,
        method=request.method,
    )
    try:
        response = await call_next(request)
        # Echo the id back so clients can quote it in bug reports.
        response.headers["X-Request-ID"] = request_id
        return response
    finally:
        # Unbind so the context cannot leak into an unrelated request
        # later served on the same worker.
        structlog.contextvars.unbind_contextvars(
            "request_id", "path", "method"
        )
# Every log call below automatically includes request_id, path and method —
# no need to pass them explicitly.
logger.info("processing_assessment")  # Includes request_id, path, method
logger = structlog.get_logger()
# DEBUG: Detailed diagnostic info (dev only)
logger.debug("query_executed", sql=str(query), params=params)
# INFO: Business events, successful operations
logger.info("assessment_submitted", user_id=user_id, score=score)
# WARNING: Unexpected but handled conditions
logger.warning(
    "rate_limit_approaching",
    user_id=user_id,
    current=current_count,
    limit=rate_limit,
)
# ERROR: Failures that need attention
logger.error(
    "payment_failed",
    user_id=user_id,
    error=str(error),
    payment_id=payment_id,
)
# CRITICAL: System-level failures
logger.critical(
    "database_connection_failed",
    error=str(error),
    host=db_host,
)
The same principle applies on the backend as on the frontend — every early return should log why it returned:
# ❌ BAD: Silent early return — nothing in the logs explains why the
# answer was never saved, so the incident is undebuggable after the fact.
async def save_answer(user_id: UUID, question_id: UUID, value: int):
    if not await is_valid_question(question_id):
        return None  # Why did we return? No one knows.
# ✅ GOOD: Observable early return — the skipped save leaves a searchable
# log event with enough context to reconstruct what happened.
async def save_answer(user_id: UUID, question_id: UUID, value: int):
    """Validate the question id before saving; log any skipped save."""
    if not await is_valid_question(question_id):
        logger.warning(
            "save_answer_skipped",
            reason="invalid_question",  # machine-readable reason for dashboards
            user_id=str(user_id),
            question_id=str(question_id),
        )
        return None
# ❌ BAD: Error without context