Observability patterns for Python backends. Use when adding logging, metrics, tracing, or debugging production issues.
Production issues are impossible to debug without observability. Logging, metrics, and tracing must be built in from the start. Silent failures, missing context in errors, and lack of metrics make incidents last longer.
Problem: Unstructured logs are hard to search and analyze.
# ❌ BAD: Unstructured logging — the fields are pre-rendered into an
# f-string, so a log backend cannot filter or aggregate by user_id etc.
import logging
logger = logging.getLogger(__name__)
logger.info(f"User {user_id} started assessment {assessment_id}")
logger.error(f"Failed to save answer: {error}")
# ✅ GOOD: Structured logging with structlog — a stable event name plus
# key=value fields that a log pipeline can index and query.
import structlog
logger = structlog.get_logger()
logger.info(
    "assessment_started",
    user_id=str(user_id),        # NOTE: example snippet — assumes these names are in scope
    assessment_id=str(assessment_id),
)
logger.error(
    "answer_save_failed",
    user_id=str(user_id),
    question_id=str(question_id),
    error=str(error),
    error_type=type(error).__name__,  # exception class name groups similar failures
)
# app/core/logging.py
import structlog
import logging
import sys
def setup_logging(json_logs: bool = True, log_level: str = "INFO") -> None:
    """Configure structlog for the whole process.

    Args:
        json_logs: Emit machine-readable JSON lines (production) when
            True; otherwise pretty, human-readable console output (dev).
        log_level: Standard logging level name, case-insensitive
            (e.g. "DEBUG", "INFO"). Unknown names fall back to INFO.
    """
    # Processors applied to every log call, regardless of output format.
    shared_processors = [
        structlog.contextvars.merge_contextvars,   # pull in request-bound context
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
    ]
    if json_logs:
        # JSON for production (machine-readable)
        processors = shared_processors + [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Pretty for development (human-readable). ConsoleRenderer formats
        # exceptions itself, so format_exc_info is deliberately omitted here.
        processors = shared_processors + [
            structlog.dev.ConsoleRenderer(),
        ]
    # Resolve the level name robustly. logging.getLevelName("BOGUS") returns
    # the *string* "Level BOGUS" (its name->number mapping is a documented
    # quirk), which would break make_filtering_bound_logger. Looking the name
    # up on the logging module and falling back to INFO is safe for any input.
    numeric_level = getattr(logging, log_level.upper(), logging.INFO)
    structlog.configure(
        processors=processors,
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
# Call once at application startup, before any logger is used:
# JSON output in production, pretty console output when DEBUG is on.
setup_logging(json_logs=not settings.DEBUG)
Problem: Logs from same request aren't correlated.
import structlog
from contextvars import ContextVar
from uuid import uuid4
from fastapi import Request
# Process-wide holder for the current request's correlation id; the default
# empty string means "no request in flight".
request_id_var: ContextVar[str] = ContextVar("request_id", default="")

# Middleware: stamp every request with a correlation id and bind it (plus
# path/method) to structlog's context so each log line in the request
# carries the same fields.
@app.middleware("http")
async def add_request_context(request: Request, call_next):
    """Attach a correlation id to all logs of a request and to its response.

    Reuses an incoming X-Request-ID header when present, so the id stays
    stable across service hops; otherwise generates a short random one.
    """
    # Prefer the caller-supplied id so logs correlate across services.
    request_id = request.headers.get("X-Request-ID") or str(uuid4())[:8]
    request_id_var.set(request_id)
    # Bind to all logs emitted while handling this request.
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        path=request.url.path,
        method=request.method,
    )
    try:
        response = await call_next(request)
        # Echo the id back so clients can quote it in bug reports.
        response.headers["X-Request-ID"] = request_id
        return response
    finally:
        # Unbind so the context cannot leak into an unrelated request
        # later served on the same worker.
        structlog.contextvars.unbind_contextvars(
            "request_id", "path", "method"
        )
# Every log call below automatically includes request_id, path and method —
# no need to pass them explicitly.
logger.info("processing_assessment")  # Includes request_id, path, method
logger = structlog.get_logger()
# DEBUG: Detailed diagnostic info (dev only)
logger.debug("query_executed", sql=str(query), params=params)
# INFO: Business events, successful operations
logger.info("assessment_submitted", user_id=user_id, score=score)
# WARNING: Unexpected but handled conditions
logger.warning(
    "rate_limit_approaching",
    user_id=user_id,
    current=current_count,
    limit=rate_limit,
)
# ERROR: Failures that need attention
logger.error(
    "payment_failed",
    user_id=user_id,
    error=str(error),
    payment_id=payment_id,
)
# CRITICAL: System-level failures
logger.critical(
    "database_connection_failed",
    error=str(error),
    host=db_host,
)
The same principle applies on the backend as on the frontend — every early return should log why it returned:
# ❌ BAD: Silent early return — nothing in the logs explains why the
# answer was never saved, so the incident is undebuggable after the fact.
async def save_answer(user_id: UUID, question_id: UUID, value: int):
    if not await is_valid_question(question_id):
        return None  # Why did we return? No one knows.
# ✅ GOOD: Observable early return — the skipped save leaves a searchable
# log event with enough context to reconstruct what happened.
async def save_answer(user_id: UUID, question_id: UUID, value: int):
    """Validate the question id before saving; log any skipped save."""
    if not await is_valid_question(question_id):
        logger.warning(
            "save_answer_skipped",
            reason="invalid_question",  # machine-readable reason for dashboards
            user_id=str(user_id),
            question_id=str(question_id),
        )
        return None
# ❌ BAD: Error without context