Name: Python Data Analysis
Author: moshesham

Python Data Analysis | Skills Pool

import pandas as pd
import numpy as np

# Load and inspect
df = pd.read_csv("events.csv", parse_dates=["event_time"])

# Quick overview
df.shape              # (rows, columns)
df.dtypes             # Column types
df.describe()         # Summary statistics
df.isna().sum()       # Missing values per column
df.nunique()          # Unique values per column
df["event_name"].value_counts(normalize=True).head(10)  # Top events by frequency

# Boolean indexing
active_users = df[
    (df["event_name"] == "app_open") &
    (df["event_time"] >= "2024-01-01") &
    (df["platform"].isin(["ios", "android"]))
]

# Query syntax (cleaner for complex filters)
active_users = df.query(
    "event_name == 'app_open' and "
    "event_time >= '2024-01-01' and "
    "platform in ['ios', 'android']"
)

# Basic aggregation
daily_metrics = (
    df.groupby(df["event_time"].dt.date)
    .agg(
        dau=("user_id", "nunique"),
        total_events=("event_id", "count"),
        total_revenue=("revenue", "sum"),
    )
    .reset_index()
)

# Multiple groupby with named aggregation
segment_metrics = (
    df.groupby(["platform", "country"])
    .agg(
        users=("user_id", "nunique"),
        avg_sessions=("session_id", "nunique"),
        median_revenue=("revenue", "median"),
    )
    .reset_index()
    .sort_values("users", ascending=False)
)

# Transform: add group-level stats back to each row
df["user_event_count"] = df.groupby("user_id")["event_id"].transform("count")
df["pct_of_user_events"] = df.groupby("user_id")["event_id"].transform(
    lambda x: 1 / len(x)
)

# Left join (keep all users, add attributes)
enriched = events_df.merge(users_df, on="user_id", how="left")

# Multiple key join
combined = orders_df.merge(
    products_df,
    left_on=["product_id", "variant_id"],
    right_on=["id", "variant"],
    how="inner"
)

# Time-based join: find each user's first event
first_events = (
    events_df.sort_values("event_time")
    .groupby("user_id")
    .first()
    .reset_index()
)
users_with_first = users_df.merge(
    first_events[["user_id", "event_time", "event_name"]],
    on="user_id",
    how="left"
)

# Pivot: rows to columns (cohort retention table)
retention_pivot = retention_df.pivot(
    index="cohort_month",
    columns="period",
    values="retention_pct"
)

# Melt: columns to rows (wide to long)
long_df = pd.melt(
    wide_df,
    id_vars=["user_id", "date"],
    value_vars=["metric_a", "metric_b", "metric_c"],
    var_name="metric_name",
    value_name="metric_value"
)

# Cross-tabulation
pd.crosstab(
    df["platform"],
    df["subscription_tier"],
    values=df["revenue"],
    aggfunc="mean",
    margins=True,
)

# Resample to different frequency
daily = df.set_index("event_time").resample("D")["user_id"].nunique()
weekly = daily.resample("W").mean()

# Rolling windows
df["revenue_7d_avg"] = df["daily_revenue"].rolling(window=7).mean()
df["revenue_7d_std"] = df["daily_revenue"].rolling(window=7).std()

# Lag features
df["revenue_prev_day"] = df["daily_revenue"].shift(1)
df["revenue_wow_change"] = df["daily_revenue"] / df["daily_revenue"].shift(7) - 1

# Date parts for seasonality analysis
df["day_of_week"] = df["event_time"].dt.day_name()
df["hour"] = df["event_time"].dt.hour
df["is_weekend"] = df["event_time"].dt.dayofweek >= 5

import matplotlib.pyplot as plt
import seaborn as sns

def plot_metric_distribution(
    data: pd.Series,
    title: str,
    log_scale: bool = False,
    percentiles: list[float] = [0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
) -> None:
    """Plot distribution with key percentile markers."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    plot_data = np.log1p(data) if log_scale else data
    xlabel = f"log(1 + {title})" if log_scale else title
    
    # Histogram
    axes[0].hist(plot_data.dropna(), bins=50, edgecolor="white", alpha=0.7)
    for p in percentiles:
        val = plot_data.quantile(p)
        axes[0].axvline(val, color="red", linestyle="--", alpha=0.5)
        axes[0].text(val, axes[0].get_ylim()[1] * 0.9, f"P{int(p*100)}", fontsize=8, rotation=90)
    axes[0].set_xlabel(xlabel)
    axes[0].set_title(f"Distribution of {title}")
    
    # Box plot
    axes[1].boxplot(plot_data.dropna(), vert=False)
    axes[1].set_xlabel(xlabel)
    axes[1].set_title(f"Box Plot of {title}")
    
    plt.tight_layout()
    plt.show()
    
    # Print summary statistics
    print(f"\n{title} Summary:")
    for p in percentiles:
        print(f"  P{int(p*100):>2}: {data.quantile(p):,.2f}")
    print(f"  Mean: {data.mean():,.2f}, Std: {data.std():,.2f}")

def plot_metric_over_time(
    df: pd.DataFrame,
    date_col: str,
    value_col: str,
    title: str,
    rolling_window: int = 7,
) -> None:
    """Plot a metric over time with rolling average and annotation."""
    fig, ax = plt.subplots(figsize=(14, 6))
    
    ax.plot(df[date_col], df[value_col], alpha=0.3, label="Daily")
    ax.plot(
        df[date_col],
        df[value_col].rolling(rolling_window).mean(),
        linewidth=2,
        label=f"{rolling_window}-day avg",
    )
    
    ax.set_xlabel("Date")
    ax.set_ylabel(value_col)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_segment_comparison(
    df: pd.DataFrame,
    segment_col: str,
    value_col: str,
    title: str,
) -> None:
    """Compare a metric across segments with box plots and means."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Box plot
    order = df.groupby(segment_col)[value_col].median().sort_values(ascending=False).index
    sns.boxplot(data=df, x=segment_col, y=value_col, order=order, ax=axes[0])
    axes[0].set_title(f"{title} — Distribution by {segment_col}")
    axes[0].tick_params(axis="x", rotation=45)
    
    # Bar chart of means with error bars
    summary = df.groupby(segment_col)[value_col].agg(["mean", "std", "count"]).loc[order]
    summary["se"] = summary["std"] / np.sqrt(summary["count"])
    axes[1].bar(range(len(summary)), summary["mean"], yerr=1.96 * summary["se"],
                capsize=4, alpha=0.7)
    axes[1].set_xticks(range(len(summary)))
    axes[1].set_xticklabels(summary.index, rotation=45)
    axes[1].set_title(f"{title} — Mean by {segment_col} (95% CI)")
    
    plt.tight_layout()
    plt.show()

Anti-Pattern	Problem	Better Approach
Iterating rows with `for i, row in df.iterrows()`	100-1000x slower than vectorized	Use vectorized operations or `.apply()`
Appending to DataFrame in a loop	Quadratic memory allocation	Build a list of dicts, then `pd.DataFrame(list)`
`df[df['col'] == val]` inside a loop	Repeated full-table scan	Use `.groupby()` or set index
Not using categorical dtype	High memory for repeated strings	`df['col'] = df['col'].astype('category')`
Chained indexing `df['a']['b']`	Unpredictable SettingWithCopy	Use `.loc[row, col]`
Loading full dataset when sampling works	Slow iteration, OOM	`df.sample(frac=0.1)` for exploration

Python Data Analysis

Python for Data Analysis

Overview

When to Use This Skill

Python Data Analysis

Python for Data Analysis

Overview

When to Use This Skill

Pandas Essentials

Data Loading and Inspection

Filtering and Selection

GroupBy Patterns

Merge and Join

Reshape: Pivot and Melt

Time Series Operations

Visualization Patterns

Distribution Analysis

Time Series Chart

Segment Comparison

Common Anti-Patterns

Best Practices

Additional Resources

Visualization Expert

Data Analyst

Huggingface Hub

Multi Reviewer Patterns

Dbt Transformation Patterns

Startup Financial Modeling