Implement a Databricks reference architecture with a best-practice project layout. Use when designing new Databricks projects, reviewing architecture, or establishing standards for Databricks applications. Trigger with phrases like "databricks architecture", "databricks best practices", "databricks project structure", "how to organize databricks", "databricks layout".
Production-ready lakehouse architecture with Unity Catalog, Delta Lake, and the medallion pattern. Covers workspace organization, three-level namespace governance, compute strategy, CI/CD with Asset Bundles, and project structure for team collaboration.
┌─────────────────────────────────────────────────────────────────┐
│                          UNITY CATALOG                          │
│                                                                 │
│  ┌────────────┐  ┌────────────┐  ┌────────────┐  ┌───────────┐  │
│  │   Bronze   │  │   Silver   │  │    Gold    │  │ ML Models │  │
│  │  Catalog   │─▶│  Catalog   │─▶│  Catalog   │  │ (MLflow)  │  │
│  │   (raw)    │  │  (clean)   │  │ (curated)  │  │           │  │
│  └────────────┘  └────────────┘  └────────────┘  └───────────┘  │
│        ▲                                               │        │
│  ┌────────────┐                             ┌────────────────┐  │
│  │ Auto Loader│                             │ Model Serving  │  │
│  │ Ingestion  │                             │   Endpoints    │  │
│  └────────────┘                             └────────────────┘  │
├─────────────────────────────────────────────────────────────────┤
│  Compute: Job Clusters │ SQL Warehouses │ Instance Pools        │
├─────────────────────────────────────────────────────────────────┤
│  Security: Row Filters │ Column Masks │ Secret Scopes │ SCIM    │
├─────────────────────────────────────────────────────────────────┤
│  CI/CD: Asset Bundles │ GitHub Actions │ dev/staging/prod       │
└─────────────────────────────────────────────────────────────────┘
databricks-platform/
├── src/
│ ├── ingestion/
│ │ ├── bronze_raw_events.py # Auto Loader streaming
│ │ ├── bronze_api_data.py # REST API batch ingestion
│ │ └── bronze_file_uploads.py # Manual file uploads
│ ├── transformation/
│ │ ├── silver_clean_events.py # Cleansing + dedup
│ │ ├── silver_schema_enforce.py # Schema validation
│ │ └── silver_scd2.py # Slowly changing dimensions
│ ├── aggregation/
│ │ ├── gold_daily_metrics.py # Business KPIs
│ │ ├── gold_user_features.py # ML feature engineering
│ │ └── gold_reporting.py # BI-ready views
│ └── ml/
│ ├── training/
│ │ └── train_churn_model.py
│ └── inference/
│ └── batch_scoring.py
├── tests/
│ ├── conftest.py # Spark fixtures
│ ├── unit/ # Local Spark tests
│ └── integration/ # Databricks Connect tests
├── resources/
│ ├── etl_jobs.yml # ETL job definitions
│ ├── ml_jobs.yml # ML pipeline definitions
│ └── maintenance.yml # OPTIMIZE/VACUUM schedules
├── databricks.yml # Asset Bundle root config
├── pyproject.toml
└── requirements.txt
-- Unity Catalog bootstrap: environment catalogs, medallion schemas in the
-- production catalog, and group-level grants for engineers and analysts.

-- One catalog per environment (or shared with schema isolation)
CREATE CATALOG IF NOT EXISTS dev_catalog;
CREATE CATALOG IF NOT EXISTS prod_catalog;

-- Medallion schemas per catalog (only prod_catalog is populated here)
CREATE SCHEMA IF NOT EXISTS prod_catalog.bronze;
CREATE SCHEMA IF NOT EXISTS prod_catalog.silver;
CREATE SCHEMA IF NOT EXISTS prod_catalog.gold;
CREATE SCHEMA IF NOT EXISTS prod_catalog.ml_features;
CREATE SCHEMA IF NOT EXISTS prod_catalog.ml_models;

-- Permissions: engineers write bronze/silver and read gold; analysts read gold.
--
-- Unity Catalog uses USE CATALOG / USE SCHEMA / CREATE TABLE; USAGE and CREATE
-- are the legacy Hive-metastore privilege names. A principal needs USE CATALOG
-- on the catalog AND USE SCHEMA on the schema before SELECT, MODIFY, or
-- CREATE TABLE on that schema can be exercised, so USE SCHEMA is granted
-- alongside every schema-level data privilege below.
-- NOTE(review): no group is granted write access to gold, ml_features, or
-- ml_models here -- presumably a job service principal owns those; confirm.
GRANT USE CATALOG ON CATALOG prod_catalog TO `data-engineers`;
GRANT USE SCHEMA, CREATE TABLE, MODIFY, SELECT ON SCHEMA prod_catalog.bronze TO `data-engineers`;
GRANT USE SCHEMA, CREATE TABLE, MODIFY, SELECT ON SCHEMA prod_catalog.silver TO `data-engineers`;
GRANT USE SCHEMA, SELECT ON SCHEMA prod_catalog.gold TO `data-engineers`;

GRANT USE CATALOG ON CATALOG prod_catalog TO `data-analysts`;
GRANT USE SCHEMA, SELECT ON SCHEMA prod_catalog.gold TO `data-analysts`;
# databricks.yml