Set up and configure DuckDB databases. Use when the user needs to create databases, configure settings, or initialize DuckDB projects.
This skill helps set up and configure DuckDB databases for analytics workloads. It handles database creation, extension management, configuration tuning, and project initialization with best practices.
Activate this skill when the user:

- Needs to create or initialize a DuckDB database
- Wants to configure DuckDB settings or install extensions
- Is setting up a DuckDB project for analytics

Ask or infer:

- Persistent database or in-memory (`:memory:`)?
- File name: use the `.duckdb` extension (recommended) or `.db`
- Shared access: append `?access_mode=read_only` so multiple processes can read the same file

**Python (recommended):**
```python
import duckdb

# Persistent database
con = duckdb.connect('my_database.duckdb')

# In-memory
con = duckdb.connect(':memory:')

# With configuration
con = duckdb.connect('my_database.duckdb', config={
    'threads': 4,
    'memory_limit': '4GB'
})
```
**CLI:**

```sh
duckdb my_database.duckdb
```
Common configuration options:
```sql
-- Memory and performance
SET memory_limit = '4GB';
SET threads = 4;
SET temp_directory = '/tmp/duckdb';

-- File handling
SET enable_object_cache = true;
SET preserve_insertion_order = true;

-- Parser limits
SET max_expression_depth = 1000;
```
DuckDB has powerful extensions. Install as needed:
```sql
-- Install and load extensions
INSTALL httpfs;   -- Read from S3, HTTP, etc.
LOAD httpfs;

INSTALL parquet;  -- Parquet support (built-in, but loading can be explicit)
LOAD parquet;

INSTALL json;     -- JSON support
LOAD json;

INSTALL spatial;  -- Geospatial functions
LOAD spatial;

INSTALL excel;    -- Excel file support
LOAD excel;
```
**Local files:**
```sql
-- CSV
CREATE TABLE my_table AS SELECT * FROM read_csv('data.csv');

-- Parquet
CREATE TABLE my_table AS SELECT * FROM read_parquet('data.parquet');

-- JSON
CREATE TABLE my_table AS SELECT * FROM read_json('data.json');
```
**Remote sources** (requires the `httpfs` extension):
```sql
SET s3_region = 'us-east-1';
SET s3_access_key_id = 'your_key';
SET s3_secret_access_key = 'your_secret';

CREATE TABLE my_table AS
SELECT * FROM read_parquet('s3://bucket/path/file.parquet');
```
**CRITICAL:** DuckDB databases can contain sensitive data. Always ensure they are excluded from version control.
When setting up DuckDB in a git repository, always add these entries to .gitignore:
```gitignore
# DuckDB
*.duckdb
*.duckdb.wal
db/
```
Why this matters:
- `.duckdb` files contain all your data, potentially including PII, credentials, or proprietary information
- `.duckdb.wal` (write-ahead log) files contain recent transactions and can expose sensitive data
- `db/` is a common convention for the database storage directory

Before creating any database, check whether a `.gitignore` exists and update it:
```python
from pathlib import Path

def ensure_gitignore_excludes_duckdb(repo_root: Path = None):
    """Ensure .gitignore excludes DuckDB files."""
    if repo_root is None:
        repo_root = Path.cwd()
    gitignore_path = repo_root / ".gitignore"
    duckdb_entries = ["*.duckdb", "*.duckdb.wal", "db/"]

    existing_content = ""
    if gitignore_path.exists():
        existing_content = gitignore_path.read_text()

    # Compare whole lines, not substrings ("*.duckdb" is a substring of "*.duckdb.wal")
    existing_lines = set(existing_content.splitlines())
    missing = [entry for entry in duckdb_entries if entry not in existing_lines]

    if missing:
        with open(gitignore_path, "a") as f:
            if existing_content and not existing_content.endswith("\n"):
                f.write("\n")
            # Append only the entries that are actually missing
            f.write("# DuckDB\n" + "\n".join(missing) + "\n")
        print("Updated .gitignore with DuckDB exclusions")
```
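The append-only-missing behavior is easy to sanity-check in a throwaway directory. This self-contained sketch (stdlib only) reproduces the core logic inline on a `.gitignore` that already lists `*.duckdb`:

```python
import tempfile
from pathlib import Path

# Throwaway repo with a partial .gitignore (only *.duckdb present)
repo = Path(tempfile.mkdtemp())
gitignore = repo / ".gitignore"
gitignore.write_text("*.duckdb\n")

# Append only the DuckDB entries that are missing, comparing whole lines
entries = ["*.duckdb", "*.duckdb.wal", "db/"]
existing_lines = set(gitignore.read_text().splitlines())
missing = [e for e in entries if e not in existing_lines]
if missing:
    with gitignore.open("a") as f:
        f.write("# DuckDB\n" + "\n".join(missing) + "\n")

# *.duckdb is not duplicated; only the missing entries were appended
print(gitignore.read_text())
```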
For projects using DuckDB:
```text
project/
├── .gitignore           # Must exclude *.duckdb, *.duckdb.wal, db/
├── data/
│   ├── raw/             # Source data files
│   └── processed/       # Transformed data
├── db/
│   └── analytics.duckdb
├── sql/
│   ├── schema.sql       # Table definitions
│   ├── views.sql        # View definitions
│   └── queries/         # Saved queries
└── scripts/
    └── init_db.py       # Database initialization
```
| Setting | Default | Description |
|---|---|---|
| `memory_limit` | 80% of RAM | Maximum memory usage |
| `threads` | # of cores | Parallel execution threads |
| `temp_directory` | system temp | Location for spill files |
| `default_order` | ASC | Default sort order |
| `enable_progress_bar` | false | Show query progress |
| `enable_object_cache` | false | Cache metadata |
| Extension | Purpose |
|---|---|
| `httpfs` | HTTP/S3/GCS file access |
| `parquet` | Parquet file support |
| `json` | JSON file support |
| `spatial` | Geospatial functions |
| `excel` | Excel file support |
| `sqlite` | SQLite compatibility |
| `postgres` | PostgreSQL scanner |
| `mysql` | MySQL scanner |
| `fts` | Full-text search |
| `icu` | Unicode collation |
**User:** "I want to set up a DuckDB database for analyzing CSV sales data"

**Response:**
```python
import duckdb

# Create persistent database
con = duckdb.connect('sales_analytics.duckdb')

# Configure for analytics workload
con.execute("""
    SET memory_limit = '4GB';
    SET threads = 4;
""")

# Import CSV data
con.execute("""
    CREATE TABLE sales AS
    SELECT * FROM read_csv('sales_data.csv',
        header=true,
        auto_detect=true
    );
""")

# Verify import
print(con.execute("SELECT COUNT(*) FROM sales").fetchone())
```
**User:** "Set up DuckDB to read from S3 and local parquet files"

**Response:**
```python
import duckdb

con = duckdb.connect('warehouse.duckdb')

# Install and configure S3 access
con.execute("""
    INSTALL httpfs;
    LOAD httpfs;
    SET s3_region = 'us-west-2';
""")

# For AWS credentials (if not using an instance profile)
# con.execute("SET s3_access_key_id = 'your_key'")
# con.execute("SET s3_secret_access_key = 'your_secret'")

# Create tables from multiple sources
con.execute("""
    CREATE TABLE cloud_data AS
    SELECT * FROM read_parquet('s3://my-bucket/data/*.parquet');

    CREATE TABLE local_data AS
    SELECT * FROM read_parquet('local_files/*.parquet');
""")
```
**User:** "Multiple processes need to read from the same DuckDB file"

**Response:**
```python
import duckdb

# Each process connects in read-only mode
con = duckdb.connect('shared.duckdb', read_only=True)

# Or via connection string
con = duckdb.connect('shared.duckdb?access_mode=read_only')

# Queries work normally
results = con.execute("SELECT * FROM my_table").fetchdf()
```
**Database locked error:**

- Another process holds a write connection to the file; close it, or connect with `read_only=True`

**Out of memory:**

- Lower `memory_limit`
- Set `temp_directory` to allow spilling to disk

**Extension not found:**

- Run `INSTALL extension_name` before `LOAD extension_name`

Bundled resources:

- `scripts/init_duckdb.py` — Database initialization script
- `assets/schema_template.sql` — Template for schema definitions
- `references/extensions.md` — Detailed extension documentation