Name: Data Pipeline
Author: rikunisikawa

搜索技能.../

Data Pipeline | Skills Pool

# terraform/modules/glue/main.tf
#
# Glue Data Catalog entry for the health_records table. The table is
# registered as an external table and marked as Iceberg through the
# "table_type" parameter.
resource "aws_glue_catalog_table" "health_records" {
  name          = "health_records"
  database_name = aws_glue_catalog_database.this.name

  table_type = "EXTERNAL_TABLE"
  parameters = {
    "table_type"        = "ICEBERG"
    "metadata_location" = "s3tables://${var.table_bucket_arn}/..."
  }

  # Column definitions. HCL has no ";" statement separator, so every
  # argument must sit on its own line.
  storage_descriptor {
    columns {
      name = "user_id"
      type = "string"
    }
    columns {
      name = "recorded_at"
      type = "timestamp"
    }
    columns {
      name = "fatigue"
      type = "int"
    }
    columns {
      name = "mood"
      type = "int"
    }
    columns {
      name = "motivation"
      type = "int"
    }
    columns {
      name = "flags"
      type = "int"
    }
  }
}

# 1. Apply the schema change with Terraform (after user confirmation)

# 2. Add the column with an Athena DDL statement
QUERY="ALTER TABLE health_records ADD COLUMNS (new_col string)"

QUERY_ID=$(aws athena start-query-execution \
  --query-string "$QUERY" \
  --query-execution-context Database=health_logger_prod_health_logs \
  --result-configuration OutputLocation=s3://health-logger-prod/athena-results/ \
  --region ap-northeast-1 \
  --query 'QueryExecutionId' --output text)

# 3. Confirm execution
# start-query-execution only submits the query, so poll until the state is
# no longer QUEUED/RUNNING instead of reading it a single time.
STATE="QUEUED"
while [ "$STATE" = "QUEUED" ] || [ "$STATE" = "RUNNING" ]; do
  sleep 2
  # If the CLI call fails, STATE becomes empty and the loop ends.
  STATE=$(aws athena get-query-execution \
    --query-execution-id "$QUERY_ID" \
    --region ap-northeast-1 \
    --query 'QueryExecution.Status.State' --output text)
done
echo "ALTER TABLE query $QUERY_ID finished with state: ${STATE:-UNKNOWN}"

# 4. Test query
# Only query the new column when the DDL actually succeeded.
if [ "$STATE" = "SUCCEEDED" ]; then
  aws athena start-query-execution \
    --query-string "SELECT new_col FROM health_records LIMIT 1" \
    --query-execution-context Database=health_logger_prod_health_logs \
    --result-configuration OutputLocation=s3://health-logger-prod/athena-results/ \
    --region ap-northeast-1
else
  echo "Skipping test query: schema change did not succeed" >&2
fi

-- Bad: full table scan
SELECT *
FROM health_records;

-- OK: narrow the scan with partition filters
SELECT
    user_id,
    recorded_at,
    fatigue,
    mood,
    motivation
FROM health_records
WHERE
    user_id = 'xxx'
    AND recorded_at >= CURRENT_DATE - INTERVAL '30' DAY
ORDER BY recorded_at DESC
LIMIT 100;

# Always terminate the record with "\n" (a Firehose requirement).
payload = {
    "user_id": user_id,
    "recorded_at": datetime.now(timezone.utc).isoformat(),
    "fatigue": 5,
    "mood": 7,
    "motivation": 6,
    "flags": 9,
}
record = json.dumps(payload, ensure_ascii=False) + "\n"

# Send the newline-terminated JSON line as UTF-8 bytes.
firehose.put_record(
    DeliveryStreamName=STREAM_NAME,
    Record={"Data": record.encode("utf-8")},
)

Data Pipeline

Purpose

Responsibilities

Pipeline Overview

Iceberg テーブルスキーマ設計

health_records テーブル

Data Pipeline

Purpose

Responsibilities

Pipeline Overview

Iceberg テーブルスキーマ設計

health_records テーブル

Iceberg スキーマ変更手順

Athena クエリ最適化

Firehose JSON Lines フォーマット

Best Practices

Output Format

Clickhouse Io

Clickhouse Io

Claude Devfleet

Clickhouse Io

Ai First Engineering

Postgres Patterns