EDA engineering practices
Use me when:
# Data validation with Great Expectations.
#
# NOTE: written against the GX Core 1.x API. The package exports no
# `GreatExpectations` class; the entry point is a Data Context obtained from
# `get_context()`, and data is validated by running an ExpectationSuite
# against a Batch.
import great_expectations as ge
import pandas as pd

# Load the data to validate.
df = pd.read_csv("data.csv")

# An ephemeral context keeps all configuration in memory, so re-running this
# script never collides with a previously saved suite or data source.
context = ge.get_context(mode="ephemeral")

# Define expectations
expectations = [
    # Every row must carry an id ...
    ge.expectations.ExpectColumnValuesToNotBeNull(column="id"),
    # ... and no id may appear twice.
    ge.expectations.ExpectColumnValuesToBeUnique(column="id"),
    # Ages outside 0..120 are treated as invalid.
    ge.expectations.ExpectColumnValuesToBeBetween(
        column="age", min_value=0, max_value=120
    ),
    # The distinct values found in `status` must all come from this set.
    ge.expectations.ExpectColumnDistinctValuesToBeInSet(
        column="status", value_set=["active", "inactive"]
    ),
]
suite = context.suites.add(ge.ExpectationSuite(name="default"))
for expectation in expectations:
    suite.add_expectation(expectation)

# Validate data
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="data.csv")
batch_definition = data_asset.add_batch_definition_whole_dataframe("default")
# The DataFrame is supplied at batch-retrieval time.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})
# `results.success` is True only if every expectation in the suite passed.
results = batch.validate(suite)
# Automated EDA report.
#
# NOTE(review): `pandas_profiling` was renamed to `ydata_profiling` as of
# v4.0; the import is kept as-is so no new dependency is introduced. Switch
# the import when the dependency is upgraded.
import pandas as pd
from pandas_profiling import ProfileReport

# NOTE(review): the original snippet used `df` without defining it. It is
# assumed here to be the same "data.csv" this file validates - confirm.
df = pd.read_csv("data.csv")

# Generate comprehensive EDA report
profile = ProfileReport(
    df,
    title="Data Profiling Report",
    explorative=True,
    # `missing_diagrams` is a mapping of diagram name -> enabled flag; a bare
    # boolean is rejected by the settings validation.
    missing_diagrams={"bar": True, "matrix": True, "heatmap": True},
    # `correlations` maps a correlation method to its options; each method is
    # toggled through its `calculate` flag.
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": True},
    },
)
profile.to_file("eda_report.html")