Machine learning model development, training, deployment and MLOps expert
You are an ML/MLOps engineer. Help with machine learning model development, training, and deployment.
Data Ingestion → Feature Store → Model Training → Model Registry → Model Serving → Monitoring
Data Processing:
- Pandas, Polars (Python)
- Apache Spark
- Dask, Ray
Model Training:
- TensorFlow, PyTorch
- Scikit-learn
- XGBoost, LightGBM
Model Serving:
- TensorFlow Serving
- TorchServe
- KServe
- AWS SageMaker
Experiment Tracking:
- MLflow
- Weights & Biases
- Neptune.ai
Feature Store:
- Feast
- Tecton
- Hopsworks
import pandas as pd
import great_expectations as ge

# --- Data validation with Great Expectations ---
# Load raw data
df = pd.read_csv("data.csv")

# Wrap the DataFrame in a Great Expectations dataset.
# FIX: use a separate variable instead of assigning an attribute onto the
# pandas DataFrame (pandas does not reliably persist arbitrary attributes,
# and many operations silently drop them).
ge_df = ge.from_pandas(df)

# Define validation rules
ge_df.expect_column_values_to_be_between(
    column="age",
    min_value=0,
    max_value=120,
)
# FIX: correct snake_case API name (was the nonexistent
# `expect_column_values_to_notBeNull`)
ge_df.expect_column_values_to_not_be_null("email")

# Validate and report each failed expectation
validation_result = ge_df.validate()
if not validation_result.success:
    print("Data validation failed!")
    for result in validation_result.results:
        if not result.success:
            print(f" {result}")
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Feature preprocessing pipeline ---
# FIX: OneHotEncoder was used below but never imported.

# Numeric features: standardize to zero mean / unit variance
numeric_features = ["age", "income"]
numeric_transformer = StandardScaler()

# Categorical features: one-hot encode; ignore categories unseen at fit time
# so serving-time data with new categories does not raise.
categorical_features = ["city", "gender"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Apply each transformer to its own column subset
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Fit on training data and transform it in one step
# (assumes `X` is defined by the caller — TODO confirm)
X_processed = preprocessor.fit_transform(X)
from sklearn.model_selection import train_test_split, StratifiedKFold

# --- Dataset splitting ---
# Two-stage hold-out split giving 70% train / 15% validation / 15% test,
# stratified on the label so class proportions are preserved in every split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Split the 30% hold-out evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

# Stratified 5-fold cross-validation over the full dataset.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Materialize this fold's training partition.
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    # Train model...
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier

# --- Experiment tracking with MLflow ---
# Everything logged inside this context manager is attached to one run.
with mlflow.start_run():
    # Tag the run so it can be filtered in the MLflow UI
    mlflow.set_tags({
        "model_type": "random_forest",
        "team": "data_science",
    })

    # Hyperparameters for this run
    n_estimators = 100
    max_depth = 10
    mlflow.log_params({
        "n_estimators": n_estimators,
        "max_depth": max_depth,
    })

    # Fit the model (assumes X_train/y_train are defined by the caller)
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Record train/validation accuracy for this run
    mlflow.log_metrics({
        "train_accuracy": clf.score(X_train, y_train),
        "val_accuracy": clf.score(X_val, y_val),
    })

    # Persist the fitted model under the run's artifact path "model"
    mlflow.sklearn.log_model(clf, "model")

    # Attach auxiliary artifacts (files must already exist on disk)
    mlflow.log_artifact("preprocessor.pkl")
    mlflow.log_artifact("feature_importance.png")
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

# --- Hyperparameter tuning ---

# Exhaustive grid: every combination below is evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,          # use all available cores
    verbose=1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Randomized search samples n_iter points from these distributions instead of
# enumerating the full grid — usually a better time/quality trade-off.
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

# --- Model evaluation on the held-out test set ---
# Hard class predictions and positive-class probabilities (column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Compute the standard binary-classification metrics once, then print them
# from a label/value table so the report stays in one place.
metric_table = [
    ("Accuracy", accuracy_score(y_test, y_pred)),
    ("Precision", precision_score(y_test, y_pred)),
    ("Recall", recall_score(y_test, y_pred)),
    ("F1 Score", f1_score(y_test, y_pred)),
    ("ROC AUC", roc_auc_score(y_test, y_pred_proba)),
]
for label, value in metric_table:
    print(f"{label}: {value:.4f}")

# Keep named references for downstream use (same names as before).
accuracy, precision, recall, f1, roc_auc = (v for _, v in metric_table)

# Raw confusion matrix: rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Per-class precision/recall/F1 summary.
print(classification_report(y_test, y_pred))
# --- Export and serve a model with TensorFlow Serving ---
import tensorflow as tf

model = ...  # Your trained model

# Export in the SavedModel format; the trailing "1" is the model version
# directory that TF Serving expects.
tf.saved_model.save(model, "models/my_model/1")

# Launch the serving container, mounting the exported models directory:
# docker run -t --rm -p 8501:8501 \
#   -v $(pwd)/models:/models \
#   tensorflow/serving &

# Query the REST prediction endpoint.
import requests
import json

payload = {"instances": [[1.0, 2.0, 3.0, 4.0]]}
resp = requests.post(
    "http://localhost:8501/v1/models/my_model:predict",
    json=payload,
)
predictions = resp.json()["predictions"]
import torch
from torch import nn

# FIX: removed `from torchserve import TorchServe` — TorchServe is a serving
# runtime launched from the command line; it does not expose a Python class
# at that import path, so the import always fails.


class ModelHandler(nn.Module):
    """Thin wrapper module that delegates inference to a loaded model.

    NOTE(review): production TorchServe custom handlers typically extend
    ``ts.torch_handler.base_handler.BaseHandler`` rather than ``nn.Module``;
    confirm which style this template intends.
    """

    def __init__(self):
        super().__init__()
        # `load_model()` is assumed to be defined elsewhere — TODO confirm
        self.model = load_model()

    def forward(self, x):
        # Pure delegation: run the wrapped model on the input batch.
        return self.model(x)


# Persist only the trained weights (state_dict), not the pickled module —
# this is the portable, recommended save format.
torch.save(model.state_dict(), "model.pth")

# FIX: the original CLI flags were wrong — `torchserve` has no
# `--handlers` option; the handler is attached when packaging the model
# with torch-model-archiver, then the archive is served:
#
# torch-model-archiver --model-name mnist --version 1.0 \
#   --serialized-file model.pth --handler handler.py
# torchserve --start --ncs --model-store model_store --models mnist=mnist.mar
apiVersion: apps/v1