Speech-to-text transcription using multiple engines (Whisper, Google Speech, Azure, AssemblyAI). Record audio, transcribe files, real-time transcription, speaker diarization, timestamps, and multi-language support. Use for meeting transcription, voice notes, audio file processing, or accessibility features.
Comprehensive speech-to-text capabilities using multiple STT engines. Record audio, transcribe files, real-time processing, speaker identification, and multi-language support.
When asked to transcribe audio, first install the dependencies for the chosen engine:
Core (required):
pip install sounddevice soundfile numpy --break-system-packages
Whisper (OpenAI - local, free):
pip install openai-whisper --break-system-packages
# For faster processing with GPU:
pip install openai-whisper torch --break-system-packages
Google Speech (requires API key):
pip install google-cloud-speech --break-system-packages
Azure Speech (requires API key):
pip install azure-cognitiveservices-speech --break-system-packages
AssemblyAI (requires API key):
pip install assemblyai --break-system-packages
Optional enhancements:
pip install pydub webrtcvad --break-system-packages # Audio processing
pip install pyaudio --break-system-packages # Alternative audio backend
See reference/setup-guide.md for detailed installation.
| Engine | Cost | Speed | Quality | Features | Best For |
|---|---|---|---|---|---|
| Whisper | Free | Medium | High | Multilingual, local | Privacy, offline, free |
| Google | Pay-per-use | Fast | High | Punctuation, diarization | Real-time, accuracy |
| Azure | Pay-per-use | Fast | High | Translation, custom | Enterprise integration |
| AssemblyAI | Pay-per-use | Medium | Very High | Diarization, sentiment | Analysis, insights |
Simple recording:
# Record 30 seconds
python scripts/record_audio.py --duration 30 --output recording.wav
# Record until stopped (Ctrl+C)
python scripts/record_audio.py --output recording.wav
# Record with voice activity detection
python scripts/record_audio.py --vad --output recording.wav
Advanced recording:
# Choose microphone
python scripts/list_devices.py # List available mics
python scripts/record_audio.py --device 1 --output recording.wav
# Specify quality
python scripts/record_audio.py \
--sample-rate 48000 \
--channels 2 \
--output recording.wav
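Before transcribing, it can be worth confirming the recording actually has the requested format. A dependency-free check using the stdlib `wave` module (the helper name is illustrative, not one of the skill's scripts):

```python
import wave

def wav_properties(path):
    """Return (sample_rate, channels, duration_seconds) of a WAV file."""
    with wave.open(path, "rb") as w:
        rate = w.getframerate()
        channels = w.getnchannels()
        duration = w.getnframes() / rate
    return rate, channels, duration
```

After `record_audio.py --sample-rate 48000 --channels 2`, this should report a 48000 Hz stereo file.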
Using Whisper (local, free):
# Basic transcription
python scripts/transcribe_whisper.py --file recording.wav
# Choose model size (tiny, base, small, medium, large)
python scripts/transcribe_whisper.py \
--file recording.wav \
--model medium
# With timestamps
python scripts/transcribe_whisper.py \
--file recording.wav \
--timestamps \
--output transcript.json
# Multiple languages
python scripts/transcribe_whisper.py \
--file recording.wav \
--language es # Spanish
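Internally, `transcribe_whisper.py` presumably wraps the openai-whisper Python API. A minimal sketch (the function name and defaults below are illustrative, not the script's actual interface):

```python
def transcribe_file(path, model_size="base", language=None, task="transcribe"):
    """Minimal Whisper transcription sketch (requires: pip install openai-whisper)."""
    import whisper  # imported lazily so this module loads without the package

    model = whisper.load_model(model_size)  # tiny / base / small / medium / large
    result = model.transcribe(path, language=language, task=task)
    # result["text"] holds the full transcript; result["segments"] carries
    # per-segment start/end timestamps and text.
    return result
```

Passing `task="translate"` gives the translate-to-English behavior used later in this document.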
Using Google Cloud:
# Export API key
export GOOGLE_APPLICATION_CREDENTIALS="path/to/credentials.json"
# Transcribe
python scripts/transcribe_google.py \
--file recording.wav \
--language en-US
# With speaker diarization
python scripts/transcribe_google.py \
--file recording.wav \
--diarization \
--speakers 2
Using Azure:
# Set credentials
export AZURE_SPEECH_KEY="your-key"
export AZURE_SPEECH_REGION="westus"
# Transcribe
python scripts/transcribe_azure.py --file recording.wav
# Real-time
python scripts/transcribe_azure_realtime.py --microphone
Using AssemblyAI:
# Set API key
export ASSEMBLYAI_API_KEY="your-key"
# Transcribe with features
python scripts/transcribe_assemblyai.py \
--file recording.wav \
--diarization \
--sentiment \
--topics
Stream from microphone:
# Whisper streaming (chunked)
python scripts/stream_whisper.py
# Google streaming
python scripts/stream_google.py
# Azure continuous recognition
python scripts/stream_azure.py
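Whisper has no true streaming mode, so chunked streaming (as in `stream_whisper.py`, presumably) amounts to buffering microphone samples into fixed-length windows and transcribing each window as it fills. The windowing itself is plain Python (a sketch; chunk length is an assumption):

```python
def chunk_stream(samples, chunk_seconds=5, sample_rate=16000):
    """Yield successive fixed-length windows from a flat sample sequence."""
    size = chunk_seconds * sample_rate
    for start in range(0, len(samples), size):
        chunk = samples[start:start + size]
        if chunk:
            yield chunk
```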
Plain text:
python scripts/transcribe_whisper.py --file audio.wav --output transcript.txt
JSON with metadata:
python scripts/transcribe_whisper.py \
--file audio.wav \
--format json \
--output transcript.json
# Output includes:
# - Text segments
# - Timestamps
# - Confidence scores
# - Language detection
SRT subtitles:
python scripts/transcribe_whisper.py \
--file video.mp4 \
--format srt \
--output subtitles.srt
VTT subtitles:
python scripts/transcribe_whisper.py \
--file video.mp4 \
--format vtt \
--output subtitles.vtt
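Both subtitle formats can be generated straight from the JSON segments. A minimal sketch of the timestamp math (SRT separates milliseconds with a comma, VTT with a dot; helper names are illustrative):

```python
def srt_timestamp(seconds):
    """Format seconds as an SRT timestamp, e.g. 3.5 -> "00:00:03,500"."""
    ms = round(seconds * 1000)
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def to_srt(segments):
    """Render whisper-style segments ({start, end, text}) as an SRT document."""
    blocks = []
    for i, seg in enumerate(segments, start=1):
        blocks.append(
            f"{i}\n{srt_timestamp(seg['start'])} --> {srt_timestamp(seg['end'])}\n"
            f"{seg['text'].strip()}\n"
        )
    return "\n".join(blocks)
```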
Scenario: Record and transcribe meeting with speaker labels
# 1. Record meeting
python scripts/record_audio.py \
--output meeting.wav \
--vad # Stop on silence
# 2. Transcribe with speaker diarization
python scripts/transcribe_google.py \
--file meeting.wav \
--diarization \
--speakers 4 \
--output meeting.json
# 3. Format for readability
python scripts/format_transcript.py \
--input meeting.json \
--format markdown \
--output meeting.md
# Result: Formatted transcript with speaker labels and timestamps
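The formatting step can be pictured as a simple render over diarized segments, merging consecutive turns by the same speaker. The field names below (`speaker`, `start`, `text`) are assumptions about `format_transcript.py`'s input, not its documented schema:

```python
def to_markdown(segments):
    """Render diarized segments as markdown, grouping consecutive
    turns by the same speaker (segment fields are assumed)."""
    lines, last_speaker = [], None
    for seg in segments:
        speaker = seg.get("speaker", "Speaker")
        if speaker != last_speaker:
            lines.append(f"\n**{speaker}** ({seg['start']:.1f}s):")
            last_speaker = speaker
        lines.append(seg["text"].strip())
    return "\n".join(lines).strip()
```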
Scenario: Quick voice note → markdown document
# Record voice note
python scripts/quick_note.py
# (Records audio, transcribes with Whisper, saves as markdown)
# Output: voice-note-2025-01-20-14-30.md
Scenario: Transcribe multiple audio files
# Batch process folder
python scripts/batch_transcribe.py \
--input ./recordings/ \
--output ./transcripts/ \
--engine whisper \
--model base
# Progress shown for each file
Scenario: Generate subtitles for video
# Extract audio from video
python scripts/extract_audio.py --video lecture.mp4 --output audio.wav
# Generate subtitles
python scripts/transcribe_whisper.py \
--file audio.wav \
--format srt \
--output lecture.srt
# Embed in video (requires ffmpeg)
python scripts/embed_subtitles.py \
--video lecture.mp4 \
--subtitles lecture.srt \
--output lecture-subbed.mp4
Scenario: Transcribe and translate
# Transcribe Spanish audio
python scripts/transcribe_whisper.py \
--file spanish-audio.wav \
--language es \
--output transcript-es.txt
# Translate to English
python scripts/transcribe_whisper.py \
--file spanish-audio.wav \
--task translate \
--output transcript-en.txt
| Model | Parameters | Size | Relative speed | VRAM | Accuracy |
|---|---|---|---|---|---|
| tiny | 39M | ~75MB | ~32x | ~1GB | Good |
| base | 74M | ~142MB | ~16x | ~1GB | Better |
| small | 244M | ~466MB | ~6x | ~2GB | Great |
| medium | 769M | ~1.5GB | ~2x | ~5GB | Excellent |
| large | 1550M | ~2.9GB | 1x | ~10GB | Best |
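Model choice can be automated from the VRAM column above. A small helper (the numbers mirror the table and are approximate; the function is illustrative, not one of the skill's scripts):

```python
# Approximate VRAM requirements from the table above, in GB
VRAM_GB = {"tiny": 1, "base": 1, "small": 2, "medium": 5, "large": 10}
ORDER = ["tiny", "base", "small", "medium", "large"]

def largest_model_for(vram_gb):
    """Pick the most accurate Whisper model that fits in the given VRAM."""
    fitting = [m for m in ORDER if VRAM_GB[m] <= vram_gb]
    return fitting[-1] if fitting else None
```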
Recommendation:
- tiny or base (fast, good enough)
- small or medium (balanced)
- large (best accuracy, slower)
- medium or large
- tiny or base

Whisper supports 99+ languages:
# Common languages
en # English
es # Spanish
fr # French
de # German
it # Italian
pt # Portuguese
nl # Dutch
pl # Polish
ru # Russian
ja # Japanese
ko # Korean
zh # Chinese
ar # Arabic
hi # Hindi
Full list: reference/language-codes.md
Identify who said what:
# Google (best diarization)
python scripts/transcribe_google.py \
--file meeting.wav \
--diarization \
--speakers 3 # Hint: 3 speakers expected
# AssemblyAI
python scripts/transcribe_assemblyai.py \
--file meeting.wav \
--diarization
# Output format:
# Speaker 1: Hello everyone, let's begin
# Speaker 2: Thanks for joining
# Speaker 1: Today's agenda includes...
Post-process with names:
python scripts/label_speakers.py \
--transcript meeting.json \
--labels "Alice,Bob,Charlie" \
--output meeting-labeled.txt
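A sketch of the labeling step, assuming the generic "Speaker N" tags shown above. The real script reads JSON, presumably; this operates on plain text for brevity:

```python
def label_speakers(text, labels):
    """Replace generic "Speaker N" tags with real names, in order.

    `labels` is a comma-separated string as passed on the command line,
    e.g. "Alice,Bob,Charlie" maps Speaker 1 -> Alice, Speaker 2 -> Bob, ...
    """
    for i, name in enumerate(labels.split(","), start=1):
        text = text.replace(f"Speaker {i}", name.strip())
    return text
```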
Enhance audio quality:
# Reduce noise
python scripts/denoise_audio.py \
--input noisy.wav \
--output clean.wav
# Normalize volume
python scripts/normalize_audio.py \
--input quiet.wav \
--output normalized.wav
# Convert format
python scripts/convert_audio.py \
--input audio.m4a \
--output audio.wav
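Peak normalization itself needs only the standard library. A rough sketch of what `normalize_audio.py` might do for 16-bit PCM WAV (the function and `peak` default are assumptions):

```python
import array
import wave

def normalize_wav(in_path, out_path, peak=0.9):
    """Scale 16-bit PCM samples so the loudest one hits `peak` of full scale."""
    with wave.open(in_path, "rb") as w:
        params = w.getparams()
        assert params.sampwidth == 2, "sketch handles 16-bit PCM only"
        samples = array.array("h", w.readframes(params.nframes))
    loudest = max(1, max(abs(s) for s in samples))
    gain = peak * 32767 / loudest
    scaled = array.array(
        "h", (int(max(-32768, min(32767, s * gain))) for s in samples)
    )
    with wave.open(out_path, "wb") as w:
        w.setparams(params)
        w.writeframes(scaled.tobytes())
```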
Transcript with timestamps:
{
"segments": [
{
"start": 0.0,
"end": 3.5,
"text": "Welcome to today's meeting.",
"confidence": 0.95
},
{
"start": 3.5,
"end": 7.2,
"text": "Let's review the quarterly results.",
"confidence": 0.92
}
]
}
Search by timestamp:
# Find text at specific time
python scripts/find_at_time.py \
--transcript meeting.json \
--time "5:30" # 5 minutes 30 seconds
# Extract time range
python scripts/extract_range.py \
--transcript meeting.json \
--start "2:00" \
--end "5:00" \
--output excerpt.txt
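Both utilities hinge on turning "M:SS" / "H:MM:SS" strings into seconds before searching the segment list. A sketch (function names are illustrative):

```python
def parse_timestamp(ts):
    """Convert "5:30" or "1:05:30" into seconds (330 and 3930 respectively)."""
    seconds = 0
    for part in ts.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds

def segments_at(segments, ts):
    """Return the segments whose [start, end] span covers the given time."""
    t = parse_timestamp(ts)
    return [s for s in segments if s["start"] <= t <= s["end"]]
```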
Per hour of audio:
Free tiers:
Recording:
- record_audio.py - Record from microphone
- list_devices.py - List audio devices
- test_microphone.py - Test mic input

Transcription:
- transcribe_whisper.py - Whisper transcription
- transcribe_google.py - Google Cloud STT
- transcribe_azure.py - Azure Speech
- transcribe_assemblyai.py - AssemblyAI

Real-time:
- stream_whisper.py - Whisper streaming
- stream_google.py - Google streaming
- stream_azure.py - Azure continuous

Processing:
- batch_transcribe.py - Batch processing
- format_transcript.py - Format output
- extract_audio.py - Extract from video
- denoise_audio.py - Noise reduction

Utilities:
- quick_note.py - Record + transcribe
- label_speakers.py - Add speaker names
- find_at_time.py - Search by timestamp
- convert_audio.py - Format conversion

"No module named 'whisper'"
pip install openai-whisper --break-system-packages
"Microphone not working"
# List devices
python scripts/list_devices.py
# Test specific device
python scripts/test_microphone.py --device 1
"Out of memory" (Whisper)
# Use smaller model
python scripts/transcribe_whisper.py --file audio.wav --model tiny
# Or process in chunks
python scripts/transcribe_chunked.py --file large-audio.wav
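Chunked processing amounts to splitting the file into overlapping windows so each fits in memory. The window arithmetic (chunk and overlap sizes below are illustrative, not `transcribe_chunked.py`'s actual defaults):

```python
def chunk_ranges(duration, chunk=600.0, overlap=5.0):
    """Split `duration` seconds into (start, end) windows with a small
    overlap so words at the boundaries are not lost."""
    ranges, start = [], 0.0
    while start < duration:
        end = min(start + chunk, duration)
        ranges.append((start, end))
        if end >= duration:
            break
        start = end - overlap
    return ranges
```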
"Poor transcription quality"
# Clean up the audio first, then retry with a larger model
python scripts/denoise_audio.py --input audio.wav --output clean.wav
python scripts/transcribe_whisper.py --file clean.wav --model medium
# If the language is misdetected, set it explicitly with --language
"API authentication failed"
# Google
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/credentials.json"
# Azure
export AZURE_SPEECH_KEY="your-key"
export AZURE_SPEECH_REGION="region"
# AssemblyAI
export ASSEMBLYAI_API_KEY="your-key"
See examples/ for complete workflows: