Overview

AudioPod AI’s Speech-to-Text API converts audio and video content into accurate text transcriptions using advanced AI models including WhisperX and Faster-Whisper. Get detailed transcriptions with speaker diarization, word-level timestamps, and confidence scores.

Key Features

  • Multi-Model Support: WhisperX, Whisper-Timestamped, Faster-Whisper
  • Speaker Diarization: Automatic speaker identification and separation
  • Word-Level Timestamps: Precise timing for each word
  • Confidence Scores: Quality metrics for transcription accuracy
  • 50+ Languages: Automatic language detection or manual specification
  • Large File Support: Handle videos up to 15 hours with chunking
  • Multiple Sources: Upload files or provide YouTube/video URLs
  • Editable Transcripts: Edit and refine transcription results

Authentication

All endpoints require authentication:
  • API Key: Authorization: Bearer your_api_key
  • JWT Token: Authorization: Bearer your_jwt_token

Transcribe from URLs

Transcribe YouTube Videos

Transcribe audio from YouTube or other video platforms.
from audiopod import Client

client = Client()

# --- Simple YouTube transcription ----------------------------------------
# Submit the URL and block until the job finishes (wait_for_completion).
transcription = client.transcription.transcribe_from_url(
    url="https://youtube.com/watch?v=example123",
    wait_for_completion=True  # Wait for result
)

print(f"Transcription completed!")
print(f"Duration: {transcription.duration}s")
print(f"Language: {transcription.language}")
print(f"Full text: {transcription.text}")

# --- Advanced transcription with speaker diarization ----------------------
advanced_transcription = client.transcription.transcribe_from_url(
    url="https://youtube.com/watch?v=example123",
    language="en",  # Optional: auto-detect if not specified
    model_type="whisperx",  # whisperx, whisper_timestamped, faster_whisper
    enable_speaker_diarization=True,
    min_speakers=2,
    max_speakers=5,
    enable_word_timestamps=True,
    enable_confidence_scores=True,
    chunk_duration=1800,  # 30 minutes per chunk
    wait_for_completion=True
)

# Access speaker-separated text.  Each segment is a dict; 'speaker' and
# 'confidence' may be absent, so read them with .get() and a fallback.
for segment in advanced_transcription.segments:
    speaker = segment.get('speaker', 'Unknown')
    text = segment['text']
    start_time = segment['start']
    end_time = segment['end']
    confidence = segment.get('confidence', 0.0)

    print(f"[{start_time:.2f}s - {end_time:.2f}s] {speaker}: {text} (confidence: {confidence:.2f})")

# --- Batch processing multiple URLs ---------------------------------------
urls = [
    "https://youtube.com/watch?v=video1",
    "https://youtube.com/watch?v=video2",
    "https://vimeo.com/123456789"
]

batch_results = client.transcription.transcribe_batch_from_urls(
    urls=urls,
    enable_speaker_diarization=True,
    model_type="whisperx",
    wait_for_completion=True
)

for i, result in enumerate(batch_results):
    print(f"\nVideo {i+1}: {urls[i]}")
    print(f"Status: {result.status}")
    # The REST API reports job statuses in upper case (e.g. "COMPLETED"),
    # so compare case-insensitively instead of against lowercase only.
    if result.status.upper() == "COMPLETED":
        print(f"Text preview: {result.text[:100]}...")
Response:
{
  "job_id": 123,
  "task_id": "celery_task_uuid_here",
  "status": "PENDING",
  "message": "Transcription job created successfully",
  "estimated_credits": 150,
  "estimated_duration": 1800.0,
  "source_urls": [
    "https://youtube.com/watch?v=example123"
  ]
}

Transcribe from Files

Upload Audio/Video Files

Transcribe from uploaded audio or video files.
from audiopod import Client

client = Client()

# --- Single file transcription --------------------------------------------
transcription = client.transcription.transcribe_from_file(
    audio_file="meeting_recording.mp3",
    language="en",
    model_type="whisperx",
    enable_speaker_diarization=True,
    min_speakers=2,
    max_speakers=8,
    enable_word_timestamps=True,
    enable_confidence_scores=True,
    wait_for_completion=True
)

print(f"Transcription completed!")
print(f"File: {transcription.source_file}")
print(f"Duration: {transcription.duration}s")
print(f"Language: {transcription.language}")
print(f"Full text: {transcription.text}")

# Access detailed segments with speakers.  'speaker' and 'confidence'
# may be missing from a segment, so read them with .get().
for segment in transcription.segments:
    speaker = segment.get('speaker', 'Unknown')
    text = segment['text']
    start_time = segment['start']
    end_time = segment['end']
    confidence = segment.get('confidence', 0.0)

    print(f"[{start_time:.2f}s - {end_time:.2f}s] {speaker}: {text}")

# --- Batch file transcription ---------------------------------------------
audio_files = [
    "meeting_recording.mp3",
    "interview.wav",
    "presentation.mp4",
    "podcast_episode.m4a"
]

batch_results = client.transcription.transcribe_batch_from_files(
    audio_files=audio_files,
    enable_speaker_diarization=True,
    model_type="whisperx",
    enable_word_timestamps=True,
    chunk_duration=1800,  # 30 minutes per chunk
    wait_for_completion=True
)

# Process results
for i, result in enumerate(batch_results):
    print(f"\nFile {i+1}: {audio_files[i]}")
    print(f"Status: {result.status}")
    # The REST API reports job statuses in upper case (e.g. "COMPLETED"),
    # so compare case-insensitively instead of against lowercase only.
    if result.status.upper() == "COMPLETED":
        print(f"Duration: {result.duration}s")
        print(f"Language: {result.language}")
        print(f"Text preview: {result.text[:100]}...")

        # Save transcription to file
        output_file = f"transcript_{i+1}.txt"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(result.text)
        print(f"Saved to: {output_file}")

# Advanced file processing with custom settings
def process_interview_file(file_path):
    """Transcribe an interview recording and return a speaker-grouped transcript.

    Runs the highest-accuracy model with diarization tuned for a typical
    interview (two main participants, room for up to four), then groups
    consecutive segments from the same speaker under one label.
    """
    result = client.transcription.transcribe_from_file(
        audio_file=file_path,
        model_type="whisperx",  # Best for accuracy
        enable_speaker_diarization=True,
        min_speakers=2,  # Interviewer + interviewee
        max_speakers=4,  # Allow for additional participants
        enable_word_timestamps=True,
        enable_confidence_scores=True,
        language="auto",  # Auto-detect language
        wait_for_completion=True
    )

    # Emit a new labelled line each time the speaker changes; while the
    # speaker stays the same, keep appending their text on the same line.
    parts = []
    last_speaker = None
    for seg in result.segments:
        who = seg.get('speaker', 'Unknown')
        line = seg['text'].strip()
        if who != last_speaker:
            parts.append(f"\n{who}:")
            last_speaker = who
        parts.append(f" {line}")

    return ''.join(parts)

# Process interview
interview_transcript = process_interview_file("important_interview.wav")
print("\nFormatted Interview Transcript:")
print(interview_transcript)

Job Management

Get Transcription Status

Check the progress and status of transcription jobs.
GET /api/v1/transcription/jobs/{job_id}
Authorization: Bearer {api_key}
Response (Completed):
{
  "id": 123,
  "user_id": "550e8400-e29b-41d4-a716-446655440000",
  "source_urls": ["https://youtube.com/watch?v=example123"],
  "language": "en",
  "model_type": "whisperx",
  "enable_speaker_diarization": true,
  "min_speakers": 2,
  "max_speakers": 5,
  "status": "COMPLETED",
  "progress": 100,
  "transcript_path": "/transcripts/job_123.json",
  "total_duration": 1847.5,
  "detected_language": "en",
  "confidence_score": 0.92,
  "created_at": "2024-01-15T10:30:00Z",
  "completed_at": "2024-01-15T10:45:30Z",
  "estimated_credits": 150,
  "display_name": "YouTube Video Transcription"
}

List Transcription Jobs

Get all transcription jobs for the authenticated user.
GET /api/v1/transcription/jobs?status=COMPLETED&limit=50&offset=0
Authorization: Bearer {api_key}

Download Transcripts

Get Transcript in Multiple Formats

Download transcripts in various formats including JSON, TXT, PDF, SRT, VTT, DOCX, and HTML.
GET /api/v1/transcription/jobs/{job_id}/transcript?format=json
Authorization: Bearer {api_key}
JSON Response Format:
{
  "job_id": 123,
  "detected_language": "en",
  "confidence_score": 0.92,
  "total_duration": 1847.5,
  "segments": [
    {
      "id": 1,
      "start": 0.0,
      "end": 4.5,
      "text": "Welcome to our podcast about artificial intelligence.",
      "confidence": 0.95,
      "speaker_id": 0,
      "speaker_label": "SPEAKER_00",
      "words": [
        {
          "word": "Welcome",
          "start": 0.0,
          "end": 0.8,
          "probability": 0.98
        },
        {
          "word": "to",
          "start": 0.8,
          "end": 1.0,
          "probability": 0.99
        }
      ]
    },
    {
      "id": 2,
      "start": 5.0,
      "end": 8.2,
      "text": "Thank you for having me on the show.",
      "confidence": 0.89,
      "speaker_id": 1,
      "speaker_label": "SPEAKER_01",
      "words": [...]
    }
  ],
  "speakers": [
    {
      "id": 0,
      "label": "SPEAKER_00",
      "total_speaking_time": 920.3
    },
    {
      "id": 1,
      "label": "SPEAKER_01", 
      "total_speaking_time": 827.2
    }
  ],
  "video_metadata": [
    {
      "video_id": "example123",
      "title": "AI Technology Discussion",
      "description": "A deep dive into AI technology trends",
      "duration": 1847.5,
      "uploader": "Tech Channel",
      "upload_date": "20240115"
    }
  ]
}

Edit Transcripts

Get Editable Transcript

Retrieve transcript in editable format for corrections.
GET /api/v1/transcription/jobs/{job_id}/edit
Authorization: Bearer {api_key}

Update Transcript

Submit edited transcript with corrections.
PUT /api/v1/transcription/jobs/{job_id}/edit
Authorization: Bearer {api_key}
Content-Type: application/json

{
  "segments": [
    {
      "id": 1,
      "start": 0.0,
      "end": 4.5,
      "text": "Welcome to our podcast about artificial intelligence.",
      "speaker_label": "SPEAKER_00",
      "confidence": 0.95
    }
  ],
  "edit_notes": "Corrected technical terms and speaker labels"
}

Get Transcript Versions

View edit history and versions of transcripts.
GET /api/v1/transcription/jobs/{job_id}/versions
Authorization: Bearer {api_key}

Extract Audio

Download Extracted Audio

Get clean audio files extracted from videos during transcription.
GET /api/v1/transcription/jobs/{job_id}/audio/{audio_index}
Authorization: Bearer {api_key}

Delete Jobs

Delete Transcription Job

Remove transcription jobs and associated data.
DELETE /api/v1/transcription/jobs/{job_id}
Authorization: Bearer {api_key}

Supported Languages

AudioPod AI supports automatic language detection or manual specification for 50+ languages:
| Language   | Code | Quality   | Notes                      |
|------------|------|-----------|----------------------------|
| English    | en   | Excellent | Best supported language    |
| Spanish    | es   | Excellent | High accuracy              |
| French     | fr   | Excellent | Good speaker diarization   |
| German     | de   | Excellent | Technical content support  |
| Portuguese | pt   | Very Good | Brazilian and European     |
| Italian    | it   | Very Good | Good word timestamps       |
| Russian    | ru   | Very Good | Cyrillic text support      |
| Japanese   | ja   | Good      | Hiragana/Katakana/Kanji    |
| Chinese    | zh   | Good      | Simplified and Traditional |
| Arabic     | ar   | Good      | RTL text support           |
| Hindi      | hi   | Good      | Devanagari script          |
| Korean     | ko   | Good      | Hangul script              |

Model Comparison

Choose the best model for your use case:
| Model               | Speed   | Accuracy | Speaker Diarization | Best For                 |
|---------------------|---------|----------|---------------------|--------------------------|
| whisperx            | Medium  | Highest  | Excellent           | Production transcription |
| faster-whisper      | Fastest | High     | Good                | Real-time applications   |
| whisper-timestamped | Slow    | High     | Good                | Detailed analysis        |

Best Practices

Audio Quality Guidelines

For best transcription results:
# Recommended audio specifications for the best transcription results.
audio_requirements = dict(
    sample_rate="16kHz or higher",
    format="WAV, MP3, M4A, or video formats",
    duration="Up to 15 hours supported",
    background_noise="Minimize for better accuracy",
    speech_clarity="Clear articulation preferred",
    multiple_speakers="Distinct voices work best",
)

# Chunking strategy for long-form content.
chunking_strategy = dict(
    chunk_duration=1800,  # 30 minutes per chunk
    overlap=30,           # 30 seconds overlap
    boundary_detection="sentence_level",  # Smart chunk boundaries
)

Cost Optimization

# Efficient transcription workflow
def transcribe_efficiently(audio_files, language="auto"):
    """Transcribe *audio_files* in grouped batches to control cost.

    Files are grouped by language and content type so each job shares a
    single configuration, then submitted with the accuracy-first model.

    NOTE(review): model identifiers here use hyphens ("faster-whisper")
    while the SDK examples elsewhere in this document pass underscored
    values ("faster_whisper") — confirm which spelling the API expects.
    """
    # Model recommendation per priority; only the accuracy entry is
    # used below, the rest document the available trade-offs.
    model_choice = {
        "speed_priority": "faster-whisper",
        "accuracy_priority": "whisperx",
        "analysis_priority": "whisper-timestamped"
    }

    # Batch similar files together and submit one job per batch.
    for batch in group_by_language_and_type(audio_files):
        job = create_transcription_job(
            files=batch,
            language=language,
            model_type=model_choice["accuracy_priority"],
            enable_speaker_diarization=True,
            chunk_duration=1800  # Optimal chunk size
        )
        monitor_job_progress(job["job_id"])

Error Handling

Pricing

Transcription pricing is based on audio duration:
| Service                  | Cost               | Description                     |
|--------------------------|--------------------|---------------------------------|
| Basic Transcription      | 660 credits/minute | Text-only transcription         |
| With Speaker Diarization | 660 credits/minute | Speaker identification included |
| With Word Timestamps     | 660 credits/minute | Word-level timing data          |
| Transcript Editing       | Free               | No additional cost for edits    |

Cost Examples

| Duration   | Features                   | Credits | USD Cost |
|------------|----------------------------|---------|----------|
| 10 minutes | Basic transcription        | 6600    | $0.88    |
| 30 minutes | With speakers + timestamps | 19800   | $2.64    |
| 1 hour     | Full features              | 39600   | $5.28    |
| 2 hours    | Full features              | 79200   | $10.56   |

Next Steps