Commit b9bc36bb authored by Ahmet Öner

Improve WhisperX implementation

parent 4298f3f5
Showing 292 additions and 277 deletions
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
{
"name": "Existing Docker Compose (Extend)",
// Update the 'dockerComposeFile' list if you have more compose files or use different names.
// The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
"dockerComposeFile": [
"../docker-compose.yml",
"docker-compose.yml"
],
// The 'service' property is the name of the service for the container that VS Code should
// use. Update this value and .devcontainer/docker-compose.yml to the real service name.
"service": "whisper-asr-webservice",
// The optional 'workspaceFolder' property is the path VS Code should open by default when
// connected. This is typically a file mount in .devcontainer/docker-compose.yml
"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
// "overrideCommand": "/bin/sh -c 'while sleep 1000; do :; done'"
"overrideCommand": true
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line if you want to start specific services in your Docker Compose config.
// "runServices": [],
// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
// "shutdownAction": "none",
// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "devcontainer"
}
version: '3.4'
services:
# Update this to the name of the service you want to work with in your docker-compose.yml file
whisper-asr-webservice:
# Uncomment if you want to override the service's Dockerfile to one in the .devcontainer
# folder. Note that the path of the Dockerfile and context is relative to the *primary*
# docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile"
# array). The sample below assumes your primary file is in the root of your project.
#
# build:
# context: .
# dockerfile: .devcontainer/Dockerfile
env_file: .devcontainer/dev.env
environment:
ASR_ENGINE: ${ASR_ENGINE}
HF_TOKEN: ${HF_TOKEN}
volumes:
# Update this to wherever you want VS Code to mount the folder of your project
- ..:/workspaces:cached
# Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust.
# cap_add:
# - SYS_PTRACE
# security_opt:
# - seccomp:unconfined
# Overrides default command so things don't shut down after the process ends.
command: sleep infinity
......@@ -12,7 +12,7 @@ env:
REPO_NAME: ${{secrets.REPO_NAME}}
jobs:
build:
runs-on: [self-hosted, ubuntu-latest]
runs-on: ubuntu-latest
strategy:
matrix:
include:
......@@ -22,10 +22,6 @@ jobs:
tag_extension: -gpu
platforms: linux/amd64
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Checkout
uses: actions/checkout@v3
- name: Set up QEMU
......
......@@ -42,5 +42,3 @@ pip-wheel-metadata
poetry/core/*
public
.devcontainer/dev.env
......@@ -4,12 +4,6 @@ FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui
FROM python:3.10-bookworm
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
ENV POETRY_VENV=/app/.venv
RUN python3 -m venv $POETRY_VENV \
......@@ -28,11 +22,6 @@ COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-ass
RUN poetry config virtualenvs.in-project true
RUN poetry install
RUN $POETRY_VENV/bin/pip install pandas transformers nltk pyannote.audio
RUN git clone --depth 1 https://github.com/m-bain/whisperX.git \
&& cd whisperX \
&& $POETRY_VENV/bin/pip install -e .
EXPOSE 9000
ENTRYPOINT ["whisper-asr-webservice"]
......@@ -6,12 +6,6 @@ FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu22.04
ENV PYTHON_VERSION=3.10
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
ENV POETRY_VENV=/app/.venv
RUN export DEBIAN_FRONTEND=noninteractive \
......@@ -47,11 +41,6 @@ COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-ass
RUN poetry install
RUN $POETRY_VENV/bin/pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121
RUN $POETRY_VENV/bin/pip install pandas transformers nltk pyannote.audio
RUN git clone --depth 1 https://github.com/m-bain/whisperX.git \
&& cd whisperX \
&& $POETRY_VENV/bin/pip install -e .
EXPOSE 9000
CMD whisper-asr-webservice
......@@ -3,9 +3,9 @@
![Build](https://img.shields.io/github/actions/workflow/status/ahmetoner/whisper-asr-webservice/docker-publish.yml.svg)
![Licence](https://img.shields.io/github/license/ahmetoner/whisper-asr-webservice.svg)
# Whisper ASR Webservice
# Whisper ASR Box
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitask model that can perform multilingual speech recognition as well as speech translation and language identification. For more details: [github.com/openai/whisper](https://github.com/openai/whisper/)
Whisper ASR Box is a general-purpose speech recognition toolkit. Whisper models are trained on a large dataset of diverse audio and are multitask models that can perform multilingual speech recognition as well as speech translation and language identification.
## Features
......@@ -19,24 +19,73 @@ Current release (v1.7.1) supports following whisper models:
### CPU
```sh
docker run -d -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest
```shell
docker run -d -p 9000:9000 \
-e ASR_MODEL=base \
-e ASR_ENGINE=openai_whisper \
onerahmet/openai-whisper-asr-webservice:latest
```
### GPU
```sh
docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base -e ASR_ENGINE=openai_whisper onerahmet/openai-whisper-asr-webservice:latest-gpu
```shell
docker run -d --gpus all -p 9000:9000 \
-e ASR_MODEL=base \
-e ASR_ENGINE=openai_whisper \
onerahmet/openai-whisper-asr-webservice:latest-gpu
```
For more information:
#### Cache
- [Documentation/Run](https://ahmetoner.github.io/whisper-asr-webservice/run)
- [Docker Hub](https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice)
To reduce container startup time by avoiding repeated downloads, you can persist the cache directory:
```shell
docker run -d -p 9000:9000 \
-v $PWD/cache:/root/.cache/ \
onerahmet/openai-whisper-asr-webservice:latest
```
## Key Features
- Multiple ASR engines support (OpenAI Whisper, Faster Whisper, WhisperX)
- Multiple output formats (text, JSON, VTT, SRT, TSV)
- Word-level timestamps support
- Voice activity detection (VAD) filtering
- Speaker diarization (with WhisperX)
- FFmpeg integration for broad audio/video format support
- GPU acceleration support
- Configurable model loading/unloading
- REST API with Swagger documentation
## Environment Variables
Key configuration options:
- `ASR_ENGINE`: Engine selection (openai_whisper, faster_whisper, whisperx)
- `ASR_MODEL`: Model selection (tiny, base, small, medium, large-v3, etc.)
- `ASR_MODEL_PATH`: Custom path to store/load models
- `ASR_DEVICE`: Device selection (cuda, cpu)
- `MODEL_IDLE_TIMEOUT`: Timeout for model unloading
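A hedged example combining these variables with the `docker run` invocations above (all values are illustrative; adjust the engine, model, and device to your setup):

```shell
# Illustrative configuration only; any of these can be omitted to fall back to the defaults.
docker run -d -p 9000:9000 \
  -e ASR_ENGINE=whisperx \
  -e ASR_MODEL=large-v3 \
  -e ASR_DEVICE=cpu \
  -e MODEL_IDLE_TIMEOUT=300 \
  -e HF_TOKEN=your_token_here \
  -v $PWD/cache:/root/.cache/ \
  onerahmet/openai-whisper-asr-webservice:latest
```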
## Documentation
Explore the documentation by clicking [here](https://ahmetoner.github.io/whisper-asr-webservice).
For complete documentation, visit:
[https://ahmetoner.github.io/whisper-asr-webservice](https://ahmetoner.github.io/whisper-asr-webservice)
## Development
```shell
# Install poetry
pip3 install poetry
# Install dependencies
poetry install
# Run service
poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000
```
After starting the service, visit `http://localhost:9000` or `http://0.0.0.0:9000` in your browser to access the Swagger UI documentation and try out the API endpoints.
## Credits
......
......@@ -15,8 +15,6 @@ class ASRModel(ABC):
"""
model = None
diarize_model = None # used for WhisperX
x_models = dict() # used for WhisperX
model_lock = Lock()
last_activity_time = time.time()
......@@ -75,6 +73,4 @@ class ASRModel(ABC):
torch.cuda.empty_cache()
gc.collect()
self.model = None
self.diarize_model = None
self.x_models = dict()
print("Model unloaded due to timeout")
......@@ -38,7 +38,8 @@ class FasterWhisperASR(ASRModel):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None: self.load_model()
if self.model is None:
self.load_model()
options_dict = {"task": task}
if language:
......@@ -91,7 +92,5 @@ class FasterWhisperASR(ASRModel):
WriteTSV(ResultWriter).write_result(result, file=file)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file)
else:
return "Please select an output method!"
WriteTXT(ResultWriter).write_result(result, file=file)
from typing import BinaryIO, Union
import time
from io import StringIO
import whisperx
from threading import Thread
from typing import BinaryIO, Union
import whisper
from whisperx.utils import SubtitlesWriter, ResultWriter
import whisperx
from whisperx.utils import ResultWriter, SubtitlesWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
from app.asr_models.asr_model import ASRModel
from app.config import CONFIG
from app.utils import WriteTXT, WriteSRT, WriteVTT, WriteTSV, WriteJSON
class WhisperXASR(ASRModel):
def __init__(self):
self.x_models = dict()
super().__init__()
self.model = {
'whisperx': None,
'diarize_model': None,
'align_model': {}
}
def load_model(self):
asr_options = {"without_timestamps": False}
self.model = whisperx.load_model(
CONFIG.MODEL_NAME, device=CONFIG.DEVICE, compute_type="float32", asr_options=asr_options
self.model['whisperx'] = whisperx.load_model(
CONFIG.MODEL_NAME,
device=CONFIG.DEVICE,
compute_type=CONFIG.MODEL_QUANTIZATION,
asr_options=asr_options
)
if CONFIG.HF_TOKEN != "":
self.diarize_model = whisperx.DiarizationPipeline(use_auth_token=CONFIG.HF_TOKEN, device=CONFIG.DEVICE)
self.model['diarize_model'] = whisperx.DiarizationPipeline(
use_auth_token=CONFIG.HF_TOKEN,
device=CONFIG.DEVICE
)
Thread(target=self.monitor_idleness, daemon=True).start()
def transcribe(
self,
......@@ -34,39 +48,42 @@ class WhisperXASR(ASRModel):
options: Union[dict, None],
output,
):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None:
self.load_model()
options_dict = {"task": task}
if language:
options_dict["language"] = language
if initial_prompt:
options_dict["initial_prompt"] = initial_prompt
with self.model_lock:
if self.model is None:
self.load_model()
result = self.model.transcribe(audio, **options_dict)
result = self.model['whisperx'].transcribe(audio, **options_dict)
language = result["language"]
# Load the required model and cache it
# If we transcribe audio in many different languages, this may lead to OOM problems
if result["language"] in self.x_models:
model_x, metadata = self.x_models[result["language"]]
if result["language"] in self.model['align_model']:
model_x, metadata = self.model['align_model'][result["language"]]
else:
self.x_models[result["language"]] = whisperx.load_align_model(
self.model['align_model'][result["language"]] = whisperx.load_align_model(
language_code=result["language"], device=CONFIG.DEVICE
)
model_x, metadata = self.x_models[result["language"]]
model_x, metadata = self.model['align_model'][result["language"]]
# Align whisper output
result = whisperx.align(
result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False
)
if options.get("diarize", False):
if CONFIG.HF_TOKEN == "":
print("Warning! HF_TOKEN is not set. Diarization may not work as expected.")
if options.get("diarize", False) and CONFIG.HF_TOKEN != "":
min_speakers = options.get("min_speakers", None)
max_speakers = options.get("max_speakers", None)
# add min/max number of speakers if known
diarize_segments = self.diarize_model(audio, min_speakers, max_speakers)
diarize_segments = self.model['diarize_model'](audio, min_speakers, max_speakers)
result = whisperx.assign_word_speakers(diarize_segments, result)
result["language"] = language
output_file = StringIO()
self.write_result(result, output_file, output)
......@@ -91,21 +108,19 @@ class WhisperXASR(ASRModel):
return detected_lang_code
def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]):
default_options = {
"max_line_width": CONFIG.SUBTITLE_MAX_LINE_WIDTH,
"max_line_count": CONFIG.SUBTITLE_MAX_LINE_COUNT,
"highlight_words": CONFIG.SUBTITLE_HIGHLIGHT_WORDS
}
if output == "srt":
if CONFIG.HF_TOKEN != "":
WriteSRT(SubtitlesWriter).write_result(result, file=file, options={})
else:
WriteSRT(ResultWriter).write_result(result, file=file, options={})
WriteSRT(SubtitlesWriter).write_result(result, file=file, options=default_options)
elif output == "vtt":
if CONFIG.HF_TOKEN != "":
WriteVTT(SubtitlesWriter).write_result(result, file=file, options={})
else:
WriteVTT(ResultWriter).write_result(result, file=file, options={})
WriteVTT(SubtitlesWriter).write_result(result, file=file, options=default_options)
elif output == "tsv":
WriteTSV(ResultWriter).write_result(result, file=file, options={})
WriteTSV(ResultWriter).write_result(result, file=file, options=default_options)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file, options={})
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file, options={})
WriteJSON(ResultWriter).write_result(result, file=file, options=default_options)
else:
return 'Please select an output method!'
WriteTXT(ResultWriter).write_result(result, file=file, options=default_options)
......@@ -86,7 +86,5 @@ class OpenAIWhisperASR(ASRModel):
WriteTSV(ResultWriter).write_result(result, file=file, options=options)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file, options=options)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file, options=options)
else:
return "Please select an output method!"
WriteTXT(ResultWriter).write_result(result, file=file, options=options)
......@@ -8,11 +8,13 @@ class CONFIG:
Configuration class for ASR models.
Reads environment variables for runtime configuration, with sensible defaults.
"""
# Determine the ASR engine ('faster_whisper' or 'openai_whisper')
# Determine the ASR engine ('faster_whisper', 'openai_whisper' or 'whisperx')
ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper")
# Retrieve Huggingface Token
HF_TOKEN = os.getenv("HF_TOKEN", "")
if ASR_ENGINE == "whisperx" and HF_TOKEN == "":
print("You must set the HF_TOKEN environment variable to download the diarization model used by WhisperX.")
# Determine the computation device (GPU or CPU)
DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
......@@ -38,3 +40,8 @@ class CONFIG:
# Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks.
SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000))
# Subtitle output options for whisperx
SUBTITLE_MAX_LINE_WIDTH = int(os.getenv("SUBTITLE_MAX_LINE_WIDTH", 1000))
SUBTITLE_MAX_LINE_COUNT = int(os.getenv("SUBTITLE_MAX_LINE_COUNT", 2))
SUBTITLE_HIGHLIGHT_WORDS = os.getenv("SUBTITLE_HIGHLIGHT_WORDS", "false").lower() == "true"
from app.asr_models.asr_model import ASRModel
from app.asr_models.faster_whisper_engine import FasterWhisperASR
from app.asr_models.openai_whisper_engine import OpenAIWhisperASR
from app.asr_models.mbain_whisperx_engine import WhisperXASR
from app.asr_models.openai_whisper_engine import OpenAIWhisperASR
from app.config import CONFIG
......
import json
import os
from dataclasses import asdict, is_dataclass
from typing import TextIO, BinaryIO, Union
from dataclasses import asdict
from typing import BinaryIO, TextIO
import ffmpeg
import numpy as np
......@@ -23,42 +23,14 @@ class ResultWriter:
with open(output_path, "w", encoding="utf-8") as f:
self.write_result(result, file=f)
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
def write_result(self, result: dict, file: TextIO):
raise NotImplementedError
def format_segments_in_result(self, result: dict):
if "segments" in result:
# Check if result["segments"] is a list
if isinstance(result["segments"], list):
# Check if the list is empty
if not result["segments"]:
# Handle the empty list case, you can choose to leave it as is or set it to an empty list
pass
else:
# Check if the first item in the list is a dataclass instance
if is_dataclass(result["segments"][0]):
result["segments"] = [asdict(segment) for segment in result["segments"]]
# If it's already a list of dicts, leave it as is
elif isinstance(result["segments"][0], dict):
pass
else:
# Handle the case where the list contains neither dataclass instances nor dicts
# You can choose to leave it as is or raise an error
pass
elif isinstance(result["segments"], dict):
# If it's already a dict, leave it as is
pass
else:
# Handle the case where result["segments"] is neither a list nor a dict
# You can choose to leave it as is or raise an error
pass
return result
class WriteTXT(ResultWriter):
extension: str = "txt"
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
def write_result(self, result: dict, file: TextIO):
for segment in result["segments"]:
print(segment.text.strip(), file=file, flush=True)
......@@ -66,13 +38,12 @@ class WriteTXT(ResultWriter):
class WriteVTT(ResultWriter):
extension: str = "vtt"
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
def write_result(self, result: dict, file: TextIO):
print("WEBVTT\n", file=file)
result = self.format_segments_in_result(result)
for segment in result["segments"]:
print(
f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
f"{segment['text'].strip().replace('-->', '->')}\n",
f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n"
f"{segment.text.strip().replace('-->', '->')}\n",
file=file,
flush=True,
)
......@@ -81,15 +52,14 @@ class WriteVTT(ResultWriter):
class WriteSRT(ResultWriter):
extension: str = "srt"
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
result = self.format_segments_in_result(result)
def write_result(self, result: dict, file: TextIO):
for i, segment in enumerate(result["segments"], start=1):
# write srt lines
print(
f"{i}\n"
f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
f"{segment['text'].strip().replace('-->', '->')}\n",
f"{format_timestamp(segment.start, always_include_hours=True, decimal_marker=',')} --> "
f"{format_timestamp(segment.end, always_include_hours=True, decimal_marker=',')}\n"
f"{segment.text.strip().replace('-->', '->')}\n",
file=file,
flush=True,
)
......@@ -107,20 +77,20 @@ class WriteTSV(ResultWriter):
extension: str = "tsv"
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
result = self.format_segments_in_result(result)
def write_result(self, result: dict, file: TextIO):
print("start", "end", "text", sep="\t", file=file)
for segment in result["segments"]:
print(round(1000 * segment["start"]), file=file, end="\t")
print(round(1000 * segment["end"]), file=file, end="\t")
print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
print(round(1000 * segment.start), file=file, end="\t")
print(round(1000 * segment.end), file=file, end="\t")
print(segment.text.strip().replace("\t", " "), file=file, flush=True)
class WriteJSON(ResultWriter):
extension: str = "json"
def write_result(self, result: dict, file: TextIO, options: Union[dict, None]):
result = self.format_segments_in_result(result)
def write_result(self, result: dict, file: TextIO):
if "segments" in result:
result["segments"] = [asdict(segment) for segment in result["segments"]]
json.dump(result, file)
......
......@@ -18,12 +18,7 @@ services:
- "9000:9000"
volumes:
- ./app:/app/app
- cache-pip:/root/.cache/pip
- cache-poetry:/root/.cache/poetry
- cache-whisper:/root/.cache/whisper
- cache-whisper:/root/.cache
volumes:
cache-pip:
cache-poetry:
cache-whisper:
cache-faster-whisper:
......@@ -11,12 +11,7 @@ services:
- "9000:9000"
volumes:
- ./app:/app/app
- cache-pip:/root/.cache/pip
- cache-poetry:/root/.cache/poetry
- cache-whisper:/root/.cache/whisper
- cache-whisper:/root/.cache
volumes:
cache-pip:
cache-poetry:
cache-whisper:
cache-faster-whisper:
{% extends "base.html" %}
{% block announce %}
For updates follow <strong>@ahmetoner</strong> on
<a rel="me" href="https://github.com/ahmetoner">
<span class="twemoji github">
{% include ".icons/fontawesome/brands/github.svg" %}
</span>
<strong>GitHub</strong>
</a>
{% endblock %}
\ No newline at end of file
......@@ -2,65 +2,60 @@
Install poetry with following command:
```sh
```shell
pip3 install poetry
```
Install torch with following command:
```sh
# just for GPU:
pip3 install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch
```
### Run
### Installation
Install packages:
```sh
```shell
poetry install
```
!!! Note
By default, this will install the CPU version of PyTorch. For GPU support, you'll need to install the appropriate CUDA version of PyTorch separately:
```shell
# For CUDA support (example for CUDA 12.1):
pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121
```
### Run
Starting the Webservice:
```sh
```shell
poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000
```
### Build
=== ":octicons-file-code-16: `Poetry`"
Build .whl package
```sh
poetry build
```
=== ":octicons-file-code-16: `Docker`"
With `Dockerfile`:
=== ":octicons-file-code-16: `CPU`"
```sh
```shell
# Build Image
docker build -t whisper-asr-webservice .
# Run Container
docker run -d -p 9000:9000 whisper-asr-webservice
# or
docker run -d -p 9001:9000 -e ASR_MODEL=base whisper-asr-webservice3
# or with specific model
docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice
```
=== ":octicons-file-code-16: `GPU`"
```sh
```shell
# Build Image
docker build -f Dockerfile.gpu -t whisper-asr-webservice-gpu .
# Run Container
docker run -d --gpus all -p 9000:9000 whisper-asr-webservice-gpu
# or
# or with specific model
docker run -d --gpus all -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice-gpu
```
......@@ -68,12 +63,19 @@ poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000
=== ":octicons-file-code-16: `CPU`"
```sh
```shell
docker-compose up --build
```
=== ":octicons-file-code-16: `GPU`"
```sh
docker-compose up --build -f docker-compose.gpu.yml
```shell
docker-compose -f docker-compose.gpu.yml up --build
```
=== ":octicons-file-code-16: `Poetry`"
Build .whl package
```shell
poetry build
```
\ No newline at end of file
......@@ -19,14 +19,18 @@ There are 2 endpoints available:
### Request URL Query Params
| Name | Values |
|-----------------|------------------------------------------------|
| audio_file | File |
| output | `text` (default), `json`, `vtt`, `srt`, `tsv` |
| task | `transcribe`, `translate` |
| language | `en` (default is auto recognition) |
| word_timestamps | false (default) |
| encode | true (default) |
| Name | Values | Description |
|-----------------|------------------------------------------------|----------------------------------------------------------------|
| audio_file | File | Audio or video file to transcribe |
| output | `text` (default), `json`, `vtt`, `srt`, `tsv` | Output format |
| task | `transcribe`, `translate` | Task type - transcribe in source language or translate to English |
| language | `en` (default is auto recognition) | Source language code (see supported languages) |
| word_timestamps | false (default) | Enable word-level timestamps (Faster Whisper only) |
| vad_filter | false (default) | Enable voice activity detection filtering (Faster Whisper only) |
| encode | true (default) | Encode audio through FFmpeg before processing |
| diarize | false (default) | Enable speaker diarization (WhisperX only) |
| min_speakers | null (default) | Minimum number of speakers for diarization (WhisperX only) |
| max_speakers | null (default) | Maximum number of speakers for diarization (WhisperX only) |
Example request with cURL
......@@ -40,11 +44,57 @@ curl -X POST -H "content-type: multipart/form-data" -F "audio_file=@/path/to/fil
- **segments**: Contains an entry per segment. Each entry provides `timestamps`, `transcript`, `token ids`, `word level timestamps` and other metadata
- **language**: Detected or provided language (as a language code)
### Response Formats
The API supports multiple output formats:
- **text**: Plain text transcript (default)
- **json**: Detailed JSON with segments, timestamps, and metadata
- **vtt**: WebVTT subtitle format
- **srt**: SubRip subtitle format
- **tsv**: Tab-separated values with timestamps
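As a hedged sketch of how the `output` parameter is used (assuming a local instance on port 9000; the `/asr` endpoint path and the file path are illustrative):

```shell
# Request SubRip subtitles instead of the default plain-text transcript.
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  -o output.srt \
  "http://localhost:9000/asr?task=transcribe&output=srt"
```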
### Supported Languages
The service supports all languages supported by Whisper. Some common language codes:
- Turkish (tr)
- English (en)
- Spanish (es)
- French (fr)
- German (de)
- Italian (it)
- Portuguese (pt)
- And many more...
See the [Whisper documentation](https://github.com/openai/whisper#available-models-and-languages) for the full list of supported languages.
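If the source language is known in advance, it can be passed explicitly to skip auto-detection; for example (same local-instance assumption, illustrative paths):

```shell
# Illustrative: Turkish source audio, translated to English plain text.
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  "http://localhost:9000/asr?task=translate&language=tr&output=text"
```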
### Speaker Diarization
When using the WhisperX engine with diarization enabled (`diarize=true`), the output will include speaker labels for each segment. This requires:
1. WhisperX engine to be configured
2. Valid Hugging Face token set in HF_TOKEN
3. Sufficient memory for diarization models
You can optionally specify `min_speakers` and `max_speakers` if you know the expected number of speakers.
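A hedged request sketch with diarization enabled (assumes `ASR_ENGINE=whisperx`, a valid `HF_TOKEN`, and a local instance on port 9000; the `/asr` path and speaker counts are illustrative):

```shell
# JSON output preserves the per-segment speaker labels added by diarization.
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  "http://localhost:9000/asr?output=json&diarize=true&min_speakers=2&max_speakers=4"
```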
## Language detection service /detect-language
Detects the language spoken in the uploaded file. Only the first 30 seconds of audio are processed.
Returns a json with following fields:
- **detected_language**: "english"
- **language_code**: "en"
- **detected_language**: Human readable language name (e.g. "english")
- **language_code**: ISO language code (e.g. "en")
- **confidence**: Confidence score between 0 and 1 indicating detection reliability
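An example request against this endpoint (same local-instance assumption as above, illustrative file path):

```shell
curl -X POST -H "content-type: multipart/form-data" \
  -F "audio_file=@/path/to/file" \
  "http://localhost:9000/detect-language"
```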
Example response:
```json
{
"detected_language": "english",
"language_code": "en",
"confidence": 0.98
}
```
### Configuring the `Engine`
=== ":octicons-file-code-16: `openai_whisper`"
```sh
```shell
export ASR_ENGINE=openai_whisper
```
=== ":octicons-file-code-16: `faster_whisper`"
```sh
```shell
export ASR_ENGINE=faster_whisper
```
=== ":octicons-file-code-16: `whisperx`"
```shell
export ASR_ENGINE=whisperx
```
### Configuring the `Model`
```sh
```shell
export ASR_MODEL=base
```
Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` and `large-v3-turbo`.
Available ASR_MODELs are:
- Standard models: `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` (or `large`), `large-v3-turbo` (or `turbo`)
- English-optimized models: `tiny.en`, `base.en`, `small.en`, `medium.en`
- Distilled models: `distil-large-v2`, `distil-medium.en`, `distil-small.en`, `distil-large-v3` (only for whisperx and faster-whisper)
For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en`
models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
The distilled models offer improved inference speed while maintaining good accuracy.
### Configuring the `Model Path`
```sh
```shell
export ASR_MODEL_PATH=/data/whisper
```
### Configuring the `Model Unloading Timeout`
```sh
```shell
export MODEL_IDLE_TIMEOUT=300
```
......@@ -37,10 +52,47 @@ Defaults to `0`. After no activity for this period (in seconds), unload the mode
### Configuring the `SAMPLE_RATE`
```sh
```shell
export SAMPLE_RATE=16000
```
Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks.
### Configuring Device and Quantization
```shell
export ASR_DEVICE=cuda # or 'cpu'
export ASR_QUANTIZATION=float32 # or 'float16', 'int8'
```
The `ASR_DEVICE` defaults to `cuda` if GPU is available, otherwise `cpu`.
The `ASR_QUANTIZATION` defines the precision for model weights:
- `float32`: 32-bit floating-point precision (higher precision, slower inference)
- `float16`: 16-bit floating-point precision (lower precision, faster inference)
- `int8`: 8-bit integer precision (lowest precision, fastest inference)
Defaults to `float32` for GPU, `int8` for CPU.
### Configuring Subtitle Options (WhisperX)
```shell
export SUBTITLE_MAX_LINE_WIDTH=1000
export SUBTITLE_MAX_LINE_COUNT=2
export SUBTITLE_HIGHLIGHT_WORDS=false
```
These options only apply when using the WhisperX engine:
- `SUBTITLE_MAX_LINE_WIDTH`: Maximum width of subtitle lines (default: 1000)
- `SUBTITLE_MAX_LINE_COUNT`: Maximum number of lines per subtitle (default: 2)
- `SUBTITLE_HIGHLIGHT_WORDS`: Enable word highlighting in subtitles (default: false)
### Hugging Face Token
```shell
export HF_TOKEN=your_token_here
```
Required when using the WhisperX engine to download the diarization model.