Unverified commit 0bcc587c authored by Ahmet Öner, committed by GitHub

Merge pull request #261 from ahmetoner/upgrade-faster-whisper

Refactor classes and upgrade faster whisper to v1.1.0
parents 7d3e8876 319f58b1
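Changelog (assumed to be CHANGELOG.md):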
@@ -10,6 +10,25 @@ Unreleased
- Added detection confidence to language detection endpoint
- Set mel generation to adjust n_dims automatically to match the loaded model
### Added
- Refactored classes, added comments, implemented abstract methods, and added a factory method for engine selection
### Changed
- Upgraded
- [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) to [v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0)
- uvicorn to v0.34.0
- tqdm to v4.67.1
- python-multipart to v0.0.20
- fastapi to v0.115.6
- pytest to v8.3.4
- ruff to v0.8.3
- black to v24.10.0
- mkdocs to v1.6.1
- mkdocs-material to v9.5.49
- pymdown-extensions to v10.12
[1.6.0] (2024-10-06)
--------------------
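New file app/asr_models/asr_model.py, the abstract base class shared by both engines: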
import gc
import time
from abc import ABC, abstractmethod
from threading import Lock
from typing import Union
import torch
from app.config import CONFIG
class ASRModel(ABC):
"""
Abstract base class for ASR (Automatic Speech Recognition) models.
"""
model = None
model_lock = Lock()
last_activity_time = time.time()
def __init__(self):
pass
@abstractmethod
def load_model(self):
"""
Loads the model from the specified path.
"""
pass
@abstractmethod
def transcribe(self,
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None]
):
"""
Perform transcription on the given audio file.
"""
pass
@abstractmethod
def language_detection(self, audio):
"""
Perform language detection on the given audio file.
"""
pass
def monitor_idleness(self):
"""
Monitors the idleness of the ASR model and releases the model if it has been idle for too long.
"""
if CONFIG.MODEL_IDLE_TIMEOUT <= 0: return
while True:
time.sleep(15)
if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT:
with self.model_lock:
self.release_model()
break
def release_model(self):
"""
Unloads the model from memory and clears any cached GPU memory.
"""
del self.model
torch.cuda.empty_cache()
gc.collect()
self.model = None
print("Model unloaded due to timeout")
import time
from io import StringIO
from threading import Thread
from typing import BinaryIO, Union
import whisper
from faster_whisper import WhisperModel
from app.asr_models.asr_model import ASRModel
from app.config import CONFIG
from app.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
class FasterWhisperASR(ASRModel):
def load_model(self):
self.model = WhisperModel(
model_size_or_path=CONFIG.MODEL_NAME,
device=CONFIG.DEVICE,
compute_type=CONFIG.MODEL_QUANTIZATION,
download_root=CONFIG.MODEL_PATH
)
Thread(target=self.monitor_idleness, daemon=True).start()
def transcribe(
self,
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
output,
):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None: self.load_model()
options_dict = {"task": task}
if language:
options_dict["language"] = language
if initial_prompt:
options_dict["initial_prompt"] = initial_prompt
if vad_filter:
options_dict["vad_filter"] = True
if word_timestamps:
options_dict["word_timestamps"] = True
with self.model_lock:
segments = []
text = ""
segment_generator, info = self.model.transcribe(audio, beam_size=5, **options_dict)
for segment in segment_generator:
segments.append(segment)
text = text + segment.text
result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text}
output_file = StringIO()
self.write_result(result, output_file, output)
output_file.seek(0)
return output_file
def language_detection(self, audio):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None: self.load_model()
# load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio)
# detect the spoken language
with self.model_lock:
segments, info = self.model.transcribe(audio, beam_size=5)
detected_lang_code = info.language
detected_language_confidence = info.language_probability
return detected_lang_code, detected_language_confidence
def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]):
if output == "srt":
WriteSRT(ResultWriter).write_result(result, file=file)
elif output == "vtt":
WriteVTT(ResultWriter).write_result(result, file=file)
elif output == "tsv":
WriteTSV(ResultWriter).write_result(result, file=file)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file)
else:
return "Please select an output method!"
import time
from io import StringIO
from threading import Thread
from typing import BinaryIO, Union
import torch
import whisper
from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
from app.asr_models.asr_model import ASRModel
from app.config import CONFIG
class OpenAIWhisperASR(ASRModel):
def load_model(self):
if torch.cuda.is_available():
self.model = whisper.load_model(
name=CONFIG.MODEL_NAME,
download_root=CONFIG.MODEL_PATH
).cuda()
else:
self.model = whisper.load_model(
name=CONFIG.MODEL_NAME,
download_root=CONFIG.MODEL_PATH
)
Thread(target=self.monitor_idleness, daemon=True).start()
def transcribe(
self,
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
output,
):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None: self.load_model()
options_dict = {"task": task}
if language:
options_dict["language"] = language
if initial_prompt:
options_dict["initial_prompt"] = initial_prompt
if word_timestamps:
options_dict["word_timestamps"] = word_timestamps
with self.model_lock:
result = self.model.transcribe(audio, **options_dict)
output_file = StringIO()
self.write_result(result, output_file, output)
output_file.seek(0)
return output_file
def language_detection(self, audio):
self.last_activity_time = time.time()
with self.model_lock:
if self.model is None: self.load_model()
# load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, self.model.dims.n_mels).to(self.model.device)
# detect the spoken language
with self.model_lock:
_, probs = self.model.detect_language(mel)
detected_lang_code = max(probs, key=probs.get)
        return detected_lang_code, probs[detected_lang_code]
def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]):
options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False}
if output == "srt":
WriteSRT(ResultWriter).write_result(result, file=file, options=options)
elif output == "vtt":
WriteVTT(ResultWriter).write_result(result, file=file, options=options)
elif output == "tsv":
WriteTSV(ResultWriter).write_result(result, file=file, options=options)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file, options=options)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file, options=options)
else:
return "Please select an output method!"
import os
import torch
class CONFIG:
"""
Configuration class for ASR models.
Reads environment variables for runtime configuration, with sensible defaults.
"""
# Determine the ASR engine ('faster_whisper' or 'openai_whisper')
ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper")
# Determine the computation device (GPU or CPU)
DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
# Model name to use (e.g., "base", "small", etc.)
MODEL_NAME = os.getenv("ASR_MODEL", "base")
# Path to the model directory
MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
# Model quantization level. Defines the precision for model weights:
# 'float32' - 32-bit floating-point precision (higher precision, slower inference)
# 'float16' - 16-bit floating-point precision (lower precision, faster inference)
# 'int8' - 8-bit integer precision (lowest precision, fastest inference)
# Defaults to 'float32' for GPU availability, 'int8' for CPU.
MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8")
if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}:
raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.")
# Idle timeout in seconds. If set to a non-zero value, the model will be unloaded
# after being idle for this many seconds. A value of 0 means the model will never be unloaded.
MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0))
# Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks.
SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000))
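New file app/factory/asr_model_factory.py, which selects the engine based on CONFIG.ASR_ENGINE: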
from app.asr_models.asr_model import ASRModel
from app.asr_models.faster_whisper_engine import FasterWhisperASR
from app.asr_models.openai_whisper_engine import OpenAIWhisperASR
from app.config import CONFIG
class ASRModelFactory:
@staticmethod
def create_asr_model() -> ASRModel:
if CONFIG.ASR_ENGINE == "openai_whisper":
return OpenAIWhisperASR()
elif CONFIG.ASR_ENGINE == "faster_whisper":
return FasterWhisperASR()
else:
raise ValueError(f"Unsupported ASR engine: {CONFIG.ASR_ENGINE}")
import os
from io import StringIO
from threading import Lock, Thread
from typing import BinaryIO, Union
import time
import gc
import torch
import whisper
from faster_whisper import WhisperModel
from .utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
model_name = os.getenv("ASR_MODEL", "base")
model_path = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
model = None
model_lock = Lock()
# More about available quantization levels is here:
# https://opennmt.net/CTranslate2/quantization.html
last_activity_time = time.time()
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 0)) # default to being disabled
def monitor_idleness():
global model
if(idle_timeout <= 0): return
while True:
time.sleep(15)
if time.time() - last_activity_time > idle_timeout:
with model_lock:
release_model()
break
def load_model():
global model, device, model_quantization
if torch.cuda.is_available():
device = "cuda"
model_quantization = os.getenv("ASR_QUANTIZATION", "float32")
else:
device = "cpu"
model_quantization = os.getenv("ASR_QUANTIZATION", "int8")
model = WhisperModel(
model_size_or_path=model_name, device=device, compute_type=model_quantization, download_root=model_path
)
Thread(target=monitor_idleness, daemon=True).start()
load_model()
def release_model():
global model
del model
torch.cuda.empty_cache()
gc.collect()
model = None
print("Model unloaded due to timeout")
def transcribe(
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
output,
):
global last_activity_time
last_activity_time = time.time()
with model_lock:
if(model is None): load_model()
options_dict = {"task": task}
if language:
options_dict["language"] = language
if initial_prompt:
options_dict["initial_prompt"] = initial_prompt
if vad_filter:
options_dict["vad_filter"] = True
if word_timestamps:
options_dict["word_timestamps"] = True
with model_lock:
segments = []
text = ""
segment_generator, info = model.transcribe(audio, beam_size=5, **options_dict)
for segment in segment_generator:
segments.append(segment)
text = text + segment.text
result = {"language": options_dict.get("language", info.language), "segments": segments, "text": text}
output_file = StringIO()
write_result(result, output_file, output)
output_file.seek(0)
return output_file
def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
with model_lock:
if(model is None): load_model()
# load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio)
# detect the spoken language
with model_lock:
segments, info = model.transcribe(audio, beam_size=5)
detected_lang_code = info.language
detected_language_confidence = info.language_probability
return detected_lang_code, detected_language_confidence
def write_result(result: dict, file: BinaryIO, output: Union[str, None]):
if output == "srt":
WriteSRT(ResultWriter).write_result(result, file=file)
elif output == "vtt":
WriteVTT(ResultWriter).write_result(result, file=file)
elif output == "tsv":
WriteTSV(ResultWriter).write_result(result, file=file)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file)
else:
return "Please select an output method!"
import os
from io import StringIO
from threading import Lock, Thread
from typing import BinaryIO, Union
import time
import gc
import torch
import whisper
from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
model_name = os.getenv("ASR_MODEL", "base")
model_path = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
model = None
model_lock = Lock()
last_activity_time = time.time()
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 0)) # default to being disabled
def monitor_idleness():
global model
if(idle_timeout <= 0): return
while True:
        time.sleep(15)  # check every 15 seconds
if time.time() - last_activity_time > idle_timeout:
with model_lock:
release_model()
break
def load_model():
global model
if torch.cuda.is_available():
model = whisper.load_model(model_name, download_root=model_path).cuda()
else:
model = whisper.load_model(model_name, download_root=model_path)
Thread(target=monitor_idleness, daemon=True).start()
load_model()
def release_model():
global model
del model
torch.cuda.empty_cache()
gc.collect()
model = None
print("Model unloaded due to timeout")
def transcribe(
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
output,
):
global last_activity_time
last_activity_time = time.time()
with model_lock:
if(model is None): load_model()
options_dict = {"task": task}
if language:
options_dict["language"] = language
if initial_prompt:
options_dict["initial_prompt"] = initial_prompt
if word_timestamps:
options_dict["word_timestamps"] = word_timestamps
with model_lock:
result = model.transcribe(audio, **options_dict)
output_file = StringIO()
write_result(result, output_file, output)
output_file.seek(0)
return output_file
def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
with model_lock:
if(model is None): load_model()
# load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, model.dims.n_mels).to(model.device)
# detect the spoken language
with model_lock:
_, probs = model.detect_language(mel)
detected_lang_code = max(probs, key=probs.get)
return detected_lang_code, probs[max(probs)]
def write_result(result: dict, file: BinaryIO, output: Union[str, None]):
options = {"max_line_width": 1000, "max_line_count": 10, "highlight_words": False}
if output == "srt":
WriteSRT(ResultWriter).write_result(result, file=file, options=options)
elif output == "vtt":
WriteVTT(ResultWriter).write_result(result, file=file, options=options)
elif output == "tsv":
WriteTSV(ResultWriter).write_result(result, file=file, options=options)
elif output == "json":
WriteJSON(ResultWriter).write_result(result, file=file, options=options)
elif output == "txt":
WriteTXT(ResultWriter).write_result(result, file=file, options=options)
else:
return "Please select an output method!"
import json
import os
from typing import TextIO, BinaryIO

import ffmpeg
import numpy as np
from faster_whisper.utils import format_timestamp

from app.config import CONFIG


class ResultWriter:
    extension: str
@@ -85,3 +89,36 @@ class WriteJSON(ResultWriter):
    def write_result(self, result: dict, file: TextIO):
        json.dump(result, file)
def load_audio(file: BinaryIO, encode=True, sr: int = CONFIG.SAMPLE_RATE):
"""
Open an audio file object and read as mono waveform, resampling as necessary.
Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py to accept a file object
Parameters
----------
file: BinaryIO
The audio file like object
encode: Boolean
If true, encode audio stream to WAV before sending to whisper
sr: int
The sample rate to resample the audio if necessary
Returns
-------
A NumPy array containing the audio waveform, in float32 dtype.
"""
if encode:
try:
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input("pipe:", threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=file.read())
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
else:
out = file.read()
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
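A short, hedged sketch of calling this helper directly; the file path is a placeholder.

```python
# Hedged sketch: decode an audio file into the float32 mono waveform expected by the engines.
from app.utils import load_audio

with open("speech.wav", "rb") as f:  # placeholder path
    waveform = load_audio(f, encode=True)  # ffmpeg down-mixes to mono and resamples to CONFIG.SAMPLE_RATE
print(waveform.dtype, waveform.shape)  # float32, one-dimensional
```

Updated web service module (app/webservice.py), which now delegates to the factory-created engine: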
import importlib.metadata
import os
from os import path
from typing import Annotated, Optional, Union
from urllib.parse import quote

import click
import uvicorn
from fastapi import FastAPI, File, Query, UploadFile, applications
from fastapi.openapi.docs import get_swagger_ui_html
@@ -14,13 +12,13 @@ from fastapi.responses import RedirectResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from whisper import tokenizer

from app.config import CONFIG
from app.factory.asr_model_factory import ASRModelFactory
from app.utils import load_audio

asr_model = ASRModelFactory.create_asr_model()
asr_model.load_model()

LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys())

(Removed here: the direct `import ffmpeg` and `import numpy as np`, the ASR_ENGINE-based import of `language_detection`/`transcribe` from the old core modules, and the module-level `SAMPLE_RATE = 16000` constant, all superseded by CONFIG, the factory, and app/utils.py.)

projectMetadata = importlib.metadata.metadata("whisper-asr-webservice")
@@ -37,6 +35,7 @@ assets_path = os.getcwd() + "/swagger-ui-assets"
if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/swagger-ui-bundle.js"):
    app.mount("/assets", StaticFiles(directory=assets_path), name="static")

    def swagger_monkey_patch(*args, **kwargs):
        return get_swagger_ui_html(
            *args,
@@ -46,6 +45,7 @@ if path.exists(assets_path + "/swagger-ui.css") and path.exists(assets_path + "/
            swagger_js_url="/assets/swagger-ui-bundle.js",
        )

    applications.get_swagger_ui_html = swagger_monkey_patch

@@ -65,20 +65,20 @@ async def asr(
        bool | None,
        Query(
            description="Enable the voice activity detection (VAD) to filter out parts of the audio without speech",
            include_in_schema=(True if CONFIG.ASR_ENGINE == "faster_whisper" else False),
        ),
    ] = False,
    word_timestamps: bool = Query(default=False, description="Word level timestamps"),
    output: Union[str, None] = Query(default="txt", enum=["txt", "vtt", "srt", "tsv", "json"]),
):
    result = asr_model.transcribe(
        load_audio(audio_file.file, encode), task, language, initial_prompt, vad_filter, word_timestamps, output
    )
    return StreamingResponse(
        result,
        media_type="text/plain",
        headers={
            "Asr-Engine": CONFIG.ASR_ENGINE,
            "Content-Disposition": f'attachment; filename="{quote(audio_file.filename)}.{output}"',
        },
    )

@@ -89,41 +89,10 @@ async def detect_language(
    audio_file: UploadFile = File(...),  # noqa: B008
    encode: bool = Query(default=True, description="Encode audio first through FFmpeg"),
):
    detected_lang_code, confidence = asr_model.language_detection(load_audio(audio_file.file, encode))
    return {"detected_language": tokenizer.LANGUAGES[detected_lang_code], "language_code": detected_lang_code,
            "confidence": confidence}

(Removed here: the local `load_audio` helper, now defined in app/utils.py as shown above.)

@click.command()
@click.option(
@@ -147,5 +116,6 @@ def start(
):
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    start()
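For completeness, a hedged client-side sketch against the running service. It assumes the endpoints are mounted at `/asr` and `/detect-language` and that the server listens on port 9000 (the route decorators and CLI defaults fall outside the hunks shown above), and it uses the third-party `requests` package, which is not a project dependency.

```python
# Hedged sketch: call the webservice over HTTP (endpoint paths and port are assumptions).
import requests  # not a project dependency; used here only for illustration

with open("speech.wav", "rb") as f:  # placeholder audio file
    resp = requests.post(
        "http://localhost:9000/asr",
        params={"task": "transcribe", "output": "json", "encode": True},
        files={"audio_file": f},
    )
print(resp.headers.get("Asr-Engine"))  # the engine name is echoed in this response header
print(resp.text[:200])

with open("speech.wav", "rb") as f:
    detection = requests.post("http://localhost:9000/detect-language", files={"audio_file": f}).json()
print(detection)  # {"detected_language": ..., "language_code": ..., "confidence": ...}
```

Documentation update for the environment-variables page: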
@@ -15,9 +15,10 @@
export ASR_MODEL=base
```

Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` and `large-v3-turbo`.

For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.

### Configuring the `Model Path`
@@ -28,7 +29,18 @@ export ASR_MODEL_PATH=/data/whisper

### Configuring the `Model Unloading Timeout`

```sh
export MODEL_IDLE_TIMEOUT=300
```

Defaults to `0`. After no activity for this period (in seconds), the model is unloaded until it is requested again. Setting `0` disables the timeout, keeping the model loaded indefinitely. (This variable was previously named `IDLE_TIMEOUT`.)

### Configuring the `SAMPLE_RATE`

```sh
export SAMPLE_RATE=16000
```

Defaults to `16000`, the sample rate used for audio input; `16 kHz` is the rate commonly used for speech-to-text.
(One additional diff in this merge request is collapsed and not shown here.)
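Updated pyproject.toml dependency constraints (previous versions noted in comments):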
@@ -18,15 +18,15 @@ priority = "explicit"

[tool.poetry.dependencies]
python = "^3.10"
uvicorn = { extras = ["standard"], version = "^0.34.0" }  # was ^0.31.0
tqdm = "^4.67.1"  # was ^4.66.5
python-multipart = "^0.0.20"  # was ^0.0.12
ffmpeg-python = "^0.2.0"
fastapi = "^0.115.6"  # was ^0.115.0
llvmlite = "^0.43.0"
numba = "^0.60.0"
openai-whisper = "^20240930"
faster-whisper = "^1.1.0"  # was ^1.0.3
torch = [
    { markers = "sys_platform == 'darwin' and platform_machine == 'arm64'", url = "https://download.pytorch.org/whl/cpu/torch-1.13.1-cp310-none-macosx_11_0_arm64.whl" },
    { markers = "sys_platform == 'linux' and platform_machine == 'arm64'", url = "https://download.pytorch.org/whl/cpu/torch-1.13.1-cp310-none-macosx_11_0_arm64.whl" },
@@ -37,12 +37,12 @@ torch = [
]

[tool.poetry.dev-dependencies]
pytest = "^8.3.4"  # was ^6.2.5
ruff = "^0.8.3"  # was ^0.5.0
black = "^24.10.0"  # was ^24.4.2
mkdocs = "^1.6.1"  # was ^1.6.0
mkdocs-material = "^9.5.49"  # was ^9.5.27
pymdown-extensions = "^10.12"  # was ^10.8.1

[build-system]
requires = ["poetry-core>=1.0.0"]