Commit 53779e92 authored by Ahmet Öner

Move environment variables to `config.py`

parent 7b111d6e
`app/config.py` (new file):

```python
import os

import torch


class CONFIG:
    """
    Configuration class for ASR models.
    Reads environment variables for runtime configuration, with sensible defaults.
    """

    # ASR engine to use: 'faster_whisper' or 'openai_whisper'.
    ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper")

    # Computation device (GPU if available, otherwise CPU).
    DEVICE = os.getenv("ASR_DEVICE", "cuda" if torch.cuda.is_available() else "cpu")

    # Model name to use (e.g., "base", "small", etc.).
    MODEL_NAME = os.getenv("ASR_MODEL", "base")

    # Path to the model directory.
    MODEL_PATH = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))

    # Model quantization level. Defines the precision of the model weights:
    #   'float32' - 32-bit floating-point precision (higher precision, slower inference)
    #   'float16' - 16-bit floating-point precision (lower precision, faster inference)
    #   'int8'    - 8-bit integer precision (lowest precision, fastest inference)
    # Defaults to 'float32' when a GPU is available, 'int8' on CPU.
    MODEL_QUANTIZATION = os.getenv("ASR_QUANTIZATION", "float32" if torch.cuda.is_available() else "int8")
    if MODEL_QUANTIZATION not in {"float32", "float16", "int8"}:
        raise ValueError("Invalid MODEL_QUANTIZATION. Choose 'float32', 'float16', or 'int8'.")

    # Idle timeout in seconds. If non-zero, the model is unloaded after being idle
    # for this many seconds. A value of 0 means the model is never unloaded.
    MODEL_IDLE_TIMEOUT = int(os.getenv("MODEL_IDLE_TIMEOUT", 0))

    # Default sample rate for audio input. 16 kHz is commonly used in speech-to-text tasks.
    SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", 16000))
```
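Because the `CONFIG` attributes are evaluated once when the class body runs, environment variables must be set before the module is first imported. A minimal usage sketch (the `app/config.py` path is inferred from the `from app.config import CONFIG` imports below; the values shown are illustrative):

```python
import os

# Override defaults *before* the first import of app.config.
os.environ["ASR_ENGINE"] = "faster_whisper"
os.environ["ASR_MODEL"] = "small"

from app.config import CONFIG

print(CONFIG.ASR_ENGINE)          # -> "faster_whisper"
print(CONFIG.MODEL_NAME)          # -> "small"
print(CONFIG.DEVICE)              # -> "cuda" if a GPU is visible, else "cpu"
print(CONFIG.MODEL_QUANTIZATION)  # -> "float32" on GPU, "int8" on CPU by default
```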
`app/faster_whisper` core module:

```diff
@@ -1,5 +1,4 @@
 import gc
-import os
 import time
 from io import StringIO
 from threading import Lock, Thread
@@ -9,23 +8,20 @@ import torch
 import whisper
 from faster_whisper import WhisperModel
 
-from .utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
+from app.config import CONFIG
+from app.faster_whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
 
-model_name = os.getenv("ASR_MODEL", "base")
-model_path = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
 model = None
 model_lock = Lock()
 last_activity_time = time.time()
-idle_timeout = int(os.getenv("IDLE_TIMEOUT", 0))  # default to being disabled
 
 
 def monitor_idleness():
     global model
-    if idle_timeout <= 0: return
+    if CONFIG.MODEL_IDLE_TIMEOUT <= 0: return
     while True:
         time.sleep(15)
-        if time.time() - last_activity_time > idle_timeout:
+        if time.time() - last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT:
             with model_lock:
                 release_model()
                 break
@@ -34,17 +30,11 @@ def monitor_idleness():
 def load_model():
     global model, device, model_quantization
 
-    # More about available quantization levels is here:
-    #   https://opennmt.net/CTranslate2/quantization.html
-    if torch.cuda.is_available():
-        device = "cuda"
-        model_quantization = os.getenv("ASR_QUANTIZATION", "float32")
-    else:
-        device = "cpu"
-        model_quantization = os.getenv("ASR_QUANTIZATION", "int8")
-
     model = WhisperModel(
-        model_size_or_path=model_name, device=device, compute_type=model_quantization, download_root=model_path
+        model_size_or_path=CONFIG.MODEL_NAME,
+        device=CONFIG.DEVICE,
+        compute_type=CONFIG.MODEL_QUANTIZATION,
+        download_root=CONFIG.MODEL_PATH
    )
 
     Thread(target=monitor_idleness, daemon=True).start()
```
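Here and in the `openai_whisper` module below, `release_model()` lies outside the visible hunks; the shared pattern is a daemon thread that wakes every 15 seconds and drops the model reference once the configured idle period has elapsed. A self-contained sketch of that pattern (the placeholder model object and the `release_model` body are illustrative, not this commit's code):

```python
import gc
import time
from threading import Lock, Thread

model = object()              # placeholder for the loaded ASR model
model_lock = Lock()
last_activity_time = time.time()
IDLE_TIMEOUT = 300            # seconds; <= 0 disables monitoring entirely


def release_model():
    """Drop the model reference and let the garbage collector reclaim it."""
    global model
    model = None
    gc.collect()


def monitor_idleness():
    if IDLE_TIMEOUT <= 0:
        return
    while True:
        time.sleep(15)  # same poll interval as the service
        if time.time() - last_activity_time > IDLE_TIMEOUT:
            with model_lock:  # avoid unloading mid-transcription
                release_model()
            break


Thread(target=monitor_idleness, daemon=True).start()
```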
`openai_whisper` core module:

```diff
@@ -1,5 +1,4 @@
 import gc
-import os
 import time
 from io import StringIO
 from threading import Lock, Thread
@@ -9,21 +8,19 @@ import torch
 import whisper
 from whisper.utils import ResultWriter, WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT
 
-model_name = os.getenv("ASR_MODEL", "base")
-model_path = os.getenv("ASR_MODEL_PATH", os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
+from app.config import CONFIG
 
 model = None
 model_lock = Lock()
 last_activity_time = time.time()
-idle_timeout = int(os.getenv("IDLE_TIMEOUT", 0))  # default to being disabled
 
 
 def monitor_idleness():
     global model
-    if idle_timeout <= 0: return
+    if CONFIG.MODEL_IDLE_TIMEOUT <= 0: return
     while True:
         time.sleep(15)
-        if time.time() - last_activity_time > idle_timeout:
+        if time.time() - last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT:
             with model_lock:
                 release_model()
                 break
@@ -33,9 +30,15 @@ def load_model():
     global model
 
     if torch.cuda.is_available():
-        model = whisper.load_model(model_name, download_root=model_path).cuda()
+        model = whisper.load_model(
+            name=CONFIG.MODEL_NAME,
+            download_root=CONFIG.MODEL_PATH
+        ).cuda()
     else:
-        model = whisper.load_model(model_name, download_root=model_path)
+        model = whisper.load_model(
+            name=CONFIG.MODEL_NAME,
+            download_root=CONFIG.MODEL_PATH
+        )
 
     Thread(target=monitor_idleness, daemon=True).start()
```
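The `ASR_ENGINE` variable introduced in `config.py` selects between these two modules. A hypothetical dispatch sketch (the commit does not show the service's actual wiring, and the `app.openai_whisper` module path here is assumed):

```python
from app.config import CONFIG

# Import whichever engine implementation the environment selected.
if CONFIG.ASR_ENGINE == "faster_whisper":
    from app.faster_whisper import core as asr
else:  # "openai_whisper", the default
    from app.openai_whisper import core as asr

asr.load_model()  # both modules expose the same load_model() entry point
```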
Documentation (environment variable reference):

````diff
@@ -15,9 +15,11 @@
 export ASR_MODEL=base
 ```
 
-Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` (only OpenAI Whisper) and `large-v3-turbo` (only OpenAI Whisper).
+Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` (only
+OpenAI Whisper) and `large-v3-turbo` (only OpenAI Whisper).
 
-For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
+For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en`
+models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
 
 ### Configuring the `Model Path`
@@ -28,7 +30,18 @@ export ASR_MODEL_PATH=/data/whisper
 ### Configuring the `Model Unloading Timeout`
 
 ```sh
-export IDLE_TIMEOUT=300
+export MODEL_IDLE_TIMEOUT=300
 ```
 
-Defaults to 0. After no activity for this period (in seconds), unload the model until it is requested again. Setting `0` disables the timeout, keeping the model loaded indefinitely.
+Defaults to `0`. After no activity for this period (in seconds), unload the model until it is requested again. Setting
+`0` disables the timeout, keeping the model loaded indefinitely.
+
+### Configuring the `SAMPLE_RATE`
+
+```sh
+export SAMPLE_RATE=16000
+```
+
+Defaults to `16000`. Default sample rate for audio input. `16 kHz` is commonly used in `speech-to-text` tasks.
````
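Taken together, the variables centralized by this commit can be set in one place before starting the service (the values below are illustrative):

```sh
export ASR_ENGINE=faster_whisper   # or openai_whisper (the default)
export ASR_DEVICE=cuda             # or cpu
export ASR_MODEL=base
export ASR_MODEL_PATH=/data/whisper
export ASR_QUANTIZATION=int8       # float32 | float16 | int8
export MODEL_IDLE_TIMEOUT=300      # seconds; 0 keeps the model loaded
export SAMPLE_RATE=16000
```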