Skip to content
Snippets Groups Projects
Commit b7a2b4a0 authored by Forest Anderson's avatar Forest Anderson
Browse files

Add VRAM flush when idle

Implement automatic VRAM clearing after a specified period of idleness.

* Add a mechanism to track the last activity time and implement a background thread to monitor idleness and clear VRAM after five minutes of inactivity in `app/faster_whisper/core.py` and `app/openai_whisper/core.py`.
* Update the `transcribe` and `language_detection` functions in both core files to reset the last activity time upon invocation.
* Add a function to fully release the model from memory using `del`, `torch.cuda.empty_cache()`, and `gc.collect()` in both core files.
* Add configuration options for the idleness timeout period and enabled/disabled state in the environment variables in `app/webservice.py`.
parent 539f5852
Branches
No related tags found
No related merge requests found
import os import os
from io import StringIO from io import StringIO
from threading import Lock from threading import Lock, Thread
from typing import BinaryIO, Union from typing import BinaryIO, Union
import time
import gc
import torch import torch
import whisper import whisper
...@@ -26,7 +28,25 @@ model = WhisperModel( ...@@ -26,7 +28,25 @@ model = WhisperModel(
) )
model_lock = Lock() model_lock = Lock()
# Wall-clock timestamp of the most recent request; refreshed at the top of
# transcribe() / language_detection() and read by the idle-monitor thread.
last_activity_time = time.time()
# Seconds of inactivity tolerated before the model is released from memory.
# Overridable via the IDLE_TIMEOUT environment variable.
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 300)) # default to 5 minutes
def monitor_idleness():
    """Background watchdog that frees the model after a period of idleness.

    Polls once a minute; when no request has refreshed ``last_activity_time``
    for ``idle_timeout`` seconds, releases the model and exits.  Started as a
    daemon thread so it never blocks interpreter shutdown.
    """
    global model
    while True:
        time.sleep(60)  # check every minute
        with model_lock:
            # Re-check idleness *inside* the lock: a request may have arrived
            # (refreshing last_activity_time) between the sleep and the lock
            # acquisition; without this re-check the monitor could free a
            # model that just served a request (TOCTOU race in the original).
            if time.time() - last_activity_time > idle_timeout:
                release_model()
                break


Thread(target=monitor_idleness, daemon=True).start()
def release_model():
    """Fully release the global model from memory.

    Drops the global binding, forces a garbage-collection pass so that any
    remaining references to the model's tensors are actually reclaimed, and
    only then asks the CUDA caching allocator to return the freed blocks to
    the device.  NOTE(review): caller is expected to hold ``model_lock``
    (the idle monitor does); after this runs, the module-level ``model`` name
    is undefined until something reloads it.
    """
    global model
    del model
    # Collect *before* emptying the cache: cached GPU blocks can only be
    # returned once Python has dropped the tensor objects that own them.
    # (Original order — empty_cache() then gc.collect() — could leave blocks
    # held by uncollected references in the cache.)
    gc.collect()
    torch.cuda.empty_cache()
def transcribe( def transcribe(
audio, audio,
...@@ -37,6 +57,9 @@ def transcribe( ...@@ -37,6 +57,9 @@ def transcribe(
word_timestamps: Union[bool, None], word_timestamps: Union[bool, None],
output, output,
): ):
global last_activity_time
last_activity_time = time.time()
options_dict = {"task": task} options_dict = {"task": task}
if language: if language:
options_dict["language"] = language options_dict["language"] = language
...@@ -63,6 +86,9 @@ def transcribe( ...@@ -63,6 +86,9 @@ def transcribe(
def language_detection(audio): def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
# load audio and pad/trim it to fit 30 seconds # load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio) audio = whisper.pad_or_trim(audio)
......
import os import os
from io import StringIO from io import StringIO
from threading import Lock from threading import Lock, Thread
from typing import BinaryIO, Union from typing import BinaryIO, Union
import time
import gc
import torch import torch
import whisper import whisper
...@@ -15,7 +17,25 @@ if torch.cuda.is_available(): ...@@ -15,7 +17,25 @@ if torch.cuda.is_available():
else: else:
model = whisper.load_model(model_name, download_root=model_path) model = whisper.load_model(model_name, download_root=model_path)
model_lock = Lock() model_lock = Lock()
# Wall-clock timestamp of the most recent request; refreshed at the top of
# transcribe() / language_detection() and read by the idle-monitor thread.
last_activity_time = time.time()
# Seconds of inactivity tolerated before the model is released from memory.
# Overridable via the IDLE_TIMEOUT environment variable.
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 300)) # default to 5 minutes
def monitor_idleness():
    """Background watchdog that frees the model after a period of idleness.

    Polls once a minute; when no request has refreshed ``last_activity_time``
    for ``idle_timeout`` seconds, releases the model and exits.  Started as a
    daemon thread so it never blocks interpreter shutdown.
    """
    global model
    while True:
        time.sleep(60)  # check every minute
        with model_lock:
            # Re-check idleness *inside* the lock: a request may have arrived
            # (refreshing last_activity_time) between the sleep and the lock
            # acquisition; without this re-check the monitor could free a
            # model that just served a request (TOCTOU race in the original).
            if time.time() - last_activity_time > idle_timeout:
                release_model()
                break


Thread(target=monitor_idleness, daemon=True).start()
def release_model():
    """Fully release the global model from memory.

    Drops the global binding, forces a garbage-collection pass so that any
    remaining references to the model's tensors are actually reclaimed, and
    only then asks the CUDA caching allocator to return the freed blocks to
    the device.  NOTE(review): caller is expected to hold ``model_lock``
    (the idle monitor does); after this runs, the module-level ``model`` name
    is undefined until something reloads it.
    """
    global model
    del model
    # Collect *before* emptying the cache: cached GPU blocks can only be
    # returned once Python has dropped the tensor objects that own them.
    # (Original order — empty_cache() then gc.collect() — could leave blocks
    # held by uncollected references in the cache.)
    gc.collect()
    torch.cuda.empty_cache()
def transcribe( def transcribe(
audio, audio,
...@@ -26,6 +46,9 @@ def transcribe( ...@@ -26,6 +46,9 @@ def transcribe(
word_timestamps: Union[bool, None], word_timestamps: Union[bool, None],
output, output,
): ):
global last_activity_time
last_activity_time = time.time()
options_dict = {"task": task} options_dict = {"task": task}
if language: if language:
options_dict["language"] = language options_dict["language"] = language
...@@ -44,6 +67,9 @@ def transcribe( ...@@ -44,6 +67,9 @@ def transcribe(
def language_detection(audio): def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
# load audio and pad/trim it to fit 30 seconds # load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio) audio = whisper.pad_or_trim(audio)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment