Skip to content
Snippets Groups Projects
Commit b7a2b4a0 authored by Forest Anderson's avatar Forest Anderson
Browse files

Add VRAM flush when idle

Implement automatic VRAM clearing after a specified period of idleness.

* Add a mechanism to track the last activity time and implement a background thread to monitor idleness and clear VRAM after five minutes of inactivity in `app/faster_whisper/core.py` and `app/openai_whisper/core.py`.
* Update the `transcribe` and `language_detection` functions in both core files to reset the last activity time upon invocation.
* Add a function to fully release the model from memory using `del`, `torch.cuda.empty_cache()`, and `gc.collect()` in both core files.
* Add configuration options for the idleness timeout period and enabled/disabled state in the environment variables in `app/webservice.py`.
parent 539f5852
Branches
No related tags found
No related merge requests found
import os import os
from io import StringIO from io import StringIO
from threading import Lock from threading import Lock, Thread
from typing import BinaryIO, Union from typing import BinaryIO, Union
import time
import gc
import torch import torch
import whisper import whisper
...@@ -26,7 +28,25 @@ model = WhisperModel( ...@@ -26,7 +28,25 @@ model = WhisperModel(
) )
model_lock = Lock() model_lock = Lock()
# Wall-clock timestamp of the most recent request; refreshed at the top of
# transcribe() / language_detection() and read by the idle-monitor thread.
last_activity_time = time.time()
# Seconds of inactivity tolerated before the model is released from memory.
# Overridable via the IDLE_TIMEOUT environment variable.
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 300)) # default to 5 minutes
def monitor_idleness():
    """Background watchdog that frees the model after a period of idleness.

    Polls once a minute; when no request has refreshed ``last_activity_time``
    for ``idle_timeout`` seconds, releases the model and exits.  Started as a
    daemon thread so it never blocks interpreter shutdown.
    """
    global model
    while True:
        time.sleep(60)  # check every minute
        with model_lock:
            # Re-check idleness *inside* the lock: a request may have arrived
            # (refreshing last_activity_time) between the sleep and the lock
            # acquisition; without this re-check the monitor could free a
            # model that just served a request (TOCTOU race in the original).
            if time.time() - last_activity_time > idle_timeout:
                release_model()
                break


Thread(target=monitor_idleness, daemon=True).start()
def release_model():
    """Fully release the global model from memory.

    Drops the global binding, forces a garbage-collection pass so that any
    remaining references to the model's tensors are actually reclaimed, and
    only then asks the CUDA caching allocator to return the freed blocks to
    the device.  NOTE(review): caller is expected to hold ``model_lock``
    (the idle monitor does); after this runs, the module-level ``model`` name
    is undefined until something reloads it.
    """
    global model
    del model
    # Collect *before* emptying the cache: cached GPU blocks can only be
    # returned once Python has dropped the tensor objects that own them.
    # (Original order — empty_cache() then gc.collect() — could leave blocks
    # held by uncollected references in the cache.)
    gc.collect()
    torch.cuda.empty_cache()
def transcribe( def transcribe(
audio, audio,
...@@ -37,6 +57,9 @@ def transcribe( ...@@ -37,6 +57,9 @@ def transcribe(
word_timestamps: Union[bool, None], word_timestamps: Union[bool, None],
output, output,
): ):
global last_activity_time
last_activity_time = time.time()
options_dict = {"task": task} options_dict = {"task": task}
if language: if language:
options_dict["language"] = language options_dict["language"] = language
...@@ -63,6 +86,9 @@ def transcribe( ...@@ -63,6 +86,9 @@ def transcribe(
def language_detection(audio): def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
# load audio and pad/trim it to fit 30 seconds # load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio) audio = whisper.pad_or_trim(audio)
......
import os import os
from io import StringIO from io import StringIO
from threading import Lock from threading import Lock, Thread
from typing import BinaryIO, Union from typing import BinaryIO, Union
import time
import gc
import torch import torch
import whisper import whisper
...@@ -15,7 +17,25 @@ if torch.cuda.is_available(): ...@@ -15,7 +17,25 @@ if torch.cuda.is_available():
else: else:
model = whisper.load_model(model_name, download_root=model_path) model = whisper.load_model(model_name, download_root=model_path)
model_lock = Lock() model_lock = Lock()
# Wall-clock timestamp of the most recent request; refreshed at the top of
# transcribe() / language_detection() and read by the idle-monitor thread.
last_activity_time = time.time()
# Seconds of inactivity tolerated before the model is released from memory.
# Overridable via the IDLE_TIMEOUT environment variable.
idle_timeout = int(os.getenv("IDLE_TIMEOUT", 300)) # default to 5 minutes
def monitor_idleness():
    """Background watchdog that frees the model after a period of idleness.

    Polls once a minute; when no request has refreshed ``last_activity_time``
    for ``idle_timeout`` seconds, releases the model and exits.  Started as a
    daemon thread so it never blocks interpreter shutdown.
    """
    global model
    while True:
        time.sleep(60)  # check every minute
        with model_lock:
            # Re-check idleness *inside* the lock: a request may have arrived
            # (refreshing last_activity_time) between the sleep and the lock
            # acquisition; without this re-check the monitor could free a
            # model that just served a request (TOCTOU race in the original).
            if time.time() - last_activity_time > idle_timeout:
                release_model()
                break


Thread(target=monitor_idleness, daemon=True).start()
def release_model():
    """Fully release the global model from memory.

    Drops the global binding, forces a garbage-collection pass so that any
    remaining references to the model's tensors are actually reclaimed, and
    only then asks the CUDA caching allocator to return the freed blocks to
    the device.  NOTE(review): caller is expected to hold ``model_lock``
    (the idle monitor does); after this runs, the module-level ``model`` name
    is undefined until something reloads it.
    """
    global model
    del model
    # Collect *before* emptying the cache: cached GPU blocks can only be
    # returned once Python has dropped the tensor objects that own them.
    # (Original order — empty_cache() then gc.collect() — could leave blocks
    # held by uncollected references in the cache.)
    gc.collect()
    torch.cuda.empty_cache()
def transcribe( def transcribe(
audio, audio,
...@@ -26,6 +46,9 @@ def transcribe( ...@@ -26,6 +46,9 @@ def transcribe(
word_timestamps: Union[bool, None], word_timestamps: Union[bool, None],
output, output,
): ):
global last_activity_time
last_activity_time = time.time()
options_dict = {"task": task} options_dict = {"task": task}
if language: if language:
options_dict["language"] = language options_dict["language"] = language
...@@ -44,6 +67,9 @@ def transcribe( ...@@ -44,6 +67,9 @@ def transcribe(
def language_detection(audio): def language_detection(audio):
global last_activity_time
last_activity_time = time.time()
# load audio and pad/trim it to fit 30 seconds # load audio and pad/trim it to fit 30 seconds
audio = whisper.pad_or_trim(audio) audio = whisper.pad_or_trim(audio)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment