Skip to content
Snippets Groups Projects
Unverified Commit ee2d0657 authored by Ahmet Öner's avatar Ahmet Öner Committed by GitHub
Browse files

Merge branch 'main' into patch-1

parents f14ac2a3 e5518bf1
Branches
No related tags found
No related merge requests found
......@@ -4,6 +4,23 @@ Changelog
Unreleased
----------
### Added
- Added detection confidence to language detection endpoint
- Set mel generation to adjust n_dims automatically to match the loaded model
[1.6.0] (2024-10-06)
--------------------
### Changed
- Upgraded
- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
- fastapi to v0.115.0
- uvicorn to v0.31.0
- tqdm to v4.66.5
- python-multipart to v0.0.12
[1.5.0] (2024-07-04)
--------------------
......@@ -202,6 +219,7 @@ Unreleased
- mp3 support by using FFmpeg instead of librosa in #8
- add language detection endpoint in #9
[1.6.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.6.0
[1.5.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.5.0
[1.4.1]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.1
[1.4.0]: https://github.com/ahmetoner/whisper-asr-webservice/releases/tag/v1.4.0
......
......@@ -63,4 +63,4 @@ RUN poetry install
EXPOSE 9000
ENTRYPOINT ["gunicorn", "--bind", "0.0.0.0:9000", "--workers", "1", "--timeout", "0", "app.webservice:app", "-k", "uvicorn.workers.UvicornWorker"]
ENTRYPOINT ["whisper-asr-webservice"]
......@@ -81,4 +81,4 @@ RUN $POETRY_VENV/bin/pip install torch==1.13.1+cu117 -f https://download.pytorch
EXPOSE 9000
CMD gunicorn --bind 0.0.0.0:9000 --workers 1 --timeout 0 app.webservice:app -k uvicorn.workers.UvicornWorker
CMD whisper-asr-webservice
......@@ -9,9 +9,9 @@ Whisper is a general-purpose speech recognition model. It is trained on a large
## Features
Current release (v1.5.0) supports following whisper models:
Current release (v1.6.0) supports following whisper models:
- [openai/whisper](https://github.com/openai/whisper)@[v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
- [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/1.0.3)
## Quick Usage
......
......@@ -70,8 +70,9 @@ def language_detection(audio):
with model_lock:
segments, info = model.transcribe(audio, beam_size=5)
detected_lang_code = info.language
detected_language_confidence = info.language_probability
return detected_lang_code
return detected_lang_code, detected_language_confidence
def write_result(result: dict, file: BinaryIO, output: Union[str, None]):
......
......@@ -48,14 +48,14 @@ def language_detection(audio):
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)
mel = whisper.log_mel_spectrogram(audio, model.dims.n_mels).to(model.device)
# detect the spoken language
with model_lock:
_, probs = model.detect_language(mel)
detected_lang_code = max(probs, key=probs.get)
return detected_lang_code
return detected_lang_code, probs[max(probs)]
def write_result(result: dict, file: BinaryIO, output: Union[str, None]):
......
import importlib.metadata
import os
from os import path
from typing import Annotated, BinaryIO, Union
from typing import Annotated, BinaryIO, Optional, Union
from urllib.parse import quote
import click
import ffmpeg
import numpy as np
import uvicorn
from fastapi import FastAPI, File, Query, UploadFile, applications
from fastapi.openapi.docs import get_swagger_ui_html
from fastapi.responses import RedirectResponse, StreamingResponse
......@@ -14,9 +16,9 @@ from whisper import tokenizer
ASR_ENGINE = os.getenv("ASR_ENGINE", "openai_whisper")
if ASR_ENGINE == "faster_whisper":
from .faster_whisper.core import language_detection, transcribe
from app.faster_whisper.core import language_detection, transcribe
else:
from .openai_whisper.core import language_detection, transcribe
from app.openai_whisper.core import language_detection, transcribe
SAMPLE_RATE = 16000
LANGUAGE_CODES = sorted(tokenizer.LANGUAGES.keys())
......@@ -87,8 +89,8 @@ async def detect_language(
audio_file: UploadFile = File(...), # noqa: B008
encode: bool = Query(default=True, description="Encode audio first through FFmpeg"),
):
detected_lang_code = language_detection(load_audio(audio_file.file, encode))
return {"detected_language": tokenizer.LANGUAGES[detected_lang_code], "language_code": detected_lang_code}
detected_lang_code, confidence = language_detection(load_audio(audio_file.file, encode))
return {"detected_language": tokenizer.LANGUAGES[detected_lang_code], "language_code": detected_lang_code, "confidence": confidence}
def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
......@@ -122,3 +124,28 @@ def load_audio(file: BinaryIO, encode=True, sr: int = SAMPLE_RATE):
out = file.read()
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
@click.command()
@click.option(
"-h",
"--host",
metavar="HOST",
default="0.0.0.0",
help="Host for the webservice (default: 0.0.0.0)",
)
@click.option(
"-p",
"--port",
metavar="PORT",
default=9000,
help="Port for the webservice (default: 9000)",
)
@click.version_option(version=projectMetadata["Version"])
def start(
host: str,
port: Optional[int] = None
):
uvicorn.run(app, host=host, port=port)
if __name__ == "__main__":
start()
\ No newline at end of file
......@@ -24,7 +24,7 @@ poetry install
Starting the Webservice:
```sh
poetry run gunicorn --bind 0.0.0.0:9000 --workers 1 --timeout 0 app.webservice:app -k uvicorn.workers.UvicornWorker
poetry run whisper-asr-webservice --host 0.0.0.0 --port 9000
```
### Build
......
......@@ -22,7 +22,7 @@ There are 2 endpoints available:
| Name | Values |
|-----------------|------------------------------------------------|
| audio_file | File |
| output | `txt` (default), `json`, `vtt`, `strt`, `tsv` |
| output | `text` (default), `json`, `vtt`, `srt`, `tsv` |
| task | `transcribe`, `translate` |
| language | `en` (default is auto recognition) |
| word_timestamps | false (default) |
......
......@@ -15,7 +15,7 @@
export ASR_MODEL=base
```
Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large` (only OpenAI Whisper), `large-v1`, `large-v2` and `large-v3`.
Available ASR_MODELs are `tiny`, `base`, `small`, `medium`, `large`, `large-v1`, `large-v2`, `large-v3`, `turbo` (only OpenAI Whisper) and `large-v3-turbo` (only OpenAI Whisper).
For English-only applications, the `.en` models tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
......
......@@ -2,9 +2,9 @@ Whisper is a general-purpose speech recognition model. It is trained on a large
## Features
Current release (v1.5.0) supports following whisper models:
Current release (v1.6.0) supports following whisper models:
- [openai/whisper](https://github.com/openai/whisper)@[v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
- [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.0.3](https://github.com/SYSTRAN/faster-whisper/releases/tag/1.0.3)
## Quick Usage
......
This diff is collapsed.
[tool.poetry]
name = "whisper-asr-webservice"
version = "1.6.0-dev"
version = "1.7.0-dev"
description = "Whisper ASR Webservice is a general-purpose speech recognition webservice."
homepage = "https://github.com/ahmetoner/whisper-asr-webservice/"
license = "https://github.com/ahmetoner/whisper-asr-webservice/blob/main/LICENCE"
......@@ -8,6 +8,9 @@ authors = ["Ahmet Öner", "Besim Alibegovic"]
readme = "README.md"
packages = [{ include = "app" }]
[tool.poetry.scripts]
whisper-asr-webservice = "app.webservice:start"
[[tool.poetry.source]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cpu"
......@@ -15,15 +18,14 @@ priority = "explicit"
[tool.poetry.dependencies]
python = "^3.10"
uvicorn = { extras = ["standard"], version = "^0.30.1" }
gunicorn = "^22.0.0"
tqdm = "^4.66.4"
python-multipart = "^0.0.9"
uvicorn = { extras = ["standard"], version = "^0.31.0" }
tqdm = "^4.66.5"
python-multipart = "^0.0.12"
ffmpeg-python = "^0.2.0"
fastapi = "^0.111.0"
fastapi = "^0.115.0"
llvmlite = "^0.43.0"
numba = "^0.60.0"
openai-whisper = "^20231117"
openai-whisper = "^20240930"
faster-whisper = "^1.0.3"
torch = [
{ markers = "sys_platform == 'darwin' and platform_machine == 'arm64'", url = "https://download.pytorch.org/whl/cpu/torch-1.13.1-cp310-none-macosx_11_0_arm64.whl" },
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment