diff --git a/README.md b/README.md index c028ae5..443dd01 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@

+ ### Run your IDE as administrator you will get following error if administrator permission is not there: @@ -88,7 +89,7 @@ transcript will also indicate the timeframe in seconds where each speaker speaks ``` from speechlib import Transcriptor -file = "obama1.wav" # your audio file +file = "obama_zach.wav" # your audio file voices_folder = "voices" # voices folder containing voice samples for recognition language = "en" # language code log_folder = "logs" # log folder for storing transcripts diff --git a/examples/transcribe.py b/examples/transcribe.py index f6a0af0..5a371f2 100644 --- a/examples/transcribe.py +++ b/examples/transcribe.py @@ -1,6 +1,6 @@ from speechlib import Transcriptor -file = "obama1.wav" # your audio file +file = "obama_zach.wav" # your audio file voices_folder = "voices" # voices folder containing voice samples for recognition language = "en" # language code log_folder = "logs" # log folder for storing transcripts diff --git a/library.md b/library.md index f4a77ea..4e63095 100644 --- a/library.md +++ b/library.md @@ -72,7 +72,7 @@ transcript will also indicate the timeframe in seconds where each speaker speaks ``` from speechlib import Transcriptor -file = "obama1.wav" # your audio file +file = "obama_zach.wav" # your audio file voices_folder = "voices" # voices folder containing voice samples for recognition language = "en" # language code log_folder = "logs" # log folder for storing transcripts @@ -99,13 +99,20 @@ end: ending time of speech in seconds text: transcribed text for speech during start and end speaker: speaker of the text -#### voices_folder structure: - -![voices_folder_structure](voices_folder_structure1.png) - -#### Transcription: - -![transcription](transcript.png) +#### voices folder structure: +``` +voices_folder +|---> person1 +| |---> sample1.wav +| |---> sample2.wav +| ... +| +|---> person2 +| |---> sample1.wav +| |---> sample2.wav +| ... +|--> ... +``` supported language codes: diff --git a/requirements.txt b/requirements.txt index ef661ee..5893b85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -transformers -torch -torchaudio -pydub -pyannote.audio -speechbrain -accelerate -faster-whisper \ No newline at end of file +transformers==4.36.2 +torch==2.1.2 +torchaudio==2.1.2 +pydub==0.25.1 +pyannote.audio==3.1.1 +speechbrain==0.5.16 +accelerate==0.26.1 +faster-whisper==0.10.1 +openai-whisper==20231117 \ No newline at end of file diff --git a/setup.py b/setup.py index 637ebec..73ce7aa 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="speechlib", - version="1.1.1", + version="1.1.2", description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contain audio preprocessor functions.", packages=find_packages(), long_description=long_description, diff --git a/setup_instruction.md b/setup_instruction.md index fbf0bee..e45c1e3 100644 --- a/setup_instruction.md +++ b/setup_instruction.md @@ -9,7 +9,7 @@ for publishing: pip install twine for install locally for testing: - pip install dist/speechlib-1.1.0-py3-none-any.whl + pip install dist/speechlib-1.1.2-py3-none-any.whl finally run: twine upload dist/* diff --git a/speechlib/transcribe.py b/speechlib/transcribe.py index 0df235b..3798df8 100644 --- a/speechlib/transcribe.py +++ b/speechlib/transcribe.py @@ -32,9 +32,14 @@ def transcribe(file, language, model_size, whisper_type, quantization): Exception("Language code not supported.\nThese are the supported languages:\n", model.supported_languages) else: try: - model = whisper.load_model(model_size) - result = model.transcribe(file, language=language) - res = result["text"] + if torch.cuda.is_available(): + model = whisper.load_model(model_size, device="cuda") + result = model.transcribe(file, language=language, fp16=True) + res = result["text"] + else: + model = whisper.load_model(model_size, device="cpu") + result = model.transcribe(file, language=language, fp16=False) + res = result["text"] return res except Exception as err: