Spaces: ibm-granite/granite-speech

Running on Zero

App Files Community

change emoji

by gsaon - opened 28 days ago

base: refs/heads/main ← from: refs/pr/1

Discussion Files changed

+41

-58

Files changed (2) hide show

.pre-commit-config.yaml +0 -1
src/app.py +41 -57

.pre-commit-config.yaml CHANGED Viewed

@@ -49,4 +49,3 @@ repos:
       - id: poetry-export
         name: poetry export for base requirements
         args: [-f, requirements.txt, -o, requirements.txt, -n, --only=main, --without-hashes]
-        stages: [manual]

       - id: poetry-export
         name: poetry export for base requirements
         args: [-f, requirements.txt, -o, requirements.txt, -n, --only=main, --without-hashes]

src/app.py CHANGED Viewed

@@ -1,21 +1,21 @@
 """Template Demo for IBM Granite Hugging Face spaces."""
-import os
 from collections.abc import Iterator
 from datetime import datetime
 from pathlib import Path
 from threading import Thread
 import gradio as gr
-import langid
 import spaces
 import torch
 import torchaudio
 from punctuators.models import PunctCapSegModelONNX
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, TextIteratorStreamer
 pc_model = PunctCapSegModelONNX.from_pretrained("pcs_en")
 today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
 MODEL_ID = "ibm-granite/granite-speech-3.3-2b"
@@ -31,36 +31,18 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16, offload_folder="offload/"
 )
-def delete_file(path: str) -> None:
-    """Delete a file if it exists.
-    Args:
-        path (str): Path to the file to delete.
-    Returns:
-        None
-    """
-    if path and os.path.exists(path):
-        try:
-            os.remove(path)
-            print(f"Deleted old audio file: {path}")
-        except Exception as e:
-            print(f"Warning: could not delete {path}: {e}")
 @spaces.GPU
-def transcribe(audio_file: str, user_prompt: str, prev_file: str) -> Iterator[str]:
-    """Transcribe function for ASR demo.
     Args:
         audio_file (str): Name of audio file from the user.
         user_prompt (str): Instruction from the user (transcription or translation).
-        prev_file (str): Previously uploaded audio file.
     Returns:
         str: The generated transcription/translation of the audio file.
     """
     # load wav file
     wav, sr = torchaudio.load(audio_file, normalize=True)
     if wav.shape[0] != 1 or sr != 16000:
@@ -68,40 +50,42 @@ def transcribe(audio_file: str, user_prompt: str, prev_file: str) -> Iterator[st
         wav = torch.mean(wav, dim=0, keepdim=True)  # mono
         wav = torchaudio.functional.resample(wav, sr, 16000)
         sr = 16000
-    # SAFE POINT: new audio is good → delete old audio if different
-    if prev_file != "" and prev_file != audio_file:
-        delete_file(prev_file)
-    # Update prev_file to the *current* file
-    prev_file = audio_file
     # Build messages
     chat = [
-        {"role": "system", "content": SYS_PROMPT},
-        {"role": "user", "content": f"<|audio|>{user_prompt}"},
     ]
-    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     # run model
-    model_inputs = processor(prompt, wav, device=model.device, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-    kwargs = dict(**model_inputs, streamer=streamer, max_new_tokens=512, do_sample=False, num_beams=1)
     t = Thread(target=model.generate, kwargs=kwargs)
     t.start()
     text = ""
     for chunk in streamer:
         text += chunk
-        yield text, prev_file
     # Apply cap+punct for English-only
-    if langid.classify(text)[0] == "en":
         text = pc_model.infer([text])
-        text = " ".join(text[0]).replace("<unk>", " ").replace("<Unk>", " ")  # map <unk> to space
-        yield text, prev_file
 css_file_path = Path(Path(__file__).parent / "app.css")
 head_file_path = Path(Path(__file__).parent / "app_head.html")
@@ -109,25 +93,25 @@ head_file_path = Path(Path(__file__).parent / "app_head.html")
 with gr.Blocks(fill_height=True, css_paths=css_file_path, head_paths=head_file_path, title=TITLE) as demo:
     gr.Markdown(f"# {TITLE}")
     gr.Markdown(DESCRIPTION)
-    # State to store the previously uploaded audio file
-    prev_audio = gr.State(value="")
     with gr.Row():
-        audio_input = gr.Audio(type="filepath", label="Upload Audio (16kHz mono preferred)")
         with gr.Column():
             output_text = gr.Textbox(label="Transcription", lines=5)
-            choices = [
                 "Transcribe the speech to text",
-                "Translate the speech to French",
-                "Translate the speech to German",
                 "Translate the speech to Spanish",
-                "Translate the speech to Portuguese",
             ]
-            user_prompt = gr.Dropdown(
-                label="Prompt", choices=choices, interactive=True, allow_custom_value=True, value=choices[0]
-            )
-        audio_input.play(transcribe, inputs=[audio_input, user_prompt, prev_audio], outputs=[output_text, prev_audio])
 if __name__ == "__main__":
     demo.launch()

 """Template Demo for IBM Granite Hugging Face spaces."""
 from collections.abc import Iterator
 from datetime import datetime
 from pathlib import Path
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
 import torchaudio
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, TextIteratorStreamer
+import langid
 from punctuators.models import PunctCapSegModelONNX
 pc_model = PunctCapSegModelONNX.from_pretrained("pcs_en")
+from themes.research_monochrome import theme
 today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
 MODEL_ID = "ibm-granite/granite-speech-3.3-2b"
     MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16, offload_folder="offload/"
 )
 @spaces.GPU
+def transcribe(audio_file: str, user_prompt: str) -> Iterator[str]:
+    """transcribe function for ASR demo.
     Args:
         audio_file (str): Name of audio file from the user.
         user_prompt (str): Instruction from the user (transcription or translation).
     Returns:
         str: The generated transcription/translation of the audio file.
     """
     # load wav file
     wav, sr = torchaudio.load(audio_file, normalize=True)
     if wav.shape[0] != 1 or sr != 16000:
         wav = torch.mean(wav, dim=0, keepdim=True)  # mono
         wav = torchaudio.functional.resample(wav, sr, 16000)
         sr = 16000
     # Build messages
     chat = [
+        dict(role="system", content=SYS_PROMPT),
+        dict(role="user", content=f"<|audio|>{user_prompt}"),
     ]
+    prompt = tokenizer.apply_chat_template(
+        chat, tokenize=False, add_generation_prompt=True)
     # run model
+    model_inputs = processor(
+        prompt,
+        wav,
+        device=model.device,
+        return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+    kwargs = dict(
+        **model_inputs,
+        streamer=streamer,
+        max_new_tokens=512,
+        do_sample=False,
+        num_beams=1
+    )
     t = Thread(target=model.generate, kwargs=kwargs)
     t.start()
     text = ""
     for chunk in streamer:
         text += chunk
+        yield text
     # Apply cap+punct for English-only
+    if langid.classify(text)[0] == 'en':
         text = pc_model.infer([text])
+        yield " ".join(text[0])
 css_file_path = Path(Path(__file__).parent / "app.css")
 head_file_path = Path(Path(__file__).parent / "app_head.html")
 with gr.Blocks(fill_height=True, css_paths=css_file_path, head_paths=head_file_path, title=TITLE) as demo:
     gr.Markdown(f"# {TITLE}")
     gr.Markdown(DESCRIPTION)
     with gr.Row():
+        audio_input = gr.Audio(type="filepath",
+                               label="Upload Audio (16kHz mono preferred)")
         with gr.Column():
             output_text = gr.Textbox(label="Transcription", lines=5)
+            choices = [
                 "Transcribe the speech to text",
+                "Translate the speech to French",
+                "Translate the speech to German",
                 "Translate the speech to Spanish",
+                "Translate the speech to Portuguese"
             ]
+            user_prompt = gr.Dropdown(label="Prompt", choices=choices, interactive=True, allow_custom_value=True, value=choices[0])
+        audio_input.play(
+            transcribe,
+            inputs=[
+                audio_input,
+                user_prompt],
+            outputs=output_text)
 if __name__ == "__main__":
     demo.launch()