Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Jan 30

Commit

2f15cbe

verified ·

1 Parent(s): 8b471a0

Update app.py

Browse files

Files changed (1) hide show

app.py +181 -104

app.py CHANGED Viewed

@@ -7,139 +7,216 @@ import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-device_id = 0 if torch.cuda.is_available() else -1
-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)
-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
-)
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
-    try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-        results = captioning_pipeline(temp_image_path)
-        if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
-        caption = results[0].get("generated_text", "").strip()
-        if not caption:
-            return "No caption was generated.", True
-        return caption, False
     except Exception as e:
-        return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
-            prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-            return temp_wav.name
     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
         return None
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-    }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-    <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-    </p>
         """)
-    gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-    **💡 How it works:**
-    1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
-    Enjoy the journey from visual to auditory sensation with just a few clicks!
-    """)
-    image_upload = gr.File(label="Upload Image", type="binary")
-    generate_description_button = gr.Button("Generate Description")
-    caption_display = gr.Textbox(label="Image Description", interactive=False)
-    generate_sound_button = gr.Button("Generate Sound Effect")
-    audio_output = gr.Audio(label="Generated Sound Effect")
-    gr.Markdown("""
-    ## 👥 How You Can Contribute
-    We welcome contributions and suggestions for improvements. Your feedback is invaluable
-    to the continuous enhancement of this application.
-    For support, questions, or to contribute, please contact us at
-    [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
-    Support our work and get involved by donating through
-    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-    """)
-    gr.Markdown("""
-    ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
-    """)
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-    generate_description_button.click(
-        fn=update_caption,
         inputs=image_upload,
-        outputs=caption_display
     )
-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
-        outputs=audio_output
     )
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()
-demo.launch(debug=True, share=True)

 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from pydub import AudioSegment
+import numpy as np
+# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Device configuration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if device == "cuda" else torch.float32
+# Initialize models with automatic device detection
 @spaces.GPU(duration=120)
+def load_models():
+    """Create the two module-level model pipelines used by the app.
+
+    Binds these globals:
+      * ``captioning_pipeline`` - a transformers ``image-to-text`` pipeline
+        for ``nlpconnect/vit-gpt2-image-captioning``, placed on device 0
+        when CUDA is available and on -1 otherwise.
+      * ``pipe`` - a ``DiffusionPipeline`` for ``cvssp/audioldm2`` loaded
+        with ``hf_token`` and the module-level ``torch_dtype``, then moved
+        to the module-level ``device``.
+
+    NOTE(review): this loader is wrapped in ``@spaces.GPU`` and invoked
+    once at import time - confirm that is the intended way to initialise
+    models on this Space.
+    """
+    global captioning_pipeline, pipe
+    captioning_pipeline = pipeline(
+        "image-to-text",
+        model="nlpconnect/vit-gpt2-image-captioning",
+        device=0 if torch.cuda.is_available() else -1
+    )
+    pipe = DiffusionPipeline.from_pretrained(
+        "cvssp/audioldm2",
+        use_auth_token=hf_token,
+        torch_dtype=torch_dtype
+    ).to(device)
+load_models()
+@spaces.GPU(duration=60)
+def analyze_image(image_file):
+    """Return a text caption for an image via ``captioning_pipeline``.
+
+    Args:
+        image_file: Passed straight to ``captioning_pipeline``; the UI
+            supplies the upload as a file path.
+
+    Returns:
+        str: The stripped ``generated_text`` of the first result (an empty
+        string if that key is missing). If the pipeline returns nothing or
+        a non-list, the literal ``"Could not generate caption"``. If the
+        pipeline raises an ``Exception``, ``"Error: <message>"``.
+
+    Failures are reported through the returned text rather than by
+    raising, so callers have to inspect the string to detect them.
+    """
+    try:
+        results = captioning_pipeline(image_file)
+        if results and isinstance(results, list):
+            return results[0].get("generated_text", "").strip()
+        return "Could not generate caption"
     except Exception as e:
+        return f"Error: {str(e)}"
 @spaces.GPU(duration=120)
+def generate_audio(prompt):
+    """Run the audio diffusion pipeline ``pipe`` on one text prompt.
+
+    Args:
+        prompt: Text description of the sound to synthesise.
+
+    Returns:
+        The first entry of the pipeline output's ``.audios``, or ``None``
+        if the pipeline raised an ``Exception`` (the error is printed).
+        Presumably a 1-D sample array at 16 kHz, since ``blend_audios``
+        writes its input at 16000 Hz - TODO confirm.
+    """
     try:
+        return pipe(
+            prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
+        ).audios[0]
+    except Exception as e:
+        print(f"Audio generation error: {str(e)}")
+        return None
+def blend_audios(audio_list):
+    """Mix multiple audio arrays into one"""
+    try:
+        valid_audios = [arr for arr in audio_list if arr is not None]
+        if not valid_audios:
+            return None
+        max_length = max(arr.shape[0] for arr in valid_audios)
+        mixed = np.zeros(max_length)
+        for arr in valid_audios:
+            if arr.shape[0] < max_length:
+                padded = np.pad(arr, (0, max_length - arr.shape[0]))
+            else:
+                padded = arr[:max_length]
+            mixed += padded
+        mixed = mixed / np.max(np.abs(mixed))
+        _, tmp_path = tempfile.mkstemp(suffix=".wav")
+        write(tmp_path, 16000, mixed)
+        return tmp_path
     except Exception as e:
+        print(f"Blending error: {str(e)}")
         return None
 css = """
+#col-container { max-width: 800px; margin: 0 auto; }
+.toggle-row { margin: 1rem 0; }
+.prompt-box { margin-bottom: 0.5rem; }
+.danger { color: #ff4444; font-weight: bold; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
+        # Header Section
         gr.HTML("""
+        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
+        <p style="text-align: center;">
+            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+        </p>
         """)
+        # Input Mode Toggle
+        input_mode = gr.Radio(
+            choices=["Image Input", "Text Input"],
+            value="Image Input",
+            label="Select Input Mode",
+            elem_classes="toggle-row"
+        )
+        # Image Input Section
+        with gr.Column(visible=True) as image_col:
+            image_upload = gr.Image(type="filepath", label="Upload Image")
+            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
+            caption_display = gr.Textbox(label="Generated Description", interactive=False)
+        # Text Input Section
+        with gr.Column(visible=False) as text_col:
+            with gr.Row():
+                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
+                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
+            additional_prompts = gr.Column()
+            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
+            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
+        # Generation Controls
+        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
+        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
+        # Documentation Section
+        gr.Markdown("""
+        ## 👥 How You Can Contribute
+        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
+        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
+        """)
+        # Visitor Badge
+        gr.HTML("""
+        <div style="text-align: center;">
+            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
+                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
+            </a>
+        </div>
+        """)
+    # Input Mode Toggle Handler
+    input_mode.change(
+        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
+        inputs=input_mode,
+        outputs=[image_col, text_col],
+        concurrency_limit=1
+    )
+    # Image Description Generation
+    generate_desc_btn.click(
+        analyze_image,
         inputs=image_upload,
+        outputs=caption_display,
+        concurrency_limit=2
+    )
+    # Dynamic Prompt Addition
+    def add_prompt(current_count):
+        """Click handler for the "Add Another Prompt" button.
+
+        Args:
+            current_count: Number of prompt boxes so far (held in
+                ``prompt_count`` state, which starts at 2).
+
+        Returns:
+            tuple: ``(count, component_update)``. At 5 or more the count is
+            returned unchanged with a no-op ``gr.update()``; otherwise the
+            count is incremented and a new ``gr.Textbox`` labelled with the
+            new count is returned.
+
+        NOTE(review): the click wiring sends the second value to the
+        ``additional_prompts`` Column, and the generate button only lists
+        ``prompt1`` and ``prompt2`` as inputs, so a textbox created here
+        does not appear to reach ``process_inputs`` - confirm and rework.
+        """
+        if current_count >= 5:
+            return current_count, gr.update()
+        new_count = current_count + 1
+        new_prompt = gr.Textbox(
+            label=f"Sound Prompt {new_count}",
+            lines=2,
+            visible=True,
+            placeholder="Enter sound description..."
+        )
+        return new_count, new_prompt
+    prompt_count = gr.State(2)
+    add_prompt_btn.click(
+        add_prompt,
+        inputs=prompt_count,
+        outputs=[prompt_count, additional_prompts],
+        concurrency_limit=1
     )
+    # Sound Generation Handler
+    def process_inputs(mode, image_file, caption, *prompts):
+        try:
+            if mode == "Image Input":
+                if not image_file:
+                    raise gr.Error("Please upload an image")
+                caption = analyze_image(image_file)
+                prompts = [caption]
+            else:
+                prompts = [p.strip() for p in prompts if p.strip()]
+                if not prompts:
+                    raise gr.Error("Please enter at least one valid prompt")
+            # Generate individual audio tracks
+            audio_tracks = []
+            for prompt in prompts:
+                if not prompt:
+                    continue
+                audio = generate_audio(prompt)
+                if audio is not None:
+                    audio_tracks.append(audio)
+            # Blend audio tracks
+            if not audio_tracks:
+                return None
+            return blend_audios(audio_tracks)
+        except Exception as e:
+            raise gr.Error(f"Processing error: {str(e)}")
+    generate_sound_btn.click(
+        process_inputs,
+        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2],
+        outputs=audio_output,
+        concurrency_limit=2
     )
+if __name__ == "__main__":
+    demo.launch(max_threads=4)