Spaces:
Sleeping
Sleeping
File size: 3,697 Bytes
5bdae9e 255ebde 5bdae9e a29b87f 255ebde 5e0460c 255ebde 5e0460c 255ebde 5bdae9e 5e0460c 5bdae9e 5e0460c 5bdae9e 255ebde 5e0460c 5bdae9e 255ebde 5e0460c 5bdae9e 5e0460c 5bdae9e 5e0460c 255ebde 5bdae9e 5e0460c 5bdae9e 5e0460c 0d6de03 5bdae9e 5e0460c 5bdae9e 5e0460c 5bdae9e 5e0460c 5bdae9e 0d6de03 5bdae9e 5e0460c 0d6de03 5bdae9e 255ebde 5bdae9e 0d6de03 5e0460c 5bdae9e 255ebde 5bdae9e 255ebde 5e0460c 5bdae9e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# web_summarizer_app.py
import os
import gradio as gr
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Import the Google Generative AI library
import google.generativeai as genai

# Load environment variables from a local .env file (expects GEMINI_API_KEY,
# optionally GEMINI_MODEL).
load_dotenv()

# Get the Gemini API key
gemini_api_key = os.getenv("GEMINI_API_KEY")

# Configure the generative AI model.
# Fail fast at import time if the key is missing, rather than at first request.
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY environment variable not set.")
genai.configure(api_key=gemini_api_key)

# Model name is overridable via the GEMINI_MODEL env var; defaults to
# 'gemini-1.5-flash-latest' as requested.
model_name = os.getenv("GEMINI_MODEL", "gemini-1.5-flash-latest")
model = genai.GenerativeModel(model_name)
# Web scraper
def scrape_text_from_url(url, max_chars=5000):
    """
    Scrape visible text content from a given URL.

    Args:
        url: The page to fetch.
        max_chars: Maximum number of characters to return (default 5000);
            truncation keeps the downstream LLM prompt within token limits.

    Returns:
        The extracted text (truncated to ``max_chars``), or a string starting
        with the "β" marker describing the failure. Callers detect failure
        by checking for "β" in the result, so that marker must be preserved.
    """
    try:
        response = requests.get(url, timeout=10)
        # Raise an exception for bad status codes (4xx or 5xx) so they are
        # reported as fetch errors instead of being scraped as error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Remove non-visible content before extracting text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        # stripped_strings already yields whitespace-stripped fragments,
        # so joining them directly gives a whitespace-normalized string.
        text = " ".join(soup.stripped_strings)
        return text[:max_chars]  # limit to avoid token overflow
    except requests.exceptions.RequestException as e:
        return f"β Error fetching the page: {str(e)}"
    except Exception as e:
        return f"β An unexpected error occurred during scraping: {str(e)}"
# LLM summarizer (using Gemini)
def summarize_with_gemini(text):
    """
    Summarize *text* with the module-level Gemini model.

    Returns the model's summary on success; otherwise an explanatory
    message when the input is empty, carries the "β" scraping-error
    marker, or the API call fails.
    """
    # Guard: nothing to do for empty input or upstream scraping failures
    # (scrape errors are flagged with the "β" marker).
    if not text or "β" in text:
        return "Cannot summarize due to scraping error or empty text."
    prompt = f"Please summarize the following content:\n\n{text}"
    try:
        # generate_content is the Gemini text-generation entry point;
        # .text access can also raise (e.g. blocked responses), so it
        # stays inside the try.
        reply = model.generate_content(prompt)
        return reply.text.strip()
    except Exception as err:
        return f"β Error from Gemini API: {str(err)}"
# Combined pipeline
def scrape_and_summarize(url):
    """
    Run the scrape + summarize pipeline for a URL.

    Returns a ``(raw_text, summary)`` pair. When scraping fails (the "β"
    marker is present), the summary slot carries a skip notice and the
    LLM is never called.
    """
    scraped = scrape_text_from_url(url)
    if "β" not in scraped:
        return scraped, summarize_with_gemini(scraped)
    # Scraping failed — surface the error text and skip the API call.
    return scraped, "Summarization skipped due to scraping error."
# Gradio UI: two-row layout — input controls on top, side-by-side outputs below.
with gr.Blocks(title="π Web Summarizer with AI") as demo:
    gr.Markdown("## π§ π Web Article Summarizer")
    gr.Markdown("Enter a webpage URL below. The AI will scrape and summarize the content using Gemini 1.5 Flash.")
    # Input row: URL textbox (wider, scale=4) plus the trigger button.
    with gr.Row():
        url_input = gr.Textbox(label="π Enter URL", placeholder="https://example.com", scale=4)
        btn = gr.Button("Summarize", variant="primary")
    # Output row: raw scraped text on the left, AI summary on the right;
    # both read-only (interactive=False).
    with gr.Row():
        with gr.Column(scale=1):
            raw_output = gr.Textbox(label="π Raw Scraped Text", lines=15, interactive=False)
        with gr.Column(scale=1):
            summary_output = gr.Textbox(label="π AI Summary", lines=15, interactive=False)
    # Link the button click event to the combined function; it returns a
    # (raw_text, summary) pair matching the two outputs.
    btn.click(scrape_and_summarize, inputs=[url_input], outputs=[raw_output, summary_output])
# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    # You can set share=True to create a public link (be cautious with API keys)
    demo.launch()
|