# webscraping / app.py
# bala00712200502's picture
# Update app.py
# 5e0460c verified
# web_summarizer_app.py
import os
import gradio as gr
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Import the Google Generative AI library
import google.generativeai as genai
# Load environment variables (e.g. from a local .env file).
load_dotenv()

# Get the Gemini API key.
# A blank or whitespace-only value is treated the same as a missing one, so
# the app stops here at startup instead of configuring the client with an
# unusable key.
gemini_api_key = (os.getenv("GEMINI_API_KEY") or "").strip()
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY environment variable not set.")

# Configure the generative AI client with the key.
genai.configure(api_key=gemini_api_key)

# Model to use; override it with the GEMINI_MODEL environment variable.
# `or` is used instead of a getenv() default because getenv() returns "" (not
# the default) when the variable is set but empty.
model_name = (os.getenv("GEMINI_MODEL") or "").strip() or "gemini-1.5-flash-latest"
model = genai.GenerativeModel(model_name)
# 🌐 Web Scraper
def scrape_text_from_url(url, max_chars=5000):
    """
    Scrapes the text content of the web page at the given URL.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        max_chars: Maximum number of characters to return. Defaults to 5000
            to keep the text handed to the summarizer small.

    Returns:
        The page text with <script> and <style> content removed, joined with
        single spaces and truncated to max_chars. On failure nothing is
        raised; a message starting with "❌" is returned instead.
    """
    # Reject missing or blank input up front so the user gets a clear message
    # rather than a low-level "invalid URL" error from the HTTP library.
    if not isinstance(url, str) or not url.strip():
        return "❌ Error fetching the page: no URL provided."
    url = url.strip()

    try:
        response = requests.get(url, timeout=10)
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Remove script and style tags; their contents are not page text
        for tag in soup(["script", "style"]):
            tag.decompose()
        # stripped_strings already yields whitespace-trimmed, non-empty chunks
        text = " ".join(soup.stripped_strings)
        return text[:max_chars]  # limit to avoid token overflow
    except requests.exceptions.RequestException as e:
        return f"❌ Error fetching the page: {e}"
    except Exception as e:
        # Deliberate catch-all: the UI expects a message, never an exception
        return f"❌ An unexpected error occurred during scraping: {e}"
# Error messages produced by the scraping/summarizing helpers all BEGIN with
# this marker. Checking only the start of the text (instead of searching the
# whole text) keeps ordinary page content that merely contains the character
# from being mistaken for a failure.
_ERROR_MARKER = "❌"


def _is_error_message(text):
    """
    Returns True when text is one of the "❌ ..." error messages.
    """
    return isinstance(text, str) and text.startswith(_ERROR_MARKER)


# 🧠 LLM Summarizer (using Gemini)
def summarize_with_gemini(text):
    """
    Summarizes the provided text using the configured Gemini model.

    Args:
        text: The content to summarize.

    Returns:
        The model's summary with surrounding whitespace removed. If text is
        empty, whitespace-only, or an error message (starts with "❌"), a fixed
        explanatory string is returned without calling the model. If the model
        call fails, a message starting with "❌" is returned instead of raising.
    """
    if not text or not text.strip() or _is_error_message(text):
        return "Cannot summarize due to scraping error or empty text."
    try:
        # Use the generate_content method for Gemini
        response = model.generate_content(f"Please summarize the following content:\n\n{text}")
        # Access the text content from the response
        return response.text.strip()
    except Exception as e:
        # Deliberate catch-all: the UI expects a message, never an exception
        return f"❌ Error from Gemini API: {e}"


# 🔁 Combined Function
def scrape_and_summarize(url):
    """
    Scrapes the page at url and summarizes its text.

    Args:
        url: Address of the page to scrape.

    Returns:
        A (raw_text, summary) tuple. When scraping failed, raw_text is the
        scraper's error message and summary is a fixed notice saying that
        summarization was skipped.
    """
    raw_text = scrape_text_from_url(url)
    # Skip the model call only for a real scraping failure; a page that just
    # happens to contain the marker character somewhere is still summarized.
    if _is_error_message(raw_text):
        return raw_text, "Summarization skipped due to scraping error."
    summary = summarize_with_gemini(raw_text)
    return raw_text, summary
# 🎨 Gradio UI
# Build the UI: a URL box with a button on top, and two read-only panes below
# showing the scraped text and the generated summary side by side.
with gr.Blocks(title="🔎 Web Summarizer with AI") as demo:
    gr.Markdown("## 🧠🌐 Web Article Summarizer")
    # Show the configured model name rather than a hard-coded one, since the
    # model can be overridden through the GEMINI_MODEL environment variable.
    gr.Markdown(
        "Enter a webpage URL below. The AI will scrape and summarize the content "
        f"using the Gemini model `{model_name}`."
    )
    with gr.Row():
        url_input = gr.Textbox(label="🔗 Enter URL", placeholder="https://example.com", scale=4)
        btn = gr.Button("Summarize", variant="primary")
    with gr.Row():
        with gr.Column(scale=1):
            raw_output = gr.Textbox(label="📝 Raw Scraped Text", lines=15, interactive=False)
        with gr.Column(scale=1):
            summary_output = gr.Textbox(label="📄 AI Summary", lines=15, interactive=False)
    # Link the button click event to the combined function; its two return
    # values fill the raw-text and summary boxes, in that order.
    btn.click(scrape_and_summarize, inputs=[url_input], outputs=[raw_output, summary_output])

# 🚀 Launch app
if __name__ == "__main__":
    # You can set share=True to create a public link (be cautious with API keys)
    demo.launch()