Spaces:

bala00712200502
/

webscraping

Sleeping

App Files Files Community

webscraping / app.py

bala00712200502

Update app.py

5e0460c verified 8 months ago

raw

history blame contribute delete

3.7 kB

	# web_summarizer_app.py

	import os
	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from dotenv import load_dotenv
	# Import the Google Generative AI library
	import google.generativeai as genai

	# Populate the process environment (presumably from a local .env file,
	# per the original setup notes) before any settings are read.
	load_dotenv()

	# The Gemini client cannot be configured without a key, so fail fast at
	# import time when GEMINI_API_KEY is missing or empty.
	gemini_api_key = os.environ.get("GEMINI_API_KEY")
	if gemini_api_key is None or gemini_api_key == "":
	    raise ValueError("GEMINI_API_KEY environment variable not set.")

	genai.configure(api_key=gemini_api_key)

	# The model can be overridden through the GEMINI_MODEL environment
	# variable; when it is unset the app uses 'gemini-1.5-flash-latest'.
	model_name = os.environ.get("GEMINI_MODEL", "gemini-1.5-flash-latest")
	model = genai.GenerativeModel(model_name)


	# 🌐 Web Scraper
	def scrape_text_from_url(url, max_chars=5000, timeout=10):
	    """
	    Fetch a web page and return its text content as a single string.

	    The page is downloaded with ``requests``, parsed with BeautifulSoup's
	    ``html.parser``, stripped of ``<script>`` and ``<style>`` elements, and
	    the remaining text fragments are joined with single spaces.

	    Args:
	        url: Address of the page to fetch. Surrounding whitespace is
	            ignored, so a URL pasted with a trailing newline still works.
	        max_chars: Maximum number of characters to return (default 5000),
	            which keeps the downstream LLM prompt small. ``None`` disables
	            the cap.
	        timeout: Timeout in seconds passed to ``requests.get``
	            (default 10).

	    Returns:
	        The extracted text (possibly empty), truncated to ``max_chars``.
	        This function never raises: on failure it returns a message that
	        starts with "❌", which callers use to detect the error.
	    """
	    # Reject missing or blank input up front instead of letting the HTTP
	    # layer fail on it; the message keeps the usual fetch-error prefix.
	    cleaned_url = url.strip() if isinstance(url, str) else ""
	    if not cleaned_url:
	        return "❌ Error fetching the page: no URL was provided."

	    try:
	        response = requests.get(cleaned_url, timeout=timeout)
	        # Turn 4xx/5xx responses into an exception handled below.
	        response.raise_for_status()
	        soup = BeautifulSoup(response.content, "html.parser")

	        # Script and style bodies are not readable content; drop them.
	        for tag in soup(["script", "style"]):
	            tag.decompose()

	        # stripped_strings already yields whitespace-trimmed, non-empty
	        # fragments, so they can be joined directly.
	        text = " ".join(soup.stripped_strings)
	        return text[:max_chars]
	    except requests.exceptions.RequestException as e:
	        return f"❌ Error fetching the page: {e}"
	    except Exception as e:
	        # UI boundary: report parsing or other failures as text rather
	        # than crashing the Gradio callback.
	        return f"❌ An unexpected error occurred during scraping: {e}"

	# 🧠 LLM Summarizer (using Gemini)
	def summarize_with_gemini(text):
	    """
	    Summarize ``text`` with the module-level Gemini ``model``.

	    Args:
	        text: Content to summarize, normally the scraper's output.

	    Returns:
	        The model's summary with surrounding whitespace removed.
	        If ``text`` is missing, blank, not a string, or is an error message
	        (begins with "❌"), a fixed "Cannot summarize ..." notice is
	        returned and the model is not called.
	        If the model call fails, a message starting with
	        "❌ Error from Gemini API:" is returned instead of raising.
	    """
	    # Error messages are identified by their leading "❌". Matching the
	    # prefix rather than searching the whole text means a page that merely
	    # contains that emoji is still summarized.
	    if not isinstance(text, str) or not text.strip() or text.startswith("❌"):
	        return "Cannot summarize due to scraping error or empty text."

	    try:
	        response = model.generate_content(
	            f"Please summarize the following content:\n\n{text}"
	        )
	        return response.text.strip()
	    except Exception as e:
	        # UI boundary: surface API failures as text in the output box.
	        return f"❌ Error from Gemini API: {e}"

	# 🔁 Combined Function
	def scrape_and_summarize(url):
	    """
	    Scrape ``url`` and summarize the scraped text.

	    Args:
	        url: Address of the page to process.

	    Returns:
	        A ``(raw_text, summary)`` tuple. When scraping fails, ``raw_text``
	        is the scraper's error message and ``summary`` is a fixed notice
	        that summarization was skipped.
	    """
	    raw_text = scrape_text_from_url(url)

	    # Scraper failures are reported as strings that begin with "❌".
	    # Checking the prefix (not a substring) avoids mistaking a page that
	    # merely contains the emoji for a failed scrape.
	    if raw_text.startswith("❌"):
	        return raw_text, "Summarization skipped due to scraping error."

	    return raw_text, summarize_with_gemini(raw_text)

	# 🎨 Gradio UI
	# Layout: heading and instructions, a row holding the URL box and the
	# submit button, then two equal-width columns for the results.
	with gr.Blocks(title="🔎 Web Summarizer with AI") as demo:
	    gr.Markdown("## 🧠🌐 Web Article Summarizer")
	    gr.Markdown(
	        "Enter a webpage URL below. The AI will scrape and summarize the content using Gemini 1.5 Flash."
	    )

	    with gr.Row():
	        url_input = gr.Textbox(
	            label="🔗 Enter URL",
	            placeholder="https://example.com",
	            scale=4,
	        )
	        btn = gr.Button("Summarize", variant="primary")

	    with gr.Row():
	        with gr.Column(scale=1):
	            raw_output = gr.Textbox(
	                label="📝 Raw Scraped Text",
	                lines=15,
	                interactive=False,
	            )
	        with gr.Column(scale=1):
	            summary_output = gr.Textbox(
	                label="📄 AI Summary",
	                lines=15,
	                interactive=False,
	            )

	    # Clicking the button runs the scrape-then-summarize pipeline and
	    # fills both result boxes from the returned (raw_text, summary) pair.
	    btn.click(
	        fn=scrape_and_summarize,
	        inputs=[url_input],
	        outputs=[raw_output, summary_output],
	    )

	# 🚀 Launch app
	if __name__ == "__main__":
	    # Passing share=True would create a public link; be cautious with
	    # API keys if you do.
	    demo.launch()