Spaces:

bala00712200502
/

webscraping

Sleeping

App Files Community

bala00712200502 committed on Apr 15, 2025

Commit

0d6de03

verified ·

1 Parent(s): 1c460ec

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -61

app.py CHANGED Viewed

@@ -15,43 +15,47 @@ logger = logging.getLogger(__name__)
 # Load environment variables
 load_dotenv()
-class ConnectionSafeChatOpenAI(ChatOpenAI):
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     def _call(self, *args, **kwargs):
         try:
             return super()._call(*args, **kwargs)
         except Exception as e:
-            logger.error(f"OpenAI connection failed: {str(e)}")
             if "Incorrect API key" in str(e):
-                raise ValueError("Invalid API key configuration")
-            raise ConnectionError(f"Service unavailable. Please try again later.")
 try:
-    llm = ConnectionSafeChatOpenAI(
-        model="gpt-3.5-turbo-0125",  # Use latest stable model
-        temperature=0.7,
-        request_timeout=45,
         max_retries=2
     )
 except Exception as e:
     logger.critical(f"LLM initialization failed: {str(e)}")
-    raise RuntimeError("Service initialization failed")
-class ContentAnalyzer:
     def __init__(self):
         try:
-            self.extractor = Agent(
-                role='Content Extractor',
-                goal='Extract clean text from web pages',
-                backstory="Specializes in web content extraction.",
                 verbose=False,
                 llm=llm
             )
-            self.analyst = Agent(
                 role='Content Analyst',
-                goal='Create accurate summaries',
-                backstory="Expert in distilling key information.",
                 verbose=False,
                 llm=llm
             )
@@ -60,67 +64,80 @@ class ContentAnalyzer:
             raise
     @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=1, min=2, max=5))
-    def safe_request(self, url):
         try:
             headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
             response = requests.get(url, headers=headers, timeout=20)
             response.raise_for_status()
-            return response.text
         except Exception as e:
-            logger.warning(f"Request failed for {url}: {str(e)}")
-            raise ConnectionError(f"Couldn't fetch the webpage")
-    def process_content(self, url):
         try:
-            # Step 1: Fetch content
-            html = self.safe_request(url)
-            soup = BeautifulSoup(html, 'html.parser')
-            # Clean content
-            for element in soup(['script', 'style', 'nav', 'footer']):
-                element.decompose()
-            text = soup.get_text(separator='\n', strip=True)[:4000]
-            # Step 2: Analyze content
-            extract_task = Task(
-                description="Extract key information from this content.",
-                expected_output="Clean structured content in markdown.",
-                agent=self.extractor
             )
             analyze_task = Task(
-                description="Create a concise summary with key points.",
-                expected_output="Bullet point summary with main ideas.",
-                agent=self.analyst
             )
             crew = Crew(
-                agents=[self.extractor, self.analyst],
-                tasks=[extract_task, analyze_task],
                 verbose=False,
                 process=Process.sequential
             )
-            return crew.kickoff(inputs={'content': text})
-        except ConnectionError as e:
-            logger.error(f"Connection error: {str(e)}")
-            return f"🔴 Connection failed: {str(e)}"
         except Exception as e:
-            logger.error(f"Processing error: {str(e)}")
-            return f"⚠️ Error: {str(e)}"
 # Gradio Interface
 def create_interface():
-    analyzer = ContentAnalyzer()
-    with gr.Blocks(title="Web Content Analyzer", theme=gr.themes.Soft()) as app:
         gr.Markdown("""
-        ## 🌐 Web Content Analyzer
-        *Works best with informational websites (Wikipedia, documentation, news articles)*
         """)
         with gr.Row():
@@ -131,10 +148,13 @@ def create_interface():
             )
             submit_btn = gr.Button("Analyze", variant="primary")
-        output = gr.Markdown(label="Analysis Results")
         submit_btn.click(
-            fn=analyzer.process_content,
             inputs=url_input,
             outputs=output
         )
@@ -155,6 +175,5 @@ if __name__ == "__main__":
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",
-        server_port=7860,
-        share=False
     )

 # Load environment variables
 load_dotenv()
+# Initialize OpenAI LLM with robust error handling
+class SafeChatOpenAI(ChatOpenAI):
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     def _call(self, *args, **kwargs):
         try:
             return super()._call(*args, **kwargs)
         except Exception as e:
+            logger.error(f"OpenAI API Error: {str(e)}")
             if "Incorrect API key" in str(e):
+                raise ValueError("Invalid OpenAI API key configuration")
+            raise ConnectionError("OpenAI service unavailable. Please try again later.")
 try:
+    llm = SafeChatOpenAI(
+        model="gpt-3.5-turbo",
+        temperature=0.5,  # More deterministic output
+        request_timeout=60,
         max_retries=2
     )
 except Exception as e:
     logger.critical(f"LLM initialization failed: {str(e)}")
+    raise RuntimeError("Failed to initialize AI services")
+class WebScraperAgent:
     def __init__(self):
         try:
+            # Define agents
+            self.scraper_agent = Agent(
+                role='Senior Web Scraper',
+                goal='Extract clean content from any webpage',
+                backstory="""Expert in extracting information from complex websites,
+                adept at handling various structures and formats.""",
                 verbose=False,
                 llm=llm
             )
+            self.analyst_agent = Agent(
                 role='Content Analyst',
+                goal='Provide clear, concise summaries',
+                backstory="""Specializes in analyzing and summarizing web content
+                into key points and actionable insights.""",
                 verbose=False,
                 llm=llm
             )
             raise
     @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=1, min=2, max=5))
+    def scrape_website(self, url):
+        """Robust web scraping function with error handling"""
         try:
             headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5'
             }
             response = requests.get(url, headers=headers, timeout=20)
             response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
+                element.decompose()
+            # Get clean text
+            text = soup.get_text(separator='\n', strip=True)
+            return text[:3000]  # Limit to avoid token limits
         except Exception as e:
+            logger.warning(f"Failed to scrape {url}: {str(e)}")
+            raise ConnectionError(f"Couldn't access this website. Error: {str(e)}")
+    def analyze_content(self, content):
+        """Process content through AI analysis pipeline"""
         try:
+            # Define tasks
+            scrape_task = Task(
+                description="Extract and clean the main content from this webpage data.",
+                expected_output="Well-formatted text containing the core content.",
+                agent=self.scraper_agent
             )
             analyze_task = Task(
+                description="Analyze this content and extract key information.",
+                expected_output="""Concise summary with:
+                - 3-5 key bullet points
+                - Main topics covered
+                - Any important statistics or facts""",
+                agent=self.analyst_agent
             )
+            # Create and run crew
             crew = Crew(
+                agents=[self.scraper_agent, self.analyst_agent],
+                tasks=[scrape_task, analyze_task],
                 verbose=False,
                 process=Process.sequential
             )
+            return crew.kickoff(inputs={'content': content})
         except Exception as e:
+            logger.error(f"Analysis failed: {str(e)}")
+            raise RuntimeError(f"Analysis error: {str(e)}")
 # Gradio Interface
 def create_interface():
+    scraper = WebScraperAgent()
+    def process_url(url):
+        try:
+            # Step 1: Scrape
+            content = scraper.scrape_website(url)
+            # Step 2: Analyze
+            return scraper.analyze_content(content)
+        except Exception as e:
+            return f"❌ Error: {str(e)}"
+    with gr.Blocks(title="AI Web Scraper", theme=gr.themes.Soft()) as app:
         gr.Markdown("""
+        # 🌐 AI-Powered Web Scraper
+        *Extract and summarize content from any website*
         """)
         with gr.Row():
             )
             submit_btn = gr.Button("Analyze", variant="primary")
+        output = gr.Markdown(
+            label="Analysis Results",
+            elem_classes=["output-box"]
+        )
         submit_btn.click(
+            fn=process_url,
             inputs=url_input,
             outputs=output
         )
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",
+        server_port=7860
     )