bala00712200502 committed
Commit 0d6de03 · verified · 1 Parent(s): 1c460ec

Update app.py

Files changed (1):
  1. app.py (+80 −61)

app.py CHANGED
@@ -15,43 +15,47 @@ logger = logging.getLogger(__name__)
 # Load environment variables
 load_dotenv()

- class ConnectionSafeChatOpenAI(ChatOpenAI):
+ # Initialize OpenAI LLM with robust error handling
+ class SafeChatOpenAI(ChatOpenAI):
      @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
      def _call(self, *args, **kwargs):
          try:
              return super()._call(*args, **kwargs)
          except Exception as e:
-             logger.error(f"OpenAI connection failed: {str(e)}")
+             logger.error(f"OpenAI API Error: {str(e)}")
              if "Incorrect API key" in str(e):
-                 raise ValueError("Invalid API key configuration")
-             raise ConnectionError(f"Service unavailable. Please try again later.")
+                 raise ValueError("Invalid OpenAI API key configuration")
+             raise ConnectionError("OpenAI service unavailable. Please try again later.")

  try:
-     llm = ConnectionSafeChatOpenAI(
-         model="gpt-3.5-turbo-0125",  # Use latest stable model
-         temperature=0.7,
-         request_timeout=45,
+     llm = SafeChatOpenAI(
+         model="gpt-3.5-turbo",
+         temperature=0.5,  # More deterministic output
+         request_timeout=60,
          max_retries=2
      )
  except Exception as e:
      logger.critical(f"LLM initialization failed: {str(e)}")
-     raise RuntimeError("Service initialization failed")
+     raise RuntimeError("Failed to initialize AI services")

- class ContentAnalyzer:
+ class WebScraperAgent:
      def __init__(self):
          try:
-             self.extractor = Agent(
-                 role='Content Extractor',
-                 goal='Extract clean text from web pages',
-                 backstory="Specializes in web content extraction.",
+             # Define agents
+             self.scraper_agent = Agent(
+                 role='Senior Web Scraper',
+                 goal='Extract clean content from any webpage',
+                 backstory="""Expert in extracting information from complex websites,
+                 adept at handling various structures and formats.""",
                  verbose=False,
                  llm=llm
              )

-             self.analyst = Agent(
+             self.analyst_agent = Agent(
                  role='Content Analyst',
-                 goal='Create accurate summaries',
-                 backstory="Expert in distilling key information.",
+                 goal='Provide clear, concise summaries',
+                 backstory="""Specializes in analyzing and summarizing web content
+                 into key points and actionable insights.""",
                  verbose=False,
                  llm=llm
              )
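
The retry wrapper above leans entirely on tenacity's decorator. A minimal self-contained sketch of the same backoff pattern, assuming only the tenacity library (`flaky_call` is a hypothetical stand-in for the real OpenAI request):

```python
# Sketch of the retry/backoff pattern used by SafeChatOpenAI._call.
# flaky_call is a hypothetical stand-in for the real OpenAI request.
from tenacity import retry, stop_after_attempt, wait_exponential

attempts = {"count": 0}

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def flaky_call() -> str:
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise ConnectionError("transient failure")  # triggers an exponential-backoff retry
    return "ok"

print(flaky_call())  # fails twice, then succeeds on the third attempt
```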
@@ -60,67 +64,80 @@ class ContentAnalyzer:
              raise

      @retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=1, min=2, max=5))
-     def safe_request(self, url):
+     def scrape_website(self, url):
+         """Robust web scraping function with error handling"""
          try:
              headers = {
-                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                 'Accept-Language': 'en-US,en;q=0.5'
              }
+
              response = requests.get(url, headers=headers, timeout=20)
              response.raise_for_status()
-             return response.text
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Remove unwanted elements
+             for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
+                 element.decompose()
+
+             # Get clean text
+             text = soup.get_text(separator='\n', strip=True)
+             return text[:3000]  # Limit to avoid token limits
          except Exception as e:
-             logger.warning(f"Request failed for {url}: {str(e)}")
-             raise ConnectionError(f"Couldn't fetch the webpage")
+             logger.warning(f"Failed to scrape {url}: {str(e)}")
+             raise ConnectionError(f"Couldn't access this website. Error: {str(e)}")

-     def process_content(self, url):
+     def analyze_content(self, content):
+         """Process content through AI analysis pipeline"""
          try:
-             # Step 1: Fetch content
-             html = self.safe_request(url)
-             soup = BeautifulSoup(html, 'html.parser')
-
-             # Clean content
-             for element in soup(['script', 'style', 'nav', 'footer']):
-                 element.decompose()
-
-             text = soup.get_text(separator='\n', strip=True)[:4000]
-
-             # Step 2: Analyze content
-             extract_task = Task(
-                 description="Extract key information from this content.",
-                 expected_output="Clean structured content in markdown.",
-                 agent=self.extractor
+             # Define tasks
+             scrape_task = Task(
+                 description="Extract and clean the main content from this webpage data.",
+                 expected_output="Well-formatted text containing the core content.",
+                 agent=self.scraper_agent
              )
-
+
              analyze_task = Task(
-                 description="Create a concise summary with key points.",
-                 expected_output="Bullet point summary with main ideas.",
-                 agent=self.analyst
+                 description="Analyze this content and extract key information.",
+                 expected_output="""Concise summary with:
+                 - 3-5 key bullet points
+                 - Main topics covered
+                 - Any important statistics or facts""",
+                 agent=self.analyst_agent
              )
-
+
+             # Create and run crew
              crew = Crew(
-                 agents=[self.extractor, self.analyst],
-                 tasks=[extract_task, analyze_task],
+                 agents=[self.scraper_agent, self.analyst_agent],
+                 tasks=[scrape_task, analyze_task],
                  verbose=False,
                  process=Process.sequential
              )
-
-             return crew.kickoff(inputs={'content': text})
-
-         except ConnectionError as e:
-             logger.error(f"Connection error: {str(e)}")
-             return f"🔴 Connection failed: {str(e)}"
+
+             return crew.kickoff(inputs={'content': content})
          except Exception as e:
-             logger.error(f"Processing error: {str(e)}")
-             return f"⚠️ Error: {str(e)}"
+             logger.error(f"Analysis failed: {str(e)}")
+             raise RuntimeError(f"Analysis error: {str(e)}")

  # Gradio Interface
  def create_interface():
-     analyzer = ContentAnalyzer()
+     scraper = WebScraperAgent()

-     with gr.Blocks(title="Web Content Analyzer", theme=gr.themes.Soft()) as app:
+     def process_url(url):
+         try:
+             # Step 1: Scrape
+             content = scraper.scrape_website(url)
+             # Step 2: Analyze
+             return scraper.analyze_content(content)
+         except Exception as e:
+             return f"❌ Error: {str(e)}"
+
+     with gr.Blocks(title="AI Web Scraper", theme=gr.themes.Soft()) as app:
          gr.Markdown("""
-         ## 🌐 Web Content Analyzer
-         *Works best with informational websites (Wikipedia, documentation, news articles)*
+         # 🌐 AI-Powered Web Scraper
+         *Extract and summarize content from any website*
          """)

          with gr.Row():
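
The new scrape_website method combines fetching and cleaning in one place. A standalone sketch of that fetch-and-clean flow, assuming only requests and BeautifulSoup (the URL is a placeholder; the tag list mirrors the one in the diff):

```python
# Standalone sketch of scrape_website's fetch-and-clean steps.
import requests
from bs4 import BeautifulSoup

def fetch_clean_text(url: str, limit: int = 3000) -> str:
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=20)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Strip non-content elements before extracting text
    for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
        element.decompose()
    # Truncate so the downstream prompt stays within token limits
    return soup.get_text(separator='\n', strip=True)[:limit]

print(fetch_clean_text('https://example.com')[:200])  # placeholder URL
```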
@@ -131,10 +148,13 @@ def create_interface():
          )
          submit_btn = gr.Button("Analyze", variant="primary")

-         output = gr.Markdown(label="Analysis Results")
+         output = gr.Markdown(
+             label="Analysis Results",
+             elem_classes=["output-box"]
+         )

          submit_btn.click(
-             fn=analyzer.process_content,
+             fn=process_url,
              inputs=url_input,
              outputs=output
          )
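
For reference, the button-to-function wiring in this hunk follows the standard gradio Blocks pattern. A minimal runnable sketch (`echo_analysis` is a hypothetical stand-in for process_url):

```python
# Minimal sketch of the Blocks wiring used in create_interface.
import gradio as gr

def echo_analysis(url: str) -> str:  # hypothetical stand-in for process_url
    return f"**Results for** {url}"

with gr.Blocks(title="AI Web Scraper", theme=gr.themes.Soft()) as demo:
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    submit_btn = gr.Button("Analyze", variant="primary")
    output = gr.Markdown(label="Analysis Results")
    submit_btn.click(fn=echo_analysis, inputs=url_input, outputs=output)

demo.launch()  # defaults to 127.0.0.1:7860
```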
@@ -155,6 +175,5 @@ if __name__ == "__main__":
      app = create_interface()
      app.launch(
          server_name="0.0.0.0",
-         server_port=7860,
-         share=False
+         server_port=7860
      )
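
Taken together, the refactor routes scraped text through a two-agent crewAI pipeline. A compact sketch of that agents-to-tasks-to-crew pattern, assuming crewai is installed and `llm` is an already-configured model handle (agent descriptions shortened):

```python
# Compact sketch of the sequential two-agent pipeline from analyze_content.
from crewai import Agent, Task, Crew, Process

def build_crew(llm):
    scraper = Agent(role='Senior Web Scraper',
                    goal='Extract clean content from any webpage',
                    backstory='Web extraction specialist.', verbose=False, llm=llm)
    analyst = Agent(role='Content Analyst',
                    goal='Provide clear, concise summaries',
                    backstory='Summarization specialist.', verbose=False, llm=llm)
    scrape_task = Task(description="Extract and clean the main content.",
                       expected_output="Well-formatted core content.", agent=scraper)
    analyze_task = Task(description="Summarize the key points.",
                        expected_output="Bullet-point summary.", agent=analyst)
    # Process.sequential runs the analyst only after the scraper's task completes
    return Crew(agents=[scraper, analyst], tasks=[scrape_task, analyze_task],
                verbose=False, process=Process.sequential)

# Usage (llm as configured at the top of app.py):
# result = build_crew(llm).kickoff(inputs={'content': page_text})
```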
 