Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+48 -8)
@@ -22,6 +22,17 @@ import kss  # use KSS instead of KoNLPy
 from PIL import Image
 import base64
 from io import BytesIO
+import logging
+
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('/tmp/crawler.log')
+    ]
+)
 
 # Add word cloud support
 try:
@@ -130,16 +141,21 @@ def crawl_naver_news(keyword, num_articles=5):
     """
     Function that collects Naver news articles.
     """
+    logging.info(f"Crawl started: keyword={keyword}, num_articles={num_articles}")
     url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
     results = []
 
     try:
         # Request the search results page
+        logging.info(f"Request URL: {url}")
         response = requests.get(url)
+        logging.info(f"Response status code: {response.status_code}")
+
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Find news items
         news_items = soup.select('div.sds-comps-base-layout.sds-comps-full-layout')
+        logging.info(f"News items found: {len(news_items)}")
 
         # Extract information from each news item
         for i, item in enumerate(news_items):
@@ -174,40 +190,50 @@ def crawl_naver_news(keyword, num_articles=5):
                     'description': description,
                     'source': source,
                     'date': date,
-                    'content': ""
+                    'content': ""
                 })
 
+                logging.info(f"Article extracted: {title}")
+
             except Exception as e:
-
+                logging.error(f"Error while extracting article info: {str(e)}", exc_info=True)
                 continue
 
     except Exception as e:
-
+        logging.error(f"Error while requesting the page: {str(e)}", exc_info=True)
 
+    logging.info(f"Crawl finished: {len(results)} articles collected")
     return results
 
 # Fetch the original article text
 def get_article_content(url):
+    logging.info(f"Fetching article content: {url}")
     try:
         response = requests.get(url, timeout=5)
+        logging.info(f"Content request status code: {response.status_code}")
+
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Find the Naver news article body
         content = soup.select_one('#dic_area')
         if content:
             text = content.text.strip()
-            text = re.sub(r'\s+', ' ', text)
+            text = re.sub(r'\s+', ' ', text)
+            logging.info("Extracted Naver news article body")
             return text
 
-        # Find the article body on other news sites
+        # Find the article body on other news sites
         content = soup.select_one('.article_body, .article-body, .article-content, .news-content-inner')
         if content:
             text = content.text.strip()
             text = re.sub(r'\s+', ' ', text)
+            logging.info("Extracted article body from a non-Naver site")
             return text
 
+        logging.warning("Article body not found")
         return "Could not retrieve the article body."
     except Exception as e:
+        logging.error(f"Error while fetching article content: {str(e)}", exc_info=True)
         return f"Error occurred: {str(e)}"
 
 # Keyword analysis with NLTK (using KSS)
@@ -423,11 +449,14 @@ def run_scheduled_task():
         traceback.print_exc()
 
 def perform_news_task(task_type, keyword, num_articles, file_prefix):
+    logging.info(f"Scheduled task started: {task_type}, keyword={keyword}")
    try:
        articles = crawl_naver_news(keyword, num_articles)
+        logging.info(f"Articles collected: {len(articles)}")
 
        # Fetch the content of each article
-        for article in articles:
+        for i, article in enumerate(articles):
+            logging.info(f"Fetching article {i+1}/{len(articles)}: {article['title']}")
            article['content'] = get_article_content(article['link'])
            time.sleep(0.5)  # avoid overloading the server
 
@@ -439,10 +468,12 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
 
+        logging.info(f"Results saved: {filename}")
+
        global_scheduler_state.last_run = datetime.now()
        print(f"{datetime.now()} - {task_type} news article collection finished: {keyword}")
 
-        # Store the collected results in the global state
+        # Store the collected results in the global state
        result_item = {
            'task_type': task_type,
            'keyword': keyword,
@@ -453,7 +484,7 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
        global_scheduler_state.scheduled_results.append(result_item)
 
    except Exception as e:
-
+        logging.error(f"Error while running the task: {str(e)}", exc_info=True)
        traceback.print_exc()
 
 def start_scheduler(daily_tasks, interval_tasks):
@@ -1015,6 +1046,15 @@ elif menu == "Summarize News Articles":
    with tab3:
        st.subheader("Scheduler control and status")
 
+        # Add a log viewer
+        if st.checkbox("View logs"):
+            try:
+                with open('/tmp/crawler.log', 'r') as f:
+                    logs = f.readlines()
+                st.text_area("Recent logs", value=''.join(logs[-100:]), height=400)
+            except Exception as e:
+                st.error(f"Could not read the log file: {str(e)}")
+
        col1, col2 = st.columns(2)
 
        with col1:
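Taken together, the change wires standard Python logging into the crawler (console plus a file under /tmp) and adds an on-demand log viewer to the Streamlit scheduler tab. Below is a minimal, self-contained sketch of that pattern, assuming the same /tmp/crawler.log path used in the diff; the LOG_PATH constant and widget labels are illustrative, not taken from the app.

import logging
import streamlit as st

LOG_PATH = "/tmp/crawler.log"  # illustrative constant; the diff writes to this path directly

# Route log records to both the console and a file so the UI can read them back.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(), logging.FileHandler(LOG_PATH)],
)

logging.info("crawler started")  # example record

# On-demand log viewer: show the last 100 lines of the log file.
if st.checkbox("View logs"):
    try:
        with open(LOG_PATH, "r", encoding="utf-8") as f:
            lines = f.readlines()
        st.text_area("Recent logs", value="".join(lines[-100:]), height=400)
    except OSError as e:
        st.error(f"Could not read log file: {e}")

One design note: logging.basicConfig() only installs handlers when the root logger has none, so Streamlit's script reruns within the same process should not attach duplicate handlers.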