Update src/streamlit_app.py
src/streamlit_app.py (CHANGED): +239 / -258
@@ -4,73 +4,33 @@ import requests
 from bs4 import BeautifulSoup
 import re
 import time
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from collections import Counter
 import json
 import os
 from datetime import datetime, timedelta
+import openai
+from dotenv import load_dotenv
 import traceback
 import plotly.graph_objects as go
 import schedule
 import threading
 import matplotlib.pyplot as plt
-from pathlib import Path
-import openai
-from dotenv import load_dotenv
-
-# Temporary directory setup for the Hugging Face Spaces environment
-# /tmp may exist but can have permission problems, so use the current working directory instead
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
-DATA_DIR = os.path.join(CURRENT_DIR, "data")
-SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
-SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
-
-# Directory creation helper
-def ensure_directory(directory):
-    try:
-        os.makedirs(directory, exist_ok=True)
-        return True
-    except Exception as e:
-        st.error(f"Error while creating directory: {str(e)}")
-        return False
 
-#
-
-
+# /tmp path settings
+TMP_DIR = "/tmp"
+SAVED_ARTICLES_PATH = os.path.join(TMP_DIR, "saved_articles.json")
+SCHEDULED_NEWS_DIR = os.path.join(TMP_DIR, "scheduled_news")
 
-#
-try:
-    import kss
-    kss_available = True
-except ImportError:
-    st.warning("The KSS library is not installed. Install it with 'pip install kss'.")
-    kss_available = False
-
-# Korean tokenization helper (uses KSS)
-def tokenize_korean(text):
-    try:
-        if kss_available:
-            tokens = []
-            # Split into sentences, then extract words from each sentence
-            for sentence in kss.split_sentences(text):
-                # Refine the basic whitespace tokenization with regex patterns
-                raw_tokens = sentence.split()
-                for token in raw_tokens:
-                    # Separate particles, special characters, and so on
-                    sub_tokens = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token)
-                    tokens.extend(sub_tokens)
-            return tokens
-    except Exception as e:
-        st.debug(f"KSS tokenization failed: {str(e)}")
-
-    # Fall back to the regex-based tokenizer when KSS is unavailable or fails
-    return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
-
-# Word cloud (optional)
+# Word cloud
 try:
     from wordcloud import WordCloud
-    wordcloud_available = True
 except ImportError:
-
-
+    st.error("Please install the wordcloud package: pip install wordcloud")
+    WordCloud = None
+
 # Scheduler state class
 class SchedulerState:
     def __init__(self):
@@ -101,6 +61,31 @@ if st.session_state.openai_api_key is None:
     load_dotenv()  # local .env file
     st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
 
+# Download the required NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+
+# OpenAI API key setup
+# Either set openai.api_key from st.session_state.openai_api_key right before each API call,
+# or set it once at app startup. The latter approach is used here.
+if st.session_state.openai_api_key:
+    openai.api_key = st.session_state.openai_api_key
+else:
+    # The key may be missing when the UI first loads; it is set later once the user enters it
+    pass
+
 # Page settings
 st.set_page_config(page_title="News Article Tool", page_icon="📰", layout="wide")
 
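
The hunk above repeats the same find/download guard once per NLTK resource. The sketch below is only an illustration of that pattern, not code from the commit; the resource paths are the ones the diff already uses.

import nltk

def ensure_nltk_resource(find_path: str, package: str) -> None:
    # nltk.data.find raises LookupError when the resource is missing,
    # so the download only runs on a cold start.
    try:
        nltk.data.find(find_path)
    except LookupError:
        nltk.download(package)

for path, pkg in [
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
]:
    ensure_nltk_resource(path, pkg)
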
@@ -111,17 +96,21 @@ menu = st.sidebar.radio(
     ["News Article Crawling", "Article Analysis", "Generate New Article", "News Article Scheduling"]
 )
 
-#
-
-
-
-
-
-
-
+# Directory creation helper
+def ensure_directory(directory):
+    try:
+        os.makedirs(directory, mode=0o777, exist_ok=True)
+        # Set directory permissions
+        os.chmod(directory, 0o777)
+    except Exception as e:
+        st.error(f"Error while creating directory: {str(e)}")
+        return False
+    return True
+
 # Load saved articles
 def load_saved_articles():
     try:
+        ensure_directory(TMP_DIR)
         if os.path.exists(SAVED_ARTICLES_PATH):
             with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
                 return json.load(f)
@@ -133,12 +122,15 @@ def load_saved_articles():
 # Save articles
 def save_articles(articles):
     try:
+        ensure_directory(TMP_DIR)
         with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
-
+        # Set file permissions
+        os.chmod(SAVED_ARTICLES_PATH, 0o666)
     except Exception as e:
         st.error(f"Error while saving articles: {str(e)}")
         return False
+    return True
 
 @st.cache_data
 def crawl_naver_news(keyword, num_articles=5):
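
A short usage sketch for the /tmp-backed persistence introduced above. It is illustrative only, assumes the module context of this file, and keep in mind that /tmp on a Spaces container is not persistent across restarts.

sample = [{"title": "Example", "link": "https://example.com", "content": "..."}]

if save_articles(sample):                  # writes /tmp/saved_articles.json and chmods it to 0o666
    loaded = load_saved_articles() or []   # reads the same file back as parsed JSON
    print(loaded[0]["title"])              # -> "Example"
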
@@ -225,89 +217,49 @@ def get_article_content(url):
     except Exception as e:
         return f"Error occurred: {str(e)}"
 
-#
+# Keyword analysis using NLTK
 def analyze_keywords(text, top_n=10):
-    # Korean stopword list
-    korean_stopwords = [
-        '이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로',
-        '에게', '뿐', '다', '도', '가', '이다', '에게서', '께', '께서', '부터', '까지',
-        '이런', '저런', '그런', '어떤', '무슨', '이것', '저것', '그것', '이번', '저번', '그번',
-        '이거', '저거', '그거', '하다', '되다', '있다', '없다', '같다', '보다', '이렇다', '그렇다',
-        '하는', '되는', '있는', '없는', '같은', '보는', '이런', '그런', '저런', '한다', '된다',
-        '있었다', '없었다', '같았다', '봤다', '또', '또한', '그리고', '하지만', '그러나', '그래서',
-        '때문에', '따라서', '하며', '되며', '있으며', '없으며', '같으며', '보며', '하고', '되고',
-        '있고', '없고', '같고', '보고', '통해', '위해', '때', '중', '한'
-    ]
+    # Korean stopword list (must be defined manually)
+    korean_stopwords = ['이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로']
 
-
-    english_stopwords = [
-        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
-        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
-        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
-        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
-        'will', 'shall', 'can', 'may', 'must', 'ought'
-    ]
+    tokens = word_tokenize(text)
+    tokens = [word for word in tokens if word.isalnum() and len(word) > 1 and word not in korean_stopwords]
 
-    # Language detection (simple check for Hangul characters)
-    is_korean = bool(re.search(r'[가-힣]', text))
-
-    if is_korean:
-        # Use the KSS-based tokenizer for Korean text
-        tokens = tokenize_korean(text)
-    else:
-        # Simple regex tokenization for English and other languages
-        tokens = re.findall(r'\b\w+\b', text.lower())
-
-    # Stopword filtering (apply language-specific stopwords)
-    stopwords = korean_stopwords if is_korean else english_stopwords
-    tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
-
-    # Count frequencies
-    from collections import Counter
     word_count = Counter(tokens)
     top_keywords = word_count.most_common(top_n)
 
     return top_keywords
 
-
+# Keyword extraction for the word cloud
 def extract_keywords_for_wordcloud(text, top_n=50):
     if not text or len(text.strip()) < 10:
         return {}
 
     try:
-
-
-
-
-
+        try:
+            tokens = word_tokenize(text.lower())
+        except Exception as e:
+            st.warning(f"Error occurred: {str(e)}")
+            tokens = text.lower().split()
 
-
-
-
-
-
-        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
-        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
-        'will', 'shall', 'can', 'may', 'must', 'ought'
-        }
+        stop_words = set()
+        try:
+            stop_words = set(stopwords.words('english'))
+        except Exception:
+            pass
 
-
-        korean_stopwords = {
+        korea_stop_words = {
            '및', '등', '를', '이', '의', '가', '에', '는', '으로', '에서', '그', '도', '또는', '하는', '할', '하고',
-
-
-
-
-
-
-
-
+           '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
+           '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '는', '은', '수', '것', '들', '저', '제',
+           '년', '월', '일', '시', '분', '초', '지난', '올해', '내년', '최근', '현재', '오늘', '내일', '어제',
+           '오전', '오후', '부터', '까지', '에게', '께서', '이라고', '라고', '하며', '하면서', '따라', '통해',
+           '관련', '한편', '특히', '가장', '매우', '더', '덜', '많이', '조금', '항상', '자주', '가끔', '거의',
+           '전혀', '바로', '정말', '만약', '비롯한', '등을', '등이', '등의', '등과', '등도', '등에', '등에서',
+           '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
+           '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
         }
-
-        # Choose stopwords by language
-        stop_words = korean_stopwords if is_korean else english_stopwords
+        stop_words.update(korea_stop_words)
 
         # Keep only tokens longer than one character that are not stopwords
         filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
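
For reference, analyze_keywords now returns Counter.most_common output, a list of (token, count) tuples. A hypothetical call, assuming the module context and downloaded punkt data, could look like this:

text = "AI news about AI models and AI chips"
print(analyze_keywords(text, top_n=2))
# e.g. [('AI', 3), ('news', 1)]  -- exact tokens depend on word_tokenize and the stopword list
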
@@ -327,45 +279,51 @@ def extract_keywords_for_wordcloud(text, top_n=50):
         return dict(sorted_words[:top_n])
 
     except Exception as e:
-        st.error(f"
+        st.error(f"Error occurred: {str(e)}")
         return {"data": 1, "analysis": 1, "news": 1}
+
 
 # Word cloud generation function
+
 def generate_wordcloud(keywords_dict):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if not WordCloud:
+        st.warning("The wordcloud package is not installed.")
+        return None
+    try:
+        # Assume NanumGothic.ttf is in the project root
+        font_path = "NanumGothic.ttf"
+
+        # Check whether the font file exists locally; fall back to the default font if not
+        if not os.path.exists(font_path):
+            st.warning(f"Font file ({font_path}) not found. Generating the word cloud with the default font; Korean text may not render correctly.")
+            # font_path = None  # or point to a system default font path (varies by platform)
+            # If font_path is left out of (or None in) the WordCloud constructor, the system default is tried
+            wc = WordCloud(
+                width=800,
+                height=400,
+                background_color='white',
+                colormap='viridis',
+                max_font_size=150,
+                random_state=42
+            ).generate_from_frequencies(keywords_dict)
+        else:
+            wc = WordCloud(
+                font_path=font_path,
+                width=800,
+                height=400,
+                background_color='white',
+                colormap='viridis',
+                max_font_size=150,
+                random_state=42
+            ).generate_from_frequencies(keywords_dict)
 
-
+        return wc
 
-
-
-
+    except Exception as e:
+        st.error(f"Error while generating the word cloud: {str(e)}")
+        # traceback.print_exc()  # use when debugging
+        st.warning("Failed to generate the word cloud. This may be a font issue; check that NanumGothic.ttf is in the project root.")
+        return None
 
 # News analysis function
 def analyze_news_content(news_df):
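
generate_wordcloud returns a WordCloud object or None. Rendering it mirrors what the "Word Cloud" tab later in this diff does; the sketch below assumes the module context (st, plt) of this file.

wc = generate_wordcloud({"news": 10, "analysis": 7, "data": 5})
if wc is not None:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wc, interpolation='bilinear')  # WordCloud objects can be drawn directly with imshow
    ax.axis('off')
    st.pyplot(fig)
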
@@ -373,37 +331,32 @@ def analyze_news_content(news_df):
         return "No data available"
 
     results = {}
-
-    # Per-category analysis
+    # By category
     if 'source' in news_df.columns:
-
+        results['source_counts'] = news_df['source'].value_counts().to_dict()
+    # By date
     if 'date' in news_df.columns:
-
+        results['date_counts'] = news_df['date'].value_counts().to_dict()
 
-
+    # Keyword analysis
     all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
 
     if len(all_text.strip()) > 0:
-        results['top_keywords_for_wordcloud']
+        results['top_keywords_for_wordcloud'] = extract_keywords_for_wordcloud(all_text, top_n=50)
         results['top_keywords'] = analyze_keywords(all_text)
     else:
-        results['top_keywords_for_wordcloud']
+        results['top_keywords_for_wordcloud'] = {}
         results['top_keywords'] = []
-
     return results
 
 # Generate a new article with the OpenAI API
 def generate_article(original_content, prompt_text):
     if not st.session_state.openai_api_key:
         return "Error: The OpenAI API key is not set. Enter the key in the sidebar or set the environment variable."
-
+    openai.api_key = st.session_state.openai_api_key
     try:
-        # Set the API key
-        openai.api_key = st.session_state.openai_api_key
-
-        # Call the API
         response = openai.chat.completions.create(
-            model="gpt-4.1-mini",
+            model="gpt-4.1-mini",
             messages=[
                 {"role": "system", "content": "You are a professional news reporter. Please write a new article based on the given content."},
                 {"role": "user", "content": f"Based on the following content, {prompt_text}\n\n{original_content[:1000]}"}
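
The tail of generate_article falls outside this hunk. Assuming it returns the completion text, the field to read on the openai>=1.0 style response object used here is choices[0].message.content, as in this sketch:

def _completion_text(response) -> str:
    # chat.completions responses expose the generated text at choices[0].message.content
    return response.choices[0].message.content
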
@@ -418,17 +371,13 @@ def generate_article(original_content, prompt_text):
 def generate_image(prompt):
     if not st.session_state.openai_api_key:
         return "Error: The OpenAI API key is not set. Enter the key in the sidebar or set the environment variable."
-
+    openai.api_key = st.session_state.openai_api_key
     try:
-        # Set the API key
-        openai.api_key = st.session_state.openai_api_key
-
-        # Call the API
         response = openai.images.generate(
             model="gpt-image-1",
             prompt=prompt
         )
-        image_base64
+        image_base64 = response.data[0].b64_json
         return f"data:image/png;base64,{image_base64}"
     except Exception as e:
         return f"Image generation error: {str(e)}"
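
generate_image returns a data-URL string. One way to turn that string into raw bytes that st.image can display is sketched here (not code from the commit):

import base64

def data_url_to_bytes(data_url: str) -> bytes:
    # "data:image/png;base64,<payload>" -> decoded PNG bytes
    b64_payload = data_url.split(",", 1)[1]
    return base64.b64decode(b64_payload)

# usage: st.image(data_url_to_bytes(image_url))
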
@@ -460,12 +409,18 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
         time.sleep(0.5)  # avoid overloading the server
 
     # Save the results
+    if not ensure_directory(SCHEDULED_NEWS_DIR):
+        print(f"Failed to create the scheduled news directory")
+        return
+
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
 
     try:
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
+        # Set file permissions
+        os.chmod(filename, 0o666)
     except Exception as e:
         print(f"Error while saving the file: {e}")
         return
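
The file imports schedule and threading at the top; a minimal, hypothetical wiring of perform_news_task into a background scheduler is sketched below. The interval, the arguments, and the run_scheduler name are assumptions for illustration, not part of the commit.

def run_scheduler():
    # run the crawl task periodically; argument values here are placeholders
    schedule.every(60).minutes.do(perform_news_task, "crawl", "AI", 5, "scheduled")
    while True:
        schedule.run_pending()
        time.sleep(1)

threading.Thread(target=run_scheduler, daemon=True).start()
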
@@ -587,9 +542,8 @@ if menu == "News Article Crawling":
             articles = crawl_naver_news(keyword, num_articles)
 
             # Fetch the article contents
-            progress_bar = st.progress(0)
             for i, article in enumerate(articles):
-                progress_bar.progress((i + 1) / len(articles))
+                st.progress((i + 1) / len(articles))
                 article['content'] = get_article_content(article['link'])
                 time.sleep(0.5)  # avoid overloading the server
 
@@ -605,7 +559,7 @@ if menu == "News Article Crawling":
                     st.write(f"**Summary:** {article['description']}")
                     st.write(f"**Link:** {article['link']}")
                     st.write("**Content preview:**")
-                    st.write(article['content'][:300] + "..."
+                    st.write(article['content'][:300] + "...")
 
 elif menu == "Article Analysis":
     st.header("Article Analysis")
@@ -640,6 +594,7 @@ elif menu == "Article Analysis":
             keyword_tab1, keyword_tab2 = st.tabs(["Keyword Frequency", "Word Cloud"])
 
             with keyword_tab1:
+
                 keywords = analyze_keywords(selected_article['content'])
 
                 # Visualization
@@ -649,38 +604,23 @@ elif menu == "Article Analysis":
                 st.write("**Top keywords:**")
                 for word, count in keywords:
                     st.write(f"- {word}: {count} times")
-
             with keyword_tab2:
                 keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
+                wc = generate_wordcloud(keyword_dict)
 
-                if wordcloud_available:
-
+                if wc:
+                    fig, ax = plt.subplots(figsize=(10, 5))
+                    ax.imshow(wc, interpolation='bilinear')
+                    ax.axis('off')
+                    st.pyplot(fig)
 
-
-
-                        ax.axis('off')
-                        st.pyplot(fig)
-
-                        # Show the top 20 keywords
-                        st.write("**Top 20 keywords:**")
-                        top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
-                        keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
-                        st.dataframe(keyword_df)
-                    else:
-                        st.error("Could not generate the word cloud.")
-                else:
-                    # Fallback display when the word cloud cannot be used
-                    st.warning("The word cloud feature is unavailable. The required package is not installed.")
-
-                    # Show only the keywords instead
-                    st.write("**Top keywords:**")
-                    top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:30]
+                    # Show the top 20 keywords
+                    st.write("**Top 20 keywords:**")
+                    top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
                     keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
                     st.dataframe(keyword_df)
-
-
-                    st.bar_chart(keyword_df.set_index('Keyword').head(15))
+                else:
+                    st.error("Could not generate the word cloud.")
 
         elif analysis_type == "Text Statistics":
             if st.button("Analyze Text Statistics"):
@@ -689,18 +629,7 @@ elif menu == "Article Analysis":
                 # Compute text statistics
                 word_count = len(re.findall(r'\b\w+\b', content))
                 char_count = len(content)
-
-                # Split sentences with KSS
-                if kss_available:
-                    try:
-                        sentences = kss.split_sentences(content)
-                        sentence_count = len(sentences)
-                    except Exception:
-                        # Fall back to a simple sentence split if KSS fails
-                        sentence_count = len(re.split(r'[.!?]+', content))
-                else:
-                    sentence_count = len(re.split(r'[.!?]+', content))
-
+                sentence_count = len(re.split(r'[.!?]+', content))
                 avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
                 avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
@@ -726,38 +655,79 @@ elif menu == "Article Analysis":
                 st.write(f"Text complexity score: {complexity_score:.1f}/10")
 
                 # Frequency bar chart
-                st.subheader("Part-of-speech distribution")
-
-                # Language detection (simple check for Hangul characters)
-                is_korean = bool(re.search(r'[가-힣]', content))
-
+                st.subheader("Part-of-speech distribution (Korean/English supported)")
                 try:
-                    #
-
+                    # Check whether KoNLPy is installed
+                    try:
+                        from konlpy.tag import Okt
+                        konlpy_installed = True
+                    except ImportError:
+                        konlpy_installed = False
+                        st.warning("Install KoNLPy for Korean morphological analysis: pip install konlpy")
 
-
-
-
+                    # Prepare the English POS tagger
+                    from nltk import pos_tag
+                    try:
+                        nltk.data.find('taggers/averaged_perceptron_tagger')
+                    except LookupError:
+                        nltk.download('averaged_perceptron_tagger')
+
+                    # Try using the correct resource name as shown in the error message
+                    try:
+                        nltk.data.find('averaged_perceptron_tagger_eng')
+                    except LookupError:
+                        nltk.download('averaged_perceptron_tagger_eng')
+
+                    # Language detection (simple check)
+                    is_korean = bool(re.search(r'[가-힣]', content))
+
+                    if is_korean and konlpy_installed:
+                        # Korean morphological analysis
+                        okt = Okt()
+                        tagged = okt.pos(content)
+
+                        # Korean POS mapping
+                        pos_dict = {
+                            'Noun': 'Noun', 'NNG': 'Noun', 'NNP': 'Proper noun',
+                            'Verb': 'Verb', 'VV': 'Verb', 'VA': 'Adjective',
+                            'Adjective': 'Adjective',
+                            'Adverb': 'Adverb',
+                            'Josa': 'Particle', 'Punctuation': 'Punctuation',
+                            'Determiner': 'Determiner', 'Exclamation': 'Interjection'
+                        }
 
-
-
-
-
-                        pos_counts[
-                    elif
-                        pos_counts['
-                    elif
-                        pos_counts['
+                        pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Particle': 0, 'Punctuation': 0, 'Determiner': 0, 'Interjection': 0, 'Other': 0}
+
+                        for _, pos in tagged:
+                            if pos in pos_dict:
+                                pos_counts[pos_dict[pos]] += 1
+                            elif pos.startswith('N'):  # other noun-like tags
+                                pos_counts['Noun'] += 1
+                            elif pos.startswith('V'):  # other verb-like tags
+                                pos_counts['Verb'] += 1
                             else:
                                 pos_counts['Other'] += 1
+
                     else:
-                        # English
-
-
-
-
-
+                        # English POS tagging
+                        tokens = word_tokenize(content.lower())
+                        tagged = pos_tag(tokens)
+
+                        # English POS mapping
+                        pos_dict = {
+                            'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Proper noun', 'NNPS': 'Proper noun',
+                            'VB': 'Verb', 'VBD': 'Verb', 'VBG': 'Verb', 'VBN': 'Verb', 'VBP': 'Verb', 'VBZ': 'Verb',
+                            'JJ': 'Adjective', 'JJR': 'Adjective', 'JJS': 'Adjective',
+                            'RB': 'Adverb', 'RBR': 'Adverb', 'RBS': 'Adverb'
                         }
+
+                        pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Other': 0}
+
+                        for _, pos in tagged:
+                            if pos in pos_dict:
+                                pos_counts[pos_dict[pos]] += 1
+                            else:
+                                pos_counts['Other'] += 1
 
                 # Visualize the results
                 pos_df = pd.DataFrame({
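
For context, konlpy's Okt.pos returns (token, tag) pairs whose tags ('Noun', 'Josa', 'Verb', 'Punctuation', ...) are what the mapping above consumes. A small illustration follows; it requires konlpy plus a Java runtime, and the printed output is approximate.

from konlpy.tag import Okt

okt = Okt()
print(okt.pos("오늘 뉴스를 읽었다."))
# e.g. [('오늘', 'Noun'), ('뉴스', 'Noun'), ('를', 'Josa'), ('읽었다', 'Verb'), ('.', 'Punctuation')]
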
@@ -780,10 +750,14 @@ elif menu == "Article Analysis":
             if st.session_state.openai_api_key:
                 with st.spinner("Analyzing the sentiment of the article..."):
                     try:
-                        # Set the API key
-                        openai.api_key = st.session_state.openai_api_key
-
-
+                        # Check and set the API key right before the sentiment-analysis API call
+                        if not openai.api_key:
+                            if st.session_state.openai_api_key:
+                                openai.api_key = st.session_state.openai_api_key
+                            else:
+                                st.error("The OpenAI API key is not set.")
+                                st.stop()
+
                         response = openai.chat.completions.create(
                             model="gpt-4.1-mini",
                             messages=[
@@ -855,7 +829,7 @@ elif menu == "Article Analysis":
                 fill_color = 'rgba(158, 158, 158, 0.3)'  # light gray
                 line_color = 'rgba(158, 158, 158, 1)'  # dark gray
 
-            # Prepare the radar chart data
+            # Prepare the radar chart data - append the first point so the last point connects back to it
             radar_keywords = keyword_names.copy()
             radar_scores = keyword_scores.copy()
 
@@ -967,8 +941,7 @@ elif menu == "Generate New Article":
         with st.expander("Original article content"):
             st.write(selected_article['content'])
 
-        prompt_text =
-        """Rewrite the article following the format below.
+        prompt_text ="""Rewrite the article following the format below.
 Role: You are a reporter at a professional news organization.
 Task: You need to write a press-style report on a recent event. The material must be fact-based, objective, and accurate.
 Guidelines:
@@ -976,13 +949,14 @@
 The article title should clearly reflect the topic and be written to draw the reader's interest.
 The article body should consist of accurate, concise, and persuasive sentences.
 Include quotes from interviews with people involved.
-Referring to the information and guidelines above, write the article in the format of a professional press release."""
+Referring to the information and guidelines above, write the article in the format of a professional press release."""
 
         # Option to also generate an image
         generate_image_too = st.checkbox("Also generate an image after writing the article", value=True)
 
         if st.button("Generate New Article"):
             if st.session_state.openai_api_key:
+                # openai.api_key = st.session_state.openai_api_key  # already set above, or set right before each call
                 with st.spinner("Generating the article..."):
                     new_article = generate_article(selected_article['content'], prompt_text)
 
@@ -1001,6 +975,13 @@
                         """
 
                         # Generate the image
+                        # Check and set the API key right before the image-generation call
+                        if not openai.api_key:
+                            if st.session_state.openai_api_key:
+                                openai.api_key = st.session_state.openai_api_key
+                            else:
+                                st.error("The OpenAI API key is not set.")
+                                st.stop()
                         image_url = generate_image(image_prompt)
 
                         if image_url and not image_url.startswith("Image generation error") and not image_url.startswith("Error: The OpenAI API key is not set."):
@@ -1176,7 +1157,7 @@ elif menu == "News Article Scheduling":
         files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
         if files:
             st.subheader("Open collected files")
-            selected_file = st.selectbox("Select a file", files, index=len(files)-1 if files else 0)
+            selected_file = st.selectbox("Select a file", files, index=len(files)-1 if files else 0)  # guard against an empty files list
             if selected_file and st.button("View file contents"):
                 with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
                     articles = json.load(f)
@@ -1194,4 +1175,4 @@
 
 # Footer
 st.markdown("---")
-st.markdown("© News Article Tool @conanssam")
+st.markdown("© News Article Tool @conanssam")