# app.py
import os
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import json
from datetime import datetime, timedelta
import openai
import schedule
import threading
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# ─── Setup: temp directory, NLTK data ────────────────────────────────────────
# Create the temporary working directory
TMP = "/tmp"
NLP_DATA = os.path.join(TMP, "nltk_data")
os.makedirs(NLP_DATA, exist_ok=True)

# Add it to the NLTK data search path
nltk.data.path.insert(0, NLP_DATA)

# Download the required NLTK resources (note: "stopwords" lives under corpora/,
# not tokenizers/, so each resource is checked at its own locator)
for pkg, locator in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(locator)
    except LookupError:
        nltk.download(pkg, download_dir=NLP_DATA)
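
# Note (assumption about the installed NLTK version): newer NLTK releases also
# need the "punkt_tab" resource for word_tokenize; if tokenization raises a
# LookupError at runtime, downloading it the same way is a reasonable fix:
#   nltk.download("punkt_tab", download_dir=NLP_DATA)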
# ─── Load the OpenAI API key ─────────────────────────────────────────────────
# Priority: environment variable, then st.secrets, finally sidebar input
OPENAI_KEY = os.getenv("OPENAI_API_KEY") or st.secrets.get("OPENAI_API_KEY")
if not OPENAI_KEY:
    # Ask for the key in the sidebar while the app is running
    with st.sidebar:
        st.markdown("### 🔑 OpenAI API Key")
        key_input = st.text_input("Enter your OpenAI API Key:", type="password")
        if key_input:
            OPENAI_KEY = key_input

if OPENAI_KEY:
    openai.api_key = OPENAI_KEY
else:
    st.sidebar.error("The OpenAI API key has not been set.")
# ─── Streamlit page & menu layout ────────────────────────────────────────────
st.set_page_config(page_title="📰 News Tool", layout="wide")

with st.sidebar:
    st.title("News Article Tool")
    menu = st.radio("Select a menu", [
        "Crawl News Articles", "Analyze Articles", "Generate New Article", "Schedule News Crawls"
    ])
# ─── File path helper ────────────────────────────────────────────────────────
def _tmp_path(*paths):
    """Join a path under /tmp, creating parent directories as needed."""
    full = os.path.join(TMP, *paths)
    os.makedirs(os.path.dirname(full), exist_ok=True)
    return full
# ─── Load/save collected articles ────────────────────────────────────────────
def load_saved_articles():
    path = _tmp_path("saved_articles", "articles.json")
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    return []

def save_articles(articles):
    path = _tmp_path("saved_articles", "articles.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
# ─── Naver News crawler ──────────────────────────────────────────────────────
def crawl_naver_news(keyword, num_articles=5):
    url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
    results = []
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
        items = soup.select("div.sds-comps-base-layout.sds-comps-full-layout")
        for i, it in enumerate(items):
            if i >= num_articles:
                break
            title_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww span")
            link_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww")
            src_el = it.select_one("div.sds-comps-profile-info-title span")
            date_el = it.select_one("span.r0VOr")
            desc_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww.IaKmSOGPdofdPwPE6cyU > span")
            if not title_el or not link_el:
                continue
            results.append({
                "title": title_el.text.strip(),
                "link": link_el["href"],
                "source": src_el.text.strip() if src_el else "Unknown",
                "date": date_el.text.strip() if date_el else "Unknown",
                "description": desc_el.text.strip() if desc_el else "",
                "content": ""
            })
    except Exception as e:
        st.error(f"Crawling error: {e}")
    return results
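
# Note: the CSS class names above come from Naver's obfuscated search markup and
# change without notice; if crawling silently returns no results, these selectors
# are the first thing to re-check (they are an assumption about the current page
# structure, not a stable API).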
# ─── Fetch the article body ──────────────────────────────────────────────────
def get_article_content(url):
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
        cont = soup.select_one("#dic_area") or soup.select_one(".article_body, .news-content-inner")
        if cont:
            text = re.sub(r"\s+", " ", cont.text.strip())
            return text
    except Exception:
        pass
    return "Could not fetch the article body."
# ─── Keyword analysis & word cloud ───────────────────────────────────────────
def analyze_keywords(text, top_n=10):
    # Minimal Korean stopword list (common particles and function words)
    stop_kr = ["이", "그", "저", "것", "및", "등", "를", "을", "은", "에서", "의", "으로", "로"]
    tokens = [w for w in word_tokenize(text) if w.isalnum() and len(w) > 1 and w not in stop_kr]
    freq = Counter(tokens)
    return freq.most_common(top_n)

def extract_for_wordcloud(text, top_n=50):
    tokens = [w for w in word_tokenize(text.lower()) if w.isalnum()]
    stop_en = set(stopwords.words("english"))
    korea_sw = {"및", "등", "를", "이", "은", "가", "을", "는"}
    sw = stop_en.union(korea_sw)
    filtered = [w for w in tokens if w not in sw and len(w) > 1]
    freq = Counter(filtered)
    return dict(freq.most_common(top_n))
def generate_wordcloud(freq_dict):
    try:
        wc = WordCloud(width=800, height=400, background_color="white")\
            .generate_from_frequencies(freq_dict)
        return wc
    except Exception as e:
        st.error(f"Word cloud generation error: {e}")
        return None
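
# Note: WordCloud's bundled default font cannot render Hangul, so Korean keywords
# come out as empty boxes; passing a Korean font via font_path is the usual fix,
# e.g. (the path is an assumption, adjust to the host system):
#   WordCloud(font_path="/usr/share/fonts/truetype/nanum/NanumGothic.ttf", ...)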
# ─── OpenAI-based article & image generation ─────────────────────────────────
def generate_article(orig, prompt_text):
    if not openai.api_key:
        return "The API key has not been set."
    try:
        resp = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional news reporter."},
                {"role": "user", "content": f"{prompt_text}\n\n{orig[:1000]}"}
            ],
            max_tokens=1500
        )
        return resp.choices[0].message["content"]
    except Exception as e:
        return f"Article generation error: {e}"
def generate_image(prompt):
    if not openai.api_key:
        return None
    try:
        resp = openai.Image.create(prompt=prompt, n=1, size="512x512")
        return resp["data"][0]["url"]
    except Exception as e:
        st.error(f"Image generation error: {e}")
        return None
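
# Note: openai.Image.create is likewise the legacy endpoint; with the openai>=1.0
# SDK the rough equivalent would be client.images.generate(prompt=..., n=1,
# size="512x512") with the URL at resp.data[0].url.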
# ─── Scheduler state ─────────────────────────────────────────────────────────
class SchedulerState:
    def __init__(self):
        self.is_running = False
        self.thread = None
        self.last_run = None
        self.next_run = None
        self.jobs = []
        self.results = []

global_scheduler = SchedulerState()

def perform_news_task(task_type, kw, n, prefix):
    arts = crawl_naver_news(kw, n)
    for a in arts:
        a["content"] = get_article_content(a["link"])
        time.sleep(0.5)
    fname = _tmp_path("scheduled_news", f"{prefix}_{task_type}_{datetime.now():%Y%m%d_%H%M%S}.json")
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(arts, f, ensure_ascii=False, indent=2)
    global_scheduler.last_run = datetime.now()
    global_scheduler.results.append({
        "type": task_type, "keyword": kw,
        "count": len(arts), "file": fname,
        "timestamp": global_scheduler.last_run
    })
def run_scheduler():
    while global_scheduler.is_running:
        schedule.run_pending()
        time.sleep(1)

def start_scheduler(daily, interval):
    if global_scheduler.is_running:
        return
    schedule.clear()
    global_scheduler.jobs = []
    # Daily jobs
    for t in daily:
        hh, mm = t["hour"], t["minute"]
        tag = f"d_{t['keyword']}_{hh}{mm}"
        schedule.every().day.at(f"{hh:02d}:{mm:02d}")\
            .do(perform_news_task, "daily", t["keyword"], t["num_articles"], tag).tag(tag)
        global_scheduler.jobs.append(tag)
    # Interval jobs
    for t in interval:
        tag = f"i_{t['keyword']}_{t['interval']}"
        if t["immediate"]:
            perform_news_task("interval", t["keyword"], t["num_articles"], tag)
        schedule.every(t["interval"]).minutes\
            .do(perform_news_task, "interval", t["keyword"], t["num_articles"], tag).tag(tag)
        global_scheduler.jobs.append(tag)
    global_scheduler.next_run = schedule.next_run()
    global_scheduler.is_running = True
    th = threading.Thread(target=run_scheduler, daemon=True)
    th.start()
    global_scheduler.thread = th

def stop_scheduler():
    global_scheduler.is_running = False
    schedule.clear()
    global_scheduler.jobs = []
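
# Note: global_scheduler is created at module scope, and Streamlit re-executes
# this script on every interaction, so the state object (and its is_running flag)
# is rebuilt on each rerun while the daemon thread keeps running in the server
# process; keeping the scheduler in st.session_state or a cached resource would
# be more robust, but the structure above mirrors the original code.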
# ─── Render the selected menu ────────────────────────────────────────────────
if menu == "Crawl News Articles":
    st.header("Crawl News Articles")
    kw = st.text_input("🔍 Search term", "인공지능")
    num = st.slider("Number of articles to fetch", 1, 20, 5)
    if st.button("Fetch articles"):
        arts = crawl_naver_news(kw, num)
        for i, a in enumerate(arts):
            st.progress((i + 1) / len(arts))
            a["content"] = get_article_content(a["link"])
            time.sleep(0.3)
        save_articles(arts)
        st.success(f"Saved {len(arts)} articles")
        for a in arts:
            with st.expander(a["title"]):
                st.write(f"Source: {a['source']} | Date: {a['date']}")
                st.write(a["description"])
                st.write(a["content"][:300] + "…")
elif menu == "Analyze Articles":
    st.header("Analyze Articles")
    arts = load_saved_articles()
    if not arts:
        st.warning("Collect articles from the 'Crawl News Articles' menu first.")
    else:
        titles = [a["title"] for a in arts]
        sel = st.selectbox("Select an article to analyze", titles)
        art = next(a for a in arts if a["title"] == sel)
        st.subheader(art["title"])
        with st.expander("View full text"):
            st.write(art["content"])
        mode = st.radio("Analysis type", ["Keyword analysis", "Text statistics"])
        if mode == "Keyword analysis" and st.button("Run"):
            kw_list = analyze_keywords(art["content"])
            df = pd.DataFrame(kw_list, columns=["word", "count"])
            st.bar_chart(df.set_index("word"))
            st.write("Top keywords:")
            for w, c in kw_list:
                st.write(f"- {w}: {c}")
            # Word cloud
            wc_data = extract_for_wordcloud(art["content"])
            wc = generate_wordcloud(wc_data)
            if wc:
                fig, ax = plt.subplots(figsize=(8, 4))
                ax.imshow(wc, interpolation="bilinear")
                ax.axis("off")
                st.pyplot(fig)
        if mode == "Text statistics" and st.button("Run"):
            txt = art["content"]
            wcnt = len(re.findall(r"\w+", txt))
            scnt = len(re.split(r"[.!?]+", txt))
            st.metric("Word count", wcnt)
            st.metric("Sentence count", scnt)
elif menu == "Generate New Article":
    st.header("Generate New Article")
    arts = load_saved_articles()
    if not arts:
        st.warning("Please collect articles first.")
    else:
        sel = st.selectbox("Select the source article", [a["title"] for a in arts])
        art = next(a for a in arts if a["title"] == sel)
        st.write(art["content"][:200] + "…")
        prompt = st.text_area("Writing instructions", "Rewrite this as a fresh news article.")
        gen_img = st.checkbox("Also generate an image", value=True)
        if st.button("Generate"):
            new = generate_article(art["content"], prompt)
            st.subheader("Generated article")
            st.write(new)
            if gen_img:
                url = generate_image(f"Article title: {art['title']}\n\n{prompt}")
                if url:
                    st.image(url)
elif menu == "Schedule News Crawls":
    st.header("Schedule News Crawls")
    tab1, tab2, tab3 = st.tabs(["Daily schedule", "Interval schedule", "Status"])
    # Daily
    with tab1:
        dkw = st.text_input("Keyword (daily)", "인공지능", key="dk")
        dnum = st.number_input("Number of articles", 1, 20, 5, key="dn")
        dhh = st.number_input("Hour", 0, 23, 9, key="dh")
        dmm = st.number_input("Minute", 0, 59, 0, key="dm")
        if st.button("Add", key="addd"):
            st.session_state.setdefault("daily", []).append({
                "keyword": dkw, "num_articles": dnum,
                "hour": dhh, "minute": dmm
            })
        if st.session_state.get("daily"):
            st.write(st.session_state["daily"])
    # Interval
    with tab2:
        ikw = st.text_input("Keyword (interval)", "빅데이터", key="ik")
        inum = st.number_input("Number of articles", 1, 20, 5, key="in")
        inter = st.number_input("Interval (minutes)", 1, 1440, 60, key="ii")
        imm = st.checkbox("Run immediately", True, key="im")
        if st.button("Add", key="addi"):
            st.session_state.setdefault("interval", []).append({
                "keyword": ikw, "num_articles": inum,
                "interval": inter, "immediate": imm
            })
        if st.session_state.get("interval"):
            st.write(st.session_state["interval"])
    # Status
    with tab3:
        if not global_scheduler.is_running and st.button("Start"):
            start_scheduler(st.session_state.get("daily", []),
                            st.session_state.get("interval", []))
        if global_scheduler.is_running and st.button("Stop"):
            stop_scheduler()
        st.write("Running:", global_scheduler.is_running)
        st.write("Last run:", global_scheduler.last_run)
        st.write("Next run:", global_scheduler.next_run)
        st.write("Jobs:", global_scheduler.jobs)
        st.dataframe(pd.DataFrame(global_scheduler.results))
# ─── Footer ──────────────────────────────────────────────────────────────────
st.markdown("---")
st.markdown("© 2025 News Tool @conanssam")