# Hugging Face Space — running on CPU Upgrade.
import gradio as gr | |
import requests | |
import json | |
import os | |
from datetime import datetime, timedelta | |
from concurrent.futures import ThreadPoolExecutor | |
from functools import lru_cache | |
from requests.adapters import HTTPAdapter | |
from requests.packages.urllib3.util.retry import Retry | |
from openai import OpenAI | |
from bs4 import BeautifulSoup | |
import re # re ๋ชจ๋ ์ถ๊ฐ | |
# Hugging Face token used as the API key for the inference client below.
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if not ACCESS_TOKEN:
    # Fail fast at import time instead of on the first chat-completion call.
    raise ValueError("HF_TOKEN environment variable is not set")
# OpenAI-compatible client pointed at the Hugging Face inference endpoint.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
MAX_COUNTRY_RESULTS = 100  # Maximum number of results per country
MAX_GLOBAL_RESULTS = 1000  # Maximum number of results for the worldwide search
def create_article_components(max_results):
    """Build ``max_results`` hidden article cards in the active Gradio context.

    Each card is a ``gr.Group`` (initially invisible) holding a title, an
    image, a snippet and an info line.

    Returns:
        A list of dicts with the keys ``group``, ``title``, ``image``,
        ``snippet``, ``info`` and ``index`` (the card's position).
    """
    def _build_card(position):
        # Components created inside the ``with`` become children of the group.
        with gr.Group(visible=False) as card:
            heading = gr.Markdown()
            picture = gr.Image(width=200, height=150)
            summary = gr.Markdown()
            meta = gr.Markdown()
        return {
            'group': card,
            'title': heading,
            'image': picture,
            'snippet': summary,
            'info': meta,
            'index': position,
        }

    return [_build_card(position) for position in range(max_results)]
# SerpHouse API key; search_serphouse sends it as the Bearer token.
# NOTE(review): unlike HF_TOKEN, a missing key is not rejected at startup — confirm this is intended.
API_KEY = os.getenv("SERPHOUSE_API_KEY")
# Country name -> language code used for query translation and the search "lang" field.
# "Papua New Guinea" is included because the Southeast Asia/Oceania region
# searches it; without an entry here it silently falls back to "en" defaults.
COUNTRY_LANGUAGES = {
    "United States": "en",
    "KOREA": "ko",
    "United Kingdom": "en",
    "Taiwan": "zh-TW",
    "Canada": "en",
    "Australia": "en",
    "Germany": "de",
    "France": "fr",
    "Japan": "ja",
    "China": "zh",
    "India": "hi",
    "Brazil": "pt",
    "Mexico": "es",
    "Russia": "ru",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Singapore": "en",
    "Hong Kong": "zh-HK",
    "Indonesia": "id",
    "Malaysia": "ms",
    "Philippines": "tl",
    "Thailand": "th",
    "Vietnam": "vi",
    "Belgium": "nl",
    "Denmark": "da",
    "Finland": "fi",
    "Ireland": "en",
    "Norway": "no",
    "Poland": "pl",
    "Sweden": "sv",
    "Switzerland": "de",
    "Austria": "de",
    "Czech Republic": "cs",
    "Greece": "el",
    "Hungary": "hu",
    "Portugal": "pt",
    "Romania": "ro",
    "Turkey": "tr",
    "Israel": "he",
    "Saudi Arabia": "ar",
    "United Arab Emirates": "ar",
    "South Africa": "en",
    "Argentina": "es",
    "Chile": "es",
    "Colombia": "es",
    "Peru": "es",
    "Venezuela": "es",
    "New Zealand": "en",
    "Bangladesh": "bn",
    "Pakistan": "ur",
    "Egypt": "ar",
    "Morocco": "ar",
    "Nigeria": "en",
    "Kenya": "sw",
    "Ukraine": "uk",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Bulgaria": "bg",
    "Serbia": "sr",
    "Estonia": "et",
    "Latvia": "lv",
    "Lithuania": "lt",
    "Slovenia": "sl",
    "Luxembourg": "fr",
    "Malta": "mt",
    "Cyprus": "el",
    "Iceland": "is",
    "Papua New Guinea": "en",
}
# Country name -> value sent as the search "loc" field; the keys also populate
# the country dropdown. "Papua New Guinea" is included because the Southeast
# Asia/Oceania region searches it; without an entry here that search is sent
# with the "United States" fallback location.
COUNTRY_LOCATIONS = {
    "United States": "United States",
    "KOREA": "kr",
    "United Kingdom": "United Kingdom",
    "Taiwan": "Taiwan",
    "Canada": "Canada",
    "Australia": "Australia",
    "Germany": "Germany",
    "France": "France",
    "Japan": "Japan",
    "China": "China",
    "India": "India",
    "Brazil": "Brazil",
    "Mexico": "Mexico",
    "Russia": "Russia",
    "Italy": "Italy",
    "Spain": "Spain",
    "Netherlands": "Netherlands",
    "Singapore": "Singapore",
    "Hong Kong": "Hong Kong",
    "Indonesia": "Indonesia",
    "Malaysia": "Malaysia",
    "Philippines": "Philippines",
    "Thailand": "Thailand",
    "Vietnam": "Vietnam",
    "Belgium": "Belgium",
    "Denmark": "Denmark",
    "Finland": "Finland",
    "Ireland": "Ireland",
    "Norway": "Norway",
    "Poland": "Poland",
    "Sweden": "Sweden",
    "Switzerland": "Switzerland",
    "Austria": "Austria",
    "Czech Republic": "Czech Republic",
    "Greece": "Greece",
    "Hungary": "Hungary",
    "Portugal": "Portugal",
    "Romania": "Romania",
    "Turkey": "Turkey",
    "Israel": "Israel",
    "Saudi Arabia": "Saudi Arabia",
    "United Arab Emirates": "United Arab Emirates",
    "South Africa": "South Africa",
    "Argentina": "Argentina",
    "Chile": "Chile",
    "Colombia": "Colombia",
    "Peru": "Peru",
    "Venezuela": "Venezuela",
    "New Zealand": "New Zealand",
    "Bangladesh": "Bangladesh",
    "Pakistan": "Pakistan",
    "Egypt": "Egypt",
    "Morocco": "Morocco",
    "Nigeria": "Nigeria",
    "Kenya": "Kenya",
    "Ukraine": "Ukraine",
    "Croatia": "Croatia",
    "Slovakia": "Slovakia",
    "Bulgaria": "Bulgaria",
    "Serbia": "Serbia",
    "Estonia": "Estonia",
    "Latvia": "Latvia",
    "Lithuania": "Lithuania",
    "Slovenia": "Slovenia",
    "Luxembourg": "Luxembourg",
    "Malta": "Malta",
    "Cyprus": "Cyprus",
    "Iceland": "Iceland",
    "Papua New Guinea": "Papua New Guinea",
}
# Region definitions
# East Asia region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_EAST_ASIA = {
    "KOREA": "ko",
    "Taiwan": "zh-TW",
    "Japan": "ja",
    "China": "zh",
    "Hong Kong": "zh-HK"
}
# NOTE(review): "KOREA" maps to "KOREA" here but to "kr" in COUNTRY_LOCATIONS — confirm which is intended.
COUNTRY_LOCATIONS_EAST_ASIA = {
    "KOREA": "KOREA",
    "Taiwan": "Taiwan",
    "Japan": "Japan",
    "China": "China",
    "Hong Kong": "Hong Kong"
}
# Southeast Asia / Oceania region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_SOUTHEAST_ASIA_OCEANIA = {
    "Indonesia": "id",
    "Malaysia": "ms",
    "Philippines": "tl",
    "Thailand": "th",
    "Vietnam": "vi",
    "Singapore": "en",
    "Papua New Guinea": "en",
    "Australia": "en",
    "New Zealand": "en"
}
COUNTRY_LOCATIONS_SOUTHEAST_ASIA_OCEANIA = {
    "Indonesia": "Indonesia",
    "Malaysia": "Malaysia",
    "Philippines": "Philippines",
    "Thailand": "Thailand",
    "Vietnam": "Vietnam",
    "Singapore": "Singapore",
    "Papua New Guinea": "Papua New Guinea",
    "Australia": "Australia",
    "New Zealand": "New Zealand"
}
# Eastern Europe region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_EAST_EUROPE = {
    "Poland": "pl",
    "Czech Republic": "cs",
    "Greece": "el",
    "Hungary": "hu",
    "Romania": "ro",
    "Ukraine": "uk",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Bulgaria": "bg",
    "Serbia": "sr",
    "Estonia": "et",
    "Latvia": "lv",
    "Lithuania": "lt",
    "Slovenia": "sl",
    "Malta": "mt",
    "Cyprus": "el",
    "Iceland": "is",
    "Russia": "ru"
}
COUNTRY_LOCATIONS_EAST_EUROPE = {
    "Poland": "Poland",
    "Czech Republic": "Czech Republic",
    "Greece": "Greece",
    "Hungary": "Hungary",
    "Romania": "Romania",
    "Ukraine": "Ukraine",
    "Croatia": "Croatia",
    "Slovakia": "Slovakia",
    "Bulgaria": "Bulgaria",
    "Serbia": "Serbia",
    "Estonia": "Estonia",
    "Latvia": "Latvia",
    "Lithuania": "Lithuania",
    "Slovenia": "Slovenia",
    "Malta": "Malta",
    "Cyprus": "Cyprus",
    "Iceland": "Iceland",
    "Russia": "Russia"
}
# Western Europe region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_WEST_EUROPE = {
    "Germany": "de",
    "France": "fr",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Belgium": "nl",
    "Ireland": "en",
    "Sweden": "sv",
    "Switzerland": "de",
    "Austria": "de",
    "Portugal": "pt",
    "Luxembourg": "fr",
    "United Kingdom": "en"
}
COUNTRY_LOCATIONS_WEST_EUROPE = {
    "Germany": "Germany",
    "France": "France",
    "Italy": "Italy",
    "Spain": "Spain",
    "Netherlands": "Netherlands",
    "Belgium": "Belgium",
    "Ireland": "Ireland",
    "Sweden": "Sweden",
    "Switzerland": "Switzerland",
    "Austria": "Austria",
    "Portugal": "Portugal",
    "Luxembourg": "Luxembourg",
    "United Kingdom": "United Kingdom"
}
# Middle East / Africa region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_ARAB_AFRICA = {
    "South Africa": "en",
    "Nigeria": "en",
    "Kenya": "sw",
    "Egypt": "ar",
    "Morocco": "ar",
    "Saudi Arabia": "ar",
    "United Arab Emirates": "ar",
    "Israel": "he"
}
COUNTRY_LOCATIONS_ARAB_AFRICA = {
    "South Africa": "South Africa",
    "Nigeria": "Nigeria",
    "Kenya": "Kenya",
    "Egypt": "Egypt",
    "Morocco": "Morocco",
    "Saudi Arabia": "Saudi Arabia",
    "United Arab Emirates": "United Arab Emirates",
    "Israel": "Israel"
}
# Americas region: country -> language code, and country -> location value.
COUNTRY_LANGUAGES_AMERICA = {
    "United States": "en",
    "Canada": "en",
    "Mexico": "es",
    "Brazil": "pt",
    "Argentina": "es",
    "Chile": "es",
    "Colombia": "es",
    "Peru": "es",
    "Venezuela": "es"
}
COUNTRY_LOCATIONS_AMERICA = {
    "United States": "United States",
    "Canada": "Canada",
    "Mexico": "Mexico",
    "Brazil": "Brazil",
    "Argentina": "Argentina",
    "Chile": "Chile",
    "Colombia": "Colombia",
    "Peru": "Peru",
    "Venezuela": "Venezuela"
}
# Region selection list: the choices of the region dropdown. Each entry must
# match, character for character, a branch in get_region_countries.
REGIONS = [
    "๋์์์",
    "๋๋จ์์์/์ค์ธ์๋์",
    "๋์ ๋ฝ",
    "์์ ๋ฝ",
    "์ค๋/์ํ๋ฆฌ์นด",
    "์๋ฉ๋ฆฌ์นด"
]
def translate_query(query, country):
    """Translate a non-English query into the language mapped to *country*.

    The query is returned unchanged when it is empty, when ``is_english``
    accepts it, when *country* has no entry in ``COUNTRY_LANGUAGES``, or when
    the translation request fails for any reason (best effort).

    Args:
        query: Search text entered by the user.
        country: Country name used as a key into ``COUNTRY_LANGUAGES``.

    Returns:
        The translated query, or the original *query* as a fallback.
    """
    if not query:
        # Nothing to translate; skip the network round trip entirely.
        return query
    try:
        if is_english(query):
            return query
        if country not in COUNTRY_LANGUAGES:
            return query
        target_lang = COUNTRY_LANGUAGES[country]
        url = "https://translate.googleapis.com/translate_a/single"
        params = {
            "client": "gtx",
            "sl": "auto",
            "tl": target_lang,
            "dt": "t",
            "q": query
        }
        # The context manager releases pooled connections once the call is done.
        with requests.Session() as session:
            retries = Retry(total=3, backoff_factor=0.5)
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(url, params=params, timeout=(5, 10))
            response.raise_for_status()
        # The first element of the payload is treated as a list of segments
        # whose first item is the translated text; join all of them so longer
        # inputs are not cut off after the first segment.
        segments = response.json()[0] or []
        translated_text = "".join(
            segment[0] for segment in segments if segment and segment[0]
        )
        return translated_text or query
    except Exception as e:
        # Deliberate best effort: a failed translation must not block the search.
        print(f"๋ฒ์ญ ์ค๋ฅ: {str(e)}")
        return query
def translate_to_korean(text):
    """Translate *text* into Korean, falling back to the input on any failure.

    Args:
        text: Text to translate (source language is auto-detected by the
            endpoint via ``sl=auto``).

    Returns:
        The Korean translation, or *text* unchanged when it is empty or the
        request/parsing fails.
    """
    if not text:
        # Empty snippet: nothing to translate and no reason to hit the network.
        return text
    try:
        url = "https://translate.googleapis.com/translate_a/single"
        params = {
            "client": "gtx",
            "sl": "auto",
            "tl": "ko",
            "dt": "t",
            "q": text
        }
        # The context manager releases pooled connections once the call is done.
        with requests.Session() as session:
            retries = Retry(total=3, backoff_factor=0.5)
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(url, params=params, timeout=(5, 10))
            response.raise_for_status()
        # Join every segment: taking only the first one drops everything after
        # the first translated chunk of a multi-sentence snippet.
        segments = response.json()[0] or []
        translated_text = "".join(
            segment[0] for segment in segments if segment and segment[0]
        )
        return translated_text or text
    except Exception as e:
        # Deliberate best effort: show the untranslated text rather than fail.
        print(f"ํ๊ธ ๋ฒ์ญ ์ค๋ฅ: {str(e)}")
        return text
def is_english(text):
    """Return True when every character of *text* is ASCII (code point < 128).

    Spaces, hyphens and underscores are ASCII themselves, so they never affect
    the outcome. An empty string counts as English.
    """
    return text.isascii()
def is_korean(text):
    """Return True when *text* contains at least one character in U+AC00..U+D7A3."""
    for symbol in text:
        if 0xAC00 <= ord(symbol) <= 0xD7A3:
            return True
    return False
def search_serphouse(query, country, page=1, num_result=10):
    """Run a news search against the SerpHouse live endpoint for the last day.

    The query is first passed through ``translate_query`` for *country*; the
    request covers yesterday through today (UTC dates), sorted by date.

    Args:
        query: Search text entered by the user.
        country: Country name; looked up in ``COUNTRY_LOCATIONS`` and
            ``COUNTRY_LANGUAGES`` (falling back to "United States" / "en").
        page: Result page to request (forwarded to the API).
        num_result: Accepted for interface compatibility.
            NOTE(review): it is not forwarded — the request always asks for
            100 results, which existing callers rely on.

    Returns:
        ``{"results": <decoded JSON>, "translated_query": <str>}`` on success,
        or ``{"error": <message>, "translated_query": query}`` on failure.
    """
    url = "https://api.serphouse.com/serp/live"
    now = datetime.utcnow()
    yesterday = now - timedelta(days=1)
    date_range = f"{yesterday.strftime('%Y-%m-%d')},{now.strftime('%Y-%m-%d')}"
    translated_query = translate_query(query, country)
    payload = {
        "data": {
            "q": translated_query,
            "domain": "google.com",
            "loc": COUNTRY_LOCATIONS.get(country, "United States"),
            "lang": COUNTRY_LANGUAGES.get(country, "en"),
            "device": "desktop",
            "serp_type": "news",
            "page": str(page),  # honour the caller's page instead of a fixed "1"
            "num": "100",
            "date_range": date_range,
            "sort_by": "date"
        }
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {API_KEY}"
    }
    try:
        # Retry policy: up to 5 attempts with backoff on the listed HTTP
        # status codes; POST must be allowed explicitly to be retried.
        retries = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504, 429],
            allowed_methods=["POST"]
        )
        adapter = HTTPAdapter(max_retries=retries)
        # The context manager closes the session's connection pool afterwards.
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.post(
                url,
                json=payload,
                headers=headers,
                timeout=(30, 30)  # connect timeout 30 s, read timeout 30 s
            )
            response.raise_for_status()
            return {"results": response.json(), "translated_query": translated_query}
    except requests.exceptions.Timeout:
        return {
            "error": "๊ฒ์ ์๊ฐ์ด ์ด๊ณผ๋์์ต๋๋ค. ์ ์ ํ ๋ค์ ์๋ํด์ฃผ์ธ์.",
            "translated_query": query
        }
    except requests.exceptions.RequestException as e:
        return {
            "error": f"๊ฒ์ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}",
            "translated_query": query
        }
    except Exception as e:
        return {
            "error": f"์๊ธฐ์น ์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}",
            "translated_query": query
        }
def format_results_from_raw(response_data):
    """Turn a ``search_serphouse`` result into ``(error_message, articles)``.

    Articles whose URL/channel contains a Korean-domain marker, or whose title
    contains a Korea-related keyword, are dropped. Matching is done on
    lower-cased copies; the stored link keeps its original case because URL
    paths and query strings can be case-sensitive.

    Args:
        response_data: Dict with either an ``"error"`` key, or ``"results"``
            (decoded API JSON) plus ``"translated_query"``.

    Returns:
        A tuple ``(message, articles)``. ``message`` is ``""`` on success;
        ``articles`` is a list of dicts with the keys ``index``, ``title``,
        ``link``, ``snippet``, ``channel``, ``time``, ``image_url`` and
        ``translated_query``. On error or no results the list is empty.
    """
    if "error" in response_data:
        return "Error: " + response_data["error"], []
    try:
        results = response_data["results"]
        translated_query = response_data["translated_query"]
        news_results = results.get('results', {}).get('results', {}).get('news', [])
        if not news_results:
            return "๊ฒ์ ๊ฒฐ๊ณผ๊ฐ ์์ต๋๋ค.", []
        # Korean domains and Korea-related keywords used for filtering.
        korean_domains = ('.kr', 'korea', 'korean', 'yonhap', 'hankyung', 'chosun',
                          'donga', 'joins', 'hani', 'koreatimes', 'koreaherald')
        korean_keywords = ('korea', 'korean', 'seoul', 'busan', 'incheon', 'daegu',
                           'gwangju', 'daejeon', 'ulsan', 'sejong')
        filtered_articles = []
        for idx, result in enumerate(news_results, 1):
            # ``or`` chains also cover keys that are present but None/empty,
            # so one malformed entry cannot abort the whole batch.
            link = result.get("url") or result.get("link") or ""
            channel = result.get("channel") or result.get("source") or ""
            title = result.get("title") or ""
            link_lc = link.lower()
            channel_lc = channel.lower()
            title_lc = title.lower()
            is_korean_content = (
                any(domain in link_lc or domain in channel_lc for domain in korean_domains)
                or any(keyword in title_lc for keyword in korean_keywords)
            )
            if is_korean_content:
                continue
            filtered_articles.append({
                "index": idx,
                "title": title or "์ ๋ชฉ ์์",
                "link": link,  # original case preserved
                "snippet": result.get("snippet") or "๋ด์ฉ ์์",
                "channel": channel or "์ ์ ์์",
                "time": result.get("time") or result.get("date") or "์ ์ ์๋ ์๊ฐ",
                "image_url": result.get("img") or result.get("thumbnail") or "",
                "translated_query": translated_query
            })
        return "", filtered_articles
    except Exception as e:
        return f"๊ฒฐ๊ณผ ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}", []
def serphouse_search(query, country):
    """Search news for *query* in *country* and return ``(error_message, articles)``.

    Chains ``search_serphouse`` into ``format_results_from_raw``.
    """
    return format_results_from_raw(search_serphouse(query, country))
def search_and_display(query, country, articles_state, progress=gr.Progress()):
    """Search one country and build the Gradio updates for the country tab.

    Steps: translate the query (for display), run the search, filter the
    results, translate each snippet to Korean on a thread pool, then emit one
    update per output component.

    Args:
        query: Search text entered by the user.
        country: Country selected in the dropdown.
        articles_state: Previous article list held in ``gr.State``; replaced
            by the new list (or ``[]`` on error).
        progress: Gradio progress tracker.

    Returns:
        A list of updates: status message, translated-query display, error
        display, five updates per entry of ``article_components``, and finally
        the new articles state.
    """
    with ThreadPoolExecutor(max_workers=3) as executor:
        progress(0, desc="๊ฒ์์ด ๋ฒ์ญ ์ค...")
        future_translation = executor.submit(translate_query, query, country)
        translated_query = future_translation.result()
        translated_display = f"**์๋ณธ ๊ฒ์์ด:** {query}\n**๋ฒ์ญ๋ ๊ฒ์์ด:** {translated_query}" if translated_query != query else f"**๊ฒ์์ด:** {query}"
        progress(0.3, desc="๊ฒ์ ์ค...")
        response_data = search_serphouse(query, country)
        progress(0.6, desc="๊ฒฐ๊ณผ ์ฒ๋ฆฌ ์ค...")
        error_message, articles = format_results_from_raw(response_data)
        outputs = []
        outputs.append(gr.update(value="๊ฒ์์ ์งํ์ค์ ๋๋ค...", visible=True))
        outputs.append(gr.update(value=translated_display, visible=True))
        if error_message:
            outputs.append(gr.update(value=error_message, visible=True))
            # Hide every article card when the search failed.
            for comp in article_components:
                outputs.extend([
                    gr.update(visible=False), gr.update(), gr.update(),
                    gr.update(), gr.update()
                ])
            articles_state = []
        else:
            outputs.append(gr.update(value="", visible=False))
            if articles:
                # Fan the snippet translations out to the pool, then collect
                # them in submission order.
                futures = []
                for article in articles:
                    future = executor.submit(translate_to_korean, article['snippet'])
                    futures.append((article, future))
                progress(0.8, desc="๋ฒ์ญ ์ฒ๋ฆฌ ์ค...")
                for article, future in futures:
                    article['korean_summary'] = future.result()
            total_articles = len(articles)
            for idx, comp in enumerate(article_components):
                if idx < total_articles:
                    # Progress is reported only for real articles: this keeps
                    # the fraction within 0..1 and avoids dividing by zero
                    # when every result was filtered out.
                    progress((idx + 1) / total_articles, desc=f"๊ฒฐ๊ณผ ํ์ ์ค... {idx + 1}/{total_articles}")
                    article = articles[idx]
                    image_url = article['image_url']
                    # Inline data: URIs are not displayed.
                    image_update = gr.update(value=image_url, visible=True) if image_url and not image_url.startswith('data:image') else gr.update(value=None, visible=False)
                    outputs.extend([
                        gr.update(visible=True),
                        gr.update(value=f"### [{article['title']}]({article['link']})"),
                        image_update,
                        gr.update(value=f"**์์ฝ:** {article['snippet']}\n\n**ํ๊ธ ์์ฝ:** {article['korean_summary']}"),
                        gr.update(value=f"**์ถ์ฒ:** {article['channel']} | **์๊ฐ:** {article['time']}")
                    ])
                else:
                    outputs.extend([
                        gr.update(visible=False), gr.update(), gr.update(),
                        gr.update(), gr.update()
                    ])
            articles_state = articles
        progress(1.0, desc="์๋ฃ!")
        outputs.append(articles_state)
        # Clear the "searching..." status now that everything is rendered.
        outputs[0] = gr.update(value="", visible=False)
        return outputs
def get_region_countries(region):
    """Return the ``(locations, languages)`` tables for the selected region.

    Unknown regions yield two empty dicts.
    """
    region_tables = (
        ("๋์์์", COUNTRY_LOCATIONS_EAST_ASIA, COUNTRY_LANGUAGES_EAST_ASIA),
        ("๋๋จ์์์/์ค์ธ์๋์", COUNTRY_LOCATIONS_SOUTHEAST_ASIA_OCEANIA, COUNTRY_LANGUAGES_SOUTHEAST_ASIA_OCEANIA),
        ("๋์ ๋ฝ", COUNTRY_LOCATIONS_EAST_EUROPE, COUNTRY_LANGUAGES_EAST_EUROPE),
        ("์์ ๋ฝ", COUNTRY_LOCATIONS_WEST_EUROPE, COUNTRY_LANGUAGES_WEST_EUROPE),
        ("์ค๋/์ํ๋ฆฌ์นด", COUNTRY_LOCATIONS_ARAB_AFRICA, COUNTRY_LANGUAGES_ARAB_AFRICA),
        ("์๋ฉ๋ฆฌ์นด", COUNTRY_LOCATIONS_AMERICA, COUNTRY_LANGUAGES_AMERICA),
    )
    for label, locations, languages in region_tables:
        if region == label:
            return locations, languages
    return {}, {}
def search_global(query, region, articles_state_global):
    """Search every country of *region* and stream Gradio updates as results arrive.

    This is a generator: it yields a full list of output updates before the
    first search, before each country search, after each country that
    produced new articles, and once at the end.

    Args:
        query: Search text entered by the user.
        region: Region label selected in the dropdown.
        articles_state_global: Previous state value (not read here).

    Yields:
        Lists of updates: status message, query display, five updates per
        entry of ``global_article_components``, then the current article list.
    """
    status_msg = f"{region} ์ง์ญ ๊ฒ์์ ์์ํฉ๋๋ค..."
    all_results = []
    # Defined up front so the final status works even when no country
    # returned any article (or the region is unknown).
    unique_results = []
    outputs = [
        gr.update(value=status_msg, visible=True),
        gr.update(value=f"**๊ฒ์์ด:** {query}", visible=True),
    ]
    for _ in global_article_components:
        outputs.extend([
            gr.update(visible=False), gr.update(), gr.update(),
            gr.update(), gr.update()
        ])
    outputs.append([])
    yield outputs
    # Fetch the countries of the selected region (only the names are needed).
    locations, _languages = get_region_countries(region)
    total_countries = len(locations)
    for idx, country in enumerate(locations, 1):
        try:
            status_msg = f"{region} - {country} ๊ฒ์ ์ค... ({idx}/{total_countries} ๊ตญ๊ฐ)"
            outputs[0] = gr.update(value=status_msg, visible=True)
            yield outputs
            error_message, articles = serphouse_search(query, country)
            if error_message or not articles:
                continue
            for article in articles:
                article['source_country'] = country
                article['region'] = region
            all_results.extend(articles)
            sorted_results = sorted(all_results, key=lambda x: x.get('time', ''), reverse=True)
            # De-duplicate by link, keeping the first occurrence in sort order.
            seen_urls = set()
            deduped = []
            for article in sorted_results:
                url = article.get('link', '')
                if url not in seen_urls:
                    seen_urls.add(url)
                    deduped.append(article)
            unique_results = deduped[:MAX_GLOBAL_RESULTS]
            # Build into a fresh list and publish it only once complete, so an
            # exception midway cannot leave a truncated ``outputs`` behind.
            new_outputs = [
                gr.update(value=f"{region} - {idx}/{total_countries} ๊ตญ๊ฐ ๊ฒ์ ์๋ฃ\nํ์ฌ๊น์ง ๋ฐ๊ฒฌ๋ ๋ด์ค: {len(unique_results)}๊ฑด", visible=True),
                gr.update(value=f"**๊ฒ์์ด:** {query} | **์ง์ญ:** {region}", visible=True),
            ]
            for slot, _comp in enumerate(global_article_components):
                if slot < len(unique_results):
                    article = unique_results[slot]
                    image_url = article.get('image_url', '')
                    image_update = gr.update(value=image_url, visible=True) if image_url and not image_url.startswith('data:image') else gr.update(value=None, visible=False)
                    # Translate each snippet once and reuse it on later passes
                    # instead of re-translating every article per country.
                    if 'korean_summary' not in article:
                        article['korean_summary'] = translate_to_korean(article['snippet'])
                    korean_summary = article['korean_summary']
                    new_outputs.extend([
                        gr.update(visible=True),
                        gr.update(value=f"### [{article['title']}]({article['link']})"),
                        image_update,
                        gr.update(value=f"**์์ฝ:** {article['snippet']}\n\n**ํ๊ธ ์์ฝ:** {korean_summary}"),
                        gr.update(value=f"**์ถ์ฒ:** {article['channel']} | **๊ตญ๊ฐ:** {article['source_country']} | **์ง์ญ:** {article['region']} | **์๊ฐ:** {article['time']}")
                    ])
                else:
                    new_outputs.extend([
                        gr.update(visible=False),
                        gr.update(),
                        gr.update(),
                        gr.update(),
                        gr.update()
                    ])
            new_outputs.append(unique_results)
            outputs = new_outputs
            yield outputs
        except Exception as e:
            # One failing country must not abort the remaining searches.
            print(f"Error searching {country}: {str(e)}")
            continue
    final_status = f"{region} ๊ฒ์ ์๋ฃ! ์ด {len(unique_results)}๊ฐ์ ๋ด์ค๊ฐ ๋ฐ๊ฒฌ๋์์ต๋๋ค."
    outputs[0] = gr.update(value=final_status, visible=True)
    yield outputs
# Custom stylesheet passed to gr.Blocks(css=...). It styles the status and
# results areas, tabs, article groups, buttons, text inputs, a fixed progress
# bar with a glow animation, and the examples table.
# NOTE(review): `.group` is declared twice; the later rule sets opacity: 0 and
# only `.group.visible` restores it — confirm article groups actually receive
# the `visible` class.
css = """
/* ์ ์ญ ์คํ์ผ */
footer {visibility: hidden;}
/* ๋ ์ด์์ ์ปจํ ์ด๋ */
#status_area {
background: rgba(255, 255, 255, 0.9);
padding: 15px;
border-bottom: 1px solid #ddd;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
#results_area {
padding: 10px;
margin-top: 10px;
}
/* ํญ ์คํ์ผ */
.tabs {
border-bottom: 2px solid #ddd !important;
margin-bottom: 20px !important;
}
.tab-nav {
border-bottom: none !important;
margin-bottom: 0 !important;
}
.tab-nav button {
font-weight: bold !important;
padding: 10px 20px !important;
}
.tab-nav button.selected {
border-bottom: 2px solid #1f77b4 !important;
color: #1f77b4 !important;
}
/* ์ํ ๋ฉ์์ง */
#status_area .markdown-text {
font-size: 1.1em;
color: #2c3e50;
padding: 10px 0;
}
/* ๊ธฐ๋ณธ ์ปจํ ์ด๋ */
.group {
border: 1px solid #eee;
padding: 15px;
margin-bottom: 15px;
border-radius: 5px;
background: white;
}
/* ๋ฒํผ ์คํ์ผ */
.primary-btn {
background: #1f77b4 !important;
border: none !important;
}
/* ์ ๋ ฅ ํ๋ */
.textbox {
border: 1px solid #ddd !important;
border-radius: 4px !important;
}
/* ํ๋ก๊ทธ๋ ์ค๋ฐ ์ปจํ ์ด๋ */
.progress-container {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 6px;
background: #e0e0e0;
z-index: 1000;
}
/* ํ๋ก๊ทธ๋ ์ค๋ฐ */
.progress-bar {
height: 100%;
background: linear-gradient(90deg, #2196F3, #00BCD4);
box-shadow: 0 0 10px rgba(33, 150, 243, 0.5);
transition: width 0.3s ease;
animation: progress-glow 1.5s ease-in-out infinite;
}
/* ํ๋ก๊ทธ๋ ์ค ํ ์คํธ */
.progress-text {
position: fixed;
top: 8px;
left: 50%;
transform: translateX(-50%);
background: #333;
color: white;
padding: 4px 12px;
border-radius: 15px;
font-size: 14px;
z-index: 1001;
box-shadow: 0 2px 5px rgba(0,0,0,0.2);
}
/* ํ๋ก๊ทธ๋ ์ค๋ฐ ์ ๋๋ฉ์ด์ */
@keyframes progress-glow {
0% {
box-shadow: 0 0 5px rgba(33, 150, 243, 0.5);
}
50% {
box-shadow: 0 0 20px rgba(33, 150, 243, 0.8);
}
100% {
box-shadow: 0 0 5px rgba(33, 150, 243, 0.5);
}
}
/* ๋ฐ์ํ ๋์์ธ */
@media (max-width: 768px) {
.group {
padding: 10px;
margin-bottom: 15px;
}
.progress-text {
font-size: 12px;
padding: 3px 10px;
}
}
/* ๋ก๋ฉ ์ํ ํ์ ๊ฐ์ */
.loading {
opacity: 0.7;
pointer-events: none;
transition: opacity 0.3s ease;
}
/* ๊ฒฐ๊ณผ ์ปจํ ์ด๋ ์ ๋๋ฉ์ด์ */
.group {
transition: all 0.3s ease;
opacity: 0;
transform: translateY(20px);
}
.group.visible {
opacity: 1;
transform: translateY(0);
}
/* Examples ์คํ์ผ๋ง */
.examples-table {
margin-top: 10px !important;
margin-bottom: 20px !important;
}
.examples-table button {
background-color: #f0f0f0 !important;
border: 1px solid #ddd !important;
border-radius: 4px !important;
padding: 5px 10px !important;
margin: 2px !important;
transition: all 0.3s ease !important;
}
.examples-table button:hover {
background-color: #e0e0e0 !important;
transform: translateY(-1px) !important;
box-shadow: 0 2px 5px rgba(0,0,0,0.1) !important;
}
.examples-table .label {
font-weight: bold !important;
color: #444 !important;
margin-bottom: 5px !important;
}
"""
def get_article_content(url):
    """Download a news page and extract its title, description and body text.

    Args:
        url: Address of the article to fetch.

    Returns:
        A string of the form ``"Title: ...\\n\\nDescription: ...\\n\\nContent:\\n..."``
        with runs of spaces/tabs collapsed and blank lines normalised, or
        ``"Error crawling content: <reason>"`` when anything fails.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # The context manager releases pooled connections once the page is read.
        with requests.Session() as session:
            retries = Retry(total=3, backoff_factor=0.5)
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Metadata extraction. Prefer og:title; a <title> element has no
        # "content" attribute, so its text must be read instead.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            title = og_title.get('content', '')
        else:
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else ''
        description = soup.find('meta', property='og:description') or soup.find('meta', {'name': 'description'})
        description = description.get('content', '') if description else ''
        # Body extraction: try common article containers first.
        article_content = ''
        content_selectors = [
            'article', '.article-body', '.article-content', '#article-body',
            '.story-body', '.post-content', '.entry-content', '.content-body',
            '[itemprop="articleBody"]', '.story-content'
        ]
        for selector in content_selectors:
            content = soup.select_one(selector)
            if not content:
                continue
            # Remove elements that are not part of the article text.
            for tag in content.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                tag.decompose()
            # Collect paragraphs and headings.
            paragraphs = content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            texts = [p.get_text().strip() for p in paragraphs]
            article_content = '\n\n'.join(text for text in texts if text)
            if article_content:
                break
        # Fallback: take every reasonably long paragraph on the page.
        if not article_content:
            texts = [p.get_text().strip() for p in soup.find_all('p')]
            article_content = '\n\n'.join(text for text in texts if len(text) > 50)
        # Assemble the final text.
        full_content = f"Title: {title}\n\nDescription: {description}\n\nContent:\n{article_content}"
        # Clean-up: collapse horizontal whitespace only. Collapsing "\s+" would
        # also swallow newlines, flattening paragraphs and making the
        # blank-line pass below a no-op.
        full_content = re.sub(r'[^\S\n]+', ' ', full_content)
        full_content = re.sub(r'\n\s*\n', '\n\n', full_content)  # squeeze consecutive blank lines
        return full_content.strip()
    except Exception as e:
        print(f"Crawling error details: {str(e)}")  # detailed error output for debugging
        return f"Error crawling content: {str(e)}"
def respond(url, history, system_message, max_tokens, temperature, top_p):
    """Fetch an article and stream a Korean translation plus rewritten article.

    This is a generator used as a Gradio event handler: every ``yield`` pushes
    the updated chat history to the chatbot, so user-visible messages must be
    yielded (a bare ``return value`` would never reach the UI).

    Args:
        url: Article address entered by the user.
        history: Chat history as a list of ``(user, assistant)`` tuples.
        system_message: System prompt for the model.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        The chat history after each update.
    """
    if history is None:
        history = []
    if not url or not url.startswith('http'):
        history.append((url, "์ฌ๋ฐ๋ฅธ URL์ ์ ๋ ฅํด์ฃผ์ธ์."))
        yield history
        return
    try:
        article_content = get_article_content(url)
        if article_content.startswith("Error crawling content:"):
            # Crawling failed: show the reason instead of asking the model to
            # translate an error message.
            history.append((url, article_content))
            yield history
            return
        translation_prompt = f"""๋ค์ ์๋ฌธ ๊ธฐ์ฌ๋ฅผ ํ๊ตญ์ด๋ก ๋ฒ์ญํ๊ณ ๊ธฐ์ฌ๋ฅผ ์์ฑํด์ฃผ์ธ์.
1๋จ๊ณ: ์ ๋ฌธ ๋ฒ์ญ
===๋ฒ์ญ ์์===
{article_content}
===๋ฒ์ญ ๋===
2๋จ๊ณ: ๊ธฐ์ฌ ์์ฑ ๊ฐ์ด๋๋ผ์ธ
๋ค์ ์๊ตฌ์ฌํญ์ ๋ฐ๋ผ ํ๊ตญ์ด ๊ธฐ์ฌ๋ฅผ ์์ฑํ์ธ์:
1. ๊ตฌ์กฐ
- ํค๋๋ผ์ธ: ํต์ฌ ๋ด์ฉ์ ๋ด์ ๊ฐ๋ ฅํ ์ ๋ชฉ
- ๋ถ์ ๋ชฉ: ํค๋๋ผ์ธ ๋ณด์ ์ค๋ช
- ๋ฆฌ๋๋ฌธ: ๊ธฐ์ฌ์ ํต์ฌ์ ์์ฝํ ์ฒซ ๋ฌธ๋จ
- ๋ณธ๋ฌธ: ์์ธ ๋ด์ฉ ์ ๊ฐ
2. ์์ฑ ๊ท์น
- ๊ฐ๊ด์ ์ด๊ณ ์ ํํ ์ฌ์ค ์ ๋ฌ
- ๋ฌธ์ฅ์ '๋ค.'๋ก ์ข ๊ฒฐ
- ๋จ๋ฝ ๊ฐ ์์ฐ์ค๋ฌ์ด ํ๋ฆ
- ์ธ์ฉ๊ตฌ๋ ๋ฐ์ดํ ์ฒ๋ฆฌ
- ํต์ฌ ์ ๋ณด๋ฅผ ์๋ถ๋ถ์ ๋ฐฐ์น
- ์ ๋ฌธ ์ฉ์ด๋ ์ ์ ํ ์ค๋ช ์ถ๊ฐ
3. ํ์
- ์ ์ ํ ๋จ๋ฝ ๊ตฌ๋ถ
- ์ฝ๊ธฐ ์ฌ์ด ๋ฌธ์ฅ ๊ธธ์ด
- ๋ ผ๋ฆฌ์ ์ธ ์ ๋ณด ๊ตฌ์ฑ
๊ฐ ๋จ๊ณ๋ '===๋ฒ์ญ===', '===๊ธฐ์ฌ==='๋ก ๋ช ํํ ๊ตฌ๋ถํ์ฌ ์ถ๋ ฅํ์ธ์.
"""
        messages = [
            {
                "role": "system",
                "content": system_message
            },
            {"role": "user", "content": translation_prompt}
        ]
        history.append((url, "๋ฒ์ญ ๋ฐ ๊ธฐ์ฌ ์์ฑ์ ์์ํฉ๋๋ค..."))
        # Show the placeholder right away, before the first token arrives.
        yield history
        full_response = ""
        for message in client.chat.completions.create(
            model="CohereForAI/c4ai-command-r-plus-08-2024",
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            messages=messages,
        ):
            if not message.choices:
                # Some stream chunks carry no choices; skip them.
                continue
            token = getattr(message.choices[0].delta, 'content', None)
            if token:
                full_response += token
                history[-1] = (url, full_response)
                yield history
    except Exception as e:
        error_message = f"์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
        history.append((url, error_message))
        yield history
# Build the three-tab UI: per-country search, per-region search, and AI
# article writing. Article cards are pre-created (hidden) because the search
# handlers address a fixed list of output components.
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="NewsAI ์๋น์ค") as iface:
    with gr.Tabs():
        # Per-country tab
        with gr.Tab("๊ตญ๊ฐ๋ณ"):
            gr.Markdown("๊ฒ์์ด๋ฅผ ์ ๋ ฅํ๊ณ ์ํ๋ ๊ตญ๊ฐ(ํ๊ตญ ์ ์ธ)๋ฅผ๋ฅผ ์ ํํ๋ฉด, ๊ฒ์์ด์ ์ผ์นํ๋ 24์๊ฐ ์ด๋ด ๋ด์ค๋ฅผ ์ต๋ 100๊ฐ ์ถ๋ ฅํฉ๋๋ค.")
            gr.Markdown("๊ตญ๊ฐ ์ ํํ ๊ฒ์์ด์ 'ํ๊ธ'์ ์ ๋ ฅํ๋ฉด ํ์ง ์ธ์ด๋ก ๋ฒ์ญ๋์ด ๊ฒ์ํฉ๋๋ค. ์: 'Taiwan' ๊ตญ๊ฐ ์ ํํ '์ผ์ฑ' ์ ๋ ฅ์ 'ไธๆ'์ผ๋ก ์๋ ๊ฒ์")
            with gr.Column():
                with gr.Row():
                    query = gr.Textbox(label="๊ฒ์์ด")
                    country = gr.Dropdown(
                        choices=sorted(list(COUNTRY_LOCATIONS.keys())),
                        label="๊ตญ๊ฐ",
                        value="United States"
                    )
                # Frequently used example queries
                gr.Examples(
                    examples=[
                        "artificial intelligence",
                        "NVIDIA",
                        "OPENAI",
                        "META LLAMA",
                        "black forest labs",
                        "GOOGLE gemini",
                        "anthropic Claude",
                        "X.AI",
                        "HUGGINGFACE",
                        "HYNIX",
                        "Large Language model",
                        "CHATGPT",
                        "StabilityAI",
                        "MISTRALAI",
                        "QWEN",
                        "MIDJOURNEY",
                        "GPU"
                    ],
                    inputs=query,
                    label="์์ฃผ ์ฌ์ฉ๋๋ ๊ฒ์์ด"
                )
            status_message = gr.Markdown("", visible=True)
            translated_query_display = gr.Markdown(visible=False)
            search_button = gr.Button("๊ฒ์", variant="primary")
            progress = gr.Progress()
            articles_state = gr.State([])
            # Reuse the shared builder and the configured limit instead of an
            # inline copy with a hard-coded 100.
            article_components = create_article_components(MAX_COUNTRY_RESULTS)
        # Worldwide (per-region) tab
        with gr.Tab("์ ์ธ๊ณ"):
            gr.Markdown("๋๋ฅ๋ณ๋ก 24์๊ฐ ์ด๋ด ๋ด์ค๋ฅผ ๊ฒ์ํฉ๋๋ค.")
            with gr.Column():
                with gr.Column(elem_id="status_area"):
                    with gr.Row():
                        query_global = gr.Textbox(label="๊ฒ์์ด")
                        region_select = gr.Dropdown(
                            choices=REGIONS,
                            label="์ง์ญ ์ ํ",
                            value="๋์์์"
                        )
                        search_button_global = gr.Button("๊ฒ์", variant="primary")
                    status_message_global = gr.Markdown("")
                    translated_query_display_global = gr.Markdown("")
                with gr.Column(elem_id="results_area"):
                    articles_state_global = gr.State([])
                    global_article_components = create_article_components(MAX_GLOBAL_RESULTS)
        # AI translation / article-writing tab
        with gr.Tab("AI ๊ธฐ์ฌ ์์ฑ"):
            gr.Markdown("๋ด์ค URL์ ์ ๋ ฅํ๋ฉด AI๊ฐ ํ๊ตญ์ด๋ก ๋ฒ์ญํ์ฌ ๊ธฐ์ฌ ํ์์ผ๋ก ์์ฑํฉ๋๋ค.")
            with gr.Column():
                chatbot = gr.Chatbot(height=600)
                with gr.Row():
                    url_input = gr.Textbox(
                        label="๋ด์ค URL",
                        placeholder="https://..."
                    )
                with gr.Accordion("๊ณ ๊ธ ์ค์ ", open=False):
                    system_message = gr.Textbox(
                        value="""You are a professional translator and journalist. Follow these steps strictly:
1. TRANSLATION
- Start with ===๋ฒ์ญ=== marker
- Provide accurate Korean translation
- Maintain original meaning and context
2. ARTICLE WRITING
- Start with ===๊ธฐ์ฌ=== marker
- Write a new Korean news article based on the translation
- Follow newspaper article format
- Use formal news writing style
- End sentences with '๋ค.'
- Include headline and subheadline
- Organize paragraphs clearly
- Put key information first
- Use quotes appropriately
IMPORTANT:
- Must complete both steps in order
- Clearly separate each section with markers
- Never skip or combine steps""",
                        label="System message"
                    )
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=7800,
                        value=7624,
                        step=1,
                        label="Max new tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-P"
                    )
                translate_button = gr.Button("๊ธฐ์ฌ ์์ฑ", variant="primary")
            # Event wiring for the AI tab
            translate_button.click(
                fn=respond,
                inputs=[
                    url_input,
                    chatbot,
                    system_message,
                    max_tokens,
                    temperature,
                    top_p,
                ],
                outputs=chatbot
            )
    # Event wiring
    # Per-country tab: the output order must match what search_and_display
    # returns (status, translated query, error display, 5 per card, state).
    search_outputs = [status_message, translated_query_display, gr.Markdown(visible=False)]
    for comp in article_components:
        search_outputs.extend([
            comp['group'], comp['title'], comp['image'],
            comp['snippet'], comp['info']
        ])
    search_outputs.append(articles_state)
    search_button.click(
        fn=search_and_display,
        inputs=[query, country, articles_state],
        outputs=search_outputs,
        show_progress=True
    )
    # Worldwide tab: the output order must match what search_global yields
    # (status, query display, 5 per card, state).
    global_search_outputs = [status_message_global, translated_query_display_global]
    for comp in global_article_components:
        global_search_outputs.extend([
            comp['group'], comp['title'], comp['image'],
            comp['snippet'], comp['info']
        ])
    global_search_outputs.append(articles_state_global)
    search_button_global.click(
        fn=search_global,
        inputs=[query_global, region_select, articles_state_global],
        outputs=global_search_outputs,
        show_progress=True
    )
# Start the server on all interfaces, port 7860, with a public share link and
# basic auth. Credentials can be overridden through the APP_AUTH_USERNAME /
# APP_AUTH_PASSWORD environment variables; the previous values remain the
# defaults so existing deployments keep working.
# SECURITY(review): the default credentials are committed in source and
# ssl_verify=False disables certificate verification — set the environment
# variables and reconsider both before exposing this publicly.
iface.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
    auth=(
        os.getenv("APP_AUTH_USERNAME", "gini"),
        os.getenv("APP_AUTH_PASSWORD", "pick"),
    ),
    ssl_verify=False,
    show_error=True
)