Spaces:
Building
Building
Update app.py
Browse files
app.py
CHANGED
@@ -898,85 +898,105 @@ def get_article_content(url):
|
|
898 |
headers = {
|
899 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
900 |
}
|
901 |
-
|
|
|
|
|
|
|
|
|
|
|
902 |
soup = BeautifulSoup(response.content, 'html.parser')
|
903 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
904 |
# ์ผ๋ฐ์ ์ธ ๊ธฐ์ฌ ๋ณธ๋ฌธ ์ปจํ
์ด๋ ๊ฒ์
|
905 |
-
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
soup.find('div', class_='content'),
|
910 |
-
soup.find('div', {'id': 'article-body'})
|
911 |
]
|
912 |
|
913 |
-
for
|
914 |
-
|
915 |
-
|
916 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
917 |
|
918 |
-
|
919 |
-
|
920 |
-
|
921 |
-
|
922 |
-
|
923 |
-
|
924 |
-
|
925 |
-
|
|
|
|
|
|
|
|
|
|
|
926 |
|
927 |
-
return content
|
928 |
except Exception as e:
|
929 |
return f"Error crawling content: {str(e)}"
|
930 |
|
931 |
-
def respond(
|
932 |
-
url,
|
933 |
-
history: list[tuple[str, str]],
|
934 |
-
system_message,
|
935 |
-
max_tokens,
|
936 |
-
temperature,
|
937 |
-
top_p,
|
938 |
-
):
|
939 |
if not url.startswith('http'):
|
940 |
history.append((url, "์ฌ๋ฐ๋ฅธ URL์ ์
๋ ฅํด์ฃผ์ธ์."))
|
941 |
return history
|
942 |
|
943 |
try:
|
944 |
-
# ๊ธฐ์ฌ ๋ด์ฉ ์ถ์ถ
|
945 |
article_content = get_article_content(url)
|
946 |
|
947 |
-
|
948 |
-
|
949 |
-
|
950 |
-
|
951 |
-
|
952 |
-
|
953 |
-
|
954 |
-
|
955 |
-
|
956 |
-
|
957 |
-
|
958 |
-
|
959 |
-
|
960 |
-
|
961 |
-
|
962 |
-
|
963 |
-
|
964 |
-
|
965 |
-
|
966 |
-
|
967 |
-
|
968 |
-
|
969 |
-
|
970 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
971 |
|
972 |
messages = [
|
973 |
{
|
974 |
-
"role": "system",
|
975 |
-
"content":
|
976 |
-
๋ชจ๋ ์์
์ ๋ฐ๋์ ๋ค์ ๋ ๋จ๊ณ๋ก ์งํํ๊ณ , ๊ฐ ๋จ๊ณ๋ฅผ ๋ช
ํํ ๊ตฌ๋ถํ์ฌ ์ถ๋ ฅํด์ผ ํฉ๋๋ค:
|
977 |
-
1. ์๋ฌธ ๋ฒ์ญ: ===๋ฒ์ญ=== ํ์ ํ ์ ํํ ํ๊ตญ์ด ๋ฒ์ญ ์ ๊ณต
|
978 |
-
2. ๊ธฐ์ฌ ์์ฑ: ===๊ธฐ์ฌ=== ํ์ ํ ๋ฒ์ญ๋ณธ์ ๊ธฐ๋ฐ์ผ๋ก ํ๊ตญ์ด ๋ด์ค ๊ธฐ์ฌ ์์ฑ
|
979 |
-
๋ ๋จ๊ณ๋ฅผ ๊ฑด๋๋ฐ๊ฑฐ๋ ํตํฉํ์ง ๋ง๊ณ ๋ฐ๋์ ์์ฐจ์ ์ผ๋ก ์งํํ์ธ์."""
|
980 |
},
|
981 |
{"role": "user", "content": translation_prompt}
|
982 |
]
|
@@ -984,8 +1004,6 @@ def respond(
|
|
984 |
history.append((url, "๋ฒ์ญ ๋ฐ ๊ธฐ์ฌ ์์ฑ์ ์์ํฉ๋๋ค..."))
|
985 |
|
986 |
full_response = ""
|
987 |
-
current_section = ""
|
988 |
-
|
989 |
for message in client.chat.completions.create(
|
990 |
model="CohereForAI/c4ai-command-r-plus-08-2024",
|
991 |
max_tokens=max_tokens,
|
@@ -998,11 +1016,6 @@ def respond(
|
|
998 |
token = message.choices[0].delta.content
|
999 |
if token:
|
1000 |
full_response += token
|
1001 |
-
# ์น์
๊ตฌ๋ถ์ ํ์ธ ๋ฐ ํฌ๋งทํ
|
1002 |
-
if "===๋ฒ์ญ===" in token or "===๊ธฐ์ฌ===" in token:
|
1003 |
-
current_section = token.strip()
|
1004 |
-
full_response += "\n\n"
|
1005 |
-
|
1006 |
history[-1] = (url, full_response)
|
1007 |
yield history
|
1008 |
|
|
|
898 |
headers = {
|
899 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
900 |
}
|
901 |
+
session = requests.Session()
|
902 |
+
retries = Retry(total=3, backoff_factor=0.5)
|
903 |
+
session.mount('https://', HTTPAdapter(max_retries=retries))
|
904 |
+
|
905 |
+
response = session.get(url, headers=headers, timeout=30)
|
906 |
+
response.raise_for_status()
|
907 |
soup = BeautifulSoup(response.content, 'html.parser')
|
908 |
|
909 |
+
# ๋ฉํ ๋ฐ์ดํฐ ์ถ์ถ
|
910 |
+
title = soup.find('meta', property='og:title') or soup.find('title')
|
911 |
+
title = title.get('content', '') if hasattr(title, 'get') else title.string if title else ''
|
912 |
+
|
913 |
+
description = soup.find('meta', property='og:description') or soup.find('meta', {'name': 'description'})
|
914 |
+
description = description.get('content', '') if description else ''
|
915 |
+
|
916 |
+
# ๋ณธ๋ฌธ ์ถ์ถ ๊ฐ์
|
917 |
+
article_content = ''
|
918 |
+
|
919 |
# ์ผ๋ฐ์ ์ธ ๊ธฐ์ฌ ๋ณธ๋ฌธ ์ปจํ
์ด๋ ๊ฒ์
|
920 |
+
content_selectors = [
|
921 |
+
'article', '.article-body', '.article-content', '#article-body',
|
922 |
+
'.story-body', '.post-content', '.entry-content', '.content-body',
|
923 |
+
'[itemprop="articleBody"]', '.story-content'
|
|
|
|
|
924 |
]
|
925 |
|
926 |
+
for selector in content_selectors:
|
927 |
+
content = soup.select_one(selector)
|
928 |
+
if content:
|
929 |
+
# ๋ถํ์ํ ์์ ์ ๊ฑฐ
|
930 |
+
for tag in content.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
|
931 |
+
tag.decompose()
|
932 |
+
|
933 |
+
# ๋จ๋ฝ ์ถ์ถ
|
934 |
+
paragraphs = content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
|
935 |
+
if paragraphs:
|
936 |
+
article_content = '\n\n'.join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])
|
937 |
+
break
|
938 |
|
939 |
+
# ๋ฐฑ์
๋ฐฉ๋ฒ: ๋ชจ๋ ๋จ๋ฝ ์ถ์ถ
|
940 |
+
if not article_content:
|
941 |
+
paragraphs = soup.find_all('p')
|
942 |
+
article_content = '\n\n'.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
|
943 |
+
|
944 |
+
# ์ต์ข
์ฝํ
์ธ ๊ตฌ์ฑ
|
945 |
+
full_content = f"Title: {title}\n\nDescription: {description}\n\nContent:\n{article_content}"
|
946 |
+
|
947 |
+
# ํ
์คํธ ์ ์
|
948 |
+
full_content = re.sub(r'\s+', ' ', full_content) # ์ฐ์๋ ๊ณต๋ฐฑ ์ ๊ฑฐ
|
949 |
+
full_content = re.sub(r'\n\s*\n', '\n\n', full_content) # ์ฐ์๋ ๋น ์ค ์ ๊ฑฐ
|
950 |
+
|
951 |
+
return full_content.strip()
|
952 |
|
|
|
953 |
except Exception as e:
|
954 |
return f"Error crawling content: {str(e)}"
|
955 |
|
956 |
+
def respond(url, history, system_message, max_tokens, temperature, top_p):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
957 |
if not url.startswith('http'):
|
958 |
history.append((url, "์ฌ๋ฐ๋ฅธ URL์ ์
๋ ฅํด์ฃผ์ธ์."))
|
959 |
return history
|
960 |
|
961 |
try:
|
|
|
962 |
article_content = get_article_content(url)
|
963 |
|
964 |
+
translation_prompt = f"""๋ค์ ์๋ฌธ ๊ธฐ์ฌ๋ฅผ ํ๊ตญ์ด๋ก ๋ฒ์ญํ๊ณ ๊ธฐ์ฌ๋ฅผ ์์ฑํด์ฃผ์ธ์.
|
965 |
+
|
966 |
+
1๋จ๊ณ: ์ ๋ฌธ ๋ฒ์ญ
|
967 |
+
===๋ฒ์ญ ์์===
|
968 |
+
{article_content}
|
969 |
+
===๋ฒ์ญ ๋===
|
970 |
+
|
971 |
+
2๋จ๊ณ: ๊ธฐ์ฌ ์์ฑ ๊ฐ์ด๋๋ผ์ธ
|
972 |
+
๋ค์ ์๊ตฌ์ฌํญ์ ๋ฐ๋ผ ํ๊ตญ์ด ๊ธฐ์ฌ๋ฅผ ์์ฑํ์ธ์:
|
973 |
+
|
974 |
+
1. ๊ตฌ์กฐ
|
975 |
+
- ํค๋๋ผ์ธ: ํต์ฌ ๋ด์ฉ์ ๋ด์ ๊ฐ๋ ฅํ ์ ๋ชฉ
|
976 |
+
- ๋ถ์ ๋ชฉ: ํค๋๋ผ์ธ ๋ณด์ ์ค๋ช
|
977 |
+
- ๋ฆฌ๋๋ฌธ: ๊ธฐ์ฌ์ ํต์ฌ์ ์์ฝํ ์ฒซ ๋ฌธ๋จ
|
978 |
+
- ๋ณธ๋ฌธ: ์์ธ ๋ด์ฉ ์ ๊ฐ
|
979 |
+
|
980 |
+
2. ์์ฑ ๊ท์น
|
981 |
+
- ๊ฐ๊ด์ ์ด๊ณ ์ ํํ ์ฌ์ค ์ ๋ฌ
|
982 |
+
- ๋ฌธ์ฅ์ '๋ค.'๋ก ์ข
๊ฒฐ
|
983 |
+
- ๋จ๋ฝ ๊ฐ ์์ฐ์ค๋ฌ์ด ํ๋ฆ
|
984 |
+
- ์ธ์ฉ๊ตฌ๋ ๋ฐ์ดํ ์ฒ๋ฆฌ
|
985 |
+
- ํต์ฌ ์ ๋ณด๋ฅผ ์๋ถ๋ถ์ ๋ฐฐ์น
|
986 |
+
- ์ ๋ฌธ ์ฉ์ด๋ ์ ์ ํ ์ค๋ช
์ถ๊ฐ
|
987 |
+
|
988 |
+
3. ํ์
|
989 |
+
- ์ ์ ํ ๋จ๋ฝ ๊ตฌ๋ถ
|
990 |
+
- ์ฝ๊ธฐ ์ฌ์ด ๋ฌธ์ฅ ๊ธธ์ด
|
991 |
+
- ๋
ผ๋ฆฌ์ ์ธ ์ ๋ณด ๊ตฌ์ฑ
|
992 |
+
|
993 |
+
๊ฐ ๋จ๊ณ๋ '===๋ฒ์ญ===', '===๊ธฐ์ฌ==='๋ก ๋ช
ํํ ๊ตฌ๋ถํ์ฌ ์ถ๋ ฅํ์ธ์.
|
994 |
+
"""
|
995 |
|
996 |
messages = [
|
997 |
{
|
998 |
+
"role": "system",
|
999 |
+
"content": system_message
|
|
|
|
|
|
|
|
|
1000 |
},
|
1001 |
{"role": "user", "content": translation_prompt}
|
1002 |
]
|
|
|
1004 |
history.append((url, "๋ฒ์ญ ๋ฐ ๊ธฐ์ฌ ์์ฑ์ ์์ํฉ๋๋ค..."))
|
1005 |
|
1006 |
full_response = ""
|
|
|
|
|
1007 |
for message in client.chat.completions.create(
|
1008 |
model="CohereForAI/c4ai-command-r-plus-08-2024",
|
1009 |
max_tokens=max_tokens,
|
|
|
1016 |
token = message.choices[0].delta.content
|
1017 |
if token:
|
1018 |
full_response += token
|
|
|
|
|
|
|
|
|
|
|
1019 |
history[-1] = (url, full_response)
|
1020 |
yield history
|
1021 |
|