ginipick committed on
Commit
585cf28
·
verified ·
1 Parent(s): cc1a9a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -66
app.py CHANGED
@@ -898,85 +898,105 @@ def get_article_content(url):
898
  headers = {
899
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
900
  }
901
- response = requests.get(url, headers=headers)
 
 
 
 
 
902
  soup = BeautifulSoup(response.content, 'html.parser')
903
 
 
 
 
 
 
 
 
 
 
 
904
  # 일반적인 기사 본문 컨테이너 검색
905
- article_body = None
906
- possible_content_elements = [
907
- soup.find('article'),
908
- soup.find('div', class_='article-body'),
909
- soup.find('div', class_='content'),
910
- soup.find('div', {'id': 'article-body'})
911
  ]
912
 
913
- for element in possible_content_elements:
914
- if element:
915
- article_body = element
916
- break
 
 
 
 
 
 
 
 
917
 
918
- if article_body:
919
- # 불필요한 요소 제거
920
- for tag in article_body.find_all(['script', 'style', 'nav', 'header', 'footer']):
921
- tag.decompose()
922
-
923
- content = ' '.join([p.get_text().strip() for p in article_body.find_all('p') if p.get_text().strip()])
924
- else:
925
- content = ' '.join([p.get_text().strip() for p in soup.find_all('p') if p.get_text().strip()])
 
 
 
 
 
926
 
927
- return content
928
  except Exception as e:
929
  return f"Error crawling content: {str(e)}"
930
 
931
- def respond(
932
- url,
933
- history: list[tuple[str, str]],
934
- system_message,
935
- max_tokens,
936
- temperature,
937
- top_p,
938
- ):
939
  if not url.startswith('http'):
940
  history.append((url, "์˜ฌ๋ฐ”๋ฅธ URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."))
941
  return history
942
 
943
  try:
944
- # 기사 내용 추출
945
  article_content = get_article_content(url)
946
 
947
- # 2단계 프로세스를 위한 프롬프트 구성
948
- translation_prompt = f"""๋‹ค์Œ ์ž‘์—…์„ ์ˆœ์ฐจ์ ์œผ๋กœ ์ˆ˜ํ–‰ํ•˜์„ธ์š”:
949
-
950
- 1๋‹จ๊ณ„: ๋ฒˆ์—ญ
951
- ์•„๋ž˜ ์˜๋ฌธ ๊ธฐ์‚ฌ๋ฅผ ํ•œ๊ตญ์–ด๋กœ ์ •ํ™•ํ•˜๊ฒŒ ๋ฒˆ์—ญํ•˜์„ธ์š”.
952
- ๊ตฌ๋ถ„์„ : ===๋ฒˆ์—ญ ์‹œ์ž‘===
953
- {article_content}
954
- ๊ตฌ๋ถ„์„ : ===๋ฒˆ์—ญ ๋===
955
-
956
- 2๋‹จ๊ณ„: ๊ธฐ์‚ฌ ์ž‘์„ฑ
957
- ์œ„์˜ ๋ฒˆ์—ญ๋œ ๋‚ด์šฉ์„ ๋ฐ”ํƒ•์œผ๋กœ ์ƒˆ๋กœ์šด ํ•œ๊ตญ์–ด ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”.
958
- ๋‹ค์Œ ํ˜•์‹์„ ๋ฐ˜๋“œ์‹œ ์ค€์ˆ˜ํ•˜์„ธ์š”:
959
- - ์ œ๋ชฉ: [ํ—ค๋“œ๋ผ์ธ]
960
- - ๋ถ€์ œ: [์„œ๋ธŒํ—ค๋“œ๋ผ์ธ]
961
- - ๋ณธ๋ฌธ: [๊ธฐ์‚ฌ ๋‚ด์šฉ]
962
- - ์ž‘์„ฑ ๊ทœ์น™:
963
- * ๋ฌธ์žฅ์€ '๋‹ค.'๋กœ ๋๋‚˜์•ผ ํ•จ
964
- * ์‹ ๋ฌธ ๊ธฐ์‚ฌ ํ˜•์‹ ์ค€์ˆ˜
965
- * ๋‹จ๋ฝ ๊ตฌ๋ถ„์„ ๋ช…ํ™•ํžˆ ํ•  ๊ฒƒ
966
- * ํ•ต์‹ฌ ์ •๋ณด๋ฅผ ์•ž๋ถ€๋ถ„์— ๋ฐฐ์น˜
967
- * ์ธ์šฉ๊ตฌ๋Š” ๋”ฐ์˜ดํ‘œ๋กœ ์ฒ˜๋ฆฌ
968
-
969
- ๊ฐ ๋‹จ๊ณ„๋Š” '===๋ฒˆ์—ญ===', '===๊ธฐ์‚ฌ==='๋กœ ๏ฟฝ๏ฟฝ๋ถ„ํ•˜์—ฌ ์ถœ๋ ฅํ•˜์„ธ์š”.
970
- """
 
 
 
 
 
 
 
971
 
972
  messages = [
973
  {
974
- "role": "system",
975
- "content": """๋‹น์‹ ์€ ์ „๋ฌธ ๋ฒˆ์—ญ๊ฐ€์ด์ž ๊ธฐ์ž์ž…๋‹ˆ๋‹ค.
976
- ๋ชจ๋“  ์ž‘์—…์€ ๋ฐ˜๋“œ์‹œ ๋‹ค์Œ ๋‘ ๋‹จ๊ณ„๋กœ ์ง„ํ–‰ํ•˜๊ณ , ๊ฐ ๋‹จ๊ณ„๋ฅผ ๋ช…ํ™•ํžˆ ๊ตฌ๋ถ„ํ•˜์—ฌ ์ถœ๋ ฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค:
977
- 1. ์›๋ฌธ ๋ฒˆ์—ญ: ===๋ฒˆ์—ญ=== ํ‘œ์‹œ ํ›„ ์ •ํ™•ํ•œ ํ•œ๊ตญ์–ด ๋ฒˆ์—ญ ์ œ๊ณต
978
- 2. ๊ธฐ์‚ฌ ์ž‘์„ฑ: ===๊ธฐ์‚ฌ=== ํ‘œ์‹œ ํ›„ ๋ฒˆ์—ญ๋ณธ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ํ•œ๊ตญ์–ด ๋‰ด์Šค ๊ธฐ์‚ฌ ์ž‘์„ฑ
979
- ๋‘ ๋‹จ๊ณ„๋ฅผ ๊ฑด๋„ˆ๋›ฐ๊ฑฐ๋‚˜ ํ†ตํ•ฉํ•˜์ง€ ๋ง๊ณ  ๋ฐ˜๋“œ์‹œ ์ˆœ์ฐจ์ ์œผ๋กœ ์ง„ํ–‰ํ•˜์„ธ์š”."""
980
  },
981
  {"role": "user", "content": translation_prompt}
982
  ]
@@ -984,8 +1004,6 @@ def respond(
984
  history.append((url, "๋ฒˆ์—ญ ๋ฐ ๊ธฐ์‚ฌ ์ž‘์„ฑ์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."))
985
 
986
  full_response = ""
987
- current_section = ""
988
-
989
  for message in client.chat.completions.create(
990
  model="CohereForAI/c4ai-command-r-plus-08-2024",
991
  max_tokens=max_tokens,
@@ -998,11 +1016,6 @@ def respond(
998
  token = message.choices[0].delta.content
999
  if token:
1000
  full_response += token
1001
- # 섹션 구분자 확인 및 포맷팅
1002
- if "===๋ฒˆ์—ญ===" in token or "===๊ธฐ์‚ฌ===" in token:
1003
- current_section = token.strip()
1004
- full_response += "\n\n"
1005
-
1006
  history[-1] = (url, full_response)
1007
  yield history
1008
 
 
898
  headers = {
899
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
900
  }
901
+ session = requests.Session()
902
+ retries = Retry(total=3, backoff_factor=0.5)
903
+ session.mount('https://', HTTPAdapter(max_retries=retries))
904
+
905
+ response = session.get(url, headers=headers, timeout=30)
906
+ response.raise_for_status()
907
  soup = BeautifulSoup(response.content, 'html.parser')
908
 
909
+ # 메타 데이터 추출
910
+ title = soup.find('meta', property='og:title') or soup.find('title')
911
+ title = title.get('content', '') if hasattr(title, 'get') else title.string if title else ''
912
+
913
+ description = soup.find('meta', property='og:description') or soup.find('meta', {'name': 'description'})
914
+ description = description.get('content', '') if description else ''
915
+
916
+ # 본문 추출 개선
917
+ article_content = ''
918
+
919
  # 일반적인 기사 본문 컨테이너 검색
920
+ content_selectors = [
921
+ 'article', '.article-body', '.article-content', '#article-body',
922
+ '.story-body', '.post-content', '.entry-content', '.content-body',
923
+ '[itemprop="articleBody"]', '.story-content'
 
 
924
  ]
925
 
926
+ for selector in content_selectors:
927
+ content = soup.select_one(selector)
928
+ if content:
929
+ # 불필요한 요소 제거
930
+ for tag in content.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
931
+ tag.decompose()
932
+
933
+ # 단락 추출
934
+ paragraphs = content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
935
+ if paragraphs:
936
+ article_content = '\n\n'.join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])
937
+ break
938
 
939
+ # 백업 방법: 모든 단락 추출
940
+ if not article_content:
941
+ paragraphs = soup.find_all('p')
942
+ article_content = '\n\n'.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
943
+
944
+ # 최종 콘텐츠 구성
945
+ full_content = f"Title: {title}\n\nDescription: {description}\n\nContent:\n{article_content}"
946
+
947
+ # 텍스트 정제
948
+ full_content = re.sub(r'\s+', ' ', full_content) # ์—ฐ์†๋œ ๊ณต๋ฐฑ ์ œ๊ฑฐ
949
+ full_content = re.sub(r'\n\s*\n', '\n\n', full_content) # ์—ฐ์†๋œ ๋นˆ ์ค„ ์ œ๊ฑฐ
950
+
951
+ return full_content.strip()
952
 
 
953
  except Exception as e:
954
  return f"Error crawling content: {str(e)}"
955
 
956
+ def respond(url, history, system_message, max_tokens, temperature, top_p):
 
 
 
 
 
 
 
957
  if not url.startswith('http'):
958
  history.append((url, "์˜ฌ๋ฐ”๋ฅธ URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."))
959
  return history
960
 
961
  try:
 
962
  article_content = get_article_content(url)
963
 
964
+ translation_prompt = f"""๋‹ค์Œ ์˜๋ฌธ ๊ธฐ์‚ฌ๋ฅผ ํ•œ๊ตญ์–ด๋กœ ๋ฒˆ์—ญํ•˜๊ณ  ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”.
965
+
966
+ 1๋‹จ๊ณ„: ์ „๋ฌธ ๋ฒˆ์—ญ
967
+ ===๋ฒˆ์—ญ ์‹œ์ž‘===
968
+ {article_content}
969
+ ===๋ฒˆ์—ญ ๋===
970
+
971
+ 2๋‹จ๊ณ„: ๊ธฐ์‚ฌ ์ž‘์„ฑ ๊ฐ€์ด๋“œ๋ผ์ธ
972
+ ๋‹ค์Œ ์š”๊ตฌ์‚ฌํ•ญ์— ๋”ฐ๋ผ ํ•œ๊ตญ์–ด ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”:
973
+
974
+ 1. ๊ตฌ์กฐ
975
+ - ํ—ค๋“œ๋ผ์ธ: ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๋‹ด์€ ๊ฐ•๋ ฅํ•œ ์ œ๋ชฉ
976
+ - ๋ถ€์ œ๋ชฉ: ํ—ค๋“œ๋ผ์ธ ๋ณด์™„ ์„ค๋ช…
977
+ - ๋ฆฌ๋“œ๋ฌธ: ๊ธฐ์‚ฌ์˜ ํ•ต์‹ฌ์„ ์š”์•ฝํ•œ ์ฒซ ๋ฌธ๋‹จ
978
+ - ๋ณธ๋ฌธ: ์ƒ์„ธ ๋‚ด์šฉ ์ „๊ฐœ
979
+
980
+ 2. ์ž‘์„ฑ ๊ทœ์น™
981
+ - ๊ฐ๊ด€์ ์ด๊ณ  ์ •ํ™•ํ•œ ์‚ฌ์‹ค ์ „๋‹ฌ
982
+ - ๋ฌธ์žฅ์€ '๋‹ค.'๋กœ ์ข…๊ฒฐ
983
+ - ๋‹จ๋ฝ ๊ฐ„ ์ž์—ฐ์Šค๋Ÿฌ์šด ํ๋ฆ„
984
+ - ์ธ์šฉ๊ตฌ๋Š” ๋”ฐ์˜ดํ‘œ ์ฒ˜๋ฆฌ
985
+ - ํ•ต์‹ฌ ์ •๋ณด๋ฅผ ์•ž๋ถ€๋ถ„์— ๋ฐฐ์น˜
986
+ - ์ „๋ฌธ ์šฉ์–ด๋Š” ์ ์ ˆํ•œ ์„ค๋ช… ์ถ”๊ฐ€
987
+
988
+ 3. ํ˜•์‹
989
+ - ์ ์ ˆํ•œ ๋‹จ๋ฝ ๊ตฌ๋ถ„
990
+ - ์ฝ๊ธฐ ์‰ฌ์šด ๋ฌธ์žฅ ๊ธธ์ด
991
+ - ๋…ผ๋ฆฌ์ ์ธ ์ •๋ณด ๊ตฌ์„ฑ
992
+
993
+ ๊ฐ ๋‹จ๊ณ„๋Š” '===๋ฒˆ์—ญ===', '===๊ธฐ์‚ฌ==='๋กœ ๋ช…ํ™•ํžˆ ๊ตฌ๋ถ„ํ•˜์—ฌ ์ถœ๋ ฅํ•˜์„ธ์š”.
994
+ """
995
 
996
  messages = [
997
  {
998
+ "role": "system",
999
+ "content": system_message
 
 
 
 
1000
  },
1001
  {"role": "user", "content": translation_prompt}
1002
  ]
 
1004
  history.append((url, "๋ฒˆ์—ญ ๋ฐ ๊ธฐ์‚ฌ ์ž‘์„ฑ์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."))
1005
 
1006
  full_response = ""
 
 
1007
  for message in client.chat.completions.create(
1008
  model="CohereForAI/c4ai-command-r-plus-08-2024",
1009
  max_tokens=max_tokens,
 
1016
  token = message.choices[0].delta.content
1017
  if token:
1018
  full_response += token
 
 
 
 
 
1019
  history[-1] = (url, full_response)
1020
  yield history
1021