Hyeonseo committed on
Commit
9986940
•
1 Parent(s): 6775c59

feat: collect news once


Store the crawled news data in a global variable so that the crawl runs only once, at startup.
* With the Gradio demo already up (initial crawl complete), a control that triggers a crawl (e.g. a button) is still needed to re-crawl manually.
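
A minimal sketch of such a manual trigger, assuming the `news_collector` class from app.py is in scope; the `refresh_news` helper, the button label, and the status textbox are hypothetical illustrations, not part of this commit:

import gradio as gr

# Hypothetical manual re-crawl control (not in this commit).
# Assumes news_collector from app.py is importable in this scope.
def refresh_news():
    global examples
    news = news_collector()
    news.update_news_examples()   # re-crawl and re-summarize
    examples = news.examples      # refresh the module-level cache
    return f"Collected {len(examples)} article summaries."

with gr.Blocks() as demo:
    status = gr.Textbox(label="Crawl status", interactive=False)
    refresh_button = gr.Button("Re-crawl news")
    # Clicking the button re-runs the crawler and reports the result.
    refresh_button.click(fn=refresh_news, inputs=None, outputs=status)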

Files changed (1)
  1. app.py +102 -98
app.py CHANGED
@@ -1,99 +1,103 @@
- import gradio as gr
- from newspaper import Article
- from newspaper import Config
-
- from transformers import pipeline
- import requests
- from bs4 import BeautifulSoup
- import re
-
- from bs4 import BeautifulSoup as bs
- import requests
- from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
-
- # Load Model and Tokenize
- def get_summary(input_text):
-     tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
-     summary_model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")
-     input_ids = tokenizer.encode(input_text, return_tensors="pt")
-     summary_text_ids = summary_model.generate(
-         input_ids=input_ids,
-         bos_token_id=summary_model.config.bos_token_id,
-         eos_token_id=summary_model.config.eos_token_id,
-         length_penalty=2.0,
-         max_length=142,
-         min_length=56,
-         num_beams=4,
-     )
-     return tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
-
-
-
- USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
- config = Config()
- config.browser_user_agent = USER_AGENT
- config.request_timeout = 10
-
- class news_collector:
-     def __init__(self):
-         self.examples = []
-
-     def get_new_parser(self, url):
-         article = Article(url, language='ko')
-         article.download()
-         article.parse()
-         return article
-
-     def get_news_links(self, page=''):
-         url = "https://news.daum.net/breakingnews/economic"
-         response = requests.get(url)
-         html_text = response.text
-
-         soup = bs(response.text, 'html.parser')
-         news_titles = soup.select("a.link_txt")
-         links = [item.attrs['href'] for item in news_titles ]
-         https_links = [item for item in links if item.startswith('https') == True]
-         https_links
-         return https_links
-
-
-     def update_news_examples(self):
-         news_links = self.get_news_links()
-         for news_url in news_links:
-             article = self.get_new_parser(news_url)
-             self.examples.append(get_summary(article.text[:1000]))
-
-
-
- title = "균형잡힌 뉴스 읽기 (Balanced News Reading)"
-
-
-
- with gr.Blocks() as demo:
-     news = news_collector()
-     news.update_news_examples()
-
-     with gr.Tab("소개"):
-         gr.Markdown(
-             """
-             # 균형잡힌 뉴스 읽기 (Balanced News Reading)
-
-             긍정적인 기사와 부정적인 기사인지 확인하여 뉴스를 읽을 수 있습니다. 최근 경제뉴스기사를 가져와 Example에서 바로 확인할 수 있도록 구성했습니다.
-
-             ## 1. 사용방법
-             Daum뉴스의 경제 기사를 가져와 내용을 요약하고 `Example`에 가져옵니다. 감정 분석을 하고 싶은 기사를 `Examples`에서 선택해서 `Submit`을 누르면 `Classification`에
-             해당 기사의 감정 평가 결과가 표시됩니다. 감정평가는 각 상태의 확률 정보와 함께 `neutral`, `positive`, `negative` 3가지로 표시됩니다.
-
-             ## 2. 구조 설명
-             뉴스기사를 크롤링 및 요약 모델을 이용한 기사 요약 >> 기사 요약정보 Example에 추가 >> 한국어 fine-tunning한 감정평가 모델을 이용해 입력된 기사에 대한 감정 평가 진행
-             """)
-
-     with gr.Tab("데모"):
-         gr.load("models/gabrielyang/finance_news_classifier-KR_v7",
-                 inputs = gr.Textbox( placeholder="뉴스 기사 내용을 입력하세요." ),
-                 examples=news.examples)
-
-
-
- if __name__ == "__main__":
+ import gradio as gr
+ from newspaper import Article
+ from newspaper import Config
+
+ from transformers import pipeline
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+
+ from bs4 import BeautifulSoup as bs
+ import requests
+ from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
+
+ # Load Model and Tokenize
+ def get_summary(input_text):
+     tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
+     summary_model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")
+     input_ids = tokenizer.encode(input_text, return_tensors="pt")
+     summary_text_ids = summary_model.generate(
+         input_ids=input_ids,
+         bos_token_id=summary_model.config.bos_token_id,
+         eos_token_id=summary_model.config.eos_token_id,
+         length_penalty=2.0,
+         max_length=142,
+         min_length=56,
+         num_beams=4,
+     )
+     return tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
+
+
+
+ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
+ config = Config()
+ config.browser_user_agent = USER_AGENT
+ config.request_timeout = 10
+
+ class news_collector:
+     def __init__(self):
+         self.examples = []
+
+     def get_new_parser(self, url):
+         article = Article(url, language='ko')
+         article.download()
+         article.parse()
+         return article
+
+     def get_news_links(self, page=''):
+         url = "https://news.daum.net/breakingnews/economic"
+         response = requests.get(url)
+         html_text = response.text
+
+         soup = bs(response.text, 'html.parser')
+         news_titles = soup.select("a.link_txt")
+         links = [item.attrs['href'] for item in news_titles ]
+         https_links = [item for item in links if item.startswith('https') == True]
+         https_links
+         return https_links
+
+
+     def update_news_examples(self):
+         news_links = self.get_news_links()
+         for news_url in news_links:
+             article = self.get_new_parser(news_url)
+             self.examples.append(get_summary(article.text[:1000]))
+
+ def collect_news():
+     news = news_collector()
+     news.update_news_examples()
+     return news.examples
+
+ examples = collect_news()
+
+ title = "균형잡힌 뉴스 읽기 (Balanced News Reading)"
+
+
+
+ with gr.Blocks() as demo:
+     # news = news_collector()
+     # news.update_news_examples()
+
+     with gr.Tab("소개"):
+         gr.Markdown(
+             """
+             # 균형잡힌 뉴스 읽기 (Balanced News Reading)
+
+             긍정적인 기사와 부정적인 기사인지 확인하여 뉴스를 읽을 수 있습니다. 최근 경제뉴스기사를 가져와 Example에서 바로 확인할 수 있도록 구성했습니다.
+
+             ## 1. 사용방법
+             Daum뉴스의 경제 기사를 가져와 내용을 요약하고 `Example`에 가져옵니다. 감정 분석을 하고 싶은 기사를 `Examples`에서 선택해서 `Submit`을 누르면 `Classification`에
+             해당 기사의 감정 평가 결과가 표시됩니다. 감정평가는 각 상태의 확률 정보와 함께 `neutral`, `positive`, `negative` 3가지로 표시됩니다.
+
+             ## 2. 구조 설명
+             뉴스기사를 크롤링 및 요약 모델을 이용한 기사 요약 >> 기사 요약정보 Example에 추가 >> 한국어 fine-tunning한 감정평가 모델을 이용해 입력된 기사에 대한 감정 평가 진행
+             """)
+
+     with gr.Tab("데모"):
+         gr.load("models/gabrielyang/finance_news_classifier-KR_v7",
+                 inputs = gr.Textbox( placeholder="뉴스 기사 내용을 입력하세요." ),
+                 examples=examples)
+
+
+ if __name__ == "__main__":
      demo.launch()