Update app.py
app.py CHANGED
@@ -2,55 +2,78 @@ import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 import streamlit as st
+import random
+import time
 
-# Naver
-url = "https://news.naver.com/
+# Naver mobile ranking news URL
+url = "https://m.news.naver.com/rankingList"
+
+# Request headers (add a User-Agent and Referer)
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
+    "Referer": "https://m.news.naver.com/"
+}
+
+# Random delay helper
+def random_delay(min_delay=1, max_delay=3):
+    delay = random.uniform(min_delay, max_delay)
+    time.sleep(delay)
 
 # Request and parse the web page
-response = requests.get(url)
+response = requests.get(url, headers=headers)
 soup = BeautifulSoup(response.content, 'html.parser')
 
 # Extract the news list
 news_list = []
 
-
-
-
-
-
-    # Check whether the title exists
-    title_tag = news_item.find('a', class_='list_title')
-    title = title_tag.text.strip() if title_tag else 'No Title'
-
-    # Check whether the link exists
-    link = title_tag['href'] if title_tag else '#'
-
-    # Check whether time information exists
-    time_tag = news_item.find('span', class_='list_time')
-    time = time_tag.text.strip() if time_tag else 'No Time'
-
-    # Check the img tag and its src attribute
-    img_tag = news_item.find('img')
-    image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'No Image Available'
+# Extract data according to the new HTML structure
+for news_box in soup.select('div.rankingnews_box'):
+    # Extract the press (publisher) name
+    press_name = news_box.find('strong', class_='rankingnews_name').text.strip()
 
-
-
-
-
-
-'
-
+    # Extract each news item in the ranking list
+    for news_item in news_box.select('ul.rankingnews_list li'):
+        random_delay()  # add a random delay
+
+        # Extract the ranking number
+        rank_tag = news_item.find('em', class_='list_ranking_num')
+        rank = rank_tag.text if rank_tag else 'No Rank'
+
+        # Extract the title
+        title_tag = news_item.find('strong', class_='list_title')
+        title = title_tag.text.strip() if title_tag else 'No Title'
+
+        # Extract the link
+        link = news_item.find('a')['href'] if news_item.find('a') else '#'
+
+        # Extract the time (stored as news_time so the time module is not shadowed)
+        time_tag = news_item.find('span', class_='list_time')
+        news_time = time_tag.text.strip() if time_tag else 'No Time'
+
+        # Extract the image URL
+        img_tag = news_item.find('img')
+        image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'No Image Available'
+
+        # Append to the data list
+        news_list.append({
+            'Press': press_name,
+            'Rank': rank,
+            'Title': title,
+            'Link': link,
+            'Time': news_time,
+            'Image URL': image_url
+        })
 
 # Convert to a DataFrame
 df = pd.DataFrame(news_list)
 
 # Display the results in Streamlit
-st.title("Naver Ranking News Scraper")
+st.title("Naver Mobile Ranking News Scraper")
 
 # Print each news item
 for index, row in df.iterrows():
     if row['Image URL'] != 'No Image Available':
         st.image(row['Image URL'], width=100)
     st.markdown(f"**[{row['Title']}]({row['Link']})**")
-    st.write(f"Rank: {row['Rank']} | Time: {row['Time']}")
+    st.write(f"Press: {row['Press']} | Rank: {row['Rank']} | Time: {row['Time']}")
     st.write("---")