CSB261 committed on
Commit
c539900
•
1 Parent(s): 46f067b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -31
app.py CHANGED
@@ -2,55 +2,78 @@ import requests
2
  from bs4 import BeautifulSoup
3
  import pandas as pd
4
  import streamlit as st
 
 
5
 
6
- # 네이버 랭킹 뉴스 URL
7
- url = "https://news.naver.com/main/ranking/popularDay.naver"
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # 웹 페이지 요청 및 파싱
10
- response = requests.get(url)
11
  soup = BeautifulSoup(response.content, 'html.parser')
12
 
13
  # 뉴스 리스트 추출
14
  news_list = []
15
 
16
- for news_item in soup.select('div.rankingnews_box ul.rankingnews_list li'):
17
- # 순위 번호가 존재하는지 확인
18
- rank_tag = news_item.find('em', class_='list_ranking_num')
19
- rank = rank_tag.text if rank_tag else 'No Rank'
20
-
21
- # 제목이 존재하는지 확인
22
- title_tag = news_item.find('a', class_='list_title')
23
- title = title_tag.text.strip() if title_tag else 'No Title'
24
-
25
- # 링크가 존재하는지 확인
26
- link = title_tag['href'] if title_tag else '#'
27
-
28
- # 시간 정보가 존재하는지 확인
29
- time_tag = news_item.find('span', class_='list_time')
30
- time = time_tag.text.strip() if time_tag else 'No Time'
31
-
32
- # 이미지 태그와 src 속성 확인
33
- img_tag = news_item.find('img')
34
- image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'No Image Available'
35
 
36
- news_list.append({
37
- 'Rank': rank,
38
- 'Title': title,
39
- 'Link': link,
40
- 'Time': time,
41
- 'Image URL': image_url
42
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # 데이터프레임으로 변환
45
  df = pd.DataFrame(news_list)
46
 
47
  # Streamlit์—์„œ ๊ฒฐ๊ณผ ํ‘œ์‹œ
48
- st.title("Naver Ranking News Scraper")
49
 
50
  # 개별 뉴스 항목 출력
51
  for index, row in df.iterrows():
52
  if row['Image URL'] != 'No Image Available':
53
  st.image(row['Image URL'], width=100)
54
  st.markdown(f"**[{row['Title']}]({row['Link']})**")
55
- st.write(f"Rank: {row['Rank']} | Time: {row['Time']}")
56
  st.write("---")
 
2
  from bs4 import BeautifulSoup
3
  import pandas as pd
4
  import streamlit as st
5
+ import random
6
+ import time
7
 
8
+ # 네이버 모바일 뉴스 랭킹 URL
9
# Page to scrape: the ranking list on the mobile Naver News site.
url = "https://m.news.naver.com/rankingList"

# HTTP headers sent with the page request: a Chrome-on-Windows User-Agent
# string and a Referer pointing at the mobile news front page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/92.0.4515.107 Safari/537.36"
    ),
    "Referer": "https://m.news.naver.com/",
}
16
+
17
+ # 랜덤 딜레이 함수
18
def random_delay(min_delay=1, max_delay=3):
    """Pause for a random duration drawn uniformly from a range.

    Args:
        min_delay: Lower bound of the pause, in seconds.
        max_delay: Upper bound of the pause, in seconds.

    Returns:
        None. The only effect is blocking the calling thread.
    """
    # Bind the module locally under a private alias so this function keeps
    # working even if the module-level name ``time`` is rebound to something
    # else (for example a scraped timestamp string); relying on the global
    # would then fail with AttributeError on ``time.sleep``.
    import time as _time

    _time.sleep(random.uniform(min_delay, max_delay))
21
 
22
  # 웹 페이지 요청 및 파싱
23
# Fetch the ranking page. The timeout is explicit because requests applies
# none by default, so a stalled connection would otherwise block the app
# forever instead of failing with a visible error.
response = requests.get(url, headers=headers, timeout=10)

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')

# Collects one dict per scraped news item.
news_list = []
28
 
29
+ # 새로운 HTML 구조에 맞게 데이터 추출
30
# Walk every publisher box on the parsed page and record each of its
# ranked articles as a dict appended to ``news_list``.
for news_box in soup.select('div.rankingnews_box'):
    # Publisher (press) name. Guarded like the other fields so a box without
    # the expected <strong> tag yields a placeholder instead of raising
    # AttributeError on ``None.text``.
    press_tag = news_box.find('strong', class_='rankingnews_name')
    press_name = press_tag.text.strip() if press_tag else 'No Press'

    # NOTE: no random delay is applied per item. This loop only reads the
    # already-downloaded document and issues no HTTP requests, so sleeping
    # here would add seconds of latency per article without throttling
    # anything.
    for news_item in news_box.select('ul.rankingnews_list li'):
        # Rank number.
        rank_tag = news_item.find('em', class_='list_ranking_num')
        rank = rank_tag.text if rank_tag else 'No Rank'

        # Headline text.
        title_tag = news_item.find('strong', class_='list_title')
        title = title_tag.text.strip() if title_tag else 'No Title'

        # Article link: look the anchor up once, and tolerate an anchor that
        # has no href attribute.
        link_tag = news_item.find('a')
        link = link_tag.get('href', '#') if link_tag else '#'

        # Displayed time text. Deliberately NOT stored in a variable named
        # ``time``: that would rebind the imported ``time`` module at module
        # level and break any later ``time.sleep`` call.
        time_tag = news_item.find('span', class_='list_time')
        time_text = time_tag.text.strip() if time_tag else 'No Time'

        # Thumbnail URL, when the item carries an <img> with a src attribute.
        img_tag = news_item.find('img')
        image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'No Image Available'

        news_list.append({
            'Press': press_name,
            'Rank': rank,
            'Title': title,
            'Link': link,
            'Time': time_text,
            'Image URL': image_url,
        })
66
 
67
  # 데이터프레임으로 변환
68
# Build a DataFrame from the scraped records (one row per news item).
df = pd.DataFrame(news_list)

# Page heading shown in the Streamlit app.
st.title("Naver Mobile Ranking News Scraper")

# An empty result most likely means the request failed or the page markup no
# longer matches the selectors; say so rather than rendering a blank page.
if df.empty:
    st.warning("No ranking news items were found.")

# Render each news item: optional thumbnail, linked headline, metadata line,
# then a horizontal rule as a separator.
for _, row in df.iterrows():
    if row['Image URL'] != 'No Image Available':
        st.image(row['Image URL'], width=100)
    st.markdown(f"**[{row['Title']}]({row['Link']})**")
    st.write(f"Press: {row['Press']} | Rank: {row['Rank']} | Time: {row['Time']}")
    st.write("---")