File size: 2,663 Bytes
46f067b
 
86797a6
46f067b
c539900
7761dac
46f067b
c539900
 
 
 
 
 
 
 
 
 
 
 
7761dac
46f067b
 
c539900
46f067b
 
 
 
 
c539900
 
 
 
46f067b
c539900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7761dac
c539900
 
 
 
 
 
 
 
 
 
 
7761dac
c539900
 
86797a6
46f067b
 
86797a6
46f067b
c539900
86797a6
46f067b
13e115d
46f067b
 
13e115d
c539900
13e115d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import requests
from bs4 import BeautifulSoup
import pandas as pd
import streamlit as st
import random
import time  # time ๋ชจ๋“ˆ์„ ์ž„ํฌํŠธ

# ๋„ค์ด๋ฒ„ ๋ชจ๋ฐ”์ผ ๋‰ด์Šค ๋žญํ‚น URL
url = "https://m.news.naver.com/rankingList"

# ํ—ค๋” ์„ค์ • (User-Agent ๋ฐ Referer ์ถ”๊ฐ€)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Referer": "https://m.news.naver.com/"
}

# ๋žœ๋ค ๋”œ๋ ˆ์ด ํ•จ์ˆ˜
def random_delay(min_delay=1, max_delay=3):
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)  # time ๋ชจ๋“ˆ์˜ sleep ํ•จ์ˆ˜ ์‚ฌ์šฉ

# Fetch and parse the ranking page.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors (403/404/5xx) immediately
# instead of silently parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Collected news records (one dict per article).
news_list = []

# Each ranking box on the page corresponds to one press (publisher).
for news_box in soup.select('div.rankingnews_box'):
    # Publisher name shown in the box header.
    press_name = news_box.find('strong', class_='rankingnews_name').text.strip()

    for news_item in news_box.select('ul.rankingnews_list li'):
        # NOTE: the original code slept 1-3 s per item here, but no HTTP
        # request happens inside this loop — parsing is purely local, so
        # the delay only stretched the run to minutes for no benefit.

        # Rank number (e.g. "1"); fallback when the tag is absent.
        rank_tag = news_item.find('em', class_='list_ranking_num')
        rank = rank_tag.text if rank_tag else 'No Rank'

        # Article title.
        title_tag = news_item.find('strong', class_='list_title')
        title = title_tag.text.strip() if title_tag else 'No Title'

        # Article link (look up the anchor once instead of twice).
        link_tag = news_item.find('a')
        link = link_tag['href'] if link_tag else '#'

        # Posted-time string as displayed on the page.
        time_tag = news_item.find('span', class_='list_time')
        time_info = time_tag.text.strip() if time_tag else 'No Time'

        # Thumbnail image URL, when the item carries one.
        img_tag = news_item.find('img')
        image_url = img_tag['src'] if img_tag and 'src' in img_tag.attrs else 'No Image Available'

        news_list.append({
            'Press': press_name,
            'Rank': rank,
            'Title': title,
            'Link': link,
            'Time': time_info,
            'Image URL': image_url
        })

# Convert the collected records into a DataFrame for display below.
df = pd.DataFrame(news_list)

# Render the scraped results in Streamlit.
st.title("Naver Mobile Ranking News Scraper")

# One card per article: thumbnail (when available), linked title, metadata.
# The row index is unused, so bind it to `_` instead of a named variable.
for _, row in df.iterrows():
    if row['Image URL'] != 'No Image Available':
        st.image(row['Image URL'], width=100)
    st.markdown(f"**[{row['Title']}]({row['Link']})**")
    st.write(f"Press: {row['Press']} | Rank: {row['Rank']} | Time: {row['Time']}")
    st.write("---")