# Collect paper IDs from the arXiv "new" and "recent" listing pages,
# grouped by top-level category.
import json
import random
import os
import sys
from bs4 import BeautifulSoup

# Make the shared utils module (one directory up) importable.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
import utils

def fetch_new_page(category):
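    # Today's "new submissions" listing for the given category.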
    url = f'https://arxiv.org/list/{category}/new'
    return utils.fetch_page(url)

def fetch_recent_page(category):
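    # Submissions announced over the past several days.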
    url = f'https://arxiv.org/list/{category}/recent'
    return utils.fetch_page(url)

def extract_new_data(category):
    paper_ids = []
    page_content = fetch_new_page(category)
    # Each <dl> on the listing page pairs <dt> (ID link) with <dd> (title, authors, abstract).
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
                  for paper_content in paper_contents]
        for paper, title in zip(papers, titles):
            # Skip papers whose titles fail the simple-title filter.
            if not utils.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like "arXiv:2403.01234"; keep only the ID part.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_recent_data(category):
    paper_ids = []
    page_content = fetch_recent_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_data(category):
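    # Merge today's and the recent listings, record IDs not yet present in
    # arxiv.txt, and stop once four previously unseen IDs are collected.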
    all_ids = set()
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    combined_data = new_data + recent_data
    for paper_id in combined_data:
        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
            utils.write_data_to_file(paper_id, 'arxiv.txt')
            all_ids.add(paper_id)
        if len(all_ids) >= 4:
            break
    return list(all_ids)

def extract_arxiv_data():
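    # Build a JSON string mapping each category name to a list of four paper IDs.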
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    used_ids = set()

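    # Collect up to four IDs per category, skipping any ID already assigned
    # to an earlier category.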
    for category, subcategories in categories.items():
        category_ids = set()
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            for paper_id in ids:
                if paper_id not in used_ids:
                    category_ids.add(paper_id)
                    used_ids.add(paper_id)
                if len(category_ids) == 4:
                    break
            if len(category_ids) == 4:
                break

        # Pad to exactly four IDs per category, reusing IDs collected for other
        # categories if this one came up short; bail out if the pool has no
        # distinct IDs left to draw, to avoid looping forever.
        while len(category_ids) < 4 and len(used_ids) > len(category_ids):
            category_ids.add(random.choice(list(used_ids)))

        data[category] = {"ids": list(category_ids)}

    return json.dumps(data, indent=4, ensure_ascii=False)

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w', encoding='utf-8') as f:
        f.write(data)