import json
import random

from bs4 import BeautifulSoup

import utils


def fetch_new_page(category):
    # Listing of papers announced today for the given category.
    url = f'https://arxiv.org/list/{category}/new'
    return utils.fetch_page(url)


def fetch_recent_page(category):
    # Listing of papers announced over the last few days.
    url = f'https://arxiv.org/list/{category}/recent'
    return utils.fetch_page(url)


def extract_new_data(category):
    """Collect paper ids from the /new listing, keeping only papers
    whose titles pass utils.verify_simple_title."""
    paper_ids = []
    page_content = fetch_new_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for dl in lists:
        # Each <dt> holds the arXiv id link; the paired <dd> holds the title.
        papers = dl.find_all('dt')
        paper_contents = dl.find_all('dd')
        titles = [
            paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
            for paper_content in paper_contents
        ]
        for paper, title in zip(papers, titles):
            if not utils.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like "arXiv:2503.01234"; keep the id part.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids


def extract_recent_data(category):
    """Collect paper ids from the /recent listing (no title filtering;
    titles are not parsed on this page)."""
    paper_ids = []
    page_content = fetch_recent_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for dl in lists:
        papers = dl.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids


def extract_data(category):
    """Return up to two ids for the category that have not been seen in
    previous runs, recording each claimed id in arxiv.txt."""
    all_ids = set()
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    combined_data = new_data + recent_data
    for paper_id in combined_data:
        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
            utils.write_data_to_file(paper_id, 'arxiv.txt')
            all_ids.add(paper_id)
        if len(all_ids) >= 2:
            break
    return list(all_ids)


def extract_arxiv_data():
    if not utils.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")

    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }

    data = {}
    used_ids = set()
    for category, subcategories in categories.items():
        category_ids = set()
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            for paper_id in ids:
                if paper_id not in used_ids:
                    category_ids.add(paper_id)
                    used_ids.add(paper_id)
                if len(category_ids) == 2:
                    break
            if len(category_ids) == 2:
                break
        # Pad short categories with ids already used elsewhere. Drawing only
        # from ids not already in this category, and stopping when none
        # remain, avoids the infinite loop that random.choice over all of
        # used_ids could cause when no distinct id is left to draw.
        fallback_ids = list(used_ids - category_ids)
        random.shuffle(fallback_ids)
        while len(category_ids) < 2 and fallback_ids:
            category_ids.add(fallback_ids.pop())
        data[category] = {"ids": list(category_ids), "count": len(category_ids)}

    if not utils.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return json.dumps(data, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)
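

# ----------------------------------------------------------------------
# The `utils` module imported above is not part of this file. The sketch
# below is a minimal reconstruction of the interface this script assumes,
# inferred purely from the call sites; every name, signature, and body
# here is an assumption, not the actual implementation.
#
# import requests
#
# def fetch_page(url):
#     """Fetch a page and return its HTML; raise on HTTP errors."""
#     response = requests.get(url, timeout=30)
#     response.raise_for_status()
#     return response.text
#
# def verify_simple_title(title):
#     """Return True if the title passes the pipeline's filter (assumed
#     here to mean plain ASCII with no LaTeX markup)."""
#     return title.isascii() and '$' not in title and '\\' not in title
#
# def check_data_in_file(paper_id, filename):
#     """Return True if paper_id is already recorded in filename."""
#     try:
#         with open(filename) as f:
#             return paper_id in {line.strip() for line in f}
#     except FileNotFoundError:
#         return False
#
# def write_data_to_file(paper_id, filename):
#     """Append paper_id to filename, one id per line."""
#     with open(filename, 'a') as f:
#         f.write(paper_id + '\n')
#
# def download_datafile(filename):
#     """Pull the shared data file from remote storage; return True on
#     success. Transport is unknown from this script alone."""
#
# def upload_datafile(filename):
#     """Push the shared data file back to remote storage; return True
#     on success. Transport is unknown from this script alone."""
# ----------------------------------------------------------------------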