# Collect paper IDs from the arXiv "new" and "recent" listing pages,
# grouped by top-level category.
import json
import random
import os
import sys
from bs4 import BeautifulSoup

# Make the shared utils module (one directory up) importable.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
import utils

def fetch_new_page(category):
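    # Today's "new submissions" listing for the given category.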
    url = f'https://arxiv.org/list/{category}/new'
    return utils.fetch_page(url)

def fetch_recent_page(category):
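    # Submissions announced over the past several days.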
    url = f'https://arxiv.org/list/{category}/recent'
    return utils.fetch_page(url)

def extract_new_data(category):
    paper_ids = []
    page_content = fetch_new_page(category)
    # Each <dl> on the listing page pairs <dt> (ID link) with <dd> (title, authors, abstract).
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
                  for paper_content in paper_contents]
        for paper, title in zip(papers, titles):
            # Skip papers whose titles fail the simple-title filter.
            if not utils.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like "arXiv:2403.01234"; keep only the ID part.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_recent_data(category):
    paper_ids = []
    page_content = fetch_recent_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_data(category):
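    # Merge today's and the recent listings, record IDs not yet present in
    # arxiv.txt, and stop once four previously unseen IDs are collected.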
    all_ids = set()
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    combined_data = new_data + recent_data
    for paper_id in combined_data:
        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
            utils.write_data_to_file(paper_id, 'arxiv.txt')
            all_ids.add(paper_id)
        if len(all_ids) >= 4:
            break
    return list(all_ids)

def extract_arxiv_data():
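    # Build a JSON string mapping each category name to a list of four paper IDs.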
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    used_ids = set()

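    # Collect up to four IDs per category, skipping any ID already assigned
    # to an earlier category.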
    for category, subcategories in categories.items():
        category_ids = set()
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            for paper_id in ids:
                if paper_id not in used_ids:
                    category_ids.add(paper_id)
                    used_ids.add(paper_id)
                if len(category_ids) == 4:
                    break
            if len(category_ids) == 4:
                break

        # Pad to exactly four IDs per category, reusing IDs collected for other
        # categories if this one came up short; bail out if the pool has no
        # distinct IDs left to draw, to avoid looping forever.
        while len(category_ids) < 4 and len(used_ids) > len(category_ids):
            category_ids.add(random.choice(list(used_ids)))

        data[category] = {"ids": list(category_ids)}

    return json.dumps(data, indent=4, ensure_ascii=False)

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w', encoding='utf-8') as f:
        f.write(data)