paper-central / utils.py
jbdel
author, github
df43c05
raw
history blame
621 Bytes
from datasets import load_dataset
import re
def arxiv_remove_version_suffix(arxiv_id):
    """Strip a trailing version marker from an arXiv identifier.

    A version marker is a lowercase ``v`` followed by one or more digits at
    the end of the string (for example ``2101.00001v2`` -> ``2101.00001``).

    Args:
        arxiv_id: The identifier to clean. May be ``None`` or a non-string
            value.

    Returns:
        The identifier without its version suffix when ``arxiv_id`` is a
        string; otherwise the input itself, unchanged (``None`` stays
        ``None``).
    """
    # Anything that is not a string (None included) cannot be handled by the
    # regex, so it is passed straight through.
    if not isinstance(arxiv_id, str):
        return arxiv_id
    return re.sub(r'v\d+$', '', arxiv_id)
# Load datasets
def load_and_process(dataset_name):
    """Load the ``train`` split of a dataset as a pandas DataFrame.

    If the resulting frame has an ``arxiv_id`` column, every value in it is
    passed through ``arxiv_remove_version_suffix`` so identifiers no longer
    carry a version suffix.

    Args:
        dataset_name: Name forwarded to ``load_dataset``.

    Returns:
        The DataFrame, with ``arxiv_id`` cleaned when that column exists.
    """
    train_split = load_dataset(dataset_name, split="train")
    frame = train_split.to_pandas()

    # No identifier column means there is nothing to normalise.
    if 'arxiv_id' not in frame.columns:
        return frame

    cleaned_ids = frame['arxiv_id'].apply(arxiv_remove_version_suffix)
    frame['arxiv_id'] = cleaned_ids
    return frame