# Spaces:
# Running
# Running
import re

from datasets import load_dataset
def arxiv_remove_version_suffix(arxiv_id):
    """Strip a trailing arXiv version marker (``v<digits>``) from an identifier.

    Args:
        arxiv_id: The identifier to clean. Only ``str`` values are modified.

    Returns:
        ``None`` when ``arxiv_id`` is ``None``. For a string, the same text
        with a final ``v<digits>`` removed (unchanged if there is none). Any
        other value is returned as-is.
    """
    # Non-string inputs (None included) pass through untouched, so the
    # function can be applied to values of mixed type without raising.
    if not isinstance(arxiv_id, str):
        return arxiv_id
    return re.sub(r'v\d+$', '', arxiv_id)
# Load datasets
def load_and_process(dataset_name):
    """Load a dataset's ``train`` split as a DataFrame and normalise arXiv ids.

    Args:
        dataset_name: Name passed straight to ``load_dataset``.

    Returns:
        The ``train`` split converted with ``to_pandas()``. When an
        ``arxiv_id`` column exists, each of its values has been passed
        through ``arxiv_remove_version_suffix``.
    """
    frame = load_dataset(dataset_name, split="train").to_pandas()
    # The column is optional, so it is only rewritten when present.
    if 'arxiv_id' in frame.columns:
        frame['arxiv_id'] = frame['arxiv_id'].apply(arxiv_remove_version_suffix)
    return frame