Spaces:
Running
Running
File size: 621 Bytes
4e925af df43c05 4e925af df43c05 4e925af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
from datasets import load_dataset
import re
def arxiv_remove_version_suffix(arxiv_id):
    """Strip a trailing version marker (``v<digits>``) from an arXiv identifier.

    Args:
        arxiv_id: The identifier to clean. Only ``str`` values are modified.

    Returns:
        The identifier with a trailing ``v`` + digits removed when *arxiv_id*
        is a string; otherwise *arxiv_id* itself, unchanged. This covers
        ``None`` as well as any other non-string value.

    Examples:
        >>> arxiv_remove_version_suffix("2301.00001v2")
        '2301.00001'
        >>> arxiv_remove_version_suffix("2301.00001")
        '2301.00001'
        >>> arxiv_remove_version_suffix(None) is None
        True
    """
    # Non-strings (None included) cannot be fed to the regex, so they are
    # handed back untouched instead of raising.
    if not isinstance(arxiv_id, str):
        return arxiv_id
    # The pattern is anchored at the end, so a "v<digits>" sequence elsewhere
    # in the identifier is left alone.
    return re.sub(r'v\d+$', '', arxiv_id)
# Load datasets
def load_and_process(dataset_name, split="train"):
    """Load one split of a dataset as a DataFrame with cleaned arXiv ids.

    The dataset is fetched with ``datasets.load_dataset`` and converted via
    ``to_pandas()``. If the result has an ``arxiv_id`` column, each value is
    passed through ``arxiv_remove_version_suffix``; other columns are left
    as loaded.

    Args:
        dataset_name: Dataset identifier forwarded to ``load_dataset``.
        split: Name of the split to load. Defaults to ``"train"``, which is
            the split this function always loaded before the parameter
            existed.

    Returns:
        The converted frame, returned whether or not an ``arxiv_id`` column
        was present.
    """
    data = load_dataset(dataset_name, split=split).to_pandas()
    # Datasets without an arxiv_id column are passed through unmodified.
    if 'arxiv_id' in data.columns:
        data['arxiv_id'] = data['arxiv_id'].apply(arxiv_remove_version_suffix)
    return data
|