import re

from datasets import load_dataset

# Matches a trailing arXiv version marker such as "v2" at the end of the id.
# Compiled once because the cleaner below is applied to every row of a column.
_VERSION_SUFFIX_RE = re.compile(r'v\d+$')


def arxiv_remove_version_suffix(arxiv_id):
    """Return *arxiv_id* with a trailing ``v<digits>`` version suffix removed.

    Args:
        arxiv_id: The identifier to clean. Only ``str`` values are modified.

    Returns:
        The identifier without its trailing version suffix when *arxiv_id*
        is a string (e.g. ``"2101.00001v3"`` -> ``"2101.00001"``). Any other
        value, including ``None``, is returned unchanged so that missing or
        oddly typed cells pass through without raising.
    """
    if isinstance(arxiv_id, str):
        return _VERSION_SUFFIX_RE.sub('', arxiv_id)
    # None and unexpected types are deliberately passed through untouched.
    return arxiv_id


# Load datasets
def load_and_process(dataset_name, split="train"):
    """Load one split of a dataset and normalise its ``arxiv_id`` column.

    The split is fetched with ``load_dataset`` and converted with
    ``.to_pandas()``. If the result has an ``arxiv_id`` column, every value
    in it is passed through :func:`arxiv_remove_version_suffix`; otherwise
    the data is returned as loaded.

    Args:
        dataset_name: Name or path forwarded to ``load_dataset``.
        split: Which split to load. Defaults to ``"train"``, the value that
            was previously hard-coded. It should name a single split, since
            the result is converted with ``.to_pandas()``.

    Returns:
        The object produced by ``.to_pandas()`` for the requested split, with
        version suffixes stripped from ``arxiv_id`` when that column exists.
    """
    data = load_dataset(dataset_name, split=split).to_pandas()
    if 'arxiv_id' in data.columns:
        data['arxiv_id'] = data['arxiv_id'].apply(arxiv_remove_version_suffix)
    return data