Spaces:

huggingface
/

paper-central

Running

paper-central / utils.py

jbdel

author, github

df43c05 about 2 months ago

621 Bytes

	from datasets import load_dataset
	import re


	def arxiv_remove_version_suffix(arxiv_id):
	    """Return *arxiv_id* without a trailing version marker.

	    A version marker is a ``v`` followed by one or more digits at the
	    end of the string (for example ``"2301.00001v2"`` becomes
	    ``"2301.00001"``). Only ``str`` inputs are rewritten; ``None`` and
	    values of any other type are handed back untouched.
	    """
	    # Anything that is not a string (None included) cannot be matched
	    # against the regex, so it is returned exactly as received.
	    if not isinstance(arxiv_id, str):
	        return arxiv_id
	    return re.sub(r'v\d+$', '', arxiv_id)


	# Load datasets
	def load_and_process(dataset_name):
	    """Load the ``train`` split of *dataset_name* as a pandas DataFrame.

	    The split is fetched with ``load_dataset`` and converted with
	    ``to_pandas()``. When the resulting frame has an ``arxiv_id``
	    column, every value in it is passed through
	    ``arxiv_remove_version_suffix``; frames without that column are
	    returned as loaded.
	    """
	    frame = load_dataset(dataset_name, split="train").to_pandas()
	    # Nothing to normalise when the dataset carries no arXiv identifiers.
	    if 'arxiv_id' not in frame.columns:
	        return frame
	    frame['arxiv_id'] = frame['arxiv_id'].apply(arxiv_remove_version_suffix)
	    return frame