File size: 621 Bytes
4e925af
df43c05
4e925af
 
 
df43c05
 
 
 
 
 
 
 
 
4e925af
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from datasets import load_dataset
import re


def arxiv_remove_version_suffix(arxiv_id):
    """Strip a trailing version marker (e.g. ``v2``) from an arXiv identifier.

    Args:
        arxiv_id: The identifier to clean. Only ``str`` values are modified.

    Returns:
        For a string, the same text with a trailing ``v<digits>`` removed
        (unchanged if no such suffix is present). ``None`` and values of any
        other type are returned as-is.
    """
    # Guard clause: None and non-string values pass through untouched,
    # since the regex substitution below only makes sense for text.
    if not isinstance(arxiv_id, str):
        return arxiv_id
    return re.sub(r'v\d+$', '', arxiv_id)


# Load datasets
def load_and_process(dataset_name):
    """Load the ``train`` split of a dataset and normalize its arXiv IDs.

    Args:
        dataset_name: Name passed straight through to ``load_dataset``.

    Returns:
        The result of calling ``to_pandas()`` on the loaded split. When an
        ``arxiv_id`` column is present, every value in it has been passed
        through ``arxiv_remove_version_suffix``; otherwise the data is
        returned exactly as loaded.
    """
    train_split = load_dataset(dataset_name, split="train")
    frame = train_split.to_pandas()

    # Nothing to clean when the dataset carries no arXiv identifiers.
    if 'arxiv_id' not in frame.columns:
        return frame

    frame['arxiv_id'] = frame['arxiv_id'].apply(arxiv_remove_version_suffix)
    return frame