"""Utilities for fetching web pages and maintaining plain-text data files.

The data files live in the ``raannakasturi/ReXploreData`` dataset repository
on the Hugging Face Hub and are mirrored locally (one entry per line) while
they are being read or modified.
"""

import hmac
import os
import re
from typing import Union

import dotenv
import requests
from huggingface_hub import HfApi

dotenv.load_dotenv()
hf_token = os.getenv("HF_API_TOKEN")
access_key = os.getenv("ACCESS_KEY")
api = HfApi(token=hf_token)

# Hub repository that stores the data files.
_REPO_ID = "raannakasturi/ReXploreData"
_REPO_TYPE = "dataset"

# Files that reset_datafiles() truncates and re-uploads.
_DATA_FILES = ("arxiv.txt", "pmc.txt")

# Request headers sent by fetch_page(); the User-Agent presents as Googlebot.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
}

# Compiled once at import time instead of on every verify_simple_title() call.
_SIMPLE_TITLE_RE = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')


def fetch_page(url: str, timeout: float = 30) -> bytes:
    """Download *url* and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The undecoded response body. The HTTP status code is not inspected,
        so the body of an error response is returned as-is.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=timeout)
    return response.content


def check_data_in_file(data: str, file: str) -> bool:
    """Return True when *data* is exactly equal to one line of *file*.

    Args:
        data: Entry to look for (compared without its line terminator).
        file: Path of the text file to search.

    Raises:
        OSError: If *file* cannot be opened (for example, it does not exist).
    """
    with open(file, 'r') as f:
        return data in f.read().splitlines()


def write_data_to_file(data: str, file: str) -> bool:
    """Append *data* followed by a newline to *file* and return True.

    The file is created when it does not exist yet.
    """
    with open(file, 'a') as f:
        f.write(data + '\n')
    return True


def verify_simple_title(title: str) -> bool:
    """Return True when *title* contains only "simple" characters.

    A title qualifies when it is non-empty and made up solely of the letters
    a-z/A-Z, digits, whitespace and the punctuation/operator characters
    listed in the module-level pattern.
    """
    return _SIMPLE_TITLE_RE.match(title) is not None


def download_datafile(filename: str) -> bool:
    """Download *filename* from the dataset repository into the working directory.

    The download is always forced, so a previously cached copy is not reused.

    Returns:
        True on success; False when the download raised (the error message
        is printed).
    """
    try:
        api.hf_hub_download(
            repo_id=_REPO_ID,
            filename=filename,
            repo_type=_REPO_TYPE,
            local_dir='.',
            cache_dir='.',
            force_download=True,
        )
        return True
    except Exception as e:
        print(str(e))
        return False


def upload_datafile(filename: str) -> bool:
    """Upload the local *filename* to the dataset repository, then delete it.

    The local copy is removed only after the upload call returned without
    raising, so a failed upload leaves the file in place.

    Returns:
        True on success; False when the upload or the removal raised (the
        error message is printed).
    """
    try:
        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=filename,
            repo_id=_REPO_ID,
            repo_type=_REPO_TYPE,
        )
        os.remove(filename)
        return True
    except Exception as e:
        print(str(e))
        return False


def _is_valid_access_key(candidate) -> bool:
    """Return True when *candidate* matches the configured ``access_key``."""
    # An unset or empty ACCESS_KEY must never authorise anyone; with a plain
    # ``!=`` comparison a caller passing None would match an unset key.
    if not access_key or not isinstance(candidate, str):
        return False
    # Constant-time comparison; bytes are used because compare_digest()
    # rejects non-ASCII str arguments.
    return hmac.compare_digest(
        candidate.encode("utf-8", "surrogatepass"),
        access_key.encode("utf-8", "surrogatepass"),
    )


def reset_datafiles(user_access_key: str) -> Union[bool, str]:
    """Replace every data file in the repository with an empty file.

    Args:
        user_access_key: Key supplied by the caller; must match the
            ``ACCESS_KEY`` environment value read at import time.

    Returns:
        The string ``"Invalid access key"`` when the key does not match (or
        no key is configured); otherwise True when every file was emptied and
        uploaded, False when at least one file failed.
    """
    if not _is_valid_access_key(user_access_key):
        return "Invalid access key"

    all_reset = True
    for filename in _DATA_FILES:
        try:
            # The download result is deliberately not checked: the local
            # file is truncated (or created) right after, regardless.
            download_datafile(filename)
            with open(filename, 'w'):
                pass
            if not upload_datafile(filename):
                all_reset = False
        except Exception as e:
            # Keep going so one failing file does not block the others.
            print(str(e))
            all_reset = False
    return all_reset