|
import requests
|
|
import re
|
|
import os
|
|
import dotenv
|
|
from huggingface_hub import HfApi
|
|
|
|
# Run python-dotenv first so the settings read below can also be supplied
# through it, not only through the real process environment.
dotenv.load_dotenv()

# Both values are None when the corresponding variable is not set.
hf_token = os.environ.get("HF_API_TOKEN")    # handed to HfApi below
access_key = os.environ.get("ACCESS_KEY")    # checked by reset_datafiles()

# Shared Hugging Face Hub client used by the download/upload helpers.
api = HfApi(token=hf_token)
|
|
|
|
def fetch_page(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the raw response body.

    The request is sent with a fixed Googlebot-style ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without
            a timeout a server that stops responding would block the
            caller indefinitely.

    Returns:
        The response body as ``bytes`` (``Response.content``). The HTTP
        status code is not inspected, so error pages are returned as-is.

    Raises:
        Any exception raised by ``requests.get`` propagates to the caller;
        nothing is caught here.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
|
|
|
|
def check_data_in_file(data, file):
    """Report whether *data* is already recorded as a line of *file*.

    The file is read in full and split into lines; *data* must equal one
    complete line exactly (a substring of a line does not count).

    Args:
        data: The entry to look for, without a trailing newline.
        file: Path of the text file holding one entry per line.

    Returns:
        True if some line equals *data*, otherwise False. A file that does
        not exist is treated as containing no entries, so False is returned.
    """
    try:
        with open(file, 'r') as f:
            existing_data = f.read().splitlines()
    except FileNotFoundError:
        # write_data_to_file() opens in append mode and therefore creates the
        # file on first write; before that nothing has been recorded yet.
        return False
    return data in existing_data
|
|
|
|
def write_data_to_file(data, file):
    """Append *data* as a new line at the end of *file*.

    The file is opened in append mode, so existing content is kept and the
    file is created if it is missing.

    Args:
        data: String to record; a newline is added after it.
        file: Path of the file to append to.

    Returns:
        Always True once the line has been written.
    """
    line = data + '\n'
    with open(file, 'a') as handle:
        handle.write(line)
    return True
|
|
|
|
# Characters a "simple" title may consist of: ASCII letters and digits,
# whitespace, and a fixed list of punctuation. Compiled once at import time.
_SIMPLE_TITLE_PATTERN = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')


def verify_simple_title(title):
    """Tell whether *title* is made up only of the allowed simple characters.

    Args:
        title: The title string to validate.

    Returns:
        True when the title is non-empty and every character belongs to the
        allowed set; False otherwise. Characters outside the set include
        accented letters as well as ``!``, ``&`` and ``$``.
    """
    return _SIMPLE_TITLE_PATTERN.match(title) is not None
|
|
|
|
def download_datafile(filename):
    """Fetch *filename* from the ReXploreData dataset repo into the cwd.

    The download is always forced, and both the local and cache directories
    are set to the current directory.

    Args:
        filename: Name of the file inside the dataset repository.

    Returns:
        True if the download call completed, False if it raised. The error
        text is printed instead of being propagated.
    """
    request = {
        "repo_id": "raannakasturi/ReXploreData",
        "filename": filename,
        "repo_type": "dataset",
        "local_dir": '.',
        "cache_dir": '.',
        "force_download": True,
    }
    try:
        api.hf_hub_download(**request)
    except Exception as err:
        print(str(err))
        return False
    return True
|
|
|
|
def upload_datafile(filename):
    """Push the local file *filename* to the ReXploreData dataset repo.

    The file is uploaded under the same path it has locally and the local
    copy is deleted afterwards.

    Args:
        filename: Local path of the file; also used as the path in the repo.

    Returns:
        True if both the upload and the local delete succeeded. False if
        either step raised; the error text is printed and, when the upload
        itself failed, the local file is left in place.
    """
    target_repo = "raannakasturi/ReXploreData"
    try:
        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=filename,
            repo_id=target_repo,
            repo_type="dataset",
        )
        # Kept inside the try on purpose: a failed delete is reported as False.
        os.remove(filename)
    except Exception as err:
        print(str(err))
        return False
    return True
|
|
|
|
def reset_datafiles(user_access_key):
    """Replace the tracked data files in the dataset repo with empty files.

    For each of ``arxiv.txt`` and ``pmc.txt`` an empty local file is created
    and handed to ``upload_datafile``. No prior download is performed: the
    file is truncated anyway, so its previous content never reaches the
    upload.

    Args:
        user_access_key: Key supplied by the caller; must equal the
            module-level ``access_key``.

    Returns:
        The string ``"Invalid access key"`` when the key is rejected.
        Otherwise True if every file was reset and uploaded, or False if
        at least one file could not be created or uploaded.
    """
    import hmac  # function-scope import: only this guarded path needs it

    # Reject outright when no key is configured. A plain `!=` would accept
    # None (or "") as a valid key whenever ACCESS_KEY is unset or empty.
    if not access_key or not isinstance(user_access_key, str):
        return "Invalid access key"
    # Constant-time comparison; bytes are used because compare_digest only
    # accepts str arguments when they are pure ASCII.
    if not hmac.compare_digest(user_access_key.encode("utf-8"),
                               access_key.encode("utf-8")):
        return "Invalid access key"

    all_reset = True
    for filename in ('arxiv.txt', 'pmc.txt'):
        try:
            # Opening in 'w' mode creates or truncates the file to zero bytes.
            with open(filename, 'w'):
                pass
        except OSError as e:
            print(str(e))
            all_reset = False
            continue
        # upload_datafile signals failure through its return value rather
        # than by raising, so the result has to be checked explicitly.
        if not upload_datafile(filename):
            all_reset = False
    return all_reset
|
|
|