# raannakasturi's picture
# Refactor imports and consolidate utility functions into utils.py; remove tools.py
# 2ce7bcb
import requests
import re
import os
import dotenv
from huggingface_hub import HfApi
# Load environment variables via python-dotenv before they are read below
# (presumably from a local .env file, when one exists).
dotenv.load_dotenv()
# Both values are None when the corresponding environment variable is unset.
hf_token = os.getenv("HF_API_TOKEN")
# Shared secret that reset_datafiles() compares against the caller's key.
access_key = os.getenv("ACCESS_KEY")
# Hugging Face Hub client shared by download_datafile() and upload_datafile().
api = HfApi(token=hf_token)
def fetch_page(url, timeout=30):
    """Fetch a URL and return the raw response body.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the caller forever.

    Returns:
        The response body as bytes. The HTTP status code is not checked, so
        the body of an error page is returned as-is.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
def check_data_in_file(data, file):
    """Report whether ``data`` is exactly one of the lines stored in ``file``.

    Args:
        data: Value to look for; it is compared against whole lines, with
            line terminators already stripped.
        file: Path of the text file to read.

    Returns:
        True if some line equals ``data``, otherwise False.
    """
    with open(file, 'r') as handle:
        lines = handle.read().splitlines()
    return data in lines
def write_data_to_file(data, file):
    """Append ``data`` to ``file`` as one newline-terminated line.

    Args:
        data: String to append.
        file: Path of the file to append to; it is opened in append mode, so
            it is created when it does not exist yet.

    Returns:
        Always True once the write has completed.
    """
    with open(file, 'a') as handle:
        line = data + '\n'
        handle.write(line)
    return True
# Whitelist of characters a "simple" title may contain: ASCII letters, digits,
# whitespace and the punctuation listed in the character class.
_SIMPLE_TITLE_PATTERN = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')


def verify_simple_title(title):
    """Check that ``title`` is non-empty and uses only whitelisted characters.

    Args:
        title: Title string to validate.

    Returns:
        True when every character of ``title`` is in the whitelist and the
        title has at least one character, otherwise False.
    """
    return _SIMPLE_TITLE_PATTERN.match(title) is not None
def download_datafile(filename):
    """Download ``filename`` from the ReXploreData dataset repo.

    The file is fetched with ``force_download=True``, using the current
    directory as both ``local_dir`` and ``cache_dir``.

    Args:
        filename: Name of the file inside the dataset repository.

    Returns:
        True if the download call completed, False if it raised; the error
        message is printed in that case.
    """
    try:
        api.hf_hub_download(
            repo_id="raannakasturi/ReXploreData",
            filename=filename,
            repo_type="dataset",
            local_dir='.',
            cache_dir='.',
            force_download=True,
        )
    except Exception as err:
        print(str(err))
        return False
    return True
def upload_datafile(filename):
    """Upload a local file to the ReXploreData dataset repo, then delete it.

    The file is stored in the repository under the same path it has locally.
    The local copy is removed only after the upload call returns.

    Args:
        filename: Path of the local file to upload.

    Returns:
        True if both the upload and the local removal succeeded, False if
        either raised; the error message is printed in that case.
    """
    try:
        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=filename,
            repo_id="raannakasturi/ReXploreData",
            repo_type="dataset",
        )
        os.remove(filename)
    except Exception as err:
        print(str(err))
        return False
    return True
def _access_key_matches(candidate, expected):
    """Return True only if ``candidate`` equals a configured, non-empty key."""
    import hmac  # local import keeps this block self-contained

    # An unset (None) or empty configured key must never authorize anyone;
    # with a plain equality test, None == None would grant access.
    if not isinstance(expected, str) or not expected:
        return False
    if not isinstance(candidate, str):
        return False
    # Constant-time comparison avoids leaking the key through timing.
    # Encoding to bytes lets compare_digest accept non-ASCII input.
    return hmac.compare_digest(candidate.encode("utf-8"), expected.encode("utf-8"))


def reset_datafiles(user_access_key):
    """Empty the tracked data files and upload the empty versions.

    Args:
        user_access_key: Key supplied by the caller; it must match the
            module-level ``access_key``.

    Returns:
        The string "Invalid access key" when the key is wrong or no key is
        configured. Otherwise True if every file was emptied and uploaded,
        or False if any of them failed.
    """
    if not _access_key_matches(user_access_key, access_key):
        return "Invalid access key"

    all_reset = True
    for filename in ('arxiv.txt', 'pmc.txt'):
        try:
            # The download result is deliberately ignored: the file is
            # truncated (or created) right below whether or not it arrived.
            download_datafile(filename)
            with open(filename, 'w'):
                pass  # opening in 'w' mode is what empties the file
            if not upload_datafile(filename):
                all_reset = False
        except Exception as e:
            print(str(e))
            all_reset = False
    return all_reset