File size: 4,484 Bytes
4596869 cf0645c 4596869 cf0645c 4596869 868658d 4596869 cf0645c 4596869 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import pandas as pd
import arxiv
import requests
from pinecone import Pinecone, ServerlessSpec
import logging
import os
# Resolve the directory that contains this file and make it the process's
# working directory, so the relative path 'arxiv-scrape.csv' used by the
# functions below is read and written next to the script.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)
def get_zotero_ids(api_key, library_id, tag):
    """Return the arXiv identifiers of the Zotero items carrying ``tag``.

    Args:
        api_key: Zotero API key, sent as a bearer token.
        library_id: Id of the Zotero user library to query.
        tag: Tag the items must carry. It is passed as a query parameter so
            that spaces and reserved characters are URL-encoded.

    Returns:
        A list with each item's ``archiveID`` with the ``'arXiv:'`` prefix
        removed. Items that have no (or an empty) ``archiveID`` are skipped.

    Raises:
        requests.HTTPError: If the Zotero API answers with an error status.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    url = f'https://api.zotero.org/users/{library_id}/items'
    headers = {'Authorization': 'Bearer ' + api_key}
    # NOTE: a single request is made; any further result pages the API may
    # offer are not followed.
    response = requests.get(url, headers=headers, params={'tag': tag}, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body below.
    response.raise_for_status()

    arxiv_ids = []
    for item in response.json():
        archive_id = item.get('data', {}).get('archiveID')
        if archive_id:
            arxiv_ids.append(archive_id.replace('arXiv:', ''))
    return arxiv_ids
def get_arxiv_papers(ids=None, category=None, comment=None):
    """Fetch paper metadata from arXiv into a DataFrame.

    Two query modes are supported:

    * ``category`` is None: look up exactly the papers listed in ``ids``.
    * ``category`` is given: fetch the 15 most recently submitted papers of
      that category, optionally restricted by ``comment`` (added to the
      query as ``co:<comment>``).

    Args:
        ids: List of arXiv ids. Used for the lookup only when ``category``
            is None.
        category: arXiv category used to build the ``cat:`` query.
        comment: Optional value for the ``co:`` query field; ignored when
            ``category`` is None.

    Returns:
        DataFrame with the columns ``Title``, ``Abstract`` (newlines replaced
        by spaces), ``Date`` (``YYYY-MM-DD`` of publication) and ``id`` (the
        result's ``entry_id``).

    Raises:
        ValueError: If neither ``ids`` nor ``category`` is given.

    Side effects:
        Sets the 'arxiv' logger to WARNING. When ``ids`` is non-empty the
        frame is also written to 'arxiv-scrape.csv' in the working directory.
    """
    # Validate first: the id lookup below calls len(ids), which would fail
    # with a TypeError before this check could ever report the real problem.
    if ids is None and category is None:
        raise ValueError('not a valid query')

    logging.getLogger('arxiv').setLevel(logging.WARNING)
    client = arxiv.Client()

    if category is None:
        search = arxiv.Search(id_list=ids, max_results=len(ids))
    else:
        custom_query = f'cat:{category}'
        if comment is not None:
            custom_query += f' AND co:{comment}'
        search = arxiv.Search(
            query=custom_query,
            max_results=15,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

    # Run the search once and reuse the results for every column; iterating
    # client.results() per column repeats the fetch and could misalign rows.
    results = list(client.results(search))
    df = pd.DataFrame({
        'Title': [result.title for result in results],
        'Abstract': [result.summary.replace('\n', ' ') for result in results],
        'Date': [result.published.date().strftime('%Y-%m-%d') for result in results],
        'id': [result.entry_id for result in results],
    })

    if ids:
        df.to_csv('arxiv-scrape.csv', index=False)
    return df
def get_hf_embeddings(api_key, df):
    """Embed each paper's title and abstract with the ``malteos/scincl`` model.

    Every row is sent to the Hugging Face Inference API as
    ``Title + '[SEP]' + Abstract``. If the first attempt is answered with
    HTTP 503, the request is retried once with ``wait_for_model`` enabled.

    Args:
        api_key: Hugging Face API token, sent as a bearer token.
        df: Object with equally long ``'Title'`` and ``'Abstract'`` columns
            of strings (e.g. the frame built by ``get_arxiv_papers``).

    Returns:
        Tuple ``(embeddings, dim)``: the parsed JSON response and the length
        of its first entry. Empty input returns ``([], 0)`` without calling
        the API.

    Raises:
        requests.HTTPError: If the final response has an error status.
        requests.Timeout: If the API does not answer in time.
    """
    title_abs = [title + '[SEP]' + abstract for title, abstract in zip(df['Title'], df['Abstract'])]
    if not title_abs:
        # Nothing to embed; also avoids indexing embeddings[0] on an empty reply.
        return [], 0

    API_URL = "https://api-inference.huggingface.co/models/malteos/scincl"
    headers = {"Authorization": f"Bearer {api_key}"}

    response = requests.post(
        API_URL,
        headers=headers,
        json={"inputs": title_abs, "options": {"wait_for_model": False}},
        timeout=60,
    )
    if response.status_code == 503:
        # Retry once and let the API hold the request until it can serve it;
        # the longer timeout leaves room for that wait.
        response = requests.post(
            API_URL,
            headers=headers,
            json={"inputs": title_abs, "options": {"wait_for_model": True}},
            timeout=300,
        )
    # Surface HTTP errors here rather than as an obscure failure when the
    # error payload is indexed like a list of vectors.
    response.raise_for_status()

    embeddings = response.json()
    return embeddings, len(embeddings[0])
def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
    """Create a new Pinecone serverless index and upload the embeddings to it.

    Args:
        api_key: Pinecone API key.
        index: Name of the index to create.
        namespace: Namespace the vectors are upserted into.
        embeddings: Sequence of embedding vectors, in the same order as the
            rows of ``df``.
        dim: Dimension of the vectors; used as the index dimension.
        df: Object with an ``'id'`` column; each value becomes the id of the
            vector at the same position.

    Returns:
        The string ``'Index name : <index> already exists'`` if an index of
        that name already exists (nothing is created or uploaded in that
        case); otherwise the result of the ``upsert`` call.
    """
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        logging.warning(f'Index name : {index} already exists.')
        return f'Index name : {index} already exists'

    # Pair ids and vectors by position rather than by DataFrame label, so a
    # frame whose index is not 0..n-1 still maps each id to its own vector.
    # If the lengths differ, the extra ids or vectors are dropped.
    # Built before create_index so a bad input does not leave an empty index.
    vectors = [
        {'id': paper_id, 'values': embedding}
        for paper_id, embedding in zip(df['id'], embeddings)
    ]

    pc.create_index(
        name=index,
        dimension=dim,
        metric="cosine",
        deletion_protection="disabled",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    return pc.Index(index).upsert(vectors=vectors, namespace=namespace)
def get_new_papers(df):
    """Return the rows of ``df`` that are not already in 'arxiv-scrape.csv'.

    A row counts as already known when it matches a row of the CSV on every
    column the two frames share.

    Args:
        df: DataFrame of candidate papers. It is not modified.

    Returns:
        A new DataFrame holding only the unseen rows, indexed 0..n-1, or the
        string ``'No New Papers Found'`` when every row is already known.
    """
    df_main = pd.read_csv('arxiv-scrape.csv')

    # Work on a re-indexed copy: the caller's frame stays untouched, and this
    # also works when the incoming index has a name.
    candidates = df.reset_index(drop=True)
    union_df = candidates.merge(df_main, how='left', indicator=True)

    # 'left_only' marks rows present in the candidates but not in the CSV.
    # Re-index so callers that address rows by position (0, 1, ...) line up
    # with the embeddings computed from this frame.
    new_papers = (
        union_df[union_df['_merge'] == 'left_only']
        .drop(columns=['_merge'])
        .reset_index(drop=True)
    )

    if new_papers.empty:
        return 'No New Papers Found'
    # NOTE: new papers are deliberately not appended to 'arxiv-scrape.csv';
    # persistence of recommended papers was removed for the demo.
    return new_papers
def recommend_papers(api_key, index, namespace, embeddings, df, threshold):
    """Pick the papers whose embeddings are close to the vectors in Pinecone.

    For each embedding the three nearest vectors in ``namespace`` are
    queried. A paper is recommended when the sum of those match scores is
    greater than ``threshold``.

    Args:
        api_key: Pinecone API key.
        index: Name of an existing Pinecone index.
        namespace: Namespace to query.
        embeddings: Sequence of query vectors, in the same order as the rows
            of ``df``.
        df: Object with an ``'id'`` column; the value at each position is
            used as the paper URL in the output.
        threshold: Minimum (exclusive) summed score for a recommendation.

    Returns:
        One line per recommended paper, joined by newlines, or the string
        ``'No Interesting Paper'`` when nothing passes the threshold. The
        score shown is the summed score divided by 3.

    Raises:
        ValueError: If ``index`` does not exist.
    """
    pc = Pinecone(api_key=api_key)
    if index not in pc.list_indexes().names():
        raise ValueError(f"{index} doesnt exist. Project isnt initialized properly")
    pinecone_index = pc.Index(index)

    # One constant drives both the query size and the divisor of the
    # reported score, so the two cannot drift apart.
    top_k = 3
    results = []
    # Pair URLs and embeddings by position, not by DataFrame label, so a
    # filtered frame with a non-contiguous index still lines up.
    for paper_url, embedding in zip(df['id'], embeddings):
        result = pinecone_index.query(
            namespace=namespace,
            vector=embedding,
            top_k=top_k,
            include_values=False,
        )
        sum_score = sum(match['score'] for match in result['matches'])
        if sum_score > threshold:
            results.append(
                f"Paper-URL : [{paper_url}]({paper_url}) with score: {sum_score / top_k} <br />"
            )

    if results:
        return '\n'.join(results)
    return 'No Interesting Paper'
|