# Agents_AI / research_agent.py
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import urllib.parse
import time
from scholarly import scholarly
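
# Pipeline: extract search keywords with an LLM, gather candidate papers from
# Google Scholar (falling back to a site-restricted Google search), scrape each
# paper's title and abstract, then answer the question with a RetrievalQA chain
# over a Chroma vector store built from the scraped text.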
class ResearchAgent:
    def __init__(self, openai_api_key: str):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        self.openai_api_key = openai_api_key
        self.scientific_domains = [
            'sciencedirect.com',
            'springer.com',
            'nature.com',
            'ncbi.nlm.nih.gov',
            'wiley.com',
            'scielo.org',
            'frontiersin.org',
            'mdpi.com',
            'hindawi.com',
            'tandfonline.com'
        ]

    def extract_keywords(self, question: str) -> str:
        # Ask the LLM to distill the question into a short search query
        llm = OpenAI(openai_api_key=self.openai_api_key)
        prompt = f"Extract the 3-4 most important scientific search terms from this question, provided as a space-separated list: {question}"
        return llm.predict(prompt)
    def search_papers(self, keywords: str, num_results: int = 10) -> List[Dict]:
        # First try Google Scholar
        try:
            search_query = scholarly.search_pubs(keywords)
            papers = []
            for i in range(num_results):
                try:
                    paper = next(search_query)
                    if paper.get('pub_url'):
                        papers.append({
                            'title': paper.get('bib', {}).get('title', ''),
                            'url': paper.get('pub_url'),
                            'abstract': paper.get('bib', {}).get('abstract', '')
                        })
                except StopIteration:
                    break
        except Exception as e:
            print(f"Scholar search failed: {str(e)}")
            papers = []

        # Fall back to a regular Google search if Scholar returned too few results
        if len(papers) < num_results:
            remaining = num_results - len(papers)
            site_filter = ' OR '.join(f'site:{domain}' for domain in self.scientific_domains)
            search_query = f"{keywords} filetype:pdf ({site_filter})"
            encoded_query = urllib.parse.quote(search_query)
            search_url = f"https://www.google.com/search?q={encoded_query}&num={remaining*2}"  # Request extra results, as some will fail
            try:
                response = requests.get(search_url, headers=self.headers, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                for result in soup.find_all('a'):
                    if len(papers) >= num_results:
                        break
                    link = result.get('href', '')
                    if any(domain in link for domain in self.scientific_domains):
                        clean_link = re.search(r'https?://[^&]+', link)
                        if clean_link:
                            papers.append({
                                'url': clean_link.group(0),
                                'title': '',     # Will be filled during scraping
                                'abstract': ''   # Will be filled during scraping
                            })
            except Exception as e:
                print(f"Google search failed: {str(e)}")
        return papers
    def scrape_paper(self, paper: Dict) -> Dict[str, str]:
        try:
            response = requests.get(paper['url'], headers=self.headers, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Try different title selectors
            if not paper['title']:
                title_selectors = [
                    'h1',
                    'h1.article-title',
                    'h1.title',
                    'div.article-title',
                    'meta[name="citation_title"]'
                ]
                for selector in title_selectors:
                    title_elem = soup.select_one(selector)
                    if title_elem:
                        paper['title'] = title_elem.get('content', '') or title_elem.text.strip()
                        break

            # Try different abstract/content selectors
            if not paper['abstract']:
                content_selectors = [
                    'div.abstract',
                    'section.abstract',
                    'div#abstract',
                    'meta[name="description"]',
                    'div.paper-content',
                    'div.article-content'
                ]
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        paper['abstract'] = content_elem.get('content', '') or content_elem.text.strip()
                        break

            if not paper['abstract']:
                # Try to get main content
                paragraphs = soup.find_all('p')
                content = ' '.join([p.text.strip() for p in paragraphs[:5]])  # Get first 5 paragraphs
                if content:
                    paper['abstract'] = content

            # Clean up text
            paper['title'] = paper['title'] or "Title not found"
            paper['abstract'] = paper['abstract'] or "Content not found"
            paper['abstract'] = re.sub(r'\s+', ' ', paper['abstract'])
            return paper
        except Exception as e:
            print(f"Error scraping {paper['url']}: {str(e)}")
            return paper
    def perform_research(self, question: str) -> str:
        # Extract keywords
        keywords = self.extract_keywords(question)
        print(f"Keywords extracted: {keywords}")

        # Search for papers
        papers = self.search_papers(keywords)
        print(f"Found {len(papers)} papers")

        # Scrape full content
        articles = []
        for paper in papers:
            article = self.scrape_paper(paper)
            if article and article['abstract'] != "Content not found":
                articles.append(article)
            time.sleep(1)  # Polite delay between requests

        if not articles:
            return "I apologize, but I couldn't find any relevant scientific papers to answer your question. Please try rephrasing your question or using different terms."

        # Prepare documents for RAG
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        texts = []
        for article in articles:
            chunks = text_splitter.split_text(article['abstract'])
            for chunk in chunks:
                texts.append(f"Title: {article['title']}\n\nContent: {chunk}\n\nSource: {article['url']}")

        # Create vector store
        embeddings = HuggingFaceEmbeddings()
        vectorstore = Chroma.from_texts(texts, embeddings)

        # Create QA chain with a more specific prompt
        qa_chain = RetrievalQA.from_chain_type(
            llm=OpenAI(openai_api_key=self.openai_api_key, temperature=0.3),
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True
        )

        # Get answer with references
        result = qa_chain({"query": f"""Based on the provided scientific papers, please answer this question: {question}
            If you can't find a direct answer, summarize the most relevant information from the papers.
            Include specific findings, data, and methodology when available."""})
        answer = result['result']

        # Format response with article summaries
        response = f"Answer: {answer}\n\nReferences:\n\n"
        for article in articles:
            response += f"Title: {article['title']}\nURL: {article['url']}\n\n"
        return response
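

# Minimal usage sketch (illustrative): assumes a valid OpenAI API key in the
# OPENAI_API_KEY environment variable and outbound network access to Google
# Scholar and the publisher sites; the question below is only an example.
if __name__ == "__main__":
    import os

    agent = ResearchAgent(openai_api_key=os.environ["OPENAI_API_KEY"])
    report = agent.perform_research(
        "What is the effect of intermittent fasting on blood glucose levels?"
    )
    print(report)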