import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import urllib.parse
import time
from scholarly import scholarly
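
# Note: these imports assume the legacy (pre-0.1) LangChain package layout; on newer
# releases HuggingFaceEmbeddings, Chroma, and OpenAI live in langchain-community /
# langchain-openai instead. The scholarly, chromadb, sentence-transformers, and openai
# packages are also assumed to be installed.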

class ResearchAgent:
    def __init__(self, openai_api_key: str):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        self.openai_api_key = openai_api_key
        self.scientific_domains = [
            'sciencedirect.com',
            'springer.com',
            'nature.com',
            'ncbi.nlm.nih.gov',
            'wiley.com',
            'scielo.org',
            'frontiersin.org',
            'mdpi.com',
            'hindawi.com',
            'tandfonline.com'
        ]

    def extract_keywords(self, question: str) -> str:
        llm = OpenAI(openai_api_key=self.openai_api_key)
        prompt = f"Extract 3-4 most important scientific search terms from this question, provide them as a space-separated list: {question}"
        return llm.predict(prompt)

    def search_papers(self, keywords: str, num_results: int = 10) -> List[Dict]:
        # First try Google Scholar
        try:
            search_query = scholarly.search_pubs(keywords)
            papers = []
            for i in range(num_results):
                try:
                    paper = next(search_query)
                    if paper.get('pub_url'):
                        papers.append({
                            'title': paper.get('bib', {}).get('title', ''),
                            'url': paper.get('pub_url'),
                            'abstract': paper.get('bib', {}).get('abstract', '')
                        })
                except StopIteration:
                    break
        except Exception as e:
            print(f"Scholar search failed: {str(e)}")
            papers = []

        # Fallback to regular Google search if needed
        if len(papers) < num_results:
            remaining = num_results - len(papers)
search_query = f"{keywords} filetype:pdf site:({' OR site:'.join(self.scientific_domains)})" | |
encoded_query = urllib.parse.quote(search_query) | |
search_url = f"https://www.google.com/search?q={encoded_query}&num={remaining*2}" # Get more results as some might fail | |
            try:
                response = requests.get(search_url, headers=self.headers, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')

                for result in soup.find_all('a'):
                    if len(papers) >= num_results:
                        break
                    link = result.get('href', '')
                    if any(domain in link for domain in self.scientific_domains):
                        clean_link = re.search(r'https?://[^&]+', link)
                        if clean_link:
                            papers.append({
                                'url': clean_link.group(0),
                                'title': '',  # Will be filled during scraping
                                'abstract': ''  # Will be filled during scraping
                            })
            except Exception as e:
                print(f"Google search failed: {str(e)}")

        return papers

    def scrape_paper(self, paper: Dict) -> Dict[str, str]:
        try:
            response = requests.get(paper['url'], headers=self.headers, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Try different title selectors, most specific first so the generic h1
            # is only used as a last resort
            if not paper['title']:
                title_selectors = [
                    'meta[name="citation_title"]',
                    'h1.article-title',
                    'h1.title',
                    'div.article-title',
                    'h1'
                ]
                for selector in title_selectors:
                    title_elem = soup.select_one(selector)
                    if title_elem:
                        paper['title'] = title_elem.get('content', '') or title_elem.text.strip()
                        break
            # Try different abstract/content selectors
            if not paper['abstract']:
                content_selectors = [
                    'div.abstract',
                    'section.abstract',
                    'div#abstract',
                    'meta[name="description"]',
                    'div.paper-content',
                    'div.article-content'
                ]
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        paper['abstract'] = content_elem.get('content', '') or content_elem.text.strip()
                        break

            if not paper['abstract']:
                # Try to get main content
                paragraphs = soup.find_all('p')
                content = ' '.join([p.text.strip() for p in paragraphs[:5]])  # Get first 5 paragraphs
                if content:
                    paper['abstract'] = content

            # Clean up text
            paper['title'] = paper['title'] or "Title not found"
            paper['abstract'] = paper['abstract'] or "Content not found"
            paper['abstract'] = re.sub(r'\s+', ' ', paper['abstract'])

            return paper
        except Exception as e:
            print(f"Error scraping {paper['url']}: {str(e)}")
            return paper

    def perform_research(self, question: str) -> str:
        # Extract keywords
        keywords = self.extract_keywords(question)
        print(f"Keywords extracted: {keywords}")

        # Search for papers
        papers = self.search_papers(keywords)
        print(f"Found {len(papers)} papers")

        # Scrape full content
        articles = []
        for paper in papers:
            article = self.scrape_paper(paper)
            if article and article['abstract'] != "Content not found":
                articles.append(article)
            time.sleep(1)  # Polite delay between requests

        if not articles:
            return "I apologize, but I couldn't find any relevant scientific papers to answer your question. Please try rephrasing your question or using different terms."

        # Prepare documents for RAG
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        texts = []
        for article in articles:
            chunks = text_splitter.split_text(article['abstract'])
            for chunk in chunks:
                texts.append(f"Title: {article['title']}\n\nContent: {chunk}\n\nSource: {article['url']}")

        # Create vector store
        embeddings = HuggingFaceEmbeddings()
        vectorstore = Chroma.from_texts(texts, embeddings)

        # Create QA chain with a more specific prompt
        qa_chain = RetrievalQA.from_chain_type(
            llm=OpenAI(openai_api_key=self.openai_api_key, temperature=0.3),
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True
        )
        # Get answer with references
        result = qa_chain({"query": f"""Based on the provided scientific papers, please answer this question: {question}
        If you can't find a direct answer, summarize the most relevant information from the papers.
        Include specific findings, data, and methodology when available."""})
        answer = result['result']

        # Format response with article summaries
        response = f"Answer: {answer}\n\nReferences:\n\n"
        for article in articles:
            response += f"Title: {article['title']}\nURL: {article['url']}\n\n"

        return response
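

# Minimal usage sketch (not part of the original class): assumes OPENAI_API_KEY is set
# in the environment and that the packages noted near the imports are installed; the
# example question is only an illustrative placeholder.
if __name__ == "__main__":
    import os

    agent = ResearchAgent(openai_api_key=os.environ["OPENAI_API_KEY"])
    report = agent.perform_research(
        "What are the effects of microplastics on marine invertebrates?"
    )
    print(report)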