Spaces:

Superkingjcj
/

Lagent

Sleeping

App Files Files Community

Lagent / lagent /actions /google_scholar_search.py

Superkingjcj

Upload 111 files

e679d69 verified 7 days ago

raw

history blame

20.5 kB

	# flake8: noqa: E501
	import os
	from typing import Optional, Type

	from asyncer import asyncify

	from lagent.actions.base_action import AsyncActionMixin, BaseAction, tool_api
	from lagent.schema import ActionReturn, ActionStatusCode
	from .parser import BaseParser, JsonParser


	class GoogleScholar(BaseAction):
	    """Plugin for google scholar search.

	    Args:
	        api_key (str): API key handed to the ``serpapi`` client. When the
	            ``SERPER_API_KEY`` environment variable is set, its value is
	            used instead of this argument.
	        description (dict): The description of the action. Defaults to ``None``.
	        parser (Type[BaseParser]): The parser class to process the
	            action's inputs and outputs. Defaults to :class:`JsonParser`.

	    Raises:
	        ValueError: If neither the environment variable nor ``api_key``
	            provides a key.
	    """

	    def __init__(
	        self,
	        api_key: Optional[str] = None,
	        description: Optional[dict] = None,
	        parser: Type[BaseParser] = JsonParser,
	    ):
	        super().__init__(description, parser)
	        # The environment variable wins: ``api_key`` is only the fallback
	        # used when SERPER_API_KEY is unset.
	        # NOTE(review): every request below goes through the ``serpapi``
	        # package, so this presumably has to be a SerpApi key despite the
	        # variable name and the wording of the error message - confirm.
	        api_key = os.environ.get('SERPER_API_KEY', api_key)
	        if api_key is None:
	            raise ValueError(
	                'Please set Serper API key either in the environment '
	                'as SERPER_API_KEY or pass it as `api_key` parameter.'
	            )
	        self.api_key = api_key

	    @staticmethod
	    def _extract(response: dict, key: str):
	        """Return ``response[key]``, preferring the API's own error text when it is missing.

	        Args:
	            response (dict): decoded response of a search request.
	            key (str): top-level entry the caller needs.

	        Raises:
	            RuntimeError: If ``key`` is absent and the response carries a
	                non-empty ``error`` entry; the message is that entry.
	            KeyError: If ``key`` is absent and there is no ``error`` entry.
	        """
	        # Only consult 'error' when the payload is unusable anyway, so a
	        # response that contains the requested data is never rejected.
	        if key not in response and response.get('error'):
	            raise RuntimeError(response['error'])
	        return response[key]

	    @tool_api(explode_return=True)
	    def search_google_scholar(
	        self,
	        query: str,
	        cites: Optional[str] = None,
	        as_ylo: Optional[int] = None,
	        as_yhi: Optional[int] = None,
	        scisbd: Optional[int] = None,
	        cluster: Optional[str] = None,
	        hl: Optional[str] = None,
	        lr: Optional[str] = None,
	        start: Optional[int] = None,
	        num: Optional[int] = None,
	        as_sdt: Optional[str] = None,
	        safe: Optional[str] = None,
	        filter: Optional[str] = None,
	        as_vis: Optional[str] = None,
	    ) -> dict:
	        """Search for scholarly articles based on a query according to the google scholar.

	        Args:
	            query (str): The query to search for.
	            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
	            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
	            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
	            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
	            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
	            hl (Optional[str]): The language to use for the Google Scholar search.
	            lr (Optional[str]): One or multiple languages to limit the search to.
	            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
	            num (Optional[int]): The maximum number of results to return, limited to 20.
	            as_sdt (Optional[str]): Can be used either as a search type or a filter.
	            safe (Optional[str]): The level of filtering for adult content.
	            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
	            as_vis (Optional[str]): Defines whether to include citations or not.

	        Returns:
	            :class:`dict`: article information
	                - title: a list of the titles of the three selected papers
	                - cited_by: a list of the citation numbers of the three selected papers
	                - organic_id: a list of the organic results' ids of the three selected papers
	                - snippets: a list of the text snippets of the three selected papers
	                - pub_info: a list of the publication information of the three selected papers
	        """
	        from serpapi import GoogleSearch

	        params = {
	            'q': query,
	            'engine': 'google_scholar',
	            'api_key': self.api_key,
	            'cites': cites,
	            'as_ylo': as_ylo,
	            'as_yhi': as_yhi,
	            'scisbd': scisbd,
	            'cluster': cluster,
	            'hl': hl,
	            'lr': lr,
	            'start': start,
	            'num': num,
	            'as_sdt': as_sdt,
	            'safe': safe,
	            'filter': filter,
	            'as_vis': as_vis,
	        }
	        search = GoogleSearch(params)
	        try:
	            # Only the first three organic results are reported.
	            top = self._extract(search.get_dict(), 'organic_results')[:3]
	            # 'title' and 'result_id' stay mandatory; the remaining fields
	            # fall back to '' so one sparse entry does not fail the search.
	            return dict(
	                title=[item['title'] for item in top],
	                cited_by=[item.get('inline_links', {}).get('cited_by', {}).get('total', '') for item in top],
	                organic_id=[item['result_id'] for item in top],
	                snippets=[item.get('snippet', '') for item in top],
	                # Previously collected but dropped from the return value.
	                pub_info=[item.get('publication_info', {}).get('summary', '') for item in top],
	            )
	        except Exception as e:
	            # Any failure is reported to the caller as an ActionReturn
	            # instead of being raised.
	            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

	    @tool_api(explode_return=True)
	    def get_author_information(
	        self,
	        author_id: str,
	        hl: Optional[str] = None,
	        view_op: Optional[str] = None,
	        sort: Optional[str] = None,
	        citation_id: Optional[str] = None,
	        start: Optional[int] = None,
	        num: Optional[int] = None,
	        no_cache: Optional[bool] = None,
	        async_req: Optional[bool] = None,
	        output: Optional[str] = None,
	    ) -> dict:
	        """Search for an author's information by author's id provided by get_author_id.

	        Args:
	            author_id (str): Required. The ID of an author.
	            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
	            view_op (Optional[str]): Used for viewing specific parts of a page.
	            sort (Optional[str]): Used for sorting and refining articles.
	            citation_id (Optional[str]): Used for retrieving individual article citation.
	            start (Optional[int]): Defines the result offset. Default is 0.
	            num (Optional[int]): Defines the number of results to return. Default is 20.
	            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
	            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
	            output (Optional[str]): Defines the final output you want. Default is 'json'.

	        Returns:
	            :class:`dict`: author information
	                * name: author's name
	                * affiliations: the affiliations of the author
	                * website: the author's homepage url
	                * articles: at most 3 articles by the author, each with its title and authors
	        """
	        from serpapi import GoogleSearch

	        params = {
	            'engine': 'google_scholar_author',
	            'author_id': author_id,
	            'api_key': self.api_key,
	            'hl': hl,
	            'view_op': view_op,
	            'sort': sort,
	            'citation_id': citation_id,
	            'start': start,
	            'num': num,
	            'no_cache': no_cache,
	            # The public parameter is ``async_req`` because ``async`` is a
	            # reserved word; the request key itself is 'async'.
	            'async': async_req,
	            'output': output,
	        }
	        try:
	            search = GoogleSearch(params)
	            results = search.get_dict()
	            author = self._extract(results, 'author')
	            # A profile without an article list is still a valid answer.
	            articles = results.get('articles', [])
	            return dict(
	                name=author['name'],
	                affiliations=author.get('affiliations', ''),
	                website=author.get('website', ''),
	                articles=[dict(title=article['title'], authors=article['authors']) for article in articles[:3]],
	            )
	        except Exception as e:
	            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

	    @tool_api(explode_return=True)
	    def get_citation_format(
	        self,
	        q: str,
	        no_cache: Optional[bool] = None,
	        async_: Optional[bool] = None,
	        output: Optional[str] = 'json',
	    ) -> dict:
	        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

	        Args:
	            q (str): ID of an individual Google Scholar organic search result.
	            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
	            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
	            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

	        Returns:
	            :class:`dict`: citation format (currently handed back as the plain citation text, see the note in the body)
	                * citation: the citation format of the article
	        """
	        from serpapi import GoogleSearch

	        params = {
	            'q': q,
	            'engine': 'google_scholar_cite',
	            'api_key': self.api_key,
	            'no_cache': no_cache,
	            'async': async_,
	            'output': output,
	        }
	        try:
	            search = GoogleSearch(params)
	            results = search.get_dict()
	            citations = self._extract(results, 'citations')
	            if not citations:
	                # Clearer than the bare "list index out of range".
	                raise ValueError(f'No citation found for result id {q!r}.')
	            # Only the first entry is used; presumably that is the MLA
	            # style the summary line refers to - TODO confirm.
	            # NOTE(review): the snippet string is returned as-is although
	            # the annotation and docstring advertise a dict. The shape is
	            # kept to avoid breaking callers; decide which contract is
	            # intended before changing either side.
	            return citations[0]['snippet']
	        except Exception as e:
	            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

	    @tool_api(explode_return=True)
	    def get_author_id(
	        self,
	        mauthors: str,
	        hl: Optional[str] = 'en',
	        after_author: Optional[str] = None,
	        before_author: Optional[str] = None,
	        no_cache: Optional[bool] = False,
	        _async: Optional[bool] = False,
	        output: Optional[str] = 'json',
	    ) -> dict:
	        """The getAuthorId function is used to get the author's id by his or her name.

	        Args:
	            mauthors (str): Defines the author you want to search for.
	            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
	            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
	            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
	            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
	            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
	            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

	        Returns:
	            :class:`dict`: author id
	                * author_id: the author_id of the first matching profile
	        """
	        from serpapi import GoogleSearch

	        params = {
	            'mauthors': mauthors,
	            'engine': 'google_scholar_profiles',
	            'api_key': self.api_key,
	            'hl': hl,
	            'after_author': after_author,
	            'before_author': before_author,
	            'no_cache': no_cache,
	            'async': _async,
	            'output': output,
	        }
	        try:
	            search = GoogleSearch(params)
	            results = search.get_dict()
	            profiles = self._extract(results, 'profiles')
	            if not profiles:
	                # Clearer than the bare "list index out of range".
	                raise ValueError(f'No Google Scholar profile found for {mauthors!r}.')
	            # Only the first profile is reported; later matches are ignored.
	            return dict(author_id=profiles[0]['author_id'])
	        except Exception as e:
	            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)


	class AsyncGoogleScholar(AsyncActionMixin, GoogleScholar):
	    """Plugin for google scholar search.

	    Asynchronous counterpart of :class:`GoogleScholar`: each tool below runs
	    the inherited synchronous implementation through ``asyncify``.

	    Args:
	        api_key (str): API key used by the inherited implementation; see
	            :class:`GoogleScholar` for how it is resolved.
	        description (dict): The description of the action. Defaults to ``None``.
	        parser (Type[BaseParser]): The parser class to process the
	            action's inputs and outputs. Defaults to :class:`JsonParser`.
	    """

	    @tool_api(explode_return=True)
	    @asyncify
	    def search_google_scholar(
	        self,
	        query: str,
	        cites: Optional[str] = None,
	        as_ylo: Optional[int] = None,
	        as_yhi: Optional[int] = None,
	        scisbd: Optional[int] = None,
	        cluster: Optional[str] = None,
	        hl: Optional[str] = None,
	        lr: Optional[str] = None,
	        start: Optional[int] = None,
	        num: Optional[int] = None,
	        as_sdt: Optional[str] = None,
	        safe: Optional[str] = None,
	        filter: Optional[str] = None,
	        as_vis: Optional[str] = None,
	    ) -> dict:
	        """Search for scholarly articles based on a query according to the google scholar.

	        Args:
	            query (str): The query to search for.
	            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
	            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
	            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
	            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
	            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
	            hl (Optional[str]): The language to use for the Google Scholar search.
	            lr (Optional[str]): One or multiple languages to limit the search to.
	            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
	            num (Optional[int]): The maximum number of results to return, limited to 20.
	            as_sdt (Optional[str]): Can be used either as a search type or a filter.
	            safe (Optional[str]): The level of filtering for adult content.
	            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
	            as_vis (Optional[str]): Defines whether to include citations or not.

	        Returns:
	            :class:`dict`: article information
	                - title: a list of the titles of the three selected papers
	                - cited_by: a list of the citation numbers of the three selected papers
	                - organic_id: a list of the organic results' ids of the three selected papers
	                - snippets: a list of the text snippets of the three selected papers
	                - pub_info: publication information of selected papers, when the parent implementation returns it
	        """
	        # Forward by keyword so the mapping onto the parent's parameters
	        # does not depend on their order.
	        return super().search_google_scholar(
	            query=query,
	            cites=cites,
	            as_ylo=as_ylo,
	            as_yhi=as_yhi,
	            scisbd=scisbd,
	            cluster=cluster,
	            hl=hl,
	            lr=lr,
	            start=start,
	            num=num,
	            as_sdt=as_sdt,
	            safe=safe,
	            filter=filter,
	            as_vis=as_vis,
	        )

	    @tool_api(explode_return=True)
	    @asyncify
	    def get_author_information(
	        self,
	        author_id: str,
	        hl: Optional[str] = None,
	        view_op: Optional[str] = None,
	        sort: Optional[str] = None,
	        citation_id: Optional[str] = None,
	        start: Optional[int] = None,
	        num: Optional[int] = None,
	        no_cache: Optional[bool] = None,
	        async_req: Optional[bool] = None,
	        output: Optional[str] = None,
	    ) -> dict:
	        """Search for an author's information by author's id provided by get_author_id.

	        Args:
	            author_id (str): Required. The ID of an author.
	            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
	            view_op (Optional[str]): Used for viewing specific parts of a page.
	            sort (Optional[str]): Used for sorting and refining articles.
	            citation_id (Optional[str]): Used for retrieving individual article citation.
	            start (Optional[int]): Defines the result offset. Default is 0.
	            num (Optional[int]): Defines the number of results to return. Default is 20.
	            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
	            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
	            output (Optional[str]): Defines the final output you want. Default is 'json'.

	        Returns:
	            :class:`dict`: author information
	                * name: author's name
	                * affiliations: the affiliations of the author
	                * website: the author's homepage url
	                * articles: at most 3 articles by the author, each with its title and authors
	        """
	        return super().get_author_information(
	            author_id=author_id,
	            hl=hl,
	            view_op=view_op,
	            sort=sort,
	            citation_id=citation_id,
	            start=start,
	            num=num,
	            no_cache=no_cache,
	            async_req=async_req,
	            output=output,
	        )

	    @tool_api(explode_return=True)
	    @asyncify
	    def get_citation_format(
	        self,
	        q: str,
	        no_cache: Optional[bool] = None,
	        async_: Optional[bool] = None,
	        output: Optional[str] = 'json',
	    ) -> dict:
	        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

	        Args:
	            q (str): ID of an individual Google Scholar organic search result.
	            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
	            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
	            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

	        Returns:
	            :class:`dict`: citation format (the parent implementation currently hands it back as the plain citation text)
	                * citation: the citation format of the article
	        """
	        return super().get_citation_format(
	            q=q,
	            no_cache=no_cache,
	            async_=async_,
	            output=output,
	        )

	    @tool_api(explode_return=True)
	    @asyncify
	    def get_author_id(
	        self,
	        mauthors: str,
	        hl: Optional[str] = 'en',
	        after_author: Optional[str] = None,
	        before_author: Optional[str] = None,
	        no_cache: Optional[bool] = False,
	        _async: Optional[bool] = False,
	        output: Optional[str] = 'json',
	    ) -> dict:
	        """The getAuthorId function is used to get the author's id by his or her name.

	        Args:
	            mauthors (str): Defines the author you want to search for.
	            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
	            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
	            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
	            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
	            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
	            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

	        Returns:
	            :class:`dict`: author id
	                * author_id: the author_id of the first matching profile
	        """
	        return super().get_author_id(
	            mauthors=mauthors,
	            hl=hl,
	            after_author=after_author,
	            before_author=before_author,
	            no_cache=no_cache,
	            _async=_async,
	            output=output,
	        )